[tor-commits] [metrics-web/release] Rewrite censorship detector in Java.
karsten at torproject.org
karsten at torproject.org
Sat Nov 9 21:45:06 UTC 2019
commit a367168a782e864bdacb610857b1dc5d58fd192d
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Sun Dec 9 12:02:42 2018 +0100
Rewrite censorship detector in Java.
This allows us to remove the last remaining Python parts from the daily
updater.
Implements #21588.
---
build.xml | 26 --
.../torproject/metrics/stats/clients/Detector.java | 433 +++++++++++++++++++++
.../org/torproject/metrics/stats/clients/Main.java | 5 +
src/main/python/clients/country_info.py | 255 ------------
src/main/python/clients/detector.py | 242 ------------
5 files changed, 438 insertions(+), 523 deletions(-)
diff --git a/build.xml b/build.xml
index 6736e19..93eda7b 100644
--- a/build.xml
+++ b/build.xml
@@ -23,7 +23,6 @@
<property name="tardepends" value="war" />
<property name="Rsources" value="${basedir}/src/main/R" />
- <property name="pysources" value="${basedir}/src/main/python" />
<property name="specdir" value="${basedir}/generated/spec" />
@@ -360,32 +359,7 @@
<target name="clients" >
<property name="module.name" value="clients" />
- <property name="localmoddir" value="${modulebase}/${module.name}" />
-
- <property name="statsdir"
- value="${localmoddir}/stats" />
- <mkdir dir="${statsdir}" />
-
<antcall target="run-java" />
-
- <antcall target="run-R" >
- <param name="module.Rscript" value="userstats-detector.R" />
- </antcall>
-
- <exec executable="python"
- dir="${localmoddir}"
- failonerror="true" >
- <arg value="${pysources}/${module.name}/detector.py" />
- <arg value="userstats-detector.csv" />
- <arg value="userstats-ranges.csv" />
- </exec>
-
- <antcall target="run-R" >
- <param name="module.Rscript" value="merge-clients.R" />
- </antcall>
-
- <copy file="${localmoddir}/clients.csv" todir="${statsdir}" />
- <copy file="${localmoddir}/userstats-combined.csv" todir="${statsdir}" />
</target>
<target name="servers" >
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Detector.java b/src/main/java/org/torproject/metrics/stats/clients/Detector.java
new file mode 100644
index 0000000..1a523c2
--- /dev/null
+++ b/src/main/java/org/torproject/metrics/stats/clients/Detector.java
@@ -0,0 +1,433 @@
+/* Copyright 2011 George Danezis <gdane at microsoft.com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted (subject to the limitations in the
+ * disclaimer below) provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of <Owner Organization> nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+ * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * (Clear BSD license:
+ * http://labs.metacarta.com/license-explanation.html#license)
+ *
+ * Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.stats.clients;
+
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.distribution.PoissonDistribution;
+import org.apache.commons.math3.stat.descriptive.moment.Mean;
+import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
+import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+/** Censorship detector that reads a .csv file of the number of Tor clients and
+ * finds anomalies that might be indicative of censorship. */
+public class Detector {
+
+ /** Input file. */
+ private static final Path INPUT_PATH = Paths.get("stats", "userstats.csv");
+
+ /** Output file. */
+ private static final Path OUTPUT_PATH = Paths.get("stats", "clients.csv");
+
+ /** Number of largest locations to be included in the detection algorithm. */
+ private static final int NUM_LARGEST_LOCATIONS = 50;
+
+ /** Time interval in days to model connection rates. */
+ private static final int INTERV = 7;
+
+ /** Compound key under which client estimates are stored in both input and
+ * output files. */
+ private static class ClientsKey implements Comparable<ClientsKey> {
+
+ /** Date when clients connected to the Tor network. */
+ private LocalDate date;
+
+ /** Whether clients connected via relays (true) or bridges (false). */
+ private boolean nodeIsRelay;
+
+ /** Two-letter lower-case country code of the country from which clients
+ * connected, "??" if the country could not be resolved, or left empty for
+ * all countries together. */
+ private String country;
+
+ /** Name of the transport used by clients to connect using bridges, or left
+ * empty for all transports together. */
+ private String transport = "";
+
+ /** IP version used by clients to connect using bridges, or left empty for
+ * all IP versions together. */
+ private String version = "";
+
+ ClientsKey(LocalDate date, boolean nodeIsRelay, String country) {
+ this.date = date;
+ this.nodeIsRelay = nodeIsRelay;
+ this.country = country;
+ }
+
+ ClientsKey(LocalDate date, boolean nodeIsRelay, String country,
+ String transport, String version) {
+ this(date, nodeIsRelay, country);
+ this.transport = transport;
+ this.version = version;
+ }
+
+ @Override
+ public int compareTo(ClientsKey other) {
+ if (!this.date.equals(other.date)) {
+ return this.date.compareTo(other.date);
+ } else if (!this.nodeIsRelay && other.nodeIsRelay) {
+ return -1;
+ } else if (this.nodeIsRelay && !other.nodeIsRelay) {
+ return 1;
+ } else if (!this.country.equals(other.country)) {
+ return this.country.compareTo(other.country);
+ } else if (!this.transport.equals(other.transport)) {
+ return this.transport.compareTo(other.transport);
+ } else if (!this.version.equals(other.version)) {
+ return this.version.compareTo(other.version);
+ } else {
+ return 0;
+ }
+ }
+
+ @Override
+ public boolean equals(Object otherObject) {
+ if (!(otherObject instanceof ClientsKey)) {
+ return false;
+ } else {
+ ClientsKey other = (ClientsKey) otherObject;
+ return this.date.equals(other.date)
+ && this.nodeIsRelay == other.nodeIsRelay
+ && this.country.equals(other.country)
+ && this.transport.equals(other.transport)
+ && this.version.equals(other.version);
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return 3 * this.date.hashCode() + (this.nodeIsRelay ? 5 : 0)
+ + 7 * this.country.hashCode() + 11 * this.transport.hashCode()
+ + 13 * this.version.hashCode();
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%s,%s,%s,%s,%s",
+ this.date.toString(), this.nodeIsRelay ? "relay" : "bridge",
+ this.country, this.transport, this.version);
+ }
+ }
+
+  /** Value class that stores everything we already knew about a specific
+   * subset of clients from the input file. */
+  private static class ClientsEstimates {
+
+    /** Estimated number of clients. */
+    private final int clients;
+
+    /** Fraction of relays or bridges in percent that the estimate is based on,
+     * between 0 and 100. */
+    private final int frac;
+
+    ClientsEstimates(int clients, int frac) {
+      this.clients = clients;
+      this.frac = frac;
+    }
+
+    /** Format as the two comma-separated output columns clients and frac.
+     *
+     * <p>Plain concatenation is used rather than {@code String.format}
+     * because {@code %d} localizes digits according to the default locale,
+     * which must not leak into a machine-readable .csv file.</p> */
+    @Override
+    public String toString() {
+      return this.clients + "," + this.frac;
+    }
+  }
+
+  /** Value class that stores everything we're computing here about a specific
+   * subset of clients from the input file. */
+  private static class ClientsRanges {
+
+    /** Lower number of expected clients under the assumption that there has
+     * been no censorship event, as computed here. */
+    private final int lower;
+
+    /** Upper number of expected clients under the assumption that there has
+     * been no release of censorship, as computed here. */
+    private final int upper;
+
+    ClientsRanges(int lower, int upper) {
+      this.lower = lower;
+      this.upper = upper;
+    }
+
+    /** Format as the two comma-separated output columns lower and upper.
+     *
+     * <p>Plain concatenation is used rather than {@code String.format}
+     * because {@code %d} localizes digits according to the default locale,
+     * which must not leak into a machine-readable .csv file.</p> */
+    @Override
+    public String toString() {
+      return this.lower + "," + this.upper;
+    }
+  }
+
+  /** Run censorship detection: read the input file, pick the largest
+   * locations, derive week-over-week ratios without outliers, compute the
+   * expected ranges, and write everything to the output file. */
+  public void detect() throws IOException {
+    SortedMap<ClientsKey, ClientsEstimates> parsed = readInputFile();
+    Set<String> topCountries = findLargestLocations(parsed);
+    Map<LocalDate, List<Double>> cleanedRatios = removeOutliers(
+        computeRatiosOfLargestLocations(parsed, topCountries));
+    writeOutputFile(parsed, computeRanges(parsed, cleanedRatios));
+  }
+
+  /** Read and return the parsed input file containing comma-separated estimates
+   * of client numbers.
+   *
+   * <p>The first line must be the exact header
+   * {@code date,node,country,transport,version,frac,users}. Every following
+   * line must consist of seven fields: an ISO date, "relay" or "bridge",
+   * country, transport, version (double quotes are stripped from these
+   * three), and two integers for frac and users.</p>
+   *
+   * @return Parsed estimates, sorted by key.
+   * @throws IOException if the input file does not exist, has an unrecognized
+   *     header, or contains an invalid line; in the latter case the parsing
+   *     problem is attached as cause. */
+  private static SortedMap<ClientsKey, ClientsEstimates> readInputFile()
+      throws IOException {
+    File inputFile = INPUT_PATH.toFile();
+    if (!inputFile.exists()) {
+      throw new IOException(String.format("Input file %s does not exist.",
+          inputFile));
+    }
+    SortedMap<ClientsKey, ClientsEstimates> estimates = new TreeMap<>();
+    try (LineNumberReader lnr = new LineNumberReader(
+        new FileReader(inputFile))) {
+      String line = lnr.readLine();
+      if (!"date,node,country,transport,version,frac,users".equals(line)) {
+        throw new IOException(String.format("Unable to read input file %s with "
+            + "unrecognized header line '%s'. Not running detector.", inputFile,
+            line));
+      }
+      while ((line = lnr.readLine()) != null) {
+        try {
+          String[] lineParts = line.split(",");
+          if (lineParts.length != 7) {
+            throw new IllegalArgumentException(
+                "Expected 7 comma-separated fields but found "
+                + lineParts.length + ".");
+          }
+          LocalDate date = LocalDate.parse(lineParts[0]);
+          boolean nodeIsRelay = "relay".equals(lineParts[1]);
+          if (!nodeIsRelay && !"bridge".equals(lineParts[1])) {
+            throw new IllegalArgumentException("Unrecognized node type '"
+                + lineParts[1] + "'.");
+          }
+          /* Plain replace is sufficient to strip literal double quotes, no
+           * regular expression needed. */
+          String country = lineParts[2].replace("\"", "");
+          String transport = lineParts[3].replace("\"", "");
+          String version = lineParts[4].replace("\"", "");
+          int frac = Integer.parseInt(lineParts[5]);
+          int clients = Integer.parseInt(lineParts[6]);
+          estimates.put(
+              new ClientsKey(date, nodeIsRelay, country, transport, version),
+              new ClientsEstimates(clients, frac));
+        } catch (IllegalArgumentException | DateTimeParseException e) {
+          /* NumberFormatException is an IllegalArgumentException, so this
+           * covers all parsing problems. Keep the cause for diagnosis. */
+          throw new IOException(String.format(
+              "Invalid line %d '%s' in input file %s.", lnr.getLineNumber(),
+              line, inputFile), e);
+        }
+      }
+    }
+    return estimates;
+  }
+
+  /** Determine the countries with the most estimated relay clients on the
+   * most recent date for which the input contains relay estimates.
+   *
+   * <p>At most NUM_LARGEST_LOCATIONS countries are returned; the entries for
+   * all countries together (empty country) and for unresolved countries
+   * ("??") are never included. Because the selection is tied to the last
+   * known date, lower/upper values may change whenever the set of largest
+   * countries on that date changes.</p>
+   *
+   * @throws IOException if the input does not contain any relay estimates. */
+  private static Set<String> findLargestLocations(
+      SortedMap<ClientsKey, ClientsEstimates> clients) throws IOException {
+    LocalDate newestRelayDate = null;
+    for (ClientsKey key : clients.keySet()) {
+      if (key.nodeIsRelay && (null == newestRelayDate
+          || key.date.compareTo(newestRelayDate) > 0)) {
+        newestRelayDate = key.date;
+      }
+    }
+    if (null == newestRelayDate) {
+      throw new IOException("Unable to find maximum date. Was "
+          + "the input file empty or otherwise corrupt?");
+    }
+    List<Map.Entry<ClientsKey, ClientsEstimates>> candidates =
+        new ArrayList<>();
+    for (Map.Entry<ClientsKey, ClientsEstimates> entry : clients.entrySet()) {
+      ClientsKey key = entry.getKey();
+      if (newestRelayDate.equals(key.date) && key.nodeIsRelay
+          && !"".equals(key.country) && !"??".equals(key.country)) {
+        candidates.add(entry);
+      }
+    }
+    /* Largest client numbers first; the sort is stable, so ties keep the
+     * key order of the input map. */
+    candidates.sort((left, right) -> Integer.compare(
+        right.getValue().clients, left.getValue().clients));
+    return candidates.stream()
+        .limit(NUM_LARGEST_LOCATIONS)
+        .map(entry -> entry.getKey().country)
+        .collect(Collectors.toSet());
+  }
+
+  /** For every date, collect the ratios of relay client estimates in the
+   * largest locations compared to the same country INTERV days earlier.
+   *
+   * <p>A country contributes no ratio for a date if there is no estimate for
+   * the earlier date or if that earlier estimate is zero.</p> */
+  private static Map<LocalDate, List<Double>> computeRatiosOfLargestLocations(
+      SortedMap<ClientsKey, ClientsEstimates> estimates,
+      Set<String> largestLocations) {
+    Map<LocalDate, List<Double>> ratiosByDate = new HashMap<>();
+    for (Map.Entry<ClientsKey, ClientsEstimates> current
+        : estimates.entrySet()) {
+      ClientsKey key = current.getKey();
+      if (key.nodeIsRelay && largestLocations.contains(key.country)) {
+        ClientsEstimates earlier = estimates.get(new ClientsKey(
+            key.date.minusDays(INTERV), true, key.country));
+        if (null != earlier && earlier.clients != 0) {
+          ratiosByDate.computeIfAbsent(key.date, d -> new ArrayList<>())
+              .add(((double) current.getValue().clients)
+              / (double) earlier.clients);
+        }
+      }
+    }
+    return ratiosByDate;
+  }
+
+  /** Drop all ratios that are not strictly within four inter-quartile ranges
+   * of the median of their date, and drop dates entirely for which fewer than
+   * 8 ratios remain afterwards. Quartiles are computed using the R_7
+   * estimation type. */
+  private static SortedMap<LocalDate, List<Double>> removeOutliers(
+      Map<LocalDate, List<Double>> ratios) {
+    SortedMap<LocalDate, List<Double>> cleaned = new TreeMap<>();
+    for (Map.Entry<LocalDate, List<Double>> perDate : ratios.entrySet()) {
+      List<Double> dailyRatios = perDate.getValue();
+      Percentile quantiles = new Percentile()
+          .withEstimationType(Percentile.EstimationType.R_7);
+      quantiles.setData(dailyRatios.stream()
+          .mapToDouble(Double::doubleValue).toArray());
+      double median = quantiles.evaluate(50.0);
+      double lowerQuartile = quantiles.evaluate(25.0);
+      double upperQuartile = quantiles.evaluate(75.0);
+      double spread = upperQuartile - lowerQuartile;
+      List<Double> kept = dailyRatios.stream()
+          .filter(ratio -> ratio > median - 4 * spread
+              && ratio < median + 4 * spread)
+          .collect(Collectors.toList());
+      if (kept.size() >= 8) {
+        cleaned.put(perDate.getKey(), kept);
+      }
+    }
+    return cleaned;
+  }
+
+  /** Compute ranges as the expected minimum and maximum number of users.
+   *
+   * <p>Ranges are computed for relay users of individual, resolved countries
+   * only, and only for dates that have outlier-free ratios and a non-zero
+   * estimate for the same country INTERV days earlier. The bounds are the
+   * product of a quantile of the normal distribution fitted to the ratios of
+   * that date and the same quantile of a Poisson distribution around the
+   * earlier estimate; the lower bound is capped at 0.</p>
+   *
+   * @param estimates Client estimates as read from the input file.
+   * @param ratiosWithoutOutliers Outlier-free ratios by date.
+   * @return Expected ranges for all keys for which they could be computed. */
+  private static SortedMap<ClientsKey, ClientsRanges> computeRanges(
+      SortedMap<ClientsKey, ClientsEstimates> estimates,
+      Map<LocalDate, List<Double>> ratiosWithoutOutliers) {
+    SortedMap<ClientsKey, ClientsRanges> ranges = new TreeMap<>();
+    /* The ratio quantiles only depend on the date, not on the country, so
+     * compute them once per date rather than once per estimate. */
+    Map<LocalDate, double[]> ratioQuantilesByDate = new HashMap<>();
+    for (Map.Entry<ClientsKey, ClientsEstimates> estimatesEntry
+        : estimates.entrySet()) {
+      ClientsKey key = estimatesEntry.getKey();
+      LocalDate date = key.date;
+      if (!key.nodeIsRelay
+          || "".equals(key.country)
+          || "??".equals(key.country)
+          || !ratiosWithoutOutliers.containsKey(date)) {
+        continue;
+      }
+      ClientsEstimates referenceEstimate = estimates.get(
+          new ClientsKey(date.minusDays(INTERV), true, key.country));
+      if (null == referenceEstimate || referenceEstimate.clients == 0) {
+        continue;
+      }
+      double[] ratioQuantiles = ratioQuantilesByDate.computeIfAbsent(date,
+          d -> computeRatioQuantiles(ratiosWithoutOutliers.get(d)));
+      PoissonDistribution poissonDistribution
+          = new PoissonDistribution(referenceEstimate.clients);
+      int lower = Math.max(0,
+          (int) (ratioQuantiles[0]
+          * poissonDistribution.inverseCumulativeProbability(0.0001)));
+      int upper =
+          (int) (ratioQuantiles[1]
+          * poissonDistribution.inverseCumulativeProbability(0.9999));
+      ranges.put(key, new ClientsRanges(lower, upper));
+    }
+    return ranges;
+  }
+
+  /** Fit a normal distribution to the given ratios using their mean and
+   * their standard deviation without bias correction, and return its 0.0001
+   * and 0.9999 quantiles as a two-element array, in that order. */
+  private static double[] computeRatioQuantiles(List<Double> ratios) {
+    double[] values = ratios.stream()
+        .mapToDouble(Double::doubleValue).toArray();
+    double mean = new Mean().evaluate(values);
+    double std = new StandardDeviation(false).evaluate(values);
+    NormalDistribution normalDistribution = new NormalDistribution(mean, std);
+    return new double[] {
+        normalDistribution.inverseCumulativeProbability(0.0001),
+        normalDistribution.inverseCumulativeProbability(0.9999) };
+  }
+
+  /** Write client number estimates together with lower and upper bounds as
+   * comma-separated values to the output file.
+   *
+   * <p>Every estimate produces one line; the lower and upper columns are left
+   * empty for estimates without computed ranges. All lines including the
+   * header are terminated by a single newline character, regardless of the
+   * platform, so that the file never contains mixed line endings.</p>
+   *
+   * @param estimates Client estimates as read from the input file.
+   * @param ranges Expected ranges for a subset of the estimates' keys.
+   * @throws IOException if the output file cannot be written. */
+  private static void writeOutputFile(
+      SortedMap<ClientsKey, ClientsEstimates> estimates,
+      SortedMap<ClientsKey, ClientsRanges> ranges) throws IOException {
+    try (BufferedWriter bw = new BufferedWriter(
+        new FileWriter(OUTPUT_PATH.toFile()))) {
+      bw.write(
+          "date,node,country,transport,version,lower,upper,clients,frac\n");
+      for (Map.Entry<ClientsKey, ClientsEstimates> e : estimates.entrySet()) {
+        ClientsRanges range = ranges.get(e.getKey());
+        /* A lone comma stands for two empty columns, lower and upper. */
+        String rangesString = null == range ? "," : range.toString();
+        bw.write(e.getKey().toString() + "," + rangesString + ","
+            + e.getValue().toString() + "\n");
+      }
+    }
+  }
+}
+
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Main.java b/src/main/java/org/torproject/metrics/stats/clients/Main.java
index 48d8d8d..0f1087b 100644
--- a/src/main/java/org/torproject/metrics/stats/clients/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/clients/Main.java
@@ -59,6 +59,11 @@ public class Main {
log.info("Disconnecting from database.");
database.close();
+
+ log.info("Running detector.");
+ new Detector().detect();
+
+ log.info("Terminating clients module.");
}
private static final long ONE_HOUR_MILLIS = 60L * 60L * 1000L;
diff --git a/src/main/python/clients/country_info.py b/src/main/python/clients/country_info.py
deleted file mode 100644
index 1a505d0..0000000
--- a/src/main/python/clients/country_info.py
+++ /dev/null
@@ -1,255 +0,0 @@
-# -*- coding: utf-8 -*-
-
-countries = {
- "ad" : "Andorra",
- "ae" : "the United Arab Emirates",
- "af" : "Afghanistan",
- "ag" : "Antigua and Barbuda",
- "ai" : "Anguilla",
- "al" : "Albania",
- "am" : "Armenia",
- "an" : "the Netherlands Antilles",
- "ao" : "Angola",
- "aq" : "Antarctica",
- "ar" : "Argentina",
- "as" : "American Samoa",
- "at" : "Austria",
- "au" : "Australia",
- "aw" : "Aruba",
- "ax" : "the Aland Islands",
- "az" : "Azerbaijan",
- "ba" : "Bosnia and Herzegovina",
- "bb" : "Barbados",
- "bd" : "Bangladesh",
- "be" : "Belgium",
- "bf" : "Burkina Faso",
- "bg" : "Bulgaria",
- "bh" : "Bahrain",
- "bi" : "Burundi",
- "bj" : "Benin",
- "bl" : "Saint Bartelemey",
- "bm" : "Bermuda",
- "bn" : "Brunei",
- "bo" : "Bolivia",
- "bq" : "Bonaire, Sint Eustatius and Saba",
- "br" : "Brazil",
- "bs" : "the Bahamas",
- "bt" : "Bhutan",
- "bv" : "the Bouvet Island",
- "bw" : "Botswana",
- "by" : "Belarus",
- "bz" : "Belize",
- "ca" : "Canada",
- "cc" : "the Cocos (Keeling) Islands",
- "cd" : "the Democratic Republic of the Congo",
- "cf" : "Central African Republic",
- "cg" : "Congo",
- "ch" : "Switzerland",
- "ci" : u"Côte d'Ivoire",
- "ck" : "the Cook Islands",
- "cl" : "Chile",
- "cm" : "Cameroon",
- "cn" : "China",
- "co" : "Colombia",
- "cr" : "Costa Rica",
- "cu" : "Cuba",
- "cv" : "Cape Verde",
- "cw" : u"Curaçao",
- "cx" : "the Christmas Island",
- "cy" : "Cyprus",
- "cz" : "the Czech Republic",
- "de" : "Germany",
- "dj" : "Djibouti",
- "dk" : "Denmark",
- "dm" : "Dominica",
- "do" : "the Dominican Republic",
- "dz" : "Algeria",
- "ec" : "Ecuador",
- "ee" : "Estonia",
- "eg" : "Egypt",
- "eh" : "the Western Sahara",
- "er" : "Eritrea",
- "es" : "Spain",
- "et" : "Ethiopia",
- "fi" : "Finland",
- "fj" : "Fiji",
- "fk" : "the Falkland Islands (Malvinas)",
- "fm" : "the Federated States of Micronesia",
- "fo" : "the Faroe Islands",
- "fr" : "France",
- "ga" : "Gabon",
- "gb" : "the United Kingdom",
- "gd" : "Grenada",
- "ge" : "Georgia",
- "gf" : "French Guiana",
- "gg" : "Guernsey",
- "gh" : "Ghana",
- "gi" : "Gibraltar",
- "gl" : "Greenland",
- "gm" : "Gambia",
- "gn" : "Guinea",
- "gp" : "Guadeloupe",
- "gq" : "Equatorial Guinea",
- "gr" : "Greece",
- "gs" : "South Georgia and the South Sandwich Islands",
- "gt" : "Guatemala",
- "gu" : "Guam",
- "gw" : "Guinea-Bissau",
- "gy" : "Guyana",
- "hk" : "Hong Kong",
- "hm" : "Heard Island and McDonald Islands",
- "hn" : "Honduras",
- "hr" : "Croatia",
- "ht" : "Haiti",
- "hu" : "Hungary",
- "id" : "Indonesia",
- "ie" : "Ireland",
- "il" : "Israel",
- "im" : "the Isle of Man",
- "in" : "India",
- "io" : "the British Indian Ocean Territory",
- "iq" : "Iraq",
- "ir" : "Iran",
- "is" : "Iceland",
- "it" : "Italy",
- "je" : "Jersey",
- "jm" : "Jamaica",
- "jo" : "Jordan",
- "jp" : "Japan",
- "ke" : "Kenya",
- "kg" : "Kyrgyzstan",
- "kh" : "Cambodia",
- "ki" : "Kiribati",
- "km" : "Comoros",
- "kn" : "Saint Kitts and Nevis",
- "kp" : "North Korea",
- "kr" : "the Republic of Korea",
- "kw" : "Kuwait",
- "ky" : "the Cayman Islands",
- "kz" : "Kazakhstan",
- "la" : "Laos",
- "lb" : "Lebanon",
- "lc" : "Saint Lucia",
- "li" : "Liechtenstein",
- "lk" : "Sri Lanka",
- "lr" : "Liberia",
- "ls" : "Lesotho",
- "lt" : "Lithuania",
- "lu" : "Luxembourg",
- "lv" : "Latvia",
- "ly" : "Libya",
- "ma" : "Morocco",
- "mc" : "Monaco",
- "md" : "the Republic of Moldova",
- "me" : "Montenegro",
- "mf" : "Saint Martin",
- "mg" : "Madagascar",
- "mh" : "the Marshall Islands",
- "mk" : "Macedonia",
- "ml" : "Mali",
- "mm" : "Burma",
- "mn" : "Mongolia",
- "mo" : "Macau",
- "mp" : "the Northern Mariana Islands",
- "mq" : "Martinique",
- "mr" : "Mauritania",
- "ms" : "Montserrat",
- "mt" : "Malta",
- "mu" : "Mauritius",
- "mv" : "the Maldives",
- "mw" : "Malawi",
- "mx" : "Mexico",
- "my" : "Malaysia",
- "mz" : "Mozambique",
- "na" : "Namibia",
- "nc" : "New Caledonia",
- "ne" : "Niger",
- "nf" : "Norfolk Island",
- "ng" : "Nigeria",
- "ni" : "Nicaragua",
- "nl" : "the Netherlands",
- "no" : "Norway",
- "np" : "Nepal",
- "nr" : "Nauru",
- "nu" : "Niue",
- "nz" : "New Zealand",
- "om" : "Oman",
- "pa" : "Panama",
- "pe" : "Peru",
- "pf" : "French Polynesia",
- "pg" : "Papua New Guinea",
- "ph" : "the Philippines",
- "pk" : "Pakistan",
- "pl" : "Poland",
- "pm" : "Saint Pierre and Miquelon",
- "pn" : "the Pitcairn Islands",
- "pr" : "Puerto Rico",
- "ps" : "the Palestinian Territory",
- "pt" : "Portugal",
- "pw" : "Palau",
- "py" : "Paraguay",
- "qa" : "Qatar",
- "re" : "Reunion",
- "ro" : "Romania",
- "rs" : "Serbia",
- "ru" : "Russia",
- "rw" : "Rwanda",
- "sa" : "Saudi Arabia",
- "sb" : "the Solomon Islands",
- "sc" : "the Seychelles",
- "sd" : "Sudan",
- "se" : "Sweden",
- "sg" : "Singapore",
- "sh" : "Saint Helena",
- "si" : "Slovenia",
- "sj" : "Svalbard and Jan Mayen",
- "sk" : "Slovakia",
- "sl" : "Sierra Leone",
- "sm" : "San Marino",
- "sn" : "Senegal",
- "so" : "Somalia",
- "sr" : "Suriname",
- "ss" : "South Sudan",
- "st" : u"São Tomé and Príncipe",
- "sv" : "El Salvador",
- "sx" : "Sint Maarten",
- "sy" : "the Syrian Arab Republic",
- "sz" : "Swaziland",
- "tc" : "Turks and Caicos Islands",
- "td" : "Chad",
- "tf" : "the French Southern Territories",
- "tg" : "Togo",
- "th" : "Thailand",
- "tj" : "Tajikistan",
- "tk" : "Tokelau",
- "tl" : "East Timor",
- "tm" : "Turkmenistan",
- "tn" : "Tunisia",
- "to" : "Tonga",
- "tr" : "Turkey",
- "tt" : "Trinidad and Tobago",
- "tv" : "Tuvalu",
- "tw" : "Taiwan",
- "tz" : "the United Republic of Tanzania",
- "ua" : "Ukraine",
- "ug" : "Uganda",
- "um" : "the United States Minor Outlying Islands",
- "us" : "the United States",
- "uy" : "Uruguay",
- "uz" : "Uzbekistan",
- "va" : "Vatican City",
- "vc" : "Saint Vincent and the Grenadines",
- "ve" : "Venezuela",
- "vg" : "the British Virgin Islands",
- "vi" : "the United States Virgin Islands",
- "vn" : "Vietnam",
- "vu" : "Vanuatu",
- "wf" : "Wallis and Futuna",
- "ws" : "Samoa",
- "xk" : "Kosovo",
- "ye" : "Yemen",
- "yt" : "Mayotte",
- "za" : "South Africa",
- "zm" : "Zambia",
- "zw" : "Zimbabwe"
- }
diff --git a/src/main/python/clients/detector.py b/src/main/python/clients/detector.py
deleted file mode 100644
index b0a98af..0000000
--- a/src/main/python/clients/detector.py
+++ /dev/null
@@ -1,242 +0,0 @@
-## Copyright (c) 2011 George Danezis <gdane at microsoft.com>
-##
-## All rights reserved.
-##
-## Redistribution and use in source and binary forms, with or without
-## modification, are permitted (subject to the limitations in the
-## disclaimer below) provided that the following conditions are met:
-##
-## * Redistributions of source code must retain the above copyright
-## notice, this list of conditions and the following disclaimer.
-##
-## * Redistributions in binary form must reproduce the above copyright
-## notice, this list of conditions and the following disclaimer in the
-## documentation and/or other materials provided with the
-## distribution.
-##
-## * Neither the name of <Owner Organization> nor the names of its
-## contributors may be used to endorse or promote products derived
-## from this software without specific prior written permission.
-##
-## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
-## HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-## DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-##
-## (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
-
-## This script reads a .csv file of the number of Tor users and finds
-## anomalies that might be indicative of censorship.
-
-# Dep: numpy
-import numpy
-from numpy import mean, std, percentile
-
-# Dep: scipy
-import scipy.stats
-from scipy.stats.distributions import norm
-from scipy.stats.distributions import poisson
-
-# Std lib
-from datetime import date
-from datetime import timedelta
-import os.path
-
-# Country code -> Country names
-import country_info
-
-# write utf8 to file
-import codecs
-
-def get_country_name_from_cc(country_code):
- if (country_code.lower() in country_info.countries):
- return country_info.countries[country_code.lower()]
- return country_code # if we didn't find the cc in our map
-
-"""
-Represents a .csv file containing information on the number of
-connecting Tor users per country.
-
-'store': Dictionary with (<country code>, <counter>) as key, and the number of users as value.
- <country code> can also be "date"...
-'all_dates': List of the data intervals (with default timedelta: 1 day).
-'country_codes': List of all relevant country codes.
-'MAX_INDEX': Length of store, number of country codes etc.
-'date_min': The oldest date found in the .csv.
-'date_min': The latest date found in the .csv.
-"""
-class torstatstore:
- def __init__(self, file_name):
- f = file(file_name)
- country_codes = f.readline()
- country_codes = country_codes.strip().split(",")
-
- store = {}
- MAX_INDEX = 0
- for i, line in enumerate(f):
- MAX_INDEX += 1
- line_parsed = line.strip().split(",")
- for j, (ccode, val) in enumerate(zip(country_codes,line_parsed)):
- processed_val = None
- if ccode == "date":
- try:
- year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])
- processed_val = date(year, month, day)
- except Exception, e:
- print "Parsing error (ignoring line %s):" % j
- print "%s" % val,e
- break
-
- elif val != "NA":
- processed_val = int(val)
- store[(ccode, i)] = processed_val
-
- # min and max
- date_min = store[("date", 0)]
- date_max = store[("date", i)]
-
- all_dates = []
- d = date_min
- dt = timedelta(days=1)
- while d <= date_max:
- all_dates += [d]
- d = d + dt
-
- # Save for later
- self.store = store
- self.all_dates = all_dates
- self.country_codes = country_codes
- self.MAX_INDEX = MAX_INDEX
- self.date_min = date_min
- self.date_max = date_max
-
- """Return a list representing a time series of 'ccode' with respect
- to the number of connected users.
- """
- def get_country_series(self, ccode):
- assert ccode in self.country_codes
- series = {}
- for d in self.all_dates:
- series[d] = None
- for i in range(self.MAX_INDEX):
- series[self.store[("date", i)]] = self.store[(ccode, i)]
- sx = []
- for d in self.all_dates:
- sx += [series[d]]
- return sx
-
- """Return an ordered list containing tuples of the form (<number of
- users>, <country code>). The list is ordered with respect to the
- number of users for each country.
- """
- def get_largest(self, number):
- exclude = set(["all", "??", "date"])
- l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude]
- l.sort()
- l.reverse()
- return l[:number]
-
- """Return a dictionary, with <country code> as key, and the time
- series of the country code as the value.
- """
- def get_largest_locations(self, number):
- l = self.get_largest(number)
- res = {}
- for _, ccode in l[:number]:
- res[ccode] = self.get_country_series(ccode)
- return res
-
-"""Return a list containing lists (?) where each such list contains
-the difference in users for a time delta of 'days'
-"""
-def n_day_rel(series, days):
- rel = []
- for i, v in enumerate(series):
- if series[i] is None:
- rel += [None]
- continue
-
- if i - days < 0 or series[i-days] is None or series[i-days] == 0:
- rel += [None]
- else:
- rel += [ float(series[i]) / series[i-days]]
- return rel
-
-# Main model: computes the expected min / max range of number of users
-def make_tendencies_minmax(l, INTERVAL = 1):
- lminus1 = dict([(ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l])
- c = lminus1[lminus1.keys()[0]]
- dists = []
- minx = []
- maxx = []
- for i in range(len(c)):
- vals = [lminus1[ccode][i] for ccode in lminus1.keys() if lminus1[ccode][i] != None]
- if len(vals) < 8:
- dists += [None]
- minx += [None]
- maxx += [None]
- else:
- vals.sort()
- median = percentile(vals, 50)
- q1 = percentile(vals, 25)
- q2 = percentile(vals, 75)
- qd = q2 - q1
- vals = [v for v in vals if median - qd*4 < v and v < median + qd*4]
- if len(vals) < 8:
- dists += [None]
- minx += [None]
- maxx += [None]
- continue
- mu = mean(vals)
- signma = std(vals)
- dists += [(mu, signma)]
- maxx += [norm.ppf(0.9999, mu, signma)]
- minx += [norm.ppf(1 - 0.9999, mu, signma)]
- ## print minx[-1], maxx[-1]
- return minx, maxx
-
-"""Write a CSV report on the minimum/maximum users of each country per date."""
-def write_all(tss, minc, maxc, RANGES_FILE, INTERVAL=7):
- ranges_file = file(RANGES_FILE, "w")
- ranges_file.write("date,country,minusers,maxusers\n")
- exclude = set(["all", "??", "date"])
- for c in tss.country_codes:
- if c in exclude:
- continue
- series = tss.get_country_series(c)
- for i, v in enumerate(series):
- if i > 0 and i - INTERVAL >= 0 and series[i] != None and series[i-INTERVAL] != None and series[i-INTERVAL] != 0 and minc[i]!= None and maxc[i]!= None:
- minv = minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL])
- maxv = maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL])
- if not minv < maxv:
- print minv, maxv, series[i-INTERVAL], minc[i], maxc[i]
- assert minv < maxv
- if minv < 0.0:
- minv = 0.0
- ranges_file.write("%s,%s,%s,%s\n" % (tss.all_dates[i], c, minv, maxv))
- ranges_file.close()
-
-# INTERV is the time interval to model connection rates;
-# consider maximum DAYS days back.
-def detect(CSV_FILE = "userstats-detector.csv",
- RANGES_FILE = "userstats-ranges.csv",
- INTERV = 7, DAYS = 6 * 31):
- tss = torstatstore(CSV_FILE)
- l = tss.get_largest_locations(50)
- minx, maxx = make_tendencies_minmax(l, INTERV)
- write_all(tss, minx, maxx, RANGES_FILE, INTERV)
-
-def main():
- detect()
-
-if __name__ == "__main__":
- main()
More information about the tor-commits
mailing list