[tor-commits] [metrics-tasks/master] Add code behind bridge user counting report (#5807).
karsten at torproject.org
karsten at torproject.org
Wed Oct 24 15:35:00 UTC 2012
commit d301479e3fdd8c1d17f02f6d72a8cf6eaa87e17e
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Wed Oct 24 11:34:10 2012 -0400
Add code behind bridge user counting report (#5807).
---
task-5807/bridge-dirreq-stats.R | 222 +++++++++++
task-5807/run.sh | 3 +
task-5807/src/EvalBridgeDirreqStats.java | 603 ++++++++++++++++++++++++++++++
3 files changed, 828 insertions(+), 0 deletions(-)
diff --git a/task-5807/bridge-dirreq-stats.R b/task-5807/bridge-dirreq-stats.R
new file mode 100644
index 0000000..2a07a3a
--- /dev/null
+++ b/task-5807/bridge-dirreq-stats.R
@@ -0,0 +1,222 @@
library(ggplot2)
library(reshape)
library(scales)

# Estimate daily bridge users from directory-request statistics (#5807).
# Reads the per-day aggregate file written by EvalBridgeDirreqStats.java
# and the old per-country user estimates (bridge-users.csv), and writes
# graphs to graphs/.
#
# Column semantics of the aggregate file (daily sums, see the Java code):
#   nabcd        bridge-seconds of all running bridges
#   ha, hc       written directory bytes of bridges reporting both stats
#                (ha) or only byte histories (hc)
#   ra, rb       dirreq "ok" responses of bridges reporting both stats
#                (ra) or only responses (rb)
#   na, nb, nc   bridge-seconds of bridges reporting both (na), only
#                responses (nb), or only byte histories (nc)
#   sy           fraction of unique bridge IPs observed from .sy
#   consensuses  number of relay consensuses published that day

# Commented out, because this graph takes a while to draw...
#d <- read.csv("out/dirreq-responses", stringsAsFactors = FALSE,
#  header = FALSE)
#d <- data.frame(date = as.Date(d$V1), requests = d$V4,
#  asrelay = ifelse(d$V3, "also seen as\nnon-bridge relays",
#  "only seen as\nbridges"))
#ggplot(d, aes(x = date, y = requests)) +
#geom_point() +
#facet_grid(asrelay ~ .) +
#scale_x_date(name = "",
#  labels = date_format("%b %Y"),
#  minor_breaks = date_breaks("1 month")) +
#scale_y_continuous(name = "", labels = comma_format(digits = 1))
#ggsave("graphs/responses-single-bridges.png", width = 6, height = 3.5,
#  dpi = 100)

# ALTERNATIVE: out/bridge-dirreq-stats-no-relays
b <- read.csv("out/bridge-dirreq-stats-all-bridges",
  stringsAsFactors = FALSE)
b <- b[b$date >= "2011-07-01" & b$date <= "2012-09-30", ]

# Estimated fraction of directory requests that get reported to us,
# weighted by written directory bytes:
#   (ha * (na + nc) + (ha + hc) * nb) / ((ha + hc) * nabcd)
x <- data.frame(date = b$date,
  value = (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) /
  ((b$ha + b$hc) * b$nabcd))
x <- melt(x, id = "date")
ggplot(x, aes(x = as.Date(date), y = value)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
# `limits` spelled out; the original relied on partial matching of
# `limit`, which resolves to the same argument but is fragile.
scale_y_continuous(name = "", limits = c(0, 1), labels = percent)
ggsave("graphs/fraction.pdf", width = 6, height = 3, dpi = 100)

# Reported directory request responses, averaged over the day.
ggplot(b, aes(x = as.Date(date), y = (ra + rb) / 86400)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/responses.pdf", width = 6, height = 3, dpi = 72)

# Three-panel graph: reported responses, estimated reporting fraction,
# and responses extrapolated to the whole network.
x <- data.frame(
  date = as.Date(b$date),
  responses = (b$ra + b$rb) / 86400,
  fraction = (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) /
  ((b$ha + b$hc) * b$nabcd),
  totalresponses = ((b$ra + b$rb) * (b$ha + b$hc) *
  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 86400)
x <- melt(x, id = "date")
x <- data.frame(date = x$date, value = x$value, variable =
  ifelse(x$variable == "responses",
  "1. Reported directory\nrequests",
  ifelse(x$variable == "fraction", paste("2. Estimated fraction\n",
  "of bridges reporting\ndirectory requests", sep = ""),
  "3. Estimated directory\nrequests in the\nnetwork")))
ggplot(x, aes(x = as.Date(date), y = value)) +
geom_line() +
facet_grid(variable ~ ., scales = "free_y") +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/extrapolated-responses.pdf", width = 6, height = 5,
  dpi = 72)

# Unweighted reporting fraction, for comparison with the byte-weighted
# fraction above.
ggplot(b, aes(x = as.Date(date), y = (na + nb) / nabcd)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", limits = c(0, 1), labels = percent)
ggsave("graphs/fraction-unweighted.pdf", width = 6, height = 3, dpi = 72)

# Extrapolated total responses in the network, as a single graph.
x <- data.frame(date = b$date,
  #x1 = (b$ra + b$rb) * b$nabcd / (b$na + b$nb),
  x2 = ((b$ra + b$rb) * (b$ha + b$hc) *
  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb))
#x <- melt(x, id = "date")
ggplot(x, aes(x = as.Date(date), y = x2 / 86400)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/totalresponses.pdf", width = 6, height = 3, dpi = 72)

# Daily mean number of bridges per reporting category; nd is derived as
# the remainder of all running bridges.
n <- data.frame(date = b$date, na = b$na / 86400, nb = b$nb / 86400,
  nc = b$nc / 86400, nd = (b$nabcd - b$na - b$nb - b$nc) / 86400)
n <- melt(n, id = "date")
ggplot(n, aes(x = as.Date(date), y = value)) +
geom_line() +
facet_grid(variable ~ .) +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/n.pdf", width = 6, height = 7, dpi = 100)

# Mean written directory bytes per second, both categories combined.
h <- data.frame(date = b$date, value = (b$ha + b$hc) / 86400)
ggplot(h, aes(x = as.Date(date), y = value)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/history-bytes.pdf", width = 6, height = 3, dpi = 100)

# Mean written directory bytes per second, split by category.
h <- data.frame(date = b$date, ha = b$ha / 86400, hc = b$hc / 86400)
h <- melt(h, id = "date")
ggplot(h, aes(x = as.Date(date), y = value)) +
geom_line() +
facet_grid(variable ~ .) +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/h.pdf", width = 6, height = 5, dpi = 100)

# Mean reported responses per second, split by category.
r <- data.frame(date = b$date, ra = b$ra / 86400, rb = b$rb / 86400)
r <- melt(r, id = "date")
ggplot(r, aes(x = as.Date(date), y = value)) +
geom_line() +
facet_grid(variable ~ .) +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/r.pdf", width = 6, height = 5, dpi = 100)

# Estimated total users: extrapolated responses divided by 864000, i.e.
# 86400 seconds times what is presumably an assumed 10 directory
# requests per user and day -- TODO confirm against the report.
x <- data.frame(date = b$date,
  value = ((b$ra + b$rb) * (b$ha + b$hc) *
  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 864000,
  stringsAsFactors = FALSE)
x <- melt(x, id = "date")
ggplot(x, aes(x = as.Date(date), y = value)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/totalusers.pdf", width = 6, height = 3, dpi = 100)
# Zoom in on Q3 2012.
x <- x[x$date >= "2012-07-01", ]
#max_y <- max(x$value / 864000, na.rm = TRUE)
ggplot(x, aes(x = as.Date(date), y = value)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  breaks = date_breaks("1 month"),
  minor_breaks = date_breaks("1 week")) +
scale_y_continuous(name = "", #limit = c(0, max_y),
  labels = comma_format(digits = 1))
ggsave("graphs/totalusers-q3-2012.pdf", width = 6, height = 3, dpi = 100)

# Number of consensuses per day; the dashed line at 19.5 marks the
# threshold below which days are considered to have incomplete data.
ggplot(b, aes(x = as.Date(date), y = consensuses)) +
geom_point() +
geom_hline(yintercept = 19.5, linetype = 2) +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/consensuses.pdf", width = 6, height = 3, dpi = 100)

# Estimated users from Syria: total users scaled by the .sy IP fraction.
x <- data.frame(date = b$date,
  value = (b$sy * (b$ra + b$rb) * (b$ha + b$hc) *
  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb))
x <- melt(x, id = "date")
ggplot(x, aes(x = as.Date(date), y = value / 864000)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/syusers.pdf", width = 6, height = 3, dpi = 100)

# Old approach based on unique IP addresses, for comparison.
u <- read.csv("bridge-users.csv", stringsAsFactors = FALSE)
u <- u[u$date >= "2011-07-01" & u$date <= "2012-09-30", ]
u <- data.frame(date = u$date, all = u$all)
ggplot(u, aes(x = as.Date(date), y = all)) +
geom_line() +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/totalusers-oldapproach.pdf", width = 6, height = 3,
  dpi = 100)

# Compare old and new approach in one faceted graph. Re-uses u from
# above rather than re-reading bridge-users.csv; it still holds the
# filtered date and all columns.
u <- data.frame(date = u$date, value = u$all,
  variable = "old approach based on\nunique IP addresses",
  stringsAsFactors = FALSE)
x <- data.frame(date = b$date,
  value = ((b$ra + b$rb) * (b$ha + b$hc) *
  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 864000,
  variable = "new approach based on\ndirectory requests",
  stringsAsFactors = FALSE)
u <- rbind(u, x)
ggplot(u, aes(x = as.Date(date), y = value)) +
geom_line() +
facet_grid(variable ~ ., scales = "free_y") +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  minor_breaks = date_breaks("1 month")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/compare-totalusers.pdf", width = 6, height = 4,
  dpi = 100)
# Zoom in on Q3 2012.
u <- u[u$date >= "2012-07-01", ]
ggplot(u, aes(x = as.Date(date), y = value)) +
geom_line() +
facet_grid(variable ~ ., scales = "free_y") +
scale_x_date(name = "",
  labels = date_format("%b %Y"),
  breaks = date_breaks("1 month"),
  minor_breaks = date_breaks("1 week")) +
scale_y_continuous(name = "", labels = comma_format(digits = 1))
ggsave("graphs/compare-totalusers-q3-2012.pdf", width = 6, height = 4,
  dpi = 100)

diff --git a/task-5807/run.sh b/task-5807/run.sh
new file mode 100755
index 0000000..52d1ee7
--- /dev/null
+++ b/task-5807/run.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Compile and run the bridge dirreq stats evaluation in one step.
# Expects commons-codec, commons-compress, and metrics-lib
# (descriptor.jar) in lib/, and input descriptors under in/ -- see
# README. -Xmx6g because all parsed statistics are held in memory;
# `time` reports how long the evaluation took. The `&&` ensures the
# evaluation only runs if compilation succeeded.
javac -d bin/ -cp lib/commons-codec-1.4.jar:lib/commons-compress-1.3.jar:lib/descriptor.jar src/EvalBridgeDirreqStats.java && time java -Xmx6g -cp bin/:lib/commons-codec-1.4.jar:lib/commons-compress-1.3.jar:lib/descriptor.jar EvalBridgeDirreqStats

diff --git a/task-5807/src/EvalBridgeDirreqStats.java b/task-5807/src/EvalBridgeDirreqStats.java
new file mode 100644
index 0000000..c996a26
--- /dev/null
+++ b/task-5807/src/EvalBridgeDirreqStats.java
@@ -0,0 +1,603 @@
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TimeZone;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.torproject.descriptor.BridgeNetworkStatus;
+import org.torproject.descriptor.Descriptor;
+import org.torproject.descriptor.DescriptorFile;
+import org.torproject.descriptor.DescriptorReader;
+import org.torproject.descriptor.DescriptorSourceFactory;
+import org.torproject.descriptor.ExtraInfoDescriptor;
+import org.torproject.descriptor.NetworkStatusEntry;
+import org.torproject.descriptor.RelayNetworkStatusConsensus;
+
+/* Extract relevant pieces of information from relay consensuses and
+ * bridge descriptors to estimate daily bridge users. See README for
+ * usage instructions. */
+public class EvalBridgeDirreqStats {
+ public static void main(String[] args) throws Exception {
+
+ /* Parse relay consensuses from in/relay-descriptors/. Skip this step
+ * if in/relay-descriptors/ does not exist. */
+ File consensusesDirectory = new File("in/relay-descriptors");
+ File hashedFingerprintsFile = new File("out/hashed-fingerprints");
+ File consensusesPerDayFile = new File("out/consensuses-per-day");
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+ dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ if (consensusesDirectory.exists()) {
+ SortedSet<String> hashedFingerprints = new TreeSet<String>();
+ SortedMap<String, Integer> consensusesPerDay =
+ new TreeMap<String, Integer>();
+ DescriptorReader descriptorReader =
+ DescriptorSourceFactory.createDescriptorReader();
+ descriptorReader.addDirectory(consensusesDirectory);
+ Iterator<DescriptorFile> descriptorFiles =
+ descriptorReader.readDescriptors();
+ while (descriptorFiles.hasNext()) {
+ DescriptorFile descriptorFile = descriptorFiles.next();
+ for (Descriptor descriptor : descriptorFile.getDescriptors()) {
+ if (!(descriptor instanceof RelayNetworkStatusConsensus)) {
+ continue;
+ }
+ RelayNetworkStatusConsensus consensus =
+ (RelayNetworkStatusConsensus) descriptor;
+
+ /* Extract hashed fingerprints of all known relays to remove
+ * those fingerprints from bridge usage statistics later on. */
+ for (NetworkStatusEntry statusEntry :
+ consensus.getStatusEntries().values()) {
+ hashedFingerprints.add(Hex.encodeHexString(DigestUtils.sha(
+ Hex.decodeHex(statusEntry.getFingerprint().
+ toCharArray()))).toUpperCase());
+ }
+
+ /* Count the number of consensuses per day. */
+ String date = dateFormat.format(
+ consensus.getValidAfterMillis());
+ int consensuses = 1;
+ if (consensusesPerDay.containsKey(date)) {
+ consensuses += consensusesPerDay.get(date);
+ }
+ consensusesPerDay.put(date, consensuses);
+ }
+ }
+ hashedFingerprintsFile.getParentFile().mkdirs();
+ BufferedWriter bw = new BufferedWriter(new FileWriter(
+ hashedFingerprintsFile));
+ for (String hashedFingerprint : hashedFingerprints) {
+ bw.write(hashedFingerprint + "\n");
+ }
+ bw.close();
+ consensusesPerDayFile.getParentFile().mkdirs();
+ bw = new BufferedWriter(new FileWriter(consensusesPerDayFile));
+ for (Map.Entry<String, Integer> e : consensusesPerDay.entrySet()) {
+ bw.write(e.getKey() + "," + e.getValue() + "\n");
+ }
+ bw.close();
+ }
+
+ /* Parse bridge network statuses from in/bridge-descriptors/. Skip
+ * this step if in/bridge-descriptors/ does not exist. */
+ File bridgeDescriptorsDirectory = new File("in/bridge-descriptors");
+ File bridgesPerDayFile = new File("out/bridges-per-day");
+ File dirreqResponsesFile = new File("out/dirreq-responses");
+ File dirreqWriteHistoryFile = new File("out/dirreq-write-history");
+ File bridgeStatsUsersFile = new File("out/bridge-stats-users");
+ SimpleDateFormat dateTimeFormat = new SimpleDateFormat(
+ "yyyy-MM-dd HH:mm:ss");
+ dateTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ if (bridgeDescriptorsDirectory.exists()) {
+
+ /* Read hashed fingerprints from disk, so that we can include in the
+ * intermediate files whether a bridge was running as non-bridge
+ * relay before. */
+ SortedSet<String> hashedFingerprints = new TreeSet<String>();
+ String line;
+ BufferedReader br = new BufferedReader(new FileReader(
+ hashedFingerprintsFile));
+ while ((line = br.readLine()) != null) {
+ hashedFingerprints.add(line.toUpperCase());
+ }
+ br.close();
+
+ /* Prepare data structures for first collecting everything we parse.
+ * There may be duplicates which we can best remove in memory. */
+ SortedMap<String, List<Integer>> bridgesPerDay =
+ new TreeMap<String, List<Integer>>();
+ SortedSet<String> dirreqResponses = new TreeSet<String>();
+ SortedMap<String, SortedMap<Long, Long>> dirreqWriteHistory =
+ new TreeMap<String, SortedMap<Long, Long>>();
+ SortedSet<String> bridgeIps = new TreeSet<String>();
+
+ /* Parse everything in in/bridge-descriptors/. */
+ DescriptorReader descriptorReader =
+ DescriptorSourceFactory.createDescriptorReader();
+ descriptorReader.addDirectory(bridgeDescriptorsDirectory);
+ Iterator<DescriptorFile> descriptorFiles =
+ descriptorReader.readDescriptors();
+ while (descriptorFiles.hasNext()) {
+ DescriptorFile descriptorFile = descriptorFiles.next();
+ for (Descriptor descriptor : descriptorFile.getDescriptors()) {
+ if (descriptor instanceof BridgeNetworkStatus) {
+ BridgeNetworkStatus status = (BridgeNetworkStatus) descriptor;
+
+ /* Extract number of running bridges to calculate daily means.
+ * Skip network statuses where less than 1% of bridges have
+ * the Running flag. */
+ String date = dateFormat.format(status.getPublishedMillis());
+ int totalBridges = 0, runningBridges = 0;
+ for (NetworkStatusEntry statusEntry :
+ status.getStatusEntries().values()) {
+ totalBridges++;
+ if (statusEntry.getFlags().contains("Running")) {
+ runningBridges++;
+ }
+ }
+ if (runningBridges * 100 > totalBridges) {
+ if (!bridgesPerDay.containsKey(date)) {
+ bridgesPerDay.put(date, new ArrayList<Integer>());
+ }
+ bridgesPerDay.get(date).add(runningBridges);
+ }
+ } else if (descriptor instanceof ExtraInfoDescriptor) {
+ ExtraInfoDescriptor extraInfoDescriptor =
+ (ExtraInfoDescriptor) descriptor;
+ String fingerprint = extraInfoDescriptor.getFingerprint().
+ toUpperCase();
+ String wasSeenAsRelay = hashedFingerprints.contains(
+ fingerprint) ? "TRUE" : "FALSE";
+
+ /* Extract v3 directory request response numbers from dirreq
+ * stats, if available. */
+ if (extraInfoDescriptor.getDirreqStatsEndMillis() >= 0 &&
+ extraInfoDescriptor.getDirreqStatsIntervalLength()
+ == 86400 &&
+ extraInfoDescriptor.getDirreqV3Resp() != null &&
+ extraInfoDescriptor.getDirreqV3Resp().containsKey("ok")) {
+ String dirreqStatsEnd = dateTimeFormat.format(
+ extraInfoDescriptor.getDirreqStatsEndMillis());
+ SortedMap<String, Integer> resp =
+ extraInfoDescriptor.getDirreqV3Resp();
+ String ok = String.valueOf(resp.get("ok"));
+ String notEnoughSigs = resp.containsKey("not-enough-sigs")
+ ? String.valueOf(resp.get("not-enough-sigs")) : "NA";
+ String unavailable = resp.containsKey("unavailable")
+ ? String.valueOf(resp.get("unavailable")) : "NA";
+ String notFound = resp.containsKey("not-found")
+ ? String.valueOf(resp.get("not-found")) : "NA";
+ String notModified = resp.containsKey("not-modified")
+ ? String.valueOf(resp.get("not-modified")) : "NA";
+ String busy = resp.containsKey("busy")
+ ? String.valueOf(resp.get("busy")) : "NA";
+ dirreqResponses.add(String.format(
+ "%s,%s,%s,%s,%s,%s,%s,%s%n", dirreqStatsEnd,
+ fingerprint, wasSeenAsRelay, ok, notEnoughSigs,
+ unavailable, notFound, notModified, busy));
+ }
+
+ /* Extract written directory bytes, if available. */
+ if (extraInfoDescriptor.getDirreqWriteHistory() != null &&
+ extraInfoDescriptor.getDirreqWriteHistory().
+ getIntervalLength() == 900) {
+ if (!dirreqWriteHistory.containsKey(fingerprint)) {
+ dirreqWriteHistory.put(fingerprint,
+ new TreeMap<Long, Long>());
+ }
+ dirreqWriteHistory.get(fingerprint).putAll(
+ extraInfoDescriptor.getDirreqWriteHistory().
+ getBandwidthValues());
+ }
+
+ /* Sum up unique IP address counts from .sy and from all
+ * countries from bridge stats, if available. */
+ if (extraInfoDescriptor.getBridgeStatsEndMillis() >= 0 &&
+ extraInfoDescriptor.getBridgeStatsIntervalLength()
+ == 86400 &&
+ extraInfoDescriptor.getBridgeIps() != null) {
+ String bridgeStatsEnd = dateTimeFormat.format(
+ extraInfoDescriptor.getBridgeStatsEndMillis());
+ int sy = 0, all = 0;
+ for (Map.Entry<String, Integer> e :
+ extraInfoDescriptor.getBridgeIps().entrySet()) {
+ String country = e.getKey();
+ int adjustedIps = e.getValue() - 4;
+ if (country.equals("sy")) {
+ sy = adjustedIps;
+ }
+ all += adjustedIps;
+ }
+ bridgeIps.add(String.format("%s,%s,%s,%d,%d%n",
+ bridgeStatsEnd, fingerprint, wasSeenAsRelay, sy, all));
+ }
+ }
+ }
+ }
+
+ /* Write to disk what we learned while parsing bridge extra-info
+ * descriptors. */
+ bridgesPerDayFile.getParentFile().mkdirs();
+ BufferedWriter bw = new BufferedWriter(new FileWriter(
+ bridgesPerDayFile));
+ for (Map.Entry<String, List<Integer>> e :
+ bridgesPerDay.entrySet()) {
+ String date = e.getKey();
+ List<Integer> bridges = e.getValue();
+ int sum = 0;
+ for (int b : bridges) {
+ sum += b;
+ }
+ bw.write(String.format("%s,%d%n", date, sum / bridges.size()));
+ }
+ bw.close();
+ dirreqResponsesFile.getParentFile().mkdirs();
+ bw = new BufferedWriter(new FileWriter(dirreqResponsesFile));
+ for (String resp : dirreqResponses) {
+ bw.write(resp);
+ }
+ bw.close();
+ bridgeStatsUsersFile.getParentFile().mkdirs();
+ bw = new BufferedWriter(new FileWriter(bridgeStatsUsersFile));
+ for (String ips : bridgeIps) {
+ bw.write(ips);
+ }
+ bw.close();
+ bw = new BufferedWriter(new FileWriter(dirreqWriteHistoryFile));
+ for (Map.Entry<String, SortedMap<Long, Long>> e :
+ dirreqWriteHistory.entrySet()) {
+ String fingerprint = e.getKey();
+ String wasSeenAsRelay = hashedFingerprints.contains(
+ fingerprint) ? "TRUE" : "FALSE";
+ for (Map.Entry<Long, Long> f : e.getValue().entrySet()) {
+ String historyIntervalEnd = dateTimeFormat.format(f.getKey());
+ bw.write(String.format("%s,%s,%d,%s%n", fingerprint,
+ historyIntervalEnd, f.getValue(), wasSeenAsRelay));
+ }
+ }
+ bw.close();
+ }
+
+ /* Aggregate the parse results from above and write relevant data for
+ * estimating daily bridge users to disk. Write results to
+ * out/bridge-dirreq-stats. This step is distinct from the parsing
+ * steps, so that the parsing only has to be done once, whereas the
+ * aggregation can be tweaked and re-run easily. */
+ File bridgeDirreqStatsNoRelaysFile =
+ new File("out/bridge-dirreq-stats-no-relays");
+ File bridgeDirreqStatsAllBridgesFile =
+ new File("out/bridge-dirreq-stats-all-bridges");
+ if (bridgesPerDayFile.exists() &&
+ dirreqResponsesFile.exists() &&
+ bridgeStatsUsersFile.exists() &&
+ dirreqWriteHistoryFile.exists() &&
+ consensusesPerDayFile.exists()) {
+
+ /* Run the aggregation twice, once for all bridges and once for only
+ * bridges which haven't been seen as non-bridge relays before. */
+ boolean[] exclude = new boolean[] { true, false };
+ File[] outFiles = new File[] { bridgeDirreqStatsNoRelaysFile,
+ bridgeDirreqStatsAllBridgesFile };
+ for (int r = 0; r < 2; r++) {
+ boolean excludeHashedFingerprints = exclude[r];
+ File outFile = outFiles[r];
+
+ /* Read parse results back to memory. */
+ SortedMap<String, Integer> bridgesPerDay =
+ new TreeMap<String, Integer>();
+ BufferedReader br = new BufferedReader(new FileReader(
+ bridgesPerDayFile));
+ String line;
+ while ((line = br.readLine()) != null) {
+ String[] parts = line.split(",");
+ bridgesPerDay.put(parts[0], Integer.parseInt(parts[1]));
+ }
+ br.close();
+ SortedMap<String, SortedMap<Long, Long>> dirreqOkResponses =
+ new TreeMap<String, SortedMap<Long, Long>>();
+ br = new BufferedReader(new FileReader(dirreqResponsesFile));
+ while ((line = br.readLine()) != null) {
+ String[] parts = line.split(",");
+ if (excludeHashedFingerprints && parts[2].equals("TRUE")) {
+ /* Skip, because this bridge has been seen as relay before. */
+ continue;
+ }
+ String fingerprint = parts[1].toUpperCase();
+ long dirreqStatsEndMillis = dateTimeFormat.parse(parts[0]).
+ getTime();
+ long ok = Long.parseLong(parts[3]);
+ if (!dirreqOkResponses.containsKey(fingerprint)) {
+ dirreqOkResponses.put(fingerprint, new TreeMap<Long, Long>());
+ }
+ dirreqOkResponses.get(fingerprint).put(dirreqStatsEndMillis,
+ ok);
+ }
+ br.close();
+ SortedMap<String, long[]> ipsPerDay =
+ new TreeMap<String, long[]>();
+ br = new BufferedReader(new FileReader(bridgeStatsUsersFile));
+ while ((line = br.readLine()) != null) {
+ String[] parts = line.split(",");
+ if (excludeHashedFingerprints && parts[2].equals("TRUE")) {
+ /* Skip, because this bridge has been seen as relay before. */
+ continue;
+ }
+ long bridgeStatsEndMillis = dateTimeFormat.parse(parts[0]).
+ getTime();
+ long bridgeStatsStartMillis = bridgeStatsEndMillis - 86400000L;
+ long currentStartMillis = bridgeStatsStartMillis;
+
+ /* Find UTC date break in the interval and make sure that we
+ * distribute IPs to the two days correctly. */
+ String[] dates = new String[] {
+ dateFormat.format(bridgeStatsStartMillis),
+ dateFormat.format(bridgeStatsEndMillis) };
+ long[] seconds = new long[2];
+ if (!dates[0].equals(dates[1])) {
+ long dateBreakMillis = (bridgeStatsEndMillis / 86400000L)
+ * 86400000L;
+ seconds[0] = (dateBreakMillis - bridgeStatsStartMillis)
+ / 1000L;
+ bridgeStatsStartMillis = dateBreakMillis;
+ }
+ seconds[1] = (bridgeStatsEndMillis - bridgeStatsStartMillis)
+ / 1000L;
+
+ /* Update per-day counters. */
+ for (int i = 0; i < dates.length; i++) {
+ String date = dates[i];
+ long sy = seconds[i] * Long.parseLong(parts[3]);
+ long all = seconds[i] * Long.parseLong(parts[4]);
+ if (!ipsPerDay.containsKey(date)) {
+ ipsPerDay.put(date, new long[] { 0L, 0L });
+ }
+ ipsPerDay.get(date)[0] += sy;
+ ipsPerDay.get(date)[1] += all;
+ }
+ }
+ br.close();
+ SortedMap<String, Integer> consensusesPerDay =
+ new TreeMap<String, Integer>();
+ br = new BufferedReader(new FileReader(consensusesPerDayFile));
+ while ((line = br.readLine()) != null) {
+ String[] parts = line.split(",");
+ consensusesPerDay.put(parts[0], Integer.parseInt(parts[1]));
+ }
+ br.close();
+ br = new BufferedReader(new FileReader(dirreqWriteHistoryFile));
+ SortedMap<String, SortedMap<Long, Long>> dirreqWriteHistory =
+ new TreeMap<String, SortedMap<Long, Long>>();
+ while ((line = br.readLine()) != null) {
+ String[] parts = line.split(",");
+ if (excludeHashedFingerprints && parts[3].equals("TRUE")) {
+ /* Skip, because this bridge has been seen as relay before. */
+ continue;
+ }
+ String fingerprint = parts[0].toUpperCase();
+ long historyIntervalEndMillis = dateTimeFormat.parse(parts[1]).
+ getTime();
+ long writtenBytes = Long.parseLong(parts[2]);
+ if (!dirreqWriteHistory.containsKey(fingerprint)) {
+ dirreqWriteHistory.put(fingerprint, new TreeMap<Long, Long>());
+ }
+ dirreqWriteHistory.get(fingerprint).put(historyIntervalEndMillis,
+ writtenBytes);
+ }
+ br.close();
+
+ /* For every day, count reported v3 directory request responses,
+ * reported written directory bytes, and reporting bridges.
+ * Distinguish between bridges reporting both responses and bytes,
+ * bridges reporting only responses, and bridges reporting. Map
+ * keys are dates, map values are the number of responses, bytes,
+ * or bridges. */
+ SortedMap<String, Long>
+ responsesReportingBoth = new TreeMap<String, Long>(),
+ responsesNotReportingBytes = new TreeMap<String, Long>(),
+ bytesReportingBoth = new TreeMap<String, Long>(),
+ bytesNotReportingResponses = new TreeMap<String, Long>(),
+ bridgesReportingBoth = new TreeMap<String, Long>(),
+ bridgesNotReportingBytes = new TreeMap<String, Long>(),
+ bridgesNotReportingResponses = new TreeMap<String, Long>();
+
+ /* Consider each bridge separately. */
+ SortedSet<String> allFingerprints = new TreeSet<String>();
+ allFingerprints.addAll(dirreqOkResponses.keySet());
+ allFingerprints.addAll(dirreqWriteHistory.keySet());
+ for (String fingerprint : allFingerprints) {
+
+ /* Obtain iterators over dirreq stats intervals and dirreq write
+ * history intervals, from oldest to newest. Either iterator
+ * may contain zero elements if the bridge did not report any
+ * values, but not both. */
+ SortedMap<Long, Long> bridgeDirreqOkResponses =
+ dirreqOkResponses.containsKey(fingerprint) ?
+ dirreqOkResponses.get(fingerprint) :
+ new TreeMap<Long, Long>();
+ SortedMap<Long, Long> bridgeDirreqWriteHistory =
+ dirreqWriteHistory.containsKey(fingerprint) ?
+ dirreqWriteHistory.get(fingerprint) :
+ new TreeMap<Long, Long>();
+ Iterator<Long> responsesIterator =
+ bridgeDirreqOkResponses.keySet().iterator();
+ Iterator<Long> historyIterator =
+ bridgeDirreqWriteHistory.keySet().iterator();
+
+ /* Keep references to the currently considered intervals. */
+ long responseEndMillis = responsesIterator.hasNext() ?
+ responsesIterator.next() : Long.MAX_VALUE;
+ long historyEndMillis = historyIterator.hasNext() ?
+ historyIterator.next() : Long.MAX_VALUE;
+
+ /* Keep the time until when we have processed statistics. */
+ long currentStartMillis = 0L;
+
+ /* Iterate over both responses and byte histories until we set
+ * both to Long.MAX_VALUE, indicating that there are no further
+ * values. */
+ while (responseEndMillis < Long.MAX_VALUE ||
+ historyEndMillis < Long.MAX_VALUE) {
+
+ /* Dirreq-stats intervals are guaranteed to be 24 hours long,
+ * and dirreq-write-history intervals are 15 minutes long.
+ * This is guaranteed in the parsing code above. It allows us
+ * to calculate interval starts. Also, if we have already
+ * processed part of an interval, move the considered interval
+ * start accordingly. */
+ long historyStartMillis = Math.max(currentStartMillis,
+ historyEndMillis - 900000L);
+ long responseStartMillis = Math.max(currentStartMillis,
+ responseEndMillis - 86400000L);
+
+ /* Determine start and end time of the next interval, and
+ * whether the bridge reported dirreq-stats in that interval,
+ * or dirreq histories, or both. */
+ long currentEndMillis;
+ boolean addHistory = false, addResponses = false;
+ if (historyStartMillis < responseStartMillis) {
+ currentStartMillis = historyStartMillis;
+ currentEndMillis = Math.min(historyEndMillis,
+ responseStartMillis);
+ addHistory = true;
+ } else if (responseStartMillis < historyStartMillis) {
+ currentStartMillis = responseStartMillis;
+ currentEndMillis = Math.min(historyStartMillis,
+ responseEndMillis);
+ addResponses = true;
+ } else {
+ currentStartMillis = historyStartMillis;
+ currentEndMillis = Math.min(historyEndMillis,
+ responseEndMillis);
+ addHistory = true;
+ addResponses = true;
+ }
+
+ /* Depending on which statistics the bridge reported in the
+ * determined interval, obtain the number of bytes or
+ * responses to add. */
+ long bytesInInterval = 0L, responsesInInterval = 0L;
+ if (addHistory) {
+ bytesInInterval = bridgeDirreqWriteHistory.
+ get(historyEndMillis);
+ }
+ if (addResponses) {
+ responsesInInterval = bridgeDirreqOkResponses.
+ get(responseEndMillis);
+ }
+
+ /* Find out if there is a UTC date break in the interval to be
+ * added. If there is, make sure that we distribute responses
+ * and bytes to the two days correctly. */
+ String[] dates = new String[] {
+ dateFormat.format(currentStartMillis),
+ dateFormat.format(currentEndMillis) };
+ long[] seconds = new long[2];
+ if (!dates[0].equals(dates[1])) {
+ long dateBreakMillis = (currentEndMillis / 86400000L)
+ * 86400000L;
+ seconds[0] = (dateBreakMillis - currentStartMillis) / 1000L;
+ currentStartMillis = dateBreakMillis;
+ }
+ seconds[1] = (currentEndMillis - currentStartMillis) / 1000L;
+
+ /* Update per-day counters. */
+ for (int i = 0; i < dates.length; i++) {
+ String date = dates[i];
+ long bytes = seconds[i] * bytesInInterval;
+ long responses = seconds[i] * responsesInInterval;
+ if (!bytesReportingBoth.containsKey(date)) {
+ bytesReportingBoth.put(date, 0L);
+ bytesNotReportingResponses.put(date, 0L);
+ responsesReportingBoth.put(date, 0L);
+ responsesNotReportingBytes.put(date, 0L);
+ bridgesReportingBoth.put(date, 0L);
+ bridgesNotReportingBytes.put(date, 0L);
+ bridgesNotReportingResponses.put(date, 0L);
+ }
+ if (addHistory) {
+ if (addResponses) {
+ bytesReportingBoth.put(date,
+ bytesReportingBoth.get(date) + bytes);
+ responsesReportingBoth.put(date,
+ responsesReportingBoth.get(date) + responses);
+ bridgesReportingBoth.put(date,
+ bridgesReportingBoth.get(date) + seconds[i]);
+ } else {
+ bytesNotReportingResponses.put(date,
+ bytesNotReportingResponses.get(date) + bytes);
+ bridgesNotReportingResponses.put(date,
+ bridgesNotReportingResponses.get(date)
+ + seconds[i]);
+ }
+ } else if (addResponses) {
+ responsesNotReportingBytes.put(date,
+ responsesNotReportingBytes.get(date) + responses);
+ bridgesNotReportingBytes.put(date,
+ bridgesNotReportingBytes.get(date) + seconds[i]);
+ }
+ }
+
+ /* Move next interval start to the current interval end, and
+ * possibly move to the next stats intervals. If we have run
+ * out of intervals in either or both of the sets, change the
+ * reference to Long.MAX_VALUE to add the other intervals and
+ * finally exit the loop. */
+ currentStartMillis = currentEndMillis;
+ if (historyEndMillis <= currentStartMillis) {
+ historyEndMillis = historyIterator.hasNext() ?
+ historyIterator.next() : Long.MAX_VALUE;
+ }
+ if (responseEndMillis <= currentStartMillis) {
+ responseEndMillis = responsesIterator.hasNext() ?
+ responsesIterator.next() : Long.MAX_VALUE;
+ }
+ }
+ }
+
+ /* Put together what we learned about bridge usage per day. */
+ outFile.getParentFile().mkdirs();
+ BufferedWriter bw = new BufferedWriter(new FileWriter(outFile));
+ bw.write("date,nabcd,sy,consensuses,ha,hc,ra,rb,na,nb,nc\n");
+ for (String date : bytesReportingBoth.keySet()) {
+ String bridges = "NA";
+ if (bridgesPerDay.containsKey(date)) {
+ bridges = String.valueOf(bridgesPerDay.get(date) * 86400L);
+ }
+ String sy = "NA";
+ if (ipsPerDay.containsKey(date)) {
+ long[] ips = ipsPerDay.get(date);
+ sy = String.format("%.5f", ((double) ips[0])
+ / ((double) ips[1]));
+ }
+ String consensuses = "NA";
+ if (consensusesPerDay.containsKey(date)) {
+ consensuses = String.valueOf(consensusesPerDay.get(date));
+ }
+ bw.write(String.format("%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d%n",
+ date, bridges, sy, consensuses,
+ bytesReportingBoth.get(date),
+ bytesNotReportingResponses.get(date),
+ responsesReportingBoth.get(date),
+ responsesNotReportingBytes.get(date),
+ bridgesReportingBoth.get(date),
+ bridgesNotReportingBytes.get(date),
+ bridgesNotReportingResponses.get(date)));
+ }
+ bw.close();
+ }
+ }
+ }
+}
+
More information about the tor-commits
mailing list