[or-cvs] [metrics/master] Update parsing scripts for client requests to directories.
karsten at seul.org
karsten at seul.org
Wed Jul 1 18:15:44 UTC 2009
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Wed, 1 Jul 2009 19:20:07 +0200
Subject: Update parsing scripts for client requests to directories.
Commit: 88f7c49cd3ded6b50ef58493fd8f5122efe9f769
---
HOWTO | 6 +-
scripts/dirreq/dirreq-censored.R | 39 ---
scripts/dirreq/dirreq.R | 33 --
.../metrics/dirreq/ParseDirectoryRequests.java | 353 --------------------
.../torproject/metrics/dirreq/ParseGeoipStats.java | 234 +++++++++++++
5 files changed, 237 insertions(+), 428 deletions(-)
delete mode 100644 scripts/dirreq/dirreq-censored.R
delete mode 100644 scripts/dirreq/dirreq.R
delete mode 100644 src/org/torproject/metrics/dirreq/ParseDirectoryRequests.java
create mode 100644 src/org/torproject/metrics/dirreq/ParseGeoipStats.java
diff --git a/HOWTO b/HOWTO
index b49f228..94fd802 100644
--- a/HOWTO
+++ b/HOWTO
@@ -181,12 +181,12 @@ $ javac -d bin/ -cp src/:lib/* src/org/torproject/metrics/dirreq/*.java
Run the parsing script:
-$ java -cp bin/:lib/* org.torproject.metrics.dirreq.ParseDirectoryRequests
- data/dirreq/ out/dirreq/ 168 0
+$ java -cp bin/:lib/* org.torproject.metrics.dirreq.ParseGeoipStats
+ data/geoipstats/ out/geoipstats/
$ mkdir report/
$ mkdir report/dirreq/
-$ R -q --no-save < scripts/dirreq/dirreq.R
+$ R -q --no-save < scripts/dirreq/geoipstats.R
3 Bridge archives
diff --git a/scripts/dirreq/dirreq-censored.R b/scripts/dirreq/dirreq-censored.R
deleted file mode 100644
index dcbc755..0000000
--- a/scripts/dirreq/dirreq-censored.R
+++ /dev/null
@@ -1,39 +0,0 @@
-a <- read.csv("out/dirreq/moria-dir64.log-req.csv")
-b <- read.csv("out/dirreq/moria-dir128.log-req.csv")
-c <- read.csv("out/dirreq/gabelmoo-dir512.log-req.csv")
-a <- a/1e3
-b <- b/1e3
-c <- c/1e3
-a19 <- c(a$cnt, a$rut, a$egt, a$vnt, a$sat, a$irt, a$mat, a$jot, a$pkt, a$byt, a$kzt, a$syt, a$aet, a$sdt, a$uzt, a$azt, a$yet)
-b19 <- c(b$cnt, b$rut, b$egt, b$vnt, b$sat, b$irt, b$mat, b$jot, b$pkt, b$byt, b$kzt, b$syt, b$aet, b$sdt, b$uzt, b$azt, b$yet)
-c19 <- c(c$cnt, c$rut, c$egt, c$vnt, c$sat, c$irt, c$mat, c$jot, c$pkt, c$byt, c$kzt, c$syt, c$aet, c$sdt, c$uzt, c$azt, c$yet)
-print(a19)
-print(b19)
-print(c19)
-m <- matrix(rev(c(c19, b19, a19)), nrow=17, ncol=3, byrow=FALSE)
-pdf("report/dirreq/dirreq-censored.pdf", width=8, height=6)
-oldpar <- par(mar=c(2.1, 3.9, 1.4, 4.9))
-barplot(m, col = c("orange", "red", "purple", "darkgreen", "red", "yellow", "blue"), ylab = "Requests for network statuses seen in 1 week [in K]", main = "Requests to directory caches by country", border = "white", names.arg = c("Directory with 64 KB/s", "Directory with 128 KB/s", "Directory with 512 KB/s"))
-mtext("China", side=4, las=1, at=12.6) #(cn)
-mtext("Russia", side=4, las=1, at=4.65) #(ru)
-mtext("Egypt", side=4, las=1, at=2.78) #(eg)
-mtext("Viet Nam", side=4, las=1, at=2.13) #(vn)
-mtext("Saudi Arabia", side=4, las=1, at=1.55) #(sa)
-mtext("Iran", side=4, las=1, at=1.0) #(ir)
-#mtext("Kazakhstan", side=4, las=1, at=1000) #(kz)
-#mtext("Belarus", side=4, las=1, at=1659) #(by)
-#mtext("Pakistan", side=4, las=1, at=1459) #(pk)
-#mtext("Jordan", side=4, las=1, at=1259) #(jo)
-#mtext("Syria", side=4, las=1, at=1059) #(sy)
-#mtext("U.A.E.", side=4, las=1, at=859) #(ae)
-#mtext("Uzbekistan", side=4, las=1, at=659) #(uz)
-#mtext("Yemen", side=4, las=1, at=459) #(ye)
-#mtext("Azerbaijan", side=4, las=1, at=259) #(az)
-#mtext("Egypt", side=4, las=1, at=59) #(eg)
-##mtext("Myanmar", side=4, las=1, at=59) #(mm)
-##mtext("Morocco", side=4, las=1, at=59) #(ma)
-##mtext("Sudan", side=4, las=1, at=59) #(sd)
-##mtext("Tunisia", side=4, las=1, at=59) #(tn)
-par(oldpar)
-dev.off();
-
diff --git a/scripts/dirreq/dirreq.R b/scripts/dirreq/dirreq.R
deleted file mode 100644
index a450af6..0000000
--- a/scripts/dirreq/dirreq.R
+++ /dev/null
@@ -1,33 +0,0 @@
-a <- read.csv("out/dirreq/moria-dir64.log-req.csv")
-b <- read.csv("out/dirreq/moria-dir128.log-req.csv")
-c <- read.csv("out/dirreq/gabelmoo-dir512.log-req.csv")
-a <- a/1e3
-b <- b/1e3
-c <- c/1e3
-sort(apply(a[,seq(7,466,3)], 2, mean), decreasing = TRUE)[1:15]
-sort(apply(b[,seq(7,478,3)], 2, mean), decreasing = TRUE)[1:15]
-sort(apply(c[,seq(7,523,3)], 2, mean), decreasing = TRUE)[1:15]
-asum <- sum(sort(apply(a[,seq(7,466,3)], 2, mean), decreasing = TRUE))
-bsum <- sum(sort(apply(b[,seq(7,478,3)], 2, mean), decreasing = TRUE))
-csum <- sum(sort(apply(c[,seq(7,523,3)], 2, mean), decreasing = TRUE))
-atop10 <- c(a$ust, a$det, a$cnt, a$itt, a$krt, a$gbt, a$frt, a$rut, a$cat, a$jpt)
-btop10 <- c(b$ust, b$det, b$cnt, b$itt, b$krt, b$gbt, b$frt, b$rut, b$cat, b$jpt)
-ctop10 <- c(c$ust, c$det, c$cnt, c$itt, c$krt, c$gbt, c$frt, c$rut, c$cat, c$jpt)
-m <- matrix(rev(c(ctop10, csum - sum(ctop10), btop10, bsum - sum(btop10), atop10, asum - sum(atop10))), nrow=11, ncol=3, byrow=FALSE)
-pdf("report/dirreq/dirreq.pdf", width=8, height=6)
-oldpar <- par(mar=c(2.1, 3.9, 1.4, 4.9))
-barplot(m, col = c("orange", "red", "purple", "darkgreen", "red", "yellow", "blue"), ylab = "Requests for network statuses seen in 1 week [in K]", main = "Requests to directory caches by country", border = "white", names.arg = c("Directory with 64 KB/s", "Directory with 128 KB/s", "Directory with 512 KB/s"))
-mtext("U.S.A.", side=4, las=1, at=93.5)
-mtext("Germany", side=4, las=1, at=75.5)
-mtext("China", side=4, las=1, at=61)
-mtext("Italy", side=4, las=1, at=52)
-mtext("South Korea", side=4, las=1, at=48)
-mtext("U.K.", side=4, las=1, at=44.2)
-mtext("France", side=4, las=1, at=41)
-mtext("Russia", side=4, las=1, at=38.1)
-mtext("Canada", side=4, las=1, at=35.3)
-mtext("Japan", side=4, las=1, at=32.5)
-mtext("Others", side=4, las=1, at=15.5)
-par(oldpar)
-dev.off();
-
diff --git a/src/org/torproject/metrics/dirreq/ParseDirectoryRequests.java b/src/org/torproject/metrics/dirreq/ParseDirectoryRequests.java
deleted file mode 100644
index 76e8831..0000000
--- a/src/org/torproject/metrics/dirreq/ParseDirectoryRequests.java
+++ /dev/null
@@ -1,353 +0,0 @@
-/* Copyright 2009 Karsten Loesing
- * See LICENSE for licensing information */
-package org.torproject.metrics.dirreq;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.text.ParsePosition;
-import java.text.SimpleDateFormat;
-import java.util.Calendar;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.SortedSet;
-import java.util.TimeZone;
-import java.util.TreeMap;
-import java.util.TreeSet;
-
-import com.maxmind.geoip.LookupService;
-
-public final class ParseDirectoryRequests {
-
- private ParseDirectoryRequests() {
- }
-
- public static void main(final String[] args) throws Exception {
-
- // check input parameters
- if (args.length < 4) {
- System.err.println("Usage: java "
- + ParseDirectoryRequests.class.getSimpleName()
- + " <input directory> <output directory> "
- + "<unique interval length> <accumulate unique IPs>");
- System.exit(1);
- }
- File inputDirectory = new File(args[0]);
- if (!inputDirectory.exists() || !inputDirectory.isDirectory()) {
- System.err.println("Input directory '"
- + inputDirectory.getAbsolutePath()
- + "' does not exist or is not a directory.");
- System.exit(1);
- }
- File outputDirectory = new File(args[1]);
- if (outputDirectory.exists() && !outputDirectory.isDirectory()) {
- System.err.println("Output directory '"
- + outputDirectory.getAbsolutePath()
- + "' exists, but is not a directory.");
- System.exit(1);
- }
- outputDirectory.mkdir();
- long uniqueIntervalLength = Long.parseLong(args[2])
- * 60L * 60L * 1000L;
- boolean accumulate = Integer.parseInt(args[3]) != 0;
-
- long started = System.currentTimeMillis();
-
- String dbfile2 = "res/GeoIP.dat";
- LookupService cl = new LookupService(dbfile2,
- LookupService.GEOIP_MEMORY_CACHE);
-
- // parse input files
- for (File inputFile : inputDirectory.listFiles()) {
-
- // this is a terrible hack! but it works for our data..
- String timeZone = "Europe/Berlin";
- if (inputFile.getName().startsWith("moria")) {
- timeZone = "US/Eastern";
- }
- System.out.println("Parsing " + inputFile.getName()
- + " with timezone " + timeZone);
-
- // Tor's logs don't contain years. use the current year
- // instead. this is a pretty bad hack, but it works. just
- // make sure that logs don't cross year boundaries!!
- int currentYear = Calendar.getInstance().get(Calendar.YEAR);
-
- // only consider events in a fixed interval. of course, this
- // depends on the parsed data, so change it with the data!
- long intervalBegin = 1233777600000L, intervalEnd = 1234382400000L;
-
- // prepare for collection of unique addresses by country and total
- // requests
- SortedMap<Long, SortedMap<String, Map<String, int[]>>>
- allRequests = new TreeMap<Long, SortedMap<String,
- Map<String, int[]>>>();
- SortedMap<String, Map<String, int[]>> uniqueIPs =
- new TreeMap<String, Map<String, int[]>>();
- Set<String> ipsSeenSoFar = new HashSet<String>();
- long currentInterval = intervalBegin;
- SortedSet<String> allCountries = new TreeSet<String>();
-
- // prepare parsing
- Calendar c = Calendar.getInstance();
- SimpleDateFormat timeFormat =
- new SimpleDateFormat("MMM dd HH:mm:ss.SSS");
- timeFormat.setTimeZone(TimeZone.getTimeZone(timeZone));
-
- // parse input file
- BufferedReader br = new BufferedReader(new FileReader(
- inputFile));
- String line = null;
- long timestamp = -1L;
- while ((line = br.readLine()) != null) {
-
- // parse timestamp
- Date logTime = timeFormat.parse(line.substring(0, 19),
- new ParsePosition(0));
- c.setTimeInMillis(logTime.getTime());
- c.set(Calendar.YEAR, currentYear);
- timestamp = c.getTimeInMillis();
-
- // if this event happened before the considered interval, move
- // on
- if (timestamp < intervalBegin) {
- continue;
- }
-
- // check if we should evaluate the current interval now
- // (this approach requires that we have at least 1 event
- // after the last considered interval, that we don't
- // evaluate -- this is a tiny hack which fits our data,
- // though.)
- if (timestamp > currentInterval + uniqueIntervalLength) {
- // save collected data for later evaluation
- allRequests.put(currentInterval, uniqueIPs);
- // go on with next interval
- uniqueIPs = new TreeMap<String, Map<String, int[]>>();
- currentInterval += uniqueIntervalLength;
- }
-
- // if this event happened after the considered interval, move on
- if (timestamp > intervalEnd) {
- continue;
- }
-
- // parse the rest
- String[] split = line.substring(line.indexOf(
- "GET request by client ") + 22).split(" ");
- String address = split[0].replace("'", "");
-
- // look up in geoIP database
- String country = cl.getCountry(address).getCode()
- .toLowerCase();
- String url = split[3].replace("'", "");
- url = url.substring(0, url.length() - 1);
-
- // don't distinguish between compressed and non-compressed
- if (url.endsWith(".z")) {
- url = url.substring(0, url.length() - 2);
- }
- //String type = "unknown";
- int countAsVersion = -1;
-
- // version 1 dir protocol
- if (url.equals("/tor/")) {
- // full directory -- however, these requests
- // are probably not tor clients, but crawlers;
- // tor rewrites all requests for /<x> to /tor/<x>,
- // so that these requests could also be for /.
- // also, v1 tor clients would ask for /tor/dir.z,
- // not /tor/.
- } else if (url.equals("/tor/dir")) {
- // fetch compressed full directory (rather: dir.z)
- //type = "v1status";
- } else if (url.equals("/tor/running-routers")) {
- // fetch network-status descriptor
- // (do we want to count all three requests? what do
- // clients request typically?)
- //type = "v1status";
-
- // version 2 network status
- } else if (url.equals("/tor/status/all")) {
- // network-status documents from all known authorities
- //type = "v2status";
- countAsVersion = 2;
- } else if (url.equals("/tor/status/authority")) {
- // network-status document by this authority
- //type = "v2status";
- countAsVersion = 2;
- } else if (url.startsWith("/tor/status/fp/")) {
- // network-status document(s) by identity fingerprint
- // (unfortunately, we didn't preserve all fingerprints,
- // but only the first)
- //type = "v2status";
- countAsVersion = 2;
-
- // version 3 network-status consensus, votes, and certificates
- } else if (url.equals("/tor/status-vote/current/consensus")) {
- // current network-status consensus
- //type = "v3status";
- countAsVersion = 3;
- } else if (url.startsWith(
- "/tor/status-vote/current/consensus/")) {
- // current network-status consensus, created by the
- // authorities the client trusts (unfortunately, we didn't
- // preserve all identities, but only the first;
- // otherwise, we might re-construct which versions clients
- // are using)
- //type = "v3status";
- countAsVersion = 3;
-
- } else if (url.startsWith("/tor/status-vote/current/")
- && !url.startsWith(
- "/tor/status-vote/current/consensus")) {
- // other documents used in the v3 directory protocol in the
- // current voting period, e.g., votes, signatures.
- //type = "v3other";
- } else if (url.startsWith("/tor/status-vote/next/")) {
- // documents used in the v3 directory protocol in the next
- // voting period, e.g., consensus, votes, signatures.
- //type = "v3other";
- } else if (url.equals("/tor/keys/fp")) {
- // _empty_ list of key certificates? this is a bug that
- // DoSes the authorities! there were more than only 1 IPs
- // requesting this URL; is there a pattern?
- // are these clients (unlikely, because they wouldn't work
- // and be so persistent) are relays?
- } else if (url.startsWith("/tor/keys/")
- && !url.equals("/tor/keys/fp")) {
- // list of key certificates
- //type = "v3other";
-
- // router descriptors
- } else if (url.equals("/tor/server/all")) {
- // all router descriptors (which versions request such a
- // thing?!)
- //type = "router";
- } else if (url.equals("/tor/server/authority")) {
- // router descriptor of this relay,
- // mainly requested for debugging purposes and self test
- //type = "router";
- } else if (url.startsWith("/tor/server/d/")) {
- // router descriptor by descriptor identifier
- //type = "router";
- } else if (url.startsWith("/tor/server/fp/")) {
- // router descriptor by router identity (should be avoided)
- //type = "router";
-
- // extra-info documents
- } else if (url.equals("/tor/extra/all")) {
- // all extra-info documents
- //type = "extra";
- } else if (url.equals("/tor/extra/authority")) {
- // extra-info document of this relay
- //type = "extra";
- } else if (url.startsWith("/tor/extra/d/")) {
- // extra-info document by identifier
- //type = "extra";
- } else if (url.startsWith("/tor/extra/fp/")) {
- // extra-info document by router identity
- //type = "extra";
- }
-
- // should this request be considered for evaluation?
- if (countAsVersion > 0) {
- /*if (bridgeIPs.contains(address))
- bridgeRequests++;
- else
- nonBridgeRequests++;*/
- // consider IP address for 24 hour interval
- if (!accumulate || !ipsSeenSoFar.contains(address)) {
- ipsSeenSoFar.add(address);
- if (uniqueIPs.containsKey(country)) {
- Map<String, int[]> ips = uniqueIPs.get(country);
- if (ips.containsKey(address)) {
- int[] versions = ips.get(address);
- versions[countAsVersion - 2] += 1;
- } else {
- int[] versions = new int[2];
- versions[countAsVersion - 2] = 1;
- ips.put(address, versions);
- }
- } else {
- Map<String, int[]> ips =
- new HashMap<String, int[]>();
- int[] versions = new int[2];
- versions[countAsVersion - 2] = 1;
- ips.put(address, versions);
- uniqueIPs.put(country, ips);
- }
- }
- allCountries.add(country);
- }
- }
- // close input file
- br.close();
-
- File fileIPA = new File(outputDirectory.getAbsolutePath()
- + File.separatorChar + inputFile.getName() + "-uip.csv");
- File fileReq = new File(outputDirectory.getAbsolutePath()
- + File.separatorChar + inputFile.getName() + "-req.csv");
- BufferedWriter outIPA = new BufferedWriter(new FileWriter(
- fileIPA, false));
- BufferedWriter outReq = new BufferedWriter(new FileWriter(
- fileReq, false));
- StringBuilder sb = new StringBuilder();
- for (String f : allCountries) {
- sb.append(f + "2," + f + "3," + f + "t,");
- }
- outIPA.write("time," + sb.toString() + "total2,total3,total\n");
- outReq.write("time," + sb.toString() + "total2,total3,total\n");
- for (Map.Entry<Long, SortedMap<String, Map<String, int[]>>> hour
- : allRequests.entrySet()) {
- int totalUnique2 = 0, totalUnique3 = 0, totalUniqueT = 0;
- int totalRequests2 = 0, totalRequests3 = 0;
- outIPA.write(hour.getKey() + ",");
- outReq.write(hour.getKey() + ",");
- SortedMap<String, Map<String, int[]>> req = hour.getValue();
- for (String f : allCountries) {
- if (req.containsKey(f)) {
- int unique2 = 0, unique3 = 0, uniqueT = 0;
- int requests2 = 0, requests3 = 0;
- for (int[] vers : req.get(f).values()) {
- unique2 += vers[0] > 0 ? 1 : 0;
- unique3 += vers[1] > 0 ? 1 : 0;
- uniqueT++;
- requests2 += vers[0];
- requests3 += vers[1];
- }
- outIPA.write(unique2 + "," + unique3 + ","
- + uniqueT + ",");
- outReq.write(requests2 + "," + requests3 + ","
- + (requests2 + requests3) + ",");
- totalUnique2 += unique2;
- totalUnique3 += unique3;
- totalUniqueT += uniqueT;
- totalRequests2 += requests2;
- totalRequests3 += requests3;
- } else {
- outIPA.write("0,0,0,");
- outReq.write("0,0,0,");
- }
- }
- outIPA.write(totalUnique2 + "," + totalUnique3 + ","
- + totalUniqueT + "\n");
- outReq.write(totalRequests2 + "," + totalRequests3 + ","
- + (totalRequests2 + totalRequests3) + "\n");
- }
- outIPA.close();
- outReq.close();
- }
-
- System.out.println("Parsing finished after "
- + ((System.currentTimeMillis() - started) / 1000)
- + " seconds.");
- }
-}
-
diff --git a/src/org/torproject/metrics/dirreq/ParseGeoipStats.java b/src/org/torproject/metrics/dirreq/ParseGeoipStats.java
new file mode 100644
index 0000000..7f69e7f
--- /dev/null
+++ b/src/org/torproject/metrics/dirreq/ParseGeoipStats.java
@@ -0,0 +1,234 @@
+/* Copyright 2009 Karsten Loesing
+ * See LICENSE for licensing information */
+package org.torproject.metrics.dirreq;
+
+import java.io.*;
+import java.text.*;
+import java.util.*;
+
+import com.maxmind.geoip.LookupService;
+
+public final class ParseGeoipStats {
+
+  /**
+   * Statistics taken from one "written" block of one input file.
+   *
+   * The four maps go from the country key found in the input to the
+   * parsed count. They start out empty so that a block lacking one of the
+   * four country lines is written as zeros instead of failing with a
+   * NullPointerException when the CSV is produced.
+   */
+  private static class DataPoint {
+    /** Second space-separated token of the "written" line. */
+    String date;
+    SortedMap<String, Integer> v2Ips = new TreeMap<String, Integer>();
+    SortedMap<String, Integer> v3Ips = new TreeMap<String, Integer>();
+    SortedMap<String, Integer> v2Reqs = new TreeMap<String, Integer>();
+    SortedMap<String, Integer> v3Reqs = new TreeMap<String, Integer>();
+    /** Share value with '.' and '%' stripped, e.g. "80.00%" is 8000. */
+    int v2Share;
+    int v3Share;
+  }
+
+  /** Every country key seen in any input file; defines the CSV columns. */
+  private static SortedSet<String> allCountries = new TreeSet<String>();
+
+  /** Every date seen in any input file; defines the CSV rows. */
+  private static SortedSet<String> allDates = new TreeSet<String>();
+
+  /** Parsed data points by input file name, then by date. */
+  private static SortedMap<String, SortedMap<String, DataPoint>> allDataPoints
+      = new TreeMap<String, SortedMap<String, DataPoint>>();
+
+  /**
+   * Parses a line of the form "keyword cc=num,cc=num,..." into a map from
+   * country key to count and registers every key in allCountries.
+   *
+   * @param line complete input line including its keyword
+   * @return parsed counts, each reduced by 4; empty if the line carries no
+   *     value part
+   * @throws NumberFormatException if a count is not an integer
+   * @throws ArrayIndexOutOfBoundsException if an entry has no "=value"
+   */
+  private static SortedMap<String, Integer> parseCountryLine(String line) {
+    SortedMap<String, Integer> result = new TreeMap<String, Integer>();
+    String[] tokens = line.split(" ");
+    if (line.length() < 2 || tokens.length < 2) {
+      return result;
+    }
+    for (String part : tokens[1].split(",")) {
+      // split each entry once instead of once per field
+      String[] keyValue = part.split("=");
+      String country = keyValue[0];
+      // NOTE(review): the constant 4 presumably compensates for counts
+      // that the directory reports rounded up to a multiple of 8 --
+      // confirm against the Tor directory specification.
+      Integer count = Integer.parseInt(keyValue[1]) - 4;
+      allCountries.add(country);
+      result.put(country, count);
+    }
+    return result;
+  }
+
+  /**
+   * Parses the value of a share line by dropping '.', '%' and ';' and
+   * reading the rest as an integer, so "80.00%" becomes 8000.
+   */
+  private static int parseShare(String line) {
+    return Integer.parseInt(line.split(" ")[1].replaceAll("[.%;]", ""));
+  }
+
+  /**
+   * Extrapolates the locally observed requests and IP addresses to totals,
+   * given the share of requests this directory expects to see.
+   *
+   * The total number of clients is searched iteratively: starting at
+   * 10000 with a step of 10000, the candidate is lowered by half the
+   * current step while the modelled number of local IPs is too high, and
+   * raised by the full current step while it is too low. The search stops
+   * once the model is within 0.1 of the observed IPs or after 40 rounds.
+   * (The original carried an "add" flag that was never reassigned; the
+   * behaviour it actually produced is kept here without the dead flag.)
+   *
+   * @param localRequests requests counted locally
+   * @param localIpsInt IP addresses counted locally
+   * @param shareAsInt share as produced by parseShare
+   * @return "totalRequests,totalClients,requestsPerClient" formatted
+   *     independently of the default locale, or "NA,NA,NA" if any input
+   *     is not positive and no estimate can be computed
+   */
+  private static String estimateRequestsAndClients(int localRequests,
+      int localIpsInt, int shareAsInt) {
+    // Non-positive inputs would divide by zero and put 2147483647,
+    // Infinity or NaN into the CSV; report them as missing instead.
+    if (localRequests <= 0 || localIpsInt <= 0 || shareAsInt <= 0) {
+      return "NA,NA,NA";
+    }
+    // NOTE(review): the factor 5/4 is taken over unchanged; its origin is
+    // not visible in this file.
+    double share = ((double) shareAsInt) / 10000 * 5 / 4;
+    double totalRequests = (double) localRequests / share;
+    double totalClients = 10000.0D;
+    double localIps = (double) localIpsInt;
+    double step = 10000.0D;
+    for (int round = 0; round < 40; round++) {
+      double c = totalClients * (1.0D - Math.pow(1.0D - share,
+          totalRequests / totalClients));
+      if (Math.abs(localIps - c) < 0.1D) {
+        break;
+      } else if (c > localIps) {
+        step /= 2.0;
+        totalClients -= step;
+      } else if (c < localIps) {
+        totalClients += step;
+      }
+    }
+    double requestsPerClient = totalRequests / totalClients;
+    // Locale.US keeps '.' as decimal separator; a decimal comma from the
+    // default locale would add bogus columns to the CSV.
+    return String.format(Locale.US, "%d,%d,%.2f", (int) totalRequests,
+        (int) totalClients, requestsPerClient);
+  }
+
+  private ParseGeoipStats() {
+  }
+
+  /**
+   * Reads one input file and returns its data points by date. Dates are
+   * added to allDates and country keys to allCountries as a side effect.
+   *
+   * Blocks are only stored once some country line with a value part has
+   * been seen in the file; lines before the first "written" line are
+   * skipped because there is no block to attach them to.
+   */
+  private static SortedMap<String, DataPoint> parseInputFile(
+      File inputFile) throws IOException {
+    SortedMap<String, DataPoint> dataPoints
+        = new TreeMap<String, DataPoint>();
+    BufferedReader br = new BufferedReader(new FileReader(inputFile));
+    try {
+      String line = null;
+      DataPoint current = null;
+      boolean haveSeenActualNumbers = false;
+      while ((line = br.readLine()) != null) {
+        if (line.startsWith("written ")) {
+          if (haveSeenActualNumbers) {
+            dataPoints.put(current.date, current);
+          }
+          current = new DataPoint();
+          current.date = line.split(" ")[1];
+          allDates.add(current.date);
+          continue;
+        }
+        if (current == null) {
+          continue;
+        }
+        boolean hasValues = line.split(" ").length > 1;
+        if (line.startsWith("ns-ips ")) {
+          current.v3Ips = parseCountryLine(line);
+          haveSeenActualNumbers |= hasValues;
+        } else if (line.startsWith("ns-v2-ips ")) {
+          current.v2Ips = parseCountryLine(line);
+          haveSeenActualNumbers |= hasValues;
+        } else if (line.startsWith("n-ns-reqs ")) {
+          current.v3Reqs = parseCountryLine(line);
+          haveSeenActualNumbers |= hasValues;
+        } else if (line.startsWith("n-v2-ns-reqs ")) {
+          current.v2Reqs = parseCountryLine(line);
+          haveSeenActualNumbers |= hasValues;
+        } else if (line.startsWith("v2-ns-share ")) {
+          current.v2Share = parseShare(line);
+        } else if (line.startsWith("v3-ns-share ")) {
+          current.v3Share = parseShare(line);
+        }
+        // "started-at ", "requests-start " and unknown lines are ignored
+      }
+      if (haveSeenActualNumbers) {
+        dataPoints.put(current.date, current);
+      }
+    } finally {
+      br.close();
+    }
+    return dataPoints;
+  }
+
+  /**
+   * Writes the CSV for one input file: a header, then one row per date in
+   * allDates. Dates without a data point get "NA" in every column.
+   */
+  private static void writeCsv(File outFile,
+      SortedMap<String, DataPoint> dataPoints) throws IOException {
+    BufferedWriter out = new BufferedWriter(new FileWriter(
+        outFile, false));
+    try {
+      out.write("time,");
+      for (String f : allCountries) {
+        out.write(String.format("ip2%s,ip3%<s,ipt%<s,"
+            + "req2%<s,req3%<s,reqt%<s,", f));
+      }
+      out.write("ip2total,ip3total,ipttotal,"
+          + "req2total,req3total,reqttotal,"
+          + "v2share,v3share,"
+          + "req2estimate,ip2estimate,reqperip2,"
+          + "req3estimate,ip3estimate,reqperip3\n");
+      for (String date : allDates) {
+        out.write(date + ",");
+        DataPoint point = dataPoints.get(date);
+        if (point == null) {
+          // 6 columns per country plus the 14 summary columns of the
+          // header; the last one is written without trailing comma
+          int nas = allCountries.size() * 6 + 13;
+          for (int i = 0; i < nas; i++) {
+            out.write("NA,");
+          }
+          out.write("NA\n");
+          continue;
+        }
+        int ip2total = 0, ip3total = 0, ipttotal = 0,
+            req2total = 0, req3total = 0, reqttotal = 0;
+        for (String f : allCountries) {
+          int v2Ips = point.v2Ips.containsKey(f)
+              ? point.v2Ips.get(f) : 0;
+          int v3Ips = point.v3Ips.containsKey(f)
+              ? point.v3Ips.get(f) : 0;
+          int v2Reqs = point.v2Reqs.containsKey(f)
+              ? point.v2Reqs.get(f) : 0;
+          int v3Reqs = point.v3Reqs.containsKey(f)
+              ? point.v3Reqs.get(f) : 0;
+          ip2total += v2Ips;
+          ip3total += v3Ips;
+          ipttotal += v2Ips + v3Ips;
+          req2total += v2Reqs;
+          req3total += v3Reqs;
+          reqttotal += v2Reqs + v3Reqs;
+          out.write(String.format(Locale.US, "%d,%d,%d,%d,%d,%d,",
+              v2Ips, v3Ips, v2Ips + v3Ips,
+              v2Reqs, v3Reqs, v2Reqs + v3Reqs));
+        }
+        out.write(String.format(Locale.US, "%d,%d,%d,%d,%d,%d,%d,%d",
+            ip2total, ip3total, ipttotal,
+            req2total, req3total, reqttotal,
+            point.v2Share, point.v3Share));
+        out.write("," + estimateRequestsAndClients(req2total, ip2total,
+            point.v2Share));
+        // end every row with "\n", like the header and the NA rows
+        out.write("," + estimateRequestsAndClients(req3total, ip3total,
+            point.v3Share) + "\n");
+      }
+    } finally {
+      out.close();
+    }
+  }
+
+  /**
+   * Parses every regular file in the input directory and writes one
+   * "&lt;input file name&gt;.csv" per file to the output directory.
+   *
+   * @param args input directory, output directory
+   * @throws Exception if reading or writing fails or an input line is
+   *     malformed
+   */
+  public static void main(final String[] args) throws Exception {
+
+    // check input parameters
+    if (args.length < 2) {
+      System.err.println("Usage: java "
+          + ParseGeoipStats.class.getSimpleName()
+          + " <input directory> <output directory>");
+      System.exit(1);
+    }
+    File inputDirectory = new File(args[0]);
+    if (!inputDirectory.exists() || !inputDirectory.isDirectory()) {
+      System.err.println("Input directory '"
+          + inputDirectory.getAbsolutePath()
+          + "' does not exist or is not a directory.");
+      System.exit(1);
+    }
+    File outputDirectory = new File(args[1]);
+    if (outputDirectory.exists() && !outputDirectory.isDirectory()) {
+      System.err.println("Output directory '"
+          + outputDirectory.getAbsolutePath()
+          + "' exists, but is not a directory.");
+      System.exit(1);
+    }
+    // mkdirs() also creates missing parent directories
+    outputDirectory.mkdirs();
+    if (!outputDirectory.isDirectory()) {
+      System.err.println("Output directory '"
+          + outputDirectory.getAbsolutePath()
+          + "' could not be created.");
+      System.exit(1);
+    }
+
+    long started = System.currentTimeMillis();
+
+    // parse input files
+    File[] inputFiles = inputDirectory.listFiles();
+    if (inputFiles == null) {
+      System.err.println("Input directory '"
+          + inputDirectory.getAbsolutePath()
+          + "' could not be listed.");
+      System.exit(1);
+    }
+    for (File inputFile : inputFiles) {
+      // sub-directories cannot be opened as files
+      if (!inputFile.isFile()) {
+        continue;
+      }
+      allDataPoints.put(inputFile.getName(), parseInputFile(inputFile));
+    }
+
+    System.out.printf("We have seen %d countries on %d days on %d "
+        + "directories.%n", allCountries.size(), allDates.size(),
+        allDataPoints.size());
+
+    // write one CSV per input file
+    for (Map.Entry<String, SortedMap<String, DataPoint>> e
+        : allDataPoints.entrySet()) {
+      writeCsv(new File(outputDirectory, e.getKey() + ".csv"),
+          e.getValue());
+    }
+
+    System.out.println("Parsing finished after "
+        + ((System.currentTimeMillis() - started) / 1000)
+        + " seconds.");
+  }
+}
+
--
1.5.6.5
More information about the tor-commits
mailing list