[or-cvs] [ernie/master] Add bridge descriptor aggregator and sanitizer.
karsten at torproject.org
karsten at torproject.org
Wed Apr 7 20:53:13 UTC 2010
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Wed, 7 Apr 2010 22:52:17 +0200
Subject: Add bridge descriptor aggregator and sanitizer.
Commit: 24258ea40671ac197ee076c151923be1c288848e
---
config | 3 +
src/ArchiveWriter.java | 2 +-
src/BridgeDescriptorParser.java | 152 ++++--
src/BridgeSnapshotReader.java | 84 ++-
src/Configuration.java | 7 +
src/GeoIPDatabaseManager.java | 56 ++-
src/Main.java | 21 +-
src/SanitizedBridgesReader.java | 19 +-
src/SanitizedBridgesWriter.java | 1107 +++++++++++++++++++++++++++++++++++++++
9 files changed, 1341 insertions(+), 110 deletions(-)
create mode 100644 src/SanitizedBridgesWriter.java
diff --git a/config b/config
index f76c05c..1eb25fb 100644
--- a/config
+++ b/config
@@ -67,6 +67,9 @@
## JDBC string for relay descriptor database
#RelayDescriptorDatabaseJDBC jdbc:postgresql://localhost/tordir?user=ernie&password=password
+## Write sanitized bridges to disk
+#WriteSanitizedBridges 0
+
## Import sanitized bridges from disk, if available
#ImportSanitizedBridges 1
diff --git a/src/ArchiveWriter.java b/src/ArchiveWriter.java
index c4374fd..726ecfc 100644
--- a/src/ArchiveWriter.java
+++ b/src/ArchiveWriter.java
@@ -8,7 +8,7 @@ import org.apache.commons.codec.binary.*;
public class ArchiveWriter {
private Logger logger;
public ArchiveWriter() {
- this.logger = Logger.getLogger(RelayDescriptorParser.class.getName());
+ this.logger = Logger.getLogger(ArchiveWriter.class.getName());
}
private void store(byte[] data, String filename) {
diff --git a/src/BridgeDescriptorParser.java b/src/BridgeDescriptorParser.java
index 7099571..d67ab47 100644
--- a/src/BridgeDescriptorParser.java
+++ b/src/BridgeDescriptorParser.java
@@ -7,78 +7,114 @@ import org.apache.commons.codec.digest.*;
public class BridgeDescriptorParser {
private ConsensusStatsFileHandler csfh;
private BridgeStatsFileHandler bsfh;
+ private SanitizedBridgesWriter sbw;
private SortedSet<String> countries;
private Logger logger;
public BridgeDescriptorParser(ConsensusStatsFileHandler csfh,
- BridgeStatsFileHandler bsfh, SortedSet<String> countries) {
+ BridgeStatsFileHandler bsfh, SanitizedBridgesWriter sbw,
+ SortedSet<String> countries) {
this.csfh = csfh;
this.bsfh = bsfh;
+ this.sbw = sbw;
this.countries = countries;
this.logger =
Logger.getLogger(BridgeDescriptorParser.class.getName());
}
- public void parse(BufferedReader br, String dateTime, boolean sanitized)
- throws IOException, ParseException {
- SimpleDateFormat timeFormat = new SimpleDateFormat(
- "yyyy-MM-dd HH:mm:ss");
- timeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
- String hashedIdentity = null, publishedLine = null,
- geoipStartTimeLine = null;
- boolean skip = false;
- String line = null;
- while ((line = br.readLine()) != null) {
- if (line.startsWith("r ")) {
- int runningBridges = 0;
- while ((line = br.readLine()) != null) {
- if (line.startsWith("s ") && line.contains(" Running")) {
- runningBridges++;
+ public void parse(byte[] allData, String dateTime, boolean sanitized) {
+ try {
+ BufferedReader br = new BufferedReader(new StringReader(
+ new String(allData, "US-ASCII")));
+ SimpleDateFormat timeFormat = new SimpleDateFormat(
+ "yyyy-MM-dd HH:mm:ss");
+ timeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ String hashedIdentity = null, publishedLine = null,
+ geoipStartTimeLine = null;
+ boolean skip = false;
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("r ")) {
+ if (this.sbw != null) {
+ if (sanitized) {
+ this.sbw.storeSanitizedNetworkStatus(allData, dateTime);
+ } else {
+ this.sbw.sanitizeAndStoreNetworkStatus(allData, dateTime);
+ }
}
- }
- if (this.csfh != null) {
- this.csfh.addBridgeConsensusResults(dateTime, runningBridges);
- }
- } else if (line.startsWith("extra-info ")) {
- hashedIdentity = sanitized ? line.split(" ")[2]
- : DigestUtils.shaHex(line.split(" ")[2]).toUpperCase();
- if (this.bsfh != null) {
- skip = this.bsfh.isKnownRelay(hashedIdentity);
- }
- } else if (!skip && line.startsWith("published ")) {
- publishedLine = line;
- } else if (!skip && line.startsWith("geoip-start-time ")) {
- geoipStartTimeLine = line;
- } else if (!skip && line.startsWith("geoip-client-origins")
- && line.split(" ").length > 1) {
- if (publishedLine == null ||
- geoipStartTimeLine == null) {
- this.logger.warning("Either published line or "
- + "geoip-start-time line is not present in "
- + (sanitized ? "sanitized" : "non-sanitized")
- + " bridge descriptors from " + dateTime + ".");
- break;
- }
- long published = timeFormat.parse(publishedLine.
- substring("published ".length())).getTime();
- long started = timeFormat.parse(geoipStartTimeLine.
- substring("geoip-start-time ".length())).getTime();
- long seconds = (published - started) / 1000L;
- Map<String, String> obs = new HashMap<String, String>();
- String[] parts = line.split(" ")[1].split(",");
- for (String p : parts) {
- for (String c : countries) {
- if (p.startsWith(c)) {
- obs.put(c, String.format("%.2f",
- ((double) Long.parseLong(p.substring(3)) - 4L)
- * 86400.0D / ((double) seconds)));
+ int runningBridges = 0;
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("s ") && line.contains(" Running")) {
+ runningBridges++;
}
}
- }
- String date = publishedLine.split(" ")[1];
- String time = publishedLine.split(" ")[2];
- if (this.bsfh != null) {
- bsfh.addObs(hashedIdentity, date, time, obs);
+ if (this.csfh != null) {
+ this.csfh.addBridgeConsensusResults(dateTime, runningBridges);
+ }
+ } else if (line.startsWith("router ")) {
+ if (this.sbw != null) {
+ if (sanitized) {
+ this.sbw.storeSanitizedServerDescriptor(allData);
+ } else {
+ this.sbw.sanitizeAndStoreServerDescriptor(allData);
+ }
+ }
+ } else if (line.startsWith("extra-info ")) {
+ if (this.sbw != null) {
+ if (sanitized) {
+ this.sbw.storeSanitizedExtraInfoDescriptor(allData);
+ } else {
+ this.sbw.sanitizeAndStoreExtraInfoDescriptor(allData);
+ }
+ }
+ hashedIdentity = sanitized ? line.split(" ")[2]
+ : DigestUtils.shaHex(line.split(" ")[2]).toUpperCase();
+ if (this.bsfh != null) {
+ skip = this.bsfh.isKnownRelay(hashedIdentity);
+ }
+ } else if (!skip && line.startsWith("published ")) {
+ publishedLine = line;
+ } else if (!skip && line.startsWith("geoip-start-time ")) {
+ geoipStartTimeLine = line;
+ } else if (!skip && line.startsWith("geoip-client-origins")
+ && line.split(" ").length > 1) {
+ if (publishedLine == null ||
+ geoipStartTimeLine == null) {
+ this.logger.warning("Either published line or "
+ + "geoip-start-time line is not present in "
+ + (sanitized ? "sanitized" : "non-sanitized")
+ + " bridge descriptors from " + dateTime + ".");
+ break;
+ }
+ long published = timeFormat.parse(publishedLine.
+ substring("published ".length())).getTime();
+ long started = timeFormat.parse(geoipStartTimeLine.
+ substring("geoip-start-time ".length())).getTime();
+ long seconds = (published - started) / 1000L;
+ Map<String, String> obs = new HashMap<String, String>();
+ String[] parts = line.split(" ")[1].split(",");
+ for (String p : parts) {
+ for (String c : countries) {
+ if (p.startsWith(c)) {
+ obs.put(c, String.format("%.2f",
+ ((double) Long.parseLong(p.substring(3)) - 4L)
+ * 86400.0D / ((double) seconds)));
+ }
+ }
+ }
+ String date = publishedLine.split(" ")[1];
+ String time = publishedLine.split(" ")[2];
+ if (this.bsfh != null) {
+ bsfh.addObs(hashedIdentity, date, time, obs);
+ }
}
}
+ } catch (IOException e) {
+ this.logger.log(Level.WARNING, "Could not parse bridge descriptor.",
+ e);
+ return;
+ } catch (ParseException e) {
+ this.logger.log(Level.WARNING, "Could not parse bridge descriptor.",
+ e);
+ return;
}
}
}
diff --git a/src/BridgeSnapshotReader.java b/src/BridgeSnapshotReader.java
index 50a9978..f278b90 100644
--- a/src/BridgeSnapshotReader.java
+++ b/src/BridgeSnapshotReader.java
@@ -1,5 +1,4 @@
import java.io.*;
-import java.text.*;
import java.util.*;
import java.util.logging.*;
import org.apache.commons.compress.compressors.gzip.*;
@@ -39,7 +38,6 @@ public class BridgeSnapshotReader {
+ "/...");
Stack<File> filesInInputDir = new Stack<File>();
filesInInputDir.add(bdDir);
- List<File> problems = new ArrayList<File>();
while (!filesInInputDir.isEmpty()) {
File pop = filesInInputDir.pop();
if (pop.isDirectory()) {
@@ -53,48 +51,74 @@ public class BridgeSnapshotReader {
GzipCompressorInputStream gcis =
new GzipCompressorInputStream(in);
TarArchiveInputStream tais = new TarArchiveInputStream(gcis);
- InputStreamReader isr = new InputStreamReader(tais);
- BufferedReader br = new BufferedReader(isr);
+ BufferedInputStream bis = new BufferedInputStream(tais);
String fn = pop.getName();
String dateTime = fn.substring(11, 21) + " "
+ fn.substring(22, 24) + ":" + fn.substring(24, 26)
+ ":" + fn.substring(26, 28);
while ((tais.getNextTarEntry()) != null) {
- bdp.parse(br, dateTime, false);
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ int len;
+ byte[] data = new byte[1024];
+ while ((len = bis.read(data, 0, 1024)) >= 0) {
+ baos.write(data, 0, len);
+ }
+ byte[] allData = baos.toByteArray();
+ String ascii = new String(allData, "US-ASCII");
+ BufferedReader br3 = new BufferedReader(new StringReader(
+ ascii));
+ String firstLine = null;
+ while ((firstLine = br3.readLine()) != null) {
+ if (firstLine.startsWith("@")) {
+ continue;
+ } else {
+ break;
+ }
+ }
+ if (firstLine.startsWith("r ")) {
+ bdp.parse(allData, dateTime, false);
+ } else {
+ int start = -1, sig = -1, end = -1;
+ String startToken =
+ firstLine.startsWith("router ") ?
+ "router " : "extra-info ";
+ String sigToken = "\nrouter-signature\n";
+ String endToken = "\n-----END SIGNATURE-----\n";
+ while (end < ascii.length()) {
+ start = ascii.indexOf(startToken, end);
+ if (start < 0) {
+ break;
+ }
+ sig = ascii.indexOf(sigToken, start);
+ if (sig < 0) {
+ break;
+ }
+ sig += sigToken.length();
+ end = ascii.indexOf(endToken, sig);
+ if (end < 0) {
+ break;
+ }
+ end += endToken.length();
+ byte[] descBytes = new byte[end - start];
+ System.arraycopy(allData, start, descBytes, 0,
+ end - start);
+ bdp.parse(descBytes, dateTime, false);
+ }
+ }
}
}
in.close();
parsed.add(pop.getName());
modified = true;
- } catch (ParseException e) {
- problems.add(pop);
- if (problems.size() > 3) {
- break;
- }
} catch (IOException e) {
- problems.add(pop);
- if (problems.size() > 3) {
- break;
- }
- }
- }
- }
- if (problems.isEmpty()) {
- logger.fine("Finished importing files in directory "
- + bridgeDirectoriesDir + "/.");
- } else {
- StringBuilder sb = new StringBuilder("Failed importing files in "
- + "directory " + bridgeDirectoriesDir + "/:");
- int printed = 0;
- for (File f : problems) {
- sb.append("\n " + f.getAbsolutePath());
- if (++printed >= 3) {
- sb.append("\n ... more");
- break;
+ logger.log(Level.WARNING, "Could not parse bridge snapshot!",
+ e);
+ continue;
}
}
- logger.warning(sb.toString());
}
+ logger.fine("Finished importing files in directory "
+ + bridgeDirectoriesDir + "/.");
if (!parsed.isEmpty() && modified) {
logger.fine("Writing file " + pbdFile.getAbsolutePath() + "...");
try {
diff --git a/src/Configuration.java b/src/Configuration.java
index 3c724c9..ace0c3a 100644
--- a/src/Configuration.java
+++ b/src/Configuration.java
@@ -29,6 +29,7 @@ public class Configuration {
private boolean writeRelayDescriptorDatabase = false;
private String relayDescriptorDatabaseJdbc =
"jdbc:postgresql://localhost/tordir?user=ernie&password=password";
+ private boolean writeSanitizedBridges = false;
private boolean importSanitizedBridges = true;
private boolean importBridgeSnapshots = true;
private boolean importWriteTorperfStats = true;
@@ -102,6 +103,9 @@ public class Configuration {
line.split(" ")[1]) != 0;
} else if (line.startsWith("RelayDescriptorDatabaseJDBC")) {
this.relayDescriptorDatabaseJdbc = line.split(" ")[1];
+ } else if (line.startsWith("WriteSanitizedBridges")) {
+ this.writeSanitizedBridges = Integer.parseInt(
+ line.split(" ")[1]) != 0;
} else if (line.startsWith("ImportSanitizedBridges")) {
this.importSanitizedBridges = Integer.parseInt(
line.split(" ")[1]) != 0;
@@ -216,6 +220,9 @@ public class Configuration {
public String getRelayDescriptorDatabaseJDBC() {
return this.relayDescriptorDatabaseJdbc;
}
+ public boolean getWriteSanitizedBridges() {
+ return this.writeSanitizedBridges;
+ }
public boolean getImportSanitizedBridges() {
return this.importSanitizedBridges;
}
diff --git a/src/GeoIPDatabaseManager.java b/src/GeoIPDatabaseManager.java
index 7438003..15e5ea1 100644
--- a/src/GeoIPDatabaseManager.java
+++ b/src/GeoIPDatabaseManager.java
@@ -11,6 +11,10 @@ import java.util.zip.*;
* Supports importing CSV-formatted databases from disk and downloading
* the most recent commercial Maxmind GeoIP database from their server
* using a license key.
+ *
+ * 0 databases: all requests answered with ZZ
+ * 1 database: all requests answered from that database
+ * 2+ databases: requests answered by most recent database at given date
*/
public class GeoIPDatabaseManager {
@@ -69,6 +73,8 @@ public class GeoIPDatabaseManager {
*/
private Logger logger;
+ private Set<String> unresolvedCountryCodes;
+
/**
* Initializes this class by reading in the database versions known so
* far.
@@ -80,6 +86,8 @@ public class GeoIPDatabaseManager {
this.combinedDatabase = new TreeMap<Long, DatabaseEntry>();
this.allDatabases = new ArrayList<String>();
this.combinedDatabaseModified = false;
+ this.unresolvedCountryCodes = new HashSet<String>(Arrays.asList(
+ "--,a1,a2,eu,ap".split(",")));
/* Initialize logger. */
this.logger = Logger.getLogger(RelayDescriptorParser.class.getName());
@@ -344,13 +352,48 @@ public class GeoIPDatabaseManager {
}
}
+ public String getCountryForIPOneWeek(String ipAddress, String date) {
+ SimpleDateFormat parseFormat = new SimpleDateFormat("yyyy-MM-dd");
+ parseFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ try {
+ String dateMinusOneWeek = parseFormat.format(new Date(
+ parseFormat.parse(date).getTime() -
+ 7L * 24L * 60L * 60L * 1000L));
+ return this.getCountryForIP(ipAddress, dateMinusOneWeek);
+ } catch (ParseException e) {
+ this.logger.log(Level.WARNING, "Could not parse date '" + date
+ + "'.", e);
+ return null;
+ }
+ }
+
/**
* Returns the uppercase two-letter country code that was assigned to
* <code>ipAddress</code> (in dotted notation) in the most recent
- * commercial Maxmind GeoIP database published at least 1 day before
+ * commercial Maxmind GeoIP database published before or at
* <code>date</code> (in the format yyyy-MM-dd).
*/
public String getCountryForIP(String ipAddress, String date) {
+ if (this.allDatabases.isEmpty()) {
+ return "ZZ";
+ }
+ String dateShort = date.substring(0, 4) + date.substring(5, 7)
+ + date.substring(8, 10); // TODO put full date in allDatabases
+ String dbDate = null;
+ if (this.allDatabases.contains(dateShort)) {
+ dbDate = dateShort;
+ } else {
+ SortedSet<String> subset = new TreeSet<String>(this.allDatabases).
+ headSet(dateShort);
+ if (!subset.isEmpty()) {
+ dbDate = subset.last();
+ } else {
+ dbDate = this.allDatabases.get(0);
+ }
+ }
+ if (dbDate == null || !this.allDatabases.contains(dbDate)) {
+ return "ZZ";
+ }
String[] parts = ipAddress.split("\\.");
long ipNum = Long.parseLong(parts[0]) * 256 * 256 * 256 +
Long.parseLong(parts[1]) * 256 * 256 +
@@ -364,14 +407,11 @@ public class GeoIPDatabaseManager {
} else {
return "ZZ";
}
- String dateShort = date.substring(0, 4) + date.substring(5, 7)
- + date.substring(8, 10);
- SortedSet<String> subset = new TreeSet<String>(this.allDatabases).
- headSet(dateShort);
- if (subset.isEmpty()) {
+ String countryCode = countries.substring(1).split(",")[
+ this.allDatabases.indexOf(dbDate)];
+ if (unresolvedCountryCodes.contains(countryCode)) {
return "ZZ";
}
- int index = allDatabases.indexOf(subset.last());
- return countries.substring(1).split(",")[index];
+ return countryCode;
}
}
diff --git a/src/Main.java b/src/Main.java
index 8a2ee06..e7573eb 100644
--- a/src/Main.java
+++ b/src/Main.java
@@ -114,18 +114,29 @@ public class Main {
gd.writeCombinedDatabase();
}
+ // Prepare sanitized bridge descriptor writer
+ SanitizedBridgesWriter sbw = config.getWriteSanitizedBridges() ?
+ new SanitizedBridgesWriter(gd, "sanitized-bridges") : null;
+
// Prepare bridge descriptor parser
- BridgeDescriptorParser bdp = config.getWriteConsensusStats() &&
- config.getWriteBridgeStats() ? new BridgeDescriptorParser(
- csfh, bsfh, countries) : null;
+ BridgeDescriptorParser bdp = config.getWriteConsensusStats() ||
+ config.getWriteBridgeStats() || config.getWriteSanitizedBridges()
+ ? new BridgeDescriptorParser(csfh, bsfh, sbw, countries) : null;
// Import bridge descriptors
- if (config.getImportSanitizedBridges()) {
+ if (bdp != null && config.getImportSanitizedBridges()) {
new SanitizedBridgesReader(bdp, "bridges", countries);
}
- if (config.getImportBridgeSnapshots()) {
+ if (bdp != null && config.getImportBridgeSnapshots()) {
new BridgeSnapshotReader(bdp, "bridge-directories", countries);
}
+ // TODO check configuration sanity: data source without sink?
+
+ // Finish writing sanitized bridge descriptors to disk
+ if (sbw != null) {
+ sbw.finishWriting();
+ sbw = null;
+ }
// Write updated stats files to disk
if (bsfh != null) {
diff --git a/src/SanitizedBridgesReader.java b/src/SanitizedBridgesReader.java
index 341a55f..f6fc100 100644
--- a/src/SanitizedBridgesReader.java
+++ b/src/SanitizedBridgesReader.java
@@ -22,18 +22,21 @@ public class SanitizedBridgesReader {
continue;
} else {
try {
- BufferedReader br = new BufferedReader(new FileReader(pop));
+ BufferedInputStream bis = new BufferedInputStream(
+ new FileInputStream(pop));
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ int len;
+ byte[] data = new byte[1024];
+ while ((len = bis.read(data, 0, 1024)) >= 0) {
+ baos.write(data, 0, len);
+ }
+ bis.close();
+ byte[] allData = baos.toByteArray();
String fn = pop.getName();
String dateTime = fn.substring(0, 4) + "-" + fn.substring(4, 6)
+ "-" + fn.substring(6, 8) + " " + fn.substring(9, 11)
+ ":" + fn.substring(11, 13) + ":" + fn.substring(13, 15);
- bdp.parse(br, dateTime, true);
- br.close();
- } catch (ParseException e) {
- problems.add(pop);
- if (problems.size() > 3) {
- break;
- }
+ bdp.parse(allData, dateTime, true);
} catch (IOException e) {
problems.add(pop);
if (problems.size() > 3) {
diff --git a/src/SanitizedBridgesWriter.java b/src/SanitizedBridgesWriter.java
new file mode 100644
index 0000000..faa589e
--- /dev/null
+++ b/src/SanitizedBridgesWriter.java
@@ -0,0 +1,1107 @@
+import java.io.*;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.logging.*;
+
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.codec.digest.*;
+import org.apache.commons.codec.binary.*;
+
+/**
+ * Sanitizes bridge descriptors, i.e., removes all possibly sensitive
+ * information from them, and writes them to a local directory structure.
+ * During the sanitizing process, all information about the bridge
+ * identity or IP address is removed or replaced. The goal is to keep the
+ * sanitized bridge descriptors useful for statistical analysis while not
+ * making it easier for an adversary to enumerate bridges.
+ *
+ * There are three types of bridge descriptors: bridge network statuses
+ * (lists of all bridges at a given time), server descriptors (published
+ * by the bridge to advertise their capabilities), and extra-info
+ * descriptors (published by the bridge, mainly for statistical analysis).
+ *
+ * Network statuses, server descriptors, and extra-info descriptors are
+ * linked via descriptor digests: extra-info descriptors are referenced
+ * from server descriptors, and server descriptors are referenced from
+ * network statuses. These references need to be changed during the
+ * sanitizing process, because descriptor contents change and so do the
+ * descriptor digests. Furthermore, extra-info descriptors require either
+ * the network status or server descriptor to be parsed first to learn the
+ * bridge's country code that is part of its new nickname.
+ *
+ * As a result, there is no possible order in which bridge descriptors can
+ * be parsed without having to update a previously written bridge
+ * descriptor. The approach taken here is to sanitize bridge descriptors
+ * even with incomplete knowledge about references or country codes and to
+ * update them as soon as this information becomes known. We are keeping a
+ * persistent data structure, the bridge descriptor mapping, to hold
+ * information about every single descriptor. The idea is that every
+ * descriptor is (a) referenced from a network status and consists of
+ * (b) a server descriptor and (c) an extra-info descriptor, both of which
+ * are published at the same time. Using this data structure, we can
+ * repair references as soon as we learn more about the descriptor and
+ * regardless of the order of incoming bridge descriptors.
+ *
+ * The process of sanitizing a bridge descriptor is as follows, depending
+ * on the type of descriptor:
+ *
+ * Network statuses are processed by sanitizing every r line separately
+ * and looking up whether the descriptor mapping contains a bridge with
+ * given identity hash and descriptor publication time. If either server
+ * descriptor or extra-info descriptor have been published before and if
+ * the GeoIP lookup of the bridge's IP address reveals a new country code
+ * for this bridge, extra-info descriptor and server descriptor are
+ * re-written.
+ *
+ * Server descriptors are processed by looking up their bridge identity
+ * hash and publication time in the descriptor mapping. If the GeoIP
+ * lookup reveals a new country code and if the extra-info descriptor was
+ * parsed before, the extra-info descriptor is re-written. After
+ * sanitizing a server descriptor, its publication time is noted down, so
+ * that all network statuses that might be referencing this server
+ * descriptor can be re-written at the end of the sanitizing procedure.
+ *
+ * Extra-info descriptors are also processed by looking up their bridge
+ * identity hash and publication time in the descriptor mapping. If the
+ * corresponding server descriptor was sanitized before, it is re-written
+ * to include the new extra-info descriptor digest. The publication time
+ * is noted down, too, so that all network statuses possibly referencing
+ * this extra-info descriptor and its corresponding server descriptor can
+ * be re-written at the end of the sanitizing procedure.
+ *
+ * After sanitizing all bridge descriptors, the network statuses that
+ * might be referencing server descriptors which have been (re-)written
+ * during this execution are re-written, too. This may be necessary in
+ * order to update previously broken references to server descriptors.
+ */
+public class SanitizedBridgesWriter {
+
+ /**
+ * Hex representation of null reference that is written to bridge
+ * descriptors if we don't have the real reference, yet.
+ */
+ private static final String NULL_REFERENCE =
+ "0000000000000000000000000000000000000000";
+
+ /**
+ * Mapping between a descriptor as referenced from a network status to
+ * a country code and the digests of server descriptor and extra-info
+ * descriptor.
+ */
+ private static class DescriptorMapping {
+
+ /**
+ * Creates a new mapping from comma-separated values as read from the
+ * persistent mapping file.
+ */
+ private DescriptorMapping(String commaSeparatedValues) {
+ String[] parts = commaSeparatedValues.split(",");
+ this.hashedBridgeIdentity = parts[0];
+ this.published = parts[1];
+ this.countryCode = parts[2];
+ this.serverDescriptorIdentifier = parts[3];
+ this.extraInfoDescriptorIdentifier = parts[4];
+ }
+
+ /**
+ * Creates a new mapping for a given identity hash and descriptor
+ * publication time that has ZZ as country code and all 0's as
+ * descriptor digests.
+ */
+ private DescriptorMapping(String hashedBridgeIdentity,
+ String published) {
+ this.hashedBridgeIdentity = hashedBridgeIdentity;
+ this.published = published;
+ this.countryCode = "ZZ";
+ this.serverDescriptorIdentifier = NULL_REFERENCE;
+ this.extraInfoDescriptorIdentifier = NULL_REFERENCE;
+ }
+ private String hashedBridgeIdentity;
+ private String published;
+ private String countryCode;
+ private String serverDescriptorIdentifier;
+ private String extraInfoDescriptorIdentifier;
+
+ /**
+ * Returns a string representation of this descriptor mapping that can
+ * be written to the persistent mapping file.
+ */
+ public String toString() {
+ return this.hashedBridgeIdentity + "," + this.published + ","
+ + this.countryCode + "," + this.serverDescriptorIdentifier + ","
+ + this.extraInfoDescriptorIdentifier;
+ }
+ }
+
+ /**
+ * File containing the mapping between network status entries, server
+ * descriptors, and extra-info descriptors.
+ */
+ private File bridgeDescriptorMappingsFile;
+
+ /**
+ * Mapping between status entries, server descriptors, and extra-info
+ * descriptors. This mapping is required to re-establish the references
+ * from status entries to server descriptors and from server descriptors
+ * to extra-info descriptors. The original references are broken when
+ * sanitizing, because descriptor contents change and so do the
+ * descriptor digests that are used for referencing. Map key contains
+ * hashed bridge identity and descriptor publication time, map value
+ * contains map key plus country code, new server descriptor identifier,
+ * and new extra-info descriptor identifier.
+ */
+ private SortedMap<String, DescriptorMapping> bridgeDescriptorMappings;
+
+ /**
+ * GeoIP database used for resolving bridge IP addresses to two-letter
+ * country codes.
+ */
+ private GeoIPDatabaseManager gd;
+
+ /**
+ * Logger for this class.
+ */
+ private Logger logger;
+
+ /**
+ * Publication times of server descriptors and extra-info descriptors
+ * parsed in the current execution. These times are used to determine
+ * which statuses need to be rewritten at the end of the execution.
+ */
+ private SortedSet<String> descriptorPublicationTimes;
+
+ /**
+ * Output directory for writing sanitized bridge descriptors.
+ */
+ private String sanitizedBridgesDir;
+
+ /**
+ * Initializes this class, including reading in the known descriptor
+ * mapping.
+ */
+ public SanitizedBridgesWriter(GeoIPDatabaseManager gd, String dir) {
+
+ /* Memorize argument values. */
+ this.gd = gd;
+ this.sanitizedBridgesDir = dir;
+
+ /* Initialize logger. */
+ this.logger = Logger.getLogger(
+ SanitizedBridgesWriter.class.getName());
+
+ /* Initialize data structure. */
+ this.bridgeDescriptorMappings = new TreeMap<String,
+ DescriptorMapping>();
+ this.descriptorPublicationTimes = new TreeSet<String>();
+
+ /* Read known descriptor mappings from disk. */
+ this.bridgeDescriptorMappingsFile = new File(
+ "stats/bridge-descriptor-mappings");
+ if (this.bridgeDescriptorMappingsFile.exists()) {
+ try {
+ BufferedReader br = new BufferedReader(new FileReader(
+ this.bridgeDescriptorMappingsFile));
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ if (line.split(",").length == 5) {
+ String[] parts = line.split(",");
+ DescriptorMapping dm = new DescriptorMapping(line);
+ dm.hashedBridgeIdentity = parts[0];
+ dm.published = parts[1];
+ dm.countryCode = parts[2];
+ dm.serverDescriptorIdentifier = parts[3];
+ dm.extraInfoDescriptorIdentifier = parts[4];
+ this.bridgeDescriptorMappings.put(line.split(",")[0] + ","
+ + line.split(",")[1], dm);
+ } else {
+ this.logger.warning("Corrupt line '" + line + "' in "
+ + this.bridgeDescriptorMappingsFile.getAbsolutePath()
+ + ". Skipping.");
+ continue;
+ }
+ }
+ br.close();
+ } catch (IOException e) {
+ this.logger.log(Level.WARNING, "Could not read in "
+ + this.bridgeDescriptorMappingsFile.getAbsolutePath()
+ + ".");
+ return;
+ }
+ }
+ }
+
+ /**
+ * Sanitizes a network status and writes it to disk. Processes every r
+ * line separately and looks up whether the descriptor mapping contains
+ * a bridge with given identity hash and descriptor publication time. If
+ * either server descriptor or extra-info descriptor have been published
+ * before and if the GeoIP lookup of the bridge's IP address reveals a
+ * new country code for this bridge, extra-info descriptor and server
+ * descriptor are re-written.
+ */
+ public void sanitizeAndStoreNetworkStatus(byte[] data,
+ String publicationTime) {
+
+ /* Parse the given network status line by line. */
+ StringBuilder scrubbed = new StringBuilder();
+ try {
+ BufferedReader br = new BufferedReader(new StringReader(new String(
+ data, "US-ASCII")));
+ String line = null;
+ while ((line = br.readLine()) != null) {
+
+ /* r lines contain sensitive information that needs to be removed
+ * or replaced. */
+ if (line.startsWith("r ")) {
+
+ /* Parse the relevant parts of this r line. */
+ String[] parts = line.split(" ");
+ String bridgeIdentity = parts[2];
+ String descPublicationTime = parts[4] + " " + parts[5];
+ String ipAddress = parts[6];
+ String orPort = parts[7];
+ String dirPort = parts[8];
+
+ /* Look up the descriptor in the descriptor mapping, or add a
+ * new mapping entry if there is none. */
+ String hashedBridgeIdentityHex = Hex.encodeHexString(
+ DigestUtils.sha(Base64.decodeBase64(bridgeIdentity
+ + "=="))).toLowerCase();
+ String mappingKey = hashedBridgeIdentityHex + ","
+ + descPublicationTime;
+ DescriptorMapping mapping = null;
+ if (this.bridgeDescriptorMappings.containsKey(mappingKey)) {
+ mapping = this.bridgeDescriptorMappings.get(mappingKey);
+ } else {
+ mapping = new DescriptorMapping(hashedBridgeIdentityHex.
+ toLowerCase(), descPublicationTime);
+ this.bridgeDescriptorMappings.put(mappingKey, mapping);
+ }
+
+ /* Look up the bridge's IP address in the GeoIP database. */
+ String newCountryCode = this.gd.getCountryForIPOneWeek(
+ ipAddress, descPublicationTime);
+
+ /* If we just learned a new country code, we might have to
+ * re-write the (indirectly) referenced extra-info descriptor
+ * that has UnnamedZZ as its nickname and the corresponding
+ * server descriptor that gets an updated extra-info-digest
+ * line. */
+ if (!newCountryCode.equals(mapping.countryCode)) {
+ mapping.countryCode = newCountryCode;
+ if (!mapping.extraInfoDescriptorIdentifier.equals(
+ NULL_REFERENCE)) {
+ this.rewriteExtraInfoDescriptor(mapping);
+ }
+ if (!mapping.serverDescriptorIdentifier.equals(
+ NULL_REFERENCE)) {
+ this.rewriteServerDescriptor(mapping);
+ }
+ }
+
+ /* Write scrubbed r line to buffer. */
+ String nickname = "Unnamed" + mapping.countryCode;
+ String hashedBridgeIdentityBase64 = Base64.encodeBase64String(
+ DigestUtils.sha(Base64.decodeBase64(bridgeIdentity
+ + "=="))).substring(0, 27);
+ String sdi = Base64.encodeBase64String(Hex.decodeHex(
+ mapping.serverDescriptorIdentifier.toCharArray())).
+ substring(0, 27);
+ scrubbed.append("r " + nickname + " "
+ + hashedBridgeIdentityBase64 + " " + sdi + " "
+ + descPublicationTime + " 127.0.0.1 " + orPort + " "
+ + dirPort + "\n");
+
+ /* Nothing special about s lines; just copy them. */
+ } else if (line.startsWith("s ")) {
+ scrubbed.append(line + "\n");
+
+ /* There should be nothing else but r and s lines in the network
+ * status. If there is, we should probably learn before writing
+ * anything to the sanitized descriptors. */
+ } else {
+ this.logger.warning("Unknown line '" + line + "' in bridge "
+ + "network status. Not writing to disk!");
+ return;
+ }
+ }
+ br.close();
+
+ } catch (IOException e) {
+ this.logger.log(Level.WARNING, "Could not parse bridge network "
+ + "status.", e);
+ return;
+ } catch (DecoderException e) {
+ this.logger.log(Level.WARNING, "Could not parse bridge network "
+ + "status.", e);
+ return;
+ }
+
+ /* Write the sanitized network status to disk. */
+ try {
+
+ /* Determine file name. */
+ String syear = publicationTime.substring(0, 4);
+ String smonth = publicationTime.substring(5, 7);
+ String sday = publicationTime.substring(8, 10);
+ String stime = publicationTime.substring(11, 13)
+ + publicationTime.substring(14, 16)
+ + publicationTime.substring(17, 19);
+ File statusFile = new File(this.sanitizedBridgesDir + "/" + syear
+ + "/" + smonth + "/statuses/" + sday + "/" + syear + smonth
+ + sday + "-" + stime + "-"
+ + "4A0CCD2DDC7995083D73F5D667100C8A5831F16D");
+
+ /* Create all parent directories to write this network status. */
+ statusFile.getParentFile().mkdirs();
+
+ /* Write sanitized network status to disk. */
+ BufferedWriter bw = new BufferedWriter(new FileWriter(statusFile));
+ bw.write(scrubbed.toString());
+ bw.close();
+
+ } catch (IOException e) {
+ this.logger.log(Level.WARNING, "Could not write sanitized bridge "
+ + "network status to disk.", e);
+ return;
+ }
+ }
+
+ /**
+  * Sanitizes a bridge server descriptor and writes it to disk. Looks up
+  * the bridge identity hash and publication time in the descriptor
+  * mapping. If the GeoIP lookup reveals a new country code and if the
+  * corresponding extra-info descriptor was parsed before, the extra-info
+  * descriptor is re-written. After sanitizing a server descriptor, its
+  * publication time is noted down, so that all network statuses that
+  * might be referencing this server descriptor can be re-written at the
+  * end of the sanitizing procedure.
+  *
+  * Nothing is written to disk if the descriptor contains an unrecognized
+  * line, cannot be parsed, or ends before its router-signature line.
+  *
+  * @param data Raw server descriptor bytes; decoded as US-ASCII.
+  */
+ public void sanitizeAndStoreServerDescriptor(byte[] data) {
+
+   /* Parse descriptor to generate a sanitized version and to look it up
+    * in the descriptor mapping. */
+   String scrubbedDesc = null;
+   DescriptorMapping mapping = null;
+   try {
+     BufferedReader br = new BufferedReader(new StringReader(
+         new String(data, "US-ASCII")));
+     StringBuilder scrubbed = new StringBuilder();
+     String line = null, ipAddress = null, hashedBridgeIdentity = null,
+         published = null;
+     boolean skipCrypto = false, contactWritten = false;
+     while ((line = br.readLine()) != null) {
+
+       /* When we have parsed both published and fingerprint line, look
+        * up descriptor in the descriptor mapping or create a new one if
+        * there is none. This runs at the top of the iteration following
+        * the second of these two lines. */
+       if (mapping == null && published != null &&
+           hashedBridgeIdentity != null) {
+         String mappingKey = hashedBridgeIdentity + "," + published;
+         if (this.bridgeDescriptorMappings.containsKey(mappingKey)) {
+           mapping = this.bridgeDescriptorMappings.get(mappingKey);
+         } else {
+           mapping = new DescriptorMapping(hashedBridgeIdentity,
+               published);
+           this.bridgeDescriptorMappings.put(mappingKey, mapping);
+         }
+
+         /* Look up IP address in the GeoIP database. If our knowledge
+          * about the bridge's country code has changed, we might have to
+          * re-write the extra-info descriptor corresponding to this
+          * server descriptor. */
+         String newCountryCode = this.gd.getCountryForIPOneWeek(ipAddress,
+             published);
+         if (!newCountryCode.equals(mapping.countryCode)) {
+           mapping.countryCode = newCountryCode;
+           if (!mapping.extraInfoDescriptorIdentifier.equals(
+               NULL_REFERENCE)) {
+             this.rewriteExtraInfoDescriptor(mapping);
+           }
+         }
+       }
+
+       /* Skip all crypto parts that might be used to derive the bridge's
+        * identity fingerprint. */
+       if (skipCrypto && !line.startsWith("-----END ")) {
+         continue;
+
+       /* Parse the original IP address for looking it up in the GeoIP
+        * database and replace it with 127.0.0.1 in the scrubbed
+        * version. Everything buffered before the router line is
+        * discarded, and the nickname is replaced when the descriptor is
+        * assembled at the router-signature line. */
+       } else if (line.startsWith("router ")) {
+         String[] parts = line.split(" ");
+         ipAddress = parts[2];
+         scrubbed = new StringBuilder("127.0.0.1 " + parts[3] + " "
+             + parts[4] + " " + parts[5] + "\n");
+
+       /* Parse the publication time and add it to the list of descriptor
+        * publication times to re-write network statuses at the end of
+        * the sanitizing procedure. */
+       } else if (line.startsWith("published ")) {
+         published = line.substring("published ".length());
+         this.descriptorPublicationTimes.add(published);
+         scrubbed.append(line + "\n");
+
+       /* Parse the fingerprint to determine the hashed bridge
+        * identity, and write the hash in groups of four hex characters
+        * in place of the original fingerprint. */
+       } else if (line.startsWith("opt fingerprint ")) {
+         String fingerprint = line.substring("opt fingerprint".length()).
+             replaceAll(" ", "").toLowerCase();
+         hashedBridgeIdentity = DigestUtils.shaHex(Hex.decodeHex(
+             fingerprint.toCharArray())).toLowerCase();
+         scrubbed.append("opt fingerprint");
+         for (int i = 0; i < hashedBridgeIdentity.length() / 4; i++) {
+           scrubbed.append(" " + hashedBridgeIdentity.substring(4 * i,
+               4 * (i + 1)).toUpperCase());
+         }
+         scrubbed.append("\n");
+
+       /* Replace the contact line (if present) with a generic line that
+        * contains the bridge's country code as last two characters. If
+        * the mapping is not known yet, the resulting exception is caught
+        * below and the descriptor is not written. */
+       } else if (line.startsWith("contact ")) {
+         scrubbed.append("contact somebody at example dot "
+             + mapping.countryCode.toLowerCase() + "\n");
+         contactWritten = true;
+
+       /* When we reach the signature, we're done. Write the sanitized
+        * descriptor to disk below. */
+       } else if (line.startsWith("router-signature")) {
+         scrubbedDesc = "router Unnamed"
+             + mapping.countryCode.toUpperCase() + " "
+             + scrubbed.toString();
+         break;
+
+       /* Replace extra-info digest with the one we know from our
+        * descriptor mapping (which might be all 0's if we didn't parse
+        * the extra-info descriptor before). */
+       } else if (line.startsWith("opt extra-info-digest ")) {
+         scrubbed.append("opt extra-info-digest "
+             + mapping.extraInfoDescriptorIdentifier.toUpperCase()
+             + "\n");
+
+       /* Before writing the exit policy, check if we wrote a contact
+        * line before. If not, there was no contact line in the original
+        * descriptor. In that case, add a generic contact line with the
+        * bridge's country code as last two characters. */
+       } else if (line.startsWith("reject ")
+           || line.startsWith("accept ")) {
+         if (!contactWritten) {
+           scrubbed.append("contact nobody at example dot "
+               + mapping.countryCode.toLowerCase() + "\n");
+           contactWritten = true;
+         }
+         scrubbed.append(line + "\n");
+
+       /* Write the following lines unmodified to the sanitized
+        * descriptor. */
+       } else if (line.startsWith("platform ")
+           || line.startsWith("opt protocols ")
+           || line.startsWith("uptime ")
+           || line.startsWith("bandwidth ")
+           || line.startsWith("opt hibernating ")
+           || line.equals("opt hidden-service-dir")
+           || line.equals("opt caches-extra-info")
+           || line.equals("opt allow-single-hop-exits")) {
+         scrubbed.append(line + "\n");
+
+       /* Replace node fingerprints in the family line with their hashes
+        * and nicknames with Unnamed. */
+       } else if (line.startsWith("family ")) {
+         StringBuilder familyLine = new StringBuilder("family");
+         for (String s : line.substring(7).split(" ")) {
+           if (s.startsWith("$")) {
+             familyLine.append(" $" + DigestUtils.shaHex(Hex.decodeHex(
+                 s.substring(1).toCharArray())).toUpperCase());
+           } else {
+             familyLine.append(" Unnamed");
+           }
+         }
+         scrubbed.append(familyLine.toString() + "\n");
+
+       /* Skip the purpose line that the bridge authority adds to its
+        * cached-descriptors file. */
+       } else if (line.startsWith("@purpose ")) {
+         continue;
+
+       /* Skip all crypto parts that might leak the bridge's identity
+        * fingerprint. */
+       } else if (line.startsWith("-----BEGIN ")
+           || line.equals("onion-key") || line.equals("signing-key")) {
+         skipCrypto = true;
+
+       /* Stop skipping lines when the crypto parts are over. */
+       } else if (line.startsWith("-----END ")) {
+         skipCrypto = false;
+
+       /* If we encounter an unrecognized line, stop parsing and print
+        * out a warning. We might have overlooked sensitive information
+        * that we need to remove or replace for the sanitized descriptor
+        * version. */
+       } else {
+         this.logger.warning("Unrecognized line '" + line
+             + "'. Skipping.");
+         return;
+       }
+     }
+     br.close();
+   } catch (Exception e) {
+     this.logger.log(Level.WARNING, "Could not parse server "
+         + "descriptor.", e);
+     return;
+   }
+
+   /* The sanitized descriptor is only assembled at the router-signature
+    * line. Without that line there is nothing to hash or write. */
+   if (scrubbedDesc == null) {
+     this.logger.warning("Server descriptor does not contain a "
+         + "router-signature line. Not writing to disk!");
+     return;
+   }
+
+   /* Determine new descriptor digest and write it to descriptor
+    * mapping. */
+   String scrubbedHash = DigestUtils.shaHex(scrubbedDesc);
+   mapping.serverDescriptorIdentifier = scrubbedHash;
+
+   /* Determine filename of sanitized server descriptor. */
+   String dyear = mapping.published.substring(0, 4);
+   String dmonth = mapping.published.substring(5, 7);
+   String dday = mapping.published.substring(8, 10);
+   File newFile = new File(this.sanitizedBridgesDir + "/"
+       + dyear + "/" + dmonth + "/server-descriptors/" + dday
+       + "/" + scrubbedHash.charAt(0) + "/"
+       + scrubbedHash.charAt(1) + "/"
+       + scrubbedHash);
+
+   /* Write sanitized server descriptor to disk, including all its parent
+    * directories. The writer is closed even if writing fails. */
+   try {
+     newFile.getParentFile().mkdirs();
+     BufferedWriter bw = new BufferedWriter(new FileWriter(newFile));
+     try {
+       bw.write(scrubbedDesc);
+     } finally {
+       bw.close();
+     }
+   } catch (IOException e) {
+     this.logger.log(Level.WARNING, "Could not write sanitized server "
+         + "descriptor to disk.", e);
+   }
+ }
+
+ /**
+  * Sanitizes an extra-info descriptor and writes it to disk. Looks up
+  * the bridge identity hash and publication time in the descriptor
+  * mapping. If the corresponding server descriptor was sanitized before,
+  * it is re-written to include the new extra-info descriptor digest.
+  * The publication time is noted down, too, so that all network statuses
+  * possibly referencing this extra-info descriptor and its corresponding
+  * server descriptor can be re-written at the end of the sanitizing
+  * procedure.
+  *
+  * Nothing is written to disk if the descriptor does not start with an
+  * extra-info line, contains an unrecognized line, or lacks its
+  * published or router-signature line.
+  *
+  * @param data Raw extra-info descriptor bytes; decoded as US-ASCII.
+  */
+ public void sanitizeAndStoreExtraInfoDescriptor(byte[] data) {
+
+   /* Parse descriptor to generate a sanitized version and to look it up
+    * in the descriptor mapping. */
+   String scrubbedDesc = null;
+   DescriptorMapping mapping = null;
+   try {
+     BufferedReader br = new BufferedReader(new StringReader(new String(
+         data, "US-ASCII")));
+     String line = null;
+     StringBuilder scrubbed = null;
+     String hashedBridgeIdentity = null, published = null;
+     while ((line = br.readLine()) != null) {
+
+       /* When we have parsed both published and fingerprint line, look
+        * up descriptor in the descriptor mapping or create a new one if
+        * there is none. */
+       if (mapping == null && published != null &&
+           hashedBridgeIdentity != null) {
+         String mappingKey = hashedBridgeIdentity + "," + published;
+         if (this.bridgeDescriptorMappings.containsKey(mappingKey)) {
+           mapping = this.bridgeDescriptorMappings.get(mappingKey);
+         } else {
+           mapping = new DescriptorMapping(hashedBridgeIdentity,
+               published);
+           this.bridgeDescriptorMappings.put(mappingKey, mapping);
+         }
+       }
+
+       /* The buffer for the sanitized descriptor is only created by the
+        * extra-info line. Anything that comes before that line cannot
+        * be sanitized, so give up instead of failing on a null
+        * buffer. */
+       if (scrubbed == null && !line.startsWith("extra-info ")) {
+         this.logger.warning("Extra-info descriptor does not start with "
+             + "an extra-info line. Skipping.");
+         return;
+       }
+
+       /* Parse bridge identity from extra-info line and replace it with
+        * its hash in the sanitized descriptor. */
+       if (line.startsWith("extra-info ")) {
+         String[] parts = line.split(" ");
+         if (parts.length < 3) {
+           this.logger.warning("Unrecognized line '" + line
+               + "'. Skipping");
+           return;
+         }
+         hashedBridgeIdentity = DigestUtils.shaHex(Hex.decodeHex(
+             parts[2].toCharArray())).toLowerCase();
+         scrubbed = new StringBuilder(hashedBridgeIdentity.toUpperCase()
+             + "\n");
+
+       /* Parse the publication time and add it to the list of descriptor
+        * publication times to re-write network statuses at the end of
+        * the sanitizing procedure. */
+       } else if (line.startsWith("published ")) {
+         scrubbed.append(line + "\n");
+         published = line.substring("published ".length());
+         this.descriptorPublicationTimes.add(published);
+
+       /* Write the following lines unmodified to the sanitized
+        * descriptor. */
+       } else if (line.startsWith("write-history ")
+           || line.startsWith("read-history ")
+           || line.startsWith("geoip-start-time ")
+           || line.startsWith("geoip-client-origins ")
+           || line.startsWith("bridge-stats-end ")
+           || line.startsWith("bridge-ips ")) {
+         scrubbed.append(line + "\n");
+
+       /* When we reach the signature, we're done. Write the sanitized
+        * descriptor to disk below. Without a published line there is no
+        * mapping and hence no country code to put into the nickname. */
+       } else if (line.startsWith("router-signature")) {
+         if (mapping == null) {
+           this.logger.warning("Extra-info descriptor does not contain "
+               + "a published line. Skipping.");
+           return;
+         }
+         scrubbedDesc = "extra-info Unnamed"
+             + mapping.countryCode + " " + scrubbed.toString();
+         break;
+
+       /* Don't include statistics that should only be contained in relay
+        * extra-info descriptors. */
+       } else if (line.startsWith("dirreq-") || line.startsWith("cell-")
+           || line.startsWith("exit-")) {
+         continue;
+
+       /* If we encounter an unrecognized line, stop parsing and print
+        * out a warning. We might have overlooked sensitive information
+        * that we need to remove or replace for the sanitized descriptor
+        * version. */
+       } else {
+         this.logger.warning("Unrecognized line '" + line
+             + "'. Skipping");
+         return;
+       }
+     }
+     br.close();
+   } catch (IOException e) {
+     this.logger.log(Level.WARNING, "Could not parse extra-info "
+         + "descriptor.", e);
+     return;
+   } catch (DecoderException e) {
+     this.logger.log(Level.WARNING, "Could not parse extra-info "
+         + "descriptor.", e);
+     return;
+   }
+
+   /* The sanitized descriptor is only assembled at the router-signature
+    * line. Without that line there is nothing to hash or write. */
+   if (scrubbedDesc == null) {
+     this.logger.warning("Extra-info descriptor does not contain a "
+         + "router-signature line. Not writing to disk!");
+     return;
+   }
+
+   /* Determine the new descriptor digest and store it in the descriptor
+    * mapping. If the digest differs from the one we knew before and the
+    * server descriptor was already sanitized, re-write the server
+    * descriptor so that it references the new digest. */
+   String scrubbedDescHash = DigestUtils.shaHex(scrubbedDesc);
+   boolean extraInfoDescriptorIdentifierHasChanged =
+       !scrubbedDescHash.equals(mapping.extraInfoDescriptorIdentifier);
+   mapping.extraInfoDescriptorIdentifier = scrubbedDescHash;
+   if (extraInfoDescriptorIdentifierHasChanged &&
+       !mapping.serverDescriptorIdentifier.equals(NULL_REFERENCE)) {
+     this.rewriteServerDescriptor(mapping);
+   }
+
+   /* Determine filename of sanitized extra-info descriptor. */
+   String dyear = mapping.published.substring(0, 4);
+   String dmonth = mapping.published.substring(5, 7);
+   String dday = mapping.published.substring(8, 10);
+   File newFile = new File(this.sanitizedBridgesDir + "/"
+       + dyear + "/" + dmonth + "/extra-infos/" + dday
+       + "/" + scrubbedDescHash.charAt(0) + "/"
+       + scrubbedDescHash.charAt(1) + "/"
+       + scrubbedDescHash);
+
+   /* Write sanitized extra-info descriptor to disk, including all its
+    * parent directories. The writer is closed even if writing fails. */
+   try {
+     newFile.getParentFile().mkdirs();
+     BufferedWriter bw = new BufferedWriter(new FileWriter(newFile));
+     try {
+       bw.write(scrubbedDesc);
+     } finally {
+       bw.close();
+     }
+   } catch (IOException e) {
+     this.logger.log(Level.WARNING, "Could not write sanitized "
+         + "extra-info descriptor to disk.", e);
+   }
+ }
+
+ public void storeSanitizedNetworkStatus(byte[] data, String published) {
+ String scrubbed = null;
+ try {
+ String ascii = new String(data, "US-ASCII");
+ BufferedReader br2 = new BufferedReader(new StringReader(ascii));
+ StringBuilder sb = new StringBuilder();
+ String line = null;
+ while ((line = br2.readLine()) != null) {
+ if (line.startsWith("r ")) {
+ String readCountryCode = line.split(" ")[1].substring(
+ "Unnamed".length());
+ String hashedBridgeIdentity = Hex.encodeHexString(
+ Base64.decodeBase64(line.split(" ")[2] + "==")).
+ toLowerCase();
+ String hashedBridgeIdentityBase64 =
+ Base64.encodeBase64String(DigestUtils.sha(
+ Base64.decodeBase64(line.split(" ")[2] + "=="))).
+ substring(0, 27);
+ String readServerDescId = Hex.encodeHexString(
+ Base64.decodeBase64(line.split(" ")[3] + "==")).
+ toLowerCase();
+ String descPublished = line.split(" ")[4] + " "
+ + line.split(" ")[5];
+ String mappingKey = (hashedBridgeIdentity + ","
+ + descPublished).toLowerCase();
+ DescriptorMapping mapping = null;
+ if (this.bridgeDescriptorMappings.containsKey(mappingKey)) {
+ mapping = this.bridgeDescriptorMappings.get(mappingKey);
+ } else {
+ mapping = new DescriptorMapping(hashedBridgeIdentity.
+ toLowerCase(), descPublished);
+ mapping.countryCode = readCountryCode;
+ mapping.serverDescriptorIdentifier = readServerDescId;
+ this.bridgeDescriptorMappings.put(mappingKey, mapping);
+ }
+ String nickname = "Unnamed" + mapping.countryCode;
+ String sdi = Base64.encodeBase64String(Hex.decodeHex(
+ mapping.serverDescriptorIdentifier.toCharArray())).
+ substring(0, 27);
+ String orPort = line.split(" ")[7];
+ String dirPort = line.split(" ")[8];
+ sb.append("r " + nickname + " "
+ + hashedBridgeIdentityBase64 + " " + sdi + " "
+ + descPublished + " 127.0.0.1 " + orPort + " "
+ + dirPort + "\n");
+ } else {
+ sb.append(line + "\n");
+ }
+ }
+ scrubbed = sb.toString();
+ br2.close();
+ } catch (DecoderException e) {
+ this.logger.log(Level.WARNING, "Could not parse server descriptor "
+ + "identifier. This must be a bug.", e);
+ return;
+ } catch (IOException e) {
+ this.logger.log(Level.WARNING, "Could not parse previously "
+ + "sanitized network status.", e);
+ return;
+ }
+
+ try {
+ /* Determine file name. */
+ String syear = published.substring(0, 4);
+ String smonth = published.substring(5, 7);
+ String sday = published.substring(8, 10);
+ String stime = published.substring(11, 13)
+ + published.substring(14, 16)
+ + published.substring(17, 19);
+ File statusFile = new File(this.sanitizedBridgesDir + "/" + syear
+ + "/" + smonth + "/statuses/" + sday + "/" + syear + smonth
+ + sday + "-" + stime + "-"
+ + "4A0CCD2DDC7995083D73F5D667100C8A5831F16D");
+
+ /* Create all parent directories to write this network status. */
+ statusFile.getParentFile().mkdirs();
+
+ /* Write sanitized network status to disk. */
+ BufferedWriter bw = new BufferedWriter(new FileWriter(statusFile));
+ bw.write(scrubbed);
+ bw.close();
+ } catch (IOException e) {
+ this.logger.log(Level.WARNING, "Could not write previously "
+ + "sanitized network status.", e);
+ return;
+ }
+ }
+
+ /**
+  * Re-generates a previously sanitized server descriptor from the
+  * descriptor mapping and writes it to disk. The nickname is rebuilt
+  * from the mapping's country code and the extra-info digest is replaced
+  * with the one stored in the mapping; all other lines are copied. The
+  * digest of the result is stored in the mapping as the new server
+  * descriptor identifier and determines the file name.
+  *
+  * Nothing is written if the mapping cannot be determined because the
+  * published or fingerprint line is missing.
+  *
+  * @param data Bytes of the sanitized server descriptor; decoded as
+  *     US-ASCII.
+  */
+ public void storeSanitizedServerDescriptor(byte[] data) {
+   try {
+     String ascii = new String(data, "US-ASCII");
+     BufferedReader br2 = new BufferedReader(new StringReader(ascii));
+     StringBuilder sb = new StringBuilder();
+     String line2 = null, published = null;
+     String hashedBridgeIdentity = null;
+     DescriptorMapping mapping = null;
+     while ((line2 = br2.readLine()) != null) {
+
+       /* Once both published and fingerprint line are known, look up
+        * the descriptor mapping or create a new one. */
+       if (mapping == null && published != null &&
+           hashedBridgeIdentity != null) {
+         String mappingKey = (hashedBridgeIdentity + "," + published).
+             toLowerCase();
+         if (this.bridgeDescriptorMappings.containsKey(mappingKey)) {
+           mapping = this.bridgeDescriptorMappings.get(mappingKey);
+         } else {
+           mapping = new DescriptorMapping(hashedBridgeIdentity.
+               toLowerCase(), published);
+           this.bridgeDescriptorMappings.put(mappingKey, mapping);
+         }
+       }
+       if (line2.startsWith("router ")) {
+
+         /* Keep only the three ports; "router Unnamed" plus country
+          * code is prepended when the descriptor is assembled. */
+         String[] parts = line2.split(" ");
+         sb.append(" 127.0.0.1 " + parts[3] + " "
+             + parts[4] + " " + parts[5]
+             + "\n");
+       } else if (line2.startsWith("published ")) {
+         published = line2.substring("published ".length());
+         sb.append(line2 + "\n");
+         this.descriptorPublicationTimes.add(published);
+       } else if (line2.startsWith("opt fingerprint ")) {
+
+         /* The fingerprint in a sanitized descriptor already is the
+          * hashed identity, so it is only normalized, not hashed. */
+         hashedBridgeIdentity = line2.substring("opt fingerprint".
+             length()).replaceAll(" ", "").toLowerCase();
+         sb.append(line2 + "\n");
+       } else if (line2.startsWith("opt extra-info-digest ")) {
+         if (mapping == null) {
+           /* Digest line before published/fingerprint: we cannot know
+            * which mapping to use. Handled by the check below. */
+           break;
+         }
+         sb.append("opt extra-info-digest "
+             + mapping.extraInfoDescriptorIdentifier.toUpperCase()
+             + "\n");
+       } else {
+         /* NOTE(review): contact lines carry the country code, too, but
+          * are copied unchanged here -- confirm whether they should be
+          * updated when the country code changes. */
+         sb.append(line2 + "\n");
+       }
+     }
+     br2.close();
+     if (mapping == null) {
+       this.logger.warning("Sanitized server descriptor lacks a "
+           + "published or fingerprint line. Not writing to disk!");
+       return;
+     }
+     String scrubbedDesc = "router Unnamed" + mapping.countryCode
+         + sb.toString();
+     String scrubbedHash = DigestUtils.shaHex(scrubbedDesc);
+
+     mapping.serverDescriptorIdentifier = scrubbedHash;
+     String dyear = published.substring(0, 4);
+     String dmonth = published.substring(5, 7);
+     String dday = published.substring(8, 10);
+     File newFile = new File(this.sanitizedBridgesDir + "/"
+         + dyear + "/" + dmonth + "/server-descriptors/" + dday
+         + "/" + scrubbedHash.substring(0, 1) + "/"
+         + scrubbedHash.substring(1, 2) + "/"
+         + scrubbedHash);
+     this.logger.finer("Storing server descriptor "
+         + newFile.getAbsolutePath());
+     newFile.getParentFile().mkdirs();
+
+     /* The writer is closed even if writing fails. */
+     BufferedWriter bw = new BufferedWriter(new FileWriter(
+         newFile));
+     try {
+       bw.write(scrubbedDesc);
+     } finally {
+       bw.close();
+     }
+   } catch (IOException e) {
+     this.logger.log(Level.WARNING, "Could not store sanitized server "
+         + "descriptor.", e);
+   }
+ }
+
+ /**
+  * Re-generates a previously sanitized extra-info descriptor from the
+  * descriptor mapping and writes it to disk. The nickname is rebuilt
+  * from the mapping's country code; the digest of the result is stored
+  * in the mapping as the new extra-info descriptor identifier and
+  * determines the file name.
+  *
+  * Nothing is written if the mapping cannot be determined because the
+  * extra-info or published line is missing.
+  *
+  * @param data Bytes of the sanitized extra-info descriptor; decoded as
+  *     US-ASCII.
+  */
+ public void storeSanitizedExtraInfoDescriptor(byte[] data) {
+   try {
+     String ascii = new String(data, "US-ASCII");
+     BufferedReader br2 = new BufferedReader(new StringReader(ascii));
+     StringBuilder sb = new StringBuilder();
+     String line2 = null, published = null;
+     String hashedBridgeIdentity = null;
+     DescriptorMapping mapping = null;
+     while ((line2 = br2.readLine()) != null) {
+       if (line2.startsWith("extra-info ")) {
+
+         /* The identity in a sanitized descriptor already is the hashed
+          * identity, so it is copied as-is. */
+         hashedBridgeIdentity = line2.split(" ")[2];
+         sb.append(hashedBridgeIdentity + "\n");
+       } else if (line2.startsWith("published ")) {
+         sb.append(line2 + "\n");
+         published = line2.substring("published ".length());
+         this.descriptorPublicationTimes.add(published);
+       } else if (line2.startsWith(
+           "contact somebody at example dot ") ||
+           line2.startsWith("contact nobody at example dot ")) {
+         if (mapping == null) {
+           /* Contact line before extra-info/published: we cannot know
+            * which country code to use. Handled by the check below. */
+           break;
+         }
+
+         /* Replace the country code at the end of the generic contact
+          * line with the one from the mapping. */
+         sb.append(line2.substring(0, line2.indexOf("dot ")
+             + "dot ".length()) + mapping.countryCode.toLowerCase()
+             + "\n");
+       } else {
+         sb.append(line2 + "\n");
+       }
+
+       /* Once both identity and published line are known, look up the
+        * descriptor mapping or create a new one. Doing this at the end
+        * of the iteration also covers descriptors that end right after
+        * these two lines. */
+       if (mapping == null && published != null &&
+           hashedBridgeIdentity != null) {
+         String mappingKey = (hashedBridgeIdentity + "," + published).
+             toLowerCase();
+         if (this.bridgeDescriptorMappings.containsKey(mappingKey)) {
+           mapping = this.bridgeDescriptorMappings.get(mappingKey);
+         } else {
+           mapping = new DescriptorMapping(hashedBridgeIdentity.
+               toLowerCase(), published);
+           this.bridgeDescriptorMappings.put(mappingKey, mapping);
+         }
+       }
+     }
+     br2.close();
+     if (mapping == null) {
+       this.logger.warning("Sanitized extra-info descriptor lacks an "
+           + "extra-info or published line. Not writing to disk!");
+       return;
+     }
+     String scrubbedDesc = "extra-info Unnamed"
+         + mapping.countryCode.toUpperCase() + " " + sb.toString();
+     String scrubbedHash = DigestUtils.shaHex(scrubbedDesc);
+     mapping.extraInfoDescriptorIdentifier = scrubbedHash;
+     String dyear = published.substring(0, 4);
+     String dmonth = published.substring(5, 7);
+     String dday = published.substring(8, 10);
+     File newFile = new File(this.sanitizedBridgesDir + "/"
+         + dyear + "/" + dmonth + "/extra-infos/" + dday + "/"
+         + scrubbedHash.substring(0, 1) + "/"
+         + scrubbedHash.substring(1, 2) + "/"
+         + scrubbedHash);
+     this.logger.finer("Storing extra-info descriptor "
+         + newFile.getAbsolutePath());
+     newFile.getParentFile().mkdirs();
+
+     /* The writer is closed even if writing fails. */
+     BufferedWriter bw = new BufferedWriter(new FileWriter(
+         newFile));
+     try {
+       bw.write(scrubbedDesc);
+     } finally {
+       bw.close();
+     }
+   } catch (IOException e) {
+     this.logger.log(Level.WARNING, "Could not store sanitized "
+         + "extra-info descriptor.", e);
+   }
+ }
+
+ /**
+  * Reads a sanitized network status from disk and passes its content to
+  * storeSanitizedNetworkStatus together with the given publication
+  * time. Read failures are logged as warnings.
+  *
+  * @param status Sanitized network status file to read.
+  * @param published Publication time of the status, passed on
+  *     unchanged.
+  */
+ private void rewriteNetworkStatus(File status, String published) {
+   try {
+     ByteArrayOutputStream baos = new ByteArrayOutputStream();
+     BufferedInputStream bis = new BufferedInputStream(
+         new FileInputStream(status));
+     try {
+       int len;
+       byte[] buffer = new byte[1024];
+       while ((len = bis.read(buffer, 0, 1024)) >= 0) {
+         baos.write(buffer, 0, len);
+       }
+     } finally {
+       /* Close the file even if reading fails. */
+       bis.close();
+     }
+     this.storeSanitizedNetworkStatus(baos.toByteArray(), published);
+   } catch (IOException e) {
+     this.logger.log(Level.WARNING, "Could not rewrite network "
+         + "status.", e);
+   }
+ }
+
+ /**
+  * Reads the sanitized server descriptor that the given mapping refers
+  * to from disk and passes its content to storeSanitizedServerDescriptor.
+  * The old file is deleted afterwards, but only if the mapping's server
+  * descriptor identifier has changed; otherwise the old path is where
+  * the current descriptor lives. Read failures are logged as warnings.
+  *
+  * @param mapping Descriptor mapping whose server descriptor is to be
+  *     re-written.
+  */
+ private void rewriteServerDescriptor(DescriptorMapping mapping) {
+   try {
+     /* Remember the identifier before storing updates it. */
+     String oldIdentifier = mapping.serverDescriptorIdentifier;
+     String dyear = mapping.published.substring(0, 4);
+     String dmonth = mapping.published.substring(5, 7);
+     String dday = mapping.published.substring(8, 10);
+     File serverDescriptorFile = new File(
+         this.sanitizedBridgesDir + "/"
+         + dyear + "/" + dmonth + "/server-descriptors/" + dday
+         + "/" + oldIdentifier.substring(0, 1) + "/"
+         + oldIdentifier.substring(1, 2) + "/"
+         + oldIdentifier);
+     ByteArrayOutputStream baos = new ByteArrayOutputStream();
+     BufferedInputStream bis = new BufferedInputStream(
+         new FileInputStream(serverDescriptorFile));
+     try {
+       int len;
+       byte[] buffer = new byte[1024];
+       while ((len = bis.read(buffer, 0, 1024)) >= 0) {
+         baos.write(buffer, 0, len);
+       }
+     } finally {
+       /* Close the file even if reading fails. */
+       bis.close();
+     }
+     this.storeSanitizedServerDescriptor(baos.toByteArray());
+
+     /* Only remove the old file if the descriptor now lives under a
+      * different identifier. */
+     if (!oldIdentifier.equals(mapping.serverDescriptorIdentifier)) {
+       serverDescriptorFile.delete();
+       this.logger.finer("Deleting server descriptor "
+           + serverDescriptorFile.getAbsolutePath());
+     }
+   } catch (IOException e) {
+     this.logger.log(Level.WARNING, "Could not rewrite server "
+         + "descriptor.", e);
+   }
+ }
+
+ /**
+  * Reads the sanitized extra-info descriptor that the given mapping
+  * refers to from disk and passes its content to
+  * storeSanitizedExtraInfoDescriptor. The old file is deleted
+  * afterwards, but only if the mapping's extra-info descriptor
+  * identifier has changed; otherwise the old path is where the current
+  * descriptor lives. Read failures are logged as warnings.
+  *
+  * @param mapping Descriptor mapping whose extra-info descriptor is to
+  *     be re-written.
+  */
+ private void rewriteExtraInfoDescriptor(DescriptorMapping mapping) {
+   try {
+     /* Remember the identifier before storing updates it. */
+     String oldIdentifier = mapping.extraInfoDescriptorIdentifier;
+     String dyear = mapping.published.substring(0, 4);
+     String dmonth = mapping.published.substring(5, 7);
+     String dday = mapping.published.substring(8, 10);
+     File extraInfoDescriptorFile = new File(
+         this.sanitizedBridgesDir + "/"
+         + dyear + "/" + dmonth + "/extra-infos/" + dday + "/"
+         + oldIdentifier.substring(0, 1) + "/"
+         + oldIdentifier.substring(1, 2) + "/"
+         + oldIdentifier);
+     ByteArrayOutputStream baos = new ByteArrayOutputStream();
+     BufferedInputStream bis = new BufferedInputStream(
+         new FileInputStream(extraInfoDescriptorFile));
+     try {
+       int len;
+       byte[] buffer = new byte[1024];
+       while ((len = bis.read(buffer, 0, 1024)) >= 0) {
+         baos.write(buffer, 0, len);
+       }
+     } finally {
+       /* Close the file even if reading fails. */
+       bis.close();
+     }
+     this.storeSanitizedExtraInfoDescriptor(baos.toByteArray());
+
+     /* Only remove the old file if the descriptor now lives under a
+      * different identifier. */
+     if (!oldIdentifier.equals(mapping.extraInfoDescriptorIdentifier)) {
+       extraInfoDescriptorFile.delete();
+       this.logger.finer("Deleting extra-info descriptor "
+           + extraInfoDescriptorFile.getAbsolutePath());
+     }
+   } catch (IOException e) {
+     /* Log like the sibling rewrite methods instead of printing the
+      * stack trace to stderr. */
+     this.logger.log(Level.WARNING, "Could not rewrite extra-info "
+         + "descriptor.", e);
+   }
+ }
+
+ /**
+ * Rewrite all network statuses that might contain references to server
+ * descriptors we added or updated in this execution. This applies to
+ * all statuses that have been published up to 24 hours after any added
+ * or updated server descriptor.
+ */
+ public void finishWriting() {
+
+ /* Prepare parsing and formatting timestamps. */
+ SimpleDateFormat dateTimeFormat =
+ new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ dateTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+ dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ SimpleDateFormat statusFileFormat =
+ new SimpleDateFormat("yyyyMMdd-HHmmss");
+ statusFileFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+
+ /* Iterate over publication timestamps of previously sanitized
+ * descriptors. For every publication timestamp, we want to re-write
+ * the network statuses that we published up to 24 hours after that
+ * descriptor. We keep the timestamp of the last re-written network
+ * status in order to make sure we re-writing any network status at
+ * most once. */
+ String lastRewrittenStatusMinus24Hours = "1970-01-01 00:00:00";
+ for (String published : this.descriptorPublicationTimes) {
+ if (published.compareTo(lastRewrittenStatusMinus24Hours) <= 0) {
+ continue;
+ }
+ // find statuses 24 hours after published
+ SortedSet<File> statusesToRewrite = new TreeSet<File>();
+ long publishedTime;
+ try {
+ publishedTime = dateTimeFormat.parse(published).getTime();
+ } catch (ParseException e) {
+ this.logger.log(Level.WARNING, "Could not parse publication "
+ + "timestamp '" + published + "'. Skipping.", e);
+ continue;
+ }
+ String[] dayOne = dateFormat.format(publishedTime).split("-");
+
+ File publishedDayOne = new File(this.sanitizedBridgesDir + "/"
+ + dayOne[0] + "/" + dayOne[1] + "/statuses/" + dayOne[2]);
+ if (publishedDayOne.exists()) {
+ statusesToRewrite.addAll(Arrays.asList(publishedDayOne.
+ listFiles()));
+ }
+ long plus24Hours = publishedTime + 24L * 60L * 60L * 1000L;
+ String[] dayTwo = dateFormat.format(plus24Hours).split("-");
+ File publishedDayTwo = new File(this.sanitizedBridgesDir + "/"
+ + dayTwo[0] + "/" + dayTwo[1] + "/statuses/" + dayTwo[2]);
+ if (publishedDayTwo.exists()) {
+ statusesToRewrite.addAll(Arrays.asList(publishedDayTwo.
+ listFiles()));
+ }
+ for (File status : statusesToRewrite) {
+ String statusPublished = status.getName().substring(0, 15);
+ long statusTime;
+ try {
+ statusTime = statusFileFormat.parse(statusPublished).getTime();
+ } catch (ParseException e) {
+ this.logger.log(Level.WARNING, "Could not parse network "
+ + "status publication timestamp '" + published
+ + "'. Skipping.", e);
+ continue;
+ }
+ if (statusTime < publishedTime || statusTime > plus24Hours) {
+ continue;
+ }
+ this.rewriteNetworkStatus(status,
+ dateTimeFormat.format(statusTime));
+ lastRewrittenStatusMinus24Hours = dateTimeFormat.format(
+ statusTime - 24L * 60L * 60L * 1000L);
+ }
+ }
+
+ /* Write descriptor mappings to disk. */
+ try {
+ BufferedWriter bw = new BufferedWriter(new FileWriter(
+ this.bridgeDescriptorMappingsFile));
+ for (DescriptorMapping mapping :
+ this.bridgeDescriptorMappings.values()) {
+ bw.write(mapping.toString() + "\n");
+ }
+ bw.close();
+ } catch (IOException e) {
+ this.logger.log(Level.WARNING, "Could not write descriptor "
+ + "mappings to disk.", e);
+ }
+ }
+}
+
--
1.6.5
More information about the tor-commits
mailing list