[tor-commits] [metrics-tasks/master] Add #4499 sources and tech report draft.
karsten at torproject.org
karsten at torproject.org
Mon Feb 6 14:55:06 UTC 2012
commit 9b1e4846c3d602f8097f2700cf228c3df3765e98
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Mon Feb 6 15:54:16 2012 +0100
Add #4499 sources and tech report draft.
---
task-4499/.gitignore | 9 +
.../GenerateSampleBridgeDescriptorTarballs.java | 266 ++++++++++++++++++++
task-4499/README | 56 ++++
task-4499/bridge-scaling.R | 23 ++
task-4499/bridge-scaling.csv | 24 ++
task-4499/bridge-scaling.tex | 117 +++++++++
6 files changed, 495 insertions(+), 0 deletions(-)
diff --git a/task-4499/.gitignore b/task-4499/.gitignore
new file mode 100644
index 0000000..44c09e8
--- /dev/null
+++ b/task-4499/.gitignore
@@ -0,0 +1,9 @@
+*.class
+*.jar
+in/
+out/
+*.png
+*.aux
+*.log
+*.pdf
+
diff --git a/task-4499/GenerateSampleBridgeDescriptorTarballs.java b/task-4499/GenerateSampleBridgeDescriptorTarballs.java
new file mode 100755
index 0000000..a43fd73
--- /dev/null
+++ b/task-4499/GenerateSampleBridgeDescriptorTarballs.java
@@ -0,0 +1,266 @@
+import java.io.*;
+import java.util.*;
+import org.apache.commons.codec.*;
+import org.apache.commons.codec.digest.*;
+import org.apache.commons.codec.binary.*;
+
+/* Generate sample bridge descriptor tarball contents for metrics-db and
+ * BridgeDB load tests. Accept an extracted, non-sanitized bridge
+ * descriptor tarball as input and generate sample tarball contents with
+ * multiples of bridges up to a given maximum multiplier as output.
+ * Descriptors are multiplied by overwriting the first four hex characters
+ * of bridge fingerprints with 0000, 0001, etc., keeping references
+ * between descriptors intact.
+ *
+ * NOTE THAT THE OUTPUT TARBALL CONTENTS ARE NOT SANITIZED!
+ *
+ * The changes are only sufficient to trick metrics-db and BridgeDB that
+ * bridges are distinct. Descriptors may still contain original IP
+ * addresses in exit policies and other contact information. Sanitized
+ * descriptors could not be used as input, because they may have skewed
+ * results too much. */
+public class GenerateSampleBridgeDescriptorTarballs {
+ /* Read the original tarball contents once, then write sample
+ * contents for multipliers 1, 2, 4, ... up to max-multiplier, one
+ * subdirectory per multiplier (named 0001, 0002, 0004, ...). */
+ public static void main(String[] args) throws Exception {
+ if (args.length != 3) {
+ System.err.println("Usage: java "
+ + GenerateSampleBridgeDescriptorTarballs.class.getName()
+ + " in-directory out-directory max-multiplier");
+ System.exit(1);
+ }
+ File inDirectory = new File(args[0]);
+ File outDirectory = new File(args[1]);
+ int maxMultiplier = Integer.parseInt(args[2]);
+ readDescriptors(inDirectory);
+ for (int multiplier = 1; multiplier <= maxMultiplier;
+ multiplier *= 2) {
+ writeDescriptors(new File(outDirectory, String.format("%04d",
+ multiplier)), multiplier);
+ }
+ }
+
+ /* Read all four files of an extracted, non-sanitized bridge
+ * descriptor tarball from inDirectory into memory. */
+ private static void readDescriptors(File inDirectory) throws Exception {
+ readNetworkstatusBridges(new File(inDirectory,
+ "networkstatus-bridges"));
+ readBridgeDescriptors(new File(inDirectory, "bridge-descriptors"));
+ readCachedExtrainfos(new File(inDirectory, "cached-extrainfo"));
+ readCachedExtrainfos(new File(inDirectory, "cached-extrainfo.new"));
+ }
+
+ /* Network status entries, keyed by "<fingerprint> <published>";
+ * values are the full multi-line entry starting at its "r " line. */
+ private static SortedMap<String, String> networkstatusEntries =
+ new TreeMap<String, String>();
+ /* Split networkstatus-bridges into per-bridge entries. A new entry
+ * starts at each "r " line; the base64 fingerprint (parts[2]) is
+ * decoded to upper-case hex, and the published time is taken from
+ * parts[4] and parts[5].
+ * NOTE(review): if the file contained any line before the first
+ * "r " line, sb would still be null at sb.append() and throw a
+ * NullPointerException -- assumes the file starts with "r ". */
+ private static void readNetworkstatusBridges(
+ File networkstatusBridgesFile) throws Exception {
+ BufferedReader br = new BufferedReader(new FileReader(
+ networkstatusBridgesFile));
+ String line, fingerprint = null, published = null;
+ StringBuilder sb = null;
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("r ")) {
+ if (sb != null) {
+ networkstatusEntries.put(fingerprint + " " + published,
+ sb.toString());
+ }
+ sb = new StringBuilder();
+ String[] parts = line.split(" ");
+ // Re-pad the base64 fingerprint with "=" before decoding it
+ // to 40 upper-case hex characters.
+ fingerprint = Hex.encodeHexString(Base64.decodeBase64(
+ parts[2] + "=")).toUpperCase();
+ published = parts[4] + " " + parts[5];
+ }
+ sb.append(line + "\n");
+ }
+ // Don't forget to store the last entry in the file.
+ if (sb != null) {
+ networkstatusEntries.put(fingerprint + " " + published,
+ sb.toString());
+ }
+ br.close();
+ }
+
+ /* Server descriptors, keyed by "<fingerprint> <published>"; values
+ * include the leading "@purpose " line, which is stripped and
+ * re-added when writing out. */
+ private static SortedMap<String, String> bridgeDescriptors =
+ new TreeMap<String, String>();
+ /* Split bridge-descriptors into single descriptors. A new
+ * descriptor starts at each "@purpose " line; fingerprint and
+ * published time are picked up from their keyword lines while
+ * copying.
+ * NOTE(review): same null-sb hazard as above if the file does not
+ * start with an "@purpose " line. */
+ private static void readBridgeDescriptors(File bridgeDescriptorsFile)
+ throws Exception {
+ BufferedReader br = new BufferedReader(new FileReader(
+ bridgeDescriptorsFile));
+ String line, fingerprint = null, published = null;
+ StringBuilder sb = null;
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("@purpose ")) {
+ if (sb != null) {
+ bridgeDescriptors.put(fingerprint + " " + published,
+ sb.toString());
+ }
+ sb = new StringBuilder();
+ } else if (line.startsWith("published ")) {
+ published = line.substring("published ".length());
+ } else if (line.startsWith("opt fingerprint ")) {
+ // Fingerprint is written in space-separated 4-character
+ // groups; strip the spaces to get 40 contiguous hex chars.
+ fingerprint = line.substring("opt fingerprint ".length()).
+ replaceAll(" ", "");
+ }
+ sb.append(line + "\n");
+ }
+ if (sb != null) {
+ bridgeDescriptors.put(fingerprint + " " + published, sb.toString());
+ }
+ br.close();
+
+ }
+
+ /* Extra-info descriptors, keyed by "<fingerprint> <published>". */
+ private static SortedMap<String, String> cachedExtrainfos =
+ new TreeMap<String, String>();
+ /* Split a cached-extrainfo* file into single extra-info
+ * descriptors. A new descriptor starts at each "extra-info " line,
+ * which also carries the hex fingerprint in parts[2].
+ * NOTE(review): same null-sb hazard as above if the file does not
+ * start with an "extra-info " line. */
+ private static void readCachedExtrainfos(File cachedExtrainfoFile)
+ throws Exception {
+ BufferedReader br = new BufferedReader(new FileReader(
+ cachedExtrainfoFile));
+ String line, fingerprint = null, published = null;
+ StringBuilder sb = null;
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("extra-info ")) {
+ if (sb != null) {
+ cachedExtrainfos.put(fingerprint + " " + published,
+ sb.toString());
+ }
+ sb = new StringBuilder();
+ fingerprint = line.split(" ")[2];
+ } else if (line.startsWith("published ")) {
+ published = line.substring("published ".length());
+ }
+ sb.append(line + "\n");
+ }
+ if (sb != null) {
+ cachedExtrainfos.put(fingerprint + " " + published, sb.toString());
+ }
+ br.close();
+ }
+
+ /* Write multiplier copies of all read descriptors to outDirectory,
+ * overwriting the first four hex characters of each fingerprint
+ * with 0000, 0001, ... per copy. Extra-infos are written first so
+ * their recomputed digests can be referenced from the server
+ * descriptors, whose digests in turn go into the network status. */
+ private static void writeDescriptors(File outDirectory, int multiplier)
+ throws Exception {
+ outDirectory.mkdirs();
+ // Remove files from a previous run; the writers below open in
+ // append mode, so stale contents would otherwise accumulate.
+ for (File file : outDirectory.listFiles()) {
+ file.delete();
+ }
+ for (int i = 0; i < multiplier; i++) {
+ String fingerprintPrefix = String.format("%04x", i);
+ SortedMap<String, String> extraInfoDigests = writeCachedExtrainfos(
+ outDirectory, fingerprintPrefix);
+ SortedMap<String, String> descriptorDigests =
+ writeBridgeDescriptors(outDirectory, extraInfoDigests,
+ fingerprintPrefix);
+ writeNetworkstatusBridges(outDirectory, descriptorDigests,
+ fingerprintPrefix);
+ }
+ }
+
+ /* Append one copy of all extra-info descriptors, with the given
+ * fingerprint prefix substituted, to cached-extrainfo in
+ * outDirectory. Returns a map from the original descriptor's
+ * "<fingerprint> <published>" key to the upper-case hex SHA-1
+ * digest of the rewritten descriptor. */
+ private static SortedMap<String, String> writeCachedExtrainfos(
+ File outDirectory, String fingerprintPrefix) throws Exception {
+ SortedMap<String, String> extraInfoDigests =
+ new TreeMap<String, String>();
+ BufferedWriter bw = new BufferedWriter(new FileWriter(new File(
+ outDirectory, "cached-extrainfo"), true));
+ for (Map.Entry<String, String> e : cachedExtrainfos.entrySet()) {
+ String fingerprintPublished = e.getKey();
+ String cachedExtrainfo = e.getValue();
+ BufferedReader br = new BufferedReader(new StringReader(
+ cachedExtrainfo));
+ String line;
+ StringBuilder sb = new StringBuilder();
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("extra-info ")) {
+ String[] parts = line.split(" ");
+ // Overwrite the first four hex characters of the
+ // fingerprint with the prefix for this copy.
+ sb.append(parts[0] + " " + parts[1] + " " + fingerprintPrefix
+ + parts[2].substring(4) + "\n");
+ } else if (line.equals("router-signature")) {
+ sb.append(line + "\n");
+ // The digest covers everything up to and including the
+ // "router-signature" line, not the signature that follows.
+ String digest = DigestUtils.shaHex(sb.toString()).toUpperCase();
+ extraInfoDigests.put(fingerprintPublished, digest);
+ } else {
+ sb.append(line + "\n");
+ }
+ }
+ bw.write(sb.toString());
+ }
+ bw.close();
+ return extraInfoDigests;
+ }
+
+ /* Append one copy of all server descriptors, with fingerprint
+ * prefix and recomputed extra-info digests substituted, to
+ * bridge-descriptors in outDirectory. Returns a map from the
+ * original descriptor's "<fingerprint> <published>" key to the
+ * upper-case hex SHA-1 digest of the rewritten descriptor. */
+ private static SortedMap<String, String> writeBridgeDescriptors(
+ File outDirectory, SortedMap<String, String> extraInfoDigests,
+ String fingerprintPrefix) throws Exception {
+ SortedMap<String, String> descriptorDigests =
+ new TreeMap<String, String>();
+ BufferedWriter bw = new BufferedWriter(new FileWriter(new File(
+ outDirectory, "bridge-descriptors"), true));
+ for (Map.Entry<String, String> e : bridgeDescriptors.entrySet()) {
+ String fingerprintPublished = e.getKey();
+ String bridgeDescriptor = e.getValue();
+ BufferedReader br = new BufferedReader(new StringReader(
+ bridgeDescriptor));
+ String line;
+ StringBuilder sb = new StringBuilder();
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("@purpose ")) {
+ // Skipped here; a fresh "@purpose bridge" line is
+ // prepended when writing out below.
+ } else if (line.startsWith("opt fingerprint ")) {
+ sb.append("opt fingerprint " + fingerprintPrefix
+ + line.substring("opt fingerprint 0000".length()) + "\n");
+ } else if (line.startsWith("opt extra-info-digest ")) {
+ String extraInfoDigest = null;
+ if (extraInfoDigests.containsKey(fingerprintPublished)) {
+ // Reference the recomputed digest of this bridge's
+ // rewritten extra-info descriptor.
+ extraInfoDigest = extraInfoDigests.get(fingerprintPublished);
+ } else {
+ // No matching extra-info was read; patch the prefix into
+ // the old digest so it still differs per copy.
+ extraInfoDigest = fingerprintPrefix
+ + line.split(" ")[2].substring(4);
+ }
+ sb.append("opt extra-info-digest " + extraInfoDigest + "\n");
+ } else if (line.equals("router-signature")) {
+ sb.append(line + "\n");
+ // Digest covers the rewritten descriptor up to and
+ // including the "router-signature" line.
+ String digest = DigestUtils.shaHex(sb.toString()).toUpperCase();
+ descriptorDigests.put(fingerprintPublished, digest);
+ } else {
+ sb.append(line + "\n");
+ }
+ }
+ bw.write("@purpose bridge\n" + sb.toString());
+ }
+ bw.close();
+ return descriptorDigests;
+ }
+
+ /* Append one copy of all network status entries, with fingerprint
+ * prefix and recomputed descriptor digests substituted, to
+ * networkstatus-bridges in outDirectory. */
+ private static void writeNetworkstatusBridges(File outDirectory,
+ SortedMap<String, String> descriptorDigests,
+ String fingerprintPrefix) throws Exception {
+ BufferedWriter bw = new BufferedWriter(new FileWriter(new File(
+ outDirectory, "networkstatus-bridges"), true));
+ for (Map.Entry<String, String> e : networkstatusEntries.entrySet()) {
+ String fingerprintPublished = e.getKey();
+ String networkstatusEntry = e.getValue();
+ BufferedReader br = new BufferedReader(new StringReader(
+ networkstatusEntry));
+ String line;
+ StringBuilder sb = new StringBuilder();
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("r ")) {
+ String[] parts = line.split(" ");
+ String fingerprint = parts[2], descriptorDigest = parts[3];
+ // Re-encode the prefixed hex fingerprint as base64 and drop
+ // the trailing "=" padding character (keep 27 of 28 chars).
+ String newFingerprint = Base64.encodeBase64String(Hex.decodeHex(
+ (fingerprintPrefix + fingerprintPublished.split(" ")[0].
+ substring(4)).toCharArray())).substring(0, 27);
+ String newDescriptorDigest = null;
+ if (descriptorDigests.containsKey(fingerprintPublished)) {
+ newDescriptorDigest = Base64.encodeBase64String(Hex.decodeHex(
+ descriptorDigests.get(fingerprintPublished).
+ toCharArray())).substring(0, 27);
+ } else {
+ // No rewritten descriptor available; overwrite the first
+ // base64 characters so the digest still differs per copy.
+ newDescriptorDigest = "AA" + descriptorDigest.substring(2);
+ }
+ sb.append("r " + parts[1] + " " + newFingerprint + " "
+ + newDescriptorDigest + " " + parts[4] + " " + parts[5]
+ + " " + parts[6] + " " + parts[7] + " " + parts[8] + "\n");
+ } else {
+ sb.append(line + "\n");
+ }
+ }
+ bw.write(sb.toString());
+ }
+ bw.close();
+ }
+}
+
diff --git a/task-4499/README b/task-4499/README
new file mode 100644
index 0000000..4bf9264
--- /dev/null
+++ b/task-4499/README
@@ -0,0 +1,56 @@
+1 Generating sample bridge descriptors
+=======================================
+
+This is a simple Java class to generate sample bridge descriptors for
+metrics-db and BridgeDB load tests.
+
+==========================================================================
+======== NOTE THAT THE OUTPUT TARBALL CONTENTS ARE NOT SANITIZED! ========
+==========================================================================
+
+The changes are only sufficient to trick metrics-db and BridgeDB that
+bridges are distinct. Descriptors may still contain original IP addresses
+in exit policies and other contact information. Sanitized descriptors
+could not be used as input, because they may have skewed results too much.
+
+Here's how you generate sample bridge descriptors from original
+descriptors.
+
+Extract a non-sanitized bridge descriptor tarball to in/, so that there
+are four files:
+
+ in/bridge-descriptors
+ in/cached-extrainfo.new
+ in/cached-extrainfo
+ in/networkstatus-bridges
+
+Download the Apache Commons Codec .jar file and put it in the root directory,
+e.g.,
+
+ commons-codec-1.4.jar
+
+Compile the Java class:
+
+ $ javac -cp commons-codec-1.4.jar \
+ GenerateSampleBridgeDescriptorTarballs.java
+
+Run the Java class to generate sample data up to a factor of 256 times the
+descriptors in the in/ directory:
+
+ $ java -cp .:commons-codec-1.4.jar \
+ GenerateSampleBridgeDescriptorTarballs in out 256
+
+Find the generated sample data in the out/ directory.
+
+
+2 Building the tech report
+===========================
+
+Generate the graph:
+
+ $ R --slave -f bridge-scaling.R
+
+Build the PDF:
+
+ $ pdflatex bridge-scaling.tex
+
diff --git a/task-4499/bridge-scaling.R b/task-4499/bridge-scaling.R
new file mode 100644
index 0000000..972f240
--- /dev/null
+++ b/task-4499/bridge-scaling.R
@@ -0,0 +1,23 @@
+# Plot the scalability measurements from bridge-scaling.csv: one
+# facet each for tarball size, BridgeDB import time, and metrics-db
+# processing time, against the number of running bridges.
+library(ggplot2)
+d <- read.csv("bridge-scaling.csv", header = TRUE)
+# Split the three measurement series by their variable tag.
+t <- d[d$variable == "1tarball", ]
+b <- d[d$variable == "2bridgedb", ]
+m <- d[d$variable == "3metricsdb", ]
+# Re-assemble the series with human-readable facet labels.
+d <- rbind(
+ data.frame(x = t$x, y = t$y, colour = t$colour,
+ variable = "Tarball size in GiB/day"),
+ data.frame(x = b$x, y = b$y, colour = b$colour,
+ variable = "BridgeDB time in min"),
+ data.frame(x = m$x, y = m$y, colour = m$colour,
+ variable = "metrics-db time in min"))
+# The colour column marks the measured 2012-01-31 data point in red.
+# NOTE(review): "value" and opts() are the spellings of the ggplot2
+# version current at the time; current ggplot2 uses "values" in
+# scale_colour_manual() and theme()/labs() instead of opts().
+ggplot(d, aes(x = x, y = y, colour = colour)) +
+geom_line(colour = "black") +
+geom_point() +
+facet_grid(variable ~ ., scales = "free_y") +
+scale_x_continuous(name = "\nRunning bridges (2012-01-31 = 838, red)") +
+scale_y_continuous(name = "") +
+scale_colour_manual(name = "", value = c("black", "red")) +
+opts(legend.position = "none",
+ title = "Scalability of Tor's bridge infrastructure\n")
+# Write the figure referenced by bridge-scaling.tex.
+ggsave("bridge-scaling.png", width = 7, height = 6, dpi = 100)
+
diff --git a/task-4499/bridge-scaling.csv b/task-4499/bridge-scaling.csv
new file mode 100644
index 0000000..24bbbf3
--- /dev/null
+++ b/task-4499/bridge-scaling.csv
@@ -0,0 +1,24 @@
+x,y,variable,colour
+NA,0,1tarball,black
+838,0.103,1tarball,red
+1676,0.206,1tarball,black
+3352,0.412,1tarball,black
+6704,0.843,1tarball,black
+13408,1.64,1tarball,black
+26816,3.281,1tarball,black
+53632,6.562,1tarball,black
+NA,0,2bridgedb,black
+838,0.0833,2bridgedb,red
+1676,0.1833,2bridgedb,black
+3352,0.3833,2bridgedb,black
+6704,0.7666,2bridgedb,black
+13408,1.5833,2bridgedb,black
+26816,3.3,2bridgedb,black
+53632,6.283,2bridgedb,black
+NA,0,3metricsdb,black
+838,0.583,3metricsdb,red
+1676,1.366,3metricsdb,black
+3352,3.816,3metricsdb,black
+6704,7.9,3metricsdb,black
+13408,20.216,3metricsdb,black
+26816,44.75,3metricsdb,black
diff --git a/task-4499/bridge-scaling.tex b/task-4499/bridge-scaling.tex
new file mode 100644
index 0000000..14dae1a
--- /dev/null
+++ b/task-4499/bridge-scaling.tex
@@ -0,0 +1,117 @@
+\documentclass{article}
+\usepackage{url}
+\usepackage[pdftex]{graphicx}
+\usepackage{graphics}
+\usepackage{color}
+\begin{document}
+\title{Investigating scaling points to handle more bridges}
+\author{Karsten Loesing\\{\tt karsten at torproject.org}}
+
+\maketitle
+
+\section{Introduction}
+
+The current bridge infrastructure relies on a central bridge authority to
+collect, distribute, and publish bridge relay descriptors.
+We believe the current infrastructure can handle up to 10,000 bridges.
+
+The scaling points involve the database of descriptors, the metrics portal
+and its ability to handle this many descriptors for analysis, and the
+reachability testing part of the code for the bridge authority.
+We should investigate scaling points to handle more than 10,000 bridge
+descriptors.
+
+\section{Early results}
+
+We started this analysis by writing a small tool to generate sample data
+for BridgeDB and metrics-db.
+This tool takes the contents from one of Tonga's bridge tarball as input,
+copies them a given number of times, and overwrites the first two bytes of
+relay fingerprints in every copy with 0000, 0001, etc.
+The tool also fixes references between network statuses, server
+descriptors, and extra-info descriptors.
+This is sufficient to trick BridgeDB and metrics-db into thinking that
+relays in the copies are distinct relays.
+We used the tool to generate tarballs with 2, 4, 8, 16, 32, and 64 times
+as many bridge descriptors in them.
+
+In the next step we fed the tarballs into BridgeDB and metrics-db.
+BridgeDB reads the network statuses and server descriptors from the latest
+tarball and writes them to a local database.
+metrics-db sanitizes two half-hourly created tarballs every hour,
+establishes an internal mapping between descriptors, and writes sanitized
+descriptors with fixed references to disk.
+
+Figure~\ref{fig:bridgescaling} shows the results.
+
+\begin{figure}[t]
+\includegraphics[width=\textwidth]{bridge-scaling.png}
+%\caption{}
+\label{fig:bridgescaling}
+\end{figure}
+
+The upper graph shows how the tarballs grow in size with more bridge
+descriptors in them.
+This growth is, unsurprisingly, linear.
+One thing to keep in mind here is that bandwidth and storage requirements
+to the hosts transferring and storing bridge tarballs are growing with the
+tarballs.
+We'll want to pay extra attention to disk space running out on those
+hosts.
+
+The middle graph shows how long BridgeDB takes to load descriptors from a
+tarball.
+This graph is linear, too, which indicates that BridgeDB can handle an
+increase in the number of bridges pretty well.
+One thing we couldn't check is whether BridgeDB's ability to serve client
+requests is in any way affected during the descriptor import.
+We assume it'll be fine.
+We should ask Aaron if there are other things in BridgeDB that we
+overlooked that may not scale.
+
+The lower graph shows how metrics-db can or cannot handle more bridges.
+The growth is slightly worse than linear.
+In any case, the absolute time required to handle 25K bridges is worrisome
+(we didn't try 50K).
+metrics-db runs in an hourly cronjob, and if that cronjob doesn't finish
+within 1 hour, we cannot start the next run and will be missing some data.
+We might have to sanitize bridge descriptors in a different thread or
+process than the one that fetches all the other metrics data.
+We can also look into other Java libraries to handle .gz-compressed files
+that are faster than the one we're using.
+So, we can probably handle 25K bridges somehow, and maybe even 50K.
+Somehow.
+
+Finally, note that we left out the most important part of this analysis:
+can Tonga, or more generally, a single bridge authority handle this
+increase in bridges?
+We're not sure how to test such a setting, or at least without running 50K
+bridges in a private network.
+We could imagine this requires some more sophisticated sample data
+generation including getting the crypto right and then talking to Tonga's
+DirPort.
+If there's an easy way to test this, we'll do it.
+If not, we can always hope for the best.
+What can go wrong?
+
+\section{Work left to do}
+
+If we end up with way too many bridges, here are a few things we'll want
+to look at updating:
+
+\begin{itemize}
+\item Tonga still does a reachability test on each bridge every 21 minutes
+or so.
+Eventually the number of TLS handshakes it's doing will overwhelm its CPU.
+\item The tarballs we make every half hour have substantial overlap.
+If we have tens of thousands of descriptors, we would want to get smarter
+at sending diffs over to BridgeDB.
+\item Somebody should check whether BridgeDB's interaction with users
+freezes while it's reading a new set of data.
+\end{itemize}
+
+%\bibliography{bridge-scaling}
+%\bibliographystyle{plain}
+
+\end{document}
+
More information about the tor-commits
mailing list