[or-cvs] [metrics-db/master 1/2] Remove descriptor stats that are now part of the DB schema.

karsten at torproject.org karsten at torproject.org
Fri Nov 19 13:36:18 UTC 2010


Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Fri, 19 Nov 2010 09:45:13 +0100
Subject: Remove descriptor stats that are now part of the DB schema.
Commit: 777cb6f3e0ad3bf1f89162799ca1711c39ea5535

---
 config                                             |   10 -
 src/org/torproject/ernie/db/Configuration.java     |   30 +-
 src/org/torproject/ernie/db/Main.java              |   16 +-
 .../torproject/ernie/db/RelayDescriptorParser.java |   41 +-
 .../ernie/db/ServerDescriptorStatsFileHandler.java |  624 --------------------
 5 files changed, 16 insertions(+), 705 deletions(-)
 delete mode 100644 src/org/torproject/ernie/db/ServerDescriptorStatsFileHandler.java

diff --git a/config b/config
index 8fd75b9..f5486e7 100644
--- a/config
+++ b/config
@@ -128,14 +128,4 @@
 #
 ## Write bridge stats to disk
 #WriteBridgeStats 0
-#
-## Write server descriptors stats to disk
-#WriteServerDescriptorStats 0
-#
-## Comma-separated list of relay versions to be included in version-stats
-#RelayVersions 0.1.2,0.2.0,0.2.1,0.2.2
-#
-## Comma-separated list of relay platforms to be included in
-## platform-stats
-#RelayPlatforms Linux,Windows,Darwin,FreeBSD
 
diff --git a/src/org/torproject/ernie/db/Configuration.java b/src/org/torproject/ernie/db/Configuration.java
index 428dbb5..ea694a6 100644
--- a/src/org/torproject/ernie/db/Configuration.java
+++ b/src/org/torproject/ernie/db/Configuration.java
@@ -22,11 +22,6 @@ public class Configuration {
       Arrays.asList(("8522EB98C91496E80EC238E732594D1509158E77,"
       + "9695DFC35FFEB861329B9F1AB04C46397020CE31").split(",")));
   private boolean writeBridgeStats = false;
-  private boolean writeServerDescriptorStats = false;
-  private List<String> relayVersions = new ArrayList<String>(
-      Arrays.asList("0.1.2,0.2.0,0.2.1,0.2.2".split(",")));
-  private List<String> relayPlatforms = new ArrayList<String>(
-      Arrays.asList("Linux,Windows,Darwin,FreeBSD".split(",")));
   private boolean writeDirectoryArchives = false;
   private String directoryArchivesOutputDirectory = "directory-archive/";
   private boolean importCachedRelayDescriptors = false;
@@ -108,15 +103,6 @@ public class Configuration {
         } else if (line.startsWith("WriteBridgeStats")) {
           this.writeBridgeStats = Integer.parseInt(
               line.split(" ")[1]) != 0;
-        } else if (line.startsWith("WriteServerDescriptorStats")) {
-          this.writeServerDescriptorStats = Integer.parseInt(
-              line.split(" ")[1]) != 0;
-        } else if (line.startsWith("RelayVersions")) {
-          this.relayVersions = new ArrayList<String>(
-              Arrays.asList(line.split(" ")[1].split(",")));
-        } else if (line.startsWith("RelayPlatforms")) {
-          this.relayPlatforms = new ArrayList<String>(
-              Arrays.asList(line.split(" ")[1].split(",")));
         } else if (line.startsWith("WriteDirectoryArchives")) {
           this.writeDirectoryArchives = Integer.parseInt(
               line.split(" ")[1]) != 0;
@@ -251,7 +237,7 @@ public class Configuration {
         !this.writeAggregateStatsDatabase &&
         !this.writeSanitizedBridges && !this.writeConsensusStats &&
         !this.writeDirreqStats && !this.writeBridgeStats &&
-        !this.writeServerDescriptorStats && !this.writeConsensusHealth) {
+        !this.writeConsensusHealth) {
       logger.warning("We have not been configured to read data from any "
           + "data source or write data to any data sink. You need to "
           + "edit your config file (" + configFile.getAbsolutePath()
@@ -264,7 +250,7 @@ public class Configuration {
         this.writeRelayDescriptorDatabase ||
         this.writeRelayDescriptorsRawFiles || this.writeConsensusStats ||
         this.writeDirreqStats || this.writeBridgeStats ||
-        this.writeServerDescriptorStats || this.writeConsensusHealth)) {
+        this.writeConsensusHealth)) {
       logger.warning("We are configured to import/download relay "
           + "descriptors, but we don't have a single data sink to write "
           + "relay descriptors to.");
@@ -272,8 +258,7 @@ public class Configuration {
     if (!(this.importCachedRelayDescriptors ||
         this.importDirectoryArchives || this.downloadRelayDescriptors) &&
         (this.writeDirectoryArchives ||
-        this.writeRelayDescriptorDatabase || this.writeDirreqStats ||
-        this.writeServerDescriptorStats)) {
+        this.writeRelayDescriptorDatabase || this.writeDirreqStats)) {
       logger.warning("We are configured to write relay descriptor to at "
           + "least one data sink, but we don't have a single data source "
           + "containing relay descriptors.");
@@ -315,15 +300,6 @@ public class Configuration {
   public boolean getWriteBridgeStats() {
     return this.writeBridgeStats;
   }
-  public boolean getWriteServerDescriptorStats() {
-    return this.writeServerDescriptorStats;
-  }
-  public List<String> getRelayVersions() {
-    return this.relayVersions;
-  }
-  public List<String> getRelayPlatforms() {
-    return this.relayPlatforms;
-  }
   public boolean getWriteDirectoryArchives() {
     return this.writeDirectoryArchives;
   }
diff --git a/src/org/torproject/ernie/db/Main.java b/src/org/torproject/ernie/db/Main.java
index 207c594..9f5079f 100644
--- a/src/org/torproject/ernie/db/Main.java
+++ b/src/org/torproject/ernie/db/Main.java
@@ -47,10 +47,6 @@ public class Main {
         new DirreqStatsFileHandler(countries,
         config.getWriteAggregateStatsDatabase() ?
         config.getRelayDescriptorDatabaseJDBC() : null) : null;
-    ServerDescriptorStatsFileHandler sdsfh =
-        config.getWriteServerDescriptorStats() ?
-        new ServerDescriptorStatsFileHandler(config.getRelayVersions(),
-        config.getRelayPlatforms()) : null;
 
     // Prepare consensus health checker
     ConsensusHealthChecker chc = config.getWriteConsensusHealth() ?
@@ -75,12 +71,11 @@ public class Main {
     // directory archives to disk)
     RelayDescriptorParser rdp = config.getWriteConsensusStats() ||
         config.getWriteBridgeStats() || config.getWriteDirreqStats() ||
-        config.getWriteServerDescriptorStats() ||
         config.getWriteDirectoryArchives() ||
         config.getWriteRelayDescriptorDatabase() ||
         config.getWriteRelayDescriptorsRawFiles() ||
         config.getWriteConsensusHealth() ?
-        new RelayDescriptorParser(csfh, bsfh, dsfh, sdsfh, aw, rddi, chc,
+        new RelayDescriptorParser(csfh, bsfh, dsfh, aw, rddi, chc,
             countries, directories) : null;
 
     // Import/download relay descriptors from the various sources
@@ -90,10 +85,9 @@ public class Main {
         List<String> dirSources =
             config.getDownloadFromDirectoryAuthorities();
         boolean downloadCurrentConsensus = aw != null || csfh != null ||
-            bsfh != null || sdsfh != null || rddi != null || chc != null;
+            bsfh != null || rddi != null || chc != null;
         boolean downloadCurrentVotes = aw != null || chc != null;
-        boolean downloadAllServerDescriptors = aw != null ||
-            sdsfh != null || rddi != null;
+        boolean downloadAllServerDescriptors = aw != null || rddi != null;
         boolean downloadAllExtraInfos = aw != null;
         Set<String> downloadDescriptorsForRelays = bsfh != null ||
             dsfh != null ? directories : new HashSet<String>();
@@ -148,10 +142,6 @@ public class Main {
       dsfh.writeFile();
       dsfh = null;
     }
-    if (sdsfh != null) {
-      sdsfh.writeFiles();
-      sdsfh = null;
-    }
 
     // Prepare sanitized bridge descriptor writer
     SanitizedBridgesWriter sbw = config.getWriteSanitizedBridges() ?
diff --git a/src/org/torproject/ernie/db/RelayDescriptorParser.java b/src/org/torproject/ernie/db/RelayDescriptorParser.java
index 1dc10c4..bbc1f16 100644
--- a/src/org/torproject/ernie/db/RelayDescriptorParser.java
+++ b/src/org/torproject/ernie/db/RelayDescriptorParser.java
@@ -35,12 +35,6 @@ public class RelayDescriptorParser {
   private BridgeStatsFileHandler bsfh;
 
   /**
-   * Stats file handler that accepts parse results for server descriptor
-   * statistics.
-   */
-  private ServerDescriptorStatsFileHandler sdsfh;
-
-  /**
    * File writer that writes descriptor contents to files in a
    * directory-archive directory structure.
    */
@@ -83,13 +77,12 @@ public class RelayDescriptorParser {
    */
   public RelayDescriptorParser(ConsensusStatsFileHandler csfh,
       BridgeStatsFileHandler bsfh, DirreqStatsFileHandler dsfh,
-      ServerDescriptorStatsFileHandler sdsfh, ArchiveWriter aw,
-      RelayDescriptorDatabaseImporter rddi, ConsensusHealthChecker chc,
-      SortedSet<String> countries, SortedSet<String> directories) {
+      ArchiveWriter aw, RelayDescriptorDatabaseImporter rddi,
+      ConsensusHealthChecker chc, SortedSet<String> countries,
+      SortedSet<String> directories) {
     this.csfh = csfh;
     this.bsfh = bsfh;
     this.dsfh = dsfh;
-    this.sdsfh = sdsfh;
     this.aw = aw;
     this.rddi = rddi;
     this.chc = chc;
@@ -130,10 +123,9 @@ public class RelayDescriptorParser {
         // consensuses
         boolean isConsensus = true;
         int exit = 0, fast = 0, guard = 0, running = 0, stable = 0;
-        String validAfterTime = null, descriptorIdentity = null,
-            nickname = null, relayIdentity = null, serverDesc = null,
-            version = null, ports = null;
-        StringBuilder descriptorIdentities = new StringBuilder();
+        String validAfterTime = null, nickname = null,
+            relayIdentity = null, serverDesc = null, version = null,
+            ports = null;
         String fingerprint = null, dirSource = null, address = null;
         long validAfter = -1L, published = -1L, bandwidth = -1L,
             orPort = 0L, dirPort = 0L;
@@ -181,7 +173,6 @@ public class RelayDescriptorParser {
             hashedRelayIdentities.add(DigestUtils.shaHex(
                 Base64.decodeBase64(parts[2] + "=")).
                 toUpperCase());
-            descriptorIdentity = parts[3];
             published = parseFormat.parse(parts[4] + " " + parts[5]).
                 getTime();
             address = parts[6];
@@ -195,7 +186,6 @@ public class RelayDescriptorParser {
               guard += line.contains(" Guard") ? 1 : 0;
               stable += line.contains(" Stable") ? 1 : 0;
               running++;
-              descriptorIdentities.append("," + descriptorIdentity);
             }
             relayFlags = new TreeSet<String>();
             if (line.length() > 2) {
@@ -240,10 +230,6 @@ public class RelayDescriptorParser {
             this.csfh.addConsensusResults(validAfterTime, exit, fast,
                 guard, running, stable);
           }
-          if (this.sdsfh != null) {
-            this.sdsfh.addConsensus(validAfterTime,
-                descriptorIdentities.toString().substring(1));
-          }
           if (this.rdd != null) {
             this.rdd.haveParsedConsensus(validAfterTime, dirSources,
                 serverDescriptors);
@@ -283,9 +269,9 @@ public class RelayDescriptorParser {
           }
         }
       } else if (line.startsWith("router ")) {
-        String platformLine = null, publishedLine = null,
-            publishedTime = null, bandwidthLine = null,
-            extraInfoDigest = null, relayIdentifier = null;
+        String platformLine = null, publishedTime = null,
+            bandwidthLine = null, extraInfoDigest = null,
+            relayIdentifier = null;
         String[] parts = line.split(" ");
         String nickname = parts[1];
         String address = parts[2];
@@ -296,7 +282,6 @@ public class RelayDescriptorParser {
           if (line.startsWith("platform ")) {
             platformLine = line;
           } else if (line.startsWith("published ")) {
-            publishedLine = line;
             publishedTime = line.substring("published ".length());
             published = parseFormat.parse(publishedTime).getTime();
           } else if (line.startsWith("opt fingerprint") ||
@@ -320,12 +305,10 @@ public class RelayDescriptorParser {
         String sigToken = "\nrouter-signature\n";
         int start = ascii.indexOf(startToken);
         int sig = ascii.indexOf(sigToken) + sigToken.length();
-        String digest = null, descriptorIdentity = null;
+        String digest = null;
         if (start >= 0 || sig >= 0 || sig > start) {
           byte[] forDigest = new byte[sig - start];
           System.arraycopy(data, start, forDigest, 0, sig - start);
-          descriptorIdentity = Base64.encodeBase64String(
-              DigestUtils.sha(forDigest)).substring(0, 27);
           digest = DigestUtils.shaHex(forDigest);
         }
         if (this.aw != null && digest != null) {
@@ -335,10 +318,6 @@ public class RelayDescriptorParser {
           this.rdd.haveParsedServerDescriptor(publishedTime,
               relayIdentifier, digest, extraInfoDigest);
         }
-        if (this.sdsfh != null && descriptorIdentity != null) {
-          this.sdsfh.addServerDescriptor(descriptorIdentity, platformLine,
-              publishedLine, bandwidthLine);
-        }
         if (this.rddi != null && digest != null) {
           String[] bwParts = bandwidthLine.split(" ");
           long bandwidthAvg = Long.parseLong(bwParts[1]);
diff --git a/src/org/torproject/ernie/db/ServerDescriptorStatsFileHandler.java b/src/org/torproject/ernie/db/ServerDescriptorStatsFileHandler.java
deleted file mode 100644
index 9368b28..0000000
--- a/src/org/torproject/ernie/db/ServerDescriptorStatsFileHandler.java
+++ /dev/null
@@ -1,624 +0,0 @@
-/* Copyright 2010 The Tor Project
- * See LICENSE for licensing information */
-package org.torproject.ernie.db;
-
-import java.io.*;
-import java.text.*;
-import java.util.*;
-import java.util.logging.*;
-
-/**
- * Generates statistics about relays in the Tor network from data that
- * relays write to their server descriptors. Accepts lists of referenced
- * descriptors in network status consensuses and selected lines from
- * server descriptors from <code>RelayDescriptorParser</code>. Keeps two
- * intermediate results files <code>stats/consensuses-raw</code> and
- * <code>stats/descriptors-raw</code> and writes three final results files
- * <code>stats/version-stats</code>, <code>stats/platform-stats</code>,
- * and <code>stats/bandwidth-stats</code>.
- */
-public class ServerDescriptorStatsFileHandler {
-
-  /**
-   * Intermediate results file <code>stats/consensuses-raw</code>
-   * containing consensuses and the referenced descriptor identities of
-   * relays with the Running flag set. The file format is
-   * "valid-after,descid,descid,descid...\n" for each consensus. Lines are
-   * ordered by valid-after time in ascending order.
-   */
-  private File consensusesFile;
-
-  /**
-   * Temporary file for writing <code>stats/consensuses-raw</code> while
-   * reading that file at the same time. After read and write operations
-   * are complete, the original file is deleted and the temporary file
-   * renamed to be the new intermediate results file.
-   */
-  private File consensusesTempFile;
-
-  /**
-   * Intermediate results file <code>stats/descriptors-raw</code>
-   * containing server descriptors with relevant fields for statistics.
-   * The file format is "published,descid,version,platform,advbw\n" for
-   * each server descriptors. Lines are first ordered by published time,
-   * then by descid.
-   */
-  private File descriptorsFile;
-
-  /**
-   * Temporary file for writing <code>stats/descriptors-raw</code> while
-   * reading that file at the same time. After read and write operations
-   * are complete, the original file is deleted and the temporary file
-   * renamed to be the new intermediate results file.
-   */
-  private File descriptorsTempFile;
-
-  /**
-   * Final results file <code>stats/version-stats</code> containing
-   * statistics about Tor versions of relays in the network. The file
-   * format is "date,version1,version2,...,other" with versions as
-   * specified in config option RelayVersions.
-   */
-  private File versionStatsFile;
-
-  /**
-   * Final results file <code>stats/platform-stats</code> containing
-   * statistics about operating systems of relays in the network. The
-   * file format is "date,os1,os2,...,other" with operating systems as
-   * specified in config option RelayPlatforms.
-   */
-  private File platformStatsFile;
-
-  /**
-   * Final results file <code>stats/bandwidth-stats</code> containing
-   * statistics about the advertised bandwidth of relays in the network.
-   * The file format is "date,advbw".
-   */
-  private File bandwidthStatsFile;
-
-  /**
-   * Consensuses and referenced descriptor identities of relays with the
-   * Running flag set. This data structure only holds those consensuses
-   * that were parsed in this execution, not the previously parsed
-   * consensuses as read from disk. Map keys are valid-after times
-   * formatted as "yyyy-MM-dd HH:mm:ss", map values are valid-after times
-   * followed by a comma-separated list of base-64-formatted descriptor
-   * identifiers.
-   */
-  private SortedMap<String, String> consensuses;
-
-  /**
-   * Server descriptors with relevant fields for statistics, ordered by
-   * published time and descriptor identifier. Map keys are publication
-   * times of descriptors formatted as "yyyy-MM-dd HH:mm:ss", a comma, and
-   * base-64-formatted descriptor identifiers. An example key is
-   * "2009-09-30 20:42:19,ZQZ5zq4q1U8Uynyk6lkUy5uAsdM" (length 47). Map
-   * values are map keys plus version, platform, and advertised bandwidth
-   * written as "published,descid,version,platform,advbw". Note that the
-   * platform string may contain commas.
-   */
-  private SortedMap<String, String> descriptors;
-
-  /**
-   * Server descriptors as in <code>descriptors</code>, accessible by
-   * descriptor identifiers only, without knowing the publication time.
-   * Map keys are base-64-formatted descriptor identifiers, map values
-   * are formatted as map values in <code>descriptors</code>.
-   */
-  private SortedMap<String, String> descById;
-
-  /**
-   * Tor relay versions that we care about.
-   */
-  private List<String> relayVersions;
-
-  /**
-   * Platforms (operating systems) that we care about.
-   */
-  private List<String> relayPlatforms;
-
-  /**
-   * Logger for this class.
-   */
-  private Logger logger;
-
-  // TODO should there be a modified flag, too?
-
-  /**
-   * Initializes this class, without reading in any files. We're only
-   * reading in files when writing results to disk in
-   * <code>writeFiles</code>.
-   */
-  public ServerDescriptorStatsFileHandler(List<String> relayVersions,
-      List<String> relayPlatforms) {
-
-    /* Memorize versions and platforms that we care about. */
-    this.relayVersions = relayVersions;
-    this.relayPlatforms = relayPlatforms;
-
-    /* Initialize local data structures. */
-    this.consensuses = new TreeMap<String, String>();
-    this.descriptors = new TreeMap<String, String>();
-    this.descById = new TreeMap<String, String>();
-
-    /* Initialize file names for intermediate and final results files. */
-    this.versionStatsFile = new File("stats/version-stats");
-    this.platformStatsFile = new File("stats/platform-stats");
-    this.bandwidthStatsFile = new File("stats/bandwidth-stats");
-    this.consensusesFile = new File("stats/consensuses-raw");
-    this.consensusesTempFile = new File("stats/consensuses-raw.temp");
-    this.descriptorsFile = new File("stats/descriptors-raw");
-    this.descriptorsTempFile = new File("stats/descriptors-raw.temp");
-
-    /* Initialize logger. */
-    this.logger =
-        Logger.getLogger(ServerDescriptorStatsFileHandler.class.getName());
-  }
-
-  /**
-   * Adds a consensus to the list with its valid-after time and a list of
-   * descriptor identifiers of relays that have the Running flag set. If
-   * the number of consensuses in memory exceeds a certain number, an
-   * auto-save mechanism is triggered by calling <code>writeFiles</code>.
-   */
-  public void addConsensus(String validAfter,
-      String descriptorIdentities) {
-
-    /* Add consensus to the list. */
-    if (!this.consensuses.containsKey(validAfter)) {
-      this.logger.finer("Adding consensus published at " + validAfter
-          + ".");
-    } else {
-      this.logger.fine("We already learned about a consensus published "
-          + "at " + validAfter + " in this execution. Overwriting.");
-    }
-    this.consensuses.put(validAfter, validAfter + ","
-        + descriptorIdentities);
-
-    /* Check if we have more 240 consensuses in memory (covering 10 days).
-     * If so, trigger the auto-save mechanism. */
-    if (this.consensuses.size() > 240) {
-      this.logger.fine("Autosave triggered by adding consensus: We have "
-          + this.consensuses.size() + " consensuses and "
-          + this.descriptors.size() + " descriptors in memory. Writing "
-          + "to disk now.");
-      this.writeFiles();
-    }
-  }
-
-  /**
-   * Adds a server descriptor to the list with its identity and the
-   * platform, published, and bandwidth lines. Version and operating
-   * system are parsed from the platform line. The parsed version consists
-   * only of the dotted numbers part (e.g. "0.2.1.2") without any
-   * additions like "-alpha". The operating system is the substring after
-   * " on " up to the first encountered opening curly bracket ("{").
-   * The publication time is extracted from the published line. The
-   * advertised bandwidth is calculated from the bandwidth line by taking
-   * the minimum of average and observed bandwidth, divided by 1024 to
-   * obtain KiB/s.
-   */
-  public void addServerDescriptor(String descriptorIdentity,
-      String platformLine, String publishedLine, String bandwidthLine) {
-
-    /* Parse version, platform, and advertised bandwidth from the given
-     * lines. */
-    String version = "", platform = "", published = "", advBw = "";
-    if (platformLine.contains(" Tor ")) {
-      version = platformLine.substring(platformLine.indexOf(" Tor ") + 5).
-        split(" ")[0];
-    }
-    if (platformLine.contains(" on ")) {
-      platform = platformLine.substring(platformLine.indexOf(" on ") + 4);
-      if (platform.contains("{")) {
-        platform = platform.substring(0, platform.indexOf("{")).trim();
-      }
-    }
-    published = publishedLine.substring("published ".length());
-    String[] bwParts = bandwidthLine.split(" ");
-    if (bwParts.length == 4) {
-      try {
-        advBw = "" + (Math.min(Long.parseLong(bwParts[1]),
-            Long.parseLong(bwParts[3])) / 1024L);
-      } catch (NumberFormatException e) {
-        this.logger.log(Level.WARNING, "Exception while parsing average "
-            + "and observed bandwidth from line '" + bandwidthLine
-            + "'. Not adding server descriptor!", e);
-        return;
-      }
-    }
-    String key = published + "," + descriptorIdentity;
-    String line = key + "," + version + "," + platform + "," + advBw;
-    if (!this.descriptors.containsKey(key)) {
-      this.logger.finer("Adding server descriptor with identifier "
-          + descriptorIdentity + ".");
-    } else {
-      this.logger.fine("We already learned about a server descriptor "
-          + "with identifier " + descriptorIdentity + ", published at "
-          + published + " in this execution. Overwriting.");
-    }
-    this.descriptors.put(key, line);
-    this.descById.put(descriptorIdentity, line);
-
-    /* Check if we have more 50K server descriptors in memory (covering 10
-     * days as of early 2010). If so, trigger the auto-save mechanism. */
-    if (this.descriptors.size() > 50000) {
-      this.logger.fine("Autosave triggered by adding server descriptor: "
-          + "We have " + this.consensuses.size() + " consensuses and "
-          + this.descriptors.size() + " descriptors in memory. Writing "
-          + "to disk now.");
-      this.writeFiles();
-    }
-  }
-
-  /**
-   * Merges the newly learned consensuses and server descriptors with the
-   * ones we wrote to disk earlier and extracts new statistics about relay
-   * version, platforms, and advertised bandwidth.
-   *
-   * This method is rather complex, because we can only store a limited
-   * number of consensuses and serer descriptors in memory. Also, we want
-   * to avoid going through the files twice, once for merging old and new
-   * lines and another time for extracting statistics.
-   */
-  public void writeFiles() {
-
-   String lastWrittenDay = null;
-
-   try {
-
-      /* Initialize readers for reading intermediate results files from
-       * disk. */
-      BufferedReader consensusesReader = null;
-      if (this.consensusesFile.exists()) {
-        consensusesReader = new BufferedReader(new FileReader(
-            this.consensusesFile));
-      }
-      BufferedReader descriptorsReader = null;
-      if (this.descriptorsFile.exists()) {
-        descriptorsReader = new BufferedReader(new FileReader(
-          this.descriptorsFile));
-      }
-
-      /* Prepare writing intermediate results. The idea is to write to
-       * temporary files while reading from the originals, delete the
-       * originals, and rename the temporary files to be the new
-       * originals. */
-      this.consensusesTempFile.getParentFile().mkdirs();
-      BufferedWriter consensusesWriter = new BufferedWriter(
-          new FileWriter(this.consensusesTempFile));
-      BufferedWriter descriptorsWriter = new BufferedWriter(
-          new FileWriter(this.descriptorsTempFile));
-
-      /* Prepare date format parsers. */
-      SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
-      dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
-      SimpleDateFormat dateTimeFormat =
-          new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-      dateTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
-
-      /* Prepare extracting statistics and writing them to disk. */
-      String statsDate = null;
-      int[] versionStats = new int[this.relayVersions.size() + 1];
-      int[] platformStats = new int[this.relayPlatforms.size() + 1];
-      long bandwidthStats = 0L;
-      int consensusesAtThisDay = 0;
-      BufferedWriter versionWriter = new BufferedWriter(new FileWriter(
-          this.versionStatsFile));
-      BufferedWriter platformWriter = new BufferedWriter(new FileWriter(
-          this.platformStatsFile));
-      BufferedWriter bandwidthWriter = new BufferedWriter(new FileWriter(
-          this.bandwidthStatsFile));
-      versionWriter.write("date");
-      for (String v : this.relayVersions) {
-        versionWriter.write("," + v);
-      }
-      versionWriter.write(",other\n");
-      platformWriter.write("date");
-      for (String p : this.relayPlatforms) {
-        platformWriter.write("," + p);
-      }
-      platformWriter.write(",other\n");
-      bandwidthWriter.write("date,advbw\n");
-
-      /* Always keep one line of the consensuses and descriptors file in
-       * memory. */
-      String consensusLine = consensusesReader != null ?
-          consensusesReader.readLine() : null;
-      String descriptorLine = descriptorsReader != null ?
-          descriptorsReader.readLine() : null;
-
-      /* Iterate over both the consensus file and the consensus strings
-       * that we have in memory at the same time. Whichever has an earlier
-       * valid-after time gets processed. */
-      while (consensusLine != null || !this.consensuses.isEmpty()) {
-
-        /* Find out which line we want to process now, memorize it for
-         * parsing below, advance the source from where we got the line,
-         * and write the line to disk. Afterwards, variable line contains
-         * the consensus line we want to parse in this iteration. */
-        String line = null;
-        if (consensusLine != null) {
-          if (!this.consensuses.isEmpty()) {
-            String fileKey = consensusLine.split(",")[0];
-            String memKey = this.consensuses.firstKey();
-            if (fileKey.equals(memKey)) {
-              this.logger.finer("The consensus we read from disk has the "
-                  + "same valid-after time (" + fileKey + ") time as a "
-                  + "consensus we have in memory. Using the consensus "
-                  + "from memory.");
-              consensusLine = consensusesReader.readLine();
-              continue;
-            } else if (fileKey.compareTo(memKey) < 0) {
-              line = consensusLine;
-              consensusLine = consensusesReader.readLine();
-            } else {
-              line = this.consensuses.remove(memKey);
-            }
-          } else {
-            line = consensusLine;
-            consensusLine = consensusesReader.readLine();
-          }
-        } else {
-          line = this.consensuses.remove(this.consensuses.firstKey());
-        }
-        consensusesWriter.write(line + "\n");
-
-        /* Write all server descriptors to disk that were published more
-         * than 24 hours before the consensus we're about to process. Also
-         * remove those server descriptors from memory. The idea is that
-         * those server descriptors cannot be referenced from the
-         * consensus anyway and would only bloat our memory. */
-        String minus24h = dateTimeFormat.format(new Date(
-            dateTimeFormat.parse(line.split(",")[0]).getTime() -
-            (24L * 60L * 60L * 1000L)));
-        while ((descriptorLine != null &&
-            descriptorLine.split(",")[0].compareTo(minus24h) < 0) ||
-            (!this.descriptors.isEmpty() &&
-            this.descriptors.firstKey().split(",")[0].
-              compareTo(minus24h) < 0)) {
-          if (descriptorLine != null) {
-            if (!this.descriptors.isEmpty()) {
-              /* The first 47 chars contain the publication time (19
-               * chars), a comma (1 char), and the descriptor identifier
-               * (27 chars). */
-              String fileKey = descriptorLine.substring(0, 47);
-              String memKey = this.descriptors.firstKey();
-              if (fileKey.equals(memKey)) {
-                this.logger.finer("The server descriptor we read from "
-                    + "disk has the same publication time and identifier "
-                    + "(" + fileKey + ") as a server descriptor we have "
-                    + "in memory. Using the server descriptor from "
-                    + "memory.");
-                descriptorLine = descriptorsReader.readLine();
-                continue;
-              } else if (fileKey.compareTo(memKey) < 0) {
-                descriptorsWriter.write(descriptorLine + "\n");
-                descriptorLine = descriptorsReader.readLine();
-              } else {
-                String removed = this.descriptors.remove(memKey);
-                this.descById.remove(removed.split(",")[1]);
-                descriptorsWriter.write(removed + "\n");
-              }
-            } else {
-              descriptorsWriter.write(descriptorLine + "\n");
-              descriptorLine = descriptorsReader.readLine();
-            }
-          } else {
-            String removed = this.descriptors.remove(
-                this.descriptors.firstKey());
-            this.descById.remove(removed.split(",")[1]);
-            descriptorsWriter.write(removed + "\n");
-          }
-        }
-
-        /* Read in all server descriptors that were published in the last
-         * 24 hours before the consensus that we're just processing. These
-         * server descriptors might be referenced from the consensus.
-         * Store references to these server descriptors by identifier to
-         * facilitate matching a consensus entry with the corresponding
-         * server descriptor. */
-        String validAfter = line.split(",")[0];
-        while (descriptorsReader != null && descriptorLine != null &&
-            descriptorLine.split(",")[0].compareTo(validAfter) < 0) {
-          this.descriptors.put(descriptorLine.substring(0, 47),
-              descriptorLine);
-          this.descById.put(descriptorLine.split(",")[1], descriptorLine);
-          descriptorLine = descriptorsReader.readLine();
-        }
-
-        /* Now we have a consensus line we want to parse and all possibly
-         * referenced descriptors in descById. Let's write some stats. */
-        String consensusDate = line.substring(0, 10);
-        if (statsDate == null) {
-          statsDate = consensusDate;
-        }
-        if (!statsDate.equals(consensusDate)) {
-          /* We have finished one day of consensuses. If we have parsed at
-           * least half of the possible 24 consensuses of that day, write
-           * stats to disk. */
-          if (consensusesAtThisDay >= 12) {
-            lastWrittenDay = statsDate;
-            versionWriter.write(statsDate);
-            for (int i = 0; i < versionStats.length; i++) {
-              versionWriter.write("," + (versionStats[i] /
-                  consensusesAtThisDay));
-            }
-            versionWriter.write("\n");
-            platformWriter.write(statsDate);
-            for (int i = 0; i < platformStats.length; i++) {
-              platformWriter.write("," + (platformStats[i] /
-                  consensusesAtThisDay));
-            }
-            platformWriter.write("\n");
-            bandwidthWriter.write(statsDate + ","
-                + (bandwidthStats / consensusesAtThisDay) + "\n");
-          } else {
-            this.logger.fine("Not enough consensuses to write to stats.");
-          }
-          /* Fill in NA's for missing dates. */
-          long writtenMillis = dateFormat.parse(statsDate).getTime();
-          if (consensusesAtThisDay < 12) {
-            writtenMillis -= 24L * 60L * 60L * 1000L;
-          }
-          long nextMillis = dateFormat.parse(consensusDate).getTime();
-          while (writtenMillis + (24L * 60L * 60L * 1000L) < nextMillis) {
-            writtenMillis += 24L * 60L * 60L * 1000L;
-            String date = dateFormat.format(new Date(writtenMillis));
-            versionWriter.write(date);
-            for (int i = 0; i < versionStats.length; i++) {
-              versionWriter.write(",NA");
-            }
-            versionWriter.write("\n");
-            platformWriter.write(date);
-            for (int i = 0; i < platformStats.length; i++) {
-              platformWriter.write(",NA");
-            }
-            platformWriter.write("\n");
-            bandwidthWriter.write(date + ",NA\n");
-          }
-          /* Clear counters to collect next day's statistics. */
-          versionStats = new int[this.relayVersions.size() + 1];
-          platformStats = new int[this.relayPlatforms.size() + 1];
-          bandwidthStats = 0L;
-          consensusesAtThisDay = 0;
-          statsDate = consensusDate;
-        }
-
-        /* For the given consensus, parse all referenced server
-         * descriptors to obtain statistics on versions, platforms, and
-         * advertised bandwidth. Only include these values if we have at
-         * least 90 % of all referenced server descriptors. */
-        int[] versionStatsCons = new int[this.relayVersions.size() + 1];
-        int[] platformStatsCons = new int[this.relayPlatforms.size() + 1];
-        long bandwidthStatsCons = 0L;
-        String[] ids = line.split(",");
-        int seenDescs = 0;
-        for (int i = 1; i < ids.length; i++) {
-          if (this.descById.containsKey(ids[i])) {
-            seenDescs++;
-            String desc = this.descById.get(ids[i]);
-            String[] parts = desc.split(",");
-            String version = parts[2].substring(0,
-                parts[2].lastIndexOf("."));
-            if (this.relayVersions.contains(version)) {
-              versionStatsCons[this.relayVersions.indexOf(version)]++;
-            } else {
-              versionStatsCons[versionStatsCons.length - 1]++;
-            }
-            String platform = parts[3].toLowerCase();
-            boolean isOther = true;
-            for (String p : this.relayPlatforms) {
-              if (platform.contains(p.toLowerCase())) {
-                platformStatsCons[this.relayPlatforms.indexOf(p)]++;
-                isOther = false;
-                break;
-              }
-            }
-            if (isOther) {
-              platformStatsCons[platformStatsCons.length - 1]++;
-            }
-            bandwidthStatsCons += Long.parseLong(desc.substring(
-                desc.lastIndexOf(",") + 1));
-          }
-        }
-        if (10 * seenDescs / (ids.length - 1) >= 9) {
-          for (int i = 0; i < versionStatsCons.length; i++) {
-            versionStats[i] += versionStatsCons[i];
-          }
-          for (int i = 0; i < platformStatsCons.length; i++) {
-            platformStats[i] += platformStatsCons[i];
-          }
-          bandwidthStats += bandwidthStatsCons;
-          consensusesAtThisDay++;
-        } else {
-          this.logger.fine("Not enough referenced server descriptors for "
-              + "consensus with valid-after time " + line.substring(0, 19)
-              + ". Not including this consensus in the statistics.");
-        }
-
-        /* We're done reading one consensus. */
-      }
-
-      /* We're done reading all consensuses, both from disk and from
-       * memory. Write remaining server descriptors to disk. These are the
-       * server descriptors that were published 24 hours before the last
-       * parsed consensus and those server descriptors published
-       * afterwards. */
-      while (descriptorLine != null || !this.descriptors.isEmpty()) {
-        if (descriptorLine != null) {
-          if (!this.descriptors.isEmpty()) {
-            String fileKey = descriptorLine.substring(0, 47);
-            String memKey = this.descriptors.firstKey();
-            if (fileKey.equals(memKey)) {
-              this.logger.finer("The server descriptor we read from "
-                    + "disk has the same publication time and identifier "
-                    + "(" + fileKey + ") as a server descriptor we have "
-                    + "in memory. Using the server descriptor from "
-                    + "memory.");
-              descriptorLine = descriptorsReader.readLine();
-              continue;
-            } else if (fileKey.compareTo(memKey) < 0) {
-              descriptorsWriter.write(descriptorLine + "\n");
-              descriptorLine = descriptorsReader.readLine();
-            } else {
-              descriptorsWriter.write(this.descriptors.remove(memKey)
-                  + "\n");
-            }
-          } else {
-            descriptorsWriter.write(descriptorLine + "\n");
-            descriptorLine = descriptorsReader.readLine();
-          }
-        } else {
-          descriptorsWriter.write(this.descriptors.remove(
-              this.descriptors.firstKey()) + "\n");
-        }
-      }
-      this.descById.clear();
-
-      /* Close the files that we read from and wrote to. */
-      if (consensusesReader != null) {
-        consensusesReader.close();
-      }
-      if (descriptorsReader != null) {
-        descriptorsReader.close();
-      }
-      consensusesWriter.close();
-      descriptorsWriter.close();
-      bandwidthWriter.close();
-      versionWriter.close();
-      platformWriter.close();
-
-      /* Delete original files and rename temporary files to be the new
-       * originals. */
-      if (this.consensusesFile.exists()) {
-        this.consensusesFile.delete();
-      }
-      this.consensusesTempFile.renameTo(this.consensusesFile);
-      if (this.descriptorsFile.exists()) {
-        this.descriptorsFile.delete();
-      }
-      this.descriptorsTempFile.renameTo(this.descriptorsFile);
-
-      /* Done. Whee! */
-      this.logger.fine("Finished writing.");
-
-    } catch (Exception e) {
-      this.logger.log(Level.WARNING, "Exception while writing files.", e);
-    }
-
-    /* Write stats. (Including the number of added consensuses and server
-     * descriptors isn't trivial here, because we don't have the full set
-     * of descriptors in memory when adding new ones. */
-    StringBuilder dumpStats = new StringBuilder("Finished writing "
-        + "statistics information contained in consensuses and server "
-        + "descriptors.\n");
-    if (lastWrittenDay == null) {
-      dumpStats.append("No statistics written so far.");
-    } else {
-      dumpStats.append("Last written day of statistics was "
-          + lastWrittenDay);
-    }
-    this.logger.info(dumpStats.toString());
-  }
-}
-- 
1.7.1




More information about the tor-commits mailing list