[tor-commits] [metrics-lib/master] Add descriptor source to fetch descriptors from CollecTor.
karsten at torproject.org
karsten at torproject.org
Thu May 28 08:12:58 UTC 2015
commit e3d381f4c12eb61cc1d6491f31f1ac250602b3d9
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Thu May 21 17:13:17 2015 +0200
Add descriptor source to fetch descriptors from CollecTor.
Includes some really good suggestions from iwakeh.
Implements #16151.
---
.../torproject/descriptor/DescriptorCollector.java | 34 +++
.../descriptor/DescriptorSourceFactory.java | 12 +
.../descriptor/impl/DescriptorCollectorImpl.java | 240 ++++++++++++++++++++
.../torproject/descriptor/impl/ParseHelper.java | 2 +-
.../impl/DescriptorCollectorImplTest.java | 120 ++++++++++
5 files changed, 407 insertions(+), 1 deletion(-)
diff --git a/src/org/torproject/descriptor/DescriptorCollector.java b/src/org/torproject/descriptor/DescriptorCollector.java
new file mode 100644
index 0000000..bd29fb0
--- /dev/null
+++ b/src/org/torproject/descriptor/DescriptorCollector.java
@@ -0,0 +1,34 @@
+/* Copyright 2015 The Tor Project
+ * See LICENSE for licensing information */
+package org.torproject.descriptor;
+
+import java.io.File;
+
+/** Fetch descriptors from the CollecTor service available at
+ * https://collector.torproject.org/ and store them to a local
+ * directory.
+ *
+ * Instances are obtained from
+ * DescriptorSourceFactory.createDescriptorCollector(). */
+public interface DescriptorCollector {
+
+ /**
+ * Fetch remote files from a CollecTor instance that do not yet exist
+ * locally and possibly delete local files that do not exist remotely
+ * anymore.
+ *
+ * The default implementation, DescriptorCollectorImpl, skips a remote
+ * file if a local file with the same path exists and has a
+ * last-modified time that is at least as recent as the remote one.
+ * Fetched files are stored below the local directory under their
+ * remote path, and their last-modified time is set to the time given
+ * in the remote directory listing.
+ *
+ * @param collecTorBaseUrl CollecTor base URL without trailing slash,
+ * e.g., "https://collector.torproject.org". The default
+ * implementation tolerates a single trailing slash and strips it.
+ * @param remoteDirectories Remote directories to collect descriptors
+ * from, e.g., "/recent/relay-descriptors/server-descriptors/". Only
+ * files directly contained in these directories will be collected, no
+ * files in their sub directories. The default implementation adds a
+ * missing slash at the beginning or end of each entry.
+ * @param minLastModified Minimum last-modified time in milliseconds
+ * since the epoch of files to be collected. Set to 0 for collecting
+ * all files. The default implementation rejects negative values with
+ * an IllegalArgumentException.
+ * @param localDirectory Directory where collected files will be
+ * written. The default implementation throws an
+ * IllegalArgumentException if this path exists and is not a directory.
+ * @param deleteExtraneousLocalFiles Whether to delete all local files
+ * that do not exist remotely anymore. The default implementation only
+ * considers local files below remote directories whose listing was
+ * successfully fetched and parsed.
+ */
+ public void collectRemoteFiles(String collecTorBaseUrl,
+ String[] remoteDirectories, long minLastModified,
+ File localDirectory, boolean deleteExtraneousLocalFiles);
+}
+
diff --git a/src/org/torproject/descriptor/DescriptorSourceFactory.java b/src/org/torproject/descriptor/DescriptorSourceFactory.java
index 9bfd81f..49fcdc6 100644
--- a/src/org/torproject/descriptor/DescriptorSourceFactory.java
+++ b/src/org/torproject/descriptor/DescriptorSourceFactory.java
@@ -12,11 +12,14 @@ public final class DescriptorSourceFactory {
"org.torproject.descriptor.impl.DescriptorParserImpl";
public final static String READER_DEFAULT =
"org.torproject.descriptor.impl.DescriptorReaderImpl";
+ public final static String COLLECTOR_DEFAULT =
+ "org.torproject.descriptor.impl.DescriptorCollectorImpl";
/* property names */
public final static String PARSER_PROPERTY = "onionoo.parser";
public final static String READER_PROPERTY = "onionoo.property";
public final static String LOADER_PROPERTY = "onionoo.downloader";
+ public final static String COLLECTOR_PROPERTY = "onionoo.collector";
/**
* Create a descriptor parser.
@@ -39,6 +42,13 @@ public final class DescriptorSourceFactory {
return (DescriptorDownloader) retrieve(LOADER_PROPERTY);
}
+ /**
+ * Create a descriptor collector.
+ */
+ public final static DescriptorCollector createDescriptorCollector() {
+ return (DescriptorCollector) retrieve(COLLECTOR_PROPERTY);
+ }
+
private final static <T> Object retrieve(String type) {
Object object;
String clazzName = null;
@@ -49,6 +59,8 @@ public final class DescriptorSourceFactory {
clazzName = System.getProperty(type, LOADER_DEFAULT);
} else if (READER_PROPERTY.equals(type)) {
clazzName = System.getProperty(type, READER_DEFAULT);
+ } else if (COLLECTOR_PROPERTY.equals(type)) {
+ clazzName = System.getProperty(type, COLLECTOR_DEFAULT);
}
object = ClassLoader.getSystemClassLoader().loadClass(clazzName).
newInstance();
diff --git a/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java b/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java
new file mode 100644
index 0000000..ed88906
--- /dev/null
+++ b/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java
@@ -0,0 +1,240 @@
+/* Copyright 2015 The Tor Project
+ * See LICENSE for licensing information */
+package org.torproject.descriptor.impl;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.Stack;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+
+import org.torproject.descriptor.DescriptorCollector;
+
+public class DescriptorCollectorImpl implements DescriptorCollector {
+
+ @Override
+ public void collectRemoteFiles(String collecTorBaseUrl,
+ String[] remoteDirectories, long minLastModified,
+ File localDirectory, boolean deleteExtraneousLocalFiles) {
+ collecTorBaseUrl = collecTorBaseUrl.endsWith("/")
+ ? collecTorBaseUrl.substring(0, collecTorBaseUrl.length() - 1)
+ : collecTorBaseUrl;
+ if (minLastModified < 0) {
+ throw new IllegalArgumentException("A negative minimum "
+ + "last-modified time is not permitted.");
+ }
+ if (localDirectory.exists() && !localDirectory.isDirectory()) {
+ throw new IllegalArgumentException("Local directory already exists "
+ + "and is not a directory.");
+ }
+ SortedMap<String, Long> localFiles =
+ this.statLocalDirectory(localDirectory);
+ SortedMap<String, String> fetchedDirectoryListings =
+ this.fetchRemoteDirectories(collecTorBaseUrl, remoteDirectories);
+ SortedSet<String> parsedDirectories = new TreeSet<String>();
+ SortedMap<String, Long> remoteFiles = new TreeMap<String, Long>();
+ for (Map.Entry<String, String> e :
+ fetchedDirectoryListings.entrySet()) {
+ String remoteDirectory = e.getKey();
+ String directoryListing = e.getValue();
+ SortedMap<String, Long> parsedRemoteFiles =
+ this.parseDirectoryListing(remoteDirectory, directoryListing);
+ if (parsedRemoteFiles == null) {
+ continue;
+ }
+ parsedDirectories.add(remoteDirectory);
+ remoteFiles.putAll(parsedRemoteFiles);
+ }
+ this.fetchRemoteFiles(collecTorBaseUrl, remoteFiles, minLastModified,
+ localDirectory, localFiles);
+ if (deleteExtraneousLocalFiles) {
+ this.deleteExtraneousLocalFiles(parsedDirectories, remoteFiles,
+ localDirectory, localFiles);
+ }
+ }
+
+ SortedMap<String, Long> statLocalDirectory(
+ File localDirectory) {
+ SortedMap<String, Long> localFiles = new TreeMap<String, Long>();
+ if (!localDirectory.exists()) {
+ return localFiles;
+ }
+ Stack<File> files = new Stack<File>();
+ files.add(localDirectory);
+ while (!files.isEmpty()) {
+ File file = files.pop();
+ if (file.isDirectory()) {
+ files.addAll(Arrays.asList(file.listFiles()));
+ } else {
+ String localPath = file.getPath().substring(
+ localDirectory.getPath().length());
+ localFiles.put(localPath, file.lastModified());
+ }
+ }
+ return localFiles;
+ }
+
+ SortedMap<String, String> fetchRemoteDirectories(
+ String collecTorBaseUrl, String[] remoteDirectories) {
+ SortedMap<String, String> fetchedDirectoryListings =
+ new TreeMap<String, String>();
+ for (String remoteDirectory : remoteDirectories) {
+ String remoteDirectoryWithSlashAtBeginAndEnd =
+ (remoteDirectory.startsWith("/") ? "" : "/") + remoteDirectory
+ + (remoteDirectory.endsWith("/") ? "" : "/");
+ String directoryUrl = collecTorBaseUrl
+ + remoteDirectoryWithSlashAtBeginAndEnd;
+ String directoryListing = this.fetchRemoteDirectory(directoryUrl);
+ if (directoryListing.length() > 0) {
+ fetchedDirectoryListings.put(
+ remoteDirectoryWithSlashAtBeginAndEnd, directoryListing);
+ }
+ }
+ return fetchedDirectoryListings;
+ }
+
+ String fetchRemoteDirectory(String url) {
+ StringBuilder sb = new StringBuilder();
+ try {
+ URL u = new URL(url);
+ HttpURLConnection huc = (HttpURLConnection) u.openConnection();
+ huc.setRequestMethod("GET");
+ huc.connect();
+ int responseCode = huc.getResponseCode();
+ if (responseCode == 200) {
+ BufferedReader br = new BufferedReader(new InputStreamReader(
+ huc.getInputStream()));
+ String line;
+ while ((line = br.readLine()) != null) {
+ sb.append(line + "\n");
+ }
+ br.close();
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ return "";
+ }
+ return sb.toString();
+ }
+
+ final Pattern DIRECTORY_LISTING_LINE_PATTERN =
+ Pattern.compile(".* href=\"([^\"/]+)\"" /* filename */
+ + ".*>(\\d{2}-\\w{3}-\\d{4} \\d{2}:\\d{2})\\s*<.*"); /* dateTime */
+
+ SortedMap<String, Long> parseDirectoryListing(
+ String remoteDirectory, String directoryListing) {
+ SortedMap<String, Long> remoteFiles = new TreeMap<String, Long>();
+ DateFormat dateTimeFormat = ParseHelper.getDateFormat(
+ "dd-MMM-yyyy HH:mm");
+ try {
+ Scanner s = new Scanner(directoryListing);
+ s.useDelimiter("\n");
+ while (s.hasNext()) {
+ String line = s.next();
+ Matcher matcher = DIRECTORY_LISTING_LINE_PATTERN.matcher(line);
+ if (matcher.matches()) {
+ String filename = matcher.group(1);
+ long lastModifiedMillis = dateTimeFormat.parse(
+ matcher.group(2)).getTime();
+ remoteFiles.put(remoteDirectory + filename, lastModifiedMillis);
+ }
+ }
+ s.close();
+ } catch (ParseException e) {
+ e.printStackTrace();
+ return null;
+ }
+ return remoteFiles;
+ }
+
+ void fetchRemoteFiles(String collecTorBaseUrl,
+ SortedMap<String, Long> remoteFiles, long minLastModified,
+ File localDirectory, SortedMap<String, Long> localFiles) {
+ for (Map.Entry<String, Long> e : remoteFiles.entrySet()) {
+ String filename = e.getKey();
+ long lastModifiedMillis = e.getValue();
+ if (lastModifiedMillis < minLastModified ||
+ (localFiles.containsKey(filename) &&
+ localFiles.get(filename) >= lastModifiedMillis)) {
+ continue;
+ }
+ String url = collecTorBaseUrl + filename;
+ File destinationFile = new File(localDirectory.getPath()
+ + filename);
+ this.fetchRemoteFile(url, destinationFile, lastModifiedMillis);
+ }
+ }
+
+ void fetchRemoteFile(String url, File destinationFile,
+ long lastModifiedMillis) {
+ try {
+ File destinationDirectory = destinationFile.getParentFile();
+ destinationDirectory.mkdirs();
+ File tempDestinationFile = new File(destinationDirectory, "."
+ + destinationFile.getName());
+ FileOutputStream fos = new FileOutputStream(tempDestinationFile);
+ URL u = new URL(url);
+ HttpURLConnection huc = (HttpURLConnection) u.openConnection();
+ huc.setRequestMethod("GET");
+ if (!url.endsWith(".xz")) {
+ huc.addRequestProperty("Accept-Encoding", "gzip");
+ }
+ huc.connect();
+ int responseCode = huc.getResponseCode();
+ if (responseCode == 200) {
+ InputStream is;
+ if (huc.getContentEncoding() != null &&
+ huc.getContentEncoding().equalsIgnoreCase("gzip")) {
+ is = new GZIPInputStream(huc.getInputStream());
+ } else {
+ is = huc.getInputStream();
+ }
+ BufferedInputStream bis = new BufferedInputStream(is);
+ int len;
+ byte[] data = new byte[1024];
+ while ((len = bis.read(data, 0, 1024)) >= 0) {
+ fos.write(data, 0, len);
+ }
+ bis.close();
+ fos.close();
+ tempDestinationFile.renameTo(destinationFile);
+ destinationFile.setLastModified(lastModifiedMillis);
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ void deleteExtraneousLocalFiles(
+ SortedSet<String> parsedDirectories,
+ SortedMap<String, Long> remoteFiles, File localDirectory,
+ SortedMap<String, Long> localFiles) {
+ for (String localPath : localFiles.keySet()) {
+ for (String remoteDirectory : parsedDirectories) {
+ if (localPath.startsWith(remoteDirectory)) {
+ if (!remoteFiles.containsKey(localPath)) {
+ new File(localDirectory.getPath() + localPath).delete();
+ }
+ }
+ }
+ }
+ }
+}
+
diff --git a/src/org/torproject/descriptor/impl/ParseHelper.java b/src/org/torproject/descriptor/impl/ParseHelper.java
index 226bf80..09534c7 100644
--- a/src/org/torproject/descriptor/impl/ParseHelper.java
+++ b/src/org/torproject/descriptor/impl/ParseHelper.java
@@ -153,7 +153,7 @@ public class ParseHelper {
super.set(value);
}
};
- private static DateFormat getDateFormat(String format) {
+ static DateFormat getDateFormat(String format) {
Map<String, DateFormat> threadDateFormats = dateFormats.get();
if (!threadDateFormats.containsKey(format)) {
DateFormat dateFormat = new SimpleDateFormat(format);
diff --git a/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java b/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java
new file mode 100644
index 0000000..2715f12
--- /dev/null
+++ b/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java
@@ -0,0 +1,120 @@
+/* Copyright 2015 The Tor Project
+ * See LICENSE for licensing information */
+package org.torproject.descriptor.impl;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+import java.util.SortedMap;
+
+import org.junit.Test;
+
+/**
+ * Tests for parsing CollecTor directory listings with
+ * {@link DescriptorCollectorImpl#parseDirectoryListing}.
+ */
+public class DescriptorCollectorImplTest {
+
+  private static final String REMOTE_DIRECTORY_CONSENSUSES =
+      "/recent/relay-descriptors/consensuses/";
+
+  /** A single file entry yields one remote file with its timestamp. */
+  @Test
+  public void testOneFile() {
+    String remoteFilename = "2015-05-24-12-00-00-consensus";
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/unknown.gif\" alt=\"[ ]\"></td><td>"
+        + "<a href=\"" + remoteFilename + "\">"
+        + "2015-05-24-12-00-00-consensus</a></td>"
+        + "<td align=\"right\">24-May-2015 12:08 </td>"
+        + "<td align=\"right\">1.5M</td><td> </td></tr>";
+    SortedMap<String, Long> remoteFiles =
+        new DescriptorCollectorImpl().parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles);
+    /* Compare sizes by value; assertSame would compare boxed Integer
+     * references and only work by virtue of the Integer cache. */
+    assertEquals(1, remoteFiles.size());
+    assertEquals(REMOTE_DIRECTORY_CONSENSUSES + remoteFilename,
+        remoteFiles.firstKey());
+    assertEquals(Long.valueOf(1432469280000L),
+        remoteFiles.get(remoteFiles.firstKey()));
+  }
+
+  /** If a file is listed twice, the timestamp of the last entry wins. */
+  @Test
+  public void testSameFileTwoTimestampsLastWins() {
+    String remoteFilename = "2015-05-24-12-00-00-consensus";
+    String firstTimestamp = "24-May-2015 12:04";
+    String secondTimestamp = "24-May-2015 12:08";
+    String lineFormat = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/unknown.gif\" alt=\"[ ]\"></td><td>"
+        + "<a href=\"%s\">2015-05-24-12-00-00-consensus</a></td>"
+        + "<td align=\"right\">%s </td>"
+        + "<td align=\"right\">1.5M</td><td> </td></tr>\n";
+    String directoryListing = String.format(lineFormat + lineFormat,
+        remoteFilename, firstTimestamp, remoteFilename, secondTimestamp);
+    SortedMap<String, Long> remoteFiles =
+        new DescriptorCollectorImpl().parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles);
+    assertEquals(1, remoteFiles.size());
+    assertEquals(REMOTE_DIRECTORY_CONSENSUSES + remoteFilename,
+        remoteFiles.firstKey());
+    assertEquals(Long.valueOf(1432469280000L),
+        remoteFiles.get(remoteFiles.firstKey()));
+  }
+
+  /** Sub directory entries (href ending in a slash) are ignored. */
+  @Test
+  public void testSubDirectoryOnly() {
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/folder.gif\" alt=\"[DIR]\"></td><td>"
+        + "<a href=\"subdir/\">subdir/</a></td>"
+        + "<td align=\"right\">27-May-2015 14:07 </td>"
+        + "<td align=\"right\"> - </td><td> </td></tr>";
+    DescriptorCollectorImpl collector = new DescriptorCollectorImpl();
+    SortedMap<String, Long> remoteFiles = collector.parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles);
+    assertTrue(remoteFiles.isEmpty());
+  }
+
+  /** The link to the parent directory is ignored. */
+  @Test
+  public void testParentDirectoryOnly() {
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/back.gif\" alt=\"[DIR]\"></td><td>"
+        + "<a href=\"/recent/relay-descriptors/\">Parent Directory</a>"
+        + "</td><td> </td><td align=\"right\"> - </td>"
+        + "<td> </td></tr>";
+    DescriptorCollectorImpl collector = new DescriptorCollectorImpl();
+    SortedMap<String, Long> remoteFiles = collector.parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles);
+    assertTrue(remoteFiles.isEmpty());
+  }
+
+  /** A timestamp in a format other than dd-MMM-yyyy HH:mm does not match
+   * the line pattern, so the entry is skipped rather than failing. */
+  @Test
+  public void testUnexpectedDateFormat() {
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/unknown.gif\" alt=\"[ ]\"></td><td>"
+        + "<a href=\"2015-05-24-12-00-00-consensus\">"
+        + "2015-05-24-12-00-00-consensus</a></td>"
+        + "<td align=\"right\">2015-05-24 12:08 </td>"
+        + "<td align=\"right\">1.5M</td><td> </td></tr>";
+    SortedMap<String, Long> remoteFiles =
+        new DescriptorCollectorImpl().parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles);
+    assertTrue(remoteFiles.isEmpty());
+  }
+
+  /** A well-formed but invalid date makes the whole listing unusable,
+   * which is signalled by a null result. */
+  @Test
+  public void testInvalidDate() {
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/unknown.gif\" alt=\"[ ]\"></td><td>"
+        + "<a href=\"2015-05-24-12-00-00-consensus\">"
+        + "2015-05-24-12-00-00-consensus</a></td>"
+        + "<td align=\"right\">34-May-2015 12:08 </td>"
+        + "<td align=\"right\">1.5M</td><td> </td></tr>";
+    SortedMap<String, Long> remoteFiles =
+        new DescriptorCollectorImpl().parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNull(remoteFiles);
+  }
+}
+
More information about the tor-commits
mailing list