[tor-commits] [metrics-tasks/master] Implement a multi-GeoIP database in Java (#6471).
karsten at torproject.org
karsten at torproject.org
Wed Sep 12 23:13:58 UTC 2012
commit 1ad8f400c93d1178d83a91a3a33e107a652e985e
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Wed Sep 12 18:49:43 2012 -0400
Implement a multi-GeoIP database in Java (#6471).
---
task-6471/.gitignore | 2 +
task-6471/README | 3 +
task-6471/java/.gitignore | 5 +
task-6471/java/build.xml | 42 ++
.../java/src/org/torproject/task6471/Database.java | 70 +++
.../src/org/torproject/task6471/DatabaseImpl.java | 652 ++++++++++++++++++++
.../task6471/DatabasePerformanceExample.java | 146 +++++
.../src/org/torproject/task6471/DatabaseTest.java | 141 +++++
8 files changed, 1061 insertions(+), 0 deletions(-)
diff --git a/task-6471/.gitignore b/task-6471/.gitignore
new file mode 100644
index 0000000..6dbccb1
--- /dev/null
+++ b/task-6471/.gitignore
@@ -0,0 +1,2 @@
+data/
+
diff --git a/task-6471/README b/task-6471/README
new file mode 100644
index 0000000..bb1a572
--- /dev/null
+++ b/task-6471/README
@@ -0,0 +1,3 @@
+Task 6471 -- Designing a file format and Python/Java library for multiple
+ GeoIP or AS databases
+
diff --git a/task-6471/java/.gitignore b/task-6471/java/.gitignore
new file mode 100644
index 0000000..86b7b14
--- /dev/null
+++ b/task-6471/java/.gitignore
@@ -0,0 +1,5 @@
+.classpath
+.project
+lib/
+classes/
+
diff --git a/task-6471/java/build.xml b/task-6471/java/build.xml
new file mode 100644
index 0000000..9eb1223
--- /dev/null
+++ b/task-6471/java/build.xml
@@ -0,0 +1,42 @@
+<project default="test" name="task-6471" basedir=".">
+ <property name="sources" value="src"/>
+ <property name="classes" value="classes"/>
+ <property name="libs" value="lib"/>
+ <path id="classpath">
+ <pathelement path="${classes}"/>
+ <fileset dir="${libs}">
+ <include name="*.jar"/>
+ </fileset>
+ </path>
+ <target name="init">
+ <mkdir dir="${classes}"/>
+ </target>
+ <target name="compile"
+ depends="init">
+ <javac destdir="${classes}"
+ srcdir="${sources}"
+ source="1.5"
+ target="1.5"
+ debug="true"
+ deprecation="true"
+ optimize="false"
+ failonerror="true"
+ includeantruntime="false">
+ <classpath refid="classpath"/>
+ </javac>
+ </target>
+ <target name="test" depends="compile">
+ <junit fork="true"
+ haltonfailure="true"
+ maxmemory="1g"
+ printsummary="off">
+ <classpath refid="classpath"/>
+ <formatter type="plain" usefile="false"/>
+ <batchtest>
+ <fileset dir="${classes}"
+ includes="**/*Test.class"/>
+ </batchtest>
+ </junit>
+ </target>
+</project>
+
diff --git a/task-6471/java/src/org/torproject/task6471/Database.java b/task-6471/java/src/org/torproject/task6471/Database.java
new file mode 100644
index 0000000..447b3c2
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/Database.java
@@ -0,0 +1,70 @@
+/* Copyright 2012 The Tor Project */
+package org.torproject.task6471;
+
+/**
+ * Database storing multiple GeoIP databases and supporting efficient
+ * ip-to-country lookups in the most recent of those databases for any
+ * given date.
+ *
+ * A typical query for this GeoIP database is: "to which country was IPv4
+ * address 1.2.3.4 assigned on date 20120912?" This query is answered by
+ * looking at the entries from the most recent database published on or
+ * before 20120912. If the earliest known database was published after
+ * 20120912, the earliest known database is used to resolve the request.
+ */
+public interface Database {
+
+ /**
+ * Import the contents of one or more IP address assignments files
+ * published by the Regional Internet Registries. The file or files
+ * are expected to conform to the RIR Statistics Exchange Format.
+ * Only IPv4 address ranges are imported, whereas ASN and IPv6 lines are
+ * ignored. Only the country code, start address, and address range
+ * length fields are imported. (TODO Extend to IPv6 and find similar
+ * data source for ASN.)
+ *
+ * A typical entry from a RIR file is:
+ * "ripencc|FR|ipv4|2.0.0.0|1048576|20100712|allocated".
+ *
+ * It is important to note that all five registry files (AfriNIC, APNIC,
+ * ARIN, LACNIC, and RIPE NCC) published on a given day should be
+ * imported, or the missing address ranges will be considered as
+ * unassigned from that day until the next database publication day.
+ * (TODO We could be smarter here by checking that less than five
+ * registry files have been imported for the same day, or something.)
+ *
+ * @param path Path to a stats file or directory.
+ * @return True if importing the file or directory was successful,
+ * false otherwise.
+ */
+ public boolean importRegionalRegistryStatsFileOrDirectory(String path);
+
+ /**
+ * Save the combined databases in a format that can later be loaded much
+ * more efficiently than importing the original RIR files again.
+ *
+ * @param path Path to the combined database file.
+ * @return True if saving the combined database file was successful,
+ * false otherwise.
+ */
+ public boolean saveCombinedDatabases(String path);
+
+ /**
+ * Load a combined databases file.
+ *
+ * @param path Path to the combined database file.
+ * @return True if loading the combined database file was successful,
+ * false otherwise.
+ */
+ public boolean loadCombinedDatabases(String path);
+
+ /**
+ * Query the database for an IPv4 address and assignment date.
+ *
+ * @param address IPv4 address in dotted-quad notation.
+ * @param date Assignment date in format yyyymmdd.
+ * @return Assigned country code, or null if no assignment could be
+ * found.
+ */
+ public String lookupAddress(String address, String date);
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java b/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java
new file mode 100644
index 0000000..43e5a95
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java
@@ -0,0 +1,652 @@
+/* Copyright 2012 The Tor Project */
+package org.torproject.task6471;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.Stack;
+import java.util.TimeZone;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+/**
+ * Implementation of database holding multiple GeoIP databases with
+ * special focus on lookup performance, import performance, and memory
+ * consumption (in that order).
+ *
+ * This implementation uses a single tree to store IP address and date
+ * ranges. Each tree element is stored under a long integer consisting of
+ * the start IPv4 address in the higher bits and the first database
+ * publication date containing that range in the lower bits. The tree
+ * element itself contains the end IPv4 address, last database publication
+ * date, last database index number (see explanation further below), and
+ * country code.
+ *
+ * Lookups for a given address and random date only require iterating
+ * backwards over ranges with start address smaller than or equaling the
+ * requested address and can terminate as soon as a range with a smaller
+ * end address is encountered.
+ *
+ * As a requirement for lookups to work correctly, address ranges may
+ * never overlap for different assignment periods. Similarly, date
+ * assignment ranges for a given address range may not overlap. These
+ * requirements make the import process somewhat more complex and
+ * time-consuming, which is a tradeoff for faster lookup performance.
+ *
+ * The purpose of storing the last database index number is to fix ranges
+ * that are contained in two or more databases, but that are missing in a
+ * database that was published between the others but imported after them.
+ * The database index number defines that the range is only valid for
+ * databases imported until a given database, not necessarily for
+ * databases importer later on. A separate repair run is necessary to
+ * check whether later imported databases require splitting a range into
+ * two or more sub ranges to correctly reflect that the range was not
+ * contained in those databases.
+ */
+public class DatabaseImpl implements Database {
+
+ /**
+ * Tree element containing an end IPv4 address, last database date,
+ * last database index, and country code. Start IPv4 address and first
+ * database date are encoded in the key under which the element is
+ * stored.
+ */
+ private static class TreeElement {
+ private long endAddress;
+ private int lastDbDate;
+ private int lastKnownDbIndex;
+ private String countryCode;
+ TreeElement(long endAddress, int lastDbDate, int lastKnownDbIndex,
+ String countryCode) {
+ this.endAddress = endAddress;
+ this.lastDbDate = lastDbDate;
+ this.lastKnownDbIndex = lastKnownDbIndex;
+ this.countryCode = countryCode;
+ }
+ }
+
+ /**
+ * IPv4 address and date ranges, ordered backwards by start address and
+ * first database date.
+ */
+ private SortedMap<Long, TreeElement> ranges =
+ new TreeMap<Long, TreeElement>(Collections.reverseOrder());
+
+ /**
+ * Return number of contained ranges.
+ */
+ int getNumberOfElements() {
+ return this.ranges.size();
+ }
+
+ /**
+ * Database dates ordered from oldest to youngest.
+ */
+ private SortedSet<Integer> databaseDates = new TreeSet<Integer>();
+
+ /**
+ * Ordered list of database dates to find out their indexes.
+ */
+ private List<Integer> databaseIndexes = new ArrayList<Integer>();
+
+ /**
+ * Parse one or more stats files.
+ */
+ public boolean importRegionalRegistryStatsFileOrDirectory(String path) {
+ boolean allImportsSuccessful = true;
+ Stack<File> files = new Stack<File>();
+ files.add(new File(path));
+ while (!files.isEmpty()) {
+ File file = files.pop();
+ if (file.isDirectory()) {
+ files.addAll(Arrays.asList(file.listFiles()));
+ } else if (file.getName().endsWith(".md5") ||
+ file.getName().endsWith(".md5.gz") ||
+ file.getName().endsWith(".asc") ||
+ file.getName().endsWith(".asc.gz")) {
+ System.err.println("Signature and digest files are not supported "
+ + "yet: '" + file.getAbsolutePath() + "'. Skipping.");
+ /* TODO Implement checking signatures/digests. */
+ } else if (file.getName().endsWith(".gz") ||
+ file.getName().endsWith(".bz2")) {
+ System.err.println("Parsing compressed files is not supported "
+ + "yet: '" + file.getAbsolutePath() + "'. Skipping.");
+ /* TODO Implement parsing compressed files. */
+ } else if (!this.importRegionalRegistryStatsFile(file)) {
+ allImportsSuccessful = false;
+ }
+ }
+ return allImportsSuccessful;
+ }
+
+ /**
+ * Simple and not very robust implementation of an RIR stats file
+ * parser.
+ */
+ private boolean importRegionalRegistryStatsFile(File file) {
+ try {
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ String line;
+ String databaseDateString =
+ file.getName().substring(file.getName().length() - 8);
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("#") || line.length() == 0) {
+ /* Skip comment line. */
+ continue;
+ }
+ String[] parts = line.split("\\|");
+ if (parts[0].equals("2")) {
+ continue;
+ }
+ if (parts[1].equals("*")) {
+ /* Skip summary line. */
+ continue;
+ }
+ String type = parts[2];
+ if (type.equals("asn")) {
+ continue;
+ } else if (type.equals("ipv6")) {
+ continue;
+ }
+ String countryCode = parts[1].toLowerCase();
+ String startAddressString = parts[3];
+ long addresses = Long.parseLong(parts[4]);
+ this.addRange(databaseDateString, countryCode, startAddressString,
+ addresses);
+ }
+ br.close();
+ this.repairIndex();
+ } catch (IOException e) {
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Internal counters for import statistics.
+ */
+ private int rangeImports = 0, rangeImportsKeyLookups = 0;
+
+ /**
+ * Add a single address and date range to the database, which may
+ * require splitting up existing ranges.
+ *
+ * This method has default visibility and is not specified in the
+ * interface, because the caller needs to make sure that repairIndex()
+ * is called prior to any lookupAddress() calls. No further checks are
+ * performed that the tree is repaired before look up an address.
+ */
+ void addRange(String databaseDateString, String countryCode,
+ String startAddressString, long addresses) {
+
+ this.rangeImports++;
+ int databaseDate = convertDateStringToNumber(databaseDateString);
+ long startAddress = convertAddressStringToNumber(startAddressString);
+ long endAddress = startAddress + addresses - 1L;
+
+ /* Add new database date if it's not yet contained. */
+ if (!this.databaseDates.contains(databaseDate)) {
+ this.databaseDates.add(databaseDate);
+ this.databaseIndexes.add(databaseDate);
+ if (this.databaseIndexes.size() > 1) {
+ this.needToFixDatabase = true;
+ }
+ }
+
+ /* We might have to split existing ranges or the new range before
+ * adding it to the database, and we might have to remove existing
+ * ranges. We shouldn't mess with the tree directly while iterating
+ * over it, so let's for now only calculate what changes we want to
+ * make. */
+ SortedMap<Long, TreeElement> updateElements =
+ this.getUpdatesForAddingRange(databaseDate, countryCode,
+ startAddress, endAddress);
+
+ /* Apply updates. Elements with non-null values are added, elements
+ * with null values are removed. */
+ for (Map.Entry<Long, TreeElement> e : updateElements.entrySet()) {
+ if (e.getValue() == null) {
+ this.ranges.remove(e.getKey());
+ } else {
+ this.ranges.put(e.getKey(), e.getValue());
+ }
+ }
+ }
+
+ /**
+ * Calculate necessary changes to the tree to add a range.
+ */
+ private SortedMap<Long, TreeElement> getUpdatesForAddingRange(
+ int databaseDate, String countryCode, long startAddress,
+ long endAddress) {
+
+ /* Keep updates in a single tree where non-null values will later be
+ * added, possibly replacing existing elements, and null values will
+ * be removed from the tree. */
+ SortedMap<Long, TreeElement> updateElements =
+ new TreeMap<Long, TreeElement>();
+
+ /* Find out previous and next database, so that we can possibly merge
+ * ranges. */
+ int previousDatabaseDate =
+ this.databaseDates.headSet(databaseDate).isEmpty() ? -1 :
+ this.databaseDates.headSet(databaseDate).last();
+ int nextDatabaseDate =
+ this.databaseDates.tailSet(databaseDate + 1).isEmpty() ? -1 :
+ this.databaseDates.tailSet(databaseDate + 1).first();
+
+ /* Look up database index number of the range to be added. */
+ int dbIndex = this.databaseIndexes.indexOf(databaseDate);
+
+ /* Remember the address boundaries of the next (partial) range to be
+ * added. */
+ long nextStartAddress = startAddress, nextEndAddress = endAddress;
+ int nextFirstDbDate = databaseDate, nextLastDbDate = databaseDate;
+
+ /* Iterate backwards over the existing ranges, starting at the end
+ * address of the range to be added and at the last conceivable
+ * database publication date. */
+ for (Map.Entry<Long, TreeElement> e :
+ this.ranges.tailMap(((endAddress + 1L) << 16) - 1L).entrySet()) {
+ this.rangeImportsKeyLookups++;
+
+ /* Extract everything we need to know from the next existing range
+ * we're looking at. */
+ long eStartAddress = e.getKey() >> 16;
+ long eEndAddress = e.getValue().endAddress;
+ int eFirstDbDate = (int) (e.getKey() & ((1L << 16) - 1L));
+ int eLastDbDate = e.getValue().lastDbDate;
+ int eLastKnownDbIndex = e.getValue().lastKnownDbIndex;
+ String eCountryCode = e.getValue().countryCode;
+
+ /* If the next (partial) range starts after the current element
+ * ends, add the new range. */
+ if (nextStartAddress > eEndAddress &&
+ nextEndAddress >= startAddress) {
+ updateElements.put((nextStartAddress << 16) + nextFirstDbDate,
+ new TreeElement(nextEndAddress, nextLastDbDate, dbIndex,
+ countryCode));
+ nextEndAddress = nextStartAddress - 1L;
+ nextStartAddress = startAddress;
+ nextFirstDbDate = databaseDate;
+ nextLastDbDate = databaseDate;
+ }
+
+ /* If the next (partial) range still ends after the current element
+ * ends, add the new range. */
+ if (nextEndAddress > eEndAddress &&
+ nextEndAddress >= startAddress) {
+ updateElements.put(((eEndAddress + 1L) << 16) + databaseDate,
+ new TreeElement(nextEndAddress, databaseDate, dbIndex,
+ countryCode));
+ nextEndAddress = eEndAddress;
+ nextStartAddress = startAddress;
+ nextFirstDbDate = databaseDate;
+ nextLastDbDate = databaseDate;
+ }
+
+ /* If the existing range ends before the new range starts, we don't
+ * have to look at any more existing ranges. */
+ if (eEndAddress < startAddress) {
+ break;
+ }
+
+ /* Cut off any address range parts of the existing element that are
+ * not contained in the currently added range. First check whether
+ * the existing range ends after the newly added range. In that
+ * case cut off the overlapping part and store it as a new
+ * element.*/
+ if (eStartAddress <= endAddress && eEndAddress > endAddress) {
+ updateElements.put(((endAddress + 1L) << 16) + eFirstDbDate,
+ new TreeElement(eEndAddress, eLastDbDate, eLastKnownDbIndex,
+ eCountryCode));
+ updateElements.put((eStartAddress << 16) + eFirstDbDate,
+ new TreeElement(endAddress, eLastDbDate, eLastKnownDbIndex,
+ eCountryCode));
+ eEndAddress = endAddress;
+ }
+
+ /* Similarly, check whether the existing range starts before the
+ * newly added one. If so, cut off the overlapping part and store
+ * it as new element. */
+ if (eStartAddress < startAddress && eEndAddress >= startAddress) {
+ updateElements.put((eStartAddress << 16) + eFirstDbDate,
+ new TreeElement(startAddress - 1L, eLastDbDate,
+ eLastKnownDbIndex, eCountryCode));
+ updateElements.put((startAddress << 16) + eFirstDbDate,
+ new TreeElement(eEndAddress, eLastDbDate, eLastKnownDbIndex,
+ eCountryCode));
+ eStartAddress = startAddress;
+ }
+
+ /* Now we're sure the existing element doesn't exceed the newly
+ * added element, address-wise. */
+ nextStartAddress = eStartAddress;
+ nextEndAddress = eEndAddress;
+
+ /* If the range is already contained, maybe even with different
+ * country code, ignore it. */
+ if (eFirstDbDate <= databaseDate && eLastDbDate >= databaseDate) {
+ updateElements.clear();
+ return updateElements;
+ }
+
+ /* See if we can merge the new range with the previous or next
+ * range. If so, extend our database range and mark the existing
+ * element for deletion. */
+ if (eCountryCode.equals(countryCode)) {
+ if (eLastDbDate == previousDatabaseDate) {
+ nextFirstDbDate = eFirstDbDate;
+ updateElements.put((eStartAddress << 16) + eFirstDbDate, null);
+ } else if (eFirstDbDate == nextDatabaseDate) {
+ nextLastDbDate = eLastDbDate;
+ updateElements.put((eStartAddress << 16) + eFirstDbDate, null);
+ }
+ }
+ }
+
+ /* If there's still some part (or the whole?) address range left to
+ * add, add it now. */
+ while (nextEndAddress >= startAddress) {
+ updateElements.put((nextStartAddress << 16) + nextFirstDbDate,
+ new TreeElement(nextEndAddress, nextLastDbDate, dbIndex,
+ countryCode));
+ nextEndAddress = nextStartAddress - 1L;
+ nextStartAddress = startAddress;
+ nextFirstDbDate = databaseDate;
+ nextLastDbDate = databaseDate;
+ }
+
+ /* Return the tree updates that will add the given range. */
+ return updateElements;
+ }
+
+ /* Do we have to repair the tree? */
+ private boolean needToFixDatabase = false;
+
+ /**
+ * Repair tree by making sure that any range from a given database date
+ * to another is still valid when considering any other database that
+ * was imported later.
+ */
+ void repairIndex() {
+ if (!needToFixDatabase) {
+ return;
+ }
+ int maxDatabaseIndex = databaseIndexes.size() - 1;
+ if (maxDatabaseIndex < 1) {
+ return;
+ }
+ SortedMap<Long, TreeElement> updateElements =
+ new TreeMap<Long, TreeElement>();
+ for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) {
+ if (e.getValue().lastKnownDbIndex < maxDatabaseIndex) {
+ int eFirstDbDate = (int) (e.getKey() & ((1L << 16) - 1L));
+ int eLastDbDate = e.getValue().lastDbDate;
+ List<Integer> splitAtDates = new ArrayList<Integer>();
+ for (int dbIndex = e.getValue().lastKnownDbIndex + 1;
+ dbIndex <= maxDatabaseIndex; dbIndex++) {
+ int dbDate = databaseIndexes.get(dbIndex);
+ if (eFirstDbDate < dbDate && eLastDbDate > dbDate) {
+ splitAtDates.add(dbDate);
+ }
+ }
+ if (splitAtDates.isEmpty()) {
+ e.getValue().lastKnownDbIndex = maxDatabaseIndex;
+ } else {
+ long eStartAddress = e.getKey() >> 16;
+ long eEndAddress = e.getValue().endAddress;
+ String eCountryCode = e.getValue().countryCode;
+ int start = eFirstDbDate, end = eFirstDbDate;
+ for (int cur : this.databaseDates.tailSet(eFirstDbDate)) {
+ if (cur > eLastDbDate) {
+ break;
+ }
+ if (splitAtDates.contains(cur)) {
+ if (start >= 0 && end >= 0) {
+ updateElements.put((eStartAddress << 16) + start,
+ new TreeElement(eEndAddress, end,
+ maxDatabaseIndex, eCountryCode));
+ start = end = -1;
+ }
+ } else if (start < 0) {
+ start = end = cur;
+ } else {
+ end = cur;
+ }
+ }
+ if (start >= 0 && end >= 0) {
+ updateElements.put((eStartAddress << 16) + start,
+ new TreeElement(eEndAddress, end,
+ maxDatabaseIndex, eCountryCode));
+ }
+ }
+ }
+ }
+ for (Map.Entry<Long, TreeElement> e : updateElements.entrySet()) {
+ this.ranges.put(e.getKey(), e.getValue());
+ }
+ this.needToFixDatabase = false;
+ }
+
+ /**
+ * Internal counters for lookup statistics.
+ */
+ private int addressLookups = 0, addressLookupsKeyLookups = 0;
+
+ /**
+ * Look up address and date by iterating backwards over possibly
+ * matching ranges.
+ */
+ public String lookupAddress(String addressString, String dateString) {
+ this.addressLookups++;
+
+ long address = convertAddressStringToNumber(addressString);
+ int date = convertDateStringToNumber(dateString);
+
+ if (this.databaseDates.isEmpty()) {
+ return null;
+ }
+
+ /* Look up which database we want. */
+ int databaseDate = this.databaseDates.headSet(date + 1).isEmpty() ?
+ this.databaseDates.first() :
+ this.databaseDates.headSet(date + 1).last();
+
+ /* Iterate backwards over the existing ranges, starting at the last
+ * possible date of the address to be found. */
+ for (Map.Entry<Long, TreeElement> e :
+ this.ranges.tailMap(((address + 1L) << 16) - 1L).entrySet()) {
+ this.addressLookupsKeyLookups++;
+
+ /* If either the end address or end date of the range we're looking
+ * at is smaller than the values we're looking for, we can be sure
+ * not to find it anymore. */
+ if (e.getValue().endAddress < address ||
+ e.getValue().lastDbDate < databaseDate) {
+ return null;
+ }
+
+ /* If the range starts at a later date, skip it and look at the next
+ * one. */
+ long startDate = e.getKey() & ((1L << 16) - 1L);
+ if (startDate > databaseDate) {
+ continue;
+ }
+
+ /* Both address and date ranges match, so return the assigned
+ * country code. */
+ return e.getValue().countryCode;
+ }
+
+ /* No ranges (left) to look at. We don't have what we were looking
+ * for. */
+ return null;
+ }
+
+ /* Helper: convert a dotted-quad formatted address string to its
+ * corresponding long integer number. */
+ static long convertAddressStringToNumber(String addressString) {
+ long address = 0;
+ String[] addressParts = addressString.split("\\.");
+ for (int i = 0; i < 4; i++) {
+ address += Long.parseLong(addressParts[i]) << ((3 - i) * 8);
+ }
+ return address;
+ }
+
+ /* Helper: convert a long integer address number to its corresponding
+ * dotted-quad formatted string. */
+ static String convertAddressNumberToString(long address) {
+ return "" + (address / 256 / 256 / 256) + "."
+ + ((address / 256 / 256) % 256) + "."
+ + ((address / 256) % 256) + "." + (address % 256);
+ }
+
+ /* Helper: date format parser/formatter. */
+ private static SimpleDateFormat dateFormat;
+ static {
+ dateFormat = new SimpleDateFormat("yyyyMMdd");
+ dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ }
+
+ /* Helper: convert date string in format yyyymmdd to integer containing
+ * days passed since 1970-01-01. */
+ static int convertDateStringToNumber(String dateString)
+ throws IllegalArgumentException {
+ try {
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMdd");
+ dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ return (int) (dateFormat.parse(dateString).getTime() / 86400000);
+ } catch (ParseException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+ /* Helper: convert integer containing days passed since 1970-01-01 to
+ * date string in format yyyymmdd. */
+ static String convertDateNumberToString(int date) {
+ return dateFormat.format(((long) date) * 86400000);
+ }
+
+ /* Return a nicely formatted string summarizing database contents and
+ * usage statistics. */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(String.format("Database contains %d databases and %d "
+ + "combined address ranges.\n"
+ + "Performed %d address range imports requiring %d lookups.\n"
+ + "Performed %d address lookups requiring %d lookups.\n"
+ + "First 10 entries, in reverse order, are:",
+ this.databaseDates.size(), this.ranges.size(), rangeImports,
+ rangeImportsKeyLookups, addressLookups,
+ addressLookupsKeyLookups));
+ int entries = 10;
+ for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) {
+ sb.append(String.format("%n %s %s %s %s %s %d",
+ convertAddressNumberToString(e.getKey() >> 16),
+ convertAddressNumberToString(e.getValue().endAddress),
+ e.getValue().countryCode,
+ convertDateNumberToString(
+ (int) (e.getKey() & ((1L << 16) - 1L))),
+ convertDateNumberToString(e.getValue().lastDbDate),
+ e.getValue().lastKnownDbIndex));
+ if (--entries <= 0) {
+ break;
+ }
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Save the combined databases to disk.
+ */
+ public boolean saveCombinedDatabases(String path) {
+ try {
+
+ /* Create parent directories if necessary. */
+ File file = new File(path);
+ if (file.getParentFile() != null) {
+ file.getParentFile().mkdirs();
+ }
+
+ /* Start with writing all contained database dates to the file
+ * header. */
+ BufferedWriter bw = new BufferedWriter(new FileWriter(file));
+ for (int dbDate : this.databaseDates) {
+ bw.write("!" + convertDateNumberToString(dbDate) + "\n");
+ }
+
+ /* Next write all database ranges in the same order as they are
+ * currently contained in memory. The only information we can drop
+ * is the last known database index of each range, because we assume
+ * the tree is already in repaired state. */
+ for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) {
+ bw.write(String.format("%s,%s,%s,%s,%s%n",
+ convertAddressNumberToString(e.getKey() >> 16),
+ convertAddressNumberToString(e.getValue().endAddress),
+ e.getValue().countryCode,
+ convertDateNumberToString(
+ (int) (e.getKey() & ((1L << 16) - 1L))),
+ convertDateNumberToString(e.getValue().lastDbDate)));
+ }
+ bw.close();
+ } catch (IOException e) {
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Load previously saved combined databases from disk. This code is not
+ * at all robust against external changes of the combined database file.
+ */
+ public boolean loadCombinedDatabases(String path) {
+ try {
+ File file = new File(path);
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ String line;
+ int maxDbIndex = -1;
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("!")) {
+
+ /* First read file header containing database dates. */
+ int dbDate = convertDateStringToNumber(line.substring(1));
+ this.databaseDates.add(dbDate);
+ this.databaseIndexes.add(dbDate);
+ maxDbIndex = this.databaseIndexes.size() - 1;
+ } else {
+
+ /* Next read all ranges. Set last known database index for each
+ * range to the last database we read from the header, because
+ * the tree will immediately be in repaired state. */
+ String[] parts = line.split(",");
+ long startAddress = convertAddressStringToNumber(parts[0]);
+ long endAddress = convertAddressStringToNumber(parts[1]);
+ String countryCode = parts[2];
+ int firstDbDate = convertDateStringToNumber(parts[3]);
+ int lastDbDate = convertDateStringToNumber(parts[4]);
+ this.ranges.put((startAddress << 16) + firstDbDate,
+ new TreeElement(endAddress, lastDbDate, maxDbIndex,
+ countryCode));
+ }
+ }
+ br.close();
+ } catch (IOException e) {
+ return false;
+ }
+ return true;
+ }
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java b/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java
new file mode 100644
index 0000000..3b273bc
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java
@@ -0,0 +1,146 @@
+package org.torproject.task6471;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.Stack;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+public class DatabasePerformanceExample {
+ public static void main(String[] args) {
+
+ System.out.print("Generating test cases... ");
+ long startMillis = System.currentTimeMillis();
+ List<Long> tests = new ArrayList<Long>();
+ SortedMap<Long, String> results = new TreeMap<Long, String>();
+ Random rnd = new Random(1L);
+ int startDate = DatabaseImpl.convertDateStringToNumber("20071001");
+ int endDate = DatabaseImpl.convertDateStringToNumber("20120930");
+ /* Skipping Dec 1--3, 2009, because the first available database from
+ * December 2009 was published on the 4th, and generating test cases
+ * was just too confusing when taking that into account. */
+ List<Integer> skipDates = new ArrayList<Integer>();
+ skipDates.add(DatabaseImpl.convertDateStringToNumber("20091201"));
+ skipDates.add(DatabaseImpl.convertDateStringToNumber("20091202"));
+ skipDates.add(DatabaseImpl.convertDateStringToNumber("20091203"));
+ for (int i = 0; i < 100000; i++) {
+ long testAddress = rnd.nextLong() & ((1L << 32) - 1L);
+ int testDate = startDate + rnd.nextInt(endDate - startDate);
+ if (skipDates.contains(testDate)) {
+ i--;
+ } else {
+ tests.add((testAddress << 16) + testDate);
+ }
+ }
+ Stack<File> stackedFiles = new Stack<File>();
+ stackedFiles.add(new File("../data"));
+ SortedSet<File> files = new TreeSet<File>();
+ while (!stackedFiles.isEmpty()) {
+ File file = stackedFiles.pop();
+ if (file.isDirectory()) {
+ stackedFiles.addAll(Arrays.asList(file.listFiles()));
+ } else if (!file.getName().endsWith(".md5") &&
+ !file.getName().endsWith(".md5.gz") &&
+ !file.getName().endsWith(".asc") &&
+ !file.getName().endsWith(".asc.gz")) {
+ files.add(file);
+ }
+ }
+ for (File file : files) {
+ String dbMonth = file.getName().substring(
+ file.getName().length() - 8);
+ dbMonth = dbMonth.substring(0, 6);
+ Database temp = new DatabaseImpl();
+ temp.importRegionalRegistryStatsFileOrDirectory(
+ file.getAbsolutePath());
+ for (long test : tests) {
+ int testDate = (int) (test & ((1 << 16) - 1));
+ String testMonth = DatabaseImpl.convertDateNumberToString(
+ testDate).substring(0, 6);
+ if (testMonth.equals(dbMonth)) {
+ String testAddressString = DatabaseImpl.
+ convertAddressNumberToString(test >> 16);
+ String testDateString = DatabaseImpl.convertDateNumberToString(
+ testDate);
+ String countryCode = temp.lookupAddress(testAddressString,
+ testDateString);
+ if (countryCode != null) {
+ results.put(test, countryCode);
+ }
+ }
+ }
+ }
+ long endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis.");
+
+ System.out.print("Importing files... ");
+ startMillis = endMillis;
+ Database database = new DatabaseImpl();
+ database.importRegionalRegistryStatsFileOrDirectory("../data");
+ endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis.");
+
+ System.out.print("Making test requests... ");
+ startMillis = endMillis;
+ int failures = 0;
+ for (long test : tests) {
+ String testAddress = DatabaseImpl.convertAddressNumberToString(
+ test >> 16);
+ String testDate = DatabaseImpl.convertDateNumberToString(
+ (int) (test & ((1 << 16) - 1)));
+ String expected = results.get(test);
+ String result = database.lookupAddress(testAddress, testDate);
+ if ((expected == null && result != null) ||
+ (expected != null && !expected.equals(result))) {
+ //System.out.println("Expected " + expected + " for "
+ // + testAddress + " " + testDate + ", but got " + result);
+ failures++;
+ }
+ }
+ endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis, " + failures
+ + " out of " + tests.size() + " tests failed.");
+
+ System.out.println(database);
+
+ System.out.print("Saving combined databases to disk... ");
+ startMillis = endMillis;
+ database.saveCombinedDatabases("geoip-2007-10-2012-09.csv");
+ endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis.");
+ startMillis = endMillis;
+
+ System.out.print("Loading combined databases from disk... ");
+ startMillis = endMillis;
+ database = new DatabaseImpl();
+ database.loadCombinedDatabases("geoip-2007-10-2012-09.csv");
+ endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis.");
+
+ System.out.print("Making a second round of test requests... ");
+ startMillis = endMillis;
+ failures = 0;
+ for (long test : tests) {
+ String testAddress = DatabaseImpl.convertAddressNumberToString(
+ test >> 16);
+ String testDate = DatabaseImpl.convertDateNumberToString(
+ (int) (test & ((1 << 16) - 1)));
+ String expected = results.get(test);
+ String result = database.lookupAddress(testAddress, testDate);
+ if ((expected == null && result != null) ||
+ (expected != null && !expected.equals(result))) {
+ //System.out.println("Expected " + expected + " for "
+ // + testAddress + " " + testDate + ", but got " + result);
+ failures++;
+ }
+ }
+ endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis, " + failures
+ + " out of " + tests.size() + " tests failed.");
+ }
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
new file mode 100644
index 0000000..8f90b3d
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
@@ -0,0 +1,141 @@
+/* Copyright 2012 The Tor Project */
+package org.torproject.task6471;
+
+import static org.junit.Assert.assertEquals;
+import org.junit.Test;
+
+/**
+ * Test the multi-GeoIP database implementation.
+ */
+public class DatabaseTest {
+
+ @Test()
+ public void testSingleIpRangeSingleDatebase() {
+ DatabaseImpl database = new DatabaseImpl();
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.repairIndex();
+ assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+ assertEquals(null, database.lookupAddress("2.255.255.255",
+ "19920901"));
+ assertEquals(null, database.lookupAddress("2.255.255.255",
+ "20020901"));
+ assertEquals(null, database.lookupAddress("2.255.255.255",
+ "20120901"));
+ assertEquals(null, database.lookupAddress("2.255.255.255",
+ "20220901"));
+ assertEquals("us", database.lookupAddress("3.0.0.0", "19920901"));
+ assertEquals("us", database.lookupAddress("3.0.0.0", "20020901"));
+ assertEquals("us", database.lookupAddress("3.0.0.0", "20120901"));
+ assertEquals("us", database.lookupAddress("3.0.0.0", "20220901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "19920901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20020901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20220901"));
+ assertEquals("us", database.lookupAddress("3.255.255.255",
+ "19920901"));
+ assertEquals("us", database.lookupAddress("3.255.255.255",
+ "20020901"));
+ assertEquals("us", database.lookupAddress("3.255.255.255",
+ "20120901"));
+ assertEquals("us", database.lookupAddress("3.255.255.255",
+ "20220901"));
+ assertEquals(null, database.lookupAddress("4.0.0.0", "19920901"));
+ assertEquals(null, database.lookupAddress("4.0.0.0", "20020901"));
+ assertEquals(null, database.lookupAddress("4.0.0.0", "20120901"));
+ assertEquals(null, database.lookupAddress("4.0.0.0", "20220901"));
+ }
+
+ @Test()
+ public void testTwoAdjacentIpRangesSingleDatabase() {
+ DatabaseImpl database = new DatabaseImpl();
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRange("20120901", "ca", "4.0.0.0", 16777216);
+ database.repairIndex();
+ assertEquals(2, ((DatabaseImpl) database).getNumberOfElements());
+ assertEquals(null, database.lookupAddress("2.255.255.255",
+ "20120901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901"));
+ assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901"));
+ assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901"));
+ assertEquals(null, database.lookupAddress("5.0.0.0", "20120901"));
+ }
+
+ @Test()
+ public void testTwoNonAdjacentIpDateRangesSingleDatabase() {
+ DatabaseImpl database = new DatabaseImpl();
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRange("20120901", "ca", "6.0.0.0", 16777216);
+ database.repairIndex();
+ assertEquals(2, ((DatabaseImpl) database).getNumberOfElements());
+ assertEquals(null, database.lookupAddress("2.255.255.255", "20120901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals(null, database.lookupAddress("4.255.255.255", "20120901"));
+ assertEquals("ca", database.lookupAddress("6.127.0.0", "20120901"));
+ assertEquals(null, database.lookupAddress("7.0.0.0", "20120901"));
+ }
+
+ @Test()
+ public void testDuplicateImport() {
+ DatabaseImpl database = new DatabaseImpl();
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.repairIndex();
+ assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+ assertEquals(null, database.lookupAddress("2.255.255.255", "20120901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals(null, database.lookupAddress("4.0.0.0", "20120901"));
+ }
+
+ @Test()
+ public void testDuplicateImportDifferentCountryCode() {
+ DatabaseImpl database = new DatabaseImpl();
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRange("20120901", "ca", "3.0.0.0", 16777216);
+ database.repairIndex();
+ assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ }
+
+ @Test()
+ public void testLeaveIpChangeUnchanged() {
+ DatabaseImpl database = new DatabaseImpl();
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRange("20121001", "us", "3.0.0.0", 16777216);
+ database.repairIndex();
+ assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120801"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20121001"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20121101"));
+ }
+
+ @Test()
+ public void testLeaveIpChangeUnchangedReverseOrder() {
+ DatabaseImpl database = new DatabaseImpl();
+ database.addRange("20121001", "us", "3.0.0.0", 16777216);
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.repairIndex();
+ assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120801"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20121001"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20121101"));
+ }
+
+ @Test()
+ public void testMissingIpRange() {
+ DatabaseImpl database = new DatabaseImpl();
+ database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRange("20121101", "us", "3.0.0.0", 16777216);
+ database.addRange("20121001", "us", "6.0.0.0", 16777216);
+ database.repairIndex();
+ assertEquals(3, ((DatabaseImpl) database).getNumberOfElements());
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120801"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+ assertEquals(null, database.lookupAddress("3.127.0.0", "20121001"));
+ assertEquals("us", database.lookupAddress("3.127.0.0", "20121101"));
+ }
+}
More information about the tor-commits
mailing list