[tor-commits] [metrics-tasks/master] Add simple Python database lookup code (#6471).
karsten at torproject.org
karsten at torproject.org
Tue Nov 6 15:33:22 UTC 2012
commit 5a202017333cb64f5b9ca7d862e7ef9677761221
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Tue Nov 6 10:19:40 2012 -0500
Add simple Python database lookup code (#6471).
---
task-6471/python/.gitignore | 3 +
task-6471/python/pygeodate.py | 140 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 143 insertions(+), 0 deletions(-)
diff --git a/task-6471/python/.gitignore b/task-6471/python/.gitignore
new file mode 100644
index 0000000..4d9663e
--- /dev/null
+++ b/task-6471/python/.gitignore
@@ -0,0 +1,3 @@
+*.csv
+*.csv.bz2
+
diff --git a/task-6471/python/pygeodate.py b/task-6471/python/pygeodate.py
new file mode 100644
index 0000000..6480f20
--- /dev/null
+++ b/task-6471/python/pygeodate.py
@@ -0,0 +1,140 @@
+import bisect
+import socket
+import struct
+import datetime
+
class Range(object):
    """One line of the combined database, parsed into numeric fields.

    Attributes:
        start_address, end_address: range bounds as integers, converted
            with Database.address_ston.
        code: the third field of the line, stored unchanged.
        start_date, end_date: date bounds as day numbers, converted with
            Database.date_ston.
        key: Database.create_key(start_address, start_date).
    """

    def __init__(self, line):
        """Parse a comma-separated line.

        The first five fields are read in this order: start address, end
        address, code, start date, end date.  Additional fields are ignored.

        Raises:
            ValueError: if the line has fewer than five fields.  The
                Database conversion helpers may raise for malformed
                addresses or dates.
        """
        parts = line.split(',')
        if len(parts) < 5:
            raise ValueError(
                "expected at least 5 comma-separated fields, got %d: %r"
                % (len(parts), line))
        # TODO Extend to IPv6.  The cool thing in Python is that ints have
        # variable size, so we can use 48 bit keys for IPv4 (32 bit for the
        # IPv4 address and 16 bit for the database date) and 144 bit keys
        # for IPv6.
        self.start_address = Database.address_ston(parts[0])
        self.end_address = Database.address_ston(parts[1])
        self.code = parts[2]
        self.start_date = Database.date_ston(parts[3])
        self.end_date = Database.date_ston(parts[4])
        self.key = Database.create_key(self.start_address, self.start_date)

    def __str__(self):
        """Return the range in the same five-field format it was parsed from."""
        return "%s,%s,%s,%s,%s" % (
            Database.address_ntos(self.start_address),
            Database.address_ntos(self.end_address),
            self.code,
            Database.date_ntos(self.start_date),
            Database.date_ntos(self.end_date))
+
class Database(object):
    """Sorted in-memory table for looking up a code by IPv4 address and date.

    Ranges are indexed by an integer key that packs the range's start
    address into the high bits and its start date (as a day number) into
    the low 16 bits; see create_key.  Lookups use bisect on the sorted key
    list.

    Naming of the static converters: "s" is a string, "n" a number, and
    "k" a combined key (e.g. address_ston = address string to number).
    """

    # Day number 0.  Dates are stored as whole days since this day, which
    # makes the conversion independent of the local timezone.
    _EPOCH = datetime.date(1970, 1, 1)

    def __init__(self):
        # TODO Replace with crit-bit tree if performance becomes a problem
        self.data = []   # (key, range) tuples; sorted by key after loading
        self.dates = []  # database date strings (YYYYMMDD), kept sorted
        self.keys = []   # keys of self.data in the same order, for bisect

    @staticmethod
    def address_ston(address_string):
        """Convert a dotted-quad IPv4 string to an unsigned 32-bit integer.

        Raises:
            ValueError: if the string is not a valid IPv4 address.
        """
        try:
            packed = socket.inet_pton(socket.AF_INET, address_string)
        except socket.error:
            raise ValueError("invalid IPv4 address: %r" % (address_string,))
        return struct.unpack('!I', packed)[0]

    @staticmethod
    def address_ntos(address):
        """Convert an unsigned 32-bit integer to a dotted-quad IPv4 string."""
        return socket.inet_ntop(socket.AF_INET, struct.pack('!I', address))

    @staticmethod
    def date_ston(date_string):
        """Convert a YYYYMMDD string to whole days since 1970-01-01.

        Uses calendar-date arithmetic rather than a local-time timestamp so
        the result is an int and does not depend on timezone or platform.

        Raises:
            ValueError: if the string does not match the YYYYMMDD format.
        """
        parsed = datetime.datetime.strptime(date_string, '%Y%m%d').date()
        return (parsed - Database._EPOCH).days

    @staticmethod
    def date_ntos(date):
        """Convert a day number (days since 1970-01-01) to a YYYYMMDD string."""
        day = Database._EPOCH + datetime.timedelta(days=date)
        return day.strftime('%Y%m%d')

    @staticmethod
    def address_kton(key):
        """Extract the numeric address (everything above the low 16 bits)."""
        return key >> 16

    @staticmethod
    def date_kton(key):
        """Extract the day number (the low 16 bits) from a key."""
        return key & 0xffff

    @staticmethod
    def address_ktos(key):
        """Extract the address from a key and format it as a string."""
        return Database.address_ntos(Database.address_kton(key))

    @staticmethod
    def date_ktos(key):
        """Extract the date from a key and format it as YYYYMMDD."""
        return Database.date_ntos(Database.date_kton(key))

    @staticmethod
    def create_key(address, date):
        """Pack a numeric address and a day number into one sortable key.

        The day number must fit in 16 bits for date_kton/address_kton to
        recover the two parts.
        """
        return (address << 16) + date

    def load_combined_databases(self, path):
        """Load database dates and ranges from the file at path.

        Lines starting with '!' are passed to add_date, all other non-empty
        lines to add_range.  Afterwards the ranges are sorted by key and
        the parallel key list is rebuilt.
        """
        with open(path) as input_file:
            for line in input_file:
                line = line.strip()
                if not line:
                    # Tolerate blank lines, e.g. at the end of the file.
                    continue
                if line.startswith('!'):
                    self.add_date(line)
                else:
                    self.add_range(line)
        # Sort on the key only: range objects define no ordering, so
        # comparing whole tuples would fail when two keys are equal.
        self.data.sort(key=lambda entry: entry[0])
        self.keys = [entry[0] for entry in self.data]

    def add_date(self, line):
        """Record the database date from a '!'-prefixed line, keeping
        self.dates sorted and free of duplicates."""
        date = line.split("!")[1]
        if date not in self.dates:
            bisect.insort(self.dates, date)

    def add_range(self, line):
        """Parse a range line and append it (unsorted) to self.data."""
        r = Range(line)
        self.data.append((r.key, r))

    def lookup_address_and_date(self, address_string, date_string):
        """Return the code assigned to an address on a date, or '??'.

        The requested date is first mapped to the latest known database
        date that is not after it (or to the earliest database date if the
        request precedes all of them).

        Raises:
            ValueError: if address_string is not a valid IPv4 address.
        """
        if not self.data or not self.dates:
            return '??'
        dates_pos = max(0, bisect.bisect(self.dates, date_string) - 1)
        address = Database.address_ston(address_string)
        date = Database.date_ston(self.dates[dates_pos])
        key = Database.create_key(address, date)
        pos = bisect.bisect(self.keys, key + 1)
        # Look up address and date by iterating backwards over possibly
        # matching ranges.
        while pos:
            pos = pos - 1
            r = self.data[pos][1]
            # If either the end address or end date of the range we're
            # looking at is smaller than the values we're looking for, we
            # can be sure not to find it anymore.
            if r.end_address < address or r.end_date < date:
                return '??'
            # If the range starts at a later date, skip it and look at the
            # next one.
            if r.start_date > date:
                continue
            # Both address and date ranges match, so return the assigned
            # code.
            return r.code
        # No ranges (left) to look at.  We don't have what we were looking
        # for.
        return '??'
+
if __name__ == "__main__":
    # Self-check: load the combined database, then replay the test cases
    # (one "address,date,expected code" triple per line) and report the
    # first lookup whose result differs from the expected code.
    db = Database()
    db.load_combined_databases('geoip-2007-10-2012-09.csv')
    with open('test-cases-2007-10-2012-09.csv') as input_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. at the end of the file.
                continue
            parts = line.split(',')
            address_string = parts[0]
            date_string = parts[1]
            expected = parts[2]
            result = db.lookup_address_and_date(address_string, date_string)
            if expected != result:
                # Single parenthesised argument: valid as both a Python 2
                # print statement and a Python 3 print() call.
                print("! %s -> %s" % (line, result))
                break
+
More information about the tor-commits
mailing list