[or-cvs] [metrics-utils/master 1/3] Filtered out the dates without exit lists.
karsten at torproject.org
karsten at torproject.org
Tue Jan 4 07:44:15 UTC 2011
Author: Kiyoto Tamura <owenestea at gmail.com>
Date: Mon, 27 Dec 2010 22:39:12 -0600
Subject: Filtered out the dates without exit lists.
Commit: 9c692faaa3f002e07df74a1d7888a5a041ddbc3c
---
visitor/visitor.py | 19 +++++++++++--------
1 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/visitor/visitor.py b/visitor/visitor.py
index b18e0ef..504971d 100644
--- a/visitor/visitor.py
+++ b/visitor/visitor.py
@@ -8,7 +8,7 @@
import re
import sys
import os
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, date
import bisect
from time import strptime, mktime, gmtime # datetime.strptime does not exist for version < 2.5
from cStringIO import StringIO
@@ -44,6 +44,8 @@ def get_exitlist(exitlist_filepath):
exit address was recorded.
"""
exitlist = {}
+ first_exit_date = date.today() + timedelta(1)
+ last_exit_date = date(1970, 1, 1) # Unix epoch. Should suffice
for dirpath, _, filenames in os.walk(exitlist_filepath, topdown = False):
for filename in filenames:
fn = os.path.join(dirpath, filename)
@@ -53,6 +55,9 @@ def get_exitlist(exitlist_filepath):
if line.startswith('ExitAddress'):
_, ip, dt = line.split(' ', 2)
yr, mo, d, h, m, s, _, _, _ = strptime(dt.rstrip('\s\n'), '%Y-%m-%d %H:%M:%S')
+ curr_date = date(yr, mo, d)
+ last_exit_date = max(first_exit_date, curr_date)
+ first_exit_date = min(first_exit_date, curr_date)
if not ip in exitlist:
exitlist[ip] = []
timestamp = datetime(yr, mo, d, h, m, s)
@@ -61,7 +66,7 @@ def get_exitlist(exitlist_filepath):
except IOError:
print >> sys.stderr, 'could not open %s. Skipping it.'%fn
- return exitlist
+ return exitlist, first_exit_date, last_exit_date
def apache_time2datetime(time_str, timediff_str):
"""
@@ -107,7 +112,7 @@ def analyze(apache_log_path, exitlist_path, output = sys.stdout):
The main script. It reads the exit list, goes through the Apache access log line by line, and checks whether
each request is a Tor request. TODO: filter out the bots.
"""
- exitlist = get_exitlist(exitlist_path)
+ exitlist, first_exit_date, last_exit_date = get_exitlist(exitlist_path)
tor_stats = {}
tor_ua = TOR_USERAGENTS
@@ -146,12 +151,10 @@ def analyze(apache_log_path, exitlist_path, output = sys.stdout):
for tor_type, _ in tor_ua:
col_list.append(tor_type)
buffer.write(','.join(col_list) + '\n')
- apache_dates = tor_stats.keys()
- apache_dates.sort()
- curr_apache_date = apache_dates[0]
- last_apache_date = apache_dates[-1]
+ apache_dates = sorted(tor_stats.keys())
+ curr_apache_date = first_exit_date
- while curr_apache_date <= last_apache_date:
+ while curr_apache_date <= last_exit_date:
stats = tor_stats.get(curr_apache_date)
if stats == None:
stats = {'date': curr_apache_date}
--
1.7.1
More information about the tor-commits
mailing list