[tor-commits] [metrics-tasks/master] Revise calculation script following code review (thanks atagar) (#7241).
karsten at torproject.org
Mon Feb 4 09:00:43 UTC 2013
commit 1625bbe6cb482d8ec599362fefdd99cf06a1c66c
Author: peer <peer at lavabit.com>
Date: Tue Jan 22 11:53:57 2013 +0000
Revise calculation script following code review (thanks atagar) (#7241).
---
task-7241/first_pass.py | 138 +++++++++++++++++++++-------------------------
1 files changed, 63 insertions(+), 75 deletions(-)
diff --git a/task-7241/first_pass.py b/task-7241/first_pass.py
index b453b84..457a4f7 100644
--- a/task-7241/first_pass.py
+++ b/task-7241/first_pass.py
@@ -1,106 +1,94 @@
-import sys
-from datetime import datetime, timedelta
+# calculate frac_relays, frac_cw to compare consensus documents over time
-from stem.descriptor import parse_file
+# let Y be the base consensus document and X be the consensus from some hours earlier
+# frac_relays = count(intersection(Y, X)) / count(Y)
+# frac_cw = sum(cw(Y) over intersection(Y,X)) / sum(cw(Y))
-# http://stackoverflow.com/questions/82831/how-do-i-check-if-a-file-exists-using-python
-def file_check(file_path):
- try:
- with open(file_path) as f:
- return True
- except IOError:
- return False
+import os
+from datetime import datetime, timedelta
+from stem.descriptor import parse_file
+# generate expected consensus filepath from time
def filepath_from_time(cur_datetime):
- consensus_path = 'consensuses-'
- consensus_path += cur_datetime.strftime('%Y-%m')
- consensus_path += '/'
- consensus_path += cur_datetime.strftime('%d')
- consensus_path += '/'
- consensus_path += cur_datetime.strftime('%Y-%m-%d-%H-%M-%S')
- consensus_path += '-consensus'
-
- return consensus_path
+ return os.path.join(
+ 'consensuses-%s' % cur_datetime.strftime('%Y-%m'),
+ cur_datetime.strftime('%d'),
+ '%s-consensus' % cur_datetime.strftime('%Y-%m-%d-%H-%M-%S'),
+ )
-def filename_from_time(cur_datetime):
- consensus_filename = cur_datetime.strftime('%Y-%m-%d-%H-%M-%S')
- consensus_filename += '-consensus'
-
- return consensus_filename
+# router bw storage by fingerprint
+router_data = {}
+# unit time interval
time_interval = timedelta(0, 60*60) # one hour
+# interval multipliers for analysis: 1 hour to 7 days
+time_interval_list = [1,2,3,4,5,6,12,24,36,48,72,96,120,144,168] # hours
+
# base consensuses for examination
initial_time_info_bound = datetime(2012, 1, 1) # inclusive
final_time_info_bound = datetime(2013, 1, 1) # exclusive
-router_data = {}
-
# data range for consensuses
initial_time_data_bound = datetime(2011, 12, 1) # inclusive
final_time_data_bound = datetime(2013, 1, 1) # exclusive
# load information
-cur_datetime = initial_time_data_bound - time_interval
-while cur_datetime < final_time_data_bound - time_interval:
- cur_datetime += time_interval
-
+cur_datetime = initial_time_data_bound
+while cur_datetime < final_time_data_bound:
cur_filepath = filepath_from_time(cur_datetime)
- cur_filename = filename_from_time(cur_datetime)
-
- if file_check(cur_filepath) == True:
- routers = {}
+ cur_filename = os.path.basename(cur_filepath)
+ try:
with open(cur_filepath) as consensus_file:
- for router in parse_file(consensus_file):
- routers[router.fingerprint] = router.bandwidth
-
- router_data[cur_filename] = routers
-
-# interval multipliers
-time_interval_list = [1,2,3,4,5,6,12,24,36,48,72,96,120,144,168] # hours
+ router_data[cur_filename] = dict([(r.fingerprint, r.bandwidth)
+ for r in parse_file(consensus_file)])
+ except IOError:
+ pass # consensus file for this hour does not exist; skip it and continue
-# iterate over base consensuses
-cur_datetime = initial_time_info_bound - time_interval
-while cur_datetime < final_time_info_bound - time_interval:
+ # next file to read
cur_datetime += time_interval
+# iterate over base consensuses for frac_relays, frac_cw
+cur_datetime = initial_time_info_bound
+while cur_datetime < final_time_info_bound:
cur_filepath = filepath_from_time(cur_datetime) # current
- cur_filename = filename_from_time(cur_datetime) # current
+ cur_filename = os.path.basename(cur_filepath) # current
- if file_check(cur_filepath) == True:
+ # look up the base consensus data, if it was loaded
+ if cur_filename in router_data:
base_routers = router_data[cur_filename]
- base_router_count = 0
- base_router_bandwidth = 0
- for fingerprint in router_data[cur_filename].keys():
- base_router_count += 1
- base_router_bandwidth += router_data[cur_filename][fingerprint]
-
- for comparison_time_interval_multiplier in time_interval_list:
- comparison_time_interval = timedelta(0, comparison_time_interval_multiplier*60*60)
- comparison_datetime = cur_datetime - comparison_time_interval
-
- comparison_filepath = filepath_from_time(comparison_datetime) # comparison
- comparison_filename = filename_from_time(comparison_datetime) # comparison
-
- if file_check(comparison_filepath) == True:
- comparison_router_count = 0
- comparison_router_bandwidth = 0
- comparison_router_overlap_bandwidth = 0
- base_router_overlap_bandwidth = 0
- comparison_router_overlap_count = 0
-
- for fingerprint in router_data[comparison_filename].keys():
- comparison_router_count += 1
- comparison_router_bandwidth += router_data[comparison_filename][fingerprint]
+ base_router_count = len(router_data[cur_filename])
+ base_router_bw = sum(router_data[cur_filename].values())
+
+ # for each analysis interval, locate the comparison consensus
+ for time_interval_multiplier in time_interval_list:
+ comp_time_interval = time_interval_multiplier*time_interval
+ comp_datetime = cur_datetime - comp_time_interval
+
+ comp_filepath = filepath_from_time(comp_datetime) # comp
+ comp_filename = os.path.basename(comp_filepath) # comp
+ # look up the comparison consensus data, if it was loaded
+ if comp_filename in router_data:
+ router_overlap_count = 0
+ base_router_overlap_bw = 0
+
+ # determine intersection(Y,X) and sum cw over intersection(Y,X)
+ for fingerprint in router_data[comp_filename]:
if fingerprint in base_routers:
- base_router_overlap_bandwidth += base_routers[fingerprint]
- comparison_router_overlap_count += 1
- comparison_router_overlap_bandwidth += router_data[comparison_filename][fingerprint]
+ router_overlap_count += 1
+ base_router_overlap_bw += base_routers[fingerprint]
+
+ # determine ratios
+ frac_relays = float(router_overlap_count)/float(base_router_count)
+ frac_cw = float(base_router_overlap_bw)/float(base_router_bw)
- frac_relays = float(comparison_router_overlap_count)/float(base_router_count)
- frac_cw = float(base_router_overlap_bandwidth)/float(base_router_bandwidth)
+ # output
+ print '%s,%d,%f,%f,%d,%d,%s' % (cur_filename, time_interval_multiplier,
+ frac_relays, frac_cw, cur_datetime.month, cur_datetime.day,
+ cur_datetime.strftime('%w'))
- print '%s,%d,%f,%f,%d,%d,%s' % (cur_filename, comparison_time_interval_multiplier, frac_relays, frac_cw, cur_datetime.month, cur_datetime.day, cur_datetime.strftime('%w'))
+ # next base consensus
+ cur_datetime += time_interval
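For readers skimming the diff, here is a minimal, self-contained sketch (not part of the commit) of the frac_relays / frac_cw definitions from the script's header comments, applied to two hypothetical consensus snapshots; the fingerprints and bandwidth values below are made up for illustration only.

# Y is the base consensus, X an earlier one; both map fingerprint -> bandwidth.
Y = {'AAAA': 100, 'BBBB': 200, 'CCCC': 700}
X = {'AAAA': 90, 'CCCC': 710, 'DDDD': 50}

# relays present in both Y and X
overlap = [fp for fp in Y if fp in X]

# frac_relays = count(intersection(Y, X)) / count(Y)
frac_relays = float(len(overlap)) / len(Y)

# frac_cw = sum(cw(Y) over intersection(Y, X)) / sum(cw(Y))
frac_cw = float(sum(Y[fp] for fp in overlap)) / sum(Y.values())

print '%f,%f' % (frac_relays, frac_cw)  # 0.666667,0.800000

The script itself prints one CSV line per (base consensus, interval) pair with the columns: consensus filename, interval in hours, frac_relays, frac_cw, month, day, and weekday (strftime('%w')).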