[tor-commits] [metrics-tasks/master] Calculate and graph frac_relays and frac_cw using stem (#7241).
karsten at torproject.org
karsten at torproject.org
Fri Jan 18 10:19:14 UTC 2013
commit 0e4f962092b1e4c5545136485bb338cd839401f2
Author: peer <peer at lavabit.com>
Date: Thu Jan 17 14:41:54 2013 +0000
Calculate and graph frac_relays and frac_cw using stem (#7241).
---
task-7241/README | 22 +++++++++
task-7241/first_pass.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++
task-7241/plot.R | 13 ++++++
3 files changed, 144 insertions(+), 0 deletions(-)
diff --git a/task-7241/README b/task-7241/README
new file mode 100644
index 0000000..80a710f
--- /dev/null
+++ b/task-7241/README
@@ -0,0 +1,22 @@
+Uses stem to parse network consensus documents to determine frac_relays and frac_cw based on fingerprint.
+
+*Definitions*
+
+Let Y be the consensus listed (now) and X the consensus some hours ago (now - hours).
+
+Let intersection(X,Y) be the routers in both X and Y based on fingerprint.
+
+frac_relays is count(intersection(X,Y))/count(Y).
+
+frac_cw is the sum of consensus weights in Y over intersection(X,Y) divided by the sum of consensus weights in Y.
+
+*Notes*
+
+Output is in CSV format and does not include a header. Fields are consensus, hour, frac_relays, frac_cw, month, day, and day of week.
+
+Network consensus documents that cannot be found at their expected path are ignored.
+
+Change initial_time_info_bound, final_time_info_bound, initial_time_data_bound, and final_time_data_bound to explore different time ranges. initial_time_data_bound should be at least 168 hours before initial_time_info_bound.
+
+Four months of hourly data (fingerprint, consensus weights) uses about 1.5GB of space.
+
diff --git a/task-7241/first_pass.py b/task-7241/first_pass.py
new file mode 100644
index 0000000..f3f9707
--- /dev/null
+++ b/task-7241/first_pass.py
@@ -0,0 +1,109 @@
+import sys
+from datetime import datetime, timedelta
+
+from stem.descriptor.networkstatus import NetworkStatusDocumentV3
+
+# http://stackoverflow.com/questions/82831/how-do-i-check-if-a-file-exists-using-python
+def file_check(file_path):
+ try:
+ with open(file_path) as f:
+ return True
+ except IOError:
+ return False
+
+def filepath_from_time(cur_datetime):
+ consensus_path = 'consensuses-'
+ consensus_path += cur_datetime.strftime('%Y-%m')
+ consensus_path += '/'
+ consensus_path += cur_datetime.strftime('%d')
+ consensus_path += '/'
+ consensus_path += cur_datetime.strftime('%Y-%m-%d-%H-%M-%S')
+ consensus_path += '-consensus'
+
+ return consensus_path
+
+def filename_from_time(cur_datetime):
+ consensus_filename = cur_datetime.strftime('%Y-%m-%d-%H-%M-%S')
+ consensus_filename += '-consensus'
+
+ return consensus_filename
+
+time_interval = timedelta(0, 60*60) # one hour
+
+# base consensuses for examination
+initial_time_info_bound = datetime(2012, 1, 1) # inclusive
+final_time_info_bound = datetime(2013, 1, 1) # exclusive
+
+router_data = {}
+
+# data range for consensuses
+initial_time_data_bound = datetime(2011, 12, 1) # inclusive
+final_time_data_bound = datetime(2013, 1, 1) # exclusive
+
+# load information
+cur_datetime = initial_time_data_bound - time_interval
+while cur_datetime < final_time_data_bound - time_interval:
+ cur_datetime += time_interval
+
+ cur_filepath = filepath_from_time(cur_datetime)
+ cur_filename = filename_from_time(cur_datetime)
+
+ if file_check(cur_filepath) == True:
+ consensus_file = open(cur_filepath, 'r')
+ consensus_file.readline()
+ consensus = NetworkStatusDocumentV3(consensus_file.read())
+ consensus_file.close()
+
+ routers = {}
+ for router in consensus.routers:
+ routers[router.fingerprint] = router.bandwidth
+
+ router_data[cur_filename] = routers
+
+# interval multipliers
+time_interval_list = [1,2,3,4,5,6,12,24,36,48,72,96,120,144,168] # hours
+
+# iterate over base consensuses
+cur_datetime = initial_time_info_bound - time_interval
+while cur_datetime < final_time_info_bound - time_interval:
+ cur_datetime += time_interval
+
+ cur_filepath = filepath_from_time(cur_datetime) # current
+ cur_filename = filename_from_time(cur_datetime) # current
+
+ if file_check(cur_filepath) == True:
+ base_routers = router_data[cur_filename]
+ base_router_count = 0
+ base_router_bandwidth = 0
+ for fingerprint in router_data[cur_filename].keys():
+ base_router_count += 1
+ base_router_bandwidth += router_data[cur_filename][fingerprint]
+
+ for comparison_time_interval_multiplier in time_interval_list:
+ comparison_time_interval = timedelta(0, comparison_time_interval_multiplier*60*60)
+ comparison_datetime = cur_datetime - comparison_time_interval
+
+ comparison_filepath = filepath_from_time(comparison_datetime) # comparison
+ comparison_filename = filename_from_time(comparison_datetime) # comparison
+
+ if file_check(comparison_filepath) == True:
+ comparison_router_count = 0
+ comparison_router_bandwidth = 0
+ comparison_router_overlap_bandwidth = 0
+ base_router_overlap_bandwidth = 0
+ comparison_router_overlap_count = 0
+
+ for fingerprint in router_data[comparison_filename].keys():
+ comparison_router_count += 1
+ comparison_router_bandwidth += router_data[comparison_filename][fingerprint]
+
+ if fingerprint in base_routers:
+ base_router_overlap_bandwidth += base_routers[fingerprint]
+ comparison_router_overlap_count += 1
+ comparison_router_overlap_bandwidth += router_data[comparison_filename][fingerprint]
+
+ frac_relays = float(comparison_router_overlap_count)/float(base_router_count)
+ frac_cw = float(base_router_overlap_bandwidth)/float(base_router_bandwidth)
+
+ print '%s,%d,%f,%f,%d,%d,%s' % (cur_filename, comparison_time_interval_multiplier, frac_relays, frac_cw, cur_datetime.month, cur_datetime.day, cur_datetime.strftime('%w'))
+
diff --git a/task-7241/plot.R b/task-7241/plot.R
new file mode 100644
index 0000000..66d39d9
--- /dev/null
+++ b/task-7241/plot.R
@@ -0,0 +1,13 @@
+# Draw box plots of frac_relays and frac_cw per comparison interval from the
+# CSV written by first_pass.py and save them as PNG files.
+library(ggplot2)
+
+# first_pass.py writes no header row, so the columns are named here; with
+# header=TRUE the first data row would be consumed as column names.
+data <- read.csv("s2012.csv", header=FALSE,
+  col.names=c("consensus", "hours", "frac_relays", "frac_cw", "month", "day",
+    "day_of_week"))
+
+# frac_relays
+p <- ggplot(data, aes(factor(hours), frac_relays)) +
+  geom_boxplot() + ylab("frac_relays") + xlab("time interval") + ggtitle(2012)
+# Pass the plot explicitly instead of relying on the last plot displayed.
+ggsave("2012_frac_relays.png", plot=p, height=6, width=6)
+
+# frac_cw
+p <- ggplot(data, aes(factor(hours), frac_cw)) +
+  geom_boxplot() + ylab("frac_cw") + xlab("time interval") + ggtitle(2012)
+ggsave("2012_frac_cw.png", plot=p, height=6, width=6)
+
More information about the tor-commits
mailing list