[tor-commits] [metrics-tasks/master] Add George's v2 detector script (#2718).
karsten at torproject.org
Thu Oct 6 06:08:44 UTC 2011
commit 13a1a7911d8a84923cd071160775eccc24d8b47e
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Wed Oct 5 08:37:06 2011 +0200
Add George's v2 detector script (#2718).
---
task-2718/.gitignore | 2 +
task-2718/detectorv2.py | 531 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 533 insertions(+), 0 deletions(-)
diff --git a/task-2718/.gitignore b/task-2718/.gitignore
index 917bb1b..0c965b5 100644
--- a/task-2718/.gitignore
+++ b/task-2718/.gitignore
@@ -1,3 +1,5 @@
*.csv
*.pdf
+*.aux
+*.log
diff --git a/task-2718/detectorv2.py b/task-2718/detectorv2.py
new file mode 100755
index 0000000..5dce711
--- /dev/null
+++ b/task-2718/detectorv2.py
@@ -0,0 +1,531 @@
+## Copyright (c) 2011 George Danezis <gdane at microsoft.com>
+##
+## All rights reserved.
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted (subject to the limitations in the
+## disclaimer below) provided that the following conditions are met:
+##
+## * Redistributions of source code must retain the above copyright
+## notice, this list of conditions and the following disclaimer.
+##
+## * Redistributions in binary form must reproduce the above copyright
+## notice, this list of conditions and the following disclaimer in the
+## documentation and/or other materials provided with the
+## distribution.
+##
+## * Neither the name of <Owner Organization> nor the names of its
+## contributors may be used to endorse or promote products derived
+## from this software without specific prior written permission.
+##
+## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+## HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+## DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+##
+## (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
+
+## This script reads a .csv file of daily Tor user counts and finds
+## anomalies that might be indicative of censorship.
+
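+## The input layout below is an assumption inferred from the parser in
+## torstatstore, with made-up numbers for illustration only: a header row of
+## column names including "date", then one row per day, with "NA" marking
+## missing values.
+##
+##   date,all,??,us,de,ir
+##   2011-01-01,350000,2000,60000,25000,NA
+##   2011-01-02,352000,1900,61000,24800,NA
+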
+# Dep: matplotlib
+from pylab import *
+import matplotlib
+
+# Dep: numpy
+import numpy
+
+# Dep: scipy
+import scipy.stats
+from scipy.stats.distributions import norm
+from scipy.stats.distributions import poisson
+from scipy.stats.distributions import gamma
+
+# Std lib
+from datetime import date
+from datetime import timedelta
+import os.path
+
+import random
+
+days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+
+# read the .csv file
+class torstatstore:
+ def __init__(self, file_name, DAYS):
+ self.DAYS = DAYS
+ f = file(file_name)
+ country_codes = f.readline()
+ country_codes = country_codes.strip().split(",")
+
+ store = {}
+ MAX_INDEX = 0
+ for i, line in enumerate(f):
+ MAX_INDEX += 1
+ line_parsed = line.strip().split(",")
+ for j, (ccode, val) in enumerate(zip(country_codes,line_parsed)):
+ processed_val = None
+ if ccode == "date":
+ try:
+ year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])
+ processed_val = date(year, month, day)
+ except Exception, e:
+ print "Parsing error (ignoring line %s):" % j
+ print "%s" % val,e
+ break
+
+ elif val != "NA":
+ processed_val = int(val)
+ store[(ccode, i)] = processed_val
+
+ # min and max
+ date_min = store[("date", 0)]
+ date_max = store[("date", i)]
+
+ all_dates = []
+ d = date_min
+ dt = timedelta(days=1)
+ while d <= date_max:
+ all_dates += [d]
+ d = d + dt
+
+ # Save for later
+ self.store = store
+ self.all_dates = all_dates
+ self.country_codes = country_codes
+ self.MAX_INDEX = MAX_INDEX
+ self.date_min = date_min
+ self.date_max = date_max
+
+ def get_country_series(self, ccode):
+ assert ccode in self.country_codes
+ series = {}
+ for d in self.all_dates:
+ series[d] = None
+ for i in range(self.MAX_INDEX):
+ series[self.store[("date", i)]] = self.store[(ccode, i)]
+ sx = []
+ for d in self.all_dates:
+ sx += [series[d]]
+ return sx[-self.DAYS:]
+
+ def get_dates(self):
+ return self.all_dates[-self.DAYS:]
+
+ def get_largest(self, number):
+ exclude = set(["all", "??", "date"])
+ l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude]
+ l.sort()
+ l.reverse()
+ return [c for _, c in l][:number]
+
+ def get_largest_locations(self, number):
+ l = self.get_largest(number)
+ res = {}
+ for ccode in l[:number]:
+ res[ccode] = self.get_country_series(ccode)
+ return res
+
+ def get_codes(self):
+ return self.country_codes + []
+
+## Run a particle-filter-based inference algorithm
+## given a data series and per-tap models of traffic change over time
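+## `models` is expected to map each tap (a day offset such as 1 or 7) to a
+## list with one (mean, std) tuple per day of the series, as produced by
+## make_daytoday_normal_models() further below.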
+def particle_filter_detector(ser1, taps, models):
+ # particle: (id, rate, censor, prev_particle, round, tap, ever_censored)
+
+ # Model parameters
+ normal_std_factor = 4
+ censorship_std_factor = 7
+ censorship_prior_model = 0.01
+ change_tap_prior_model = 0.1
+
+ # Sampling parameters
+ change_tap_sample = 0.2
+ censorship_prior_sample = 0.3
+ particle_number = 1000
+ mult_particles = 1
+
+ # Check consistency once
+ for t in models:
+ assert len(ser1) == len(models[t])
+
+ # Clean up the data a bit
+ series2 = []
+ last = None
+ first = None
+ # Process series
+ for s in ser1:
+ if s == None:
+ series2 += [last]
+ else:
+ if first == None:
+ first = s
+ series2 += [s]
+ last = s
+
+ series2 = [s if s != None else first for s in series2]
+ series = series2
+
+ # Data structures to keep logs
+ particles = {}
+ outputlog = [(series[0],series[0])]
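+ # outputlog keeps one (low, high) prediction band per day (the 1st and 99th
+ # percentiles of the non-censored candidate rates), used later for plotting.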
+
+ # Initial particles:
+ particles[0] = []
+ G = gamma(max(1,series[0]), 1)
+ for pi, r in enumerate(G.rvs(particle_number)):
+ particles[0] += [(pi, r, False, None, 0, random.choice(taps), False)]
+
+ # Now run the sampler for all times
+ for pi in range(1, len(series)):
+ assert models != None
+ assert taps != None
+
+ # Per-tap normal distributions, with the model standard deviation scaled for the no-censorship and censorship cases
+ round_models = {}
+ for ti in taps:
+ NoCensor = norm(models[ti][pi][0], (models[ti][pi][1] * normal_std_factor)**2)
+ Censor = norm(models[ti][pi][0], (models[ti][pi][1] * censorship_std_factor)**2)
+ round_models[ti] = (NoCensor, Censor)
+
+ # Store for expanded pool of particles
+ temporary_particles = []
+
+ # Expand the distribution
+ for p in particles[pi-1]:
+ p_old, C_old, j = tracebackp(particles, p, pi-1, p[5] - 1) # taps[0] - 1)
+
+ # Serial number of old particle
+ p_old_num = None
+ if p_old != None:
+ p_old_num = p_old[0]
+
+ # Create a number of candidate particles from each previous particle
+ for _ in range(mult_particles):
+
+ # Sample a new tap for the candidate particle
+ new_tap = p[5]
+ if random.random() < change_tap_sample:
+ new_tap = random.choice(taps)
+
+ # Update this censorship flag
+ C = False
+ if random.random() < censorship_prior_sample:
+ C = True
+
+ # Determine new rate
+ new_p = None
+ if p_old == None:
+ new_p = p[1] # continue as before
+ if C | C_old:
+ while new_p == None or new_p < 0:
+ new_p = p_old[1] * (1 + round_models[new_tap][1].rvs(1)[0]) ## censor models
+ else:
+ while new_p == None or new_p < 0:
+ new_p = p_old[1] * (1 + round_models[new_tap][0].rvs(1)[0]) ## no censor models
+
+ # Build and register new particle
+ newpi = (None, new_p, C, p[0], pi, new_tap, C | C_old)
+ temporary_particles += [newpi]
+
+
+ # Assign a weight to each sampled candidate particle
+ weights = []
+ for px in temporary_particles:
+ wx = 1.0
+
+ # Adjust weight to observation
+ if not series[pi] == None:
+ poisson_prob = poisson.pmf(series[pi], px[1])
+ #print poisson_prob, px
+ wx *= poisson_prob
+
+ # Adjust the probability of censorship
+ if px[2]:
+ wx *= censorship_prior_model / censorship_prior_sample
+ else:
+ wx *= (1 - censorship_prior_model) / (1 - censorship_prior_sample)
+
+ # Adjust the probability of changing the tap
+ if px[5] == particles[pi-1][px[3]][5]:
+ wx *= (1 - change_tap_prior_model) / (((1-change_tap_sample) + change_tap_sample*(1.0 / len(taps))))
+ else:
+ wx *= (change_tap_prior_model) / (1 - (((1-change_tap_sample) + change_tap_sample*(1.0 / len(taps)))))
+
+ weights += [wx]
+
+ weights_sum = sum(weights)
+
+ ## Resample according to weight
+ particles[pi] = []
+ for pid in range(particle_number):
+ px = samplep(weights, weights_sum, temporary_particles)
+ px = (pid, px[1], px[2], px[3], px[4], px[5], px[6])
+ particles[pi] += [px]
+
+ ## Collect some statistics
+
+ ## stats
+ Ci = 0
+ mean = 0
+ for px in particles[pi]:
+ if px[2]:
+ Ci += 1
+ mean += px[1]
+ mean = mean / len(particles[pi])
+
+ # Diversity
+ Div = len(set([pv[3] for pv in particles[pi]]))
+
+ # Range of values
+ range_normal = sorted([pn[1] for pn in temporary_particles if not pn[2]])
+ Base = range_normal[len(range_normal)/2]
+ Mn = range_normal[len(range_normal)*1/100]
+ Mx = range_normal[len(range_normal)*99/100]
+ outputlog += [(Mn, Mx)]
+
+ # How many are using the censorship model at any time?
+ censor_model_stat = len([1 for pn in particles[pi] if pn[6]])* 100 / len(particles[pi])
+
+ # Build histogram of taps
+ tap_hist = {}
+ for px in particles[pi]:
+ tap_hist[px[5]] = tap_hist.get(px[5], 0) + 1
+
+ print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (pi, Ci, mean, series[pi], tap_hist, Base, Mn, Mx, Div, censor_model_stat)
+ # print " [%s - %s]" % (key_series_point*(1+NoCensor.ppf(0.00001)), key_series_point*(1+NoCensor.ppf(0.99999)))
+
+ return particles, outputlog
+
+## Count, for each time step, the particles currently flagging censorship,
+## the particles that used a censorship model now or in their recent history,
+## and the total number of particles.
+def get_events(particles):
+ events = []
+ for ps in sorted(particles):
+ censor_model_stat = len([1 for pn in particles[ps] if pn[6]])
+ events += [(len([1 for p in particles[ps] if p[2]]), censor_model_stat, len(particles[ps]))]
+ return events
+
+## Make pretty graphs of the data and censorship events
+def plotparticles(series, particles, outputlog, labels, xtitle, events):
+ assert len(xtitle) == 3
+ fname, stitle, slegend = xtitle
+
+ font = {'family' : 'Bitstream Vera Sans',
+ 'weight' : 'normal',
+ 'size' : 8}
+ matplotlib.rc('font', **font)
+
+ mmx = max(series)
+ if mmx == None:
+ return # There is no data here!
+ diff = abs(-mmx*0.1 - mmx*1.1)
+
+ ylim( (-mmx*0.1, 1+mmx*1.1) )
+ plot(labels, series, linewidth=1.0, label="Users")
+
+ F = gcf()
+
+ wherefill = []
+ minc, maxc = [], []
+ for mm,mx in outputlog:
+ wherefill += [not (mm == None and mx == None)]
+ assert mm <= mx or (mm == None and mx == None)
+ minc += [mm]
+ maxc += [mx]
+
+ fill_between(labels, minc, maxc, where=wherefill, color="gray", label="Prediction")
+
+ vdown = []
+ vup = []
+ active_region_20 = []
+ active_region_50 = []
+ for i,v in enumerate(series):
+ if minc[i] == None or maxc[i] == None:
+ continue
+
+ mean = (minc[i] + maxc[i]) / 2
+
+ v2 = v
+ if v2 == None:
+ v2 = 0
+
+ if events[i][0] * 100 / events[i][2] > 10:
+ if v2 <= mean:
+ vdown += [(labels[i], v2, events[i][0], events[i][2])]
+ # print vdown[-1]
+ else:
+ vup += [(labels[i], v2, events[i][0], events[i][2])]
+
+ active_region_20 += [events[i][1] * 100 / events[i][2] > 20]
+ active_region_50 += [events[i][1] * 100 / events[i][2] > 50]
+
+ fill_between(labels, -mmx*0.1*ones(len(labels)), (1+mmx*1.1)*ones(len(labels)), where=active_region_20, color="r", alpha=0.15)
+ fill_between(labels, -mmx*0.1*ones(len(labels)), (1+mmx*1.1)*ones(len(labels)), where=active_region_50, color="r", alpha=0.15)
+
+ x = [p[0] for p in vdown]
+ y = [p[1] for p in vdown]
+ s = [20 + p[2]*100 / p[3] for p in vdown]
+ if len(x) > 0:
+ scatter(x,y,s=s, marker='v', c='r')
+ for xi,yi, score, total in vdown:
+ if 100 * score / total > 10:
+ text(xi, yi - diff*5 / 100, "%2d%%" % (100 * float(score) / total), color="r")
+
+ x = [p[0] for p in vup]
+ y = [p[1] for p in vup]
+ s = [20+ p[2]*100 / p[3] for p in vup]
+ if len(x) > 0:
+ scatter(x,y,s=s, marker='^', c='g')
+ for xi,yi, score, total in vup:
+ if 100 * score / total > 10:
+ text(xi, yi+diff*5 / 100, "%2d%%" % (100 * float(score) / total), color="g")
+
+
+ legend(loc=2)
+
+ xlabel('Time (days)')
+ ylabel('Users')
+ title(stitle)
+ grid(True)
+
+
+ F.set_size_inches(10,5)
+ F.savefig(fname, format="png", dpi = (150))
+ close()
+
+## Get a particle from a trace at time current_round - delay
+def tracebackp(particles, start_particle, current_round, delay):
+ if current_round - delay < 0:
+ return None, False, 0
+
+ j = current_round
+ this_particle = start_particle
+ C = False
+ r = None
+ # print "-----"
+ while j >= current_round - delay:
+ # print this_particle
+ C |= this_particle[2] # set the censorship flag
+ j = j-1
+ if j < current_round - delay:
+ break
+ this_particle = particles[j][this_particle[3]]
+ assert j+1 == this_particle[4] == current_round - delay
+ return (this_particle, C, j+1)
+
+# Sample a single item from samples with probability proportional to its weight
+def samplep(weights, total, samples):
+ rx = random.random() * total
+ stotal = 0.0
+ for i,w in enumerate(weights):
+ stotal += w
+ if stotal >= rx:
+ return samples[i]
+
+ assert False
+
+# Estimate a rate for each observation in the series by sampling from a Gamma distribution
+def infer_sample_rate(series):
+ seriesNone = series + []
+ series = series + [] # we need a fresh copy!
+ for i,s in enumerate(series):
+ if seriesNone[i] == None:
+ series[i] = 0
+ if series[i] == 0:
+ series[i] = 0.001
+ rates = list(gamma.rvs(series, 1))
+ for i,r in enumerate(rates):
+ if seriesNone[i] == None or seriesNone[i] == 0:
+ rates[i] = None
+ return rates
+
+# Get (mean, std) for the top-50 series and different day delays (e.g. taps=[1,7])
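+# For each tap d and day i, models[d][i] is the (mean, std) of the relative
+# change over d days (see historic_frac) across the 50 largest countries,
+# with the top and bottom 5% of values trimmed first; (0.0, 0.02) is used as
+# a fallback when too few values are available.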
+def make_daytoday_normal_models(tss, taps):
+ codes = tss.get_largest_locations(50)
+ series = {}
+ L = len(codes.values()[0])
+ for c in codes:
+ series[(c, 0)] = infer_sample_rate(tss.get_country_series(c))
+ assert len(series[(c, 0)]) == L
+ for d in taps:
+ series[(c, d)] = historic_frac(series[(c, 0)], d)
+ assert len(series[(c, d)]) == L
+
+ models = {}
+ for d in taps:
+ models[d] = []
+ for i in range(L):
+ v = []
+ for c in codes:
+ vi = series[(c, d)][i]
+ if not vi == None:
+ v += [vi]
+ if len(v) > 1:
+ v.sort()
+ v = v[len(v)*5/100:len(v)*95/100:]
+ if (numpy.mean(v) > 10) or (numpy.mean(v) < -10):
+ # models[d] += [(0.0, 0.02)]
+ print i, [(numpy.mean(v), numpy.std(v))]
+ models[d] += [(numpy.mean(v), numpy.std(v))]
+ else:
+ models[d] += [(numpy.mean(v), numpy.std(v))]
+ else:
+ models[d] += [(0.0, 0.02)]
+ # print d, i, models[d][-1] # , v[:5]
+
+ return models
+
+# Compute historic fractional changes in traffic to train the models
+def historic_frac(rates, delay):
+ assert delay > 0
+ diff= []
+ for i,r in enumerate(rates):
+ if i - delay < 0 or rates[i-delay] == None or rates[i] == None or rates[i-delay] == 0:
+ diff += [None]
+ else:
+ diff += [(rates[i] - rates[i-delay]) / rates[i-delay]]
+ return diff
+
+def plot_country(tss, models, taps, country_code, GRAPH_DIR):
+ series = tss.get_country_series(country_code)
+ particles, outputlog = particle_filter_detector(series, taps, models) ## Run the inference algorithm
+ events = get_events(particles) ## Extract events from particles -- % censorship over time
+ labels = tss.get_dates()
+ xtitle = (os.path.join(GRAPH_DIR, "%s-censor.png" % country_code), "Tor report for %s" % country_code,"")
+ plotparticles(series, particles, outputlog, labels, xtitle, events)
+
+
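+## Assumed setup, inferred from the constants below: place "direct-users.csv"
+## in the working directory, create the "img2" output directory, and run
+## `python detectorv2.py`; one PNG per selected country is written there.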
+#def main():
+if True:
+ # Change these to customize the script
+ ## (Model parameters are still set inside the particle filter function)
+ CSV_FILE = "direct-users.csv"
+ GRAPH_DIR = "img2"
+ DAYS= 4 * 31
+
+ tss = torstatstore(CSV_FILE, DAYS)
+ gr = tss.get_country_series('gr')
+ rates_gr = infer_sample_rate(gr)
+ # print historic_frac(rates_gr, 1)
+
+ models = make_daytoday_normal_models(tss, [1, 7])
+
+ # for country_code in tss.get_largest(250):
+ #country_code = "kr"
+ for country_code in ["cn", "eg", "ly", "kr", "de", "mm"]:
+ print country_code
+ plot_country(tss, models, [1, 7], country_code, GRAPH_DIR)
+
+#if __name__ == "__main__":
+# main()