[tor-commits] [ooni-probe/master] Do some benchmarks on domclass and figure out where optimization should be
art at torproject.org
art at torproject.org
Sun Oct 7 16:32:51 UTC 2012
commit c30049b0d9daf748378f3da717411f4c0b85a23d
Author: Arturo Filastò <arturo at filasto.net>
Date: Sun Oct 7 16:32:00 2012 +0000
Do some benchmarks on domclass and figure out where optimization should be
done.
(We must optimize how the DOM tree is parsed and how we compute the couple
matrix)
---
ooni/kit/domclass.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py
index d50647c..33c960a 100644
--- a/ooni/kit/domclass.py
+++ b/ooni/kit/domclass.py
@@ -136,14 +136,63 @@ def compute_correlation(matrix_a, matrix_b):
correlation = (correlation + 1)/2
return correlation
-def example():
+def benchmark():
+ """
+ Running some very basic benchmarks we observe this:
+
+ Read file B
+ done in 0.74356508255
+ Read file A
+ done in 0.94336104393
+ Computing prob matrix
+ done in 0.0432229042053
+ Computing eigenvalues
+ done in 0.00188422203064
+ Corelation: 0.999999079331
+
+ this was with:
+ 683 filea.txt
+ 678 fileb.txt
+
+ diff file* | wc -l
+ 283
+
+
+ What this means is that the bottleneck is not in the maths, but is rather
+ in the computation of the DOM tree matrix.
+
+ XXX We should focus on optimizing the parsing of the HTML and the
+ computation of the couple matrix.
+ """
+ import time
+ start = time.time()
+
+ print "Read file B"
site_a = readDOM(filename='filea.txt')
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+
+ print "Read file A"
site_b = readDOM(filename='fileb.txt')
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+
+
a = {}
+ print "Computing prob matrix"
a['matrix'] = compute_probability_matrix(site_a)
+
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+
+ print "Computing eigenvalues"
a['eigen'] = compute_eigenvalues(a['matrix'])
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+
+
b = {}
b['matrix'] = compute_probability_matrix(site_b)
b['eigen'] = compute_eigenvalues(b['matrix'])
@@ -151,4 +200,4 @@ def example():
correlation = compute_correlation(a['eigen'], b['eigen'])
print "Corelation: %s" % correlation
-
+#benchmark()
More information about the tor-commits
mailing list