[tor-commits] [ooni-probe/master] Do some more thorough benchmarks that identify the issue in BS
art at torproject.org
art at torproject.org
Sun Oct 7 16:44:29 UTC 2012
commit e977bc9e2c03c198d256f3cb1db745024791be63
Author: Arturo Filastò <arturo at filasto.net>
Date: Sun Oct 7 16:44:14 2012 +0000
Do some more thorough benchmarks that identify the issue in BS
---
ooni/kit/benchmarks.txt | 35 +++++++++++++++++++
ooni/kit/domclass.py | 87 ++++++++++++++++++++++++++++++++--------------
2 files changed, 95 insertions(+), 27 deletions(-)
diff --git a/ooni/kit/benchmarks.txt b/ooni/kit/benchmarks.txt
new file mode 100644
index 0000000..c2070e5
--- /dev/null
+++ b/ooni/kit/benchmarks.txt
@@ -0,0 +1,35 @@
+Read file B
+done in 0.74356508255
+Read file A
+done in 0.94336104393
+Computing prob matrix
+done in 0.0432229042053
+Computing eigenvalues
+done in 0.00188422203064
+Corelation: 0.999999079331
+
+
+Read file B
+done in 0.946599960327
+Read file A
+done in 0.909075975418
+Computing prob matrix
+done in 0.0541450977325
+Computing eigenvalues
+done in 0.00259518623352
+Computing prob matrix B
+done in 0.060467004776
+Computing eigen B
+done in 0.00102496147156
+Computing correlation
+done in 0.000158071517944
+Corelation: 0.999999079331
+
+this was with:
+683 filea.txt
+678 fileb.txt
+
+diff file* | wc -l
+283
+
+
diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py
index 33c960a..02c26dc 100644
--- a/ooni/kit/domclass.py
+++ b/ooni/kit/domclass.py
@@ -32,6 +32,7 @@
import yaml
import numpy
from bs4 import BeautifulSoup
+import time
# All HTML4 tags
# XXX add link to W3C page where these came from
@@ -123,10 +124,17 @@ def readDOM(content=None, filename=None):
content = ''.join(f.readlines())
f.close()
+ start = time.time()
+ print "Running BeautifulSoup on content"
dom = BeautifulSoup(content)
+ print "done in %s" % (time.time() - start)
+
+ start = time.time()
+ print "Creating couples matrix"
couples = []
for x in dom.findAll():
couples.append((str(x.parent.name), str(x.name)))
+ print "done in %s" % (time.time() - start)
return couples
@@ -138,66 +146,91 @@ def compute_correlation(matrix_a, matrix_b):
def benchmark():
"""
- Running some very basic benchmarks we assets this:
-
- Read file B
- done in 0.74356508255
- Read file A
- done in 0.94336104393
- Computing prob matrix
- done in 0.0432229042053
- Computing eigenvalues
- done in 0.00188422203064
- Corelation: 0.999999079331
+ Running some very basic benchmarks on this input data:
- this was with:
+ Data files:
683 filea.txt
678 fileb.txt
diff file* | wc -l
283
+ We get such results:
+
+ Read file B
+ Running BeautifulSoup on content
+ done in 0.768223047256
+ Creating couples matrix
+ done in 0.023903131485
+ --------
+ total done in 0.796372890472
+ Read file A
+ Running BeautifulSoup on content
+ done in 0.752885818481
+ Creating couples matrix
+ done in 0.0163578987122
+ --------
+ total done in 0.770951986313
+ Computing prob matrix
+ done in 0.0475239753723
+ Computing eigenvalues
+ done in 0.00161099433899
+ Computing prob matrix B
+ done in 0.0408289432526
+ Computing eigen B
+ done in 0.000268936157227
+ Computing correlation
+ done in 0.00016713142395
+ Corelation: 0.999999079331
What this means is that the bottleneck is not in the maths, but is rather
in the computation of the DOM tree matrix.
- XXX We should focus on optimizing the parsing of the HTML and the
- computation of the couple matrix.
+ XXX We should focus on optimizing the parsing of the HTML (this depends on
+ beautiful soup). Perhaps we can find an alternative to it that is
+ sufficient for us.
"""
- import time
start = time.time()
-
print "Read file B"
site_a = readDOM(filename='filea.txt')
- print "done in %s" % (time.time() - start)
- start = time.time()
+ print "--------"
+ print "total done in %s" % (time.time() - start)
+ start = time.time()
print "Read file A"
site_b = readDOM(filename='fileb.txt')
- print "done in %s" % (time.time() - start)
- start = time.time()
-
-
+ print "--------"
+ print "total done in %s" % (time.time() - start)
a = {}
+ b = {}
+
+ start = time.time()
print "Computing prob matrix"
a['matrix'] = compute_probability_matrix(site_a)
-
print "done in %s" % (time.time() - start)
start = time.time()
print "Computing eigenvalues"
a['eigen'] = compute_eigenvalues(a['matrix'])
-
print "done in %s" % (time.time() - start)
start = time.time()
-
- b = {}
+ start = time.time()
+ print "Computing prob matrix B"
b['matrix'] = compute_probability_matrix(site_b)
+ print "done in %s" % (time.time() - start)
+
+ start = time.time()
+ print "Computing eigen B"
b['eigen'] = compute_eigenvalues(b['matrix'])
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+ print "Computing correlation"
correlation = compute_correlation(a['eigen'], b['eigen'])
+ print "done in %s" % (time.time() - start)
+
print "Corelation: %s" % correlation
-#benchmark()
+benchmark()
More information about the tor-commits
mailing list