[tor-commits] [ooni-probe/master] Implement collector for classifying website with domclass.
art at torproject.org
art at torproject.org
Sun Oct 7 17:06:23 UTC 2012
commit 3d1afe284b3437ee460841828e7a7175f93564e3
Author: Arturo Filastò <arturo at filasto.net>
Date: Sun Oct 7 17:05:33 2012 +0000
Implement collector for classifying website with domclass.
Next step: run this on all the sites we want to test.
---
nettests/experimental/domclass_collector.py | 33 +++++++++++++++++++++++++++
ooni/kit/__init__.py | 1 +
ooni/kit/domclass.py | 30 ++++++++++++++++--------
3 files changed, 54 insertions(+), 10 deletions(-)
diff --git a/nettests/experimental/domclass_collector.py b/nettests/experimental/domclass_collector.py
new file mode 100644
index 0000000..9b2c8d8
--- /dev/null
+++ b/nettests/experimental/domclass_collector.py
@@ -0,0 +1,33 @@
+# -*- encoding: utf-8 -*-
+#
+# The purpose of this collector is to compute the eigenvalues of the DOM of
+# each site in the input list of sites.
+#
+#
+# :authors: Arturo Filastò
+# :licence: see LICENSE
+
+from twisted.internet import threads, defer
+
+from ooni.kit import domclass
+from ooni.templates import httpt
+
+class DOMClassCollector(httpt.HTTPTest):
+ name = "DOM class collector"
+ author = "Arturo Filastò"
+ version = 0.1
+
+ inputs = ['http://news.google.com/', 'http://wikileaks.org/']
+ #inputFile = ['f', 'file', None, 'The list of urls to build a domclass for']
+
+ def test_collect(self):
+ if self.input:
+ url = self.input
+ return self.doRequest(url)
+ else:
+ raise Exception("No input specified")
+
+ def processResponseBody(self, body):
+ eigenvalues = domclass.compute_eigenvalues_from_DOM(content=body)
+ self.report['eigenvalues'] = eigenvalues
+
diff --git a/ooni/kit/__init__.py b/ooni/kit/__init__.py
new file mode 100644
index 0000000..55374c9
--- /dev/null
+++ b/ooni/kit/__init__.py
@@ -0,0 +1 @@
+__all__ = ['domclass']
diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py
index 02c26dc..1cf33a0 100644
--- a/ooni/kit/domclass.py
+++ b/ooni/kit/domclass.py
@@ -108,7 +108,7 @@ def compute_eigenvalues(matrix):
"""
return numpy.linalg.eigvals(matrix)
-def readDOM(content=None, filename=None):
+def readDOM(content=None, filename=None, debug=False):
"""
Parses the DOM of the HTML page and returns an array of parent, child
pairs.
@@ -124,20 +124,30 @@ def readDOM(content=None, filename=None):
content = ''.join(f.readlines())
f.close()
- start = time.time()
- print "Running BeautifulSoup on content"
+ if debug:
+ start = time.time()
+ print "Running BeautifulSoup on content"
dom = BeautifulSoup(content)
- print "done in %s" % (time.time() - start)
+ if debug:
+ print "done in %s" % (time.time() - start)
- start = time.time()
- print "Creating couples matrix"
+ if debug:
+ start = time.time()
+ print "Creating couples matrix"
couples = []
for x in dom.findAll():
couples.append((str(x.parent.name), str(x.name)))
- print "done in %s" % (time.time() - start)
+ if debug:
+ print "done in %s" % (time.time() - start)
return couples
+def compute_eigenvalues_from_DOM(*arg,**kw):
+ dom = readDOM(*arg, **kw)
+ probability_matrix = compute_probability_matrix(dom)
+ eigenvalues = compute_eigenvalues(probability_matrix)
+ return eigenvalues
+
def compute_correlation(matrix_a, matrix_b):
correlation = numpy.vdot(matrix_a, matrix_b)
correlation /= numpy.linalg.norm(matrix_a)*numpy.linalg.norm(matrix_b)
@@ -192,13 +202,13 @@ def benchmark():
"""
start = time.time()
print "Read file B"
- site_a = readDOM(filename='filea.txt')
+ site_a = readDOM(filename='filea.txt', debug=True)
print "--------"
print "total done in %s" % (time.time() - start)
start = time.time()
print "Read file A"
- site_b = readDOM(filename='fileb.txt')
+ site_b = readDOM(filename='fileb.txt', debug=True)
print "--------"
print "total done in %s" % (time.time() - start)
@@ -233,4 +243,4 @@ def benchmark():
print "Corelation: %s" % correlation
-benchmark()
+#benchmark()
More information about the tor-commits
mailing list