[tor-commits] [ooni-probe/master] Write some documentation for DOMClass

art at torproject.org art at torproject.org
Wed Aug 22 06:14:39 UTC 2012


commit 78fe6dea0b46c4055fbc195cf9329001c062cb17
Author: Arturo Filastò <hellais at torproject.org>
Date:   Tue Aug 21 01:51:58 2012 +0200

    Write some documentation for DOMClass
---
 ooni/plugins/domclass.py |   97 ++++++++++++++++++++++++++++++++++-----------
 ooni/protocols/http.py   |    5 ++
 2 files changed, 78 insertions(+), 24 deletions(-)

diff --git a/ooni/plugins/domclass.py b/ooni/plugins/domclass.py
index 31e2e41..cdcd508 100644
--- a/ooni/plugins/domclass.py
+++ b/ooni/plugins/domclass.py
@@ -1,8 +1,33 @@
-"""
-This is a self genrated test created by scaffolding.py.
-you will need to fill it up with all your necessities.
-Safe hacking :).
-"""
+#
+#
+#    domclass
+#    ********
+#
+#    :copyright: (c) 2012 by Arturo Filastò
+#    :license: see LICENSE for more details.
+#
+#    how this works
+#    --------------
+#
+#    This classifier uses the DOM structure of a website to determine how similar
+#    the two sites are.
+#    The procedure we use is the following:
+#        * First we parse all the DOM tree of the web page and we build a list of
+#          TAG parent child relationships (ex. <html><a><b></b></a><c></c></html> =>
+#          (html, a), (a, b), (html, c)).
+#
+#        * We then use this information to build a matrix (M) where m[i][j] = P(of
+#          transitioning from tag[i] to tag[j]). If tag[i] does not exist, P() = 0.
+#          Note: M is a square matrix that is number_of_tags wide.
+#
+#        * We then calculate the eigenvectors (v_i) and eigenvalues (e) of M.
+#
+#        * The correlation between page A and B is given via this formula:
+#          correlation = dot_product(e_A, e_B), where e_A and e_B are
+#          respectively the eigenvalues for the probability matrix A and the
+#          probability matrix B.
+#
+
 from zope.interface import implements
 from twisted.python import usage
 from twisted.plugin import IPlugin
@@ -18,6 +43,8 @@ class domclassArgs(usage.Options):
                      ['asset', 'a', None, 'URL List'],
                      ['resume', 'r', 0, 'Resume at this index']]
 
+# All HTML4 tags
+# XXX add link to W3C page where these came from
 alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE',
            'BASEFONT', 'BD', 'BIG', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
            'CENTER', 'CITE', 'CODE', 'COL', 'COLGROUP', 'DD', 'DEL', 'DFN', 'DIR', 'DIV',
@@ -29,6 +56,7 @@ alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE',
            'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
            'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL', 'VAR']
 
+# Reduced subset of only the most common tags
 commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
            'CENTER', 'CITE', 'CODE', 'COL', 'DD', 'DIV',
            'DL', 'DT', 'EM', 'FIELDSET', 'FONT', 'FORM', 'FRAME', 'FRAMESET', 'H1', 'H2',
@@ -39,12 +67,18 @@ commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
            'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
            'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL']
 
+# The tags we are interested in using for our analysis
 thetags = ['A', 'DIV', 'FRAME', 'H1', 'H2',
            'H3', 'H4', 'IFRAME ', 'INPUT',
            'LABEL','LI', 'P', 'SCRIPT', 'SPAN',
            'STYLE', 'TR']
 
-def compute_matrix(dataset):
+def compute_probability_matrix(dataset):
+    """
+    Compute the probability matrix based on the input dataset.
+
+    :dataset: an array of pairs representing the parent child relationships.
+    """
     import itertools
     import numpy
     ret = {}
@@ -74,21 +108,38 @@ def compute_matrix(dataset):
             if possibilities != 0:
                 matrix[x][i] = matrix[x][i]/possibilities
 
-    ret['matrix'] = matrix
-    ret['eigen'] = numpy.linalg.eigvals(matrix)
-    return ret
+    return matrix
+
+def compute_eigenvalues(matrix):
+    """
+    Returns the eigenvalues of the supplied square matrix.
+
+    :matrix: must be a square matrix and diagonalizable.
+    """
+    return numpy.linalg.eigvals(matrix)
 
 def readDOM(content=None, filename=None):
+    """
+    Parses the DOM of the HTML page and returns an array of parent, child
+    pairs.
+
+    :content: the content of the HTML page to be read.
+
+    :filename: the filename to be read from for getting the content of the
+               page.
+    """
     from bs4 import BeautifulSoup
+
     if filename:
         f = open(filename)
         content = ''.join(f.readlines())
+        f.close()
 
     dom = BeautifulSoup(content)
     couples = []
     for x in dom.findAll():
         couples.append((str(x.parent.name), str(x.name)))
-    #f.close()
+
     return couples
 
 class domclassTest(HTTPTest):
@@ -100,17 +151,20 @@ class domclassTest(HTTPTest):
     options = domclassArgs
     blocking = False
 
-    tool = True
+    follow_redirects = True
+    #tool = True
 
     def runTool(self):
         import yaml, numpy
         site_a = readDOM(filename=self.local_options['file'])
         site_b = readDOM(filename=self.local_options['fileb'])
-        a = compute_matrix(site_a)
+        a['matrix'] = compute_probability_matrix(site_a)
+        a['eigen'] = compute_eigenvalue(a['matrix'])
+
         self.result['eigenvalues'] = a['eigen']
-        #self.result['matrix'] = str(a['matrix']
-        #self.result['content'] = data[:200]
-        b = compute_matrix(site_b)
+        b['matrix'] = compute_probability_matrix(site_b)
+        b['eigen'] = compute_eigenvalue(b['matrix'])
+
         #print "A: %s" % a
         #print "B: %s" % b
         correlation = numpy.vdot(a['eigen'],b['eigen'])
@@ -122,24 +176,19 @@ class domclassTest(HTTPTest):
         import yaml, numpy
         site_a = readDOM(data)
         #site_b = readDOM(self.local_options['fileb'])
-        a = compute_matrix(site_a)
+        a['matrix'] = compute_probability_matrix(site_a)
+        a['eigen'] = compute_eigenvalue(a['matrix'])
+
 
         if len(data) == 0:
             self.result['eigenvalues'] = None
             self.result['matrix'] = None
         else:
             self.result['eigenvalues'] = a['eigen']
-            #self.result['matrix'] = str(a['matrix'])
+            #self.result['matrix'] = a['matrix']
         #self.result['content'] = data[:200]
         #b = compute_matrix(site_b)
         print "A: %s" % a
         return a['eigen']
-        #print "B: %s" % b
-        #correlation = numpy.vdot(a['eigen'],b['eigen'])
-        #correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen'])
-        #correlation = (correlation + 1)/2
-        #print "Corelation: %s" % correlation
 
-# We need to instantiate it otherwise getPlugins does not detect it
-# XXX Find a way to load plugins without instantiating them.
 domclass = domclassTest(None, None, None)
diff --git a/ooni/protocols/http.py b/ooni/protocols/http.py
index 5254a5c..2b38f28 100644
--- a/ooni/protocols/http.py
+++ b/ooni/protocols/http.py
@@ -40,12 +40,17 @@ class HTTPTest(OONITest):
     and once the request body has been received.
     """
     randomize_ua = True
+    follow_redirects = False
 
     def initialize(self):
         from twisted.web.client import Agent
         import yaml
 
         self.agent = Agent(self.reactor)
+        if self.follow_redirects:
+            from twisted.web.client import RedirectAgent
+            self.agent = RedirectAgent(self.agent)
+
         self.request = {}
         self.response = {}
 



More information about the tor-commits mailing list