[or-cvs] r18444: {torflow} Improve snakeinspector.py to diff JS/HTML content. After ins (torflow/trunk/NetworkScanners)
mikeperry at seul.org
mikeperry at seul.org
Mon Feb 9 12:04:02 UTC 2009
Author: mikeperry
Date: 2009-02-09 07:04:01 -0500 (Mon, 09 Feb 2009)
New Revision: 18444
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/snakeinspector.py
torflow/trunk/NetworkScanners/soat.py
Log:
Improve snakeinspector.py to diff JS/HTML content. After
inspecting some results, try to reduce false positives by
using Tor cookies for second Non-Tor fetch. Also flatten Soup
tag structure to make filtering of changing tags less
blinding.
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-02-09 10:36:42 UTC (rev 18443)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-02-09 12:04:01 UTC (rev 18444)
@@ -9,8 +9,9 @@
import sys
import time
import traceback
+import difflib
sys.path.append("./libs")
-from BeautifulSoup.BeautifulSoup import Tag
+from BeautifulSoup.BeautifulSoup import Tag, SoupStrainer
import sets
@@ -66,7 +67,8 @@
self.reason = reason
self.false_positive=False
self.false_positive_reason="None"
-
+ self.verbose=False
+
def mark_false_positive(self, reason):
pass
@@ -196,14 +198,29 @@
except: pass
def __str__(self):
- # XXX: Re-run the JSDiffer and compare these differences
ret = TestResult.__str__(self)
- if self.content:
- ret += " "+self.content+"\n"
- if self.content_old:
- ret += " "+self.content_old+"\n"
- if self.content_exit:
- ret += " "+self.content_exit+"\n"
+ if self.verbose:
+ if self.content and self.content_old:
+ diff = difflib.unified_diff(open(self.content).read().split("\n"),
+ open(self.content_old).read().split("\n"),
+ "Non-Tor1", "Non-Tor2",
+ lineterm="")
+ for line in diff:
+ ret+=line+"\n"
+ if self.content and self.content_exit:
+ diff = difflib.unified_diff(open(self.content).read().split("\n"),
+ open(self.content_exit).read().split("\n"),
+ "Non-Tor", "Exit",
+ lineterm="")
+ for line in diff:
+ ret+=line+"\n"
+ else:
+ if self.content:
+ ret += " "+self.content+"\n"
+ if self.content_old:
+ ret += " "+self.content_old+"\n"
+ if self.content_exit:
+ ret += " "+self.content_exit+"\n"
return ret
class HtmlTestResult(TestResult):
@@ -232,14 +249,37 @@
except: pass
def __str__(self):
- # XXX: Re-run the SoupDiffer+JSDiffer and compare these differences
ret = TestResult.__str__(self)
- if self.content:
- ret += " "+self.content+"\n"
- if self.content_old:
- ret += " "+self.content_old+"\n"
- if self.content_exit:
- ret += " "+self.content_exit+"\n"
+ if self.verbose:
+ if self.content and self.content_old:
+ content = open(self.content).read().decode('ascii', 'ignore')
+ content_old = open(self.content_old).read().decode('ascii', 'ignore')
+ soup = FullyStrainedSoup(content)
+ old_soup = FullyStrainedSoup(content_old)
+ tags = map(str, soup.findAll())
+ old_tags = map(str, old_soup.findAll())
+ diff = difflib.unified_diff(tags, old_tags, "Non-Tor1", "Non-Tor2",
+ lineterm="")
+ for line in diff:
+ ret+=line+"\n"
+ if self.content and self.content_exit:
+ content = open(self.content).read().decode('ascii', 'ignore')
+ content_exit = open(self.content_exit).read().decode('ascii', 'ignore')
+ soup = FullyStrainedSoup(content)
+ tor_soup = FullyStrainedSoup(content_exit)
+ tags = map(str, soup.findAll())
+ tor_tags = map(str, tor_soup.findAll())
+ diff = difflib.unified_diff(tags, tor_tags, "Non-Tor", "Exit",
+ lineterm="")
+ for line in diff:
+ ret+=line+"\n"
+ else:
+ if self.content:
+ ret += " "+self.content+"\n"
+ if self.content_old:
+ ret += " "+self.content_old+"\n"
+ if self.content_exit:
+ ret += " "+self.content_exit+"\n"
return ret
class SSHTestResult(TestResult):
@@ -402,6 +442,53 @@
pickle.dump(result, result_file)
result_file.close()
+
+# These three bits are needed to fully recursively strain the parsed soup.
+# For some reason, the SoupStrainer does not get applied recursively..
+__first_strainer = SoupStrainer(lambda name, attrs: name in tags_to_check or
+ len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
+
+def __tag_not_worthy(tag):
+ if tag.name in tags_to_check:
+ return False
+ for attr in tag.attrs:
+ if attr[0] in attrs_to_check_map:
+ return False
+ return True
+
+def FullyStrainedSoup(html):
+ """ Remove all tags that are of no interest. Also remove content """
+ soup = TheChosenSoup(html, __first_strainer)
+ to_extract = []
+ for tag in soup.findAll():
+ to_prune = []
+ for attr in tag.attrs:
+ if attr[0] in attrs_to_prune:
+ to_prune.append(attr)
+ for attr in to_prune:
+ tag.attrs.remove(attr)
+ if __tag_not_worthy(tag):
+ to_extract.append(tag)
+ if tag.name not in tags_preserve_inner:
+ for child in tag.childGenerator():
+ if not isinstance(child, Tag) or __tag_not_worthy(child):
+ to_extract.append(child)
+ for tag in to_extract:
+ if isinstance(tag, Tag):
+ parent = tag.findParent()
+ for child in tag.findChildren():
+ parent.append(child)
+ for tag in to_extract:
+ tag.extract()
+ # Also flatten the tag structure
+ flattened_tags = soup.findAll()
+ for tag in flattened_tags:
+ if isinstance(tag, Tag): # Don't extract script/CSS strings.
+ tag.extract()
+ for tag in flattened_tags:
+ soup.append(tag)
+ return soup
+
class SoupDiffer:
""" Diff two soup tag sets, optionally writing diffs to outfile. """
def __init__(self, soup_old, soup_new):
Modified: torflow/trunk/NetworkScanners/snakeinspector.py
===================================================================
--- torflow/trunk/NetworkScanners/snakeinspector.py 2009-02-09 10:36:42 UTC (rev 18443)
+++ torflow/trunk/NetworkScanners/snakeinspector.py 2009-02-09 12:04:01 UTC (rev 18444)
@@ -31,7 +31,8 @@
results = [dh.getResult(argv[1])]
for r in results:
- if r.status == TEST_FAILURE:
+ r.verbose = True
+ if r.status == TEST_FAILURE and r.reason == "FailureExitOnly":
print r
print "\n-----------------------------\n"
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-02-09 10:36:42 UTC (rev 18443)
+++ torflow/trunk/NetworkScanners/soat.py 2009-02-09 12:04:01 UTC (rev 18444)
@@ -578,8 +578,10 @@
# if content doesn't match, update the direct content and use new cookies
# If we have alternate IPs to bind to on this box, use them?
# Sometimes pages have the client IP encoded in them..
+ # Also, use the Tor cookies, since those identifiers are
+ # probably embedded in the Tor page as well.
BindingSocket.bind_to = refetch_ip
- (code_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_cookie_jar, self.headers)
+ (code_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_tor_cookie_jar, self.headers)
BindingSocket.bind_to = None
if not content_new:
@@ -601,9 +603,12 @@
# Need to do set subtraction and only save new cookies..
# or extract/make_cookies
+
self.cookie_jar = orig_cookie_jar
new_cookie_jar = cookielib.MozillaCookieJar()
- for cookie in new_cookies_new: new_cookie_jar.set_cookie(cookie)
+ for cookie in new_cookies_new:
+ new_cookie_jar.set_cookie(cookie)
+ self.cookie_jar.set_cookie(cookie) # Update..
os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
try:
new_cookie_jar.save(content_prefix+'.cookies')
@@ -784,41 +789,7 @@
self.fetch_queue.put_nowait(i)
else:
plog("NOTICE", "Skipping "+i[0]+" target: "+i[1])
-
-
- def _tag_not_worthy(self, tag):
- if tag.name in tags_to_check:
- return False
- for attr in tag.attrs:
- if attr[0] in attrs_to_check_map:
- return False
- return True
- def _recursive_strain(self, soup):
- """ Remove all tags that are of no interest. Also remove content """
- to_extract = []
- for tag in soup.findAll():
- to_prune = []
- for attr in tag.attrs:
- if attr[0] in attrs_to_prune:
- to_prune.append(attr)
- for attr in to_prune:
- tag.attrs.remove(attr)
- if self._tag_not_worthy(tag):
- to_extract.append(tag)
- if tag.name not in tags_preserve_inner:
- for child in tag.childGenerator():
- if not isinstance(child, Tag) or self._tag_not_worthy(child):
- to_extract.append(child)
- for tag in to_extract:
- if isinstance(tag, Tag):
- parent = tag.findParent()
- for child in tag.findChildren():
- parent.append(child)
- for tag in to_extract:
- tag.extract()
- return soup
-
def check_js(self, address):
plog('INFO', 'Conducting a js test with destination ' + address)
ret = self.check_http_nodynamic(address)
@@ -872,15 +843,9 @@
content_prefix = http_content_dir+address_file
failed_prefix = http_failed_dir+address_file
- elements = SoupStrainer(lambda name, attrs: name in tags_to_check or
- len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
+ orig_soup = FullyStrainedSoup(orig_html.decode('ascii', 'ignore'))
+ tor_soup = FullyStrainedSoup(tor_html.decode('ascii', 'ignore'))
- orig_soup = self._recursive_strain(TheChosenSoup(orig_html.decode('ascii',
- 'ignore'), parseOnlyThese=elements))
-
- tor_soup = self._recursive_strain(TheChosenSoup(tor_html.decode('ascii',
- 'ignore'), parseOnlyThese=elements))
-
# Also find recursive urls
recurse_elements = SoupStrainer(lambda name, attrs:
name in tags_to_recurse and
@@ -908,9 +873,8 @@
self.datahandler.saveResult(result)
return TEST_INCONCLUSIVE
+ new_soup = FullyStrainedSoup(content_new)
- new_soup = self._recursive_strain(TheChosenSoup(content_new,
- parseOnlyThese=elements))
# compare the new and old content
# if they match, means the node has been changing the content
if str(orig_soup) == str(new_soup):
More information about the tor-commits
mailing list