[or-cvs] r18340: {torflow} We now diff Javascript in soat. (torflow/trunk/NetworkScanners)

mikeperry at seul.org mikeperry at seul.org
Fri Jan 30 14:21:17 UTC 2009


Author: mikeperry
Date: 2009-01-30 09:21:17 -0500 (Fri, 30 Jan 2009)
New Revision: 18340

Modified:
   torflow/trunk/NetworkScanners/libsoat.py
   torflow/trunk/NetworkScanners/soat.py
   torflow/trunk/NetworkScanners/soatstats.py
Log:

We now diff Javascript in soat.



Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py	2009-01-30 14:17:02 UTC (rev 18339)
+++ torflow/trunk/NetworkScanners/libsoat.py	2009-01-30 14:21:17 UTC (rev 18340)
@@ -8,27 +8,23 @@
 import pickle
 import sys
 import time
-import difflib
+import traceback
 sys.path.append("./libs")
 from BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag
 
+
 import sets
 from sets import Set
 
-#
-# Data storage
-#
+from soat_config import *
+sys.path.append("../")
+from TorCtl.TorUtil import *
 
-# data locations
+sys.path.append("./libs/pypy-svn/")
+import pypy.rlib.parsing.parsing
+import pypy.lang.js.jsparser
 
-data_dir = './data/soat/'
-ssl_certs_dir = data_dir + 'ssl/certs/'
 
-http_data_dir = data_dir + 'http/'
-http_content_dir = data_dir + 'http/content/'
-http_failed_dir = data_dir + 'http/failed/'
-http_inconclusive_dir = data_dir + 'http/inconclusive/'
-
 # constants
 
 TEST_SUCCESS = 0
@@ -291,6 +287,32 @@
     ret.sort()
     return ret
 
+  def changed_tags_with_attrs(self):
+    """ Create a map of changed tags to ALL attributes that tag
+        has ever had (changed or not) """
+    changed_tags = {}
+    for tags in map(BeautifulSoup, self.changed_tags()):
+      for t in tags.findAll():
+        if t.name not in changed_tags:
+          changed_tags[t.name] = sets.Set([])
+        for attr in t.attrs:
+          changed_tags[t.name].add(attr[0])
+    return changed_tags
+
+  def has_more_changed_tags(self, tag_attr_map):
+    """ Returns true if we have additional tags with additional
+        attributes that were not present in tag_attr_map 
+        (returned from changed_tags_with_attrs) """
+    for tags in map(BeautifulSoup, self.changed_tags()):
+      for t in tags.findAll():
+        if t.name not in tag_attr_map:
+          return True
+        else:
+          for attr in t.attrs:
+            if attr[0] not in tag_attr_map[t.name]:
+              return True
+    return False
+
   def _get_attributes(self):
     attrs_old = [(tag.name, tag.attrs) for tag in self.soup_old.findAll()]
     attrs_new = [(tag.name, tag.attrs) for tag in self.soup_new.findAll()]
@@ -311,6 +333,29 @@
     ret.sort()
     return ret
 
+  def changed_attributes_by_tag(self):
+    """ Transform the list of (tag, attribute) pairings for new/changed
+        attributes into a map. This allows us to quickly see
+        if any attributes changed for a specific tag. """
+    changed_attributes = {}
+    for (tag, attr) in self.changed_attributes():
+      if tag not in changed_attributes:
+        changed_attributes[tag] = sets.Set([])
+      changed_attributes[tag].add(attr[0])
+    return changed_attributes 
+
+  def has_more_changed_attrs(self, attrs_by_tag):
+    """ Returns true if we have any tags with additional
+        changed attributes that were not present in attrs_by_tag
+        (returned from changed_attributes_by_tag) """
+    for (tag, attr) in self.changed_attributes():
+      if tag in attrs_by_tag:
+        if attr[0] not in attrs_by_tag[tag]:
+          return True
+      else:
+        return True
+    return False
+
   def changed_content(self):
     """ Return a list of tag contents changed in soup_new """
     tags_old = sets.Set(map(str, 
@@ -321,29 +366,6 @@
     ret.sort()
     return ret
 
-  def diff_tags(self):
-    tags_old = map(str, [tag for tag in self.soup_old.findAll() if isinstance(tag, Tag)])
-    tags_new = map(str, [tag for tag in self.soup_new.findAll() if isinstance(tag, Tag)])
-    tags_old.sort()
-    tags_new.sort()
-    diff = difflib.SequenceMatcher(None, tags_old, tags_new)
-    return diff
-
-  def diff_attributes(self):
-    (attr_old, attr_new) = self._get_attributes()
-    attr_old.sort()
-    attr_new.sort()
-    diff = difflib.SequenceMatcher(None, attr_old, attr_new)
-    return diff
-
-  def diff_content(self):
-    tags_old = sets.Set(map(str, 
-      [tag for tag in self.soup_old.findAll() if not isinstance(tag, Tag)]))
-    tags_new = sets.Set(map(str, 
-      [tag for tag in self.soup_new.findAll() if not isinstance(tag, Tag)]))
-    diff = difflib.SequenceMatcher(None, tags_old, tags_new)
-    return diff
-
   def __str__(self):
     tags = self.changed_tags()
     out = "Tags:\n"+"\n".join(tags)
@@ -359,4 +381,109 @@
     f = open(outfile, "w")
     f.write(str(self))
     f.close()
+
+
+class JSDiffer:
+  def __init__(self, js_string):
+    self.ast_cnts = self.count_ast_elements(js_string)
+
+  def _ast_recursive_worker(ast, ast_cnts):
+    if not ast.symbol in ast_cnts:
+      ast_cnts[ast.symbol] = 1
+    else: ast_cnts[ast.symbol] += 1
+    if isinstance(ast, pypy.rlib.parsing.tree.Nonterminal):
+      for child in ast.children:
+        JSDiffer._ast_recursive_worker(child, ast_cnts)
+  _ast_recursive_worker = Callable(_ast_recursive_worker)
  
+  def count_ast_elements(self, js_string, name="global"):
+    ast_cnts = {}
+    try:
+      ast = pypy.lang.js.jsparser.parse(js_string)
+      JSDiffer._ast_recursive_worker(ast, ast_cnts)
+    except (pypy.rlib.parsing.deterministic.LexerError, UnicodeDecodeError, pypy.rlib.parsing.parsing.ParseError), e:
+      # Store info about the name and type of parse error
+      # so we can match that up too.
+      name+=":"+e.__class__.__name__
+      if "source_pos" in e.__dict__:
+        name+=":"+str(e.source_pos)
+      plog("INFO", "Parse error "+name+" on "+js_string)
+      if not "ParseError:"+name in ast_cnts:
+        ast_cnts["ParseError:"+name] = 1
+      else: ast_cnts["ParseError:"+name] +=1
+    return ast_cnts
+
+  def _difference_pruner(self, other_cnts):
+    for node in self.ast_cnts.iterkeys():
+      if node not in other_cnts:
+        self.ast_cnts[node] = 0
+      elif self.ast_cnts[node] != other_cnts[node]:
+        self.ast_cnts[node] = 0
+    for node in other_cnts.iterkeys():
+      if node not in self.ast_cnts:
+        self.ast_cnts[node] = 0
+
+  def _difference_checker(self, other_cnts):
+    for node in self.ast_cnts.iterkeys():
+      if not self.ast_cnts[node]: continue # pruned difference
+      if node not in other_cnts:
+        return True
+      elif self.ast_cnts[node] != other_cnts[node]:
+        return True
+    for node in other_cnts.iterkeys():
+      if node not in self.ast_cnts:
+        return True
+    return False
+
+  def prune_differences(self, other_string):
+    other_cnts = self.count_ast_elements(other_string)
+    self._difference_pruner(other_cnts)
+
+  def contains_differences(self, other_string):
+    other_cnts = self.count_ast_elements(other_string)
+    return self._difference_checker(other_cnts) 
+
+class JSSoupDiffer(JSDiffer):
+  def _add_cnts(tag_cnts, ast_cnts):
+    ret_cnts = {}
+    for n in tag_cnts.iterkeys():
+      if n in ast_cnts:
+        ret_cnts[n] = tag_cnts[n]+ast_cnts[n]
+      else:
+        ret_cnts[n] = tag_cnts[n]
+    for n in ast_cnts.iterkeys():
+      if n not in tag_cnts:
+        ret_cnts[n] = ast_cnts[n]
+    return ret_cnts
+  _add_cnts = Callable(_add_cnts)
+
+  def count_ast_elements(self, soup, name="Soup"):
+    ast_cnts = {}
+    for tag in soup.findAll():
+      if tag.name == 'script':
+        for child in tag.childGenerator():
+          if isinstance(child, Tag):
+            plog("ERROR", "Script tag with subtag!")
+          else:
+            tag_cnts = JSDiffer.count_ast_elements(self, str(child), tag.name)
+            ast_cnts = JSSoupDiffer._add_cnts(tag_cnts, ast_cnts)
+      for attr in tag.attrs:
+        # hrmm.. %-encoding too? Firefox negs on it..
+        parse = ""
+        if attr[1].replace(" ","")[:11] == "javascript:":
+          split_at = attr[1].find(":")+1
+          parse = str(attr[1][split_at:])
+        elif attr[0] in attrs_with_raw_script_map:
+          parse = str(attr[1])
+        if not parse: continue
+        tag_cnts = JSDiffer.count_ast_elements(self,parse,tag.name+":"+attr[0])
+        ast_cnts = JSSoupDiffer._add_cnts(tag_cnts, ast_cnts)
+    return ast_cnts
+
+  def prune_differences(self, other_soup):
+    other_cnts = self.count_ast_elements(other_soup)
+    self._difference_pruner(other_cnts)
+
+  def contains_differences(self, other_soup):
+    other_cnts = self.count_ast_elements(other_soup)
+    return self._difference_checker(other_cnts) 

Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py	2009-01-30 14:17:02 UTC (rev 18339)
+++ torflow/trunk/NetworkScanners/soat.py	2009-01-30 14:21:17 UTC (rev 18340)
@@ -43,7 +43,6 @@
 import cookielib
 import sha
 import Queue
-import difflib
 
 from libsoat import *
 
@@ -64,162 +63,11 @@
 from SocksiPy import socks
 import Pyssh.pyssh
 
-#
-# config stuff
-#
+from soat_config import *
 
-# these are used when searching for 'random' urls for testing
-wordlist_file = './wordlist.txt';
-# Hrmm.. Too many of these and Google really h8s us..
-scan_filetypes = ['exe','pdf','doc','msi']#,'rpm','dmg','pkg','dpkg']
-
-# Avoid vmware images+isos plz. Nobody could possibly have the patience
-# to download anything much larger than 30MB over Tor anyways ;)
-# XXX: 30MB?? Who the hell am I kidding. For testing this needs to be like 1MB
-max_content_size = 1024*1024 # 30*1024*1024
-
-# Kill fetches if they drop below 1kbyte/sec
-min_rate=1024
-
-
-firefox_headers = {
-  'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
-  'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-  'Accept-Language':"en-us,en;q=0.5",
-  'Accept-Encoding':"gzip,deflate",
-  'Accept-Charset': "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
-  'Keep-Alive':"300",
-  'Connection':"keep-alive"
-}
-
-# http://www.voidspace.org.uk/python/articles/cookielib.shtml
-search_cookie_file="search_cookies.lwp"
 search_cookies=None
 
-yahoo_search_mode = {"host" : "search.yahoo.com", "query":"p", "filetype": "originurlextension:", "inurl":None, "class":"yschttl", "useragent":False}
-google_search_mode = {"host" : "www.google.com", "query":"q", "filetype":"filetype:", "inurl":"inurl:", "class" : "l", "useragent":True}
- 
-# FIXME: This does not affect the ssl search.. no other search engines have
-# a working "inurl:" that allows you to pick the scheme to be https like google...
-default_search_mode = google_search_mode
-
-# ports to test in the consistency test
-
-ports_to_check = [
-  ["pop", ExitPolicyRestriction('255.255.255.255', 110), "pops", ExitPolicyRestriction('255.255.255.255', 995)],
-  ["imap", ExitPolicyRestriction('255.255.255.255', 143), "imaps", ExitPolicyRestriction('255.255.255.255', 993)],
-  ["telnet", ExitPolicyRestriction('255.255.255.255', 23), "ssh", ExitPolicyRestriction('255.255.255.255', 22)],
-  ["smtp", ExitPolicyRestriction('255.255.255.255', 25), "smtps", ExitPolicyRestriction('255.255.255.255', 465)],
-  ["http", ExitPolicyRestriction('255.255.255.255', 80), "https",
-ExitPolicyRestriction('255.255.255.255', 443)],
-  ["email", NodeRestrictionList([
-ExitPolicyRestriction('255.255.255.255',110),
-ExitPolicyRestriction('255.255.255.255',143)
-]),
-"secure email",
-OrNodeRestriction([
-ExitPolicyRestriction('255.255.255.255',995),
-ExitPolicyRestriction('255.255.255.255',993),
-ExitPolicyRestriction('255.255.255.255',465),
-ExitPolicyRestriction('255.255.255.255',587)
-])],
-  ["plaintext", AtLeastNNodeRestriction([
-ExitPolicyRestriction('255.255.255.255',110),
-ExitPolicyRestriction('255.255.255.255',143),
-ExitPolicyRestriction('255.255.255.255',23),
-ExitPolicyRestriction('255.255.255.255',21),
-ExitPolicyRestriction('255.255.255.255',80)
-#ExitPolicyRestriction('255.255.255.255',25),
-], 4),
-"secure",
-OrNodeRestriction([
-ExitPolicyRestriction('255.255.255.255',995),
-ExitPolicyRestriction('255.255.255.255',993),
-ExitPolicyRestriction('255.255.255.255',22),
-ExitPolicyRestriction('255.255.255.255',465),
-ExitPolicyRestriction('255.255.255.255',587),
-ExitPolicyRestriction('255.255.255.255',443)
-])]
-]
-
 #
-# non-public IPv4 address ranges network portions
-# refer to: www.iana.org/assignments/ipv4-address-space, www.iana.org/assignments/multicast-addresses
-# 
-ipv4_nonpublic = [
-  '00000000',     # default route and its network: 0.0.0.0/8
-  '00001010',     # private 10.0.0.0/8
-  '01111111',     # loopback 127.0.0.0/8
-  '1010100111111110', # link-local 169.254.0.0/16
-  '101011000001',   # private 172.16.0.0/12
-  '1100000010101000', # private 192.168.0.0/16
-  '111'         # multicast & experimental 224.0.0.0/3
-]
-
-# Tags and attributes to check in the http test.
-# The general idea is to grab tags with attributes known
-# to either hold script, or cause automatic network actvitity
-# Note: the more we add, the greater the potential for false positives...  
-# We also only care about the ones that work for FF2/FF3. 
-
-# TODO: If we cut down on these tags, we can cut down on false 
-# positives. The ultimate acid test would be to have two different Google 
-# queries come back with the same tag structure after filtering them.
-# Unfortunately, Google munges its javascript, so we have to do
-# some more advanced processing to reach that goal..
-# Also, I'm somewhat torn on dropping 'a' tags..
-tags_to_check = ['a', 'applet', 'area', 'base', 'embed', 'form',
-                 'frame', 'iframe', 'img', 'input', 'link', 'meta', 
-                 'object', 'script', 'style', 'layer', 'ilayer']
-tags_preserve_inner = ['script','style'] 
-
-# Merged from:
-# http://www.w3.org/TR/REC-html40/index/attributes.html
-# http://www.w3.org/TR/REC-html40/index/elements.html  
-# http://web.archive.org/web/20060113072810/www.mozilla.org/docs/dom/domref/dom_event_ref33.html
-# http://scrivna.com/blog/2008/09/18/php-xss-filtering-function/
-# https://svn.typo3.org/TYPO3v4/Core/trunk/typo3/contrib/RemoveXSS/RemoveXSS.php
-# http://www.expertzzz.com/Downloadz/view/3424
-# http://kallahar.com/smallprojects/php_xss_filter_function.php
-# and http://ha.ckers.org/xss.html
-attrs_to_check = ['background', 'cite', 'classid', 'codebase', 'data',
-'longdesc', 'onabort', 'onactivate', 'onafterprint', 'onafterupdate',
-'onattrmodified', 'onbeforeactivate', 'onbeforecopy', 'onbeforecut',
-'onbeforedeactivate', 'onbeforeeditfocus', 'onbeforepaste', 'onbeforeprint',
-'onbeforeunload', 'onbeforeupdate', 'onblur', 'onbounce', 'onbroadcast',
-'oncellchange', 'onchange', 'oncharacterdatamodified', 'onclick', 'onclose',
-'oncommand', 'oncommandupdate', 'oncontextmenu', 'oncontrolselect', 'oncopy',
-'oncut', 'ondataavaible', 'ondataavailable', 'ondatasetchanged',
-'ondatasetcomplete', 'ondblclick', 'ondeactivate', 'ondrag', 'ondragdrop',
-'ondragend', 'ondragenter', 'ondragexit', 'ondraggesture', 'ondragleave',
-'ondragover', 'ondragstart', 'ondrop', 'onerror', 'onerrorupdate',
-'onfilterchange', 'onfilterupdate', 'onfinish', 'onfocus', 'onfocusin',
-'onfocusout', 'onhelp', 'oninput', 'onkeydown', 'onkeypress', 'onkeyup',
-'onlayoutcomplete', 'onload', 'onlosecapture', 'onmousedown', 'onmouseenter',
-'onmouseleave', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup',
-'onmousewheel', 'onmove', 'onmoveend', 'onmoveout', 'onmovestart',
-'onnodeinserted', 'onnodeinsertedintodocument', 'onnoderemoved',
-'onnoderemovedfromdocument', 'onoverflowchanged', 'onpaint', 'onpaste',
-'onpopupHidden', 'onpopupHiding', 'onpopupShowing', 'onpopupShown',
-'onpropertychange', 'onreadystatechange', 'onreset', 'onresize',
-'onresizeend', 'onresizestart', 'onrowenter', 'onrowexit', 'onrowsdelete',
-'onrowsinserted', 'onscroll', 'onselect', 'onselectionchange',
-'onselectstart', 'onstart', 'onstop', 'onsubmit', 'onsubtreemodified',
-'ontext', 'onunderflow', 'onunload', 'overflow', 'profile', 'src', 'style',
-'usemap']
-attrs_to_check_map = {}
-for __a in attrs_to_check: attrs_to_check_map[__a]=1
-attrs_to_prune = ['alt', 'label', 'prompt' 'standby', 'summary', 'title',
-                  'abbr']
-
-# For recursive fetching of urls:
-tags_to_recurse = ['a', 'applet', 'embed', 'frame', 'iframe', #'img',
-                   'link', 'object', 'script', 'layer', 'ilayer'] 
-recurse_html = ['frame', 'iframe', 'layer', 'ilayer']
-attrs_to_recurse = ['background', 'codebase', 'data', 'href',
-                    'pluginurl', 'src']
-
-#
 # constants
 #
 
@@ -317,6 +165,9 @@
       plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets) 
     self.tests_run = 0
     self.nodes_marked = 0
+    # XXX: We really need to register an eventhandler
+    # and register a callback for it when this list 
+    # changes due to dropping either "Running" or "Fast"
     self.nodes = self.mt.get_nodes_for_port(self.port)
     self.node_map = {}
     for n in self.nodes: 
@@ -752,6 +603,7 @@
         for a in t.attrs:
           attr_name = str(a[0])
           attr_tgt = str(a[1])
+          # TODO: Split off javascript
           if attr_name in attrs_to_recurse:
             if str(t.name) in recurse_html:
               plog("NOTICE", "Adding html "+str(t.name)+" target: "+attr_tgt)
@@ -803,6 +655,7 @@
     return soup      
 
   def check_html(self, address):
+    # XXX: Check mimetype to decide what to do..
     ''' check whether a http connection to a given address is molested '''
     plog('INFO', 'Conducting an html test with destination ' + address)
 
@@ -976,59 +829,29 @@
     new_vs_old = SoupDiffer(soup_new, soup)
     new_vs_tor = SoupDiffer(soup_new, psoup)
 
-    # TODO: Consider storing these changing attributes
-    # for more than just this run..
-    # FIXME: Also consider refactoring this into SoupDiffer.
-    # It's kind of a mess..
-    changed_tags = {}
-    changed_attributes = {}
     # I'm an evil man and I'm going to CPU hell..
-    for tags in map(BeautifulSoup, old_vs_new.changed_tags()):
-      for t in tags.findAll():
-        if t.name not in changed_tags:
-          changed_tags[t.name] = sets.Set([])
-        for attr in t.attrs:
-          changed_tags[t.name].add(attr[0])
-    for tags in map(BeautifulSoup, new_vs_old.changed_tags()):
-      for t in tags.findAll():
-        if t.name not in changed_tags:
-          changed_tags[t.name] = sets.Set([])
-        for attr in t.attrs:
-          changed_tags[t.name].add(attr[0])
-    for (tag, attr) in old_vs_new.changed_attributes():
-      if tag not in changed_attributes:
-        changed_attributes[tag] = {}
-      changed_attributes[tag][attr[0]] = 1 
-    for (tag, attr) in new_vs_old.changed_attributes():
-      changed_attributes[attr[0]] = 1 
-      if tag not in changed_attributes:
-        changed_attributes[tag] = {}
-      changed_attributes[tag][attr[0]] = 1 
-    
-    changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
+    changed_tags = old_vs_new.changed_tags_with_attrs()
+    changed_tags.update(new_vs_old.changed_tags_with_attrs())
 
-    false_positive = True 
-    for tags in map(BeautifulSoup, new_vs_tor.changed_tags()):
-      for t in tags.findAll():
-        if t.name not in changed_tags:
-          false_positive = False
-        else:
-           for attr in t.attrs:
-             if attr[0] not in changed_tags[t.name]:
-               false_positive = False
-    for (tag, attr) in new_vs_tor.changed_attributes():
-      if tag in changed_attributes:
-        if attr[0] not in changed_attributes[tag]:
-          false_positive=False
-      else:
-        if not false_positive:
-          plog("ERROR", "False positive contradiction at "+exit_node+" for "+address)
-          false_positive = False
+    changed_attributes = old_vs_new.changed_attributes_by_tag()
+    changed_attributes.update(new_vs_old.changed_attributes_by_tag())
 
-    if new_vs_tor.changed_content() and not changed_content:
+    changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
+ 
+    # Verify all of our changed tags are present here 
+    if new_vs_tor.has_more_changed_tags(changed_tags) or \
+      new_vs_tor.has_more_changed_attrs(changed_attributes) or \
+      new_vs_tor.changed_content() and not changed_content:
       false_positive = False
+    else:
+      false_positive = True
 
     if false_positive:
+      jsdiff = JSSoupDiffer(soup)
+      jsdiff.prune_differences(soup_new)
+      false_positive = not jsdiff.contains_differences(psoup)
+
+    if false_positive:
       plog("NOTICE", "False positive detected for dynamic change at "+address+" via "+exit_node)
       result = HtmlTestResult(exit_node, address, TEST_SUCCESS)
       self.results.append(result)

Modified: torflow/trunk/NetworkScanners/soatstats.py
===================================================================
--- torflow/trunk/NetworkScanners/soatstats.py	2009-01-30 14:17:02 UTC (rev 18339)
+++ torflow/trunk/NetworkScanners/soatstats.py	2009-01-30 14:21:17 UTC (rev 18340)
@@ -18,44 +18,6 @@
 sys.path.append("../")
 from TorCtl.TorUtil import *
 
-sys.path.append("./libs/pypy-svn/")
-import pypy.rlib.parsing.parsing
-import pypy.lang.js.jsparser
-
-attrs_with_raw_script = [
-'onabort', 'onactivate', 'onafterprint', 'onafterupdate',
-'onattrmodified', 'onbeforeactivate', 'onbeforecopy', 'onbeforecut',
-'onbeforedeactivate', 'onbeforeeditfocus', 'onbeforepaste', 'onbeforeprint',
-'onbeforeunload', 'onbeforeupdate', 'onblur', 'onbounce', 'onbroadcast',
-'oncellchange', 'onchange', 'oncharacterdatamodified', 'onclick', 'onclose',
-'oncommand', 'oncommandupdate', 'oncontextmenu', 'oncontrolselect', 'oncopy',
-'oncut', 'ondataavaible', 'ondataavailable', 'ondatasetchanged',
-'ondatasetcomplete', 'ondblclick', 'ondeactivate', 'ondrag', 'ondragdrop',
-'ondragend', 'ondragenter', 'ondragexit', 'ondraggesture', 'ondragleave',
-'ondragover', 'ondragstart', 'ondrop', 'onerror', 'onerrorupdate',
-'onfilterchange', 'onfilterupdate', 'onfinish', 'onfocus', 'onfocusin',
-'onfocusout', 'onhelp', 'oninput', 'onkeydown', 'onkeypress', 'onkeyup',
-'onlayoutcomplete', 'onload', 'onlosecapture', 'onmousedown', 'onmouseenter',
-'onmouseleave', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup',
-'onmousewheel', 'onmove', 'onmoveend', 'onmoveout', 'onmovestart',
-'onnodeinserted', 'onnodeinsertedintodocument', 'onnoderemoved',
-'onnoderemovedfromdocument', 'onoverflowchanged', 'onpaint', 'onpaste',
-'onpopupHidden', 'onpopupHiding', 'onpopupShowing', 'onpopupShown',
-'onpropertychange', 'onreadystatechange', 'onreset', 'onresize',
-'onresizeend', 'onresizestart', 'onrowenter', 'onrowexit', 'onrowsdelete',
-'onrowsinserted', 'onscroll', 'onselect', 'onselectionchange',
-'onselectstart', 'onstart', 'onstop', 'onsubmit', 'onsubtreemodified',
-'ontext', 'onunderflow', 'onunload' 
-]
-attrs_to_check = ['background', 'cite', 'classid', 'codebase', 'data',
-'longdesc', 'profile', 'src', 'style', 'usemap']
-attrs_to_check.extend(attrs_with_raw_script)
-attrs_to_check_map = {}
-for __a in attrs_to_check: attrs_to_check_map[__a]=1
-attrs_with_raw_script_map = {}
-for __a in attrs_with_raw_script: attrs_with_raw_script_map[__a]=1
-
-
 class ResultCount:
   def __init__(self, type):
     self.type = type
@@ -158,148 +120,30 @@
       old_vs_new = SoupDiffer(old_soup, new_soup)
       new_vs_tor = SoupDiffer(new_soup, tor_soup)
 
-      changed_tags = {}
-      changed_attributes = {}
       # I'm an evil man and I'm going to CPU hell..
-      for tags in map(BeautifulSoup, old_vs_new.changed_tags()):
-        for t in tags.findAll():
-          if t.name not in changed_tags:
-            changed_tags[t.name] = sets.Set([])
-          for attr in t.attrs:
-            changed_tags[t.name].add(attr[0])
-      for tags in map(BeautifulSoup, new_vs_old.changed_tags()):
-        for t in tags.findAll():
-          if t.name not in changed_tags:
-            changed_tags[t.name] = sets.Set([])
-          for attr in t.attrs:
-            changed_tags[t.name].add(attr[0])
-      for (tag, attr) in old_vs_new.changed_attributes():
-        if tag not in changed_attributes:
-          changed_attributes[tag] = {}
-        changed_attributes[tag][attr[0]] = 1 
-      for (tag, attr) in new_vs_old.changed_attributes():
-        changed_attributes[attr[0]] = 1 
-        if tag not in changed_attributes:
-          changed_attributes[tag] = {}
-        changed_attributes[tag][attr[0]] = 1 
-      
+      changed_tags = old_vs_new.changed_tags_with_attrs()
+      changed_tags.update(new_vs_old.changed_tags_with_attrs())
+
+      changed_attributes = old_vs_new.changed_attributes_by_tag()
+      changed_attributes.update(new_vs_old.changed_attributes_by_tag())
+
       changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
-  
-      false_positive = True 
-      for tags in map(BeautifulSoup, new_vs_tor.changed_tags()):
-        for t in tags.findAll():
-          if t.name not in changed_tags:
-            false_positive = False
-          else:
-             for attr in t.attrs:
-               if attr[0] not in changed_tags[t.name]:
-                 false_positive = False
-      for (tag, attr) in new_vs_tor.changed_attributes():
-        if tag in changed_attributes:
-          if attr[0] not in changed_attributes[tag]:
-            false_positive=False
-        else:
-          if not false_positive:
-            plog("ERROR", "False positive contradiction at "+exit_node+" for "+address)
-            false_positive = False
-  
-      if new_vs_tor.changed_content() and not changed_content:
+ 
+      # Verify all of our changed tags are present here 
+      if new_vs_tor.has_more_changed_tags(changed_tags) or \
+        new_vs_tor.has_more_changed_attrs(changed_attributes) or \
+        new_vs_tor.changed_content() and not changed_content:
         false_positive = False
-  
-      def ast_recurse(ast, map):
-        if not ast.symbol in map:
-          map[ast.symbol] = 1
-        else: map[ast.symbol] += 1
-        if isinstance(ast, pypy.rlib.parsing.tree.Nonterminal):
-          for child in ast.children:
-            ast_recurse(child, map)
-  
-      def count_ast(map, tags):
-        for tag_l in tags: 
-          for tag in tag_l.findAll():
-            did_parse = False
-            if tag.name == 'script':
-              for child in tag.childGenerator():
-                if isinstance(child, Tag):
-                  plog("ERROR", "Script tag with subtag!")
-                else:
-                  try:
-                    did_parse = True
-                    ast = pypy.lang.js.jsparser.parse(str(child))
-                    ast_recurse(ast, map)
-                  except (pypy.rlib.parsing.deterministic.LexerError, UnicodeDecodeError, pypy.rlib.parsing.parsing.ParseError):
-                    plog("NOTICE", "Parse error on "+str(child))
-                    if not "ParseError"+tag.name in map:
-                      map["ParseError"+tag.name] = 1
-                    else: map["ParseError"+tag.name] +=1 
-               
-            for attr in tag.attrs:
-              # XXX: %-encoding too
-              parse = ""
-              if attr[1].replace(" ","")[:11] == "javascript:":
-                split_at = attr[1].find(":")+1
-                parse = str(attr[1][split_at:])
-              elif attr[0] in attrs_with_raw_script_map:
-                parse = str(attr[1])
-              if not parse: continue
-              try:
-                did_parse = True
-                ast = pypy.lang.js.jsparser.parse(parse)
-                ast_recurse(ast, map)
-              except (pypy.rlib.parsing.deterministic.LexerError, UnicodeDecodeError, pypy.rlib.parsing.parsing.ParseError):
-                plog("NOTICE", "Parse error on "+parse+" in "+attr[0]+"="+attr[1])
-                if not "ParseError"+tag.name+attr[0] in map:
-                  map["ParseError"+tag.name+attr[0]] = 1
-                else: map["ParseError"+attr[0]] +=1
+      else:
+        false_positive = True
 
       if false_positive:
         # Use http://codespeak.net/pypy/dist/pypy/lang/js/ to parse
         # links and attributes that contain javascript
+        jsdiff = JSSoupDiffer(old_soup)
+        jsdiff.prune_differences(new_soup)
+        false_positive = not jsdiff.contains_differences(tor_soup)
   
-        old_vs_new_cnt = {}
-        count_ast(old_vs_new_cnt, [old_soup])
- 
-        new_vs_old_cnt = {}
-        count_ast(new_vs_old_cnt, [new_soup])
-  
-        # for each changed tag, count all tree elements in a hash table.
-        # Then, compare the counts between the two fetches
-        # If any count changes, mark its count as -1
-        # Make sure the terminal counts of the tor fetch match
-        # except for the -1 terminals
-  
-        for node in old_vs_new_cnt.iterkeys():
-          if node not in new_vs_old_cnt:
-            plog("INFO", "Javascript AST element "+node+" absent..")
-            new_vs_old_cnt[node] = 0
-          elif new_vs_old_cnt[node] != old_vs_new_cnt[node]:
-            plog("INFO", "Javascript AST count differs for "+node+": "+str(new_vs_old_cnt[node])+" vs "+str(old_vs_new_cnt[node]))
-            new_vs_old_cnt[node] = 0
-
-        for node in new_vs_old_cnt.iterkeys():
-          if node not in old_vs_new_cnt:
-            plog("INFO", "Javascript AST element "+node+" absent..")
-            new_vs_old_cnt[node] = 0
-        
-        new_vs_tor_cnt = {} 
-        count_ast(new_vs_tor_cnt, [tor_soup])
-  
-        for node in new_vs_old_cnt.iterkeys():
-          if not new_vs_old_cnt[node]:
-            continue
-          if node not in new_vs_tor_cnt:
-            plog("ERROR", "Javascript AST element "+node+" absent from Tor.")
-            false_positive = False
-          elif new_vs_old_cnt[node] != new_vs_tor_cnt[node]:
-            plog("ERROR", "Javascript AST count differs for "+node+": "+str(new_vs_old_cnt[node])+" vs "+str(new_vs_tor_cnt[node]))
-            false_positive = False
-        
-        for node in new_vs_tor_cnt.iterkeys():
-          if node not in new_vs_old_cnt:
-            plog("ERROR", "Javascript AST element "+node+" present only in Tor")
-            false_positive = False
-
- 
       print false_positive      
 
   print ""



More information about the tor-commits mailing list