[or-cvs] r18401: {torflow} Refactor HTML test to leverage HTTP SHA1 test first before performing tag or js diffing (torflow/trunk/NetworkScanners)
mikeperry at seul.org
Thu Feb 5 12:23:49 UTC 2009
Author: mikeperry
Date: 2009-02-05 07:23:49 -0500 (Thu, 05 Feb 2009)
New Revision: 18401
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
Log:
Refactor HTML test to leverage HTTP SHA1 test first before
performing tag or js diffing.
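
In outline, the refactored control flow looks roughly like this (a
simplified sketch of what the soat.py diff below implements, not a
verbatim excerpt; the returned tuple shape and the TEST_* convention
are taken from the diff itself):

  def check_html(self, address):
      # check_http_nodynamic() performs the cheap SHA1 comparison and
      # returns an int TEST_* code whenever the hashes settle the
      # question; only on a hash mismatch does it hand back the
      # fetched bodies for further inspection.
      ret = self.check_http_nodynamic(address)
      if type(ret) == int:
          return ret
      (tor_html, tsha, orig_html, osha, new_html, nsha, exit_node) = ret
      # SHA1 mismatch: fall back to the expensive BeautifulSoup tag
      # stripping and SoupDiffer/JSSoupDiffer comparisons below.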
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-02-05 10:53:24 UTC (rev 18400)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-02-05 12:23:49 UTC (rev 18401)
@@ -156,14 +156,10 @@
class HtmlTestResult(TestResult):
''' Represents the result of an http test '''
def __init__(self, exit_node, website, status, reason=None,
- tags=None, exit_tags=None, content=None,
- content_exit=None, content_old=None, tags_old=None):
+ content=None, content_exit=None, content_old=None):
super(HtmlTestResult, self).__init__(exit_node, website, status)
self.proto = "http"
self.reason = reason
- self.tags = tags
- self.tags_old = tags_old
- self.exit_tags = exit_tags
self.content = content
self.content_exit = content_exit
self.content_old = content_old
@@ -173,20 +169,11 @@
def mark_false_positive(self, reason):
self.false_positive=True
self.false_positive_reason=reason
- self.tags=self.move_file(self.tags,http_falsepositive_dir)
- self.tags_old=self.move_file(self.tags_old,http_falsepositive_dir)
- self.exit_tags=self.move_file(self.exit_tags,http_falsepositive_dir)
self.content=self.move_file(self.content,http_falsepositive_dir)
self.content_old=self.move_file(self.content_old, http_falsepositive_dir)
self.content_exit=self.move_file(self.content_exit,http_falsepositive_dir)
def remove_files(self):
- try: os.unlink(self.tags)
- except: pass
- try: os.unlink(self.tags_old)
- except: pass
- try: os.unlink(self.exit_tags)
- except: pass
try: os.unlink(self.content)
except: pass
try: os.unlink(self.content_old)
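
With the tag-file fields gone, an HtmlTestResult now carries only the
three content file paths. A failure result is constructed along these
lines (a sketch mirroring the call sites in the soat.py diff below;
the variable names come from that diff):

  result = HtmlTestResult(exit_node, address, TEST_FAILURE,
                          FAILURE_EXITONLY,
                          content_prefix + ".content",  # non-Tor fetch
                          exit_content_file.name)       # Tor-exit fetch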
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-02-05 10:53:24 UTC (rev 18400)
+++ torflow/trunk/NetworkScanners/soat.py 2009-02-05 12:23:49 UTC (rev 18401)
@@ -355,8 +355,8 @@
def run_test(self):
# A single test should have a single cookie jar
- self.tor_cookie_jar = cookielib.LWPCookieJar()
- self.cookie_jar = cookielib.LWPCookieJar()
+ self.tor_cookie_jar = cookielib.MozillaCookieJar()
+ self.cookie_jar = cookielib.MozillaCookieJar()
# XXX: Change these headers (esp accept) based on
# url type
self.headers = copy.copy(firefox_headers)
@@ -441,7 +441,9 @@
plog("ERROR", self.proto+" http error code failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
- def check_http_nodynamic(self, address):
+ def check_http_nodynamic(self, address, nocontent=False):
+ # TODO: use nocontent to cause us to not load content into memory.
+ # This will require refactoring http_response though.
''' check whether a http connection to a given address is molested '''
plog('INFO', 'Conducting an http test with destination ' + address)
@@ -453,9 +455,9 @@
# Keep a copy of the cookie jar before mods for refetch or
# to restore on errors that cancel a fetch
- orig_cookie_jar = cookielib.LWPCookieJar()
+ orig_cookie_jar = cookielib.MozillaCookieJar()
for cookie in self.cookie_jar: orig_cookie_jar.set_cookie(cookie)
- orig_tor_cookie_jar = cookielib.LWPCookieJar()
+ orig_tor_cookie_jar = cookielib.MozillaCookieJar()
for cookie in self.tor_cookie_jar: orig_tor_cookie_jar.set_cookie(cookie)
try:
@@ -496,7 +498,7 @@
# Need to do set subtraction and only save new cookies..
# or extract/make_cookies
- new_cookie_jar = cookielib.LWPCookieJar()
+ new_cookie_jar = cookielib.MozillaCookieJar()
for cookie in new_cookies: new_cookie_jar.set_cookie(cookie)
try:
new_cookie_jar.save(content_prefix+'.cookies')
@@ -608,7 +610,7 @@
# Need to do set subtraction and only save new cookies..
# or extract/make_cookies
- new_cookie_jar = cookielib.LWPCookieJar()
+ new_cookie_jar = cookielib.MozillaCookieJar()
for cookie in new_cookies_new: new_cookie_jar.set_cookie(cookie)
os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
try:
@@ -627,7 +629,7 @@
else: self.successes[address]=1
return TEST_SUCCESS
- if not content:
+ if not content and not nocontent:
content_file = open(content_prefix+'.content', 'r')
content = content_file.read()
content_file.close()
@@ -681,8 +683,8 @@
def run_test(self):
# A single test should have a single cookie jar
- self.tor_cookie_jar = cookielib.LWPCookieJar()
- self.cookie_jar = cookielib.LWPCookieJar()
+ self.tor_cookie_jar = cookielib.MozillaCookieJar()
+ self.cookie_jar = cookielib.MozillaCookieJar()
# XXX: Change these headers (esp accept) based on
# url type
self.headers = copy.copy(firefox_headers)
@@ -846,155 +848,44 @@
return TEST_FAILURE
- def check_html_notags(self, address):
- plog('INFO', 'Conducting a html tagless test with destination ' + address)
+ def check_html(self, address):
+ plog('INFO', 'Conducting an html test with destination ' + address)
+
+ # Keep a copy of the cookie jar before mods for refetch
+ orig_cookie_jar = cookielib.MozillaCookieJar()
+ for cookie in self.cookie_jar: orig_cookie_jar.set_cookie(cookie)
+
ret = self.check_http_nodynamic(address)
if type(ret) == int:
return ret
- (tor_js, tsha, orig_js, osha, new_js, nsha, exit_node) = ret
- pass
+ (tor_html, tsha, orig_html, osha, new_html, nsha, exit_node) = ret
- def check_html(self, address):
- # FIXME: Is there any reason not to just do SHA1 until we
- # hit a difference, and then pull in the Soup stuff for false positives?
- # Would eliminate a lot of semi-duplicate code...
- ''' check whether a http connection to a given address is molested '''
- plog('INFO', 'Conducting an html test with destination ' + address)
-
# an address representation acceptable for a filename
address_file = self.datahandler.safeFilename(address[7:])
content_prefix = http_content_dir+address_file
failed_prefix = http_failed_dir+address_file
- # Keep a copy of the cookie jar before mods for refetch
- orig_cookie_jar = cookielib.LWPCookieJar()
- for cookie in self.cookie_jar: orig_cookie_jar.set_cookie(cookie)
- orig_tor_cookie_jar = cookielib.LWPCookieJar()
- for cookie in self.tor_cookie_jar: orig_tor_cookie_jar.set_cookie(cookie)
-
elements = SoupStrainer(lambda name, attrs: name in tags_to_check or
- len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
+ len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
- # load the original tag structure
- # if we don't have any yet, get it
- soup = 0
- try:
- tag_file = open(content_prefix+'.tags', 'r')
- soup = BeautifulSoup(tag_file.read())
- tag_file.close()
-
- self.cookie_jar.load(content_prefix+'.cookies', 'w')
+ orig_soup = self._recursive_strain(BeautifulSoup(orig_html.decode('ascii',
+ 'ignore'), parseOnlyThese=elements))
- except IOError:
- (code, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
+ tor_soup = self._recursive_strain(BeautifulSoup(tor_html.decode('ascii',
+ 'ignore'), parseOnlyThese=elements))
- if code - (code % 100) != 200:
- plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
- # Just remove it
- self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_INCONCLUSIVE
-
- content = content.decode('ascii','ignore')
- soup = self._recursive_strain(BeautifulSoup(content, parseOnlyThese=elements))
-
- # XXX: str of this may be bad if there are unicode chars inside
- string_soup = str(soup)
- if not string_soup:
- plog("WARN", "Empty soup for "+address)
- tag_file = open(content_prefix+'.tags', 'w')
- tag_file.write(string_soup)
- tag_file.close()
-
- # Need to do set subtraction and only save new cookies..
- # or extract/make_cookies
- new_cookie_jar = cookielib.LWPCookieJar()
- for cookie in new_cookies: new_cookie_jar.set_cookie(cookie)
- try:
- new_cookie_jar.save(content_prefix+'.cookies')
- except:
- traceback.print_exc()
- plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
-
- content_file = open(content_prefix+'.content', 'w')
- content_file.write(content)
- content_file.close()
-
- except TypeError, e:
- plog('ERROR', 'Failed parsing the tag tree for ' + address)
- plog('ERROR', e)
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_INCONCLUSIVE
-
- if soup == 0:
- plog('ERROR', 'Failed to get the correct tag structure for ' + address)
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_INCONCLUSIVE
-
-
- defaultsocket = socket.socket
- socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
- socket.socket = socks.socksocket
-
- # Wikipedia and others can give us 403.. So what do we do about that?
- # Count the number of occurrances vs successful runs then remove the url
- (pcode, pcookies, pmime_type, pcontent) = http_request(address, self.tor_cookie_jar, self.headers)
-
- # reset the connection to direct
- socket.socket = defaultsocket
-
- exit_node = self.mt.get_exit_node()
- if exit_node == 0 or exit_node == '0' or not exit_node:
- plog('WARN', 'We had no exit node to test, skipping to the next test.')
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_SUCCESS
-
- if pcode - (pcode % 100) != 200:
- plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
- result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
- INCONCLUSIVE_BADHTTPCODE+str(pcode))
- self.results.append(result)
- self.datahandler.saveResult(result)
- self.register_httpcode_failure(address, exit_node)
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_INCONCLUSIVE
-
- # if we have no content, we had a connection error
- if pcontent == "":
- plog("NOTICE", exit_node+" failed to fetch content for "+address)
- result = HtmlTestResult(exit_node, address, TEST_INCONCLUSIVE,
- INCONCLUSIVE_NOEXITCONTENT)
- self.results.append(result)
- self.datahandler.saveResult(result)
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_INCONCLUSIVE
-
- pcontent = pcontent.decode('ascii', 'ignore')
- psoup = self._recursive_strain(BeautifulSoup(pcontent, parseOnlyThese=elements))
-
# Also find recursive urls
recurse_elements = SoupStrainer(lambda name, attrs:
- name in tags_to_recurse and
- len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_recurse))) > 0)
- self._add_recursive_targets(BeautifulSoup(pcontent, recurse_elements),
- address)
+ name in tags_to_recurse and
+ len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_recurse))) > 0)
+ self._add_recursive_targets(BeautifulSoup(tor_html.decode('ascii',
+ 'ignore'), recurse_elements), address)
# compare the content
# if content matches, everything is ok
- if str(psoup) == str(soup):
+ if str(orig_soup) == str(tor_soup):
+ plog("INFO", "Successful soup comparison after SHA1 fail for "+address+" via "+exit_node)
result = HtmlTestResult(exit_node, address, TEST_SUCCESS)
self.results.append(result)
#self.datahandler.saveResult(result)
@@ -1006,10 +897,10 @@
# If we have alternate IPs to bind to on this box, use them?
# Sometimes pages have the client IP encoded in them..
BindingSocket.bind_to = refetch_ip
- (code_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_cookie_jar, self.headers)
+ (code_new, new_cookies_new, mime_type_new, new_html) = http_request(address, orig_cookie_jar, self.headers)
BindingSocket.bind_to = None
- content_new = content_new.decode('ascii', 'ignore')
+ content_new = new_html.decode('ascii', 'ignore')
if not content_new:
plog("WARN", "Failed to re-fetch "+address+" outside of Tor. Did our network fail?")
result = HtmlTestResult(exit_node, address, TEST_INCONCLUSIVE,
@@ -1018,23 +909,18 @@
self.datahandler.saveResult(result)
return TEST_INCONCLUSIVE
- soup_new = self._recursive_strain(BeautifulSoup(content_new,
+ new_soup = self._recursive_strain(BeautifulSoup(content_new,
parseOnlyThese=elements))
# compare the new and old content
# if they match, means the node has been changing the content
- if str(soup) == str(soup_new):
+ if str(orig_soup) == str(new_soup):
# XXX: Check for existence of this file before overwriting
- exit_tag_file = open(failed_prefix+'.tags.'+exit_node[1:],'w')
- exit_tag_file.write(str(psoup))
- exit_tag_file.close()
-
exit_content_file = open(failed_prefix+'.content.'+exit_node[1:], 'w')
- exit_content_file.write(pcontent)
+ exit_content_file.write(tor_html)
exit_content_file.close()
result = HtmlTestResult(exit_node, address, TEST_FAILURE,
- FAILURE_EXITONLY, tag_file.name,
- exit_tag_file.name, content_prefix+".content",
+ FAILURE_EXITONLY, content_prefix+".content",
exit_content_file.name)
self.results.append(result)
self.datahandler.saveResult(result)
@@ -1042,19 +928,16 @@
self.register_exit_failure(address, exit_node)
return TEST_FAILURE
- # if content has changed outside of tor, update the saved file
- os.rename(content_prefix+'.tags', content_prefix+'.tags-old')
- string_soup_new = str(soup_new)
- if not string_soup_new:
- plog("WARN", "Empty soup for "+address)
- tag_file = open(content_prefix+'.tags', 'w')
- tag_file.write(string_soup_new)
- tag_file.close()
+ # if content has changed outside of tor, update the saved files
+ os.rename(content_prefix+'.content', content_prefix+'.content-old')
+ new_content_file = open(content_prefix+'.content', 'w')
+ new_content_file.write(new_html)
+ new_content_file.close()
os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
# Need to do set subtraction and only save new cookies..
# or extract/make_cookies
- new_cookie_jar = cookielib.LWPCookieJar()
+ new_cookie_jar = cookielib.MozillaCookieJar()
for cookie in new_cookies_new: new_cookie_jar.set_cookie(cookie)
try:
new_cookie_jar.save(content_prefix+'.cookies')
@@ -1062,30 +945,15 @@
traceback.print_exc()
plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
- os.rename(content_prefix+'.content', content_prefix+'.content-old')
- new_content_file = open(content_prefix+'.content', 'w')
- new_content_file.write(content_new)
- new_content_file.close()
-
- # compare the node content and the new content
- # if it matches, everything is ok
- if str(psoup) == str(soup_new):
- result = HtmlTestResult(exit_node, address, TEST_SUCCESS)
- self.results.append(result)
- #self.datahandler.saveResult(result)
- if address in self.successes: self.successes[address]+=1
- else: self.successes[address]=1
- return TEST_SUCCESS
-
# Lets try getting just the tag differences
# 1. Take difference between old and new tags both ways
# 2. Make map of tags that change to their attributes
# 3. Compare list of changed tags for tor vs new and
# see if any extra tags changed or if new attributes
# were added to additional tags
- old_vs_new = SoupDiffer(soup, soup_new)
- new_vs_old = SoupDiffer(soup_new, soup)
- new_vs_tor = SoupDiffer(soup_new, psoup)
+ old_vs_new = SoupDiffer(orig_soup, new_soup)
+ new_vs_old = SoupDiffer(new_soup, orig_soup)
+ new_vs_tor = SoupDiffer(new_soup, tor_soup)
# I'm an evil man and I'm going to CPU hell..
changed_tags = old_vs_new.changed_tags_with_attrs()
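
The idea behind the three SoupDiffer instances, reduced to set
arithmetic (a conceptual sketch only -- changed() is a hypothetical
helper standing in for SoupDiffer's real interface):

  # Tags that differ between two non-Tor fetches define the page's
  # "dynamic" baseline; anything the Tor copy changes beyond that
  # baseline is what counts as suspicious.
  dynamic = changed(orig_soup, new_soup) | changed(new_soup, orig_soup)
  suspect = changed(new_soup, tor_soup) - dynamic
  false_positive = not suspect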
@@ -1105,9 +973,9 @@
false_positive = True
if false_positive:
- jsdiff = JSSoupDiffer(soup)
- jsdiff.prune_differences(soup_new)
- false_positive = not jsdiff.contains_differences(psoup)
+ jsdiff = JSSoupDiffer(orig_soup)
+ jsdiff.prune_differences(new_soup)
+ false_positive = not jsdiff.contains_differences(tor_soup)
if false_positive:
plog("NOTICE", "False positive detected for dynamic change at "+address+" via "+exit_node)
@@ -1119,20 +987,14 @@
return TEST_SUCCESS
# XXX: Check for existence of this file before overwriting
- exit_tag_file = open(failed_prefix+'.dyn-tags.'+exit_node[1:],'w')
- exit_tag_file.write(str(psoup))
- exit_tag_file.close()
-
exit_content_file = open(failed_prefix+'.dyn-content.'+exit_node[1:], 'w')
- exit_content_file.write(pcontent)
+ exit_content_file.write(tor_html)
exit_content_file.close()
result = HtmlTestResult(exit_node, address, TEST_FAILURE,
- FAILURE_DYNAMICTAGS, tag_file.name,
- exit_tag_file.name, new_content_file.name,
+ FAILURE_DYNAMICTAGS, new_content_file.name,
exit_content_file.name,
- content_prefix+'.content-old',
- content_prefix+'.tags-old')
+ content_prefix+'.content-old')
self.results.append(result)
self.datahandler.saveResult(result)
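
A side note on the LWPCookieJar -> MozillaCookieJar switch that runs
through both hunks: the two classes share the same FileCookieJar
save()/load() interface, so the swap is drop-in; the practical
difference is that MozillaCookieJar stores cookies in the Netscape
cookies.txt format, which browsers and tools such as wget can also
read. A minimal sketch (the filename is illustrative):

  import cookielib

  jar = cookielib.MozillaCookieJar()
  jar.save("example.cookies")   # Netscape cookies.txt format on disk
  jar.load("example.cookies")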