[or-cvs] r19246: {torflow} Track HTTP header changes and properly classify urllib timeouts (torflow/trunk/NetworkScanners)

mikeperry at seul.org mikeperry at seul.org
Thu Apr 9 00:41:26 UTC 2009


Author: mikeperry
Date: 2009-04-08 20:41:26 -0400 (Wed, 08 Apr 2009)
New Revision: 19246

Modified:
   torflow/trunk/NetworkScanners/libsoat.py
   torflow/trunk/NetworkScanners/soat.py
Log:

Track HTTP header changes and properly classify urllib
timeouts as such.



Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py	2009-04-08 09:17:21 UTC (rev 19245)
+++ torflow/trunk/NetworkScanners/libsoat.py	2009-04-09 00:41:26 UTC (rev 19246)
@@ -76,6 +76,7 @@
 FAILURE_URLERROR = "FailureURLError" # can also mean timeout...
 FAILURE_CRYPTOERROR = "FailureCryptoError"
 FAILURE_TIMEOUT = "FailureTimeout"
+FAILURE_HEADERCHANGE = "FailureHeaderChange"
 
 # False positive reasons
 FALSEPOSITIVE_HTTPERRORS = "FalsePositiveHTTPErrors"
@@ -833,7 +834,44 @@
     ret.sort()
     return ret
 
+class HeaderDiffer:
+  def __init__(self, orig_headers):
+    self.header_pool = sets.Set(orig_headers)
+    self.changed_headers = sets.Set([])
+    self._pickle_revision = 0
+ 
+  def filter_headers(headers):
+    ret = []
+    for h in headers:
+      matched = False
+      for i in ignore_http_headers:
+        if re.match(i, h[0]):
+          matched = True
+      if not matched: ret.append(h)
+    return sets.Set(ret)
+  filter_headers = Callable(filter_headers)
+ 
+  def depickle_upgrade(self):
+    pass
 
+  def prune_differences(self, new_headers):
+    new_headers = sets.Set(new_headers)
+    changed = new_headers - self.header_pool
+    for i in changed:
+      self.changed_headers.add(i[0])
+    self.header_pool.union_update(new_headers)
+
+  def show_differences(self, new_headers):
+    ret = ""
+    changed = sets.Set(new_headers) - self.header_pool
+    for i in changed:
+      if i[0] not in self.changed_headers:
+        ret += " "+i[0]+": "+i[1]+"\n"
+    if ret:
+      return "New HTTP Headers:\n"+ret
+    else: 
+      return ret
+
 class JSDiffer:
   def __init__(self, js_string):
     self._pickle_revision = 0    
@@ -987,4 +1025,3 @@
         ast_cnts = JSSoupDiffer._add_cnts(tag_cnts, ast_cnts)
     return ast_cnts
 
-

Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py	2009-04-08 09:17:21 UTC (rev 19245)
+++ torflow/trunk/NetworkScanners/soat.py	2009-04-09 00:41:26 UTC (rev 19246)
@@ -136,8 +136,7 @@
       plog("WARN", "Max content size exceeded for "+address+": "+length)
       return (reply.code, None, [], "", "")
     mime_type = reply.info().type.lower()
-    reply_headers = sets.Set(filter(lambda h: h[0] not in ignore_http_headers, 
-                          reply.info().items()))
+    reply_headers = HeaderDiffer.filter_headers(reply.info().items())
     reply_headers.add(("mime-type", mime_type))
     plog("DEBUG", "Mime type is "+mime_type+", length "+str(length))
     content = decompress_response_data(reply)
@@ -147,8 +146,11 @@
     return (-6.0, None, [], "", e.__class__.__name__+str(e)) 
   except urllib2.HTTPError, e:
     plog('NOTICE', "HTTP Error during request of "+address+": "+str(e))
-    traceback.print_exc()
-    return (e.code, None, [], "", e.__class__.__name__+str(e)) 
+    if str(e) == "<urlopen error timed out>": # Yah, super ghetto...
+      return (-6.0, None, [], "", e.__class__.__name__+str(e)) 
+    else:
+      traceback.print_exc()
+      return (e.code, None, [], "", e.__class__.__name__+str(e)) 
   except (ValueError, urllib2.URLError), e:
     plog('WARN', 'The http-request address ' + address + ' is malformed')
     traceback.print_exc()
@@ -506,8 +508,7 @@
           plog('ERROR', 'Soup-scraping of http://'+host+search_path+" failed")
           traceback.print_exc()
           print "Content is: "+str(content)
-          return [protocol+"://www.eff.org", protocol+"://www.fastmail.fm", protocol+"://www.torproject.org", protocol+"://secure.wikileaks.org/"]
-        
+          return [protocol+"://www.eff.org", protocol+"://www.fastmail.fm", protocol+"://www.torproject.org", protocol+"://secure.wikileaks.org/"] 
         # get the links and do some additional filtering
         for link in soup.findAll('a'):
           skip = True
@@ -516,7 +517,10 @@
               skip = False
               break
           if skip: continue
-          url = link['href']
+          if link.has_key(search_mode['realtgt']):
+            url = link[search_mode['realtgt']]
+          else:
+            url = link['href']
           if protocol == 'any': prot_list = None
           else: prot_list = [protocol]
           if filetype == 'any': file_list = None
@@ -699,7 +703,11 @@
       added_cookie_jar.load(content_prefix+'.cookies', ignore_discard=True)
       self.cookie_jar.load(content_prefix+'.cookies', ignore_discard=True)
 
-      content = None 
+      header_file = open(content_prefix+'.headerdiff', 'r')
+      headerdiffer = pickle.load(header_file)
+      header_file.close()
+
+      content = None
       mime_type = None 
 
     except IOError:
@@ -727,6 +735,11 @@
       content_file = open(content_prefix+'.content', 'w')
       content_file.write(content)
       content_file.close()
+
+      header_file = open(content_prefix+'.headerdiff', 'w')
+      headerdiffer = HeaderDiffer(resp_headers)
+      pickle.dump(headerdiffer, header_file)
+      header_file.close()
       
       # Need to do set subtraction and only save new cookies.. 
       # or extract/make_cookies
@@ -823,9 +836,13 @@
       self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_FAILURE
 
+    hdiffs = headerdiffer.show_differences(presp_headers)
+    if hdiffs:
+      plog("NOTICE", "Header differences for "+address+": \n"+hdiffs)
+
     # compare the content
     # if content matches, everything is ok
-    if psha1sum.hexdigest() == sha1sum.hexdigest():
+    if not hdiffs and psha1sum.hexdigest() == sha1sum.hexdigest():
       result = HttpTestResult(exit_node, self.node_map[exit_node[1:]].nickname, 
                               address, TEST_SUCCESS)
       self.register_success(result)
@@ -876,6 +893,13 @@
       datahandler.saveResult(result)
       return TEST_INCONCLUSIVE
 
+    headerdiffer.prune_differences(resp_headers_new)
+    hdiffs = headerdiffer.show_differences(presp_headers)
+
+    header_file = open(content_prefix+'.headerdiff', 'w')
+    pickle.dump(headerdiffer, header_file)
+    header_file.close()
+
     sha1sum_new = sha.sha(content_new)
 
     if sha1sum.hexdigest() != sha1sum_new.hexdigest():
@@ -900,6 +924,18 @@
       traceback.print_exc()
       plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
 
+    if hdiffs:
+      # XXX: We probably should store the header differ + exit headers 
+      # for later comparison (ie if the header differ picks up more diffs)
+      plog("NOTICE", "Post-refetch header changes for "+address+": \n"+hdiffs)
+      result = HttpTestResult(exit_node,
+                              self.node_map[exit_node[1:]].nickname, 
+                              address, TEST_FAILURE, FAILURE_HEADERCHANGE)
+      result.extra_info = hdiffs
+      self.register_dynamic_failure(result)
+      # Lets let the rest of the tests run too actually
+      #return TEST_FAILURE 
+
     # compare the node content and the new content
     # if it matches, everything is ok
     if psha1sum.hexdigest() == sha1sum_new.hexdigest():
@@ -985,7 +1021,7 @@
   def _reset(self):
     HTTPTest._reset(self)
     self.targets = [] # FIXME: Lame..
-    self.soupdiffer_files = {}
+    self.soupdiffer_files = {} # XXX: These two are now deprecated
     self.jsdiffer_files = {}
  
   def depickle_upgrade(self):
@@ -1057,6 +1093,7 @@
     Test._remove_target_addr(self, target)
     if target in self.soupdiffer_files: del self.soupdiffer_files[target]
     if target in self.jsdiffer_files: del self.jsdiffer_files[target]
+
   def refill_targets(self):
     Test.refill_targets(self)
 
@@ -1082,6 +1119,7 @@
             elif t.name in recurse_script:
               if t.name == "link":
                 for a in t.attrs:
+                  a = map(lambda x: x.lower(), a)
                   # Special case CSS and favicons
                   if (a[0] == "type" and a[1] == "text/css") or \
                    ((a[0] == "rel" or a[0] == "rev") and a[1] == "stylesheet"):
@@ -1092,7 +1130,7 @@
                     plog("INFO", "Adding favicon of: "+str(t))
                     found_favicon = True
                     targets.append(("image", urlparse.urljoin(orig_addr, attr_tgt)))
-                  elif a[0] == "type" and a[1] in script_mime_types:
+                  elif a[0] == "type" and self.is_script(a[1], ""):
                     plog("INFO", "Adding link script of: "+str(t))
                     targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
               else:
@@ -1140,7 +1178,7 @@
   def is_html(self, mime_type, content):
     is_html = False
     for type_match in html_mime_types:
-      if re.match(type_match, mime_type): 
+      if re.match(type_match, mime_type.lower()): 
         is_html = True
         break
     return is_html
@@ -1148,7 +1186,7 @@
   def is_script(self, mime_type, content):
     is_script = False
     for type_match in script_mime_types:
-      if re.match(type_match, mime_type): 
+      if re.match(type_match, mime_type.lower()): 
         is_script = True
         break
     return is_script
@@ -1168,17 +1206,16 @@
     content_prefix = http_content_dir+address_file
     failed_prefix = http_failed_dir+address_file
 
-    if address in self.jsdiffer_files:
+    if os.path.exists(content_prefix+".jsdiff"):
       plog("DEBUG", "Loading jsdiff for "+address)
-      jsdiff = pickle.load(open(self.jsdiffer_files[address], 'r'))
+      jsdiff = pickle.load(open(content_prefix+".jsdiff", 'r'))
       jsdiff.depickle_upgrade()
     else:
       plog("DEBUG", "No jsdiff for "+address+". Creating+dumping")
       jsdiff = JSDiffer(orig_js)
-      self.jsdiffer_files[address] = content_prefix+".jsdiff"
     
     jsdiff.prune_differences(new_js)
-    pickle.dump(jsdiff, open(self.jsdiffer_files[address], 'w'))
+    pickle.dump(jsdiff, open(content_prefix+".jsdiff", 'w'))
 
     has_js_changes = jsdiff.contains_differences(tor_js)
 
@@ -1196,7 +1233,7 @@
                              address, TEST_FAILURE, FAILURE_DYNAMIC, 
                              content_prefix+".content", exit_content_file.name, 
                              content_prefix+'.content-old',
-                             self.jsdiffer_files[address])
+                             content_prefix+".jsdiff")
       self.register_dynamic_failure(result)
       return TEST_FAILURE
 
@@ -1277,17 +1314,16 @@
     # 3. Compare list of changed tags for tor vs new and
     #    see if any extra tags changed or if new attributes
     #    were added to additional tags
-    if address in self.soupdiffer_files:
+    if os.path.exists(content_prefix+".soupdiff"):
       plog("DEBUG", "Loading soupdiff for "+address)
-      soupdiff = pickle.load(open(self.soupdiffer_files[address], 'r'))
+      soupdiff = pickle.load(open(content_prefix+".soupdiff", 'r'))
       soupdiff.depickle_upgrade()
       soupdiff.prune_differences(new_soup)
     else:
       plog("DEBUG", "No soupdiff for "+address+". Creating+dumping")
       soupdiff = SoupDiffer(orig_soup, new_soup)
-      self.soupdiffer_files[address] = content_prefix+".soupdiff"
 
-    pickle.dump(soupdiff, open(self.soupdiffer_files[address], 'w'))
+    pickle.dump(soupdiff, open(content_prefix+".soupdiff", 'w'))
     
     more_tags = soupdiff.show_changed_tags(tor_soup)     
     more_attrs = soupdiff.show_changed_attrs(tor_soup)
@@ -1306,17 +1342,16 @@
       false_positive = True
 
     if false_positive:
-      if address in self.jsdiffer_files:
+      if os.path.exists(content_prefix+".jsdiff"):
         plog("DEBUG", "Loading jsdiff for "+address)
-        jsdiff = pickle.load(open(self.jsdiffer_files[address], 'r'))
+        jsdiff = pickle.load(open(content_prefix+".jsdiff", 'r'))
         jsdiff.depickle_upgrade()
       else:
         plog("DEBUG", "No jsdiff for "+address+". Creating+dumping")
         jsdiff = JSSoupDiffer(orig_soup)
-        self.jsdiffer_files[address] = content_prefix+".jsdiff"
       
       jsdiff.prune_differences(new_soup)
-      pickle.dump(jsdiff, open(self.jsdiffer_files[address], 'w'))
+      pickle.dump(jsdiff, open(content_prefix+".jsdiff", 'w'))
 
       differences = jsdiff.show_differences(tor_soup)
       false_positive = not differences
@@ -1335,11 +1370,11 @@
     exit_content_file.write(tor_html)
     exit_content_file.close()
  
-    if address in self.jsdiffer_files: 
-      jsdiff_file = self.jsdiffer_files[address]
+    if os.path.exists(content_prefix+".jsdiff"):
+      jsdiff_file = content_prefix+".jsdiff"
     else: jsdiff_file = None
-    if address in self.soupdiffer_files: 
-      soupdiff_file = self.soupdiffer_files[address]
+    if os.path.exists(content_prefix+".soupdiff"):
+      soupdiff_file = content_prefix+".soupdiff"
     else: soupdiff_file = None
 
     result = HtmlTestResult(exit_node, self.node_map[exit_node[1:]].nickname, 
@@ -2588,8 +2623,8 @@
   global refetch_ip
   BindingSocket.bind_to = refetch_ip
   try:
-    s = socket.socket()
-  except socket.error, e:
+    socket.socket()
+  except socket.error:
     plog("WARN", "Cannot bind to "+refetch_ip+". Ignoring refetch_ip setting.")
     refetch_ip = None
   BindingSocket.bind_to = None



More information about the tor-commits mailing list