[or-cvs] r18399: {torflow} Fix some more cookie test symmetry issues. Also fix a dynami (torflow/trunk/NetworkScanners)

Thu Feb 5 10:29:15 UTC 2009

Author: mikeperry
Date: 2009-02-05 05:29:14 -0500 (Thu, 05 Feb 2009)
New Revision: 18399

Modified:
   torflow/trunk/NetworkScanners/libsoat.py
   torflow/trunk/NetworkScanners/soat.py
Log:

Fix some more cookie test symmetry issues. Also fix a dynamic content
filtering issue and store reasons for dynamic content filtering.



Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================

--- torflow/trunk/NetworkScanners/libsoat.py	2009-02-05 08:32:51 UTC (rev 18398)
+++ torflow/trunk/NetworkScanners/libsoat.py	2009-02-05 10:29:14 UTC (rev 18399)
@@ -46,6 +46,11 @@
 FAILURE_DYNAMICBINARY = "FailureDynamicBinary" 
 FAILURE_COOKIEMISMATCH = "FailureCookieMismatch"
 
+# False positive reasons
+FALSEPOSITIVE_HTTPERRORS = "FalsePositiveHTTPErrors"
+FALSEPOSITIVE_DYNAMIC = "FalsePositiveDynamic"
+FALSEPOSITIVE_DYNAMIC_TOR = "FalsePositiveDynamicTor"
+
 # classes to use with pickle to dump test results into files
 
 class TestResult(object):
@@ -57,10 +62,11 @@
     self.status = status
     self.false_positive=False
   
-  def mark_false_positive(self):
+  def mark_false_positive(self, reason):
     pass
 
   def move_file(self, file, to_dir):
+    if not file: return None
     try:
       basename = os.path.basename(file)
       new_file = to_dir+basename
@@ -92,9 +98,12 @@
     self.content = content
     self.content_exit = content_exit
     self.content_old = content_old
+    self.false_positive=False
+    self.false_positive_reason="None"
 
-  def mark_false_positive(self):
+  def mark_false_positive(self, reason):
     self.false_positive=True
+    self.false_positive_reason=reason
     self.content=self.move_file(self.content, http_falsepositive_dir)
     self.content_old=self.move_file(self.content_old, http_falsepositive_dir)
     self.content_exit=self.move_file(self.content_exit,http_falsepositive_dir)
@@ -126,9 +135,12 @@
     self.content = content
     self.content_exit = content_exit
     self.content_old = content_old
+    self.false_positive=False
+    self.false_positive_reason="None"
 
-  def mark_false_positive(self):
+  def mark_false_positive(self, reason):
     self.false_positive=True
+    self.false_positive_reason=reason
     self.content=self.move_file(self.content, http_falsepositive_dir)
     self.content_old=self.move_file(self.content_old, http_falsepositive_dir)
     self.content_exit=self.move_file(self.content_exit,http_falsepositive_dir)
@@ -155,9 +167,12 @@
     self.content = content
     self.content_exit = content_exit
     self.content_old = content_old
+    self.false_positive=False
+    self.false_positive_reason="None"
 
-  def mark_false_positive(self):
+  def mark_false_positive(self, reason):
     self.false_positive=True
+    self.false_positive_reason=reason
     self.tags=self.move_file(self.tags,http_falsepositive_dir)
     self.tags_old=self.move_file(self.tags_old,http_falsepositive_dir)
     self.exit_tags=self.move_file(self.exit_tags,http_falsepositive_dir)
@@ -527,7 +542,7 @@
           if isinstance(child, Tag):
             plog("ERROR", "Script tag with subtag!")
           else:
-            script = str(child).replace("<!--", "").replace("-->", "")
+            script = str(child).replace("<!--", "").replace("-->", "").replace("<![CDATA[", "").replace("]]>", "")
             tag_cnts = JSDiffer._count_ast_elements(self, script, tag.name)
             ast_cnts = JSSoupDiffer._add_cnts(tag_cnts, ast_cnts)
       for attr in tag.attrs:

Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py	2009-02-05 08:32:51 UTC (rev 18398)
+++ torflow/trunk/NetworkScanners/soat.py	2009-02-05 10:29:14 UTC (rev 18399)
@@ -173,7 +173,7 @@
   def get_node(self):
     return random.choice(self.nodes)
 
-  def remove_target(self, target):
+  def remove_target(self, target, reason="None"):
     if target in self.targets: self.targets.remove(target)
     if len(self.targets) < self.min_targets:
       plog("NOTICE", self.proto+" scanner short on targets. Adding more")
@@ -392,8 +392,8 @@
           urls[ftype].append(url)
     return urls     
  
-  def remove_target(self, address):
-    SearchBasedTest.remove_target(self, address)
+  def remove_target(self, address, reason):
+    SearchBasedTest.remove_target(self, address, reason)
     if address in self.httpcode_fails: del self.httpcode_fails[address]
     if address in self.successes: del self.successes[address]
     if address in self.exit_fails: del self.exit_fails[address]
@@ -418,7 +418,7 @@
       if address not in self.successes: self.successes[address] = 0
       plog("NOTICE", "Excessive HTTP 2-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
   
-      self.remove_target(address)
+      self.remove_target(address, FALSEPOSITIVE_DYNAMIC_TOR)
     else:
       plog("ERROR", self.proto+" 2-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
 
@@ -436,7 +436,7 @@
       if address not in self.successes: self.successes[address] = 0
       plog("NOTICE", "Excessive HTTP error code failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
 
-      self.remove_target(address)
+      self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
     else:
       plog("ERROR", self.proto+" http error code failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
     
@@ -451,9 +451,12 @@
     content_prefix = http_content_dir+address_file
     failed_prefix = http_failed_dir+address_file
     
-    # Keep a copy of the cookie jar before mods for refetch
+    # Keep a copy of the cookie jar before mods for refetch or
+    # to restore on errors that cancel a fetch
     orig_cookie_jar = cookielib.LWPCookieJar()
     for cookie in self.cookie_jar: orig_cookie_jar.set_cookie(cookie)
+    orig_tor_cookie_jar = cookielib.LWPCookieJar()
+    for cookie in self.tor_cookie_jar: orig_tor_cookie_jar.set_cookie(cookie)
 
     try:
       # Load content from disk, md5
@@ -473,11 +476,17 @@
       if code - (code % 100) != 200:
         plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
         # Just remove it
-        self.remove_target(address)
+        self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
+        # Restore cookie jars
+        self.cookie_jar = orig_cookie_jar
+        self.tor_cookie_jar = orig_tor_cookie_jar
         return TEST_INCONCLUSIVE
 
       if not content:
         plog("WARN", "Failed to direct load "+address)
+        # Restore cookie jar
+        self.cookie_jar = orig_cookie_jar
+        self.tor_cookie_jar = orig_tor_cookie_jar
         return TEST_INCONCLUSIVE 
       sha1sum = sha.sha(content)
 
@@ -498,6 +507,9 @@
     except TypeError, e:
       plog('ERROR', 'Failed obtaining the shasum for ' + address)
       plog('ERROR', e)
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_INCONCLUSIVE
 
 
@@ -514,6 +526,9 @@
     exit_node = self.mt.get_exit_node()
     if exit_node == 0 or exit_node == '0' or not exit_node:
       plog('WARN', 'We had no exit node to test, skipping to the next test.')
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_SUCCESS
 
     if pcode - (pcode % 100) != 200:
@@ -523,6 +538,9 @@
       self.results.append(result)
       self.datahandler.saveResult(result)
       self.register_httpcode_failure(address, exit_node)
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_INCONCLUSIVE
 
     # if we have no content, we had a connection error
@@ -532,6 +550,9 @@
                               INCONCLUSIVE_NOEXITCONTENT)
       self.results.append(result)
       self.datahandler.saveResult(result)
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_INCONCLUSIVE
 
     # compare the content
@@ -642,7 +663,7 @@
 
     # The HTTP Test should remove address immediately.
     plog("WARN", "HTTP Test is removing dynamic URL "+address)
-    self.remove_target(address)
+    self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
     return TEST_FAILURE
 
 class HTMLTest(HTTPTest):
@@ -693,8 +714,8 @@
   def get_targets(self):
     return self.get_search_urls('http', self.fetch_targets) 
 
-  def remove_target(self, address):
-    HTTPTest.remove_target(self, address)
+  def remove_target(self, address, reason):
+    HTTPTest.remove_target(self, address, reason)
     if address in self.dynamic_fails: del self.dynamic_fails[address]
 
   def register_dynamic_failure(self, address, exit_node):
@@ -711,7 +732,7 @@
       if address not in self.successes: self.successes[address] = 0
       plog("NOTICE", "Excessive HTTP 3-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
 
-      self.remove_target(address)
+      self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
     else:
       plog("ERROR", self.proto+" 3-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
 
@@ -835,8 +856,9 @@
     pass
 
   def check_html(self, address):
-    # TODO: Break this out into a check_html_notags that just does a sha check
-    # FIXME: Also check+store non-tor mime types
+    # FIXME: Is there any reason not to just do SHA1 until we
+    # hit a difference, and then pull in the Soup stuff for false positives?
+    # Would eliminate a lot of semi-duplicate code...
     ''' check whether a http connection to a given address is molested '''
     plog('INFO', 'Conducting an html test with destination ' + address)
 
@@ -848,6 +870,8 @@
     # Keep a copy of the cookie jar before mods for refetch
     orig_cookie_jar = cookielib.LWPCookieJar()
     for cookie in self.cookie_jar: orig_cookie_jar.set_cookie(cookie)
+    orig_tor_cookie_jar = cookielib.LWPCookieJar()
+    for cookie in self.tor_cookie_jar: orig_tor_cookie_jar.set_cookie(cookie)
 
     elements = SoupStrainer(lambda name, attrs: name in tags_to_check or 
         len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
@@ -868,7 +892,10 @@
       if code - (code % 100) != 200:
         plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
         # Just remove it
-        self.remove_target(address)
+        self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
+        # Restore cookie jars
+        self.cookie_jar = orig_cookie_jar
+        self.tor_cookie_jar = orig_tor_cookie_jar
         return TEST_INCONCLUSIVE
 
       content = content.decode('ascii','ignore')
@@ -899,10 +926,16 @@
     except TypeError, e:
       plog('ERROR', 'Failed parsing the tag tree for ' + address)
       plog('ERROR', e)
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_INCONCLUSIVE
 
     if soup == 0:
       plog('ERROR', 'Failed to get the correct tag structure for ' + address)
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_INCONCLUSIVE
 
 
@@ -920,6 +953,9 @@
     exit_node = self.mt.get_exit_node()
     if exit_node == 0 or exit_node == '0' or not exit_node:
       plog('WARN', 'We had no exit node to test, skipping to the next test.')
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_SUCCESS
 
     if pcode - (pcode % 100) != 200:
@@ -929,6 +965,9 @@
       self.results.append(result)
       self.datahandler.saveResult(result)
       self.register_httpcode_failure(address, exit_node)
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_INCONCLUSIVE
 
     # if we have no content, we had a connection error
@@ -938,6 +977,9 @@
                               INCONCLUSIVE_NOEXITCONTENT)
       self.results.append(result)
       self.datahandler.saveResult(result)
+      # Restore cookie jars
+      self.cookie_jar = orig_cookie_jar
+      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_INCONCLUSIVE
 
     pcontent = pcontent.decode('ascii', 'ignore')