[or-cvs] r18318: {torflow} Woo hoo! A false positive filter that works! Yay! Someone sh (torflow/trunk/NetworkScanners)
mikeperry at seul.org
mikeperry at seul.org
Thu Jan 29 14:22:47 UTC 2009
Author: mikeperry
Date: 2009-01-29 09:22:47 -0500 (Thu, 29 Jan 2009)
New Revision: 18318
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
torflow/trunk/NetworkScanners/soatstats.py
Log:
Woo hoo! A false positive filter that works! Yay! Someone
should verify this isn't too permissive. Also, attributes to
filter for were updated by combining XSS filters from around
the internets.
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-01-29 13:17:43 UTC (rev 18317)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-01-29 14:22:47 UTC (rev 18318)
@@ -8,6 +8,9 @@
import pickle
import sys
import time
+import difflib
+sys.path.append("./libs")
+from BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag
import sets
from sets import Set
@@ -40,6 +43,7 @@
# Failed reasons
FAILURE_EXITONLY = "FailureExitOnly"
FAILURE_DYNAMICTAGS = "FailureDynamicTags"
+FAILURE_DYNAMICBINARY = "FailureDynamicBinary"
FAILURE_COOKIEMISMATCH = "FailureCookieMismatch"
# classes to use with pickle to dump test results into files
@@ -232,11 +236,10 @@
for root, dirs, files in os.walk(dir):
for file in files:
- if file.endswith('result'):
+ if file[:-41].endswith('result'):
fh = open(os.path.join(root, file))
result = pickle.load(fh)
results.append(result)
-
return results
def safeFilename(self, str):
@@ -272,3 +275,86 @@
pickle.dump(result, result_file)
result_file.close()
+class SoupDiffer:
+  """ Diff two soup tag sets, optionally writing diffs to outfile.
+
+  soup_old and soup_new are parsed documents exposing findAll().
+  The changed_* methods report items present in soup_new but absent
+  from soup_old; the diff_* methods return difflib.SequenceMatcher
+  objects built over the sorted items of both soups.
+  """
+  def __init__(self, soup_old, soup_new):
+    self.soup_old = soup_old
+    self.soup_new = soup_new
+
+  def _tag_strings(self, soup):
+    """ Return str() of every Tag yielded by soup.findAll() """
+    return [str(tag) for tag in soup.findAll() if isinstance(tag, Tag)]
+
+  def _content_strings(self, soup):
+    """ Return str() of every non-Tag item yielded by soup.findAll() """
+    # NOTE(review): if findAll() without arguments only ever yields Tag
+    # objects, this list is always empty -- confirm against BeautifulSoup.
+    return [str(tag) for tag in soup.findAll() if not isinstance(tag, Tag)]
+
+  def _added(self, old_items, new_items):
+    """ Return the sorted items found in new_items but not in old_items """
+    ret = list(sets.Set(new_items) - sets.Set(old_items))
+    ret.sort()
+    return ret
+
+  def _sorted_diff(self, old_items, new_items):
+    """ Return a SequenceMatcher over sorted copies of both item lists """
+    # SequenceMatcher indexes its inputs, so it must be handed lists
+    # (not Sets); sorting makes the comparison independent of page order.
+    old_items = list(old_items)
+    new_items = list(new_items)
+    old_items.sort()
+    new_items.sort()
+    return difflib.SequenceMatcher(None, old_items, new_items)
+
+  def changed_tags(self):
+    """ Return a list of tags changed or added to soup_new as strings """
+    return self._added(self._tag_strings(self.soup_old),
+                       self._tag_strings(self.soup_new))
+
+  def _get_attributes(self):
+    """ Return (attr_old, attr_new): the attrs entries of every tag in
+        each soup, flattened into one list per soup """
+    attr_old = []
+    for tag in self.soup_old.findAll():
+      attr_old.extend(tag.attrs)
+    attr_new = []
+    for tag in self.soup_new.findAll():
+      attr_new.extend(tag.attrs)
+    return (attr_old, attr_new)
+
+  def changed_attributes(self):
+    """ Return a list of attributes added to soup_new """
+    (attr_old, attr_new) = self._get_attributes()
+    return self._added(attr_old, attr_new)
+
+  def changed_content(self):
+    """ Return a list of tag contents changed in soup_new """
+    return self._added(self._content_strings(self.soup_old),
+                       self._content_strings(self.soup_new))
+
+  def diff_tags(self):
+    """ Return a SequenceMatcher over the sorted tag strings """
+    return self._sorted_diff(self._tag_strings(self.soup_old),
+                             self._tag_strings(self.soup_new))
+
+  def diff_attributes(self):
+    """ Return a SequenceMatcher over the sorted attribute entries """
+    (attr_old, attr_new) = self._get_attributes()
+    return self._sorted_diff(attr_old, attr_new)
+
+  def diff_content(self):
+    """ Return a SequenceMatcher over the sorted non-tag content strings """
+    # Previously built from Sets, which SequenceMatcher cannot index;
+    # use sorted lists like diff_tags() and diff_attributes() do.
+    return self._sorted_diff(self._content_strings(self.soup_old),
+                             self._content_strings(self.soup_new))
+
+  def __str__(self):
+    out = "Tags:\n"+"\n".join(self.changed_tags())
+    out += "\n\nAttrs:\n"
+    for a in self.changed_attributes():
+      out += a[0]+"="+a[1]+"\n"
+    out += "\n\nContent:\n"+"\n".join(map(str, self.changed_content()))
+    return out
+
+  def write_diff(self, outfile):
+    """ Write the str() summary of this diff to the file named outfile """
+    f = open(outfile, "w")
+    try:
+      f.write(str(self))
+    finally:
+      # Close the handle even if rendering or writing fails
+      f.close()
+
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-01-29 13:17:43 UTC (rev 18317)
+++ torflow/trunk/NetworkScanners/soat.py 2009-01-29 14:22:47 UTC (rev 18318)
@@ -59,6 +59,7 @@
from OpenSSL import *
sys.path.append("./libs/")
+# XXX: Try to determine if we should be using MinimalSoup
from BeautifulSoup.BeautifulSoup import BeautifulSoup, SoupStrainer, Tag
from SocksiPy import socks
import Pyssh.pyssh
@@ -82,7 +83,7 @@
firefox_headers = {
- 'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0',
+ 'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language':"en-us,en;q=0.5",
'Accept-Encoding':"gzip,deflate",
@@ -158,9 +159,6 @@
# Tags and attributes to check in the http test.
# The general idea is to grab tags with attributes known
# to either hold script, or cause automatic network actvitity
-# See: http://www.w3.org/TR/REC-html40/index/attributes.html
-# http://www.w3.org/TR/REC-html40/index/elements.html
-# and http://ha.ckers.org/xss.html
# Note: the more we add, the greater the potential for false positives...
# We also only care about the ones that work for FF2/FF3.
@@ -170,24 +168,55 @@
# Unfortunately, Google munges its javascript, so we have to do
# some more advanced processing to reach that goal..
# Also, I'm somewhat torn on dropping 'a' tags..
-tags_to_check = ['a', 'applet', 'area', 'base', 'body', 'embed', 'form',
+tags_to_check = ['a', 'applet', 'area', 'base', 'embed', 'form',
'frame', 'iframe', 'img', 'input', 'link', 'meta',
- 'object', 'script', 'style']
+ 'object', 'script', 'style', 'layer', 'ilayer']
tags_preserve_inner = ['script','style']
-attrs_to_check = ['background', 'cite', 'classid', 'codebase', 'data',
- 'longdesc', 'onblur',
- 'onchange', 'onclick', 'ondblclick', 'onfocus', 'onkeydown',
- 'onkeypress', 'onkeyup','onload', 'onmousedown', 'onmousemove',
- 'onmouseout', 'onmouseover','onmouseup', 'onreset', 'onselect',
- 'onsubmit', 'onunload', 'profile', 'src', 'usemap']
+
+# Merged from:
+# http://www.w3.org/TR/REC-html40/index/attributes.html
+# http://www.w3.org/TR/REC-html40/index/elements.html
+# http://web.archive.org/web/20060113072810/www.mozilla.org/docs/dom/domref/dom_event_ref33.html
+# http://scrivna.com/blog/2008/09/18/php-xss-filtering-function/
+# https://svn.typo3.org/TYPO3v4/Core/trunk/typo3/contrib/RemoveXSS/RemoveXSS.php
+# http://www.expertzzz.com/Downloadz/view/3424
+# http://kallahar.com/smallprojects/php_xss_filter_function.php
+# and http://ha.ckers.org/xss.html
+attrs_to_check = ['background', 'cite', 'classid', 'codebase', 'data',
+'longdesc', 'onabort', 'onactivate', 'onafterprint', 'onafterupdate',
+'onattrmodified', 'onbeforeactivate', 'onbeforecopy', 'onbeforecut',
+'onbeforedeactivate', 'onbeforeeditfocus', 'onbeforepaste', 'onbeforeprint',
+'onbeforeunload', 'onbeforeupdate', 'onblur', 'onbounce', 'onbroadcast',
+'oncellchange', 'onchange', 'oncharacterdatamodified', 'onclick', 'onclose',
+'oncommand', 'oncommandupdate', 'oncontextmenu', 'oncontrolselect', 'oncopy',
+'oncut', 'ondataavaible', 'ondataavailable', 'ondatasetchanged',
+'ondatasetcomplete', 'ondblclick', 'ondeactivate', 'ondrag', 'ondragdrop',
+'ondragend', 'ondragenter', 'ondragexit', 'ondraggesture', 'ondragleave',
+'ondragover', 'ondragstart', 'ondrop', 'onerror', 'onerrorupdate',
+'onfilterchange', 'onfilterupdate', 'onfinish', 'onfocus', 'onfocusin',
+'onfocusout', 'onhelp', 'oninput', 'onkeydown', 'onkeypress', 'onkeyup',
+'onlayoutcomplete', 'onload', 'onlosecapture', 'onmousedown', 'onmouseenter',
+'onmouseleave', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup',
+'onmousewheel', 'onmove', 'onmoveend', 'onmoveout', 'onmovestart',
+'onnodeinserted', 'onnodeinsertedintodocument', 'onnoderemoved',
+'onnoderemovedfromdocument', 'onoverflowchanged', 'onpaint', 'onpaste',
+'onpopupHidden', 'onpopupHiding', 'onpopupShowing', 'onpopupShown',
+'onpropertychange', 'onreadystatechange', 'onreset', 'onresize',
+'onresizeend', 'onresizestart', 'onrowenter', 'onrowexit', 'onrowsdelete',
+'onrowsinserted', 'onscroll', 'onselect', 'onselectionchange',
+'onselectstart', 'onstart', 'onstop', 'onsubmit', 'onsubtreemodified',
+'ontext', 'onunderflow', 'onunload', 'overflow', 'profile', 'src', 'style',
+'usemap']
+attrs_to_check_map = {}
+for a in attrs_to_check: attrs_to_check_map[a]=1
-attrs_to_prune = ['alt', 'label', 'prompt' 'standby', 'summary', 'title',
+# 'prompt' and 'standby' need a comma between them; without it the two
+# adjacent string literals fuse into a single 'promptstandby' entry.
+attrs_to_prune = ['alt', 'label', 'prompt', 'standby', 'summary', 'title',
'abbr']
-
+# For recursive fetching of urls:
tags_to_recurse = ['a', 'applet', 'embed', 'frame', 'iframe', #'img',
- 'link', 'object', 'script']
-recurse_html = ['frame', 'iframe']
-attrs_to_recurse = ['background', 'classid', 'codebase', 'data', 'href',
+ 'link', 'object', 'script', 'layer', 'ilayer']
+recurse_html = ['frame', 'iframe', 'layer', 'ilayer']
+attrs_to_recurse = ['background', 'codebase', 'data', 'href',
'pluginurl', 'src']
#
@@ -237,6 +266,7 @@
traceback.print_exc()
return (0, "")
+ # TODO: Consider also returning mime type here
return (reply.code, content)
class Test:
@@ -379,12 +409,10 @@
def __init__(self, mt, wordlist, filetypes=scan_filetypes):
SearchBasedTest.__init__(self, mt, "HTTP", 80, wordlist)
self.fetch_targets = 5
- self.three_way_fails = {}
self.httpcode_fails = {}
- self.two_way_fails = {}
+ self.exit_fails = {}
self.successes = {}
- self.three_way_limit = 10
- self.two_way_limit = 100
+ self.exit_limit = 100
self.httpcode_limit = 100
self.scan_filetypes = filetypes
self.results = []
@@ -446,28 +474,28 @@
def remove_target(self, address):
SearchBasedTest.remove_target(self, address)
- del self.httpcode_limit[address]
- del self.three_way_limit[address]
- del self.successes[address]
- del self.two_way_limit[address]
+ if address in self.httpcode_fails: del self.httpcode_fails[address]
+ if address in self.successes: del self.successes[address]
+ if address in self.exit_fails: del self.exit_fails[address]
kill_results = []
for r in self.results:
if r.site == address:
kill_results.append(r)
for r in kill_results:
+ # XXX: Move files instead of removing them..
#r.remove_files()
self.results.remove(r)
def register_exit_failure(self, address, exit_node):
- if address in self.two_way_fails:
- self.two_way_fails[address].add(exit_node)
+ if address in self.exit_fails:
+ self.exit_fails[address].add(exit_node)
else:
- self.two_way_fails[address] = sets.Set([exit_node])
+ self.exit_fails[address] = sets.Set([exit_node])
# TODO: Do something if abundance of succesful tests?
# Problem is this can still trigger for localized content
- err_cnt = len(self.two_way_fails[address])
- if err_cnt > self.two_way_limit:
+ err_cnt = len(self.exit_fails[address])
+ if err_cnt > self.exit_limit:
if address not in self.successes: self.successes[address] = 0
plog("NOTICE", "Excessive HTTP 2-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
@@ -493,23 +521,6 @@
else:
plog("ERROR", self.proto+" http error code failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
- def register_dynamic_failure(self, address, exit_node):
- if address in self.three_way_fails:
- self.three_way_fails[address].add(exit_node)
- else:
- self.three_way_fails[address] = sets.Set([exit_node])
-
- err_cnt = len(self.three_way_fails[address])
- if err_cnt > self.three_way_limit:
- # Remove all associated data for this url.
- # (Note, this also seems to imply we should report BadExit in bulk,
- # after we've had a chance for these false positives to be weeded out)
- if address not in self.successes: self.successes[address] = 0
- plog("NOTICE", "Excessive HTTP 3-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
-
- self.remove_target(address)
- else:
- plog("ERROR", self.proto+" 3-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
def check_http(self, address):
''' check whether a http connection to a given address is molested '''
@@ -648,7 +659,7 @@
exit_content_file.close()
result = HttpTestResult(exit_node, address, TEST_FAILURE,
- FAILURE_DYNAMICTAGS, sha1sum_new.hexdigest(),
+ FAILURE_DYNAMICBINARY, sha1sum_new.hexdigest(),
psha1sum.hexdigest(), new_content_file.name,
exit_content_file.name,
content_prefix+'.content-old',
@@ -656,7 +667,9 @@
self.results.append(result)
self.datahandler.saveResult(result)
- self.register_dynamic_failure(address, exit_node)
+ # The HTTP Test should remove address immediately.
+ plog("NOTICE", "HTTP Test is removing dynamic URL "+address)
+ self.remove_target(address)
return TEST_FAILURE
class HTMLTest(HTTPTest):
@@ -667,6 +680,8 @@
self.min_targets = 9
self.recurse_filetypes = recurse_filetypes
self.fetch_queue = Queue.Queue()
+ self.dynamic_fails = {}
+ self.dynamic_limit = 10
def run_test(self):
# A single test should have a single cookie jar
@@ -698,6 +713,28 @@
def get_targets(self):
return self.get_search_urls('http', self.fetch_targets)
+  def remove_target(self, address):
+    """ Remove address as a target and forget its dynamic failure record """
+    HTTPTest.remove_target(self, address)
+    # pop() with a default is a no-op for addresses that never failed
+    self.dynamic_fails.pop(address, None)
+
+  def register_dynamic_failure(self, address, exit_node):
+    """ Record that address failed dynamically when fetched via exit_node.
+
+    Failures are tracked as a set of exit nodes per address. While the
+    number of distinct exits stays within dynamic_limit the failure is
+    only logged; beyond it the address is removed as a target.
+    """
+    failed_exits = self.dynamic_fails.setdefault(address, sets.Set())
+    failed_exits.add(exit_node)
+    err_cnt = len(failed_exits)
+
+    if err_cnt <= self.dynamic_limit:
+      plog("ERROR", self.proto+" 3-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
+      return
+
+    # Remove all associated data for this url.
+    # (Note, this also seems to imply we should report BadExit in bulk,
+    # after we've had a chance for these false positives to be weeded out)
+    if address not in self.successes:
+      self.successes[address] = 0
+    plog("NOTICE", "Excessive HTTP 3-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
+    self.remove_target(address)
+
def _add_recursive_targets(self, soup, orig_addr):
# XXX: Watch for spider-traps! (ie mutually sourcing iframes)
# Only pull at most one filetype from the list of 'a' links
@@ -732,7 +769,7 @@
if str(tag.name) in tags_to_check:
return False
for attr in tag.attrs:
- if attr[0] in attrs_to_check:
+ if attr[0] in attrs_to_check_map:
return False
return True
@@ -760,7 +797,7 @@
for tag in to_extract:
tag.extract()
return soup
-
+
def check_html(self, address):
''' check whether a http connection to a given address is molested '''
plog('INFO', 'Conducting an html test with destination ' + address)
@@ -925,33 +962,55 @@
else: self.successes[address]=1
return TEST_SUCCESS
- # TODO: Can we create some kind of diff/masking filter
- # between the two non-Tor soups, and apply it to the
- # Tor soup, to see if anything additional has changed?
- # http://bramcohen.livejournal.com/37690.html
- # -> patiencediff.py vs difflib
- # "For small files difflib wins". And it's standard. Yay!
- tor_v_new = difflib.SequenceMatcher(lambda x: x == " ", str(psoup), str(soup_new))
- tor_v_orig = difflib.SequenceMatcher(lambda x: x == " ", str(psoup), str(soup))
- orig_v_new = difflib.SequenceMatcher(lambda x: x == " ", str(soup), str(soup_new))
+ # Lets try getting just the tag differences
+ # 1. Take difference between old and new tags both ways
+ # 2. Make map of tags that change to their attributes
+ # 3. Compare list of changed tags for tor vs new and
+ # see if any extra tags changed or if new attributes
+ # were added to additional tags
+ old_vs_new = SoupDiffer(soup, soup_new)
+ new_vs_old = SoupDiffer(soup_new, soup)
+ new_vs_tor = SoupDiffer(soup_new, psoup)
- # The key property is that the differences between the two non-tor fetches
- # match the differences between the Tor and the regular fetches
+ changed_tags = {}
+ # I'm an evil man and I'm going to CPU hell..
+ for tags in map(BeautifulSoup, old_vs_new.changed_tags()):
+ for t in tags.findAll():
+ if t.name not in changed_tags:
+ changed_tags[t.name] = sets.Set([])
+ for attr in t.attrs:
+ changed_tags[t.name].add(attr[0])
+ for tags in map(BeautifulSoup, new_vs_old.changed_tags()):
+ for t in tags.findAll():
+ if t.name not in changed_tags:
+ changed_tags[t.name] = sets.Set([])
+ for attr in t.attrs:
+ changed_tags[t.name].add(attr[0])
+
+ # Content counts as dynamic if it differs in either direction between
+ # the two non-Tor fetches, so both differs must be consulted.
+ changed_content = bool(old_vs_new.changed_content() or new_vs_old.changed_content())
- plog("NOTICE", "Diffing charcateristics: "+str((orig_v_new.get_opcodes()==tor_v_orig.get_opcodes(),
- orig_v_new.get_matching_blocks()==tor_v_orig.get_matching_blocks(),
- orig_v_new.get_opcodes()==tor_v_new.get_opcodes(),
- orig_v_new.get_matching_blocks()==tor_v_new.get_matching_blocks())))
+ false_positive = True
+ for tags in map(BeautifulSoup, new_vs_tor.changed_tags()):
+ for t in tags.findAll():
+ if t.name not in changed_tags:
+ false_positive = False
+ else:
+ for attr in t.attrs:
+ if attr[0] not in changed_tags[t.name]:
+ false_positive = False
- diff_file = open(failed_prefix+'.diffs.'+exit_node[1:],'w')
- diff_file.write("orig_v_new.get_matching_blocks() =\n\t"+str(orig_v_new.get_matching_blocks())+"\n")
- diff_file.write("orig_v_new.get_opcodes() =\n\t"+str(orig_v_new.get_opcodes())+"\n\n")
- diff_file.write("tor_v_new.get_matching_blocks() =\n\t"+str(tor_v_new.get_matching_blocks())+"\n")
- diff_file.write("tor_v_new.get_opcodes() =\n\t"+str(tor_v_new.get_opcodes())+"\n\n")
- diff_file.write("tor_v_orig.get_matching_blocks() =\n\t"+str(tor_v_orig.get_matching_blocks())+"\n")
- diff_file.write("tor_v_orig.get_opcodes() =\n\t"+str(tor_v_orig.get_opcodes())+"\n\n")
- diff_file.close()
+ if new_vs_tor.changed_content() and not changed_content:
+ false_positive = False
+ if false_positive:
+ plog("NOTICE", "False positive detected for dynamic change at "+address+" via "+exit_node)
+ result = HtmlTestResult(exit_node, address, TEST_SUCCESS)
+ self.results.append(result)
+ #self.datahandler.saveResult(result)
+ if address in self.successes: self.successes[address]+=1
+ else: self.successes[address]=1
+ return TEST_SUCCESS
+
# XXX: Check for existence of this file before overwriting
exit_tag_file = open(failed_prefix+'.dyn-tags.'+exit_node[1:],'w')
exit_tag_file.write(str(psoup))
Modified: torflow/trunk/NetworkScanners/soatstats.py
===================================================================
--- torflow/trunk/NetworkScanners/soatstats.py 2009-01-29 13:17:43 UTC (rev 18317)
+++ torflow/trunk/NetworkScanners/soatstats.py 2009-01-29 14:22:47 UTC (rev 18318)
@@ -96,6 +96,54 @@
if node.counts[test].inconclusive != 0:
print `node.idhex` + "\t" + `node.counts[test].inconclusive`
+
+ # False positive test left in for verification and tweaking
+ # TODO: Remove this bit eventually
+ for result in data:
+ if result.__class__.__name__ == "HtmlTestResult":
+ if not result.tags_old or not result.tags or not result.exit_tags:
+ continue
+ new_vs_old = SoupDiffer(BeautifulSoup(open(result.tags, "r").read()),
+ BeautifulSoup(open(result.tags_old,
+ "r").read()))
+ old_vs_new = SoupDiffer(BeautifulSoup(open(result.tags_old, "r").read()),
+ BeautifulSoup(open(result.tags,
+ "r").read()))
+ new_vs_tor = SoupDiffer(BeautifulSoup(open(result.tags, "r").read()),
+ BeautifulSoup(open(result.exit_tags,
+ "r").read()))
+ changed_tags = {}
+ # I'm an evil man and I'm going to CPU hell..
+ for tags in map(BeautifulSoup, old_vs_new.changed_tags()):
+ for t in tags.findAll():
+ if t.name not in changed_tags:
+ changed_tags[t.name] = sets.Set([])
+ for attr in t.attrs:
+ changed_tags[t.name].add(attr[0])
+ for tags in map(BeautifulSoup, new_vs_old.changed_tags()):
+ for t in tags.findAll():
+ if t.name not in changed_tags:
+ changed_tags[t.name] = sets.Set([])
+ for attr in t.attrs:
+ changed_tags[t.name].add(attr[0])
+
+ # Content counts as dynamic if it differs in either direction between
+ # the old and new tag files, so both differs must be consulted.
+ changed_content = bool(old_vs_new.changed_content() or new_vs_old.changed_content())
+
+ false_positive = True
+ for tags in map(BeautifulSoup, new_vs_tor.changed_tags()):
+ for t in tags.findAll():
+ if t.name not in changed_tags:
+ false_positive = False
+ else:
+ for attr in t.attrs:
+ if attr[0] not in changed_tags[t.name]:
+ false_positive = False
+
+ if new_vs_tor.changed_content() and not changed_content:
+ false_positive = False
+
+ print false_positive
+
print ""
if __name__ == "__main__":
More information about the tor-commits
mailing list