[or-cvs] r18444: {torflow} Improve snakeinspector.py to diff JS/HTML content. After ins (torflow/trunk/NetworkScanners)
mikeperry at seul.org
mikeperry at seul.org
Mon Feb 9 12:04:02 UTC 2009
Author: mikeperry
Date: 2009-02-09 07:04:01 -0500 (Mon, 09 Feb 2009)
New Revision: 18444
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/snakeinspector.py
torflow/trunk/NetworkScanners/soat.py
Log:
Improve snakeinspector.py to diff JS/HTML content. After
inspecting some results, try to reduce false positives by
using Tor cookies for second Non-Tor fetch. Also flatten Soup
tag structure to make filtering of changing tags less
blinding.
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-02-09 10:36:42 UTC (rev 18443)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-02-09 12:04:01 UTC (rev 18444)
@@ -9,8 +9,9 @@
import sys
import time
import traceback
+import difflib
sys.path.append("./libs")
-from BeautifulSoup.BeautifulSoup import Tag
+from BeautifulSoup.BeautifulSoup import Tag, SoupStrainer
import sets
@@ -66,7 +67,8 @@
self.reason = reason
self.false_positive=False
self.false_positive_reason="None"
-
+ self.verbose=False
+
def mark_false_positive(self, reason):
pass
@@ -196,14 +198,29 @@
except: pass
def __str__(self):
- # XXX: Re-run the JSDiffer and compare these differences
ret = TestResult.__str__(self)
- if self.content:
- ret += " "+self.content+"\n"
- if self.content_old:
- ret += " "+self.content_old+"\n"
- if self.content_exit:
- ret += " "+self.content_exit+"\n"
+ if self.verbose:
+ if self.content and self.content_old:
+ diff = difflib.unified_diff(open(self.content).read().split("\n"),
+ open(self.content_old).read().split("\n"),
+ "Non-Tor1", "Non-Tor2",
+ lineterm="")
+ for line in diff:
+ ret+=line+"\n"
+ if self.content and self.content_exit:
+ diff = difflib.unified_diff(open(self.content).read().split("\n"),
+ open(self.content_exit).read().split("\n"),
+ "Non-Tor", "Exit",
+ lineterm="")
+ for line in diff:
+ ret+=line+"\n"
+ else:
+ if self.content:
+ ret += " "+self.content+"\n"
+ if self.content_old:
+ ret += " "+self.content_old+"\n"
+ if self.content_exit:
+ ret += " "+self.content_exit+"\n"
return ret
class HtmlTestResult(TestResult):
@@ -232,14 +249,37 @@
except: pass
def __str__(self):
- # XXX: Re-run the SoupDiffer+JSDiffer and compare these differences
ret = TestResult.__str__(self)
- if self.content:
- ret += " "+self.content+"\n"
- if self.content_old:
- ret += " "+self.content_old+"\n"
- if self.content_exit:
- ret += " "+self.content_exit+"\n"
+ if self.verbose:
+ if self.content and self.content_old:
+ content = open(self.content).read().decode('ascii', 'ignore')
+ content_old = open(self.content_old).read().decode('ascii', 'ignore')
+ soup = FullyStrainedSoup(content)
+ old_soup = FullyStrainedSoup(content_old)
+ tags = map(str, soup.findAll())
+ old_tags = map(str, old_soup.findAll())
+ diff = difflib.unified_diff(tags, old_tags, "Non-Tor1", "Non-Tor2",
+ lineterm="")
+ for line in diff:
+ ret+=line+"\n"
+ if self.content and self.content_exit:
+ content = open(self.content).read().decode('ascii', 'ignore')
+ content_exit = open(self.content_exit).read().decode('ascii', 'ignore')
+ soup = FullyStrainedSoup(content)
+ tor_soup = FullyStrainedSoup(content_exit)
+ tags = map(str, soup.findAll())
+ tor_tags = map(str, tor_soup.findAll())
+ diff = difflib.unified_diff(tags, tor_tags, "Non-Tor", "Exit",
+ lineterm="")
+ for line in diff:
+ ret+=line+"\n"
+ else:
+ if self.content:
+ ret += " "+self.content+"\n"
+ if self.content_old:
+ ret += " "+self.content_old+"\n"
+ if self.content_exit:
+ ret += " "+self.content_exit+"\n"
return ret
class SSHTestResult(TestResult):
@@ -402,6 +442,53 @@
pickle.dump(result, result_file)
result_file.close()
+
+# These three bits are needed to fully recursively strain the parsed soup.
+# For some reason, the SoupStrainer does not get applied recursively..
+__first_strainer = SoupStrainer(lambda name, attrs: name in tags_to_check or
+ len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
+
+def __tag_not_worthy(tag):
+ if tag.name in tags_to_check:
+ return False
+ for attr in tag.attrs:
+ if attr[0] in attrs_to_check_map:
+ return False
+ return True
+
+def FullyStrainedSoup(html):
+ """ Remove all tags that are of no interest. Also remove content """
+ soup = TheChosenSoup(html, __first_strainer)
+ to_extract = []
+ for tag in soup.findAll():
+ to_prune = []
+ for attr in tag.attrs:
+ if attr[0] in attrs_to_prune:
+ to_prune.append(attr)
+ for attr in to_prune:
+ tag.attrs.remove(attr)
+ if __tag_not_worthy(tag):
+ to_extract.append(tag)
+ if tag.name not in tags_preserve_inner:
+ for child in tag.childGenerator():
+ if not isinstance(child, Tag) or __tag_not_worthy(child):
+ to_extract.append(child)
+ for tag in to_extract:
+ if isinstance(tag, Tag):
+ parent = tag.findParent()
+ for child in tag.findChildren():
+ parent.append(child)
+ for tag in to_extract:
+ tag.extract()
+ # Also flatten the tag structure
+ flattened_tags = soup.findAll()
+ for tag in flattened_tags:
+ if isinstance(tag, Tag): # Don't extract script/CSS strings.
+ tag.extract()
+ for tag in flattened_tags:
+ soup.append(tag)
+ return soup
+
class SoupDiffer:
""" Diff two soup tag sets, optionally writing diffs to outfile. """
def __init__(self, soup_old, soup_new):
Modified: torflow/trunk/NetworkScanners/snakeinspector.py
===================================================================
--- torflow/trunk/NetworkScanners/snakeinspector.py 2009-02-09 10:36:42 UTC (rev 18443)
+++ torflow/trunk/NetworkScanners/snakeinspector.py 2009-02-09 12:04:01 UTC (rev 18444)
@@ -31,7 +31,8 @@
results = [dh.getResult(argv[1])]
for r in results:
- if r.status == TEST_FAILURE:
+ r.verbose = True
+ if r.status == TEST_FAILURE and r.reason == "FailureExitOnly":
print r
print "\n-----------------------------\n"
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-02-09 10:36:42 UTC (rev 18443)
+++ torflow/trunk/NetworkScanners/soat.py 2009-02-09 12:04:01 UTC (rev 18444)
@@ -578,8 +578,10 @@
# if content doesn't match, update the direct content and use new cookies
# If we have alternate IPs to bind to on this box, use them?
# Sometimes pages have the client IP encoded in them..
+ # Also, use the Tor cookies, since those identifiers are
+ # probably embedded in the Tor page as well.
BindingSocket.bind_to = refetch_ip
- (code_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_cookie_jar, self.headers)
+ (code_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_tor_cookie_jar, self.headers)
BindingSocket.bind_to = None
if not content_new:
@@ -601,9 +603,12 @@
# Need to do set subtraction and only save new cookies..
# or extract/make_cookies
+
self.cookie_jar = orig_cookie_jar
new_cookie_jar = cookielib.MozillaCookieJar()
- for cookie in new_cookies_new: new_cookie_jar.set_cookie(cookie)
+ for cookie in new_cookies_new:
+ new_cookie_jar.set_cookie(cookie)
+ self.cookie_jar.set_cookie(cookie) # Update..
os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
try:
new_cookie_jar.save(content_prefix+'.cookies')
@@ -784,41 +789,7 @@
self.fetch_queue.put_nowait(i)
else:
plog("NOTICE", "Skipping "+i[0]+" target: "+i[1])
-
-
- def _tag_not_worthy(self, tag):
- if tag.name in tags_to_check:
- return False
- for attr in tag.attrs:
- if attr[0] in attrs_to_check_map:
- return False
- return True
- def _recursive_strain(self, soup):
- """ Remove all tags that are of no interest. Also remove content """
- to_extract = []
- for tag in soup.findAll():
- to_prune = []
- for attr in tag.attrs:
- if attr[0] in attrs_to_prune:
- to_prune.append(attr)
- for attr in to_prune:
- tag.attrs.remove(attr)
- if self._tag_not_worthy(tag):
- to_extract.append(tag)
- if tag.name not in tags_preserve_inner:
- for child in tag.childGenerator():
- if not isinstance(child, Tag) or self._tag_not_worthy(child):
- to_extract.append(child)
- for tag in to_extract:
- if isinstance(tag, Tag):
- parent = tag.findParent()
- for child in tag.findChildren():
- parent.append(child)
- for tag in to_extract:
- tag.extract()
- return soup
-
def check_js(self, address):
plog('INFO', 'Conducting a js test with destination ' + address)
ret = self.check_http_nodynamic(address)
@@ -872,15 +843,9 @@
content_prefix = http_content_dir+address_file
failed_prefix = http_failed_dir+address_file
- elements = SoupStrainer(lambda name, attrs: name in tags_to_check or
- len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
+ orig_soup = FullyStrainedSoup(orig_html.decode('ascii', 'ignore'))
+ tor_soup = FullyStrainedSoup(tor_html.decode('ascii', 'ignore'))
- orig_soup = self._recursive_strain(TheChosenSoup(orig_html.decode('ascii',
- 'ignore'), parseOnlyThese=elements))
-
- tor_soup = self._recursive_strain(TheChosenSoup(tor_html.decode('ascii',
- 'ignore'), parseOnlyThese=elements))
-
# Also find recursive urls
recurse_elements = SoupStrainer(lambda name, attrs:
name in tags_to_recurse and
@@ -908,9 +873,8 @@
self.datahandler.saveResult(result)
return TEST_INCONCLUSIVE
+ new_soup = FullyStrainedSoup(content_new)
- new_soup = self._recursive_strain(TheChosenSoup(content_new,
- parseOnlyThese=elements))
# compare the new and old content
# if they match, means the node has been changing the content
if str(orig_soup) == str(new_soup):
More information about the tor-commits
mailing list