[or-cvs] r18300: {torflow} Add ability to prune attributes filled with (often localized) text (torflow/trunk/NetworkScanners)
mikeperry at seul.org
mikeperry at seul.org
Wed Jan 28 17:11:52 UTC 2009
Author: mikeperry
Date: 2009-01-28 12:11:52 -0500 (Wed, 28 Jan 2009)
New Revision: 18300
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
Log:
Add ability to prune attributes filled with (often localized)
text. Also print out some information on difflibing seemingly
dynamic content to see if we can detect automatic changes
between fetches.
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-01-28 16:30:06 UTC (rev 18299)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-01-28 17:11:52 UTC (rev 18300)
@@ -74,6 +74,7 @@
self.content_exit = content_exit
self.content_old = content_old
+ # XXX: Instead of removing these, move them to a 'falsepositives' dir
def remove_files(self):
try: os.unlink(self.content)
except: pass
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-01-28 16:30:06 UTC (rev 18299)
+++ torflow/trunk/NetworkScanners/soat.py 2009-01-28 17:11:52 UTC (rev 18300)
@@ -43,6 +43,7 @@
import cookielib
import sha
import Queue
+import difflib
from libsoat import *
@@ -179,6 +180,8 @@
'onkeypress', 'onkeyup','onload', 'onmousedown', 'onmousemove',
'onmouseout', 'onmouseover','onmouseup', 'onreset', 'onselect',
'onsubmit', 'onunload', 'profile', 'src', 'usemap']
+attrs_to_prune = ['alt', 'label', 'prompt', 'standby', 'summary', 'title',
+ 'abbr'] # attributes holding free-form (often localized) text; 'prompt' and 'standby' must be separate entries
tags_to_recurse = ['a', 'applet', 'embed', 'frame', 'iframe', #'img',
@@ -559,6 +562,8 @@
sha1sum.update(buf)
buf = content_file.read(4096)
content_file.close()
+
+ self.cookie_jar.load(content_prefix+'.cookies', 'w')
except IOError:
(code, content) = http_request(address, self.cookie_jar, self.headers)
@@ -570,6 +575,8 @@
content_file = open(content_prefix+'.content', 'w')
content_file.write(content)
content_file.close()
+
+ self.cookie_jar.save(content_prefix+'.cookies', 'w')
except TypeError, e:
plog('ERROR', 'Failed obtaining the shasum for ' + address)
@@ -621,6 +628,9 @@
new_content_file = open(content_prefix+'.content', 'w')
new_content_file.write(content_new)
new_content_file.close()
+
+ os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
+ self.cookie_jar.save(content_prefix+'.cookies', 'w')
# compare the node content and the new content
# if it matches, everything is ok
@@ -730,6 +740,12 @@
""" Remove all tags that are of no interest. Also remove content """
to_extract = []
for tag in soup.findAll():
+ to_prune = []
+ for attr in tag.attrs:
+ if attr[0] in attrs_to_prune:
+ to_prune.append(attr)
+ for attr in to_prune:
+ tag.attrs.remove(attr)
if self._tag_not_worthy(tag):
to_extract.append(tag)
if tag.name not in tags_preserve_inner:
@@ -804,6 +820,8 @@
soup = BeautifulSoup(tag_file.read())
tag_file.close()
+ self.cookie_jar.load(content_prefix+'.cookies', 'w')
+
except IOError:
(code, content) = http_request(address, self.cookie_jar, self.headers)
content = content.decode('ascii','ignore')
@@ -816,6 +834,8 @@
tag_file.write(string_soup)
tag_file.close()
+ self.cookie_jar.save(content_prefix+'.cookies', 'w')
+
content_file = open(content_prefix+'.content', 'w')
content_file.write(content)
content_file.close()
@@ -881,6 +901,9 @@
tag_file = open(content_prefix+'.tags', 'w')
tag_file.write(string_soup_new)
tag_file.close()
+
+ os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
+ self.cookie_jar.save(content_prefix+'.cookies', 'w')
os.rename(content_prefix+'.content', content_prefix+'.content-old')
new_content_file = open(content_prefix+'.content', 'w')
@@ -903,10 +926,30 @@
# http://bramcohen.livejournal.com/37690.html
# -> patiencediff.py vs difflib
# "For small files difflib wins". And it's standard. Yay!
+ tor_v_new = difflib.SequenceMatcher(lambda x: x == " ", str(psoup), str(soup_new))
+ tor_v_orig = difflib.SequenceMatcher(lambda x: x == " ", str(psoup), str(soup))
+ orig_v_new = difflib.SequenceMatcher(lambda x: x == " ", str(soup), str(soup_new))
+ # The key property is that the differences between the two non-tor fetches
+ # match the differences between the Tor and the regular fetches
+
+ plog("NOTICE", "Diffing charcateristics: "+str((orig_v_new.get_opcodes()==tor_v_orig.get_opcodes(),
+ orig_v_new.get_matching_blocks()==tor_v_orig.get_matching_blocks(),
+ orig_v_new.get_opcodes()==tor_v_new.get_opcodes(),
+ orig_v_new.get_matching_blocks()==tor_v_new.get_matching_blocks())))
+
+ diff_file = open(failed_prefix+'.diffs.'+exit_node[1:],'w')
+ diff_file.write("orig_v_new.get_matching_blocks() =\n\t"+str(orig_v_new.get_matching_blocks())+"\n")
+ diff_file.write("orig_v_new.get_opcodes() =\n\t"+str(orig_v_new.get_opcodes())+"\n\n")
+ diff_file.write("tor_v_new.get_matching_blocks() =\n\t"+str(tor_v_new.get_matching_blocks())+"\n")
+ diff_file.write("tor_v_new.get_opcodes() =\n\t"+str(tor_v_new.get_opcodes())+"\n\n")
+ diff_file.write("tor_v_orig.get_matching_blocks() =\n\t"+str(tor_v_orig.get_matching_blocks())+"\n")
+ diff_file.write("tor_v_orig.get_opcodes() =\n\t"+str(tor_v_orig.get_opcodes())+"\n\n")
+ diff_file.close()
+
# XXX: Check for existence of this file before overwriting
exit_tag_file = open(failed_prefix+'.dyn-tags.'+exit_node[1:],'w')
- exit_tag_file.write(psoup.__str__())
+ exit_tag_file.write(str(psoup))
exit_tag_file.close()
exit_content_file = open(failed_prefix+'.dyn-content.'+exit_node[1:], 'w')
More information about the tor-commits
mailing list