[or-cvs] r19246: {torflow} Track HTTP header changes and properly classify urllib timeouts (torflow/trunk/NetworkScanners)
mikeperry at seul.org
Thu Apr 9 00:41:26 UTC 2009
Author: mikeperry
Date: 2009-04-08 20:41:26 -0400 (Wed, 08 Apr 2009)
New Revision: 19246
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
Log:
Track HTTP header changes and properly classify urllib
timeouts as such.
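
In outline, the new HeaderDiffer class (added to libsoat.py below) keeps a
pool of (name, value) header pairs seen across clean fetches, plus the set of
header names whose values changed between those fetches; only headers outside
both are flagged. A distilled, self-contained sketch of that set logic, with
made-up header values:

import sets  # Python 2 'sets' module, as used by the patch

first_fetch  = sets.Set([("Server", "Apache"), ("Date", "Wed, 08 Apr 2009")])
second_fetch = sets.Set([("Server", "Apache"), ("Date", "Thu, 09 Apr 2009")])
tor_fetch    = sets.Set([("Server", "Apache"), ("Via", "1.1 example-proxy")])

header_pool = sets.Set(first_fetch)     # HeaderDiffer.__init__
changed_headers = sets.Set([])
for h in second_fetch - header_pool:    # prune_differences(): names whose
  changed_headers.add(h[0])             # values vary between clean fetches
header_pool.union_update(second_fetch)  # are remembered and skipped later

for h in tor_fetch - header_pool:       # show_differences(): only never-
  if h[0] not in changed_headers:       # changing header names are reported
    print "New HTTP header: "+h[0]+": "+h[1]   # -> reports "Via"
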
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-04-08 09:17:21 UTC (rev 19245)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-04-09 00:41:26 UTC (rev 19246)
@@ -76,6 +76,7 @@
FAILURE_URLERROR = "FailureURLError" # can also mean timeout...
FAILURE_CRYPTOERROR = "FailureCryptoError"
FAILURE_TIMEOUT = "FailureTimeout"
+FAILURE_HEADERCHANGE = "FailureHeaderChange"
# False positive reasons
FALSEPOSITIVE_HTTPERRORS = "FalsePositiveHTTPErrors"
@@ -833,7 +834,44 @@
ret.sort()
return ret
+class HeaderDiffer:
+ def __init__(self, orig_headers):
+ self.header_pool = sets.Set(orig_headers)
+ self.changed_headers = sets.Set([])
+ self._pickle_revision = 0
+
+ def filter_headers(headers):
+ ret = []
+ for h in headers:
+ matched = False
+ for i in ignore_http_headers:
+ if re.match(i, h[0]):
+ matched = True
+ if not matched: ret.append(h)
+ return sets.Set(ret)
+ filter_headers = Callable(filter_headers)
+
+ def depickle_upgrade(self):
+ pass
+ def prune_differences(self, new_headers):
+ new_headers = sets.Set(new_headers)
+ changed = new_headers - self.header_pool
+ for i in changed:
+ self.changed_headers.add(i[0])
+ self.header_pool.union_update(new_headers)
+
+ def show_differences(self, new_headers):
+ ret = ""
+ changed = sets.Set(new_headers) - self.header_pool
+ for i in changed:
+ if i[0] not in self.changed_headers:
+ ret += " "+i[0]+": "+i[1]+"\n"
+ if ret:
+ return "New HTTP Headers:\n"+ret
+ else:
+ return ret
+
class JSDiffer:
def __init__(self, js_string):
self._pickle_revision = 0
@@ -987,4 +1025,3 @@
ast_cnts = JSSoupDiffer._add_cnts(tag_cnts, ast_cnts)
return ast_cnts
-
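
One non-obvious idiom above: "filter_headers = Callable(filter_headers)" is
the old pre-staticmethod trick, letting soat.py call
HeaderDiffer.filter_headers(...) without an instance (as the next diff does).
Callable is assumed to be libsoat's usual one-line wrapper, roughly:

class Callable:
  # Presumed shape of libsoat's Callable helper: wrapping the function in a
  # plain object keeps Python 2 from turning it into an unbound method, so
  # it behaves like a static method on old-style classes.
  def __init__(self, anycallable):
    self.__call__ = anycallable
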
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-04-08 09:17:21 UTC (rev 19245)
+++ torflow/trunk/NetworkScanners/soat.py 2009-04-09 00:41:26 UTC (rev 19246)
@@ -136,8 +136,7 @@
plog("WARN", "Max content size exceeded for "+address+": "+length)
return (reply.code, None, [], "", "")
mime_type = reply.info().type.lower()
- reply_headers = sets.Set(filter(lambda h: h[0] not in ignore_http_headers,
- reply.info().items()))
+ reply_headers = HeaderDiffer.filter_headers(reply.info().items())
reply_headers.add(("mime-type", mime_type))
plog("DEBUG", "Mime type is "+mime_type+", length "+str(length))
content = decompress_response_data(reply)
@@ -147,8 +146,11 @@
return (-6.0, None, [], "", e.__class__.__name__+str(e))
except urllib2.HTTPError, e:
plog('NOTICE', "HTTP Error during request of "+address+": "+str(e))
- traceback.print_exc()
- return (e.code, None, [], "", e.__class__.__name__+str(e))
+ if str(e) == "<urlopen error timed out>": # Yah, super ghetto...
+ return (-6.0, None, [], "", e.__class__.__name__+str(e))
+ else:
+ traceback.print_exc()
+ return (e.code, None, [], "", e.__class__.__name__+str(e))
except (ValueError, urllib2.URLError), e:
plog('WARN', 'The http-request address ' + address + ' is malformed')
traceback.print_exc()
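
The string match on "<urlopen error timed out>" is admittedly fragile (the
in-line comment says as much). A sturdier classification, assuming Python 2's
urllib2 wraps the underlying socket.timeout as the error's reason, could be:

import socket

def is_timeout(e):
  # Hypothetical alternative to the string match above: urllib2 stores the
  # socket.timeout it caught as URLError.reason, so inspect that instead.
  # (Older HTTPError objects lack .reason, hence the getattr default.)
  return isinstance(getattr(e, "reason", None), socket.timeout)
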
@@ -506,8 +508,7 @@
plog('ERROR', 'Soup-scraping of http://'+host+search_path+" failed")
traceback.print_exc()
print "Content is: "+str(content)
- return [protocol+"://www.eff.org", protocol+"://www.fastmail.fm", protocol+"://www.torproject.org", protocol+"://secure.wikileaks.org/"]
-
+ return [protocol+"://www.eff.org", protocol+"://www.fastmail.fm", protocol+"://www.torproject.org", protocol+"://secure.wikileaks.org/"]
# get the links and do some additional filtering
for link in soup.findAll('a'):
skip = True
@@ -516,7 +517,10 @@
skip = False
break
if skip: continue
- url = link['href']
+ if link.has_key(search_mode['realtgt']):
+ url = link[search_mode['realtgt']]
+ else:
+ url = link['href']
if protocol == 'any': prot_list = None
else: prot_list = [protocol]
if filetype == 'any': file_list = None
@@ -699,7 +703,11 @@
added_cookie_jar.load(content_prefix+'.cookies', ignore_discard=True)
self.cookie_jar.load(content_prefix+'.cookies', ignore_discard=True)
- content = None
+ header_file = open(content_prefix+'.headerdiff', 'r')
+ headerdiffer = pickle.load(header_file)
+ header_file.close()
+
+ content = None
mime_type = None
except IOError:
@@ -727,6 +735,11 @@
content_file = open(content_prefix+'.content', 'w')
content_file.write(content)
content_file.close()
+
+ header_file = open(content_prefix+'.headerdiff', 'w')
+ headerdiffer = HeaderDiffer(resp_headers)
+ pickle.dump(headerdiffer, header_file)
+ header_file.close()
# Need to do set subtraction and only save new cookies..
# or extract/make_cookies
@@ -823,9 +836,13 @@
self.tor_cookie_jar = orig_tor_cookie_jar
return TEST_FAILURE
+ hdiffs = headerdiffer.show_differences(presp_headers)
+ if hdiffs:
+ plog("NOTICE", "Header differences for "+address+": \n"+hdiffs)
+
# compare the content
# if content matches, everything is ok
- if psha1sum.hexdigest() == sha1sum.hexdigest():
+ if not hdiffs and psha1sum.hexdigest() == sha1sum.hexdigest():
result = HttpTestResult(exit_node, self.node_map[exit_node[1:]].nickname,
address, TEST_SUCCESS)
self.register_success(result)
@@ -876,6 +893,13 @@
datahandler.saveResult(result)
return TEST_INCONCLUSIVE
+ headerdiffer.prune_differences(resp_headers_new)
+ hdiffs = headerdiffer.show_differences(presp_headers)
+
+ header_file = open(content_prefix+'.headerdiff', 'w')
+ pickle.dump(headerdiffer, header_file)
+ header_file.close()
+
sha1sum_new = sha.sha(content_new)
if sha1sum.hexdigest() != sha1sum_new.hexdigest():
@@ -900,6 +924,18 @@
traceback.print_exc()
plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
+ if hdiffs:
+ # XXX: We probably should store the header differ + exit headers
+ # for later comparison (ie if the header differ picks up more diffs)
+ plog("NOTICE", "Post-refetch header changes for "+address+": \n"+hdiffs)
+ result = HttpTestResult(exit_node,
+ self.node_map[exit_node[1:]].nickname,
+ address, TEST_FAILURE, FAILURE_HEADERCHANGE)
+ result.extra_info = hdiffs
+ self.register_dynamic_failure(result)
+ # Lets let the rest of the tests run too actually
+ #return TEST_FAILURE
+
# compare the node content and the new content
# if it matches, everything is ok
if psha1sum.hexdigest() == sha1sum_new.hexdigest():
@@ -985,7 +1021,7 @@
def _reset(self):
HTTPTest._reset(self)
self.targets = [] # FIXME: Lame..
- self.soupdiffer_files = {}
+ self.soupdiffer_files = {} # XXX: These two are now deprecated
self.jsdiffer_files = {}
def depickle_upgrade(self):
@@ -1057,6 +1093,7 @@
Test._remove_target_addr(self, target)
if target in self.soupdiffer_files: del self.soupdiffer_files[target]
if target in self.jsdiffer_files: del self.jsdiffer_files[target]
+
def refill_targets(self):
Test.refill_targets(self)
@@ -1082,6 +1119,7 @@
elif t.name in recurse_script:
if t.name == "link":
for a in t.attrs:
+ a = map(lambda x: x.lower(), a)
# Special case CSS and favicons
if (a[0] == "type" and a[1] == "text/css") or \
((a[0] == "rel" or a[0] == "rev") and a[1] == "stylesheet"):
@@ -1092,7 +1130,7 @@
plog("INFO", "Adding favicon of: "+str(t))
found_favicon = True
targets.append(("image", urlparse.urljoin(orig_addr, attr_tgt)))
- elif a[0] == "type" and a[1] in script_mime_types:
+ elif a[0] == "type" and self.is_script(a[1], ""):
plog("INFO", "Adding link script of: "+str(t))
targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
else:
@@ -1140,7 +1178,7 @@
def is_html(self, mime_type, content):
is_html = False
for type_match in html_mime_types:
- if re.match(type_match, mime_type):
+ if re.match(type_match, mime_type.lower()):
is_html = True
break
return is_html
@@ -1148,7 +1186,7 @@
def is_script(self, mime_type, content):
is_script = False
for type_match in script_mime_types:
- if re.match(type_match, mime_type):
+ if re.match(type_match, mime_type.lower()):
is_script = True
break
return is_script
@@ -1168,17 +1206,16 @@
content_prefix = http_content_dir+address_file
failed_prefix = http_failed_dir+address_file
- if address in self.jsdiffer_files:
+ if os.path.exists(content_prefix+".jsdiff"):
plog("DEBUG", "Loading jsdiff for "+address)
- jsdiff = pickle.load(open(self.jsdiffer_files[address], 'r'))
+ jsdiff = pickle.load(open(content_prefix+".jsdiff", 'r'))
jsdiff.depickle_upgrade()
else:
plog("DEBUG", "No jsdiff for "+address+". Creating+dumping")
jsdiff = JSDiffer(orig_js)
- self.jsdiffer_files[address] = content_prefix+".jsdiff"
jsdiff.prune_differences(new_js)
- pickle.dump(jsdiff, open(self.jsdiffer_files[address], 'w'))
+ pickle.dump(jsdiff, open(content_prefix+".jsdiff", 'w'))
has_js_changes = jsdiff.contains_differences(tor_js)
@@ -1196,7 +1233,7 @@
address, TEST_FAILURE, FAILURE_DYNAMIC,
content_prefix+".content", exit_content_file.name,
content_prefix+'.content-old',
- self.jsdiffer_files[address])
+ content_prefix+".jsdiff")
self.register_dynamic_failure(result)
return TEST_FAILURE
@@ -1277,17 +1314,16 @@
# 3. Compare list of changed tags for tor vs new and
# see if any extra tags changed or if new attributes
# were added to additional tags
- if address in self.soupdiffer_files:
+ if os.path.exists(content_prefix+".soupdiff"):
plog("DEBUG", "Loading soupdiff for "+address)
- soupdiff = pickle.load(open(self.soupdiffer_files[address], 'r'))
+ soupdiff = pickle.load(open(content_prefix+".soupdiff", 'r'))
soupdiff.depickle_upgrade()
soupdiff.prune_differences(new_soup)
else:
plog("DEBUG", "No soupdiff for "+address+". Creating+dumping")
soupdiff = SoupDiffer(orig_soup, new_soup)
- self.soupdiffer_files[address] = content_prefix+".soupdiff"
- pickle.dump(soupdiff, open(self.soupdiffer_files[address], 'w'))
+ pickle.dump(soupdiff, open(content_prefix+".soupdiff", 'w'))
more_tags = soupdiff.show_changed_tags(tor_soup)
more_attrs = soupdiff.show_changed_attrs(tor_soup)
@@ -1306,17 +1342,16 @@
false_positive = True
if false_positive:
- if address in self.jsdiffer_files:
+ if os.path.exists(content_prefix+".jsdiff"):
plog("DEBUG", "Loading jsdiff for "+address)
- jsdiff = pickle.load(open(self.jsdiffer_files[address], 'r'))
+ jsdiff = pickle.load(open(content_prefix+".jsdiff", 'r'))
jsdiff.depickle_upgrade()
else:
plog("DEBUG", "No jsdiff for "+address+". Creating+dumping")
jsdiff = JSSoupDiffer(orig_soup)
- self.jsdiffer_files[address] = content_prefix+".jsdiff"
jsdiff.prune_differences(new_soup)
- pickle.dump(jsdiff, open(self.jsdiffer_files[address], 'w'))
+ pickle.dump(jsdiff, open(content_prefix+".jsdiff", 'w'))
differences = jsdiff.show_differences(tor_soup)
false_positive = not differences
@@ -1335,11 +1370,11 @@
exit_content_file.write(tor_html)
exit_content_file.close()
- if address in self.jsdiffer_files:
- jsdiff_file = self.jsdiffer_files[address]
+ if os.path.exists(content_prefix+".jsdiff"):
+ jsdiff_file = content_prefix+".jsdiff"
else: jsdiff_file = None
- if address in self.soupdiffer_files:
- soupdiff_file = self.soupdiffer_files[address]
+ if os.path.exists(content_prefix+".soupdiff"):
+ soupdiff_file = content_prefix+".soupdiff"
else: soupdiff_file = None
result = HtmlTestResult(exit_node, self.node_map[exit_node[1:]].nickname,
@@ -2588,8 +2623,8 @@
global refetch_ip
BindingSocket.bind_to = refetch_ip
try:
- s = socket.socket()
- except socket.error, e:
+ socket.socket()
+ except socket.error:
plog("WARN", "Cannot bind to "+refetch_ip+". Ignoring refetch_ip setting.")
refetch_ip = None
BindingSocket.bind_to = None
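
For context, this try/except only works because soat.py is presumed to have
replaced socket.socket with a BindingSocket subclass whose constructor binds
to BindingSocket.bind_to, so merely creating a socket exercises the bind. A
rough sketch of that pattern, under that assumption:

import socket

_origsocket = socket.socket

class BindingSocket(_origsocket):
  # Assumed shape of soat.py's BindingSocket: every socket created while
  # bind_to is set gets bound to that local IP, so a bare socket.socket()
  # call raises socket.error when refetch_ip is not a usable local address.
  bind_to = None
  def __init__(self, family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0):
    _origsocket.__init__(self, family, type, proto)
    if BindingSocket.bind_to:
      self.bind((BindingSocket.bind_to, 0))

socket.socket = BindingSocket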