[or-cvs] r18382: {torflow} Improve cookie handling and http error code handling. (torflow/trunk/NetworkScanners)
mikeperry at seul.org
Wed Feb 4 07:33:54 UTC 2009
Author: mikeperry
Date: 2009-02-04 02:33:53 -0500 (Wed, 04 Feb 2009)
New Revision: 18382
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
Log:
Improve cookie handling and HTTP error code handling: http_request() now returns (code, new_cookies, mime_type, content) on every path, the tests save only the cookies each response actually set, and the direct (non-Tor) fetch is performed and error-checked before the Tor-side fetch.
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-02-03 15:42:38 UTC (rev 18381)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-02-04 07:33:53 UTC (rev 18382)
@@ -492,7 +492,8 @@
if isinstance(child, Tag):
plog("ERROR", "Script tag with subtag!")
else:
- tag_cnts = JSDiffer._count_ast_elements(self, str(child), tag.name)
+ script = str(child).replace("<!--", "").replace("-->", "")
+ tag_cnts = JSDiffer._count_ast_elements(self, script, tag.name)
ast_cnts = JSSoupDiffer._add_cnts(tag_cnts, ast_cnts)
for attr in tag.attrs:
# hrmm.. %-encoding too? Firefox negs on it..
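
The libsoat.py hunk strips the legacy <!-- and --> guards that older pages wrap around inline scripts, since the JS parser treats them as syntax errors rather than markup. A minimal sketch of the same step, under a hypothetical helper name (only the replace() calls come from the patch):

    def strip_html_comment_guards(script_text):
        # Old pages hide inline JS from pre-script browsers inside
        # <!-- ... --> guards; the JS parser rejects those tokens, so
        # drop them before counting AST elements.
        return script_text.replace("<!--", "").replace("-->", "")
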
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-02-03 15:42:38 UTC (rev 18381)
+++ torflow/trunk/NetworkScanners/soat.py 2009-02-04 07:33:53 UTC (rev 18382)
@@ -92,41 +92,45 @@
request.add_header(h, headers[h])
content = ""
+ new_cookies = []
+ mime_type = ""
try:
if cookie_jar != None:
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
reply = opener.open(request)
if "__filename" in cookie_jar.__dict__:
cookie_jar.save(cookie_jar.__filename)
+ new_cookies = cookie_jar.make_cookies(reply, request)
else:
reply = urllib2.urlopen(request)
length = reply.info().get("Content-Length")
if length and int(length) > max_content_size:
plog("WARN", "Max content size exceeded for "+address+": "+length)
- return (reply.code, "")
+ return (reply.code, [], "", "")
+ mime_type = reply.info().type
content = decompress_response_data(reply)
except urllib2.HTTPError, e:
plog('WARN', "HTTP Error during request of "+address)
traceback.print_exc()
- return (e.code, "")
+ return (e.code, [], "", "")
except (ValueError, urllib2.URLError):
plog('WARN', 'The http-request address ' + address + ' is malformed')
traceback.print_exc()
- return (0, "")
+ return (0, [], "", "")
except (IndexError, TypeError, socks.Socks5Error), e:
plog('WARN', 'An error occurred while negotiating socks5 with Tor: '+str(e))
traceback.print_exc()
- return (0, "")
+ return (0, [], "", "")
except KeyboardInterrupt:
raise KeyboardInterrupt
except:
plog('WARN', 'An unknown HTTP error occurred for '+address)
traceback.print_exc()
- return (0, "")
+ return (0, [], "", "")
# TODO: Consider also returning mime type here
- return (reply.code, content)
+ return (reply.code, new_cookies, mime_type, content)
class Test:
""" Base class for our tests """
@@ -243,11 +247,11 @@
try:
# XXX: This does not handle http error codes.. (like 302!)
if search_mode["useragent"]:
- (code, content) = http_request(search_url, search_cookies)
+ (code, new_cookies, mime_type, content) = http_request(search_url, search_cookies)
else:
headers = copy.copy(firefox_headers)
del headers["User-Agent"]
- (code, content) = http_request(search_url, search_cookies, headers)[1]
+ (code, new_cookies, mime_type, content) = http_request(search_url, search_cookies, headers)
except socket.gaierror:
plog('ERROR', 'Scraping of http://'+host+search_path+" failed")
traceback.print_exc()
@@ -318,11 +322,12 @@
exit_node = self.mt.get_exit_node()
plog("ERROR", "Cookie mismatch at "+exit_node+":\nTor Cookies:"+tor_cookies+"\nPlain Cookies:\n"+plain_cookies)
result = CookieTestResult(exit_node, TEST_FAILURE,
- FAILURE_COOKIEMISMATCH, plain_cookies,
- tor_cookies)
+ FAILURE_COOKIEMISMATCH, plain_cookies,
+ tor_cookies)
self.results.append(result)
self.datahandler.saveResult(result)
return TEST_FAILURE
+
return TEST_SUCCESS
def run_test(self):
@@ -418,44 +423,16 @@
''' check whether an HTTP connection to a given address is molested '''
plog('INFO', 'Conducting an http test with destination ' + address)
- defaultsocket = socket.socket
- socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
- socket.socket = socks.socksocket
- (pcode, pcontent) = http_request(address, self.tor_cookie_jar, self.headers)
- psha1sum = sha.sha(pcontent)
-
- # reset the connection to direct
- socket.socket = defaultsocket
-
- exit_node = self.mt.get_exit_node()
- if exit_node == 0 or exit_node == '0' or not exit_node:
- plog('WARN', 'We had no exit node to test, skipping to the next test.')
- return TEST_SUCCESS
-
- if pcode - (pcode % 100) != 200:
- plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
- result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
- INCONCLUSIVE_BADHTTPCODE+str(pcode))
- self.results.append(result)
- self.datahandler.saveResult(result)
- self.register_httpcode_failure(address, exit_node)
- return TEST_INCONCLUSIVE
-
# an address representation acceptable for a filename
address_file = self.datahandler.safeFilename(address[7:])
content_prefix = http_content_dir+address_file
failed_prefix = http_failed_dir+address_file
+
+ # Keep a copy of the cookie jar before mods for refetch
+ orig_cookie_jar = cookielib.LWPCookieJar()
+ for cookie in self.cookie_jar: orig_cookie_jar.set_cookie(cookie)
- # if we have no content, we had a connection error
- if pcontent == "":
- plog("NOTICE", exit_node+" failed to fetch content for "+address)
- result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
- INCONCLUSIVE_NOEXITCONTENT)
- self.results.append(result)
- self.datahandler.saveResult(result)
- return TEST_INCONCLUSIVE
-
try:
# Load content from disk, md5
content_file = open(content_prefix+'.content', 'r')
@@ -469,7 +446,14 @@
content = None
except IOError:
- (code, content) = http_request(address, self.cookie_jar, self.headers)
+ (code, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
+
+ if code - (code % 100) != 200:
+ plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
+ # Just remove it
+ self.remove_target(address)
+ return TEST_INCONCLUSIVE
+
if not content:
plog("WARN", "Failed to direct load "+address)
return TEST_INCONCLUSIVE
@@ -479,8 +463,12 @@
content_file.write(content)
content_file.close()
+ # Need to do set subtraction and only save new cookies..
+ # or extract/make_cookies
+ new_cookie_jar = cookielib.LWPCookieJar()
+ for cookie in new_cookies: new_cookie_jar.set_cookie(cookie)
try:
- self.cookie_jar.save(content_prefix+'.cookies')
+ new_cookie_jar.save(content_prefix+'.cookies')
except:
traceback.print_exc()
plog("WARN", "Error saving cookies in "+str(self.cookie_jar)+" to "+content_prefix+".cookies")
@@ -490,6 +478,40 @@
plog('ERROR', e)
return TEST_INCONCLUSIVE
+
+ defaultsocket = socket.socket
+ socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
+ socket.socket = socks.socksocket
+
+ (pcode, pnew_cookies, pmime_type, pcontent) = http_request(address, self.tor_cookie_jar, self.headers)
+ psha1sum = sha.sha(pcontent)
+
+ # reset the connection to direct
+ socket.socket = defaultsocket
+
+ exit_node = self.mt.get_exit_node()
+ if exit_node == 0 or exit_node == '0' or not exit_node:
+ plog('WARN', 'We had no exit node to test, skipping to the next test.')
+ return TEST_SUCCESS
+
+ if pcode - (pcode % 100) != 200:
+ plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+ result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_BADHTTPCODE+str(pcode))
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+ self.register_httpcode_failure(address, exit_node)
+ return TEST_INCONCLUSIVE
+
+ # if we have no content, we had a connection error
+ if pcontent == "":
+ plog("NOTICE", exit_node+" failed to fetch content for "+address)
+ result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_NOEXITCONTENT)
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+ return TEST_INCONCLUSIVE
+
# compare the content
# if content matches, everything is ok
if psha1sum.hexdigest() == sha1sum.hexdigest():
@@ -501,11 +523,10 @@
return TEST_SUCCESS
# if content doesn't match, update the direct content and use new cookies
- self.cookie_jar = cookielib.LWPCookieJar()
# If we have alternate IPs to bind to on this box, use them?
# Sometimes pages have the client IP encoded in them..
BindingSocket.bind_to = refetch_ip
- (code_new, content_new) = http_request(address, self.cookie_jar, self.headers)
+ (code_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_cookie_jar, self.headers)
BindingSocket.bind_to = None
if not content_new:
@@ -541,13 +562,17 @@
new_content_file = open(content_prefix+'.content', 'w')
new_content_file.write(content_new)
new_content_file.close()
-
+
+ # Need to do set subtraction and only save new cookies..
+ # or extract/make_cookies
+ new_cookie_jar = cookielib.LWPCookieJar()
+ for cookie in new_cookies_new: new_cookie_jar.set_cookie(cookie)
os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
try:
- self.cookie_jar.save(content_prefix+'.cookies')
+ new_cookie_jar.save(content_prefix+'.cookies')
except:
traceback.print_exc()
- plog("WARN", "Error saving cookies in "+str(self.cookie_jar)+" to "+content_prefix+".cookies")
+ plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
# compare the node content and the new content
# if it matches, everything is ok
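
Both tests now follow the same order: load or fetch the direct copy first, then swap in the Tor SOCKS socket for the exit-side fetch. The swap pattern, restated as a sketch (the try/finally and the 127.0.0.1:9050 defaults are assumptions; soat uses tor_host/tor_port and restores the socket inline):

    import socket
    import socks  # SocksiPy, as used by soat

    default_socket = socket.socket
    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
    socket.socket = socks.socksocket  # new connections now go via Tor
    try:
        pass  # perform the Tor-side http_request() here
    finally:
        socket.socket = default_socket  # restore direct connections
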
@@ -676,19 +701,19 @@
for t in tags:
#plog("DEBUG", "Got tag: "+str(t))
for a in t.attrs:
- attr_name = str(a[0])
- attr_tgt = str(a[1])
+ attr_name = a[0]
+ attr_tgt = a[1]
if attr_name in attrs_to_recurse:
- if str(t.name) in recurse_html:
+ if t.name in recurse_html:
targets.append(("html", urlparse.urljoin(orig_addr, attr_tgt)))
- elif str(t.name) in recurse_script:
- if str(t.name) == "link":
+ elif t.name in recurse_script:
+ if t.name == "link":
for a in t.attrs:
- if str(a[0]) == "type" and str(a[1]) in link_script_types:
+ if a[0] == "type" and a[1] in link_script_types:
targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
else:
targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
- elif str(t.name) == 'a':
+ elif t.name == 'a':
if attr_name == "href":
for f in self.recurse_filetypes:
if f not in got_type and attr_tgt[-len(f):] == f:
@@ -705,7 +730,7 @@
def _tag_not_worthy(self, tag):
- if str(tag.name) in tags_to_check:
+ if tag.name in tags_to_check:
return False
for attr in tag.attrs:
if attr[0] in attrs_to_check_map:
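
Dropping the str() casts matters because BeautifulSoup hands back unicode objects; calling str() on a non-ASCII tag name or attribute value raises UnicodeEncodeError and aborts the comparison. A tiny illustration with made-up markup:

    # -*- coding: utf-8 -*-
    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(u'<a href="/caf\xe9">x</a>')
    link = soup.find('a')
    link['href'] == u'/caf\xe9'  # unicode-to-unicode compare works
    str(link['href'])            # raises UnicodeEncodeError
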
@@ -791,58 +816,18 @@
''' check whether an HTTP connection to a given address is molested '''
plog('INFO', 'Conducting an html test with destination ' + address)
- defaultsocket = socket.socket
- socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
- socket.socket = socks.socksocket
-
- # Wikipedia and others can give us 403.. So what do we do about that?
- # Count the number of occurrences vs successful runs then remove the url
- (pcode, pcontent) = http_request(address, self.tor_cookie_jar, self.headers)
-
- # reset the connection to direct
- socket.socket = defaultsocket
-
- exit_node = self.mt.get_exit_node()
- if exit_node == 0 or exit_node == '0' or not exit_node:
- plog('WARN', 'We had no exit node to test, skipping to the next test.')
- return TEST_SUCCESS
-
- # XXX: Fetch via non-tor first...
- if pcode - (pcode % 100) != 200:
- plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
- result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
- INCONCLUSIVE_BADHTTPCODE+str(pcode))
- self.results.append(result)
- self.datahandler.saveResult(result)
- self.register_httpcode_failure(address, exit_node)
- return TEST_INCONCLUSIVE
-
# an address representation acceptable for a filename
address_file = self.datahandler.safeFilename(address[7:])
content_prefix = http_content_dir+address_file
failed_prefix = http_failed_dir+address_file
- # if we have no content, we had a connection error
- if pcontent == "":
- plog("NOTICE", exit_node+" failed to fetch content for "+address)
- result = HtmlTestResult(exit_node, address, TEST_INCONCLUSIVE,
- INCONCLUSIVE_NOEXITCONTENT)
- self.results.append(result)
- self.datahandler.saveResult(result)
- return TEST_INCONCLUSIVE
+ # Keep a copy of the cookie jar before mods for refetch
+ orig_cookie_jar = cookielib.LWPCookieJar()
+ for cookie in self.cookie_jar: orig_cookie_jar.set_cookie(cookie)
elements = SoupStrainer(lambda name, attrs: name in tags_to_check or
len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
- pcontent = pcontent.decode('ascii', 'ignore')
- psoup = self._recursive_strain(BeautifulSoup(pcontent, parseOnlyThese=elements))
- # Also find recursive urls
- recurse_elements = SoupStrainer(lambda name, attrs:
- name in tags_to_recurse and
- len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_recurse))) > 0)
- self._add_recursive_targets(BeautifulSoup(pcontent, recurse_elements),
- address)
-
# load the original tag structure
# if we don't have any yet, get it
soup = 0
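
The SoupStrainer built above makes BeautifulSoup parse only tags whose name or attribute keys the test cares about, keeping the tag-structure comparison cheap. The same idiom in isolation, with illustrative tag and attribute lists rather than soat's real ones:

    from sets import Set  # the 2.3-era sets module soat already imports
    from BeautifulSoup import BeautifulSoup, SoupStrainer

    tags_to_check = ['a', 'script', 'link']  # assumed values
    attrs_to_check = ['href', 'src']

    elements = SoupStrainer(lambda name, attrs:
        name in tags_to_check or
        len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
    soup = BeautifulSoup('<div><a href="/x">x</a><p>skip</p></div>',
                         parseOnlyThese=elements)
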
@@ -854,10 +839,18 @@
self.cookie_jar.load(content_prefix+'.cookies', 'w')
except IOError:
- (code, content) = http_request(address, self.cookie_jar, self.headers)
+ (code, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
+
+ if code - (code % 100) != 200:
+ plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
+ # Just remove it
+ self.remove_target(address)
+ return TEST_INCONCLUSIVE
+
content = content.decode('ascii','ignore')
soup = self._recursive_strain(BeautifulSoup(content, parseOnlyThese=elements))
+ # XXX: str of this may be bad if there are unicode chars inside
string_soup = str(soup)
if not string_soup:
plog("WARN", "Empty soup for "+address)
@@ -865,11 +858,15 @@
tag_file.write(string_soup)
tag_file.close()
+ # Need to do set subtraction and only save new cookies..
+ # or extract/make_cookies
+ new_cookie_jar = cookielib.LWPCookieJar()
+ for cookie in new_cookies: new_cookie_jar.set_cookie(cookie)
try:
- self.cookie_jar.save(content_prefix+'.cookies')
+ new_cookie_jar.save(content_prefix+'.cookies')
except:
traceback.print_exc()
- plog("WARN", "Error saving cookies in "+str(self.cookie_jar)+" to "+content_prefix+".cookies")
+ plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
content_file = open(content_prefix+'.content', 'w')
content_file.write(content)
@@ -879,10 +876,56 @@
plog('ERROR', 'Failed parsing the tag tree for ' + address)
plog('ERROR', e)
return TEST_INCONCLUSIVE
+
if soup == 0:
plog('ERROR', 'Failed to get the correct tag structure for ' + address)
return TEST_INCONCLUSIVE
+
+ defaultsocket = socket.socket
+ socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
+ socket.socket = socks.socksocket
+
+ # Wikipedia and others can give us 403.. So what do we do about that?
+ # Count the number of occurrences vs successful runs then remove the url
+ (pcode, pcookies, pmime_type, pcontent) = http_request(address, self.tor_cookie_jar, self.headers)
+
+ # reset the connection to direct
+ socket.socket = defaultsocket
+
+ exit_node = self.mt.get_exit_node()
+ if exit_node == 0 or exit_node == '0' or not exit_node:
+ plog('WARN', 'We had no exit node to test, skipping to the next test.')
+ return TEST_SUCCESS
+
+ if pcode - (pcode % 100) != 200:
+ plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+ result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_BADHTTPCODE+str(pcode))
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+ self.register_httpcode_failure(address, exit_node)
+ return TEST_INCONCLUSIVE
+
+ # if we have no content, we had a connection error
+ if pcontent == "":
+ plog("NOTICE", exit_node+" failed to fetch content for "+address)
+ result = HtmlTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_NOEXITCONTENT)
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+ return TEST_INCONCLUSIVE
+
+ pcontent = pcontent.decode('ascii', 'ignore')
+ psoup = self._recursive_strain(BeautifulSoup(pcontent, parseOnlyThese=elements))
+
+ # Also find recursive urls
+ recurse_elements = SoupStrainer(lambda name, attrs:
+ name in tags_to_recurse and
+ len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_recurse))) > 0)
+ self._add_recursive_targets(BeautifulSoup(pcontent, recurse_elements),
+ address)
+
# compare the content
# if content matches, everything is ok
if str(psoup) == str(soup):
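
Each attribute harvested during recursion is resolved against the page address before being queued, via urlparse.urljoin. For example, with a hypothetical base URL:

    import urlparse

    base = "http://example.com/a/page.html"
    urlparse.urljoin(base, "../script.js")  # -> 'http://example.com/script.js'
    urlparse.urljoin(base, "/img/x.png")    # -> 'http://example.com/img/x.png'
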
@@ -894,12 +937,10 @@
return TEST_SUCCESS
# if content doesn't match, update the direct content and use new cookies
- self.cookie_jar = cookielib.LWPCookieJar()
-
# If we have alternate IPs to bind to on this box, use them?
# Sometimes pages have the client IP encoded in them..
BindingSocket.bind_to = refetch_ip
- (code_new, content_new) = http_request(address, self.cookie_jar, self.headers)
+ (code_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_cookie_jar, self.headers)
BindingSocket.bind_to = None
content_new = content_new.decode('ascii', 'ignore')
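
BindingSocket.bind_to is torflow's hook for forcing the refetch out of a specific local address, since some pages embed the client IP. A generic sketch of such a socket subclass, assuming details the patch does not show:

    import socket

    class BindingSocket(socket.socket):
        bind_to = None  # class-wide source address, or None for default
        def __init__(self, family=socket.AF_INET, type=socket.SOCK_STREAM,
                     proto=0, _sock=None):
            socket.socket.__init__(self, family, type, proto, _sock)
            if BindingSocket.bind_to:
                self.bind((BindingSocket.bind_to, 0))  # any local port
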
@@ -945,11 +986,15 @@
tag_file.close()
os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
+ # Need to do set subtraction and only save new cookies..
+ # or extract/make_cookies
+ new_cookie_jar = cookielib.LWPCookieJar()
+ for cookie in new_cookies_new: new_cookie_jar.set_cookie(cookie)
try:
- self.cookie_jar.save(content_prefix+'.cookies')
+ new_cookie_jar.save(content_prefix+'.cookies')
except:
traceback.print_exc()
- plog("WARN", "Error saving cookies in "+str(self.cookie_jar)+" to "+content_prefix+".cookies")
+ plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
os.rename(content_prefix+'.content', content_prefix+'.content-old')
new_content_file = open(content_prefix+'.content', 'w')