[or-cvs] r18294: {torflow} Handle HTTP error codes better. Also fix issues with tag sav (torflow/trunk/NetworkScanners)
mikeperry at seul.org
mikeperry at seul.org
Wed Jan 28 13:58:02 UTC 2009
Author: mikeperry
Date: 2009-01-28 08:58:02 -0500 (Wed, 28 Jan 2009)
New Revision: 18294
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
Log:
Handle HTTP error codes better. Also fix issues with tag
saving/reloading causing false positives.
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-01-28 13:15:08 UTC (rev 18293)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-01-28 13:58:02 UTC (rev 18294)
@@ -35,6 +35,7 @@
# Inconclusive reasons
INCONCLUSIVE_NOEXITCONTENT = "InconclusiveNoExitContent"
INCONCLUSIVE_NOLOCALCONTENT = "InconclusiveNoLocalContent"
+INCONCLUSIVE_BADHTTPCODE = "InconclusiveBadHTTPCode"
# Failed reasons
FAILURE_EXITONLY = "FailureExitOnly"
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-01-28 13:15:08 UTC (rev 18293)
+++ torflow/trunk/NetworkScanners/soat.py 2009-01-28 13:58:02 UTC (rev 18294)
@@ -99,7 +99,7 @@
# FIXME: This does not affect the ssl search.. no other search engines have
# a working "inurl:" that allows you to pick the scheme to be https like google...
-default_search_mode = yahoo_search_mode
+default_search_mode = google_search_mode
# ports to test in the consistency test
@@ -207,24 +207,28 @@
length = reply.info().get("Content-Length")
if length and int(length) > max_content_size:
plog("WARN", "Max content size exceeded for "+address+": "+length)
- return ""
+ return (reply.code, "")
content = decompress_response_data(reply)
+ except urllib2.HTTPError, e:
+ plog('WARN', "HTTP Error during request of "+address)
+ traceback.print_exc()
+ return (e.code, "")
except (ValueError, urllib2.URLError):
plog('WARN', 'The http-request address ' + address + ' is malformed')
traceback.print_exc()
- return ""
+ return (0, "")
except (IndexError, TypeError, socks.Socks5Error), e:
plog('WARN', 'An error occured while negotiating socks5 with Tor: '+str(e))
traceback.print_exc()
- return ""
+ return (0, "")
except KeyboardInterrupt:
raise KeyboardInterrupt
except:
plog('WARN', 'An unknown HTTP error occured for '+address)
traceback.print_exc()
- return ""
+ return (0, "")
- return content
+ return (reply.code, content)
class Test:
""" Base class for our tests """
@@ -316,11 +320,11 @@
try:
# XXX: This does not handle http error codes.. (like 302!)
if search_mode["useragent"]:
- content = http_request(search_url, search_cookies)
+ (code, content) = http_request(search_url, search_cookies)
else:
headers = copy.copy(firefox_headers)
del headers["User-Agent"]
- content = http_request(search_url, search_cookies, headers)
+ # http_request returns a (code, content) pair; unpack the whole pair
+ # here exactly as the user-agent branch does.
+ (code, content) = http_request(search_url, search_cookies, headers)
except socket.gaierror:
plog('ERROR', 'Scraping of http://'+host+search_path+" failed")
traceback.print_exc()
@@ -367,10 +371,12 @@
SearchBasedTest.__init__(self, mt, "HTTP", 80, wordlist)
self.fetch_targets = 5
self.three_way_fails = {}
+ self.httpcode_fails = {}
self.two_way_fails = {}
self.successes = {}
self.three_way_limit = 10
- self.two_way_limit = 250
+ self.two_way_limit = 100
+ self.httpcode_limit = 100
self.scan_filetypes = filetypes
self.results = []
@@ -428,7 +434,21 @@
if url[-len(ftype):] == ftype:
urls[ftype].append(url)
return urls
-
+
+  def remove_target(self, address):
+    """Drop address as a scan target and discard the state kept for it.
+
+    After the base class has done its own removal, forget the per-address
+    failure sets and success counter, and remove every stored result whose
+    site is this address.
+    """
+    SearchBasedTest.remove_target(self, address)
+    # The *_limit attributes are plain integer thresholds, not per-address
+    # tables; the per-address bookkeeping lives in the *_fails dicts and in
+    # self.successes. An address need not be present in every one of them,
+    # so only delete the entries that actually exist.
+    for table in (self.httpcode_fails, self.three_way_fails,
+                  self.two_way_fails, self.successes):
+      if address in table:
+        del table[address]
+    # Collect first, then remove, so self.results is not mutated while it
+    # is being iterated.
+    kill_results = []
+    for r in self.results:
+      if r.site == address:
+        kill_results.append(r)
+    for r in kill_results:
+      #r.remove_files()
+      self.results.remove(r)
+
def register_exit_failure(self, address, exit_node):
if address in self.two_way_fails:
self.two_way_fails[address].add(exit_node)
@@ -443,18 +463,27 @@
plog("NOTICE", "Excessive HTTP 2-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
self.remove_target(address)
- del self.three_way_limit[address]
- del self.successes[address]
- del self.two_way_limit[address]
- kill_results = []
- for r in self.results:
- kill_results.append(r)
- for r in kill_results:
- #r.remove_files()
- self.results.remove(r)
else:
plog("ERROR", self.proto+" 2-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
+  def register_httpcode_failure(self, address, exit_node):
+    """Record an HTTP error code failure for address seen via exit_node.
+
+    Each address maps to the set of exit nodes that failed this way, so a
+    given exit is counted once per address. When the number of distinct
+    exits exceeds self.httpcode_limit, the address is removed as a target.
+    """
+    if address in self.httpcode_fails:
+      self.httpcode_fails[address].add(exit_node)
+    else:
+      self.httpcode_fails[address] = sets.Set([exit_node])
+
+    err_cnt = len(self.httpcode_fails[address])
+    if err_cnt > self.httpcode_limit:
+      # Remove all associated data for this url.
+      # (Note, this also seems to imply we should report BadExit in bulk,
+      # after we've had a chance for these false positives to be weeded out)
+      # Make sure a success count exists so the message below can report it.
+      if address not in self.successes: self.successes[address] = 0
+      plog("NOTICE", "Excessive HTTP error code failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
+
+      self.remove_target(address)
+    else:
+      # Label this as an HTTP error code failure so the log is not confused
+      # with the 3-way (dynamic content) failure counter.
+      plog("ERROR", self.proto+" HTTP error code failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
+
def register_dynamic_failure(self, address, exit_node):
if address in self.three_way_fails:
self.three_way_fails[address].add(exit_node)
@@ -470,15 +499,6 @@
plog("NOTICE", "Excessive HTTP 3-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
self.remove_target(address)
- del self.three_way_limit[address]
- del self.successes[address]
- del self.two_way_limit[address]
- kill_results = []
- for r in self.results:
- kill_results.append(r)
- for r in kill_results:
- #r.remove_files()
- self.results.remove(r)
else:
plog("ERROR", self.proto+" 3-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
@@ -490,7 +510,7 @@
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
socket.socket = socks.socksocket
- pcontent = http_request(address, self.tor_cookie_jar, self.headers)
+ (pcode, pcontent) = http_request(address, self.tor_cookie_jar, self.headers)
psha1sum = sha.sha(pcontent)
# reset the connection to direct
@@ -501,6 +521,15 @@
plog('WARN', 'We had no exit node to test, skipping to the next test.')
return TEST_SUCCESS
+ if pcode - (pcode % 100) != 200:
+ plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+ result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_BADHTTPCODE+str(pcode))
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+ self.register_httpcode_failure(address, exit_node)
+ return TEST_INCONCLUSIVE
+
# an address representation acceptable for a filename
address_file = self.datahandler.safeFilename(address[7:])
content_prefix = http_content_dir+address_file
@@ -526,7 +555,7 @@
content_file.close()
except IOError:
- content = http_request(address, self.cookie_jar, self.headers)
+ (code, content) = http_request(address, self.cookie_jar, self.headers)
if not content:
plog("WARN", "Failed to direct load "+address)
return TEST_INCONCLUSIVE
@@ -552,7 +581,7 @@
return TEST_SUCCESS
# if content doesnt match, update the direct content
- content_new = http_request(address, self.cookie_jar, self.headers)
+ (code_new, content_new) = http_request(address, self.cookie_jar, self.headers)
if not content_new:
plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
@@ -713,7 +742,10 @@
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
socket.socket = socks.socksocket
- pcontent = http_request(address, self.tor_cookie_jar, self.headers)
+ # XXX: Wikipedia and others can give us 403.. So what do we do about that?
+ # Probably should count the number of occurrences vs successful runs
+ # then remove the url
+ (pcode, pcontent) = http_request(address, self.tor_cookie_jar, self.headers)
# reset the connection to direct
socket.socket = defaultsocket
@@ -723,6 +755,15 @@
plog('WARN', 'We had no exit node to test, skipping to the next test.')
return TEST_SUCCESS
+ if pcode - (pcode % 100) != 200:
+ plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+ result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_BADHTTPCODE+str(pcode))
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+ self.register_httpcode_failure(address, exit_node)
+ return TEST_INCONCLUSIVE
+
# an address representation acceptable for a filename
address_file = self.datahandler.safeFilename(address[7:])
content_prefix = http_content_dir+address_file
@@ -758,13 +799,15 @@
tag_file.close()
except IOError:
- content = http_request(address, self.cookie_jar, self.headers)
+ (code, content) = http_request(address, self.cookie_jar, self.headers)
content = content.decode('ascii','ignore')
soup = self._recursive_strain(BeautifulSoup(content, parseOnlyThese=elements))
+ string_soup = str(soup)
+ if not string_soup:
+ plog("WARN", "Empty soup for "+address)
tag_file = open(content_prefix+'.tags', 'w')
- # the space is needed in case we have some page with no matching tags at all
- tag_file.write(soup.__str__() + ' ')
+ tag_file.write(string_soup)
tag_file.close()
content_file = open(content_prefix+'.content', 'w')
@@ -790,7 +833,7 @@
return TEST_SUCCESS
# if content doesnt match, update the direct content
- content_new = http_request(address, self.cookie_jar, self.headers)
+ (code_new, content_new) = http_request(address, self.cookie_jar, self.headers)
content_new = content_new.decode('ascii', 'ignore')
if not content_new:
plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
@@ -807,7 +850,7 @@
if soup == soup_new:
# XXX: Check for existence of this file before overwriting
exit_tag_file = open(failed_prefix+'.tags.'+exit_node[1:],'w')
- exit_tag_file.write(psoup.__str__())
+ exit_tag_file.write(str(psoup))
exit_tag_file.close()
exit_content_file = open(failed_prefix+'.content.'+exit_node[1:], 'w')
@@ -826,10 +869,13 @@
# if content has changed outside of tor, update the saved file
os.rename(content_prefix+'.tags', content_prefix+'.tags-old')
+ string_soup_new = str(soup_new)
+ if not string_soup_new:
+ plog("WARN", "Empty soup for "+address)
tag_file = open(content_prefix+'.tags', 'w')
- tag_file.write(soup_new.__str__()+' ')
+ tag_file.write(string_soup_new)
tag_file.close()
-
+
os.rename(content_prefix+'.content', content_prefix+'.content-old')
new_content_file = open(content_prefix+'.content', 'w')
new_content_file.write(content_new)
More information about the tor-commits
mailing list