[or-cvs] r18285: {torflow} - Implemented an HTTP scanner that does SHA1 sums of non-HTM (in torflow/trunk/NetworkScanners: . data/soat/http)
mikeperry at seul.org
mikeperry at seul.org
Wed Jan 28 04:29:25 UTC 2009
Author: mikeperry
Date: 2009-01-27 23:29:25 -0500 (Tue, 27 Jan 2009)
New Revision: 18285
Added:
torflow/trunk/NetworkScanners/data/soat/http/content/
Removed:
torflow/trunk/NetworkScanners/data/soat/http/tags/
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
torflow/trunk/NetworkScanners/soatcli.py
Log:
- Implemented an HTTP scanner that does SHA1 sums of non-HTML
content. Scan common juicy filetypes.
- Made HTML scanner more robust in the face of dynamic URLs
- Properly filter out only the tags of interest, and
increase this tag set to cover more scriptable items.
- Add logic to remove results+URLs that are still dynamic
from scan
- Additionally, make the HTML scanner crawl the document a
bit for interesting items to also scan at that exit, and to
make it look more like a browser. Add cookie support during
a single exit test for the same reason.
- Support using Yahoo for scraping documents and filetypes
for http scanning since Google is utterly stingy with the
search queries.
- Change storage of results to be a bit more clearly
organized and store some information on result subtypes.
*phew*! Probably shoulda broke this up into seperate commits,
but I was on a roll.
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-01-28 04:17:46 UTC (rev 18284)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-01-28 04:29:25 UTC (rev 18285)
@@ -20,14 +20,27 @@
data_dir = './data/soat/'
ssl_certs_dir = data_dir + 'ssl/certs/'
-http_tags_dir = data_dir + 'http/tags/'
+http_data_dir = data_dir + 'http/'
+http_content_dir = data_dir + 'http/content'
+http_failed_dir = data_dir + 'http/failed/'
+http_inconclusive_dir = data_dir + 'http/inconclusive/'
+
# constants
TEST_SUCCESS = 0
TEST_INCONCLUSIVE = 1
TEST_FAILURE = 2
+# Inconclusive reasons
+INCONCLUSIVE_NOEXITCONTENT = "InconclusiveNoExitContent"
+INCONCLUSIVE_NOLOCALCONTENT = "InconclusiveNoLocalContent"
+
+# Failed reasons
+FAILURE_EXITONLY = "FailureExitOnly"
+FAILURE_DYNAMICTAGS = "FailureDynamicTags"
+
+
# classes to use with pickle to dump test results into files
class TestResult(object):
@@ -43,42 +56,96 @@
def __init__(self, exit_node, ssl_site, cert_file, status):
super(SSLTestResult, self).__init__(exit_node, ssl_site, status)
self.cert = cert_file
+ self.proto = "ssl"
class HttpTestResult(TestResult):
''' Represents the result of a http test '''
- def __init__(self, exit_node, website, tag_prints, status):
+ def __init__(self, exit_node, website, status, reason=None,
+ sha1sum=None, exit_sha1sum=None, content=None,
+ content_exit=None, content_old=None, sha1sum_old=None):
super(HttpTestResult, self).__init__(exit_node, website, status)
- self.tag_prints = tag_prints
+ self.proto = "http"
+ self.reason = reason
+ self.sha1sum = sha1sum
+ self.sha1sum_old = sha1sum_old
+ self.exit_sha1sum = exit_sha1sum
+ self.content = content
+ self.content_exit = content_exit
+ self.content_old = content_old
+ def remove_files(self):
+ try: os.unlink(self.content)
+ except: pass
+ try: os.unlink(self.content_old)
+ except: pass
+ try: os.unlink(self.content_exit)
+ except: pass
+
+
+class HtmlTestResult(TestResult):
+ ''' Represents the result of a http test '''
+ def __init__(self, exit_node, website, status, reason=None,
+ tags=None, exit_tags=None, content=None,
+ content_exit=None, content_old=None, tags_old=None):
+ super(HtmlTestResult, self).__init__(exit_node, website, status)
+ self.proto = "http"
+ self.reason = reason
+ self.tags = tags
+ self.tags_old = tags_old
+ self.exit_tags = exit_tags
+ self.content = content
+ self.content_exit = content_exit
+ self.content_old = content_old
+
+ def remove_files(self):
+ try: os.unlink(self.tags)
+ except: pass
+ try: os.unlink(self.tags_old)
+ except: pass
+ try: os.unlink(self.exit_tags)
+ except: pass
+ try: os.unlink(self.content)
+ except: pass
+ try: os.unlink(self.content_old)
+ except: pass
+ try: os.unlink(self.content_exit)
+ except: pass
+
class SSHTestResult(TestResult):
''' Represents the result of an ssh test '''
def __init__(self, exit_node, ssh_site, status):
super(SSHTestResult, self).__init__(exit_node, ssh_site, status)
+ self.proto = "ssh"
class DNSTestResult(TestResult):
''' Represents the result of a dns test '''
def __init__(self, exit_node, dns_site, status):
super(DNSTestResult, self).__init__(exit_node, dns_site, status)
+ self.proto = "dns"
class DNSRebindTestResult(TestResult):
''' Represents the result of a dns rebind test '''
def __init__(self, exit_node, dns_rebind_site, status):
super(DNSRebindTestResult, self).__init__(exit_node, dns_rebind_site, status)
+ self.proto = "dns"
class SMTPTestResult(TestResult):
''' Represents the result of an smtp test '''
def __init__(self, exit_node, smtp_site, status):
super(SMTPTestResult, self).__init__(exit_node, smtp_site, status)
+ self.proto = "smtp"
class IMAPTestResult(TestResult):
''' Represents the result of an imap test '''
def __init__(self, exit_node, imap_site, status):
super(IMAPTestResult, self).__init__(exit_node, imap_site, status)
+ self.proto = "imap"
class POPTestResult(TestResult):
''' Represents the result of a pop test '''
def __init__(self, exit_node, pop_site, status):
super(POPTestResult, self).__init__(exit_node, pop_site, status)
+ self.proto = "pop"
class DataHandler:
''' Class for saving and managing test result data '''
@@ -174,7 +241,7 @@
def saveResult(self, result):
''' generic method for saving test results '''
address = ''
- if result.__class__.__name__ == 'HttpTestResult':
+ if result.__class__.__name__ == 'HtmlTestResult' or result.__class__.__name__ == 'HttpTestResult':
address = self.safeFilename(result.site[7:])
elif result.__class__.__name__ == 'SSLTestResult':
address = self.safeFilename(result.site[8:])
@@ -183,7 +250,7 @@
else:
raise Exception, 'This doesn\'t seems to be a result instance.'
- dir = data_dir + result.__class__.__name__[:-10].lower() + '/'
+ dir = data_dir+result.proto.lower()+'/'
if result.status == TEST_SUCCESS:
dir += 'successful/'
if result.status == TEST_INCONCLUSIVE:
@@ -191,7 +258,7 @@
if result.status == TEST_FAILURE:
dir += 'failed/'
- result_file = open(dir + result.exit_node[1:] + "-" + address + '.result', 'w')
+ result_file = open(dir+address+'.result.'+result.exit_node[1:], 'w')
pickle.dump(result, result_file)
result_file.close()
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-01-28 04:17:46 UTC (rev 18284)
+++ torflow/trunk/NetworkScanners/soat.py 2009-01-28 04:29:25 UTC (rev 18285)
@@ -41,6 +41,8 @@
import zlib,gzip
import urlparse
import cookielib
+import sha
+import Queue
from libsoat import *
@@ -66,9 +68,13 @@
# these are used when searching for 'random' urls for testing
wordlist_file = './wordlist.txt';
-allowed_filetypes = ['all','pdf']
-result_per_type = 5
+# Hrmm.. Too many of these and Google really h8s us..
+scan_filetypes = ['exe','pdf','doc','msi']#,'rpm','dmg','pkg','dpkg']
+# Avoid vmware images+isos plz. Nobody could possibly have the patience
+# to download anything much larger than 30MB over Tor anyways ;)
+max_content_size = 30*1024*1024
+
firefox_headers = {
'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -80,12 +86,18 @@
}
# http://www.voidspace.org.uk/python/articles/cookielib.shtml
-google_cookie_file="google_cookies.lwp"
-google_cookies=None
+search_cookie_file="search_cookies.lwp"
+search_cookies=None
-#
+yahoo_search_mode = {"host" : "search.yahoo.com", "query":"p", "filetype": "originurlextension:", "inurl":None, "class":"yschttl", "useragent":False}
+google_search_mode = {"host" : "www.google.com", "query":"q", "filetype":"filetype:", "inurl":"inurl:", "class" : "l", "useragent":True}
+
+# XXX: This does not affect the ssl search.. no other search engines have
+# a working inurl that allows you to pick the scheme to be https like google...
+default_search_mode = yahoo_search_mode
+
+
# ports to test in the consistency test
-#
ports_to_check = [
["pop", ExitPolicyRestriction('255.255.255.255', 110), "pops", ExitPolicyRestriction('255.255.255.255', 995)],
@@ -143,9 +155,25 @@
# Note: the more we add, the greater the potential for false positives...
# We also only care about the ones that work for FF2/FF3.
tags_to_check = ['a', 'area', 'base', 'applet', 'embed', 'form', 'frame',
- 'iframe', 'img', 'link', 'object', 'script', 'meta', 'body']
-attrs_to_check = ['onclick', 'ondblclick', 'onmousedown', 'onmouseup', 'onmouseover',
- 'onmousemove', 'onmouseout', 'onkeypress','onkeydown','onkeyup']
+ 'input', 'iframe', 'img', 'link', 'object', 'script', 'meta',
+ 'body', 'style']
+
+tags_preserve_inner = ['script','style']
+
+attrs_to_check = ['onblur', 'onchange', 'onclick', 'ondblclick', 'onfocus',
+ 'onkeydown', 'onkeypress', 'onkeyup', 'onload','onmousedown',
+ 'onmouseup', 'onmouseover', 'onmousemove', 'onmouseout',
+ 'onreset', 'onselect', 'onsubmit', 'onunload', 'profile',
+ 'src', 'usemap', 'background', 'data', 'classid',
+ 'codebase', 'profile']
+
+tags_to_recurse = ['applet', 'embed', 'object', 'script', 'frame', 'iframe',
+ 'img', 'link', 'a']
+
+recurse_html = ['frame', 'iframe']
+attrs_to_recurse = ['src', 'pluginurl', 'data', 'classid', 'codebase', 'href',
+ 'background']
+
#
# constants
#
@@ -153,11 +181,11 @@
linebreak = '\r\n'
# Http request handling
-def http_request(address, cookie_jar=None):
+def http_request(address, cookie_jar=None, headers=firefox_headers):
''' perform a http GET-request and return the content received '''
request = urllib2.Request(address)
- for h in firefox_headers.iterkeys():
- request.add_header(h, firefox_headers[h])
+ for h in headers.iterkeys():
+ request.add_header(h, headers[h])
content = ""
try:
@@ -168,12 +196,19 @@
cookie_jar.save(cookie_jar.__filename)
else:
reply = urllib2.urlopen(request)
+
+ length = reply.info().get("Content-Length")
+ if length and int(length) > max_content_size:
+ plog("WARN", "Max content size exceeded for "+address+": "+length)
+ return ""
content = decompress_response_data(reply)
except (ValueError, urllib2.URLError):
plog('WARN', 'The http-request address ' + address + ' is malformed')
+ traceback.print_exc()
return ""
except (IndexError, TypeError):
plog('WARN', 'An error occured while negotiating socks5 with Tor')
+ traceback.print_exc()
return ""
except KeyboardInterrupt:
raise KeyboardInterrupt
@@ -192,7 +227,6 @@
self.mt = mt
self.datahandler = DataHandler()
self.min_targets = 10
- self.rewind()
def run_test(self):
raise NotImplemented()
@@ -223,8 +257,14 @@
self.targets = self.get_targets()
if not self.targets:
raise NoURLsFound("No URLS found for protocol "+self.proto)
- targets = "\n\t".join(self.targets)
- plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets)
+ if type(self.targets) == dict:
+ for subtype in self.targets.iterkeys():
+ targets = "\n\t".join(self.targets[subtype])
+ plog("INFO", "Using the following urls for "+self.proto+"/"+subtype+" scan:\n\t"+targets)
+
+ else:
+ targets = "\n\t".join(self.targets)
+ plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets)
self.tests_run = 0
self.nodes_marked = 0
self.nodes = self.mt.get_nodes_for_port(self.port)
@@ -234,17 +274,14 @@
self.total_nodes = len(self.nodes)
-class GoogleBasedTest(Test):
+class SearchBasedTest(Test):
def __init__(self, mt, proto, port, wordlist):
self.wordlist = wordlist
Test.__init__(self, mt, proto, port)
- def get_google_urls(self, protocol='any', results_per_type=10, host_only=False, filetypes=['any']):
+ def get_search_urls(self, protocol='any', results_per_type=10, host_only=False, filetypes=['any'], search_mode=default_search_mode):
'''
construct a list of urls based on the wordlist, filetypes and protocol.
-
- Note: since we currently use google, which doesn't index by protocol,
- searches for anything but 'any' could be rather slow
'''
plog('INFO', 'Searching google for relevant sites...')
@@ -255,22 +292,28 @@
while len(type_urls) < results_per_type:
query = random.choice(self.wordlist)
if filetype != 'any':
- query += ' filetype:' + filetype
- if protocol != 'any':
- query += ' inurl:' + protocol # this isn't too reliable, but we'll re-filter results later
+ query += " "+search_mode["filetype"]+filetype
+ if protocol != 'any' and search_mode["inurl"]:
+ query += " "+search_mode["inurl"]+protocol # this isn't too reliable, but we'll re-filter results later
#query += '&num=' + `g_results_per_page`
# search google for relevant pages
# note: google only accepts requests from idenitified browsers
# TODO gracefully handle the case when google doesn't want to give us result anymore
- host = 'www.google.com'
- params = urllib.urlencode({'q' : query})
+ host = search_mode["host"]
+ params = urllib.urlencode({search_mode["query"] : query})
search_path = '/search' + '?' + params
search_url = "http://"+host+search_path
-
+
+ plog("INFO", "Search url: "+search_url)
try:
# XXX: This does not handle http error codes.. (like 302!)
- content = http_request(search_url, google_cookies)
+ if search_mode["useragent"]:
+ content = http_request(search_url, search_cookies)
+ else:
+ headers = copy.copy(firefox_headers)
+ del headers["User-Agent"]
+ content = http_request(search_url, search_cookies, headers)
except socket.gaierror:
plog('ERROR', 'Scraping of http://'+host+search_path+" failed")
traceback.print_exc()
@@ -291,7 +334,7 @@
return [protocol+"://www.eff.org", protocol+"://www.fastmail.fm", protocol+"://www.torproject.org", protocol+"://secure.wikileaks.org/"]
# get the links and do some additional filtering
- for link in soup.findAll('a', {'class' : 'l'}):
+ for link in soup.findAll('a', {'class' : search_mode["class"]}):
url = link['href']
if (protocol != 'any' and url[:len(protocol)] != protocol or
filetype != 'any' and url[-len(filetype):] != filetype):
@@ -302,34 +345,115 @@
type_urls.append(host)
else:
type_urls.append(link['href'])
-
- if type_urls > results_per_type:
- type_urls = random.sample(type_urls, results_per_type) # make sure we don't get more urls than needed
+ plog("INFO", "Have "+str(len(type_urls))+"/"+str(results_per_type)+" google urls so far..")
+
+ # make sure we don't get more urls than needed
+ # hrmm...
+ #if type_urls > results_per_type:
+ # type_urls = random.sample(type_urls, results_per_type)
urllist.extend(type_urls)
return list(Set(urllist))
-class HTTPTest(GoogleBasedTest):
- # TODO: Create an MD5HTTPTest for filetype scans, and also have this
- # test spawn instances of it for script, image, and object tags
- # Also, spawn copies of ourself for frame and iframe tags
- def __init__(self, mt, wordlist):
- # XXX: Change these to 10 and 20 once we exercise the fetch logic
- self.fetch_targets = 10
- GoogleBasedTest.__init__(self, mt, "HTTP", 80, wordlist)
- self.min_targets = 9
+class HTTPTest(SearchBasedTest):
+ def __init__(self, mt, wordlist, filetypes=scan_filetypes):
+ SearchBasedTest.__init__(self, mt, "HTTP", 80, wordlist)
+ self.fetch_targets = 5
self.three_way_fails = {}
self.two_way_fails = {}
+ self.successes = {}
self.three_way_limit = 10
self.two_way_limit = 250
-
+ self.scan_filetypes = filetypes
+ self.results = []
+
def run_test(self):
+ # A single test should have a single cookie jar
+ self.tor_cookie_jar = cookielib.LWPCookieJar()
+ self.cookie_jar = cookielib.LWPCookieJar()
+ self.headers = copy.copy(firefox_headers)
+
+ ret_result = TEST_SUCCESS
self.tests_run += 1
- return self.check_http(random.choice(self.targets))
+ n_tests = random.choice(xrange(1,len(self.scan_filetypes)+1))
+ filetypes = random.sample(self.scan_filetypes, n_tests)
+
+ plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes))
+
+ for ftype in filetypes:
+ # XXX: Set referrer to random or none for each of these
+ address = random.choice(self.targets[ftype])
+ result = self.check_http(address)
+ if result > ret_result:
+ ret_result = result
+ return ret_result
+
def get_targets(self):
- return self.get_google_urls('http', self.fetch_targets)
+ raw_urls = self.get_search_urls('http', self.fetch_targets, filetypes=self.scan_filetypes)
+ urls = {}
+ # Slow, but meh..
+ for ftype in self.scan_filetypes: urls[ftype] = []
+ for url in raw_urls:
+ for ftype in self.scan_filetypes:
+ if url[-len(ftype):] == ftype:
+ urls[ftype].append(url)
+ return urls
+
+ def register_exit_failure(self, address, exit_node):
+ if address in self.two_way_fails:
+ self.two_way_fails[address].add(exit_node)
+ else:
+ self.two_way_fails[address] = sets.Set([exit_node])
+
+ # TODO: Do something if abundance of succesful tests?
+ # Problem is this can still trigger for localized content
+ err_cnt = len(self.two_way_fails[address])
+ if err_cnt > self.two_way_limit:
+ if address not in self.successes: self.successes[address] = 0
+ plog("NOTICE", "Excessive HTTP 2-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
+
+ self.remove_target(address)
+ del self.three_way_limit[address]
+ del self.successes[address]
+ del self.two_way_limit[address]
+ kill_results = []
+ for r in self.results:
+ kill_results.append(r)
+ for r in kill_results:
+ #r.remove_files()
+ self.results.remove(r)
+ else:
+ plog("ERROR", self.proto+" 2-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
+
+ def register_dynamic_failure(self, address, exit_node):
+ if address in self.three_way_fails:
+ self.three_way_fails[address].add(exit_node)
+ else:
+ self.three_way_fails[address] = sets.Set([exit_node])
+
+ err_cnt = len(self.three_way_fails[address])
+ if err_cnt > self.three_way_limit:
+ # Remove all associated data for this url.
+ # (Note, this also seems to imply we should report BadExit in bulk,
+ # after we've had a chance for these false positives to be weeded out)
+ if address not in self.successes: self.successes[address] = 0
+ plog("NOTICE", "Excessive HTTP 3-way failure ("+str(err_cnt)+" vs "+str(self.successes[address])+") for "+address+". Removing.")
+
+ self.remove_target(address)
+ del self.three_way_limit[address]
+ del self.successes[address]
+ del self.two_way_limit[address]
+ kill_results = []
+ for r in self.results:
+ kill_results.append(r)
+ for r in kill_results:
+ #r.remove_files()
+ self.results.remove(r)
+ else:
+ plog("ERROR", self.proto+" 3-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
+
def check_http(self, address):
''' check whether a http connection to a given address is molested '''
plog('INFO', 'Conducting an http test with destination ' + address)
@@ -338,48 +462,276 @@
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
socket.socket = socks.socksocket
- pcontent = http_request(address)
+ pcontent = http_request(address, self.tor_cookie_jar, self.headers)
+ psha1sum = sha.sha(pcontent)
# reset the connection to direct
socket.socket = defaultsocket
exit_node = self.mt.get_exit_node()
if exit_node == 0 or exit_node == '0' or not exit_node:
- plog('INFO', 'We had no exit node to test, skipping to the next test.')
+ plog('WARN', 'We had no exit node to test, skipping to the next test.')
return TEST_SUCCESS
# an address representation acceptable for a filename
address_file = self.datahandler.safeFilename(address[7:])
+ content_prefix = http_content_dir+address_file
+ failed_prefix = http_failed_dir+address_file
# if we have no content, we had a connection error
if pcontent == "":
- result = HttpTestResult(exit_node, address, 0, TEST_INCONCLUSIVE)
+ plog("NOTICE", exit_node+" failed to fetch content for "+address)
+ result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_NOEXITCONTENT)
+ self.results.append(result)
self.datahandler.saveResult(result)
return TEST_INCONCLUSIVE
- elements = SoupStrainer(lambda name, attrs : name in tags_to_check or
- len(Set(attrs).intersection(Set(attrs_to_check))) > 0)
+ try:
+ # Load content from disk, md5
+ content_file = open(content_prefix+'.content', 'r')
+ sha1sum = sha.sha()
+ buf = content_file.read(4096)
+ while buf:
+ sha1sum.update(buf)
+ buf = content_file.read(4096)
+ content_file.close()
+
+ except IOError:
+ content = http_request(address, self.cookie_jar, self.headers)
+ if not content:
+ plog("WARN", "Failed to direct load "+address)
+ return TEST_INCONCLUSIVE
+ sha1sum = sha.sha(content)
+
+ content_file = open(content_prefix+'.content', 'w')
+ content_file.write(content)
+ content_file.close()
+
+ except TypeError, e:
+ plog('ERROR', 'Failed obtaining the shasum for ' + address)
+ plog('ERROR', e)
+ return TEST_INCONCLUSIVE
+
+ # compare the content
+ # if content matches, everything is ok
+ if psha1sum.hexdigest() == sha1sum.hexdigest():
+ result = HttpTestResult(exit_node, address, TEST_SUCCESS)
+ self.results.append(result)
+ #self.datahandler.saveResult(result)
+ if address in self.successes: self.successes[address]+=1
+ else: self.successes[address]=1
+ return TEST_SUCCESS
+
+ # if content doesnt match, update the direct content
+ content_new = http_request(address, self.cookie_jar, self.headers)
+ if not content_new:
+ plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
+ result = HttpTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_NOLOCALCONTENT)
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+ return TEST_INCONCLUSIVE
+
+ sha1sum_new = sha.sha(content_new)
+
+ # compare the new and old content
+ # if they match, means the node has been changing the content
+ if sha1sum.hexdigest() == sha1sum_new.hexdigest():
+ # XXX: Check for existence of this file before overwriting
+ exit_content_file = open(failed_prefix+'.content.'+exit_node[1:], 'w')
+ exit_content_file.write(pcontent)
+ exit_content_file.close()
+
+ result = HttpTestResult(exit_node, address, TEST_FAILURE,
+ FAILURE_EXITONLY, sha1sum.hexdigest(),
+ psha1sum.hexdigest(), content_prefix+".content",
+ exit_content_file.name)
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+
+ self.register_exit_failure(address, exit_node)
+ return TEST_FAILURE
+
+ # if content has changed outside of tor, update the saved file
+ os.rename(content_prefix+'.content', content_prefix+'.content-old')
+ new_content_file = open(content_prefix+'.content', 'w')
+ new_content_file.write(content_new)
+ new_content_file.close()
+
+ # compare the node content and the new content
+ # if it matches, everything is ok
+ if psha1sum.hexdigest() == sha1sum_new.hexdigest():
+ result = HttpTestResult(exit_node, address, TEST_SUCCESS)
+ self.results.append(result)
+ #self.datahandler.saveResult(result)
+ if address in self.successes: self.successes[address]+=1
+ else: self.successes[address]=1
+ return TEST_SUCCESS
+
+ # XXX: Check for existence of this file before overwriting
+ exit_content_file = open(failed_prefix+'.dyn-content.'+exit_node[1:], 'w')
+ exit_content_file.write(pcontent)
+ exit_content_file.close()
+
+ result = HttpTestResult(exit_node, address, TEST_FAILURE,
+ FAILURE_DYNAMICTAGS, sha1sum_new.hexdigest(),
+ psha1sum.hexdigest(), new_content_file.name,
+ exit_content_file.name,
+ content_prefix+'.content-old',
+ sha1sum.hexdigest())
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+
+ self.register_dynamic_failure(address, exit_node)
+ return TEST_FAILURE
+
+class HTMLTest(HTTPTest):
+ def __init__(self, mt, wordlist, recurse_filetypes=scan_filetypes):
+ # XXX: Change these to 10 and 20 once we exercise the fetch logic
+ HTTPTest.__init__(self, mt, wordlist, recurse_filetypes)
+ self.proto = "HTML"
+ self.min_targets = 9
+ self.recurse_filetypes = recurse_filetypes
+ self.fetch_queue = Queue.Queue()
+
+ def run_test(self):
+ # A single test should have a single cookie jar
+ self.tor_cookie_jar = cookielib.LWPCookieJar()
+ self.cookie_jar = cookielib.LWPCookieJar()
+ self.headers = copy.copy(firefox_headers)
+
+ ret_result = TEST_SUCCESS
+ self.tests_run += 1
+ # XXX: Set referrer to address for subsequent fetches
+ # XXX: Set referrer to random or none for initial fetch
+ address = random.choice(self.targets)
+
+ self.fetch_queue.put_nowait(("html", address))
+ while not self.fetch_queue.empty():
+ (test, url) = self.fetch_queue.get_nowait()
+ if test == "html": result = self.check_html(url)
+ elif test == "http": result = self.check_http(url)
+ else:
+ plog("WARN", "Unknown test type: "+test+" for "+url)
+ result = TEST_SUCCESS
+ if result > ret_result:
+ ret_result = result
+ return ret_result
+
+ def get_targets(self):
+ return self.get_search_urls('http', self.fetch_targets)
+
+ def add_recursive_targets(self, soup, orig_addr):
+ # XXX: Watch for spider-traps! (ie mutually sourcing iframes)
+ # Only pull at most one filetype from the list of 'a' links
+ targets = []
+ got_type = {}
+ # Hrmm, if we recursively strained only these tags, this might be faster
+ for tag in tags_to_recurse:
+ tags = soup.findAll(tag)
+ for t in tags:
+ plog("DEBUG", "Got tag: "+str(t))
+ for a in t.attrs:
+ attr_name = str(a[0])
+ attr_tgt = str(a[1])
+ if attr_name in attrs_to_recurse:
+ if str(t.name) in recurse_html:
+ plog("NOTICE", "Adding html "+str(t.name)+" target: "+attr_tgt)
+ targets.append(("html", urlparse.urljoin(orig_addr, attr_tgt)))
+ elif str(t.name) == 'a':
+ if attr_name == "href":
+ for f in self.recurse_filetypes:
+ if f not in got_type and attr_tgt[-len(f):] == f:
+ got_type[f] = 1
+ plog("NOTICE", "Adding http a target: "+attr_tgt)
+ targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt)))
+ else:
+ plog("NOTICE", "Adding http "+str(t.name)+" target: "+attr_tgt)
+ targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt)))
+ for i in sets.Set(targets):
+ self.fetch_queue.put_nowait(i)
+
+ def recursive_strain(self, soup):
+ """ Remove all tags that are of no interest. Also remove content """
+ to_extract = []
+ for tag in soup.findAll():
+ if not tag.name in tags_to_check or not tag.attr in attrs_to_check:
+ to_extract.append(tag)
+ if tag.name not in tags_preserve_inner:
+ for child in tag.childGenerator():
+ to_extract.append(child)
+ for tag in to_extract:
+ tag.extract()
+ return soup
+
+ def check_html(self, address):
+ ''' check whether a http connection to a given address is molested '''
+ plog('INFO', 'Conducting an html test with destination ' + address)
+
+ defaultsocket = socket.socket
+ socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
+ socket.socket = socks.socksocket
+
+ pcontent = http_request(address, self.tor_cookie_jar, self.headers)
+
+ # reset the connection to direct
+ socket.socket = defaultsocket
+
+ exit_node = self.mt.get_exit_node()
+ if exit_node == 0 or exit_node == '0' or not exit_node:
+ plog('WARN', 'We had no exit node to test, skipping to the next test.')
+ return TEST_SUCCESS
+
+ # an address representation acceptable for a filename
+ address_file = self.datahandler.safeFilename(address[7:])
+ content_prefix = http_content_dir+address_file
+ failed_prefix = http_failed_dir+address_file
+
+ # if we have no content, we had a connection error
+ if pcontent == "":
+ plog("NOTICE", exit_node+" failed to fetch content for "+address)
+ result = HtmlTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_NOEXITCONTENT)
+ self.results.append(result)
+ self.datahandler.saveResult(result)
+ return TEST_INCONCLUSIVE
+
+ # XXX: We need to remove the content between some of these tags..
+ elements = SoupStrainer(lambda name, attrs: name in tags_to_check or
+ len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
pcontent = pcontent.decode('ascii', 'ignore')
- psoup = BeautifulSoup(pcontent, parseOnlyThese=elements)
+ psoup = self.recursive_strain(BeautifulSoup(pcontent, parseOnlyThese=elements))
+ # Also find recursive urls
+ recurse_elements = SoupStrainer(lambda name, attrs:
+ name in tags_to_recurse and
+ len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_recurse))) > 0)
+ self.add_recursive_targets(BeautifulSoup(pcontent, recurse_elements),
+ address)
+
# load the original tag structure
# if we don't have any yet, get it
soup = 0
try:
- tag_file = open(http_tags_dir + address_file + '.tags', 'r')
+ tag_file = open(content_prefix+'.tags', 'r')
soup = BeautifulSoup(tag_file.read())
tag_file.close()
+
except IOError:
- content = http_request(address)
+ content = http_request(address, self.cookie_jar, self.headers)
content = content.decode('ascii','ignore')
soup = BeautifulSoup(content, parseOnlyThese=elements)
- tag_file = open(http_tags_dir + address_file + '.tags', 'w')
- tag_file.write(soup.__str__() + ' ') # the space is needed in case we have some page with no matching tags at all
+
+ tag_file = open(content_prefix+'.tags', 'w')
+ # the space is needed in case we have some page with no matching tags at all
+ tag_file.write(soup.__str__() + ' ')
tag_file.close()
- content_file = open(http_tags_dir+address_file+'.content-orig', 'w')
+ content_file = open(content_prefix+'.content', 'w')
content_file.write(content)
content_file.close()
+
except TypeError, e:
plog('ERROR', 'Failed parsing the tag tree for ' + address)
plog('ERROR', e)
@@ -391,100 +743,102 @@
# compare the content
# if content matches, everything is ok
if psoup == soup:
- result = HttpTestResult(exit_node, address, 0, TEST_SUCCESS)
- self.datahandler.saveResult(result)
+ result = HtmlTestResult(exit_node, address, TEST_SUCCESS)
+ self.results.append(result)
+ #self.datahandler.saveResult(result)
+ if address in self.successes: self.successes[address]+=1
+ else: self.successes[address]=1
return TEST_SUCCESS
# if content doesnt match, update the direct content
- content_new = http_request(address)
+ content_new = http_request(address, self.cookie_jar, self.headers)
content_new = content_new.decode('ascii', 'ignore')
if not content_new:
- result = HttpTestResult(exit_node, address, 0, TEST_INCONCLUSIVE)
+ plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
+ result = HtmlTestResult(exit_node, address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_NOLOCALCONTENT)
+ self.results.append(result)
self.datahandler.saveResult(result)
return TEST_INCONCLUSIVE
- soup_new = BeautifulSoup(content_new, parseOnlyThese=elements)
+ soup_new = self.recursive_strain(BeautifulSoup(content_new,
+ parseOnlyThese=elements))
# compare the new and old content
# if they match, means the node has been changing the content
if soup == soup_new:
- result = HttpTestResult(exit_node, address, 0, TEST_FAILURE)
+ # XXX: Check for existence of this file before overwriting
+ exit_tag_file = open(failed_prefix+'.tags.'+exit_node[1:],'w')
+ exit_tag_file.write(psoup.__str__())
+ exit_tag_file.close()
+
+ exit_content_file = open(failed_prefix+'.content.'+exit_node[1:], 'w')
+ exit_content_file.write(pcontent)
+ exit_content_file.close()
+
+ result = HtmlTestResult(exit_node, address, TEST_FAILURE,
+ FAILURE_EXITONLY, tag_file.name,
+ exit_tag_file.name, content_prefix+".content",
+ exit_content_file.name)
+ self.results.append(result)
self.datahandler.saveResult(result)
- tag_file = open(http_tags_dir + exit_node[1:] + '_' + address_file + '.tags', 'w')
- tag_file.write(psoup.__str__())
- tag_file.close()
-
- content_file = open(http_tags_dir+exit_node[1:]+'_'+address_file+'.content-exit', 'w')
- content_file.write(pcontent)
- content_file.close()
- if address in self.two_way_fails:
- self.two_way_fails[address].add(exit_node.idhex)
- else:
- self.two_way_fails[address] = sets.Set([exit_node.idhex])
-
- err_cnt = len(self.two_way_fails[address])
- if err_cnt > self.two_way_limit:
- plog("NOTICE", "Excessive HTTP 2-way failure for "+address+". Removing.")
- self.remove_target(address)
- else:
- plog("ERROR", "HTTP 2-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
+ self.register_exit_failure(address, exit_node)
return TEST_FAILURE
# if content has changed outside of tor, update the saved file
- tag_file = open(http_tags_dir + address_file + '.tags', 'w')
+ os.rename(content_prefix+'.tags', content_prefix+'.tags-old')
+ tag_file = open(content_prefix+'.tags', 'w')
tag_file.write(soup_new.__str__())
tag_file.close()
+ os.rename(content_prefix+'.content', content_prefix+'.content-old')
+ new_content_file = open(content_prefix+'.content', 'w')
+ new_content_file.write(content_new)
+ new_content_file.close()
+
# compare the node content and the new content
# if it matches, everything is ok
if psoup == soup_new:
- result = HttpTestResult(exit_node, address, 0, TEST_SUCCESS)
- self.datahandler.saveResult(result)
+ result = HtmlTestResult(exit_node, address, TEST_SUCCESS)
+ self.results.append(result)
+ #self.datahandler.saveResult(result)
+ if address in self.successes: self.successes[address]+=1
+ else: self.successes[address]=1
return TEST_SUCCESS
- # if it doesn't match, means the node has been changing the content
- result = HttpTestResult(exit_node, address, 0, TEST_FAILURE)
- self.datahandler.saveResult(result)
- tag_file = open(http_tags_dir + exit_node[1:] + '_' + address_file + '.tags', 'w')
- tag_file.write(psoup.__str__())
- tag_file.close()
+ # XXX: Check for existence of this file before overwriting
+ exit_tag_file = open(failed_prefix+'.dyn-tags.'+exit_node[1:],'w')
+ exit_tag_file.write(psoup.__str__())
+ exit_tag_file.close()
- content_file = open(http_tags_dir+exit_node[1:]+'_'+address_file+'.content-exit', 'w')
- content_file.write(pcontent)
- content_file.close()
+ exit_content_file = open(failed_prefix+'.dyn-content.'+exit_node[1:], 'w')
+ exit_content_file.write(pcontent)
+ exit_content_file.close()
- content_file = open(http_tags_dir+exit_node[1:]+'_'+address_file+'.content-new', 'w')
- content_file.write(content_new)
- content_file.close()
+ result = HtmlTestResult(exit_node, address, TEST_FAILURE,
+ FAILURE_DYNAMICTAGS, tag_file.name,
+ exit_tag_file.name, new_content_file.name,
+ exit_content_file.name,
+ content_prefix+'.content-old',
+ content_prefix+'.tags-old')
+ self.results.append(result)
+ self.datahandler.saveResult(result)
- if address in self.three_way_fails:
- self.three_way_fails[address].add(exit_node.idhex)
- else:
- self.three_way_fails[address] = sets.Set([exit_node.idhex])
+ self.register_dynamic_failure(address, exit_node)
+ return TEST_FAILURE
- err_cnt = len(self.three_way_fails[address])
- if err_cnt > self.three_way_limit:
- # FIXME: Remove all associated data for this url.
- # (Note, this also seems to imply we should report BadExit in bulk,
- # after we've had a chance for these false positives to be weeded out)
- plog("NOTICE", "Excessive HTTP 3-way failure for "+address+". Removing.")
- self.remove_target(address)
- else:
- plog("ERROR", "HTTP 3-way failure at "+exit_node+". This makes "+str(err_cnt)+" node failures for "+address)
-
- return TEST_FAILURE
-class SSLTest(GoogleBasedTest):
+class SSLTest(SearchBasedTest):
def __init__(self, mt, wordlist):
- self.test_hosts = 15
- GoogleBasedTest.__init__(self, mt, "SSL", 443, wordlist)
+ self.test_hosts = 10
+ SearchBasedTest.__init__(self, mt, "SSL", 443, wordlist)
def run_test(self):
self.tests_run += 1
return self.check_openssl(random.choice(self.targets))
def get_targets(self):
- return self.get_google_urls('https', self.test_hosts, True)
+ return self.get_search_urls('https', self.test_hosts, True, search_mode=google_search_mode)
def ssl_request(self, address):
''' initiate an ssl connection and return the server certificate '''
@@ -565,7 +919,7 @@
plog('WARN', 'Error getting the correct cert for ' + address)
return TEST_INCONCLUSIVE
if original_cert.has_expired():
- plog('WARN', 'The ssl cert for ' + address + 'seems to have expired. Skipping to the next test...')
+ plog('WARN', 'The ssl cert for '+address+' seems to have expired. Skipping to the next test...')
return TEST_INCONCLUSIVE
cert_file = open(ssl_certs_dir + address_file + '.pem', 'w')
cert_file.write(crypto.dump_certificate(crypto.FILETYPE_PEM, original_cert))
@@ -1413,23 +1767,25 @@
if len(argv) < 2:
print ''
print 'Please provide at least one test option:'
- print '--ssl (~works)'
- print '--http (gives some false positives)'
- print '--ssh (doesn\'t work yet)'
- print '--smtp (~works)'
- print '--pop (~works)'
- print '--imap (~works)'
- print '--dnsrebind (works with the ssl test)'
- print '--policies (~works)'
+ print '--ssl'
+ print '--http'
+ print '--html'
+# print '--ssh (doesn\'t work yet)'
+# print '--smtp (~works)'
+# print '--pop (~works)'
+# print '--imap (~works)'
+ print '--dnsrebind (use with one or more of above tests)'
+ print '--policies'
print ''
return
- opts = ['ssl','http','ssh','smtp','pop','imap','dns','dnsrebind','policies']
+ opts = ['ssl','html','http','ssh','smtp','pop','imap','dns','dnsrebind','policies']
flags, trailer = getopt.getopt(argv[1:], [], opts)
# get specific test types
do_ssl = ('--ssl','') in flags
do_http = ('--http','') in flags
+ do_html = ('--html','') in flags
do_ssh = ('--ssh','') in flags
do_smtp = ('--smtp','') in flags
do_pop = ('--pop','') in flags
@@ -1452,16 +1808,16 @@
mt.check_all_exits_port_consistency()
# maybe only the consistency test was required
- if not (do_ssl or do_http or do_ssh or do_smtp or do_pop or do_imap):
+ if not (do_ssl or do_html or do_http or do_ssh or do_smtp or do_pop or do_imap):
plog('INFO', 'Done.')
return
# Load the cookie jar
- global google_cookies
- google_cookies = cookielib.LWPCookieJar()
- if os.path.isfile(google_cookie_file):
- google_cookies.load(google_cookie_file)
- google_cookies.__filename = google_cookie_file
+ global search_cookies
+ search_cookies = cookielib.LWPCookieJar()
+ if os.path.isfile(search_cookie_file):
+ search_cookies.load(search_cookie_file)
+ search_cookies.__filename = search_cookie_file
tests = {}
@@ -1478,6 +1834,12 @@
except NoURLsFound, e:
plog('ERROR', e.message)
+ if do_html:
+ try:
+ tests["HTML"] = HTMLTest(mt, wordlist)
+ except NoURLsFound, e:
+ plog('ERROR', e.message)
+
if do_smtp:
try:
tests["SMTPS"] = SMTPSTest(mt)
@@ -1497,10 +1859,13 @@
plog('ERROR', e.message)
# maybe no tests could be initialized
- if not (do_ssl or do_http or do_ssh or do_smtp or do_pop or do_imap):
+ if not (do_ssl or do_html or do_http or do_ssh or do_smtp or do_pop or do_imap):
plog('INFO', 'Done.')
sys.exit(0)
-
+
+ for test in tests.itervalues():
+ test.rewind()
+
# start testing
while 1:
# Get as much milage out of each exit as we safely can:
Modified: torflow/trunk/NetworkScanners/soatcli.py
===================================================================
--- torflow/trunk/NetworkScanners/soatcli.py 2009-01-28 04:17:46 UTC (rev 18284)
+++ torflow/trunk/NetworkScanners/soatcli.py 2009-01-28 04:29:25 UTC (rev 18285)
@@ -65,7 +65,7 @@
if result.__class__.__name__ == 'SSHTestResult':
sshSet.add(result.exit_node)
ssh += 1
- elif result.__class__.__name__ == 'HttpTestResult':
+ elif result.__class__.__name__ == 'HttpTestResult' or result.__class__.__name__ == 'HtmlTestResult':
httpSet.add(result.exit_node)
http += 1
elif result.__class__.__name__ == 'SSLTestResult':
More information about the tor-commits
mailing list