[tor-commits] [torflow/master] Modernizing search modes. Adding support to soat.py
mikeperry at torproject.org
mikeperry at torproject.org
Wed Sep 14 01:45:07 UTC 2011
commit b4be3a63400e658eb26436d97fca766907ecf91c
Author: Christian Anderson <christian at avtok.com>
Date: Tue May 24 12:46:04 2011 -0400
Modernizing search modes. Adding support to soat.py
---
.gitignore | 3 ++
NetworkScanners/ExitAuthority/soat.py | 30 +++++++++++++------------
NetworkScanners/ExitAuthority/soat_config.py | 28 +++++++++++++-----------
3 files changed, 34 insertions(+), 27 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b4e8d7b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+NetworkScanners/ExitAuthority/data/
+NetworkScanners/ExitAuthority/search_cookies.lwp
\ No newline at end of file
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index 162ff0e..97d310b 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -2076,12 +2076,14 @@ class SearchBasedTest:
count = 0
while len(type_urls) < self.results_per_type and count < max_search_retry:
count += 1
+
+ #Try to filter based on filetype/protocol. Unreliable. We will re-filter.
query = random.choice(self.wordlist)
if filetype != 'any':
query += " "+self.search_mode["filetype"]+filetype
plog("WARN", "RESULTPROTOCOL IS:" + self.result_protocol)
- if self.result_protocol != 'any' and self.search_mode["inurl"]:
- query += " "+self.search_mode["inurl"]+self.result_protocol # this isn't too reliable, but we'll re-filter results later
+ if self.result_protocol == 'https' and self.search_mode["inurl"]:
+ query += " " + self.search_mode["inurl"] + "https"
#query += '&num=' + `g_results_per_page`
# search google for relevant pages
@@ -2124,19 +2126,19 @@ class SearchBasedTest:
traceback.print_exc()
print "Content is: "+str(content)
break
+
# get the links and do some additional filtering
+ assert(self.search_mode["class"])
for link in soup.findAll('a'):
- skip = True
- for a in link.attrs:
- if a[0] == "class" and self.search_mode["class"] in a[1]:
- skip = False
- break
- if skip:
- continue
- if link.has_key(self.search_mode['realtgt']):
- url = link[self.search_mode['realtgt']]
- else:
- url = link['href']
+ #Filter based on class of link
+ try:
+ if self.search_mode["class"] != link["class"]:
+ continue
+ except KeyError: continue
+
+ #Get real target
+ url = link[self.search_mode['realtgt']]
+
if self.result_protocol == 'any':
prot_list = None
else:
@@ -2158,7 +2160,7 @@ class SearchBasedTest:
type_urls.add(url)
else:
pass
- plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..")
+ plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..")
return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
diff --git a/NetworkScanners/ExitAuthority/soat_config.py b/NetworkScanners/ExitAuthority/soat_config.py
index 3e13463..39f8165 100644
--- a/NetworkScanners/ExitAuthority/soat_config.py
+++ b/NetworkScanners/ExitAuthority/soat_config.py
@@ -40,8 +40,8 @@ max_content_size = 256*1024
# Bind refetches of documents to a specific source IP.
# Useful for eliminating false positives that arise
# from IP-based identifiers encoded in content
-#refetch_ip = None
-refetch_ip = "4.4.4.4"
+refetch_ip = None
+#refetch_ip = "4.4.4.4"
# Email settings for email scans.
from_email = "Tor Exit Scanner <noreply at torproject.org>"
@@ -134,21 +134,23 @@ search_cookie_file="./search_cookies.lwp"
# Search mode.
# Leave these maps alone. Change the default_search_mode variable
# to what you want.
-# XXX: Make a bing search mode.
-yahoo_search_mode = {"host" : "search.yahoo.com", "query":"p", "filetype": "originurlextension:", \
- "inurl":None, "class":"yschttl", "realtgt":"ourl", "useragent":False, \
- "extra":[]}
-google_search_mode = {"host" : "www.google.com", "query":"q", "filetype":"filetype:", \
- "inurl":"inurl:", "class" : "l", "realtgt":"href", "useragent":True, \
- "extra":[]}
-ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_terms", "filetype":"title:", \
+# XXX: Make a bing search mode and a DuckDuckGo search mode
+
+#Yahoo is no longer supported because they make it difficult to scrape their results
+#yahoo_search_mode = {"host" : "search.yahoo.com/search", "query":"p", "filetype": "vf:", \
+# "inurl":None, "class":"yschttl", "realtgt":"ourl", "useragent":False, \
+# "extra":[]}
+
+google_search_mode = {"host" : "www.google.com/search", "query":"q", "filetype":"filetype:", \
+ "inurl":"inurl:", "class" : "l", "realtgt":"href", "useragent":True, \
+ "extra":[]}
+
+ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_terms", "filetype":"url:.", \
"inurl":"url:", "class" : "title2", "realtgt":"href", "useragent":False, \
"extra":[("prfh","disable_family_filterEEE1N1Nnum_of_resultsEEE50N1Ndisable_video_family_filterEEE1N1N")]}
-# FIXME: This does not affect the ssl search.. Only Google has
-# a working "inurl:" that allows you to pick the scheme to be https
+
#default_search_mode = google_search_mode
-#default_search_mode = yahoo_search_mode
default_search_mode = ixquick_search_mode
# Regex of characters we consider unsafe to write to the filesystem
More information about the tor-commits mailing list