[tor-commits] [torflow/master] Modernizing search modes. Adding support to soat.py

mikeperry at torproject.org mikeperry at torproject.org
Wed Sep 14 01:45:07 UTC 2011


commit b4be3a63400e658eb26436d97fca766907ecf91c
Author: Christian Anderson <christian at avtok.com>
Date:   Tue May 24 12:46:04 2011 -0400

    Modernizing search modes. Adding support to soat.py
---
 .gitignore                                   |    3 ++
 NetworkScanners/ExitAuthority/soat.py        |   30 +++++++++++++------------
 NetworkScanners/ExitAuthority/soat_config.py |   28 +++++++++++++-----------
 3 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b4e8d7b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+NetworkScanners/ExitAuthority/data/
+NetworkScanners/ExitAuthority/search_cookies.lwp
\ No newline at end of file
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index 162ff0e..97d310b 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -2076,12 +2076,14 @@ class SearchBasedTest:
     count = 0
     while len(type_urls) < self.results_per_type and count < max_search_retry:
       count += 1
+
+      # Try to filter by filetype/protocol in the query itself. This is unreliable, so results are re-filtered below.
       query = random.choice(self.wordlist)
       if filetype != 'any':
         query += " "+self.search_mode["filetype"]+filetype
       plog("WARN", "RESULTPROTOCOL IS:" + self.result_protocol)
-      if self.result_protocol != 'any' and self.search_mode["inurl"]:
-        query += " "+self.search_mode["inurl"]+self.result_protocol # this isn't too reliable, but we'll re-filter results later
+      if self.result_protocol == 'https' and self.search_mode["inurl"]:
+        query += " " + self.search_mode["inurl"] + "https"
       #query += '&num=' + `g_results_per_page`
 
       # search google for relevant pages
@@ -2124,19 +2126,19 @@ class SearchBasedTest:
         traceback.print_exc()
         print "Content is: "+str(content)
         break
+
       # get the links and do some additional filtering
+      assert(self.search_mode["class"])
       for link in soup.findAll('a'):
-        skip = True
-        for a in link.attrs:
-          if a[0] == "class" and self.search_mode["class"] in a[1]:
-            skip = False
-            break
-        if skip:
-          continue
-        if link.has_key(self.search_mode['realtgt']):
-          url = link[self.search_mode['realtgt']]
-        else:
-          url = link['href']
+        #Filter based on class of link
+        try:
+          if self.search_mode["class"] != link["class"]:
+            continue
+        except KeyError: continue
+
+        #Get real target
+        url = link[self.search_mode['realtgt']]
+
         if self.result_protocol == 'any':
           prot_list = None
         else:
@@ -2158,7 +2160,7 @@ class SearchBasedTest:
             type_urls.add(url)
         else:
           pass
-    plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..")
+      plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..")
     return type_urls
 
 class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
diff --git a/NetworkScanners/ExitAuthority/soat_config.py b/NetworkScanners/ExitAuthority/soat_config.py
index 3e13463..39f8165 100644
--- a/NetworkScanners/ExitAuthority/soat_config.py
+++ b/NetworkScanners/ExitAuthority/soat_config.py
@@ -40,8 +40,8 @@ max_content_size = 256*1024
 # Bind refetches of docuements to a specific source IP.
 # Useful for eliminating false positives that arise
 # from IP-based identifiers encoded in content
-#refetch_ip = None
-refetch_ip = "4.4.4.4"
+refetch_ip = None
+#refetch_ip = "4.4.4.4"
 
 # Email settings for email scans.
 from_email = "Tor Exit Scanner <noreply at torproject.org>"
@@ -134,21 +134,23 @@ search_cookie_file="./search_cookies.lwp"
 # Search mode. 
 # Leave these maps alone. Change the default_search_mode variable 
 # to what you want.
-# XXX: Make a bing search mode.
-yahoo_search_mode = {"host" : "search.yahoo.com", "query":"p", "filetype": "originurlextension:", \
-                      "inurl":None, "class":"yschttl", "realtgt":"ourl", "useragent":False, \
-                      "extra":[]}
-google_search_mode = {"host" : "www.google.com", "query":"q", "filetype":"filetype:", \
-                      "inurl":"inurl:", "class" : "l", "realtgt":"href", "useragent":True, \
-                      "extra":[]}
-ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_terms", "filetype":"title:", \
+# XXX: Make a bing search mode and a DuckDuckGo search mode
+
+# Yahoo is no longer supported because it makes its results difficult to scrape.
+#yahoo_search_mode = {"host" : "search.yahoo.com/search", "query":"p", "filetype": "vf:", \
+#                      "inurl":None, "class":"yschttl", "realtgt":"ourl", "useragent":False, \
+#                      "extra":[]}
+
+google_search_mode = {"host" : "www.google.com/search", "query":"q", "filetype":"filetype:", \
+                       "inurl":"inurl:", "class" : "l", "realtgt":"href", "useragent":True, \
+                       "extra":[]}
+
+ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_terms", "filetype":"url:.", \
                       "inurl":"url:", "class" : "title2", "realtgt":"href", "useragent":False, \
                       "extra":[("prfh","disable_family_filterEEE1N1Nnum_of_resultsEEE50N1Ndisable_video_family_filterEEE1N1N")]}
  
-# FIXME: This does not affect the ssl search.. Only Google has 
-# a working "inurl:" that allows you to pick the scheme to be https 
+
 #default_search_mode = google_search_mode
-#default_search_mode = yahoo_search_mode
 default_search_mode = ixquick_search_mode
 
 # Regex of characters we consider unsafe to write to the filesystem





More information about the tor-commits mailing list