[tor-commits] [stem/master] Skip pointless extra read of network status documents

Thu Dec 6 17:06:42 UTC 2012

commit 51141d37b9a26a8a05cfa09cf85f3939ab79011a
Author: Damian Johnson <atagar at torproject.org>
Date:   Thu Dec 6 09:02:35 2012 -0800

    Skip pointless extra read of network status documents
    
    When parsing a network status document we read to the end of the router status
    entries solely to determine where the end of the file is. This is a waste of
    time and memory. Instead, check for the end of the section as we go along.
---
 stem/control.py                        |    2 +
 stem/descriptor/__init__.py            |   14 +++++++++--
 stem/descriptor/networkstatus.py       |    4 +-
 stem/descriptor/router_status_entry.py |   36 ++++++++++++++++++++++++-------
 4 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/stem/control.py b/stem/control.py
index 6b19c74..0803ad4 100644
--- a/stem/control.py
+++ b/stem/control.py
@@ -36,7 +36,9 @@ providing its own for interacting at a higher level.
     |- map_address - maps one address to another such that connections to the original are replaced with the other
     |- get_version - convenience method to get tor version
     |- get_server_descriptor - querying the server descriptor for a relay
+    |- get_server_descriptors - provides all presently available server descriptors
     |- get_network_status - querying the router status entry for a relay
+    |- get_network_statuses - provides all presently available router status entries
     |- authenticate - convenience method to authenticate the controller
     +- protocolinfo - convenience method to get the protocol info
   
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 4c5841e..7291995 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -175,7 +175,7 @@ class Descriptor(object):
   def __str__(self):
     return self._raw_contents
 
-def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None):
+def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None, include_ending_keyword = False):
   """
   Reads from the descriptor file until we get to one of the given keywords or reach the
   end of the file.
@@ -187,11 +187,14 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_fi
     given keywords
   :param bool skip: skips buffering content, returning None
   :param int end_position: end if we reach this point in the file
+  :param bool include_ending_keyword: provides the keyword we broke on if **True**
   
-  :returns: **list** with the lines until we find one of the keywords
+  :returns: **list** with the lines until we find one of the keywords; if include_ending_keyword is **True** this is instead a two value tuple of that list and the ending keyword
   """
   
   content = None if skip else []
+  ending_keyword = None
+  
   if type(keywords) == str: keywords = (keywords,)
   
   if ignore_first:
@@ -218,6 +221,8 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_fi
       line_keyword = line_match.groups()[0]
     
     if line_keyword in keywords:
+      ending_keyword = line_keyword
+      
       if not inclusive:
         descriptor_file.seek(last_position)
       elif content is not None:
@@ -227,7 +232,10 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_fi
     elif content is not None:
       content.append(line)
   
-  return content
+  if include_ending_keyword:
+    return (content, ending_keyword)
+  else:
+    return content
 
 def _get_pseudo_pgp_block(remaining_contents):
   """
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py
index ca6aca2..efe40e1 100644
--- a/stem/descriptor/networkstatus.py
+++ b/stem/descriptor/networkstatus.py
@@ -296,7 +296,7 @@ class NetworkStatusDocumentV2(NetworkStatusDocument):
       validate,
       entry_class = stem.descriptor.router_status_entry.RouterStatusEntryV2,
       entry_keyword = ROUTERS_START,
-      section_end_keywords = V2_FOOTER_START,
+      section_end_keywords = (V2_FOOTER_START,),
       extra_args = (self,),
     ))
     
@@ -483,7 +483,7 @@ class NetworkStatusDocumentV3(NetworkStatusDocument):
       validate,
       entry_class = router_type,
       entry_keyword = ROUTERS_START,
-      section_end_keywords = FOOTER_START,
+      section_end_keywords = (FOOTER_START,),
       extra_args = (self,),
     ))
     
diff --git a/stem/descriptor/router_status_entry.py b/stem/descriptor/router_status_entry.py
index 61f03e8..2d02330 100644
--- a/stem/descriptor/router_status_entry.py
+++ b/stem/descriptor/router_status_entry.py
@@ -50,20 +50,40 @@ def parse_file(document_file, validate, entry_class, entry_keyword = "r", start_
     * **IOError** if the file can't be read
   """
   
-  if start_position is None:
+  if start_position:
+    document_file.seek(start_position)
+  else:
     start_position = document_file.tell()
   
-  if end_position is None:
-    if section_end_keywords:
-      stem.descriptor._read_until_keywords(section_end_keywords, document_file, skip = True)
-      end_position = document_file.tell()
+  # check if we're starting at the end of the section (i.e. there are no entries to read)
+  if section_end_keywords:
+    first_keyword = None
+    line_match = stem.descriptor.KEYWORD_LINE.match(document_file.readline())
+    
+    if line_match:
+      first_keyword = line_match.groups()[0]
+    
+    document_file.seek(start_position)
+    
+    if first_keyword in section_end_keywords:
+      return
   
-  document_file.seek(start_position)
-  while not end_position or document_file.tell() < end_position:
-    desc_content = "".join(stem.descriptor._read_until_keywords(entry_keyword, document_file, ignore_first = True, end_position = end_position))
+  while end_position is None or document_file.tell() < end_position:
+    desc_lines, ending_keyword = stem.descriptor._read_until_keywords(
+      (entry_keyword,) + section_end_keywords,
+      document_file,
+      ignore_first = True,
+      end_position = end_position,
+      include_ending_keyword = True
+    )
+    
+    desc_content = "".join(desc_lines)
     
     if desc_content:
       yield entry_class(desc_content, validate, *extra_args)
+      
+      # check if we stopped at the end of the section
+      if ending_keyword in section_end_keywords: break
     else:
       break