[tor-commits] [stem/master] Cleaning up prior descriptor implementations
atagar at torproject.org
atagar at torproject.org
Mon Mar 26 00:10:01 UTC 2012
commit 7e9d454ed4eafe6b5461c5ff5170ca5634899edf
Author: Damian Johnson <atagar at torproject.org>
Date: Sat Mar 24 13:41:11 2012 -0700
Cleaning up prior descriptor implementations
Adding header pydocs and made a variety of small fixes and naming improvements.
---
stem/descriptor/__init__.py | 45 +++++++----------
stem/descriptor/reader.py | 4 +-
stem/descriptor/server_descriptor.py | 86 +++++++++++++++++++++++-----------
test/integ/descriptor/reader.py | 4 +-
4 files changed, 81 insertions(+), 58 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index ad17296..215864f 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -1,12 +1,18 @@
"""
Package for parsing and processing descriptor data.
+
+parse_file - Iterates over the descriptors in a file.
+Descriptor - Common parent for all descriptor file types.
+ |- get_path - location of the descriptor on disk if it came from a file
+ |- get_unrecognized_lines - unparsed descriptor content
+ +- __str__ - string that the descriptor was made from
"""
-__all__ = ["descriptor", "reader", "server_descriptor", "parse_descriptors", "Descriptor"]
+__all__ = ["descriptor", "reader", "server_descriptor", "parse_file", "Descriptor"]
import os
-def parse_descriptors(path, descriptor_file):
+def parse_file(path, descriptor_file):
"""
Provides an iterator for the descriptors within a given file.
@@ -15,7 +21,7 @@ def parse_descriptors(path, descriptor_file):
descriptor_file (file) - opened file with the descriptor contents
Returns:
- iterator that parses the file's contents into descriptors
+ iterator for Descriptor instances in the file
Raises:
TypeError if we can't match the contents of the file to a descriptor type
@@ -26,33 +32,21 @@ def parse_descriptors(path, descriptor_file):
# The tor descriptor specifications do not provide a reliable method for
# identifying a descriptor file's type and version so we need to guess
- # based on its filename for resources from the data directory and contents
- # for files provided by metrics.
+ # based on...
+ # - its filename for resources from the tor data directory
+ # - first line of our contents for files provided by metrics
filename = os.path.basename(path)
-
- if filename == "cached-descriptors":
- # server descriptors from tor's data directory
- while descriptor_file:
- yield stem.descriptor.server_descriptor.parse_server_descriptors_v2(path, descriptor_file)
-
- return
-
first_line = descriptor_file.readline()
descriptor_file.seek(0)
- if first_line.startswith("router "):
- # server descriptor
- for desc in stem.descriptor.server_descriptor.parse_server_descriptors_v2(path, descriptor_file):
+ if filename == "cached-descriptors" or first_line.startswith("router "):
+ for desc in stem.descriptor.server_descriptor.parse_file_v2(descriptor_file):
+ desc._set_path(path)
yield desc
-
- return
-
- # TODO: implement actual descriptor type recognition and parsing
- # TODO: add integ test for non-descriptor text content
- desc = Descriptor(descriptor_file.read())
- desc._set_path(path)
- yield desc
+ else:
+ # unrecognized descriptor type
+ raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
class Descriptor:
"""
@@ -83,7 +77,7 @@ class Descriptor:
list of lines of unrecognized content
"""
- return []
+ raise NotImplementedError
def _set_path(self, path):
self._path = path
@@ -91,4 +85,3 @@ class Descriptor:
def __str__(self):
return self._raw_contents
-
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index 4f043bc..511ba04 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -365,7 +365,7 @@ class DescriptorReader:
def _handle_descriptor_file(self, target):
try:
with open(target) as target_file:
- for desc in stem.descriptor.parse_descriptors(target, target_file):
+ for desc in stem.descriptor.parse_file(target, target_file):
self._enqueue_descriptor(desc)
self._iter_notice.set()
except TypeError, exc:
@@ -380,7 +380,7 @@ class DescriptorReader:
if tar_entry.isfile():
entry = tar_file.extractfile(tar_entry)
- for desc in stem.descriptor.parse_descriptors(target, entry):
+ for desc in stem.descriptor.parse_file(target, entry):
self._enqueue_descriptor(desc)
self._iter_notice.set()
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index fe6889b..70db1fe 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -6,15 +6,22 @@ etc). This information is provided from a few sources...
- control port via 'GETINFO desc/*' queries
- the 'cached-descriptors' file in tor's data directory
- tor metrics, at https://metrics.torproject.org/data.html
+
+parse_file_v2 - Iterates over the server descriptors in a file.
+ServerDescriptorV2 - Tor server descriptor, version 2.
+ |- get_unrecognized_lines - lines with unrecognized content
+ |- get_annotations - dictionary of content prior to the descriptor entry
+ |- get_annotation_lines - lines that provided the annotations
+ +- is_valid - checks the signature against the descriptor content
"""
import re
import datetime
+import stem.descriptor
import stem.version
import stem.util.connection
import stem.util.tor_tools
-import stem.descriptor
ENTRY_START = "router"
ENTRY_END = "router-signature"
@@ -48,9 +55,21 @@ SINGLE_FIELDS = (
"family",
)
-def parse_server_descriptors_v2(path, descriptor_file):
+def parse_file_v2(descriptor_file, validate = True):
"""
- Iterates over the verion 2 server descriptors in a descriptor file.
+ Iterates over the version 2 server descriptors in a file.
+
+ Arguments:
+ descriptor_file (file) - file with descriptor content
+ validate (bool) - checks the validity of the descriptor's content if
+ True, skips these checks otherwise
+
+ Returns:
+ iterator for ServerDescriptorV2 instances in the file
+
+ Raises:
+ ValueError if the contents are malformed and validate is True
+ IOError if the file can't be read
"""
# Cached descriptors consist of annotations followed by the descriptor
@@ -73,6 +92,9 @@ def parse_server_descriptors_v2(path, descriptor_file):
# - parse as descriptor content until we get to ENTRY_END followed by the
# end of the signature block
# - construct a descriptor and provide it back to the caller
+ #
+ # Any annotations after the last server descriptor are ignored (never
+ # provided to the caller).
while True:
annotations = _read_until_keyword(ENTRY_START, descriptor_file)
@@ -82,16 +104,14 @@ def parse_server_descriptors_v2(path, descriptor_file):
block_end_prefix = PGP_BLOCK_END.split(' ', 1)[0]
descriptor_content += _read_until_keyword(block_end_prefix, descriptor_file, True)
- # If the file has ending annotations (ie, non-descriptor text after the
- # last descriptor) then we won't have any descriptor content at this point.
- # This is fine. Those ending annotations are simply never returned to the
- # caller.
-
if descriptor_content:
- descriptor = ServerDescriptorV2("\n".join(descriptor_content), annotations = annotations)
- descriptor._set_path(path)
+ # strip newlines from annotations
+ annotations = map(str.strip, annotations)
+
+ descriptor_text = "".join(descriptor_content)
+ descriptor = ServerDescriptorV2(descriptor_text, validate, annotations)
yield descriptor
- else: return # done parsing descriptors
+ else: break # done parsing descriptors
def _read_until_keyword(keyword, descriptor_file, inclusive = False):
"""
@@ -112,12 +132,10 @@ def _read_until_keyword(keyword, descriptor_file, inclusive = False):
while True:
last_position = descriptor_file.tell()
line = descriptor_file.readline()
-
if not line: break # EOF
- line = line.strip()
if " " in line: line_keyword = line.split(" ", 1)[0]
- else: line_keyword = line
+ else: line_keyword = line.strip()
if line_keyword == keyword:
if inclusive: content.append(line)
@@ -157,13 +175,13 @@ def _get_psudo_pgp_block(remaining_contents):
while True:
if not remaining_contents:
- raise ValueError("Unterminated public key block")
+ raise ValueError("Unterminated pgp style block")
line = remaining_contents.pop(0)
block_lines.append(line)
if line == PGP_BLOCK_END % block_type:
- return block_type, "\n".join(block_lines)
+ return (block_type, "\n".join(block_lines))
else:
return (None, None)
@@ -197,7 +215,7 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
contact (str) - relay's contact information
family (list) - nicknames or fingerprints of relays it has a declared family with (*)
- * required fields, others are left as None if undefined
+ (*) required fields, others are left as None if undefined
"""
nickname = address = or_port = socks_port = dir_port = None
@@ -206,7 +224,7 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
onion_key = onion_key_type = signing_key = signing_key_type = None
router_sig = router_sig_type = contact = None
hibernating = False
- family = unrecognized_entries = []
+ family = unrecognized_lines = []
# TODO: Until we have a proper ExitPolicy class this is just a list of the
# exit policy strings...
@@ -258,6 +276,9 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
while remaining_contents:
line = remaining_contents.pop(0)
+ # last line can be empty
+ if not line and not remaining_contents: continue
+
# Some lines have an 'opt ' for backward compatability. They should be
# ignored. This prefix is being removed in...
# https://trac.torproject.org/projects/tor/ticket/5124
@@ -271,7 +292,12 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
raise ValueError("Line contains invalid characters: %s" % line)
keyword, value = line_match.groups()
- block_type, block_contents = _get_psudo_pgp_block(remaining_contents)
+
+ try:
+ block_type, block_contents = _get_psudo_pgp_block(remaining_contents)
+ except ValueError, exc:
+ if not validate: continue
+ raise exc
if keyword in ("accept", "reject"):
self.exit_policy.append("%s %s" % (keyword, value))
@@ -291,7 +317,6 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
raise ValueError("The '%s' entry can only appear once in a descriptor" % keyword)
# parse all the entries into our attributes
-
for keyword, values in entries.items():
# most just work with the first (and only) value
value, block_type, block_contents = values[0]
@@ -332,13 +357,15 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
if not validate: continue
raise ValueError("Bandwidth line must have three values: %s" % line)
- if validate:
- if not bandwidth_comp[0].isdigit():
- raise ValueError("Bandwidth line's average rate isn't numeric: %s" % bandwidth_comp[0])
- elif not bandwidth_comp[1].isdigit():
- raise ValueError("Bandwidth line's burst rate isn't numeric: %s" % bandwidth_comp[1])
- elif not bandwidth_comp[2].isdigit():
- raise ValueError("Bandwidth line's observed rate isn't numeric: %s" % bandwidth_comp[2])
+ if not bandwidth_comp[0].isdigit():
+ if not validate: continue
+ raise ValueError("Bandwidth line's average rate isn't numeric: %s" % bandwidth_comp[0])
+ elif not bandwidth_comp[1].isdigit():
+ if not validate: continue
+ raise ValueError("Bandwidth line's burst rate isn't numeric: %s" % bandwidth_comp[1])
+ elif not bandwidth_comp[2].isdigit():
+ if not validate: continue
+ raise ValueError("Bandwidth line's observed rate isn't numeric: %s" % bandwidth_comp[2])
self.average_bandwidth = int(bandwidth_comp[0])
self.burst_bandwidth = int(bandwidth_comp[1])
@@ -419,7 +446,10 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
elif keyword == "family":
self.family = value.split(" ")
else:
- self.unrecognized_entries.append(line)
+ self.unrecognized_lines.append(line)
+
+ def get_unrecognized_lines(self):
+ return list(unrecognized_lines)
def get_annotations(self):
"""
diff --git a/test/integ/descriptor/reader.py b/test/integ/descriptor/reader.py
index 7f2a425..cdc33c2 100644
--- a/test/integ/descriptor/reader.py
+++ b/test/integ/descriptor/reader.py
@@ -51,7 +51,7 @@ def _get_raw_tar_descriptors():
for tar_entry in tar_file:
if tar_entry.isfile():
entry = tar_file.extractfile(tar_entry)
- raw_descriptors.append(entry.read().strip())
+ raw_descriptors.append(entry.read())
entry.close()
TAR_DESCRIPTORS = raw_descriptors
@@ -149,7 +149,7 @@ class TestDescriptorReader(unittest.TestCase):
descriptor_path = os.path.join(DESCRIPTOR_TEST_DATA, "example_descriptor")
with open(descriptor_path) as descriptor_file:
- descriptor_entries.append(descriptor_file.read().strip())
+ descriptor_entries.append(descriptor_file.read())
# running this test multiple times to flush out concurrency issues
for i in xrange(15):
More information about the tor-commits
mailing list