[tor-commits] [stem/master] Always processing @type headers

Sun Jan 20 00:03:31 UTC 2013

commit 3d3e69417975499fbe5d2a3eb591d808c6800874
Author: Damian Johnson <atagar at torproject.org>
Date:   Sat Jan 19 15:53:06 2013 -0800

    Always processing @type headers
    
    The parse_file() function only consumed the @type annotation when we had
    neither a descriptor_type nor a recognized filename. Changing it so we always
    consume the @type header.
    
    I'm also changing the priority order of parse_file() so if the file could be
    either a metrics archive or a cached descriptor (for instance a file named
    'cached-consensus' that starts with an @type annotation) then we process it as
    a metrics archive.
---
 stem/descriptor/__init__.py            |   41 ++++++++++++++++++--------------
 test/integ/descriptor/networkstatus.py |   26 +++++++++++--------
 2 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 7280222..f1fb86e 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -48,12 +48,12 @@ def parse_file(descriptor_file, descriptor_type = None, path = None):
   If you don't provide a **descriptor_type** argument then this automatically
   tries to determine the descriptor type based on the following...
 
-  * The filename if it matches something from tor's data directory. For
-    instance, tor's 'cached-descriptors' contains server descriptors.
-
   * The @type annotation on the first line. These are generally only found in
     the `descriptor archives <https://metrics.torproject.org>`_.
 
+  * The filename if it matches something from tor's data directory. For
+    instance, tor's 'cached-descriptors' contains server descriptors.
+
   This is a handy function for simple usage, but if you're reading multiple
   descriptor files you might want to consider the
   :class:`~stem.descriptor.reader.DescriptorReader`.
@@ -101,7 +101,12 @@ def parse_file(descriptor_file, descriptor_type = None, path = None):
   # by an annotation on their first line...
   # https://trac.torproject.org/5651
 
-  # Cached descriptor handling. These contain multiple descriptors per file.
+  initial_position = descriptor_file.tell()
+  first_line = descriptor_file.readline().strip()
+  metrics_header_match = re.match("^@type (\S+) (\d+).(\d+)$", first_line)
+
+  if not metrics_header_match:
+    descriptor_file.seek(initial_position)
 
   filename = '<undefined>' if path is None else os.path.basename(path)
   file_parser = None
@@ -114,22 +119,22 @@ def parse_file(descriptor_file, descriptor_type = None, path = None):
       file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f)
     else:
       raise ValueError("The descriptor_type must be of the form '<type> <major_version>.<minor_version>'")
-  elif filename == "cached-descriptors":
-    file_parser = stem.descriptor.server_descriptor._parse_file
-  elif filename == "cached-extrainfo":
-    file_parser = stem.descriptor.extrainfo_descriptor._parse_file
-  elif filename == "cached-consensus":
-    file_parser = stem.descriptor.networkstatus._parse_file
-  elif filename == "cached-microdesc-consensus":
-    file_parser = lambda f: stem.descriptor.networkstatus._parse_file(f, is_microdescriptor = True)
-  else:
+  elif metrics_header_match:
     # Metrics descriptor handling
-    first_line, desc = descriptor_file.readline().strip(), None
-    metrics_header_match = re.match("^@type (\S+) (\d+).(\d+)$", first_line)
 
-    if metrics_header_match:
-      desc_type, major_version, minor_version = metrics_header_match.groups()
-      file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f)
+    desc_type, major_version, minor_version = metrics_header_match.groups()
+    file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f)
+  else:
+    # Cached descriptor handling. These contain multiple descriptors per file.
+
+    if filename == "cached-descriptors":
+      file_parser = stem.descriptor.server_descriptor._parse_file
+    elif filename == "cached-extrainfo":
+      file_parser = stem.descriptor.extrainfo_descriptor._parse_file
+    elif filename == "cached-consensus":
+      file_parser = stem.descriptor.networkstatus._parse_file
+    elif filename == "cached-microdesc-consensus":
+      file_parser = lambda f: stem.descriptor.networkstatus._parse_file(f, is_microdescriptor = True)
 
   if file_parser:
     for desc in file_parser(descriptor_file):
diff --git a/test/integ/descriptor/networkstatus.py b/test/integ/descriptor/networkstatus.py
index c654158..035289d 100644
--- a/test/integ/descriptor/networkstatus.py
+++ b/test/integ/descriptor/networkstatus.py
@@ -117,17 +117,21 @@ class TestNetworkStatus(unittest.TestCase):
 
     consensus_path = test.integ.descriptor.get_resource("metrics_consensus")
 
-    with open(consensus_path) as descriptor_file:
-      descriptors = stem.descriptor.parse_file(descriptor_file, path = consensus_path)
-
-      router = next(descriptors)
-      self.assertEquals("sumkledi", router.nickname)
-      self.assertEquals("0013D22389CD50D0B784A3E4061CB31E8CE8CEB5", router.fingerprint)
-      self.assertEquals("8mCr8Sl7RF4ENU4jb0FZFA/3do8", router.digest)
-      self.assertEquals(datetime.datetime(2012, 7, 12, 4, 1, 55), router.published)
-      self.assertEquals("178.218.213.229", router.address)
-      self.assertEquals(80, router.or_port)
-      self.assertEquals(None, router.dir_port)
+    for specify_type in (True, False):
+      with open(consensus_path) as descriptor_file:
+        if specify_type:
+          descriptors = stem.descriptor.parse_file(descriptor_file, "network-status-consensus-3 1.0", path = consensus_path)
+        else:
+          descriptors = stem.descriptor.parse_file(descriptor_file, path = consensus_path)
+
+        router = next(descriptors)
+        self.assertEquals("sumkledi", router.nickname)
+        self.assertEquals("0013D22389CD50D0B784A3E4061CB31E8CE8CEB5", router.fingerprint)
+        self.assertEquals("8mCr8Sl7RF4ENU4jb0FZFA/3do8", router.digest)
+        self.assertEquals(datetime.datetime(2012, 7, 12, 4, 1, 55), router.published)
+        self.assertEquals("178.218.213.229", router.address)
+        self.assertEquals(80, router.or_port)
+        self.assertEquals(None, router.dir_port)
 
   def test_metrics_bridge_consensus(self):
     """