[tor-commits] [stem/master] Fixes to document parsing
atagar at torproject.org
Sat Oct 13 18:35:44 UTC 2012
commit 6c3717b65acc9d208ef3bf90b5b54f3983e507df
Author: Ravi Chandra Padmala <neenaoffline at gmail.com>
Date: Wed Aug 15 20:47:29 2012 +0530
Fixes to document parsing
One major change is that stem.descriptor.networkstatus.parse_file now
returns a NetworkStatusDocument object instead of providing an iterator
over the router descriptors in the document.
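For illustration, a minimal sketch of the new call pattern, assuming a local
cached-consensus file (the path is the example path from the module docstring;
the router_descriptors and nickname attributes mirror the updated tests):

  import stem.descriptor.networkstatus

  nsdoc_file = open("/home/neena/.tor/cached-consensus")

  try:
    # parse_file() now hands back the NetworkStatusDocument itself...
    consensus = stem.descriptor.networkstatus.parse_file(nsdoc_file)

    # ... and the relay entries are reached through router_descriptors
    for router in consensus.router_descriptors:
      print router.nickname
  finally:
    nsdoc_file.close()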
---
stem/descriptor/__init__.py | 119 +++++++++++++++-----------------
stem/descriptor/networkstatus.py | 18 +++---
test/integ/descriptor/networkstatus.py | 27 +------
3 files changed, 68 insertions(+), 96 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index d9ac21b..6563e4b 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -65,9 +65,17 @@ def parse_file(path, descriptor_file):
elif filename == "cached-extrainfo":
file_parser = stem.descriptor.extrainfo_descriptor.parse_file
elif filename == "cached-consensus":
- file_parser = stem.descriptor.networkstatus.parse_file
+ file_parser = lambda f: [stem.descriptor.networkstatus.parse_file(f)]
elif filename == "cached-microdesc-consensus":
- file_parser = lambda f: stem.descriptor.networkstatus.parse_file(f, True, "microdesc")
+ file_parser = lambda f: [stem.descriptor.networkstatus.parse_file(f, True, "microdesc")]
+ else:
+ # Metrics descriptor handling
+ first_line, desc = descriptor_file.readline().strip(), None
+ metrics_header_match = re.match("^@type (\S+) (\d+).(\d+)$", first_line)
+
+ if metrics_header_match:
+ desc_type, major_version, minor_version = metrics_header_match.groups()
+ file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f)
if file_parser:
for desc in file_parser(descriptor_file):
@@ -76,47 +84,33 @@ def parse_file(path, descriptor_file):
return
- # Metrics descriptor handling. These contain a single descriptor per file.
-
- first_line, desc = descriptor_file.readline().strip(), None
- metrics_header_match = re.match("^@type (\S+) (\d+).(\d+)$", first_line)
-
- if metrics_header_match:
- # still doesn't necessarily mean that this is a descriptor, check if the
- # header contents are recognized
-
- desc_type, major_version, minor_version = metrics_header_match.groups()
- major_version, minor_version = int(major_version), int(minor_version)
-
- if desc_type == "server-descriptor" and major_version == 1:
- desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_file.read())
- elif desc_type == "bridge-server-descriptor" and major_version == 1:
- desc = stem.descriptor.server_descriptor.BridgeDescriptor(descriptor_file.read())
- elif desc_type == "extra-info" and major_version == 1:
- desc = stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor(descriptor_file.read())
- elif desc_type == "bridge-extra-info" and major_version == 1:
- # version 1.1 introduced a 'transport' field...
- # https://trac.torproject.org/6257
-
- desc = stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor(descriptor_file.read())
- elif desc_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1:
- desc = stem.descriptor.networkstatus.NetworkStatusDocument(descriptor_file.read())
- for desc in desc.router_descriptors:
- desc._set_path(path)
- yield desc
- return
- elif desc_type == "network-status-microdesc-consensus-3" and major_version == 1:
- desc = stem.descriptor.networkstatus.MicrodescriptorConsensus(descriptor_file.read())
-
- if desc:
- desc._set_path(path)
- yield desc
- return
-
# Not recognized as a descriptor file.
raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
+def _parse_metrics_file(descriptor_type, major_version, minor_version, descriptor_file):
+ # Parses descriptor files from metrics, yielding individual descriptors. This
+ # throws a TypeError if the descriptor_type or version isn't recognized.
+ import stem.descriptor.server_descriptor
+ import stem.descriptor.extrainfo_descriptor
+ import stem.descriptor.networkstatus
+
+ if descriptor_type == "server-descriptor" and major_version == 1:
+ yield stem.descriptor.server_descriptor.RelayDescriptor(descriptor_file.read())
+ elif descriptor_type == "bridge-server-descriptor" and major_version == 1:
+ yield stem.descriptor.server_descriptor.BridgeDescriptor(descriptor_file.read())
+ elif descriptor_type == "extra-info" and major_version == 1:
+ yield stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor(descriptor_file.read())
+ elif descriptor_type == "bridge-extra-info" and major_version == 1:
+ # version 1.1 introduced a 'transport' field...
+ # https://trac.torproject.org/6257
+
+ yield stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor(descriptor_file.read())
+ elif descriptor_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1:
+ yield stem.descriptor.networkstatus.parse_file(descriptor_file)
+ elif descriptor_type == "network-status-microdesc-consensus-3" and major_version == 1:
+ yield stem.descriptor.networkstatus.parse_file(descriptor_file, flavour = "microdesc")
+
class Descriptor(object):
"""
Common parent for all types of descriptors.
@@ -177,19 +171,13 @@ def _peek_keyword(descriptor_file):
:returns: keyword at the current offset of descriptor_file
"""
- last_position = descriptor_file.tell()
- line = descriptor_file.readline()
- if not line: return None
-
- if " " in line:
- keyword = line.split(" ", 1)[0]
- if keyword == "opt":
- keyword = line.split(" ", 2)[1]
- else: keyword = line.strip()
+ line = _peek_line(descriptor_file)
- descriptor_file.seek(last_position)
+ if line.startswith("opt "):
+ line = line[4:]
+ if not line: return None
- return keyword
+ return line.split(" ", 1)[0].rstrip("\n")
def _read_keyword_line(keyword, descriptor_file, validate = True, optional = False):
"""
@@ -200,8 +188,9 @@ def _read_keyword_line(keyword, descriptor_file, validate = True, optional = Fal
Respects the opt keyword and returns the next keyword if the first is "opt".
:param str keyword: keyword the line must begin with
- :param bool optional: if the current line must begin with the given keyword
+ :param bool descriptor_file: file/file-like object containing descriptor data
:param bool validate: validation is enabled
+ :param bool optional: if the current line must begin with the given keyword
:returns: the text after the keyword if the keyword matches the one provided, otherwise returns None or raises an exception
@@ -214,13 +203,14 @@ def _read_keyword_line(keyword, descriptor_file, validate = True, optional = Fal
raise ValueError("Unexpected end of document")
return None
- if line_matches_keyword(keyword, line):
- line = descriptor_file.readline()
-
- if line == "opt " + keyword or line == keyword: return ""
- elif line.startswith("opt "): return line.split(" ", 2)[2].rstrip("\n")
- else: return line.split(" ", 1)[1].rstrip("\n")
- elif line.startswith("opt"):
+ opt_line = False
+ if line.startswith("opt "):
+ line = line[4:]
+ opt_line = True
+ if re.match("^" + re.escape(keyword) + "($| )", line):
+ descriptor_file.readline()
+ return line[len(keyword):].strip()
+ elif opt_line and not optional:
# if this is something new we don't recognize
# ignore it and go to the next line
descriptor_file.readline()
@@ -239,8 +229,8 @@ def _read_keyword_line_str(keyword, lines, validate = True, optional = False):
:param str keyword: keyword the line must begin with
:param list lines: list of strings to be read from
- :param bool optional: if the current line must begin with the given keyword
:param bool validate: validation is enabled
+ :param bool optional: if the current line must begin with the given keyword
:returns: the text after the keyword if the keyword matches the one provided, otherwise returns None or raises an exception
@@ -252,16 +242,17 @@ def _read_keyword_line_str(keyword, lines, validate = True, optional = False):
raise ValueError("Unexpected end of document")
return
+ opt_line = False
+ if lines[0].startswith("opt "):
+ line = line[4:]
+ opt_line = True
if line_matches_keyword(keyword, lines[0]):
line = lines.pop(0)
- if line == "opt " + keyword or line == keyword: return ""
- elif line.startswith("opt "): return line.split(" ", 2)[2]
- else: return line.split(" ", 1)[1]
- elif line.startswith("opt "):
+ return line[len(keyword):].strip()
+ elif opt_line and not optional:
# if this is something new we don't recognize yet
# ignore it and go to the next line
- lines.pop(0)
return _read_keyword_line_str(keyword, lines, optional)
elif not optional and validate:
raise ValueError("Error parsing network status document: Expected %s, received: %s" % (keyword, lines[0]))
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py
index 7effc7e..f9d89a8 100644
--- a/stem/descriptor/networkstatus.py
+++ b/stem/descriptor/networkstatus.py
@@ -21,7 +21,7 @@ The documents can be obtained from any of the following sources...
nsdoc_file = open("/home/neena/.tor/cached-consensus")
try:
- consensus = stem.descriptor.networkstatus.NetworkStatusDocument(nsdoc_file.read())
+ consensus = stem.descriptor.networkstatus.parse_file(nsdoc_file)
except ValueError:
print "Invalid cached-consensus file"
@@ -33,7 +33,9 @@ The documents can be obtained from any of the following sources...
parse_file - parses a network status file and provides a NetworkStatusDocument
NetworkStatusDocument - Tor v3 network status document
+ +- MicrodescriptorConsensus - Microdescriptor flavoured consensus documents
RouterDescriptor - Router descriptor; contains information about a Tor relay
+ +- RouterMicrodescriptor - Router microdescriptor; contains information that doesn't change frequently
DirectorySignature - Network status document's directory signature
DirectoryAuthority - Directory authority defined in a v3 network status document
"""
@@ -63,7 +65,7 @@ Flavour = stem.util.enum.Enum(
("NONE", ""),
("NS", "ns"),
("MICRODESCRIPTOR", "microdesc"),
- )
+)
Flag = stem.util.enum.Enum(
("AUTHORITY", "Authority"),
@@ -78,18 +80,16 @@ Flag = stem.util.enum.Enum(
("UNNAMED", "Unnamed"),
("V2DIR", "V2Dir"),
("VALID", "Valid"),
- )
-
-Flag = stem.util.enum.Enum(*[(flag.upper(), flag) for flag in ["Authority", "BadExit", "Exit", "Fast", "Guard", "HSDir", "Named", "Running", "Stable", "Unnamed", "V2Dir", "Valid"]])
+)
def parse_file(document_file, validate = True, flavour = Flavour.NONE):
"""
- Iterates over the router descriptors in a network status document.
+ Parses a network status document and provides a NetworkStatusDocument object.
:param file document_file: file with network status document content
:param bool validate: checks the validity of the document's contents if True, skips these checks otherwise
- :returns: iterator for :class:`stem.descriptor.networkstatus.RouterDescriptor` instances in the file
+ :returns: :class:`stem.descriptor.networkstatus.NetworkStatusDocument` object
:raises:
* ValueError if the contents is malformed and validate is True
@@ -109,12 +109,12 @@ def parse_file(document_file, validate = True, flavour = Flavour.NONE):
document = NetworkStatusDocument(document_data, validate)
document_file.seek(r_offset)
document.router_descriptors = _ns_router_desc_generator(document_file, document.vote_status == "vote", validate)
- yield document
+ return document
elif flavour == Flavour.MICRODESCRIPTOR:
document = MicrodescriptorConsensus(document_data, validate)
document_file.seek(r_offset)
document.router_descriptors = _router_microdesc_generator(document_file, validate, document.known_flags)
- yield document
+ return document
def _ns_router_desc_generator(document_file, vote, validate):
while _peek_keyword(document_file) == "r":
diff --git a/test/integ/descriptor/networkstatus.py b/test/integ/descriptor/networkstatus.py
index 484e67d..bd326ad 100644
--- a/test/integ/descriptor/networkstatus.py
+++ b/test/integ/descriptor/networkstatus.py
@@ -39,7 +39,7 @@ class TestNetworkStatusDocument(unittest.TestCase):
count = 0
with open(descriptor_path) as descriptor_file:
- for desc in stem.descriptor.networkstatus.parse_file(descriptor_file):
+ for desc in stem.descriptor.networkstatus.parse_file(descriptor_file).router_descriptors:
if resource.getrusage(resource.RUSAGE_SELF).ru_maxrss > 200000:
# if we're using > 200 MB we should fail
self.fail()
@@ -58,7 +58,7 @@ class TestNetworkStatusDocument(unittest.TestCase):
with file(descriptor_path) as descriptor_file:
desc = stem.descriptor.parse_file(descriptor_path, descriptor_file)
- router = next(desc)
+ router = next(next(desc).router_descriptors)
self.assertEquals("sumkledi", router.nickname)
self.assertEquals("ABPSI4nNUNC3hKPkBhyzHozozrU", router.identity)
self.assertEquals("8mCr8Sl7RF4ENU4jb0FZFA/3do8", router.digest)
@@ -150,7 +150,7 @@ I/TJmV928na7RLZe2mGHCAW3VQOvV+QkCfj05VZ8CsY=
with file(descriptor_path) as descriptor_file:
desc = stem.descriptor.parse_file(descriptor_path, descriptor_file)
- router = next(desc)
+ router = next(next(desc).router_descriptors)
self.assertEquals("sumkledi", router.nickname)
self.assertEquals("ABPSI4nNUNC3hKPkBhyzHozozrU", router.identity)
self.assertEquals("B5n4BiALAF8B5AqafxohyYiuj7E", router.digest)
@@ -273,28 +273,9 @@ class TestMicrodescriptorConsensus(unittest.TestCase):
count = 0
with open(descriptor_path) as descriptor_file:
- for desc in next(stem.descriptor.networkstatus.parse_file(descriptor_file, True, flavour = Flavour.MICRODESCRIPTOR)).router_descriptors:
+ for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, True, flavour = Flavour.MICRODESCRIPTOR).router_descriptors:
assert desc.nickname # check that the router has a nickname
count += 1
assert count > 100 # sanity check - assuming atleast 100 relays in the consensus
-
- def test_metrics_microdesc_consensus(self):
- """
- Checks if consensus documents from Metrics are parsed properly.
- """
-
- descriptor_path = test.integ.descriptor.get_resource("metrics_microdesc_consensus")
-
- with file(descriptor_path) as descriptor_file:
- desc = stem.descriptor.parse_file(descriptor_path, descriptor_file)
-
- router = next(next(desc).router_descriptors)
- self.assertEquals("JapanAnon", router.nickname)
- self.assertEquals("AGw/p8P246zRPQ3ZsQx9+pM8I3s", router.identity)
- self.assertEquals("9LDw0XiFeLQDXK9t8ht4+MK9tWx6Jxp1RwP36eatRWs", router.digest)
- self.assertEquals(_strptime("2012-07-18 15:55:42"), router.publication)
- self.assertEquals("220.0.231.71", router.ip)
- self.assertEquals(443, router.orport)
- self.assertEquals(9030, router.dirport)
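For illustration, a minimal sketch of reading a metrics-style consensus through
the generic stem.descriptor.parse_file entry point, assuming a downloaded file
whose first line is an "@type network-status-consensus-3" annotation (the path
below is hypothetical):

  import stem.descriptor

  path = "/tmp/metrics_consensus"  # hypothetical download location

  with open(path) as descriptor_file:
    # parse_file() still yields descriptors; for a consensus the single
    # yielded item is now the NetworkStatusDocument rather than each router
    document = next(stem.descriptor.parse_file(path, descriptor_file))

    for router in document.router_descriptors:
      print router.nickname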