[tor-commits] [stem/master] Fixes to document parsing
atagar at torproject.org
Sat Oct 13 18:35:44 UTC 2012
commit 6c3717b65acc9d208ef3bf90b5b54f3983e507df
Author: Ravi Chandra Padmala <neenaoffline at gmail.com>
Date: Wed Aug 15 20:47:29 2012 +0530
Fixes to document parsing
One major change is that stem.descriptor.networkstatus.parse_file now
returns a NetworkStatusDocument object instead of providing an iterator
over the router descriptors in the document.
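For illustration, a minimal sketch of the new call pattern, assuming a local
cached-consensus file (the path is the example path from the module docstring;
the router_descriptors and nickname attributes mirror the updated tests):

  import stem.descriptor.networkstatus

  nsdoc_file = open("/home/neena/.tor/cached-consensus")

  try:
    # parse_file() now hands back the NetworkStatusDocument itself...
    consensus = stem.descriptor.networkstatus.parse_file(nsdoc_file)

    # ... and the relay entries are reached through router_descriptors
    for router in consensus.router_descriptors:
      print router.nickname
  finally:
    nsdoc_file.close()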
---
stem/descriptor/__init__.py | 119 +++++++++++++++-----------------
stem/descriptor/networkstatus.py | 18 +++---
test/integ/descriptor/networkstatus.py | 27 +------
3 files changed, 68 insertions(+), 96 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index d9ac21b..6563e4b 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -65,9 +65,17 @@ def parse_file(path, descriptor_file):
elif filename == "cached-extrainfo":
file_parser = stem.descriptor.extrainfo_descriptor.parse_file
elif filename == "cached-consensus":
- file_parser = stem.descriptor.networkstatus.parse_file
+ file_parser = lambda f: [stem.descriptor.networkstatus.parse_file(f)]
elif filename == "cached-microdesc-consensus":
- file_parser = lambda f: stem.descriptor.networkstatus.parse_file(f, True, "microdesc")
+ file_parser = lambda f: [stem.descriptor.networkstatus.parse_file(f, True, "microdesc")]
+ else:
+ # Metrics descriptor handling
+ first_line, desc = descriptor_file.readline().strip(), None
+ metrics_header_match = re.match("^@type (\S+) (\d+).(\d+)$", first_line)
+
+ if metrics_header_match:
+ desc_type, major_version, minor_version = metrics_header_match.groups()
+ file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f)
if file_parser:
for desc in file_parser(descriptor_file):
@@ -76,47 +84,33 @@ def parse_file(path, descriptor_file):
return
- # Metrics descriptor handling. These contain a single descriptor per file.
-
- first_line, desc = descriptor_file.readline().strip(), None
- metrics_header_match = re.match("^@type (\S+) (\d+).(\d+)$", first_line)
-
- if metrics_header_match:
- # still doesn't necessarily mean that this is a descriptor, check if the
- # header contents are recognized
-
- desc_type, major_version, minor_version = metrics_header_match.groups()
- major_version, minor_version = int(major_version), int(minor_version)
-
- if desc_type == "server-descriptor" and major_version == 1:
- desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_file.read())
- elif desc_type == "bridge-server-descriptor" and major_version == 1:
- desc = stem.descriptor.server_descriptor.BridgeDescriptor(descriptor_file.read())
- elif desc_type == "extra-info" and major_version == 1:
- desc = stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor(descriptor_file.read())
- elif desc_type == "bridge-extra-info" and major_version == 1:
- # version 1.1 introduced a 'transport' field...
- # https://trac.torproject.org/6257
-
- desc = stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor(descriptor_file.read())
- elif desc_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1:
- desc = stem.descriptor.networkstatus.NetworkStatusDocument(descriptor_file.read())
- for desc in desc.router_descriptors:
- desc._set_path(path)
- yield desc
- return
- elif desc_type == "network-status-microdesc-consensus-3" and major_version == 1:
- desc = stem.descriptor.networkstatus.MicrodescriptorConsensus(descriptor_file.read())
-
- if desc:
- desc._set_path(path)
- yield desc
- return
-
# Not recognized as a descriptor file.
raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
+def _parse_metrics_file(descriptor_type, major_version, minor_version, descriptor_file):
+ # Parses descriptor files from metrics, yielding individual descriptors. This
+ # throws a TypeError if the descriptor_type or version isn't recognized.
+ import stem.descriptor.server_descriptor
+ import stem.descriptor.extrainfo_descriptor
+ import stem.descriptor.networkstatus
+
+ if descriptor_type == "server-descriptor" and major_version == 1:
+ yield stem.descriptor.server_descriptor.RelayDescriptor(descriptor_file.read())
+ elif descriptor_type == "bridge-server-descriptor" and major_version == 1:
+ yield stem.descriptor.server_descriptor.BridgeDescriptor(descriptor_file.read())
+ elif descriptor_type == "extra-info" and major_version == 1:
+ yield stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor(descriptor_file.read())
+ elif descriptor_type == "bridge-extra-info" and major_version == 1:
+ # version 1.1 introduced a 'transport' field...
+ # https://trac.torproject.org/6257
+
+ yield stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor(descriptor_file.read())
+ elif descriptor_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1:
+ yield stem.descriptor.networkstatus.parse_file(descriptor_file)
+ elif descriptor_type == "network-status-microdesc-consensus-3" and major_version == 1:
+ yield stem.descriptor.networkstatus.parse_file(descriptor_file, flavour = "microdesc")
+
class Descriptor(object):
"""
Common parent for all types of descriptors.
@@ -177,19 +171,13 @@ def _peek_keyword(descriptor_file):
:returns: keyword at the current offset of descriptor_file
"""
- last_position = descriptor_file.tell()
- line = descriptor_file.readline()
- if not line: return None
-
- if " " in line:
- keyword = line.split(" ", 1)[0]
- if keyword == "opt":
- keyword = line.split(" ", 2)[1]
- else: keyword = line.strip()
+ line = _peek_line(descriptor_file)
- descriptor_file.seek(last_position)
+ if line.startswith("opt "):
+ line = line[4:]
+ if not line: return None
- return keyword
+ return line.split(" ", 1)[0].rstrip("\n")
def _read_keyword_line(keyword, descriptor_file, validate = True, optional = False):
"""
@@ -200,8 +188,9 @@ def _read_keyword_line(keyword, descriptor_file, validate = True, optional = Fal
Respects the opt keyword and returns the next keyword if the first is "opt".
:param str keyword: keyword the line must begin with
- :param bool optional: if the current line must begin with the given keyword
+ :param bool descriptor_file: file/file-like object containing descriptor data
:param bool validate: validation is enabled
+ :param bool optional: if the current line must begin with the given keyword
:returns: the text after the keyword if the keyword matches the one provided, otherwise returns None or raises an exception
@@ -214,13 +203,14 @@ def _read_keyword_line(keyword, descriptor_file, validate = True, optional = Fal
raise ValueError("Unexpected end of document")
return None
- if line_matches_keyword(keyword, line):
- line = descriptor_file.readline()
-
- if line == "opt " + keyword or line == keyword: return ""
- elif line.startswith("opt "): return line.split(" ", 2)[2].rstrip("\n")
- else: return line.split(" ", 1)[1].rstrip("\n")
- elif line.startswith("opt"):
+ opt_line = False
+ if line.startswith("opt "):
+ line = line[4:]
+ opt_line = True
+ if re.match("^" + re.escape(keyword) + "($| )", line):
+ descriptor_file.readline()
+ return line[len(keyword):].strip()
+ elif opt_line and not optional:
# if this is something new we don't recognize
# ignore it and go to the next line
descriptor_file.readline()
@@ -239,8 +229,8 @@ def _read_keyword_line_str(keyword, lines, validate = True, optional = False):
:param str keyword: keyword the line must begin with
:param list lines: list of strings to be read from
- :param bool optional: if the current line must begin with the given keyword
:param bool validate: validation is enabled
+ :param bool optional: if the current line must begin with the given keyword
:returns: the text after the keyword if the keyword matches the one provided, otherwise returns None or raises an exception
@@ -252,16 +242,17 @@ def _read_keyword_line_str(keyword, lines, validate = True, optional = False):
raise ValueError("Unexpected end of document")
return
+ opt_line = False
+ if lines[0].startswith("opt "):
+ line = line[4:]
+ opt_line = True
if line_matches_keyword(keyword, lines[0]):
line = lines.pop(0)
- if line == "opt " + keyword or line == keyword: return ""
- elif line.startswith("opt "): return line.split(" ", 2)[2]
- else: return line.split(" ", 1)[1]
- elif line.startswith("opt "):
+ return line[len(keyword):].strip()
+ elif opt_line and not optional:
# if this is something new we don't recognize yet
# ignore it and go to the next line
- lines.pop(0)
return _read_keyword_line_str(keyword, lines, optional)
elif not optional and validate:
raise ValueError("Error parsing network status document: Expected %s, received: %s" % (keyword, lines[0]))
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py
index 7effc7e..f9d89a8 100644
--- a/stem/descriptor/networkstatus.py
+++ b/stem/descriptor/networkstatus.py
@@ -21,7 +21,7 @@ The documents can be obtained from any of the following sources...
nsdoc_file = open("/home/neena/.tor/cached-consensus")
try:
- consensus = stem.descriptor.networkstatus.NetworkStatusDocument(nsdoc_file.read())
+ consensus = stem.descriptor.networkstatus.parse_file(nsdoc_file)
except ValueError:
print "Invalid cached-consensus file"
@@ -33,7 +33,9 @@ The documents can be obtained from any of the following sources...
parse_file - parses a network status file and provides a NetworkStatusDocument
NetworkStatusDocument - Tor v3 network status document
+ +- MicrodescriptorConsensus - Microdescriptor flavoured consensus documents
RouterDescriptor - Router descriptor; contains information about a Tor relay
+ +- RouterMicrodescriptor - Router microdescriptor; contains information that doesn't change frequently
DirectorySignature - Network status document's directory signature
DirectoryAuthority - Directory authority defined in a v3 network status document
"""
@@ -63,7 +65,7 @@ Flavour = stem.util.enum.Enum(
("NONE", ""),
("NS", "ns"),
("MICRODESCRIPTOR", "microdesc"),
- )
+)
Flag = stem.util.enum.Enum(
("AUTHORITY", "Authority"),
@@ -78,18 +80,16 @@ Flag = stem.util.enum.Enum(
("UNNAMED", "Unnamed"),
("V2DIR", "V2Dir"),
("VALID", "Valid"),
- )
-
-Flag = stem.util.enum.Enum(*[(flag.upper(), flag) for flag in ["Authority", "BadExit", "Exit", "Fast", "Guard", "HSDir", "Named", "Running", "Stable", "Unnamed", "V2Dir", "Valid"]])
+)
def parse_file(document_file, validate = True, flavour = Flavour.NONE):
"""
- Iterates over the router descriptors in a network status document.
+ Parses a network status document and provides a NetworkStatusDocument object.
:param file document_file: file with network status document content
:param bool validate: checks the validity of the document's contents if True, skips these checks otherwise
- :returns: iterator for :class:`stem.descriptor.networkstatus.RouterDescriptor` instances in the file
+ :returns: :class:`stem.descriptor.networkstatus.NetworkStatusDocument` object
:raises:
* ValueError if the contents is malformed and validate is True
@@ -109,12 +109,12 @@ def parse_file(document_file, validate = True, flavour = Flavour.NONE):
document = NetworkStatusDocument(document_data, validate)
document_file.seek(r_offset)
document.router_descriptors = _ns_router_desc_generator(document_file, document.vote_status == "vote", validate)
- yield document
+ return document
elif flavour == Flavour.MICRODESCRIPTOR:
document = MicrodescriptorConsensus(document_data, validate)
document_file.seek(r_offset)
document.router_descriptors = _router_microdesc_generator(document_file, validate, document.known_flags)
- yield document
+ return document
def _ns_router_desc_generator(document_file, vote, validate):
while _peek_keyword(document_file) == "r":
diff --git a/test/integ/descriptor/networkstatus.py b/test/integ/descriptor/networkstatus.py
index 484e67d..bd326ad 100644
--- a/test/integ/descriptor/networkstatus.py
+++ b/test/integ/descriptor/networkstatus.py
@@ -39,7 +39,7 @@ class TestNetworkStatusDocument(unittest.TestCase):
count = 0
with open(descriptor_path) as descriptor_file:
- for desc in stem.descriptor.networkstatus.parse_file(descriptor_file):
+ for desc in stem.descriptor.networkstatus.parse_file(descriptor_file).router_descriptors:
if resource.getrusage(resource.RUSAGE_SELF).ru_maxrss > 200000:
# if we're using > 200 MB we should fail
self.fail()
@@ -58,7 +58,7 @@ class TestNetworkStatusDocument(unittest.TestCase):
with file(descriptor_path) as descriptor_file:
desc = stem.descriptor.parse_file(descriptor_path, descriptor_file)
- router = next(desc)
+ router = next(next(desc).router_descriptors)
self.assertEquals("sumkledi", router.nickname)
self.assertEquals("ABPSI4nNUNC3hKPkBhyzHozozrU", router.identity)
self.assertEquals("8mCr8Sl7RF4ENU4jb0FZFA/3do8", router.digest)
@@ -150,7 +150,7 @@ I/TJmV928na7RLZe2mGHCAW3VQOvV+QkCfj05VZ8CsY=
with file(descriptor_path) as descriptor_file:
desc = stem.descriptor.parse_file(descriptor_path, descriptor_file)
- router = next(desc)
+ router = next(next(desc).router_descriptors)
self.assertEquals("sumkledi", router.nickname)
self.assertEquals("ABPSI4nNUNC3hKPkBhyzHozozrU", router.identity)
self.assertEquals("B5n4BiALAF8B5AqafxohyYiuj7E", router.digest)
@@ -273,28 +273,9 @@ class TestMicrodescriptorConsensus(unittest.TestCase):
count = 0
with open(descriptor_path) as descriptor_file:
- for desc in next(stem.descriptor.networkstatus.parse_file(descriptor_file, True, flavour = Flavour.MICRODESCRIPTOR)).router_descriptors:
+ for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, True, flavour = Flavour.MICRODESCRIPTOR).router_descriptors:
assert desc.nickname # check that the router has a nickname
count += 1
assert count > 100 # sanity check - assuming atleast 100 relays in the consensus
-
- def test_metrics_microdesc_consensus(self):
- """
- Checks if consensus documents from Metrics are parsed properly.
- """
-
- descriptor_path = test.integ.descriptor.get_resource("metrics_microdesc_consensus")
-
- with file(descriptor_path) as descriptor_file:
- desc = stem.descriptor.parse_file(descriptor_path, descriptor_file)
-
- router = next(next(desc).router_descriptors)
- self.assertEquals("JapanAnon", router.nickname)
- self.assertEquals("AGw/p8P246zRPQ3ZsQx9+pM8I3s", router.identity)
- self.assertEquals("9LDw0XiFeLQDXK9t8ht4+MK9tWx6Jxp1RwP36eatRWs", router.digest)
- self.assertEquals(_strptime("2012-07-18 15:55:42"), router.publication)
- self.assertEquals("220.0.231.71", router.ip)
- self.assertEquals(443, router.orport)
- self.assertEquals(9030, router.dirport)
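For illustration, a minimal sketch of reading a metrics-style consensus through
the generic stem.descriptor.parse_file entry point, assuming a downloaded file
whose first line is an "@type network-status-consensus-3" annotation (the path
below is hypothetical):

  import stem.descriptor

  path = "/tmp/metrics_consensus"  # hypothetical download location

  with open(path) as descriptor_file:
    # parse_file() still yields descriptors; for a consensus the single
    # yielded item is now the NetworkStatusDocument rather than each router
    document = next(stem.descriptor.parse_file(path, descriptor_file))

    for router in document.router_descriptors:
      print router.nickname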