[tor-commits] [stem/master] Cleaning up prior descriptor implementations
atagar at torproject.org
atagar at torproject.org
Mon Mar 26 00:10:01 UTC 2012
commit 7e9d454ed4eafe6b5461c5ff5170ca5634899edf
Author: Damian Johnson <atagar at torproject.org>
Date: Sat Mar 24 13:41:11 2012 -0700
Cleaning up prior descriptor implementations
Adding header pydocs and made a variety of small fixes and naming improvements.
---
stem/descriptor/__init__.py | 45 +++++++----------
stem/descriptor/reader.py | 4 +-
stem/descriptor/server_descriptor.py | 86 +++++++++++++++++++++++-----------
test/integ/descriptor/reader.py | 4 +-
4 files changed, 81 insertions(+), 58 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index ad17296..215864f 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -1,12 +1,18 @@
"""
Package for parsing and processing descriptor data.
+
+parse_file - Iterates over the descriptors in a file.
+Descriptor - Common parent for all descriptor file types.
+ |- get_path - location of the descriptor on disk if it came from a file
+ |- get_unrecognized_lines - unparsed descriptor content
+ +- __str__ - string that the descriptor was made from
"""
-__all__ = ["descriptor", "reader", "server_descriptor", "parse_descriptors", "Descriptor"]
+__all__ = ["descriptor", "reader", "server_descriptor", "parse_file", "Descriptor"]
import os
-def parse_descriptors(path, descriptor_file):
+def parse_file(path, descriptor_file):
"""
Provides an iterator for the descriptors within a given file.
@@ -15,7 +21,7 @@ def parse_descriptors(path, descriptor_file):
descriptor_file (file) - opened file with the descriptor contents
Returns:
- iterator that parses the file's contents into descriptors
+ iterator for Descriptor instances in the file
Raises:
TypeError if we can't match the contents of the file to a descriptor type
@@ -26,33 +32,21 @@ def parse_descriptors(path, descriptor_file):
# The tor descriptor specifications do not provide a reliable method for
# identifying a descriptor file's type and version so we need to guess
- # based on its filename for resources from the data directory and contents
- # for files provided by metrics.
+ # based on...
+ # - its filename for resources from the tor data directory
+ # - first line of our contents for files provided by metrics
filename = os.path.basename(path)
-
- if filename == "cached-descriptors":
- # server descriptors from tor's data directory
- while descriptor_file:
- yield stem.descriptor.server_descriptor.parse_server_descriptors_v2(path, descriptor_file)
-
- return
-
first_line = descriptor_file.readline()
descriptor_file.seek(0)
- if first_line.startswith("router "):
- # server descriptor
- for desc in stem.descriptor.server_descriptor.parse_server_descriptors_v2(path, descriptor_file):
+ if filename == "cached-descriptors" or first_line.startswith("router "):
+ for desc in stem.descriptor.server_descriptor.parse_file_v2(descriptor_file):
+ desc._set_path(path)
yield desc
-
- return
-
- # TODO: implement actual descriptor type recognition and parsing
- # TODO: add integ test for non-descriptor text content
- desc = Descriptor(descriptor_file.read())
- desc._set_path(path)
- yield desc
+ else:
+ # unrecognized descriptor type
+ raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
class Descriptor:
"""
@@ -83,7 +77,7 @@ class Descriptor:
list of lines of unrecognized content
"""
- return []
+ raise NotImplementedError
def _set_path(self, path):
self._path = path
@@ -91,4 +85,3 @@ class Descriptor:
def __str__(self):
return self._raw_contents
-
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index 4f043bc..511ba04 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -365,7 +365,7 @@ class DescriptorReader:
def _handle_descriptor_file(self, target):
try:
with open(target) as target_file:
- for desc in stem.descriptor.parse_descriptors(target, target_file):
+ for desc in stem.descriptor.parse_file(target, target_file):
self._enqueue_descriptor(desc)
self._iter_notice.set()
except TypeError, exc:
@@ -380,7 +380,7 @@ class DescriptorReader:
if tar_entry.isfile():
entry = tar_file.extractfile(tar_entry)
- for desc in stem.descriptor.parse_descriptors(target, entry):
+ for desc in stem.descriptor.parse_file(target, entry):
self._enqueue_descriptor(desc)
self._iter_notice.set()
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index fe6889b..70db1fe 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -6,15 +6,22 @@ etc). This information is provided from a few sources...
- control port via 'GETINFO desc/*' queries
- the 'cached-descriptors' file in tor's data directory
- tor metrics, at https://metrics.torproject.org/data.html
+
+parse_file_v2 - Iterates over the server descriptors in a file.
+ServerDescriptorV2 - Tor server descriptor, version 2.
+ |- get_unrecognized_lines - lines with unrecognized content
+ |- get_annotations - dictionary of content prior to the descriptor entry
+ |- get_annotation_lines - lines that provided the annotations
+ +- is_valid - checks the signature against the descriptor content
"""
import re
import datetime
+import stem.descriptor
import stem.version
import stem.util.connection
import stem.util.tor_tools
-import stem.descriptor
ENTRY_START = "router"
ENTRY_END = "router-signature"
@@ -48,9 +55,21 @@ SINGLE_FIELDS = (
"family",
)
-def parse_server_descriptors_v2(path, descriptor_file):
+def parse_file_v2(descriptor_file, validate = True):
"""
- Iterates over the verion 2 server descriptors in a descriptor file.
+ Iterates over the version 2 server descriptors in a file.
+
+ Arguments:
+ descriptor_file (file) - file with descriptor content
+ validate (bool) - checks the validity of the descriptor's content if
+ True, skips these checks otherwise
+
+ Returns:
+ iterator for ServerDescriptorV2 instances in the file
+
+ Raises:
+ ValueError if the contents are malformed and validate is True
+ IOError if the file can't be read
"""
# Cached descriptors consist of annotations followed by the descriptor
@@ -73,6 +92,9 @@ def parse_server_descriptors_v2(path, descriptor_file):
# - parse as descriptor content until we get to ENTRY_END followed by the
# end of the signature block
# - construct a descriptor and provide it back to the caller
+ #
+ # Any annotations after the last server descriptor are ignored (never
+ # provided to the caller).
while True:
annotations = _read_until_keyword(ENTRY_START, descriptor_file)
@@ -82,16 +104,14 @@ def parse_server_descriptors_v2(path, descriptor_file):
block_end_prefix = PGP_BLOCK_END.split(' ', 1)[0]
descriptor_content += _read_until_keyword(block_end_prefix, descriptor_file, True)
- # If the file has ending annotations (ie, non-descriptor text after the
- # last descriptor) then we won't have any descriptor content at this point.
- # This is fine. Those ending annotations are simply never returned to the
- # caller.
-
if descriptor_content:
- descriptor = ServerDescriptorV2("\n".join(descriptor_content), annotations = annotations)
- descriptor._set_path(path)
+ # strip newlines from annotations
+ annotations = map(str.strip, annotations)
+
+ descriptor_text = "".join(descriptor_content)
+ descriptor = ServerDescriptorV2(descriptor_text, validate, annotations)
yield descriptor
- else: return # done parsing descriptors
+ else: break # done parsing descriptors
def _read_until_keyword(keyword, descriptor_file, inclusive = False):
"""
@@ -112,12 +132,10 @@ def _read_until_keyword(keyword, descriptor_file, inclusive = False):
while True:
last_position = descriptor_file.tell()
line = descriptor_file.readline()
-
if not line: break # EOF
- line = line.strip()
if " " in line: line_keyword = line.split(" ", 1)[0]
- else: line_keyword = line
+ else: line_keyword = line.strip()
if line_keyword == keyword:
if inclusive: content.append(line)
@@ -157,13 +175,13 @@ def _get_psudo_pgp_block(remaining_contents):
while True:
if not remaining_contents:
- raise ValueError("Unterminated public key block")
+ raise ValueError("Unterminated pgp style block")
line = remaining_contents.pop(0)
block_lines.append(line)
if line == PGP_BLOCK_END % block_type:
- return block_type, "\n".join(block_lines)
+ return (block_type, "\n".join(block_lines))
else:
return (None, None)
@@ -197,7 +215,7 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
contact (str) - relay's contact information
family (list) - nicknames or fingerprints of relays it has a declared family with (*)
- * required fields, others are left as None if undefined
+ (*) required fields, others are left as None if undefined
"""
nickname = address = or_port = socks_port = dir_port = None
@@ -206,7 +224,7 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
onion_key = onion_key_type = signing_key = signing_key_type = None
router_sig = router_sig_type = contact = None
hibernating = False
- family = unrecognized_entries = []
+ family = unrecognized_lines = []
# TODO: Until we have a proper ExitPolicy class this is just a list of the
# exit policy strings...
@@ -258,6 +276,9 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
while remaining_contents:
line = remaining_contents.pop(0)
+ # last line can be empty
+ if not line and not remaining_contents: continue
+
# Some lines have an 'opt ' for backward compatability. They should be
# ignored. This prefix is being removed in...
# https://trac.torproject.org/projects/tor/ticket/5124
@@ -271,7 +292,12 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
raise ValueError("Line contains invalid characters: %s" % line)
keyword, value = line_match.groups()
- block_type, block_contents = _get_psudo_pgp_block(remaining_contents)
+
+ try:
+ block_type, block_contents = _get_psudo_pgp_block(remaining_contents)
+ except ValueError, exc:
+ if not validate: continue
+ raise exc
if keyword in ("accept", "reject"):
self.exit_policy.append("%s %s" % (keyword, value))
@@ -291,7 +317,6 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
raise ValueError("The '%s' entry can only appear once in a descriptor" % keyword)
# parse all the entries into our attributes
-
for keyword, values in entries.items():
# most just work with the first (and only) value
value, block_type, block_contents = values[0]
@@ -332,13 +357,15 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
if not validate: continue
raise ValueError("Bandwidth line must have three values: %s" % line)
- if validate:
- if not bandwidth_comp[0].isdigit():
- raise ValueError("Bandwidth line's average rate isn't numeric: %s" % bandwidth_comp[0])
- elif not bandwidth_comp[1].isdigit():
- raise ValueError("Bandwidth line's burst rate isn't numeric: %s" % bandwidth_comp[1])
- elif not bandwidth_comp[2].isdigit():
- raise ValueError("Bandwidth line's observed rate isn't numeric: %s" % bandwidth_comp[2])
+ if not bandwidth_comp[0].isdigit():
+ if not validate: continue
+ raise ValueError("Bandwidth line's average rate isn't numeric: %s" % bandwidth_comp[0])
+ elif not bandwidth_comp[1].isdigit():
+ if not validate: continue
+ raise ValueError("Bandwidth line's burst rate isn't numeric: %s" % bandwidth_comp[1])
+ elif not bandwidth_comp[2].isdigit():
+ if not validate: continue
+ raise ValueError("Bandwidth line's observed rate isn't numeric: %s" % bandwidth_comp[2])
self.average_bandwidth = int(bandwidth_comp[0])
self.burst_bandwidth = int(bandwidth_comp[1])
@@ -419,7 +446,10 @@ class ServerDescriptorV2(stem.descriptor.Descriptor):
elif keyword == "family":
self.family = value.split(" ")
else:
- self.unrecognized_entries.append(line)
+ self.unrecognized_lines.append(line)
+
+ def get_unrecognized_lines(self):
+ return list(unrecognized_lines)
def get_annotations(self):
"""
diff --git a/test/integ/descriptor/reader.py b/test/integ/descriptor/reader.py
index 7f2a425..cdc33c2 100644
--- a/test/integ/descriptor/reader.py
+++ b/test/integ/descriptor/reader.py
@@ -51,7 +51,7 @@ def _get_raw_tar_descriptors():
for tar_entry in tar_file:
if tar_entry.isfile():
entry = tar_file.extractfile(tar_entry)
- raw_descriptors.append(entry.read().strip())
+ raw_descriptors.append(entry.read())
entry.close()
TAR_DESCRIPTORS = raw_descriptors
@@ -149,7 +149,7 @@ class TestDescriptorReader(unittest.TestCase):
descriptor_path = os.path.join(DESCRIPTOR_TEST_DATA, "example_descriptor")
with open(descriptor_path) as descriptor_file:
- descriptor_entries.append(descriptor_file.read().strip())
+ descriptor_entries.append(descriptor_file.read())
# running this test multiple times to flush out concurrency issues
for i in xrange(15):
More information about the tor-commits
mailing list