[tor-commits] [stem/master] Skeleton for extra-info descriptors
atagar at torproject.org
atagar at torproject.org
Mon May 14 00:14:27 UTC 2012
commit 13944b062ff26a5efa178ff8d5552ff1d049574b
Author: Damian Johnson <atagar at torproject.org>
Date: Sun May 6 20:08:53 2012 -0700
Skeleton for extra-info descriptors
Basic module for parsing extrainfo descriptors. This doesn't actually do any
parsing yet; it just turns the raw content into an ExtraInfoDescriptor instance.
This also moves the shared helpers we'll need out of the server_descriptor
module and into stem.descriptor so both descriptor modules are able to use them.
What code there is here is exercised via the runner tests.
---
stem/descriptor/__init__.py | 158 ++++++++++++++++++++++-
stem/descriptor/extrainfo_descriptor.py | 111 ++++++++++++++++
stem/descriptor/server_descriptor.py | 159 +----------------------
test/integ/descriptor/data/extrainfo_descriptor | 12 ++
4 files changed, 286 insertions(+), 154 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 2af1fb0..f3a5983 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -8,9 +8,16 @@ Descriptor - Common parent for all descriptor file types.
+- __str__ - string that the descriptor was made from
"""
-__all__ = ["descriptor", "reader", "server_descriptor", "parse_file", "Descriptor"]
+__all__ = ["descriptor", "reader", "extrainfo_descriptor", "server_descriptor", "parse_file", "Descriptor"]
import os
+import re
+
+KEYWORD_CHAR = "a-zA-Z0-9-"
+WHITESPACE = " \t"
+KEYWORD_LINE = re.compile("^([%s]+)[%s]*(.*)$" % (KEYWORD_CHAR, WHITESPACE))
+PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE))
+PGP_BLOCK_END = "-----END %s-----"
def parse_file(path, descriptor_file):
"""
@@ -28,6 +35,7 @@ def parse_file(path, descriptor_file):
IOError if unable to read from the descriptor_file
"""
+ import stem.descriptor.extrainfo_descriptor
import stem.descriptor.server_descriptor
# The tor descriptor specifications do not provide a reliable method for
@@ -44,6 +52,10 @@ def parse_file(path, descriptor_file):
for desc in stem.descriptor.server_descriptor.parse_file(descriptor_file):
desc._set_path(path)
yield desc
+ elif filename == "cached-extrainfo" or first_line.startswith("extra-info "):
+ for desc in stem.descriptor.extrainfo_descriptor.parse_file(descriptor_file):
+ desc._set_path(path)
+ yield desc
else:
# unrecognized descriptor type
raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
@@ -85,3 +97,147 @@ class Descriptor:
def __str__(self):
return self._raw_contents
+def _read_until_keyword(keyword, descriptor_file, inclusive = False):
+ """
+ Reads from the descriptor file until we get to the given keyword or reach the
+ end of the file.
+
+ Arguments:
+ keyword (str) - keyword we want to read until
+ descriptor_file (file) - file with the descriptor content
+ inclusive (bool) - includes the line with the keyword if True
+
+ Returns:
+ list with the lines until we find the keyword
+ """
+
+ content = []
+
+ while True:
+ last_position = descriptor_file.tell()
+ line = descriptor_file.readline()
+ if not line: break # EOF
+
+ if " " in line: line_keyword = line.split(" ", 1)[0]
+ else: line_keyword = line.strip()
+
+ if line_keyword == keyword:
+ if inclusive: content.append(line)
+ else: descriptor_file.seek(last_position)
+
+ break
+ else:
+ content.append(line)
+
+ return content
+
+def _get_pseudo_pgp_block(remaining_contents):
+ """
+ Checks if given contents begins with a pseudo-Open-PGP-style block and, if
+ so, pops it off and provides it back to the caller.
+
+ Arguments:
+ remaining_contents (list) - lines to be checked for a public key block
+
+ Returns:
+ str with the armor wrapped contents or None if it doesn't exist
+
+ Raises:
+ ValueError if the contents starts with a key block but it's malformed (for
+ instance, if it lacks an ending line)
+ """
+
+ if not remaining_contents:
+ return None # nothing left
+
+ block_match = PGP_BLOCK_START.match(remaining_contents[0])
+
+ if block_match:
+ block_type = block_match.groups()[0]
+ block_lines = []
+
+ while True:
+ if not remaining_contents:
+ raise ValueError("Unterminated pgp style block")
+
+ line = remaining_contents.pop(0)
+ block_lines.append(line)
+
+ if line == PGP_BLOCK_END % block_type:
+ return "\n".join(block_lines)
+ else:
+ return None
+
+def _get_descriptor_components(raw_contents, validate, extra_keywords):
+ """
+ Initial breakup of the server descriptor contents to make parsing easier.
+
+ A descriptor contains a series of 'keyword lines' which are simply a keyword
+ followed by an optional value. Lines can also be followed by a signature
+ block.
+
+ To get a sublisting with just certain keywords use extra_keywords. This can
+ be useful if we care about their relative ordering with respect to each
+ other. For instance, we care about the ordering of 'accept' and 'reject'
+ entries because this influences the resulting exit policy, but for everything
+ else in server descriptors the order does not matter.
+
+ Arguments:
+ raw_contents (str) - descriptor content provided by the relay
+ validate (bool) - checks the validity of the descriptor's content if
+ True, skips these checks otherwise
+ extra_keywords (list) - entity keywords to put into a separate listing with
+ ordering intact
+
+ Returns:
+ tuple with the following attributes...
+ entries (dict) - keyword => (value, pgp key) entries
+ first_keyword (str) - keyword of the first line
+ last_keyword (str) - keyword of the last line
+ extra_entries (list) - lines containing entries matching extra_keywords
+ """
+
+ entries = {}
+ first_keyword = None
+ last_keyword = None
+ extra_entries = [] # entries with a keyword in extra_keywords
+ remaining_lines = raw_contents.split("\n")
+
+ while remaining_lines:
+ line = remaining_lines.pop(0)
+
+ # last line can be empty
+ if not line and not remaining_lines: continue
+
+ # Some lines have an 'opt ' for backward compatibility. They should be
+ # ignored. This prefix is being removed in...
+ # https://trac.torproject.org/projects/tor/ticket/5124
+
+ if line.startswith("opt "): line = line[4:]
+
+ line_match = KEYWORD_LINE.match(line)
+
+ if not line_match:
+ if not validate: continue
+ raise ValueError("Line contains invalid characters: %s" % line)
+
+ keyword, value = line_match.groups()
+
+ if not first_keyword: first_keyword = keyword
+ last_keyword = keyword
+
+ try:
+ block_contents = _get_pseudo_pgp_block(remaining_lines)
+ except ValueError, exc:
+ if not validate: continue
+ raise exc
+
+ if keyword in extra_keywords:
+ extra_entries.append("%s %s" % (keyword, value))
+ elif keyword in entries:
+ entries[keyword].append((value, block_contents))
+ else:
+ entries[keyword] = [(value, block_contents)]
+
+ return entries, first_keyword, last_keyword, extra_entries
+
diff --git a/stem/descriptor/extrainfo_descriptor.py b/stem/descriptor/extrainfo_descriptor.py
new file mode 100644
index 0000000..891ba18
--- /dev/null
+++ b/stem/descriptor/extrainfo_descriptor.py
@@ -0,0 +1,111 @@
+"""
+Parsing for Tor extra-info descriptors. These are published by relays whenever
+their server descriptor is published and have a similar format. However, unlike
+server descriptors these don't contain information that Tor clients require to
+function and as such aren't fetched by default.
+
+Defined in section 2.2 of the dir-spec, extra-info descriptors contain
+interesting but non-vital information such as usage statistics. These documents
+cannot be requested of bridges.
+
+Extra-info descriptors are available from a few sources...
+
+- if you have 'DownloadExtraInfo 1' in your torrc...
+ - control port via 'GETINFO extra-info/digest/*' queries
+ - the 'cached-extrainfo' file in tor's data directory
+- tor metrics, at https://metrics.torproject.org/data.html
+- directory authorities and mirrors via their DirPort
+
+parse_file - Iterates over the extra-info descriptors in a file.
+ExtraInfoDescriptor - Tor extra-info descriptor.
+"""
+
+import stem.descriptor
+
+def parse_file(descriptor_file, validate = True):
+ """
+ Iterates over the extra-info descriptors in a file.
+
+ Arguments:
+ descriptor_file (file) - file with descriptor content
+ validate (bool) - checks the validity of the descriptor's content if
+ True, skips these checks otherwise
+
+ Returns:
+ iterator for ExtraInfoDescriptor instances in the file
+
+ Raises:
+ ValueError if the contents is malformed and validate is True
+ IOError if the file can't be read
+ """
+
+ while True:
+ extrainfo_content = stem.descriptor._read_until_keyword("router-signature", descriptor_file)
+
+ # we've reached the 'router-signature', now include the pgp style block
+ block_end_prefix = stem.descriptor.PGP_BLOCK_END.split(' ', 1)[0]
+ extrainfo_content += stem.descriptor._read_until_keyword(block_end_prefix, descriptor_file, True)
+
+ if extrainfo_content:
+ yield ExtraInfoDescriptor("".join(extrainfo_content), validate)
+ else: break # done parsing file
+
+class ExtraInfoDescriptor(stem.descriptor.Descriptor):
+ """
+ Extra-info descriptor document.
+
+ Attributes:
+ nickname (str) - relay's nickname (*)
+ fingerprint (str) - forty hex digits that make up the relay's fingerprint (*)
+ published (datetime.datetime) - time in GMT when the descriptor was generated (*)
+ geoip_db_digest (str) - sha1 of geoIP database file
+
+ read_history (str) - read-history line, always unset
+ read_history_end (datetime.datetime) - end of the sampling interval
+ read_history_interval (int) - seconds per interval
+ read_history_values (list) - bytes read during each interval (*)
+
+ write_history (str) - write-history line, always unset
+ write_history_end (datetime.datetime) - end of the sampling interval
+ write_history_interval (int) - seconds per interval
+ write_history_values (list) - bytes written during each interval (*)
+
+ (*) required fields, others are left as None if undefined
+ """
+
+ def __init__(self, raw_contents, validate = True, annotations = None):
+ """
+ Extra-info descriptor constructor, created from a relay's extra-info
+ content (as provided by "GETINFO extra-info/digest/*", cached contents, and
+ metrics).
+
+ By default this validates the descriptor's content as it's parsed. This
+ validation can be disabled to either improve performance or be accepting of
+ malformed data.
+
+ Arguments:
+ raw_contents (str) - extra-info content provided by the relay
+ validate (bool) - checks the validity of the extra-info descriptor if
+ True, skips these checks otherwise
+
+ Raises:
+ ValueError if the contents is malformed and validate is True
+ """
+
+ stem.descriptor.Descriptor.__init__(self, raw_contents)
+
+ self.nickname = None
+ self.fingerprint = None
+ self.published = None
+ self.geoip_db_digest = None
+
+ self.read_history = None
+ self.read_history_end = None
+ self.read_history_interval = None
+ self.read_history_values = []
+
+ self.write_history = None
+ self.write_history_end = None
+ self.write_history_interval = None
+ self.write_history_values = []
+
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index 551835d..b87ed2a 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -33,12 +33,6 @@ import stem.version
import stem.util.connection
import stem.util.tor_tools
-KEYWORD_CHAR = "a-zA-Z0-9-"
-WHITESPACE = " \t"
-KEYWORD_LINE = re.compile("^([%s]+)[%s]*(.*)$" % (KEYWORD_CHAR, WHITESPACE))
-PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE))
-PGP_BLOCK_END = "-----END %s-----"
-
# relay descriptors must have exactly one of the following
REQUIRED_FIELDS = (
"router",
@@ -127,56 +121,21 @@ def parse_file(descriptor_file, validate = True):
# to the caller).
while True:
- annotations = _read_until_keyword("router", descriptor_file)
- descriptor_content = _read_until_keyword("router-signature", descriptor_file)
+ annotations = stem.descriptor._read_until_keyword("router", descriptor_file)
+ descriptor_content = stem.descriptor._read_until_keyword("router-signature", descriptor_file)
# we've reached the 'router-signature', now include the pgp style block
- block_end_prefix = PGP_BLOCK_END.split(' ', 1)[0]
- descriptor_content += _read_until_keyword(block_end_prefix, descriptor_file, True)
+ block_end_prefix = stem.descriptor.PGP_BLOCK_END.split(' ', 1)[0]
+ descriptor_content += stem.descriptor._read_until_keyword(block_end_prefix, descriptor_file, True)
if descriptor_content:
# strip newlines from annotations
annotations = map(str.strip, annotations)
descriptor_text = "".join(descriptor_content)
- descriptor = RelayDescriptor(descriptor_text, validate, annotations)
- yield descriptor
+ yield RelayDescriptor(descriptor_text, validate, annotations)
else: break # done parsing descriptors
-def _read_until_keyword(keyword, descriptor_file, inclusive = False):
- """
- Reads from the descriptor file until we get to the given keyword or reach the
- end of the file.
-
- Arguments:
- keyword (str) - keyword we want to read until
- descriptor_file (file) - file with the descriptor content
- inclusive (bool) - includes the line with the keyword if True
-
- Returns:
- list with the lines until we find the keyword
- """
-
- content = []
-
- while True:
- last_position = descriptor_file.tell()
- line = descriptor_file.readline()
- if not line: break # EOF
-
- if " " in line: line_keyword = line.split(" ", 1)[0]
- else: line_keyword = line.strip()
-
- if line_keyword == keyword:
- if inclusive: content.append(line)
- else: descriptor_file.seek(last_position)
-
- break
- else:
- content.append(line)
-
- return content
-
class ServerDescriptor(stem.descriptor.Descriptor):
"""
Common parent for server descriptors.
@@ -297,7 +256,7 @@ class ServerDescriptor(stem.descriptor.Descriptor):
# does not matter so breaking it into key / value pairs.
entries, first_keyword, last_keyword, self.exit_policy = \
- _get_descriptor_components(raw_contents, validate)
+ stem.descriptor._get_descriptor_components(raw_contents, validate, ("accept", "reject"))
self._parse(entries, validate)
if validate: self._check_constraints(entries, first_keyword, last_keyword)
@@ -816,109 +775,3 @@ class BridgeDescriptor(ServerDescriptor):
def _first_keyword(self):
return "router"
-def _get_descriptor_components(raw_contents, validate):
- """
- Initial breakup of the server descriptor contents to make parsing easier.
-
- A descriptor contains a series of 'keyword lines' which are simply a keyword
- followed by an optional value. Lines can also be followed by a signature
- block.
-
- We care about the ordering of 'accept' and 'reject' entries because this
- influences the resulting exit policy, but for everything else the order does
- not matter so breaking it into key / value pairs.
-
- Arguments:
- raw_contents (str) - descriptor content provided by the relay
- validate (bool) - checks the validity of the descriptor's content if
- True, skips these checks otherwise
-
- Returns:
- tuple with the following attributes...
- entries (dict) - keyword => (value, pgp key) entries
- first_keyword (str) - keyword of the first line
- last_keyword (str) - keyword of the last line
- exit_policy (list) - lines containing the exit policy
- """
-
- entries = {}
- first_keyword = None
- last_keyword = None
- exit_policy = []
- remaining_lines = raw_contents.split("\n")
-
- while remaining_lines:
- line = remaining_lines.pop(0)
-
- # last line can be empty
- if not line and not remaining_lines: continue
-
- # Some lines have an 'opt ' for backward compatability. They should be
- # ignored. This prefix is being removed in...
- # https://trac.torproject.org/projects/tor/ticket/5124
-
- if line.startswith("opt "): line = line[4:]
-
- line_match = KEYWORD_LINE.match(line)
-
- if not line_match:
- if not validate: continue
- raise ValueError("Line contains invalid characters: %s" % line)
-
- keyword, value = line_match.groups()
-
- if not first_keyword: first_keyword = keyword
- last_keyword = keyword
-
- try:
- block_contents = _get_pseudo_pgp_block(remaining_lines)
- except ValueError, exc:
- if not validate: continue
- raise exc
-
- if keyword in ("accept", "reject"):
- exit_policy.append("%s %s" % (keyword, value))
- elif keyword in entries:
- entries[keyword].append((value, block_contents))
- else:
- entries[keyword] = [(value, block_contents)]
-
- return entries, first_keyword, last_keyword, exit_policy
-
-def _get_pseudo_pgp_block(remaining_contents):
- """
- Checks if given contents begins with a pseudo-Open-PGP-style block and, if
- so, pops it off and provides it back to the caller.
-
- Arguments:
- remaining_contents (list) - lines to be checked for a public key block
-
- Returns:
- str with the armor wrapped contents or None if it doesn't exist
-
- Raises:
- ValueError if the contents starts with a key block but it's malformed (for
- instance, if it lacks an ending line)
- """
-
- if not remaining_contents:
- return None # nothing left
-
- block_match = PGP_BLOCK_START.match(remaining_contents[0])
-
- if block_match:
- block_type = block_match.groups()[0]
- block_lines = []
-
- while True:
- if not remaining_contents:
- raise ValueError("Unterminated pgp style block")
-
- line = remaining_contents.pop(0)
- block_lines.append(line)
-
- if line == PGP_BLOCK_END % block_type:
- return "\n".join(block_lines)
- else:
- return None
-
diff --git a/test/integ/descriptor/data/extrainfo_descriptor b/test/integ/descriptor/data/extrainfo_descriptor
new file mode 100644
index 0000000..4525afe
--- /dev/null
+++ b/test/integ/descriptor/data/extrainfo_descriptor
@@ -0,0 +1,12 @@
+extra-info NINJA B2289C3EAB83ECD6EB916A2F481A02E6B76A0A48
+published 2012-05-05 17:03:50
+write-history 2012-05-05 17:02:45 (900 s) 1082368,19456,50176,272384,485376,1850368,1132544,1790976,2459648,4091904,6310912,13701120,3209216,3871744,7873536,5440512,7287808,10561536,9979904,11247616,11982848,7590912,10611712,20728832,38534144,6839296,3173376,16678912
+read-history 2012-05-05 17:02:45 (900 s) 3309568,9216,41984,27648,123904,2004992,364544,576512,1607680,3808256,4672512,12783616,2938880,2562048,7348224,3574784,6488064,10954752,9359360,4438016,6286336,6438912,4502528,10720256,38165504,1524736,2336768,8186880
+dirreq-write-history 2012-05-05 17:02:45 (900 s) 0,0,0,227328,349184,382976,738304,1171456,850944,657408,1675264,987136,702464,1335296,587776,1941504,893952,533504,695296,6828032,6326272,1287168,6310912,10085376,1048576,5372928,894976,8610816
+dirreq-read-history 2012-05-05 17:02:45 (900 s) 0,0,0,0,33792,27648,48128,46080,60416,51200,63488,64512,45056,27648,37888,48128,57344,34816,46080,50176,37888,51200,25600,33792,39936,32768,28672,30720
+router-signature
+-----BEGIN SIGNATURE-----
+K5FSywk7qvw/boA4DQcqkls6Ize5vcBYfhQ8JnOeRQC9+uDxbnpm3qaYN9jZ8myj
+k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
+7LZqklu+gVvhMKREpchVqlAwXkWR44VENm24Hs+mT3M=
+-----END SIGNATURE-----
More information about the tor-commits
mailing list