[tor-commits] [stem/master] Moving bulk of server descriptor parsing to helpers
atagar at torproject.org
atagar at torproject.org
Sun Jan 25 22:37:33 UTC 2015
commit 99e3c7b8c3f8417324290d5bf7952393d968396a
Author: Damian Johnson <atagar at torproject.org>
Date: Sun Jan 4 16:41:12 2015 -0800
Moving bulk of server descriptor parsing to helpers
Shifting actual parsing to helper functions so we'll be able to use these for
lazy loading. This makes us a little less tolerant of garbage data but,
honestly, that shouldn't be a bad thing.
---
stem/descriptor/server_descriptor.py | 455 +++++++++++++++--------------
test/unit/descriptor/server_descriptor.py | 8 +-
2 files changed, 244 insertions(+), 219 deletions(-)
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index 6726077..671e96d 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -167,6 +167,203 @@ def _parse_file(descriptor_file, is_bridge = False, validate = True, **kwargs):
break # done parsing descriptors
+def _parse_router_line(descriptor, value):
+ # "router" nickname address ORPort SocksPort DirPort
+
+ router_comp = value.split()
+
+ if len(router_comp) < 5:
+ raise ValueError('Router line must have five values: router %s' % value)
+ elif not stem.util.tor_tools.is_valid_nickname(router_comp[0]):
+ raise ValueError("Router line entry isn't a valid nickname: %s" % router_comp[0])
+ elif not stem.util.connection.is_valid_ipv4_address(router_comp[1]):
+ raise ValueError("Router line entry isn't a valid IPv4 address: %s" % router_comp[1])
+ elif not stem.util.connection.is_valid_port(router_comp[2], allow_zero = True):
+ raise ValueError("Router line's ORPort is invalid: %s" % router_comp[2])
+ elif not stem.util.connection.is_valid_port(router_comp[3], allow_zero = True):
+ raise ValueError("Router line's SocksPort is invalid: %s" % router_comp[3])
+ elif not stem.util.connection.is_valid_port(router_comp[4], allow_zero = True):
+ raise ValueError("Router line's DirPort is invalid: %s" % router_comp[4])
+
+ descriptor.nickname = router_comp[0]
+ descriptor.address = router_comp[1]
+ descriptor.or_port = int(router_comp[2])
+ descriptor.socks_port = None if router_comp[3] == '0' else int(router_comp[3])
+ descriptor.dir_port = None if router_comp[4] == '0' else int(router_comp[4])
+
+
+def _parse_bandwidth_line(descriptor, value):
+ # "bandwidth" bandwidth-avg bandwidth-burst bandwidth-observed
+
+ bandwidth_comp = value.split()
+
+ if len(bandwidth_comp) < 3:
+ raise ValueError('Bandwidth line must have three values: bandwidth %s' % value)
+ elif not bandwidth_comp[0].isdigit():
+ raise ValueError("Bandwidth line's average rate isn't numeric: %s" % bandwidth_comp[0])
+ elif not bandwidth_comp[1].isdigit():
+ raise ValueError("Bandwidth line's burst rate isn't numeric: %s" % bandwidth_comp[1])
+ elif not bandwidth_comp[2].isdigit():
+ raise ValueError("Bandwidth line's observed rate isn't numeric: %s" % bandwidth_comp[2])
+
+ descriptor.average_bandwidth = int(bandwidth_comp[0])
+ descriptor.burst_bandwidth = int(bandwidth_comp[1])
+ descriptor.observed_bandwidth = int(bandwidth_comp[2])
+
+
+def _parse_platform_line(descriptor, value):
+ # "platform" string
+
+ # The platform attribute was set earlier. This line can contain any
+ # arbitrary data, but tor seems to report its version followed by the
+ # os like the following...
+ #
+ # platform Tor 0.2.2.35 (git-73ff13ab3cc9570d) on Linux x86_64
+ #
+ # There's no guarantee that we'll be able to pick out the version
+ # and os, but might as well try to save our caller the effort.
+
+ platform_match = re.match('^(?:node-)?Tor (\S*).* on (.*)$', value)
+
+ if platform_match:
+ version_str, descriptor.operating_system = platform_match.groups()
+
+ try:
+ descriptor.tor_version = stem.version._get_version(version_str)
+ except ValueError:
+ pass
+
+
+def _parse_published_line(descriptor, value):
+ # "published" YYYY-MM-DD HH:MM:SS
+
+ try:
+ descriptor.published = stem.util.str_tools._parse_timestamp(value)
+ except ValueError:
+ raise ValueError("Published line's time wasn't parsable: published %s" % value)
+
+
+def _parse_fingerprint_line(descriptor, value):
+ # This is forty hex digits split into space separated groups of four.
+ # Checking that we match this pattern.
+
+ fingerprint = value.replace(' ', '')
+
+ for grouping in value.split(' '):
+ if len(grouping) != 4:
+ raise ValueError('Fingerprint line should have groupings of four hex digits: %s' % value)
+
+ if not stem.util.tor_tools.is_valid_fingerprint(fingerprint):
+ raise ValueError('Tor relay fingerprints consist of forty hex digits: %s' % value)
+
+ descriptor.fingerprint = fingerprint
+
+
+def _parse_hibernating_line(descriptor, value):
+ # "hibernating" 0|1 (in practice only set if one)
+
+ if value not in ('0', '1'):
+ raise ValueError('Hibernating line had an invalid value, must be zero or one: %s' % value)
+
+ descriptor.hibernating = value == '1'
+
+
+def _parse_extrainfo_digest_line(descriptor, value):
+ # this is forty hex digits which just so happens to be the same as a
+ # fingerprint
+
+ if not stem.util.tor_tools.is_valid_fingerprint(value):
+ raise ValueError('Extra-info digests should consist of forty hex digits: %s' % value)
+
+ descriptor.extra_info_digest = value
+
+
+def _parse_hidden_service_dir_line(descriptor, value):
+ if value:
+ descriptor.hidden_service_dir = value.split(' ')
+ else:
+ descriptor.hidden_service_dir = ['2']
+
+
+def _parse_uptime_line(descriptor, value):
+ # We need to be tolerant of negative uptimes to accommodate a past tor
+ # bug...
+ #
+ # Changes in version 0.1.2.7-alpha - 2007-02-06
+ # - If our system clock jumps back in time, don't publish a negative
+ # uptime in the descriptor. Also, don't let the global rate limiting
+ # buckets go absurdly negative.
+ #
+ # After parsing all of the attributes we'll double check that negative
+ # uptimes only occurred prior to this fix.
+
+ try:
+ descriptor.uptime = int(value)
+ except ValueError:
+ raise ValueError('Uptime line must have an integer value: %s' % value)
+
+
+def _parse_protocols_line(descriptor, value):
+ protocols_match = re.match('^Link (.*) Circuit (.*)$', value)
+
+ if not protocols_match:
+ raise ValueError('Protocols line did not match the expected pattern: protocols %s' % value)
+
+ link_versions, circuit_versions = protocols_match.groups()
+ descriptor.link_protocols = link_versions.split(' ')
+ descriptor.circuit_protocols = circuit_versions.split(' ')
+
+
+def _parse_or_address_line(descriptor, all_values):
+ or_addresses = []
+
+ for entry in all_values:
+ line = 'or-address %s' % entry
+
+ if ':' not in entry:
+ raise ValueError('or-address line missing a colon: %s' % line)
+
+ address, port = entry.rsplit(':', 1)
+ is_ipv6 = address.startswith('[') and address.endswith(']')
+
+ if is_ipv6:
+ address = address[1:-1] # remove brackets
+
+ if not ((not is_ipv6 and stem.util.connection.is_valid_ipv4_address(address)) or
+ (is_ipv6 and stem.util.connection.is_valid_ipv6_address(address))):
+ raise ValueError('or-address line has a malformed address: %s' % line)
+
+ if not stem.util.connection.is_valid_port(port):
+ raise ValueError('or-address line has a malformed port: %s' % line)
+
+ or_addresses.append((address, int(port), is_ipv6))
+
+ descriptor.or_addresses = or_addresses
+
+
+def _parse_history_line(descriptor, value, is_read):
+ keyword = 'read-history' if is_read else 'write-history'
+ timestamp, interval, remainder = \
+ stem.descriptor.extrainfo_descriptor._parse_timestamp_and_interval(keyword, value)
+
+ try:
+ if remainder:
+ history_values = [int(entry) for entry in remainder.split(',')]
+ else:
+ history_values = []
+ except ValueError:
+ raise ValueError('%s line has non-numeric values: %s %s' % (keyword, keyword, value))
+
+ if is_read:
+ descriptor.read_history_end = timestamp
+ descriptor.read_history_interval = interval
+ descriptor.read_history_values = history_values
+ else:
+ descriptor.write_history_end = timestamp
+ descriptor.write_history_interval = interval
+ descriptor.write_history_values = history_values
+
+
class ServerDescriptor(Descriptor):
"""
Common parent for server descriptors.
@@ -378,222 +575,50 @@ class ServerDescriptor(Descriptor):
if block_contents:
line += '\n%s' % block_contents
- if keyword == 'router':
- # "router" nickname address ORPort SocksPort DirPort
- router_comp = value.split()
-
- if len(router_comp) < 5:
- if not validate:
- continue
-
- raise ValueError('Router line must have five values: %s' % line)
-
- if validate:
- if not stem.util.tor_tools.is_valid_nickname(router_comp[0]):
- raise ValueError("Router line entry isn't a valid nickname: %s" % router_comp[0])
- elif not stem.util.connection.is_valid_ipv4_address(router_comp[1]):
- raise ValueError("Router line entry isn't a valid IPv4 address: %s" % router_comp[1])
- elif not stem.util.connection.is_valid_port(router_comp[2], allow_zero = True):
- raise ValueError("Router line's ORPort is invalid: %s" % router_comp[2])
- elif not stem.util.connection.is_valid_port(router_comp[3], allow_zero = True):
- raise ValueError("Router line's SocksPort is invalid: %s" % router_comp[3])
- elif not stem.util.connection.is_valid_port(router_comp[4], allow_zero = True):
- raise ValueError("Router line's DirPort is invalid: %s" % router_comp[4])
- elif not (router_comp[2].isdigit() and router_comp[3].isdigit() and router_comp[4].isdigit()):
- continue
-
- self.nickname = router_comp[0]
- self.address = router_comp[1]
- self.or_port = int(router_comp[2])
- self.socks_port = None if router_comp[3] == '0' else int(router_comp[3])
- self.dir_port = None if router_comp[4] == '0' else int(router_comp[4])
- elif keyword == 'bandwidth':
- # "bandwidth" bandwidth-avg bandwidth-burst bandwidth-observed
- bandwidth_comp = value.split()
-
- if len(bandwidth_comp) < 3:
- if not validate:
- continue
-
- raise ValueError('Bandwidth line must have three values: %s' % line)
- elif not bandwidth_comp[0].isdigit():
- if not validate:
- continue
-
- raise ValueError("Bandwidth line's average rate isn't numeric: %s" % bandwidth_comp[0])
- elif not bandwidth_comp[1].isdigit():
- if not validate:
- continue
-
- raise ValueError("Bandwidth line's burst rate isn't numeric: %s" % bandwidth_comp[1])
- elif not bandwidth_comp[2].isdigit():
- if not validate:
- continue
-
- raise ValueError("Bandwidth line's observed rate isn't numeric: %s" % bandwidth_comp[2])
-
- self.average_bandwidth = int(bandwidth_comp[0])
- self.burst_bandwidth = int(bandwidth_comp[1])
- self.observed_bandwidth = int(bandwidth_comp[2])
- elif keyword == 'platform':
- # "platform" string
-
- # The platform attribute was set earlier. This line can contain any
- # arbitrary data, but tor seems to report its version followed by the
- # os like the following...
- #
- # platform Tor 0.2.2.35 (git-73ff13ab3cc9570d) on Linux x86_64
- #
- # There's no guarantee that we'll be able to pick these out the
- # version, but might as well try to save our caller the effort.
-
- platform_match = re.match('^(?:node-)?Tor (\S*).* on (.*)$', value)
-
- if platform_match:
- version_str, self.operating_system = platform_match.groups()
-
- try:
- self.tor_version = stem.version._get_version(version_str)
- except ValueError:
- pass
- elif keyword == 'published':
- # "published" YYYY-MM-DD HH:MM:SS
-
- try:
- self.published = stem.util.str_tools._parse_timestamp(value)
- except ValueError:
- if validate:
- raise ValueError("Published line's time wasn't parsable: %s" % line)
- elif keyword == 'fingerprint':
- # This is forty hex digits split into space separated groups of four.
- # Checking that we match this pattern.
-
- fingerprint = value.replace(' ', '')
-
- if validate:
- for grouping in value.split(' '):
- if len(grouping) != 4:
- raise ValueError('Fingerprint line should have groupings of four hex digits: %s' % value)
-
- if not stem.util.tor_tools.is_valid_fingerprint(fingerprint):
- raise ValueError('Tor relay fingerprints consist of forty hex digits: %s' % value)
-
- self.fingerprint = fingerprint
- elif keyword == 'hibernating':
- # "hibernating" 0|1 (in practice only set if one)
-
- if validate and value not in ('0', '1'):
- raise ValueError('Hibernating line had an invalid value, must be zero or one: %s' % value)
-
- self.hibernating = value == '1'
- elif keyword == 'allow-single-hop-exits':
- self.allow_single_hop_exits = True
- elif keyword == 'caches-extra-info':
- self.extra_info_cache = True
- elif keyword == 'extra-info-digest':
- # this is forty hex digits which just so happens to be the same a
- # fingerprint
-
- if validate and not stem.util.tor_tools.is_valid_fingerprint(value):
- raise ValueError('Extra-info digests should consist of forty hex digits: %s' % value)
-
- self.extra_info_digest = value
- elif keyword == 'hidden-service-dir':
- if value:
- self.hidden_service_dir = value.split(' ')
+ try:
+ if keyword == 'router':
+ _parse_router_line(self, value)
+ elif keyword == 'bandwidth':
+ _parse_bandwidth_line(self, value)
+ elif keyword == 'platform':
+ _parse_platform_line(self, value)
+ elif keyword == 'published':
+ _parse_published_line(self, value)
+ elif keyword == 'fingerprint':
+ _parse_fingerprint_line(self, value)
+ elif keyword == 'hibernating':
+ _parse_hibernating_line(self, value)
+ elif keyword == 'allow-single-hop-exits':
+ self.allow_single_hop_exits = True
+ elif keyword == 'caches-extra-info':
+ self.extra_info_cache = True
+ elif keyword == 'extra-info-digest':
+ _parse_extrainfo_digest_line(self, value)
+ elif keyword == 'hidden-service-dir':
+ _parse_hidden_service_dir_line(self, value)
+ elif keyword == 'uptime':
+ _parse_uptime_line(self, value)
+ elif keyword == 'contact':
+ pass # parsed as a bytes field earlier
+ elif keyword == 'protocols':
+ _parse_protocols_line(self, value)
+ elif keyword == 'family':
+ self.family = set(value.split(' '))
+ elif keyword == 'eventdns':
+ self.eventdns = value == '1'
+ elif keyword == 'ipv6-policy':
+ self.exit_policy_v6 = stem.exit_policy.MicroExitPolicy(value)
+ elif keyword == 'or-address':
+ _parse_or_address_line(self, [entry[0] for entry in values])
+ elif keyword == 'read-history':
+ _parse_history_line(self, value, True)
+ elif keyword == 'write-history':
+ _parse_history_line(self, value, False)
else:
- self.hidden_service_dir = ['2']
- elif keyword == 'uptime':
- # We need to be tolerant of negative uptimes to accommodate a past tor
- # bug...
- #
- # Changes in version 0.1.2.7-alpha - 2007-02-06
- # - If our system clock jumps back in time, don't publish a negative
- # uptime in the descriptor. Also, don't let the global rate limiting
- # buckets go absurdly negative.
- #
- # After parsing all of the attributes we'll double check that negative
- # uptimes only occurred prior to this fix.
-
- try:
- self.uptime = int(value)
- except ValueError:
- if not validate:
- continue
-
- raise ValueError('Uptime line must have an integer value: %s' % value)
- elif keyword == 'contact':
- pass # parsed as a bytes field earlier
- elif keyword == 'protocols':
- protocols_match = re.match('^Link (.*) Circuit (.*)$', value)
-
- if protocols_match:
- link_versions, circuit_versions = protocols_match.groups()
- self.link_protocols = link_versions.split(' ')
- self.circuit_protocols = circuit_versions.split(' ')
- elif validate:
- raise ValueError('Protocols line did not match the expected pattern: %s' % line)
- elif keyword == 'family':
- self.family = set(value.split(' '))
- elif keyword == 'eventdns':
- self.eventdns = value == '1'
- elif keyword == 'ipv6-policy':
- self.exit_policy_v6 = stem.exit_policy.MicroExitPolicy(value)
- elif keyword == 'or-address':
- or_address_entries = [address_entry for (address_entry, _, _) in values]
-
- for entry in or_address_entries:
- line = '%s %s' % (keyword, entry)
-
- if ':' not in entry:
- if not validate:
- continue
- else:
- raise ValueError('or-address line missing a colon: %s' % line)
-
- address, port = entry.rsplit(':', 1)
- is_ipv6 = address.startswith('[') and address.endswith(']')
-
- if is_ipv6:
- address = address[1:-1] # remove brackets
-
- if not ((not is_ipv6 and stem.util.connection.is_valid_ipv4_address(address)) or
- (is_ipv6 and stem.util.connection.is_valid_ipv6_address(address))):
- if not validate:
- continue
- else:
- raise ValueError('or-address line has a malformed address: %s' % line)
-
- if stem.util.connection.is_valid_port(port):
- self.or_addresses.append((address, int(port), is_ipv6))
- elif validate:
- raise ValueError('or-address line has a malformed port: %s' % line)
- elif keyword in ('read-history', 'write-history'):
- try:
- timestamp, interval, remainder = \
- stem.descriptor.extrainfo_descriptor._parse_timestamp_and_interval(keyword, value)
-
- try:
- if remainder:
- history_values = [int(entry) for entry in remainder.split(',')]
- else:
- history_values = []
- except ValueError:
- raise ValueError('%s line has non-numeric values: %s' % (keyword, line))
-
- if keyword == 'read-history':
- self.read_history_end = timestamp
- self.read_history_interval = interval
- self.read_history_values = history_values
- else:
- self.write_history_end = timestamp
- self.write_history_interval = interval
- self.write_history_values = history_values
- except ValueError as exc:
- if validate:
- raise exc
- else:
- self._unrecognized_lines.append(line)
+ self._unrecognized_lines.append(line)
+ except ValueError as exc:
+ if validate:
+ raise exc
# if we have a negative uptime and a tor version that shouldn't exhibit
# this bug then fail validation
diff --git a/test/unit/descriptor/server_descriptor.py b/test/unit/descriptor/server_descriptor.py
index c57c476..61654a7 100644
--- a/test/unit/descriptor/server_descriptor.py
+++ b/test/unit/descriptor/server_descriptor.py
@@ -378,7 +378,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
"""
desc_text = get_relay_server_descriptor({'router': 'saberrider2008ReallyLongNickname 71.35.133.197 9001 0 0'}, content = True)
- self._expect_invalid_attr(desc_text, 'nickname', 'saberrider2008ReallyLongNickname')
+ self._expect_invalid_attr(desc_text, 'nickname')
def test_nickname_invalid_char(self):
"""
@@ -386,7 +386,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
"""
desc_text = get_relay_server_descriptor({'router': '$aberrider2008 71.35.133.197 9001 0 0'}, content = True)
- self._expect_invalid_attr(desc_text, 'nickname', '$aberrider2008')
+ self._expect_invalid_attr(desc_text, 'nickname')
def test_address_malformed(self):
"""
@@ -394,7 +394,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
"""
desc_text = get_relay_server_descriptor({'router': 'caerSidi 371.35.133.197 9001 0 0'}, content = True)
- self._expect_invalid_attr(desc_text, 'address', '371.35.133.197')
+ self._expect_invalid_attr(desc_text, 'address')
def test_port_too_high(self):
"""
@@ -402,7 +402,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
"""
desc_text = get_relay_server_descriptor({'router': 'caerSidi 71.35.133.197 900001 0 0'}, content = True)
- self._expect_invalid_attr(desc_text, 'or_port', 900001)
+ self._expect_invalid_attr(desc_text, 'or_port')
def test_port_malformed(self):
"""
More information about the tor-commits
mailing list