[tor-commits] [stem/master] Lazy loading server descriptor's contact and platform line
atagar at torproject.org
atagar at torproject.org
Sun Jan 25 22:37:34 UTC 2015
commit a30d624aea54ff60cc6dda850dda4356d68617d4
Author: Damian Johnson <atagar at torproject.org>
Date: Sun Jan 25 10:57:39 2015 -0800
Lazy loading server descriptor's contact and platform line
These lines are special in that they're raw bytes rather than unicode (that is
to say, they're not necessarily recognizable text). As such, I had left their
parsing alone (it was eagerly loaded), but on reflection we get a nice
performance boost by making these lazy too.
---
stem/descriptor/__init__.py | 41 ++++++++++++----------------------
stem/descriptor/server_descriptor.py | 16 ++++++-------
2 files changed, 21 insertions(+), 36 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 1a7a097..a5eb87d 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -326,6 +326,20 @@ def _parse_simple_line(keyword, attribute):
return _parse
+def _parse_bytes_line(keyword, attribute):
+ def _parse(descriptor, entries):
+ line_match = re.search(stem.util.str_tools._to_bytes('^(opt )?%s(?:[%s]+(.*))?$' % (keyword, WHITESPACE)), descriptor.get_bytes(), re.MULTILINE)
+ result = None
+
+ if line_match:
+ value = line_match.groups()[1]
+ result = b'' if value is None else value
+
+ setattr(descriptor, attribute, result)
+
+ return _parse
+
+
def _parse_timestamp_line(keyword, attribute):
# "<keyword>" YYYY-MM-DD HH:MM:SS
@@ -503,33 +517,6 @@ class Descriptor(object):
return self._raw_contents
-def _get_bytes_field(keyword, content):
- """
- Provides the value corresponding to the given keyword. This is handy to fetch
- values specifically allowed to be arbitrary bytes prior to converting to
- unicode.
-
- :param str keyword: line to look up
- :param bytes content: content to look through
-
- :returns: **bytes** value on the given line, **None** if the line doesn't
- exist
-
- :raises: **ValueError** if the content isn't bytes
- """
-
- if not isinstance(content, bytes):
- raise ValueError('Content must be bytes, got a %s' % type(content))
-
- line_match = re.search(stem.util.str_tools._to_bytes('^(opt )?%s(?:[%s]+(.*))?$' % (keyword, WHITESPACE)), content, re.MULTILINE)
-
- if line_match:
- value = line_match.groups()[1]
- return b'' if value is None else value
- else:
- return None
-
-
def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None, include_ending_keyword = False):
"""
Reads from the descriptor file until we get to one of the given keywords or reach the
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index c4f4bca..5786ab9 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -51,12 +51,12 @@ from stem.util import log
from stem.descriptor import (
PGP_BLOCK_END,
Descriptor,
- _get_bytes_field,
_get_descriptor_components,
_read_until_keywords,
_value,
_values,
_parse_simple_line,
+ _parse_bytes_line,
_parse_timestamp_line,
_parse_forty_character_hex,
_parse_key_block,
@@ -222,6 +222,8 @@ def _parse_bandwidth_line(descriptor, entries):
def _parse_platform_line(descriptor, entries):
# "platform" string
+ _parse_bytes_line('platform', 'platform')(descriptor, entries)
+
# The platform attribute was set earlier. This line can contain any
# arbitrary data, but tor seems to report its version followed by the
# os like the following...
@@ -367,6 +369,7 @@ def _parse_exit_policy(descriptor, entries):
del descriptor._unparsed_exit_policy
+_parse_contact_line = _parse_bytes_line('contact', 'contact')
_parse_published_line = _parse_timestamp_line('published', 'published')
_parse_extrainfo_digest_line = _parse_forty_character_hex('extra-info-digest', 'extra_info_digest')
_parse_read_history_line = functools.partial(_parse_history_line, 'read-history', 'read_history_end', 'read_history_interval', 'read_history_values')
@@ -437,6 +440,7 @@ class ServerDescriptor(Descriptor):
ATTRIBUTES = {
'nickname': (None, _parse_router_line),
'fingerprint': (None, _parse_fingerprint_line),
+ 'contact': (None, _parse_contact_line),
'published': (None, _parse_published_line),
'exit_policy': (None, _parse_exit_policy),
@@ -445,6 +449,7 @@ class ServerDescriptor(Descriptor):
'socks_port': (None, _parse_router_line),
'dir_port': (None, _parse_router_line),
+ 'platform': (None, _parse_platform_line),
'tor_version': (None, _parse_platform_line),
'operating_system': (None, _parse_platform_line),
'uptime': (None, _parse_uptime_line),
@@ -480,6 +485,7 @@ class ServerDescriptor(Descriptor):
'platform': _parse_platform_line,
'published': _parse_published_line,
'fingerprint': _parse_fingerprint_line,
+ 'contact': _parse_contact_line,
'hibernating': _parse_hibernating_line,
'extra-info-digest': _parse_extrainfo_digest_line,
'hidden-service-dir': _parse_hidden_service_dir_line,
@@ -493,7 +499,6 @@ class ServerDescriptor(Descriptor):
'caches-extra-info': _parse_caches_extra_info_line,
'family': _parse_family_line,
'eventdns': _parse_eventdns_line,
- 'contact': lambda descriptor, entries: None, # parsed as a bytes field earlier
}
def __init__(self, raw_contents, validate = True, annotations = None):
@@ -515,13 +520,6 @@ class ServerDescriptor(Descriptor):
"""
super(ServerDescriptor, self).__init__(raw_contents, lazy_load = not validate)
-
- # Only a few things can be arbitrary bytes according to the dir-spec, so
- # parsing them separately.
-
- self.platform = _get_bytes_field('platform', raw_contents)
- self.contact = _get_bytes_field('contact', raw_contents)
-
self._annotation_lines = annotations if annotations else []
# A descriptor contains a series of 'keyword lines' which are simply a
More information about the tor-commits
mailing list