[tor-commits] [stem/master] Normalizing descriptor handling as unicode
atagar at torproject.org
Sat Feb 2 18:20:49 UTC 2013
commit b4c4835d957463d6a453d9d7f7ad6007627abc96
Author: Damian Johnson <atagar at torproject.org>
Date: Tue Jan 29 09:03:30 2013 -0800
Normalizing descriptor handling as unicode
Our python 3 descriptor integration tests were having trouble due to the
ASCII/unicode switch. Adding a file interceptor so we always parse descriptors
as unicode.
Yes, yes, I know. Ewwww. I definitely don't like the _UnicodeReader helper, so
suggestions for an alternative are welcome.
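Since the message above asks for alternatives: a minimal sketch of one option,
using the stdlib codecs module to build the decoding wrapper instead of a
hand-rolled class (the file path and helper name here are purely illustrative):

  import codecs

  def unicode_reader(descriptor_file):
    # wrap a binary file-like object so read()/readline() return unicode,
    # replacing undecodable bytes rather than raising UnicodeDecodeError
    return codecs.getreader("utf-8")(descriptor_file, errors = "replace")

  with open("/tmp/cached-consensus", "rb") as descriptor_file:
    reader = unicode_reader(descriptor_file)
    first_line = reader.readline()  # unicode on both python 2 and 3

One trade-off is that codecs.StreamReader buffers internally, so tell() and
seek() on the wrapper may not line up with byte offsets in the underlying file
the way the pass-through methods of _UnicodeReader do.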
---
stem/descriptor/__init__.py | 65 ++++++++++++++++++++++++++++++++++++++
stem/descriptor/networkstatus.py | 4 +-
stem/exit_policy.py | 6 ++--
stem/util/connection.py | 4 +-
stem/util/tor_tools.py | 6 ++--
test/integ/descriptor/reader.py | 2 +-
test/settings.cfg | 2 +-
7 files changed, 77 insertions(+), 12 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 0e4a3e1..fe6c03c 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -27,6 +27,8 @@ __all__ = [
import os
import re
+import stem.prereq
+
try:
# added in python 2.7
from collections import OrderedDict
@@ -97,6 +99,10 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
import stem.descriptor.extrainfo_descriptor
import stem.descriptor.networkstatus
+ # attempt to read content as unicode
+
+ descriptor_file = _UnicodeReader(descriptor_file)
+
# The tor descriptor specifications do not provide a reliable method for
# identifying a descriptor file's type and version so we need to guess
# based on its filename. Metrics descriptors, however, can be identified
@@ -236,6 +242,65 @@ class Descriptor(object):
return self._raw_contents
+class _UnicodeReader(object):
+ """
+ File-like object that wraps another file, decoding the bytes it reads into
+ unicode content. Only read operations are supported.
+ """
+
+ def __init__(self, wrapped_file):
+ self.wrapped_file = wrapped_file
+
+ def close(self):
+ return self.wrapped_file.close()
+
+ def getvalue(self):
+ return self.wrapped_file.getvalue()
+
+ def isatty(self):
+ return self.wrapped_file.isatty()
+
+ def next(self):
+ return self.wrapped_file.next()
+
+ def read(self, n = -1):
+ return self._to_unicode(self.wrapped_file.read(n))
+
+ def readline(self):
+ return self._to_unicode(self.wrapped_file.readline())
+
+ def readlines(self, sizehint = 0):
+ # being careful to do in-place conversion so we don't accidentally double our
+ # memory usage
+
+ results = self.wrapped_file.readlines(sizehint)
+
+ for i in range(len(results)):
+ results[i] = self._to_unicode(results[i])
+
+ return results
+
+ def seek(self, pos, mode = 0):
+ return self.wrapped_file.seek(pos, mode)
+
+ def tell(self):
+ return self.wrapped_file.tell()
+
+ def _to_unicode(self, msg):
+ if msg is None:
+ return msg
+
+ if stem.prereq.is_python_3():
+ is_unicode = isinstance(msg, str)
+ else:
+ is_unicode = isinstance(msg, unicode)
+
+ if is_unicode:
+ return msg
+ else:
+ return msg.decode("utf-8", "replace")
+
+
def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None, include_ending_keyword = False):
"""
Reads from the descriptor file until we get to one of the given keywords or reach the
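A rough usage sketch of the new wrapper (in practice parse_file() applies it
for you; the descriptor line and the direct _UnicodeReader access below are
only for illustration):

  import io
  import stem.descriptor

  raw = b"router caerSidi 71.35.133.197 9001 0 0\n"
  reader = stem.descriptor._UnicodeReader(io.BytesIO(raw))

  line = reader.readline()
  print(type(line))  # unicode on python 2, str on python 3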
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py
index b4bc188..69f1a44 100644
--- a/stem/descriptor/networkstatus.py
+++ b/stem/descriptor/networkstatus.py
@@ -1343,10 +1343,10 @@ class DocumentSignature(object):
if validate:
if not stem.util.tor_tools.is_valid_fingerprint(identity):
- raise ValueError("Malformed fingerprint (%s) in the document signature" % (identity))
+ raise ValueError("Malformed fingerprint (%s) in the document signature" % identity)
if not stem.util.tor_tools.is_valid_fingerprint(key_digest):
- raise ValueError("Malformed key digest (%s) in the document signature" % (key_digest))
+ raise ValueError("Malformed key digest (%s) in the document signature" % key_digest)
self.method = method
self.identity = identity
diff --git a/stem/exit_policy.py b/stem/exit_policy.py
index 0ad2c54..e60f121 100644
--- a/stem/exit_policy.py
+++ b/stem/exit_policy.py
@@ -104,7 +104,7 @@ def get_config_policy(rules):
:raises: **ValueError** if input isn't a valid tor exit policy
"""
- if isinstance(rules, str):
+ if isinstance(rules, (str, unicode)):
rules = rules.split(',')
result = []
@@ -143,7 +143,7 @@ class ExitPolicy(object):
def __init__(self, *rules):
# sanity check the types
for rule in rules:
- if not isinstance(rule, (str, ExitPolicyRule)):
+ if not isinstance(rule, (str, unicode, ExitPolicyRule)):
raise TypeError("Exit policy rules can only contain strings or ExitPolicyRules, got a %s (%s)" % (type(rule), rules))
self._rules = None # lazily loaded series of ExitPolicyRule
@@ -300,7 +300,7 @@ class ExitPolicy(object):
is_all_accept, is_all_reject = True, True
for rule in self._input_rules:
- if isinstance(rule, str):
+ if isinstance(rule, (str, unicode)):
rule = ExitPolicyRule(rule.strip())
if rule.is_accept:
diff --git a/stem/util/connection.py b/stem/util/connection.py
index d7c0299..86ba49a 100644
--- a/stem/util/connection.py
+++ b/stem/util/connection.py
@@ -40,7 +40,7 @@ def is_valid_ip_address(address):
:returns: **True** if input is a valid IPv4 address, **False** otherwise
"""
- if not isinstance(address, str):
+ if not isinstance(address, (str, unicode)):
return False
# checks if there are four period separated values
@@ -108,7 +108,7 @@ def is_valid_port(entry, allow_zero = False):
return False
return True
- elif isinstance(entry, str):
+ elif isinstance(entry, (str, unicode)):
if not entry.isdigit():
return False
elif entry[0] == "0" and len(entry) > 1:
diff --git a/stem/util/tor_tools.py b/stem/util/tor_tools.py
index 2e52cee..e61a96c 100644
--- a/stem/util/tor_tools.py
+++ b/stem/util/tor_tools.py
@@ -45,7 +45,7 @@ def is_valid_fingerprint(entry, check_prefix = False):
:returns: **True** if the string could be a relay fingerprint, **False** otherwise
"""
- if not isinstance(entry, str):
+ if not isinstance(entry, (str, unicode)):
return False
elif check_prefix:
if not entry or entry[0] != "$":
@@ -65,7 +65,7 @@ def is_valid_nickname(entry):
:returns: **True** if the string could be a nickname, **False** otherwise
"""
- if not isinstance(entry, str):
+ if not isinstance(entry, (str, unicode)):
return False
return bool(NICKNAME_PATTERN.match(entry))
@@ -78,7 +78,7 @@ def is_valid_circuit_id(entry):
:returns: **True** if the string could be a circuit id, **False** otherwise
"""
- if not isinstance(entry, str):
+ if not isinstance(entry, (str, unicode)):
return False
return bool(CIRC_ID_PATTERN.match(entry))
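One caveat with the isinstance(entry, (str, unicode)) checks above: the name
unicode doesn't exist under python 3, so there they would raise a NameError
rather than return False. A rough sketch of the usual workaround (the str_type
name is purely illustrative, not something this patch introduces):

  import sys

  if sys.version_info[0] >= 3:
    str_type = (str,)          # all text is str under python 3
  else:
    str_type = (str, unicode)  # byte and unicode strings under python 2

  def _is_string(entry):
    return isinstance(entry, str_type)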
diff --git a/test/integ/descriptor/reader.py b/test/integ/descriptor/reader.py
index 97ea27c..936cf39 100644
--- a/test/integ/descriptor/reader.py
+++ b/test/integ/descriptor/reader.py
@@ -64,7 +64,7 @@ def _get_raw_tar_descriptors():
if tar_entry.isfile():
entry = tar_file.extractfile(tar_entry)
entry.readline() # strip header
- raw_descriptors.append(entry.read())
+ raw_descriptors.append(entry.read().decode("utf-8", "replace"))
entry.close()
finally:
if tar_file:
diff --git a/test/settings.cfg b/test/settings.cfg
index 4d99bfe..3de9d00 100644
--- a/test/settings.cfg
+++ b/test/settings.cfg
@@ -156,7 +156,7 @@ target.torrc RUN_PTRACE => PORT, PTRACE
pyflakes.ignore stem/prereq.py => 'RSA' imported but unused
pyflakes.ignore stem/prereq.py => 'asn1' imported but unused
pyflakes.ignore stem/prereq.py => 'long_to_bytes' imported but unused
-pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 32
+pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 34
pyflakes.ignore stem/util/str_tools.py => redefinition of function '_to_bytes' from line 53
pyflakes.ignore test/mocking.py => undefined name 'builtins'
pyflakes.ignore test/unit/response/events.py => 'from stem import *' used; unable to detect undefined names