[tor-commits] [stem/master] Normalizing descriptor handling as unicode
atagar at torproject.org
Sat Feb 2 18:20:49 UTC 2013
commit b4c4835d957463d6a453d9d7f7ad6007627abc96
Author: Damian Johnson <atagar at torproject.org>
Date: Tue Jan 29 09:03:30 2013 -0800
Normalizing descriptor handling as unicode
Our python 3 descriptor integration tests were having trouble due to the
ASCII/unicode switch. Adding a file interceptor so we always parse descriptors
as unicode.
Yes, yes, I know. Ewwww. I definitely don't like the _UnicodeReader helper, so
suggestions for an alternative are welcome.
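Since the message above asks for alternatives: a minimal sketch of one option,
using the stdlib codecs module to build the decoding wrapper instead of a
hand-rolled class (the file path and helper name here are purely illustrative):

  import codecs

  def unicode_reader(descriptor_file):
    # wrap a binary file-like object so read()/readline() return unicode,
    # replacing undecodable bytes rather than raising UnicodeDecodeError
    return codecs.getreader("utf-8")(descriptor_file, errors = "replace")

  with open("/tmp/cached-consensus", "rb") as descriptor_file:
    reader = unicode_reader(descriptor_file)
    first_line = reader.readline()  # unicode on both python 2 and 3

One trade-off is that codecs.StreamReader buffers internally, so tell() and
seek() on the wrapper may not line up with byte offsets in the underlying file
the way the pass-through methods of _UnicodeReader do.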
---
stem/descriptor/__init__.py | 65 ++++++++++++++++++++++++++++++++++++++
stem/descriptor/networkstatus.py | 4 +-
stem/exit_policy.py | 6 ++--
stem/util/connection.py | 4 +-
stem/util/tor_tools.py | 6 ++--
test/integ/descriptor/reader.py | 2 +-
test/settings.cfg | 2 +-
7 files changed, 77 insertions(+), 12 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 0e4a3e1..fe6c03c 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -27,6 +27,8 @@ __all__ = [
import os
import re
+import stem.prereq
+
try:
# added in python 2.7
from collections import OrderedDict
@@ -97,6 +99,10 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
import stem.descriptor.extrainfo_descriptor
import stem.descriptor.networkstatus
+ # attempt to read content as unicode
+
+ descriptor_file = _UnicodeReader(descriptor_file)
+
# The tor descriptor specifications do not provide a reliable method for
# identifying a descriptor file's type and version so we need to guess
# based on its filename. Metrics descriptors, however, can be identified
@@ -236,6 +242,65 @@ class Descriptor(object):
return self._raw_contents
+class _UnicodeReader(object):
+ """
+ File-like object that wraps another file, decoding the bytes it reads into
+ unicode content. Only read operations are supported.
+ """
+
+ def __init__(self, wrapped_file):
+ self.wrapped_file = wrapped_file
+
+ def close(self):
+ return self.wrapped_file.close()
+
+ def getvalue(self):
+ return self.wrapped_file.getvalue()
+
+ def isatty(self):
+ return self.wrapped_file.isatty()
+
+ def next(self):
+ return self.wrapped_file.next()
+
+ def read(self, n = -1):
+ return self._to_unicode(self.wrapped_file.read(n))
+
+ def readline(self):
+ return self._to_unicode(self.wrapped_file.readline())
+
+ def readlines(self, sizehint = 0):
+ # being careful to do in-place conversion so we don't accidentally double our
+ # memory usage
+
+ results = self.wrapped_file.readlines(sizehint)
+
+ for i in range(len(results)):
+ results[i] = self._to_unicode(results[i])
+
+ return results
+
+ def seek(self, pos, mode = 0):
+ return self.wrapped_file.seek(pos, mode)
+
+ def tell(self):
+ return self.wrapped_file.tell()
+
+ def _to_unicode(self, msg):
+ if msg is None:
+ return msg
+
+ if stem.prereq.is_python_3():
+ is_unicode = isinstance(msg, str)
+ else:
+ is_unicode = isinstance(msg, unicode)
+
+ if is_unicode:
+ return msg
+ else:
+ return msg.decode("utf-8", "replace")
+
+
def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None, include_ending_keyword = False):
"""
Reads from the descriptor file until we get to one of the given keywords or reach the
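A rough usage sketch of the new wrapper (in practice parse_file() applies it
for you; the descriptor line and the direct _UnicodeReader access below are
only for illustration):

  import io
  import stem.descriptor

  raw = b"router caerSidi 71.35.133.197 9001 0 0\n"
  reader = stem.descriptor._UnicodeReader(io.BytesIO(raw))

  line = reader.readline()
  print(type(line))  # unicode on python 2, str on python 3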
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py
index b4bc188..69f1a44 100644
--- a/stem/descriptor/networkstatus.py
+++ b/stem/descriptor/networkstatus.py
@@ -1343,10 +1343,10 @@ class DocumentSignature(object):
if validate:
if not stem.util.tor_tools.is_valid_fingerprint(identity):
- raise ValueError("Malformed fingerprint (%s) in the document signature" % (identity))
+ raise ValueError("Malformed fingerprint (%s) in the document signature" % identity)
if not stem.util.tor_tools.is_valid_fingerprint(key_digest):
- raise ValueError("Malformed key digest (%s) in the document signature" % (key_digest))
+ raise ValueError("Malformed key digest (%s) in the document signature" % key_digest)
self.method = method
self.identity = identity
diff --git a/stem/exit_policy.py b/stem/exit_policy.py
index 0ad2c54..e60f121 100644
--- a/stem/exit_policy.py
+++ b/stem/exit_policy.py
@@ -104,7 +104,7 @@ def get_config_policy(rules):
:raises: **ValueError** if input isn't a valid tor exit policy
"""
- if isinstance(rules, str):
+ if isinstance(rules, (str, unicode)):
rules = rules.split(',')
result = []
@@ -143,7 +143,7 @@ class ExitPolicy(object):
def __init__(self, *rules):
# sanity check the types
for rule in rules:
- if not isinstance(rule, (str, ExitPolicyRule)):
+ if not isinstance(rule, (str, unicode, ExitPolicyRule)):
raise TypeError("Exit policy rules can only contain strings or ExitPolicyRules, got a %s (%s)" % (type(rule), rules))
self._rules = None # lazily loaded series of ExitPolicyRule
@@ -300,7 +300,7 @@ class ExitPolicy(object):
is_all_accept, is_all_reject = True, True
for rule in self._input_rules:
- if isinstance(rule, str):
+ if isinstance(rule, (str, unicode)):
rule = ExitPolicyRule(rule.strip())
if rule.is_accept:
diff --git a/stem/util/connection.py b/stem/util/connection.py
index d7c0299..86ba49a 100644
--- a/stem/util/connection.py
+++ b/stem/util/connection.py
@@ -40,7 +40,7 @@ def is_valid_ip_address(address):
:returns: **True** if input is a valid IPv4 address, **False** otherwise
"""
- if not isinstance(address, str):
+ if not isinstance(address, (str, unicode)):
return False
# checks if there are four period separated values
@@ -108,7 +108,7 @@ def is_valid_port(entry, allow_zero = False):
return False
return True
- elif isinstance(entry, str):
+ elif isinstance(entry, (str, unicode)):
if not entry.isdigit():
return False
elif entry[0] == "0" and len(entry) > 1:
diff --git a/stem/util/tor_tools.py b/stem/util/tor_tools.py
index 2e52cee..e61a96c 100644
--- a/stem/util/tor_tools.py
+++ b/stem/util/tor_tools.py
@@ -45,7 +45,7 @@ def is_valid_fingerprint(entry, check_prefix = False):
:returns: **True** if the string could be a relay fingerprint, **False** otherwise
"""
- if not isinstance(entry, str):
+ if not isinstance(entry, (str, unicode)):
return False
elif check_prefix:
if not entry or entry[0] != "$":
@@ -65,7 +65,7 @@ def is_valid_nickname(entry):
:returns: **True** if the string could be a nickname, **False** otherwise
"""
- if not isinstance(entry, str):
+ if not isinstance(entry, (str, unicode)):
return False
return bool(NICKNAME_PATTERN.match(entry))
@@ -78,7 +78,7 @@ def is_valid_circuit_id(entry):
:returns: **True** if the string could be a circuit id, **False** otherwise
"""
- if not isinstance(entry, str):
+ if not isinstance(entry, (str, unicode)):
return False
return bool(CIRC_ID_PATTERN.match(entry))
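One caveat with the isinstance(entry, (str, unicode)) checks above: the name
unicode doesn't exist under python 3, so there they would raise a NameError
rather than return False. A rough sketch of the usual workaround (the str_type
name is purely illustrative, not something this patch introduces):

  import sys

  if sys.version_info[0] >= 3:
    str_type = (str,)          # all text is str under python 3
  else:
    str_type = (str, unicode)  # byte and unicode strings under python 2

  def _is_string(entry):
    return isinstance(entry, str_type)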
diff --git a/test/integ/descriptor/reader.py b/test/integ/descriptor/reader.py
index 97ea27c..936cf39 100644
--- a/test/integ/descriptor/reader.py
+++ b/test/integ/descriptor/reader.py
@@ -64,7 +64,7 @@ def _get_raw_tar_descriptors():
if tar_entry.isfile():
entry = tar_file.extractfile(tar_entry)
entry.readline() # strip header
- raw_descriptors.append(entry.read())
+ raw_descriptors.append(entry.read().decode("utf-8", "replace"))
entry.close()
finally:
if tar_file:
diff --git a/test/settings.cfg b/test/settings.cfg
index 4d99bfe..3de9d00 100644
--- a/test/settings.cfg
+++ b/test/settings.cfg
@@ -156,7 +156,7 @@ target.torrc RUN_PTRACE => PORT, PTRACE
pyflakes.ignore stem/prereq.py => 'RSA' imported but unused
pyflakes.ignore stem/prereq.py => 'asn1' imported but unused
pyflakes.ignore stem/prereq.py => 'long_to_bytes' imported but unused
-pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 32
+pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 34
pyflakes.ignore stem/util/str_tools.py => redefinition of function '_to_bytes' from line 53
pyflakes.ignore test/mocking.py => undefined name 'builtins'
pyflakes.ignore test/unit/response/events.py => 'from stem import *' used; unable to detect undefined names