[tor-commits] [stem/master] Compression class

Sat Aug 17 20:44:26 UTC 2019

commit 9e9458501fa4b2f8819f93ff853c658b9e63366c
Author: Damian Johnson <atagar at torproject.org>
Date:   Tue Jun 18 18:37:05 2019 -0700

    Compression class
    
    Both our collector and remote modules need to decompress descriptors, so adding
    a little helper class to assist with availability checks and decompression.
---
 stem/descriptor/__init__.py              |  79 +++++++++++++++++++++++++++++++
 stem/descriptor/collector.py             |  77 +++---------------------------
 test/integ/descriptor/collector.py       |   9 ++--
 test/settings.cfg                        |   3 +-
 test/unit/descriptor/collector.py        |  11 +++--
 test/unit/descriptor/compression.py      |  39 +++++++++++++++
 test/unit/descriptor/data/compressed_bz2 | Bin 0 -> 1691 bytes
 7 files changed, 138 insertions(+), 80 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 4d13ec60..0b3fda91 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -11,6 +11,8 @@ Package for parsing and processing descriptor data.
   parse_file - Parses the descriptors in a file.
   create_signing_key - Cretes a signing key that can be used for creating descriptors.
 
+  Compression - method of descriptor decompression
+
   Descriptor - Common parent for all descriptor file types.
     | |- content - creates the text of a new descriptor
     | |- create - creates a new descriptor
@@ -172,6 +174,83 @@ DocumentHandler = stem.util.enum.UppercaseEnum(
 )
 
 
+class _Compression(object):
+  """
+  Compression method supported by CollecTor.
+
+  :var bool available: **True** if this method of decompression is available,
+    **False** otherwise
+  :var str encoding: `http 'Accept-Encoding' parameter <https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens>`_
+  :var str extension: file extension of this compression
+
+  .. versionadded:: 1.8.0
+  """
+
+  def __init__(self, name, module, encoding, extension, decompression_func):
+    if module is None:
+      self._module = None
+      self.available = True
+    else:
+      # Compression modules are optional. Usually gzip and bz2 are available,
+      # but they might be missing if compiling python yourself. As for lzma it
+      # was added in python 3.3.
+
+      try:
+        self._module = __import__(module)
+        self.available = True
+      except ImportError:
+        self._module = None
+        self.available = False
+
+    self.extension = extension
+    self.encoding = encoding
+
+    self._name = name
+    self._module_name = module
+    self._decompression_func = decompression_func
+
+  def decompress(self, content):
+    """
+    Decompresses the given content via this method.
+
+    :param bytes content: content to be decompressed
+
+    :returns: **bytes** with the decompressed content
+
+    :raises:
+      If unable to decompress this raises...
+
+      * **IOError** if content isn't compressed with this method
+      * **ImportError** if this method of decompression is unavailable
+    """
+
+    if not self.available:
+      raise ImportError("'%s' decompression module is unavailable" % self._module_name)
+
+    return self._decompression_func(self._module, content)
+
+  def __str__(self):
+    return self._name
+
+
+def _zstd_decompress(module, content):
+  output_buffer = io.BytesIO()
+
+  with module.ZstdDecompressor().write_to(output_buffer) as decompressor:
+    decompressor.write(content)
+
+  return output_buffer.getvalue()
+
+
+Compression = stem.util.enum.Enum(
+  ('PLAINTEXT', _Compression('plaintext', None, 'identity', '.txt', lambda module, content: content)),
+  ('GZIP', _Compression('gzip', 'zlib', 'gzip', '.gz', lambda module, content: module.decompress(content, module.MAX_WBITS | 32))),
+  ('BZ2', _Compression('bzip2', 'bz2', 'bzip2', '.bz2', lambda module, content: module.decompress(content))),
+  ('LZMA', _Compression('lzma', 'lzma', 'x-tor-lzma', '.xz', lambda module, content: module.decompress(content))),
+  ('ZSTD', _Compression('zstd', 'zstd', 'zstd', '.zst', _zstd_decompress)),
+)
+
+
 class TypeAnnotation(collections.namedtuple('TypeAnnotation', ['name', 'major_version', 'minor_version'])):
   """
   `Tor metrics type annotation
diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index 21a774e9..b3f99241 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -50,10 +50,11 @@ With this you can either download and read directly from CollecTor...
 .. versionadded:: 1.8.0
 """
 
-import io
 import json
 import time
 
+from stem.descriptor import Compression
+
 try:
   # account for urllib's change between python 2.x and 3.x
   import urllib.request as urllib
@@ -68,82 +69,18 @@ COLLECTOR_URL = 'https://collector.torproject.org/'
 REFRESH_INDEX_RATE = 3600  # get new index if cached copy is an hour old
 
 
-class Compression(object):
-  """
-  Compression method supported by CollecTor.
-
-  :var bool available: **True** if this method of decryption is available,
-    **False** otherwise
-  :var str extension: file extension of this compression
-  """
-
-  def __init__(self, module, extension):
-    # Compression modules are optional. Usually gzip and bz2 are available, but
-    # they might be missing if compiling python yourself. As for lzma it was
-    # added in python 3.3.
-
-    try:
-      self._module = __import__(module)
-      self.available = True
-    except ImportError:
-      self._module = None
-      self.available = False
-
-    self.extension = extension
-    self._module_name = module
-
-  def decompress(self, content):
-    """
-    Decompresses the given content via this method.
-
-    :param bytes content: content to be decompressed
-
-    :returns: **bytes** with the decompressed content
-
-    :raises:
-      If unable to decompress this provide...
-
-      * **IOError** if content isn't compressed with this
-      * **ImportError** if this method if decompression is unavalable
-    """
-
-    if not self.available:
-      raise ImportError("'%s' decompression module is unavailable" % self)
-
-    if self._module_name == 'gzip':
-      if stem.prereq.is_python_3():
-        return self._module.decompress(content)
-      else:
-        # prior to python 3.2 gzip only had GzipFile
-        return self._module.GzipFile(fileobj = io.BytesIO(content)).read()
-    elif self._module_name == 'bz2':
-      return self._module.decompress(content)
-    elif self._module_name == 'lzma':
-      return self._module.decompress(content)
-    else:
-      raise ImportError('BUG: No implementation for %s decompression' % self)
-
-  def __str__(self):
-    return self._module_name
-
-
-GZIP = Compression('gzip', '.gz')
-BZ2 = Compression('bz2', '.bz2')
-LZMA = Compression('lzma', '.xz')
-
-
 def url(resource, compression = None):
   """
   Provides CollecTor url for the given resource.
 
   :param str resource: resource type of the url
-  :param descriptor.collector.Compression compression: compression type to
+  :param descriptor.Compression compression: compression type to
     download from
 
   :returns: **str** with the CollecTor url
   """
 
-  # TODO: Not yet sure how to most elegantly map resources to urls. No doubt
+  # TODO: Unsure how to most elegantly map resources to urls. No doubt
   # this'll change as we add more types.
 
   if resource == 'index':
@@ -152,7 +89,7 @@ def url(resource, compression = None):
     raise ValueError("'%s' isn't a recognized resource type" % resource)
 
   suffix = compression.extension if compression else ''
-  return ''.join((COLLECTOR_URL, '/'.join(path), suffix))
+  return COLLECTOR_URL + '/'.join(path) + suffix
 
 
 class CollecTor(object):
@@ -161,7 +98,7 @@ class CollecTor(object):
   provided in `an index <https://collector.torproject.org/index/index.json>`_
   that's fetched as required.
 
-  :var descriptor.collector.Compression compression: compression type to
+  :var descriptor.Compression compression: compression type to
     download from, if undefiled we'll use the best decompression available
   :var int retries: number of times to attempt the request if downloading it
     fails
@@ -172,7 +109,7 @@ class CollecTor(object):
     if compression == 'best':
       self.compression = None
 
-      for option in (LZMA, BZ2, GZIP):
+      for option in (Compression.LZMA, Compression.BZ2, Compression.GZIP):
         if option.available:
           self.compression = option
           break
diff --git a/test/integ/descriptor/collector.py b/test/integ/descriptor/collector.py
index 1af329a5..dbb09d5a 100644
--- a/test/integ/descriptor/collector.py
+++ b/test/integ/descriptor/collector.py
@@ -6,7 +6,8 @@ import unittest
 
 import test.require
 
-from stem.descriptor.collector import GZIP, BZ2, LZMA, CollecTor
+from stem.descriptor import Compression
+from stem.descriptor.collector import CollecTor
 
 
 class TestCollector(unittest.TestCase):
@@ -18,17 +19,17 @@ class TestCollector(unittest.TestCase):
   @test.require.only_run_once
   @test.require.online
   def test_index_gzip(self):
-    self._test_index(GZIP)
+    self._test_index(Compression.GZIP)
 
   @test.require.only_run_once
   @test.require.online
   def test_index_bz2(self):
-    self._test_index(BZ2)
+    self._test_index(Compression.BZ2)
 
   @test.require.only_run_once
   @test.require.online
   def test_index_lzma(self):
-    self._test_index(LZMA)
+    self._test_index(Compression.LZMA)
 
   def _test_index(self, compression):
     if compression and not compression.available:
diff --git a/test/settings.cfg b/test/settings.cfg
index 6f71a329..1bdb1a0a 100644
--- a/test/settings.cfg
+++ b/test/settings.cfg
@@ -239,10 +239,11 @@ test.unit_tests
 |test.unit.util.tor_tools.TestTorTools
 |test.unit.util.__init__.TestBaseUtil
 |test.unit.installation.TestInstallation
-|test.unit.descriptor.collector.TestCollector
 |test.unit.descriptor.descriptor.TestDescriptor
+|test.unit.descriptor.compression.TestCompression
 |test.unit.descriptor.export.TestExport
 |test.unit.descriptor.reader.TestDescriptorReader
+|test.unit.descriptor.collector.TestCollector
 |test.unit.descriptor.remote.TestDescriptorDownloader
 |test.unit.descriptor.server_descriptor.TestServerDescriptor
 |test.unit.descriptor.extrainfo_descriptor.TestExtraInfoDescriptor
diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py
index a0f8cb2e..b2b464ce 100644
--- a/test/unit/descriptor/collector.py
+++ b/test/unit/descriptor/collector.py
@@ -7,7 +7,8 @@ import unittest
 
 import stem.prereq
 
-from stem.descriptor.collector import GZIP, BZ2, LZMA, CollecTor, url
+from stem.descriptor import Compression
+from stem.descriptor.collector import CollecTor, url
 
 try:
   # added in python 3.3
@@ -22,9 +23,9 @@ class TestCollector(unittest.TestCase):
   def test_url(self):
     self.assertEqual('https://collector.torproject.org/index/index.json', url('index'))
     self.assertEqual('https://collector.torproject.org/index/index.json', url('index', compression = None))
-    self.assertEqual('https://collector.torproject.org/index/index.json.gz', url('index', compression = GZIP))
-    self.assertEqual('https://collector.torproject.org/index/index.json.bz2', url('index', compression = BZ2))
-    self.assertEqual('https://collector.torproject.org/index/index.json.xz', url('index', compression = LZMA))
+    self.assertEqual('https://collector.torproject.org/index/index.json.gz', url('index', compression = Compression.GZIP))
+    self.assertEqual('https://collector.torproject.org/index/index.json.bz2', url('index', compression = Compression.BZ2))
+    self.assertEqual('https://collector.torproject.org/index/index.json.xz', url('index', compression = Compression.LZMA))
 
   @patch(URL_OPEN, Mock(return_value = io.BytesIO(b'{"index_created":"2017-12-25 21:06","build_revision":"56a303e","path":"https://collector.torproject.org"}')))
   def test_index(self):
@@ -47,7 +48,7 @@ class TestCollector(unittest.TestCase):
       self.assertRaisesRegexp(ValueError, 'No JSON object could be decoded', collector.index)
 
   def test_index_malformed_compression(self):
-    for compression in (GZIP, BZ2, LZMA):
+    for compression in (Compression.GZIP, Compression.BZ2, Compression.LZMA):
       with patch(URL_OPEN, Mock(return_value = io.BytesIO(b'not compressed'))):
         collector = CollecTor(compression = compression)
         self.assertRaisesRegexp(IOError, 'Unable to decompress response as %s' % compression, collector.index)
diff --git a/test/unit/descriptor/compression.py b/test/unit/descriptor/compression.py
new file mode 100644
index 00000000..3945bc9c
--- /dev/null
+++ b/test/unit/descriptor/compression.py
@@ -0,0 +1,39 @@
+"""
+Unit tests for stem.descriptor.Compression.
+"""
+
+import unittest
+
+from stem.descriptor import Compression
+
+from test.unit.descriptor import get_resource
+
+
+class TestCompression(unittest.TestCase):
+  def test_decompress_plaintext(self):
+    self._check_file(Compression.PLAINTEXT, 'compressed_identity')
+
+  def test_decompress_gzip(self):
+    self._check_file(Compression.GZIP, 'compressed_gzip')
+
+  def test_decompress_bz2(self):
+    self._check_file(Compression.BZ2, 'compressed_bz2')
+
+  def test_decompress_lzma(self):
+    self._check_file(Compression.LZMA, 'compressed_lzma')
+
+  def test_decompress_zstd(self):
+    self._check_file(Compression.ZSTD, 'compressed_zstd')
+
+  def _check_file(self, compression, filename):
+    """
+    Decompress one of our 'compressed_*' server descriptors.
+    """
+
+    if not compression.available:
+      self.skipTest('(%s unavailable)' % compression)
+      return
+
+    with open(get_resource(filename), 'rb') as compressed_file:
+      content = compression.decompress(compressed_file.read())
+      self.assertTrue(content.startswith(b'router moria1 128.31.0.34 9101 0 9131'))
diff --git a/test/unit/descriptor/data/compressed_bz2 b/test/unit/descriptor/data/compressed_bz2
new file mode 100644
index 00000000..4d645a71
Binary files /dev/null and b/test/unit/descriptor/data/compressed_bz2 differ