[tor-commits] [stem/master] Adjust CollecTor File class
atagar at torproject.org
atagar at torproject.org
Sat Aug 17 20:44:27 UTC 2019
commit 4c744badc658d3d93d113972bfbf7cb463298ee4
Author: Damian Johnson <atagar at torproject.org>
Date: Mon Jul 29 17:35:32 2019 -0700
Adjust CollecTor File class
A handful of very small adjustments: dropping the unused tar attribute, making
descriptor type guessing a static function (like the other File._guess_*
helpers), fixing some minor edge cases, etc.
---
stem/descriptor/collector.py | 51 ++++++------
test/unit/descriptor/collector.py | 161 ++++++++++++++++----------------------
2 files changed, 92 insertions(+), 120 deletions(-)
diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index d1c90e0e..f76fa225 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -157,7 +157,6 @@ class File(object):
:var str path: file path within collector
:var stem.descriptor.Compression compression: file compression, **None** if
this cannot be determined
- :var bool tar: **True** if a tarball, **False** otherwise
:var int size: size of the file
:var datetime start: beginning of the time range descriptors are for,
@@ -170,13 +169,12 @@ class File(object):
def __init__(self, path, size, last_modified):
self.path = path
self.compression = File._guess_compression(path)
- self.tar = path.endswith('.tar') or '.tar.' in path
self.size = size
self.start, self.end = File._guess_time_range(path)
self.last_modified = datetime.datetime.strptime(last_modified, '%Y-%m-%d %H:%M')
- self._guessed_type = None
+ self._guessed_type = File._guess_descriptor_types(path)
self._downloaded_to = None # location we last downloaded to
def read(self, directory = None, descriptor_type = None, timeout = None, retries = 3):
@@ -220,21 +218,21 @@ class File(object):
"""
if descriptor_type is None:
- descriptor_types = self._guess_descriptor_types()
-
- if not descriptor_types:
+ if not self._guessed_type:
raise ValueError("Unable to determine this file's descriptor type")
- elif len(descriptor_types) > 1:
- raise ValueError("Unable to determine disambiguate file's descriptor type from %s" % ', '.join(descriptor_types))
+ elif len(self._guessed_type) > 1:
+ raise ValueError("Unable to determine disambiguate file's descriptor type from %s" % ', '.join(self._guessed_type))
- descriptor_type = descriptor_types[0]
+ descriptor_type = self._guessed_type[0]
if directory is None:
if self._downloaded_to and os.path.exists(self._downloaded_to):
directory = os.path.dirname(self._downloaded_to)
else:
with tempfile.TemporaryDirectory() as tmp_directory:
- return self.read(tmp_directory, timeout, retries)
+ return self.read(tmp_directory, descriptor_type, timeout, retries)
+
+ # TODO: the following will not work if the tar contains multiple types or a type we do not support
path = self.download(directory, True, timeout, retries)
return parse_file(path, descriptor_type)
@@ -267,7 +265,7 @@ class File(object):
filename = self.path.split('/')[-1]
- if decompress:
+ if self.compression != Compression.PLAINTEXT and decompress:
filename = filename.rsplit('.', 1)[0]
path = os.path.join(directory, filename)
@@ -277,36 +275,35 @@ class File(object):
elif os.path.exists(path):
return path # file already exists
- with open(path, 'wb') as output_file:
- response = _download(COLLECTOR_URL + self.path, timeout, retries)
+ response = _download(COLLECTOR_URL + self.path, timeout, retries)
- if decompress:
- response = self.compression.decompress(response)
+ if decompress:
+ response = self.compression.decompress(response)
+ with open(path, 'wb') as output_file:
output_file.write(response)
self._downloaded_to = path
return path
- def _guess_descriptor_types(self):
+ @staticmethod
+ def _guess_descriptor_types(path):
"""
Descriptor @type this file is expected to have based on its path. If unable
to determine any this tuple is empty.
- :returns: **tuple** with the descriptor types this file is expected to have
- """
+ Hopefully this will be replaced with an explicit value in the future:
- if self._guessed_type is None:
- guessed_type = ()
+ https://trac.torproject.org/projects/tor/ticket/31204
- for path_prefix, types in COLLECTOR_DESC_TYPES.items():
- if self.path.startswith(path_prefix):
- guessed_type = (types,) if isinstance(types, str) else types
- break
+ :returns: **tuple** with the descriptor types this file is expected to have
+ """
- self._guessed_type = guessed_type
+ for path_prefix, types in COLLECTOR_DESC_TYPES.items():
+ if path.startswith(path_prefix):
+ return (types,) if isinstance(types, str) else types
- return self._guessed_type
+ return ()
@staticmethod
def _guess_compression(path):
@@ -437,7 +434,7 @@ class CollecTor(object):
elif end and (f.end is None or f.end > end):
continue
- if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in f._guess_descriptor_types()]):
+ if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in f._guessed_type]):
matches.append(f)
return matches
diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py
index ad0087dd..77c1c460 100644
--- a/test/unit/descriptor/collector.py
+++ b/test/unit/descriptor/collector.py
@@ -21,68 +21,105 @@ except ImportError:
URL_OPEN = 'urllib.request.urlopen' if stem.prereq.is_python_3() else 'urllib2.urlopen'
-MINIMAL_INDEX = {
- 'index_created': '2017-12-25 21:06',
- 'build_revision': '56a303e',
- 'path': 'https://collector.torproject.org'
-}
-
-MINIMAL_INDEX_JSON = b'{"index_created":"2017-12-25 21:06","build_revision":"56a303e","path":"https://collector.torproject.org"}'
with open(get_resource('collector_index.json'), 'rb') as index_file:
- EXAMPLE_INDEX_CONTENT = index_file.read()
+ EXAMPLE_INDEX_JSON = index_file.read()
class TestCollector(unittest.TestCase):
+ # tests for the File class
+
+ def test_file_guess_descriptor_types(self):
+ test_values = {
+ 'archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz': ('bridge-extra-info 1.3',),
+ 'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz': ('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'),
+ 'archive/webstats/webstats-2015-03.tar': (),
+ 'archive/no_such_file.tar': (),
+ }
+
+ for path, expected in test_values.items():
+ self.assertEqual(expected, File._guess_descriptor_types(path))
+
+ def test_file_guess_compression(self):
+ test_values = {
+ 'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz': Compression.LZMA,
+ 'archive/webstats/webstats-2015-03.tar': Compression.PLAINTEXT,
+ 'recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos': Compression.PLAINTEXT,
+ }
+
+ for path, expected in test_values.items():
+ self.assertEqual(expected, File._guess_compression(path))
+
+ def test_file_guess_time_range(self):
+ test_values = {
+ 'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz':
+ (datetime.datetime(2014, 1, 1), datetime.datetime(2014, 2, 1)),
+ 'recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos':
+ (datetime.datetime(2019, 7, 3, 2, 5, 0), datetime.datetime(2019, 7, 3, 3, 5, 0)),
+ 'archive/relay-descriptors/certs.tar.xz':
+ (None, None),
+ 'archive/relay-descriptors/microdescs/microdescs-2014-12.tar.xz':
+ (datetime.datetime(2014, 12, 1), datetime.datetime(2015, 1, 1)),
+ 'recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos':
+ (datetime.datetime(2019, 7, 3, 23, 5, 0), datetime.datetime(2019, 7, 4, 0, 5, 0))
+ }
+
+ for path, (expected_start, expected_end) in test_values.items():
+ f = File(path, 7515396, '2014-02-07 03:59')
+ self.assertEqual(expected_start, f.start)
+ self.assertEqual(expected_end, f.end)
+
+ # tests for the CollecTor class
+
@patch(URL_OPEN)
- def test_download_plaintext(self, urlopen_mock):
- urlopen_mock.return_value = io.BytesIO(MINIMAL_INDEX_JSON)
+ def test_index_plaintext(self, urlopen_mock):
+ urlopen_mock.return_value = io.BytesIO(EXAMPLE_INDEX_JSON)
collector = CollecTor()
- self.assertEqual(MINIMAL_INDEX, collector.index(Compression.PLAINTEXT))
+ self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.PLAINTEXT))
urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json', timeout = None)
@patch(URL_OPEN)
- def test_download_gzip(self, urlopen_mock):
+ def test_index_gzip(self, urlopen_mock):
if not Compression.GZIP.available:
self.skipTest('(gzip compression unavailable)')
return
import zlib
- urlopen_mock.return_value = io.BytesIO(zlib.compress(MINIMAL_INDEX_JSON))
+ urlopen_mock.return_value = io.BytesIO(zlib.compress(EXAMPLE_INDEX_JSON))
collector = CollecTor()
- self.assertEqual(MINIMAL_INDEX, collector.index(Compression.GZIP))
+ self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.GZIP))
urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.gz', timeout = None)
@patch(URL_OPEN)
- def test_download_bz2(self, urlopen_mock):
+ def test_index_bz2(self, urlopen_mock):
if not Compression.BZ2.available:
self.skipTest('(bz2 compression unavailable)')
return
import bz2
- urlopen_mock.return_value = io.BytesIO(bz2.compress(MINIMAL_INDEX_JSON))
+ urlopen_mock.return_value = io.BytesIO(bz2.compress(EXAMPLE_INDEX_JSON))
collector = CollecTor()
- self.assertEqual(MINIMAL_INDEX, collector.index(Compression.BZ2))
+ self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.BZ2))
urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.bz2', timeout = None)
@patch(URL_OPEN)
- def test_download_lzma(self, urlopen_mock):
+ def test_index_lzma(self, urlopen_mock):
if not Compression.LZMA.available:
self.skipTest('(lzma compression unavailable)')
return
import lzma
- urlopen_mock.return_value = io.BytesIO(lzma.compress(MINIMAL_INDEX_JSON))
+ urlopen_mock.return_value = io.BytesIO(lzma.compress(EXAMPLE_INDEX_JSON))
collector = CollecTor()
- self.assertEqual(MINIMAL_INDEX, collector.index(Compression.LZMA))
+ self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.LZMA))
urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.xz', timeout = None)
@patch(URL_OPEN)
- def test_download_retries(self, urlopen_mock):
+ def test_index_retries(self, urlopen_mock):
urlopen_mock.side_effect = IOError('boom')
collector = CollecTor(retries = 0)
@@ -95,11 +132,6 @@ class TestCollector(unittest.TestCase):
self.assertRaisesRegexp(IOError, 'boom', collector.index)
self.assertEqual(5, urlopen_mock.call_count)
- @patch(URL_OPEN, Mock(return_value = io.BytesIO(MINIMAL_INDEX_JSON)))
- def test_index(self):
- collector = CollecTor()
- self.assertEqual(MINIMAL_INDEX, collector.index(Compression.PLAINTEXT))
-
@patch(URL_OPEN, Mock(return_value = io.BytesIO(b'not json')))
def test_index_malformed_json(self):
collector = CollecTor()
@@ -118,104 +150,47 @@ class TestCollector(unittest.TestCase):
collector = CollecTor()
self.assertRaisesRegexp(IOError, 'Failed to decompress as %s' % compression, collector.index, compression)
- @patch(URL_OPEN, Mock(return_value = io.BytesIO(EXAMPLE_INDEX_CONTENT)))
- def test_real_index(self):
- collector = CollecTor()
- self.assertEqual(EXAMPLE_INDEX, collector.index(compression = Compression.PLAINTEXT))
-
@patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX))
- def test_contents(self):
+ def test_files(self):
collector = CollecTor()
files = collector.files()
-
self.assertEqual(85, len(files))
- test_path = 'archive/relay-descriptors/extra-infos/extra-infos-2007-09.tar.xz'
- extrainfo_file = list(filter(lambda x: x.path == test_path, files))[0]
- self.assertEqual(test_path, extrainfo_file.path)
+ extrainfo_file = list(filter(lambda x: x.path.endswith('extra-infos-2007-09.tar.xz'), files))[0]
+ self.assertEqual('archive/relay-descriptors/extra-infos/extra-infos-2007-09.tar.xz', extrainfo_file.path)
self.assertEqual(Compression.LZMA, extrainfo_file.compression)
- self.assertEqual(True, extrainfo_file.tar)
self.assertEqual(6459884, extrainfo_file.size)
self.assertEqual(datetime.datetime(2016, 6, 23, 9, 54), extrainfo_file.last_modified)
- def test_file_compression_attributes(self):
- f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59')
- self.assertEqual(Compression.LZMA, f.compression)
- self.assertEqual(True, f.tar)
-
- f = File('archive/webstats/webstats-2015-03.tar', 20480, '2018-03-19 16:07')
- self.assertEqual(Compression.PLAINTEXT, f.compression)
- self.assertEqual(True, f.tar)
-
- f = File('recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos', 1162899, '2019-07-03 02:05')
- self.assertEqual(Compression.PLAINTEXT, f.compression)
- self.assertEqual(False, f.tar)
-
- def test_file_date_attributes(self):
- f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59')
- self.assertEqual(datetime.datetime(2014, 1, 1), f.start)
- self.assertEqual(datetime.datetime(2014, 2, 1), f.end)
-
- f = File('recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos', 1162899, '2019-07-03 02:05')
- self.assertEqual(datetime.datetime(2019, 7, 3, 2, 5, 0), f.start)
- self.assertEqual(datetime.datetime(2019, 7, 3, 3, 5, 0), f.end)
-
- f = File('archive/relay-descriptors/certs.tar.xz', 144696, '2019-07-03 03:29')
- self.assertEqual(None, f.start)
- self.assertEqual(None, f.end)
-
- # check date boundaries
-
- f = File('archive/relay-descriptors/microdescs/microdescs-2014-12.tar.xz', 7515396, '2014-02-07 03:59')
- self.assertEqual(datetime.datetime(2015, 1, 1), f.end)
-
- f = File('recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos', 1162899, '2019-07-03 02:05')
- self.assertEqual(datetime.datetime(2019, 7, 4, 0, 5, 0), f.end)
-
@patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX))
- def test_file_query_by_type(self):
+ def test_files_by_descriptor_type(self):
collector = CollecTor()
- expected = [
+ self.assertEqual([
'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz',
'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz',
'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors',
'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors',
'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors',
- ]
-
- self.assertEqual(expected, list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor'))))
+ ], [f.path for f in collector.files(descriptor_type = 'server-descriptor')])
@patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX))
- def test_file_query_by_date(self):
+ def test_file_by_date(self):
collector = CollecTor()
self.assertEqual([
'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors',
'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors',
'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors',
- ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2007, 1, 1)))))
+ ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2007, 1, 1))])
self.assertEqual([
'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz',
'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz',
'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
- ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', end = datetime.datetime(2007, 1, 1)))))
+ ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', end = datetime.datetime(2007, 1, 1))])
self.assertEqual([
'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
- ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2006, 2, 10), end = datetime.datetime(2007, 1, 1)))))
-
- def test_guess_descriptor_types(self):
- f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21')
- self.assertEqual(('bridge-extra-info 1.3',), f._guess_descriptor_types())
-
- f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59')
- self.assertEqual(('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'), f._guess_descriptor_types())
-
- f = File('archive/webstats/webstats-2015-03.tar', 20480, '2018-03-19 16:07')
- self.assertEqual((), f._guess_descriptor_types())
-
- f = File('archive/no_such_file.tar', 20480, '2018-03-19 16:07')
- self.assertEqual((), f._guess_descriptor_types())
+ ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2006, 2, 10), end = datetime.datetime(2007, 1, 1))])
More information about the tor-commits
mailing list