[tor-commits] [stem/master] Rewriting reader documentation
atagar at torproject.org
atagar at torproject.org
Mon Mar 26 00:10:01 UTC 2012
commit 02d51e37e54aeefc77d906792d12e1bb711b89c5
Author: Damian Johnson <atagar at torproject.org>
Date: Fri Mar 9 19:51:38 2012 -0800
Rewriting reader documentation
Moving all of the examples and substantial documentation up into the header,
and slightly revising how I intend for the module to be used. Not all of the
methods in the header have been added yet.
---
stem/descriptor/reader.py | 112 +++++++++++++++++++++++++++------------------
1 files changed, 67 insertions(+), 45 deletions(-)
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index a3ad418..b0c8f9a 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -1,7 +1,8 @@
"""
-Reads descriptors from local directories and archives.
+Utilities for reading descriptors from local directories and archives. This is
+mostly done through the DescriptorReader class, which is an iterator for the
+descriptor data in a series of destinations. For example...
-Example:
my_descriptors = [
"/tmp/server-descriptors-2012-03.tar.bz2",
"/tmp/archived_descriptors/",
@@ -9,10 +10,53 @@ Example:
reader = DescriptorReader(my_descriptors)
+ # prints the contents of all the descriptor files
with reader:
- # prints all of the descriptor contents
for descriptor in reader:
print descriptor
+
+This ignores files that cannot be processed due to read errors or unparsable
+content. To be notified of skipped files you can register a listener with
+register_skip_listener().
+
+The DescriptorReader keeps track of the last modified timestamps for descriptor
+files that it has read so it can skip unchanged files if run again. This
+listing of processed files can also be persisted and applied to other
+DescriptorReaders. For instance, the following prints descriptors as they're
+changed over the course of a minute, and picks up where it left off if run
+again...
+
+ reader = DescriptorReader(["/tmp/descriptor_data"])
+
+ try:
+ processed_files = load_processed_files("/tmp/used_descriptors")
+ reader.set_processed_files(processed_files)
+ except: pass # could not load, maybe this is the first run
+
+ with reader:
+ start_time = time.time()
+
+ while time.time() - start_time < 60:
+ # prints any descriptors that have changed since last checked
+ for descriptor in reader:
+ print descriptor
+
+ time.sleep(1)
+
+ save_processed_files(reader.get_processed_files(), "/tmp/used_descriptors")
+
+
+load_processed_files - Loads a listing of processed files.
+save_processed_files - Saves a listing of processed files.
+
+DescriptorReader - Iterator for descriptor data on the local file system.
+ |- get_processed_files - provides the listing of files that we've processed
+ |- set_processed_files - sets our tracking of the files we have processed
+ |- start - begins reading descriptor data
+ |- stop - stops reading descriptor data
+ |- join - joins on the thread used to process descriptor data
+ |- __enter__ / __exit__ - manages the descriptor reader thread in the context
+ +- __iter__ - iterates over descriptor data in unread files
"""
import os
@@ -29,7 +73,7 @@ def load_processed_files(path):
path (str) - location to load the processed files dictionary from
Returns:
- dict of 'path (str) => last modified timestamp (int)' mappings
+ dict of 'path (str) => last modified unix timestamp (int)' mappings
Raises:
IOError if unable to read the file
@@ -90,32 +134,6 @@ class DescriptorReader(threading.Thread):
"""
Iterator for the descriptor data on the local file system. This can process
text files, tarball archives (gzip or bzip2), or recurse directories.
-
- This keeps track the last modified timestamps for descriptor files we have
- used, and if you call restart() then this will only provide descriptors from
- new files or files that have changed since them.
-
- You can also save this listing of processed files and later apply it another
- DescriptorReader. For instance, to only print the descriptors that have
- changed since the last ran...
-
- reader = DescriptorReader(["/tmp/descriptor_data"])
-
- try:
- processed_files = load_processed_files("/tmp/used_descriptors")
- reader.set_processed_files(processed_files)
- except: pass # could not load, mabye this is the first run
-
- # only prints descriptors that have changed since we last ran
- with reader:
- for descriptor in reader:
- print descriptor
-
- save_processed_files(reader.get_processed_files(), "/tmp/used_descriptors")
-
- This ignores files that cannot be processed (either due to read errors or
- because they don't contain descriptor data). The caller can be notified of
- files that are skipped by restering a listener with register_skip_listener().
"""
def __init__(self, targets):
@@ -124,19 +142,12 @@ class DescriptorReader(threading.Thread):
self.processed_files = {}
self._stop_event = threading.Event()
- def stop(self):
- """
- Stops further reading of descriptors.
- """
-
- self._stop_event.set()
-
def get_processed_files(self):
"""
- For each file we have provided descriptor data for this provides a mapping
- of the form...
+ For each file that we have read descriptor data from this provides a
+ mapping of the form...
- absolute_path (str) => modified_time (int)
+ absolute path (str) => last modified unix timestamp (int)
This includes entries set through the set_processed_files() method.
@@ -149,17 +160,16 @@ class DescriptorReader(threading.Thread):
def set_processed_files(self, processed_files):
"""
- Appends a dictionary of 'path => modified timestamp' mappings to our
- listing of processed files. With the get_processed_files() method this can
- be used to skip descriptors that we have already read. For instance...
-
+ Sets the listing of the files we have processed. Most often this is useful
+ as a method for pre-populating the listing of descriptor files that we have
+ seen.
Arguments:
processed_files (dict) - mapping of absolute paths (str) to unix
timestamps for the last modified time (int)
"""
- self.processed_files.update(processed_files)
+ self.processed_files = dict(processed_files)
def register_skip_listener(self, listener):
"""
@@ -176,6 +186,13 @@ class DescriptorReader(threading.Thread):
self.skip_listeners.append(listener)
+ def stop(self):
+ """
+ Stops further reading of descriptor files.
+ """
+
+ self._stop_event.set()
+
def run(self):
# os.walk(path, followlinks = True)
#
@@ -184,6 +201,11 @@ class DescriptorReader(threading.Thread):
#
# >>> mimetypes.guess_type("/home/atagar/Desktop/server-descriptors-2012-03.tar.bz2")
# ('application/x-tar', 'bzip2')
+ #
+ # This only checks the file extension. To actually check the content (like
+ # the 'file' command) an option would be pymagic...
+ # https://github.com/cloudburst/pymagic
+
while not self._stop_event.isSet():
pass # TODO: implement
More information about the tor-commits
mailing list