[tor-commits] [stem/master] Rewriting reader documentation

Mon Mar 26 00:10:01 UTC 2012

commit 02d51e37e54aeefc77d906792d12e1bb711b89c5
Author: Damian Johnson <atagar at torproject.org>
Date:   Fri Mar 9 19:51:38 2012 -0800

    Rewriting reader documentation
    
    Moving all of the examples and substantial documentation up into the header,
    and slightly revising how I intend for the module to be used. Not all of the
    methods listed in the header have been added yet.
---
 stem/descriptor/reader.py |  112 +++++++++++++++++++++++++++------------------
 1 files changed, 67 insertions(+), 45 deletions(-)

diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index a3ad418..b0c8f9a 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -1,7 +1,8 @@
 """
-Reads descriptors from local directories and archives.
+Utilities for reading descriptors from local directories and archives. This is
+mostly done through the DescriptorReader class, which is an iterator for the
+descriptor data in a series of destinations. For example...
 
-Example:
   my_descriptors = [
     "/tmp/server-descriptors-2012-03.tar.bz2",
     "/tmp/archived_descriptors/",
@@ -9,10 +10,53 @@ Example:
   
   reader = DescriptorReader(my_descriptors)
   
+  # prints the contents of all the descriptor files
   with reader:
-    # prints all of the descriptor contents
     for descriptor in reader:
       print descriptor
+
+This ignores files that cannot be processed due to read errors or unparsable
+content. To be notified of skipped files you can register a listener with
+register_skip_listener().
+
+The DescriptorReader keeps track of the last modified timestamps for descriptor
+files that it has read so it can skip unchanged files if run again. This
+listing of processed files can also be persisted and applied to other
+DescriptorReaders. For instance, the following prints descriptors as they're
+changed over the course of a minute, and picks up where it left off if run
+again...
+
+  reader = DescriptorReader(["/tmp/descriptor_data"])
+  
+  try:
+    processed_files = load_processed_files("/tmp/used_descriptors")
+    reader.set_processed_files(processed_files)
+  except: pass # could not load, maybe this is the first run
+  
+  with reader:
+    start_time = time.time()
+    
+    while time.time() - start_time < 60:
+      # prints any descriptors that have changed since last checked
+      for descriptor in reader:
+        print descriptor
+      
+      time.sleep(1)
+  
+  save_processed_files(reader.get_processed_files(), "/tmp/used_descriptors")
+
+
+load_processed_files - Loads a listing of processed files.
+save_processed_files - Saves a listing of processed files.
+
+DescriptorReader - Iterator for descriptor data on the local file system.
+  |- get_processed_files - provides the listing of files that we've processed
+  |- set_processed_files - sets our tracking of the files we have processed
+  |- start - begins reading descriptor data
+  |- stop - stops reading descriptor data
+  |- join - joins on the thread used to process descriptor data
+  |- __enter__ / __exit__ - manages the descriptor reader thread in the context
+  +- __iter__ - iterates over descriptor data in unread files
 """
 
 import os
@@ -29,7 +73,7 @@ def load_processed_files(path):
     path (str) - location to load the processed files dictionary from
   
   Returns:
-    dict of 'path (str) => last modified timestamp (int)' mappings
+    dict of 'path (str) => last modified unix timestamp (int)' mappings
   
   Raises:
     IOError if unable to read the file
@@ -90,32 +134,6 @@ class DescriptorReader(threading.Thread):
   """
   Iterator for the descriptor data on the local file system. This can process
   text files, tarball archives (gzip or bzip2), or recurse directories.
-  
-  This keeps track the last modified timestamps for descriptor files we have
-  used, and if you call restart() then this will only provide descriptors from
-  new files or files that have changed since them.
-  
-  You can also save this listing of processed files and later apply it another
-  DescriptorReader. For instance, to only print the descriptors that have
-  changed since the last ran...
-  
-    reader = DescriptorReader(["/tmp/descriptor_data"])
-    
-    try:
-      processed_files = load_processed_files("/tmp/used_descriptors")
-      reader.set_processed_files(processed_files)
-    except: pass # could not load, mabye this is the first run
-    
-    # only prints descriptors that have changed since we last ran
-    with reader:
-      for descriptor in reader:
-        print descriptor
-    
-    save_processed_files(reader.get_processed_files(), "/tmp/used_descriptors")
-  
-  This ignores files that cannot be processed (either due to read errors or
-  because they don't contain descriptor data). The caller can be notified of
-  files that are skipped by restering a listener with register_skip_listener().
   """
   
   def __init__(self, targets):
@@ -124,19 +142,12 @@ class DescriptorReader(threading.Thread):
     self.processed_files = {}
     self._stop_event = threading.Event()
   
-  def stop(self):
-    """
-    Stops further reading of descriptors.
-    """
-    
-    self._stop_event.set()
-  
   def get_processed_files(self):
     """
-    For each file we have provided descriptor data for this provides a mapping
-    of the form...
+    For each file that we have read descriptor data from, this provides a
+    mapping of the form...
     
-    absolute_path (str) => modified_time (int)
+    absolute path (str) => last modified unix timestamp (int)
     
     This includes entries set through the set_processed_files() method.
     
@@ -149,17 +160,16 @@ class DescriptorReader(threading.Thread):
   
   def set_processed_files(self, processed_files):
     """
-    Appends a dictionary of 'path => modified timestamp' mappings to our
-    listing of processed files. With the get_processed_files() method this can
-    be used to skip descriptors that we have already read. For instance...
-    
+    Sets the listing of the files we have processed. Most often this is useful
+    as a method for pre-populating the listing of descriptor files that we have
+    seen.
     
     Arguments:
       processed_files (dict) - mapping of absolute paths (str) to unix
                                timestamps for the last modified time (int)
     """
     
-    self.processed_files.update(processed_files)
+    self.processed_files = dict(processed_files)
   
   def register_skip_listener(self, listener):
     """
@@ -176,6 +186,13 @@ class DescriptorReader(threading.Thread):
     
     self.skip_listeners.append(listener)
   
+  def stop(self):
+    """
+    Stops further reading of descriptor files.
+    """
+    
+    self._stop_event.set()
+  
   def run(self):
     # os.walk(path, followlinks = True)
     #
@@ -184,6 +201,11 @@ class DescriptorReader(threading.Thread):
     #
     # >>> mimetypes.guess_type("/home/atagar/Desktop/server-descriptors-2012-03.tar.bz2")
     # ('application/x-tar', 'bzip2')
+    #
+    # This only checks the file extension. To actually check the content (like
+    # the 'file' command) an option would be pymagic...
+    # https://github.com/cloudburst/pymagic
+    
     
     while not self._stop_event.isSet():
       pass # TODO: implement