[tor-commits] [bridgedb/master] Add bridgedb.parse.descriptors module.
isis at torproject.org
isis at torproject.org
Sat Mar 21 02:02:57 UTC 2015
commit 7869e4c7cd1e43f9354480f6cafba0794fd86433
Author: Isis Lovecruft <isis at torproject.org>
Date: Sun Jul 6 19:20:43 2014 +0000
Add bridgedb.parse.descriptors module.
This module implements parsing bridge descriptors with Stem.
* FIXES (partially) #9380.
---
lib/bridgedb/parse/descriptors.py | 151 +++++++++++++++++++++++++++++++++++++
1 file changed, 151 insertions(+)
diff --git a/lib/bridgedb/parse/descriptors.py b/lib/bridgedb/parse/descriptors.py
new file mode 100644
index 0000000..bedc0b1
--- /dev/null
+++ b/lib/bridgedb/parse/descriptors.py
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 ; test-case-name: bridgedb.test.test_parse_descriptors ; -*-
+#_____________________________________________________________________________
+#
+# This file is part of BridgeDB, a Tor bridge distribution system.
+#
+# :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis at torproject.org>
+# please also see AUTHORS file
+# :copyright: (c) 2007-2014, The Tor Project, Inc.
+# (c) 2014, Isis Lovecruft
+# :license: see LICENSE for licensing information
+#_____________________________________________________________________________
+
+from __future__ import print_function
+
+import datetime
+import logging
+
+from stem.descriptor import extrainfo_descriptor
+from stem.descriptor import networkstatus
+from stem.descriptor import server_descriptor
+from stem.descriptor import parse_file
+
+from bridgedb import safelog
+
+
+def parseNetworkStatusFile(filename, validate=True):
+    """Parse a file which contains an ``@type bridge-networkstatus`` document.
+
+    The file is read in full. Unless its contents already begin with a
+    ``published`` line, a ``published <timestamp>`` line built from the
+    current time is prepended before the text is handed to Stem.
+
+    :param str filename: The path of the networkstatus file to parse.
+    :param bool validate: Passed through as the ``validate`` argument of
+        :api:`stem.descriptor.networkstatus.BridgeNetworkStatusDocument`.
+        (default: ``True``)
+    :returns: The
+        :api:`stem.descriptor.networkstatus.BridgeNetworkStatusDocument`
+        constructed from the (possibly amended) file contents.
+    """
+    logging.info("Parsing networkstatus entries with Stem: %s" % filename)
+
+    # A context manager guarantees the handle is closed even if read() raises.
+    with open(filename) as fh:
+        descriptors = fh.read()
+
+    # See ticket #12254 for why networkstatus-bridges documents don't look
+    # anything like the networkstatus v2 documents that they are purported to
+    # look like. They are missing all headers, and the entire footer including
+    # authority signatures.
+    #
+    # https://trac.torproject.org/projects/tor/ticket/12254
+    #
+    # As such, they do not currently start with a "published" line with an
+    # ISO8601 timestamp, as stem expects them to:
+    #
+    if not descriptors.startswith("published"):
+        # NOTE(review): datetime.now() is naive local time; descriptor
+        # timestamps are presumably expected in UTC -- confirm before changing.
+        precise = datetime.datetime.now().isoformat(sep=chr(0x20))
+        # Drop the fractional seconds, leaving "YYYY-MM-DD HH:MM:SS".
+        timestamp = precise.rsplit('.', 1)[0]
+        descriptors = "published {t}\n{d}".format(t=timestamp, d=descriptors)
+    else:
+        # logging.warn is a deprecated alias; logging.warning is equivalent.
+        logging.warning(
+            ("Networkstatus file '%s' started with 'published' line! Please "
+             "revise this function!") % filename)
+
+    routers = networkstatus.BridgeNetworkStatusDocument(descriptors,
+                                                        validate=validate)
+    return routers
+
+def parseServerDescriptorsFile(filename, validate=False):
+    """Parse a file which contains ``@type bridge-server-descriptor``s.
+
+    .. note:: ``validate`` defaults to ``False`` because there appears to be a
+        bug in Leekspin, the fake descriptor generator, where Stem thinks the
+        fingerprint doesn't match the key…
+
+    .. note:: We have to lie to Stem, pretending that these are ``@type
+        server-descriptor``s, **not** ``@type bridge-server-descriptor``s.
+        See ticket `#11257`_.
+
+    .. _`#11257`: https://trac.torproject.org/projects/tor/ticket/11257
+
+    :param str filename: The file to parse descriptors from.
+    :param bool validate: Whether or not to validate descriptor
+        contents. (default: ``False``)
+    :rtype: list
+    :returns: A list of
+        :api:`stem.descriptor.server_descriptor.RelayDescriptor`s.
+    """
+    logging.info("Parsing server descriptors with Stem: %s" % filename)
+
+    # Stem is handed the plain (non-bridge) descriptor type on purpose; see
+    # the second note in the docstring.
+    parsed = parse_file(filename, 'server-descriptor 1.0', validate=validate)
+
+    # Exhaust whatever parse_file() returned into a concrete list.
+    return list(parsed)
+
+def deduplicate(descriptors):
+    """Keep only the most recently published descriptor per fingerprint.
+
+    Descriptors are grouped by their ``fingerprint`` attribute. Within a
+    group, the one with the greatest ``published`` value is kept; when two
+    share the same ``published`` value, the one seen first is kept and a
+    warning is logged. The input is only read, never modified.
+
+    :param descriptors: An iterable of descriptor objects, each having
+        ``fingerprint`` and ``published`` attributes.
+    :rtype: list
+    :returns: The surviving descriptors, one per fingerprint, ordered by the
+        first appearance of each fingerprint in ``descriptors``.
+    """
+    duplicates = []
+    newest = {}      # fingerprint -> descriptor currently considered newest
+    firstSeen = []   # fingerprints in order of first appearance
+
+    # Single pass keyed on fingerprint. The previous implementation popped
+    # from ``descriptors`` while iterating over it, which skipped entries and
+    # could emit the same surviving descriptor more than once.
+    for router in descriptors:
+        fingerprint = router.fingerprint
+
+        logging.debug("Deduplicating %s descriptor for router %s"
+                      % (router.__class__.__name__,
+                         safelog.logSafely(fingerprint)))
+
+        if fingerprint not in newest:
+            newest[fingerprint] = router
+            firstSeen.append(fingerprint)
+            continue
+
+        kept = newest[fingerprint]
+        logging.warning("Duplicate extra-info descriptor for router %s"
+                        % safelog.logSafely(fingerprint))
+
+        if router.published > kept.published:
+            # The new descriptor is newer than the one we were keeping, so
+            # the previously kept one becomes the duplicate:
+            duplicates.append(kept)
+            newest[fingerprint] = router
+        else:
+            # The new descriptor is older (or no newer), so discard it:
+            duplicates.append(router)
+            if router.published == kept.published:
+                logging.warning(("Duplicate descriptor and original "
+                                 "descriptor for router %s both had the same "
+                                 "timestamp: %s")
+                                % (safelog.logSafely(fingerprint),
+                                   kept.published))
+
+    nonDuplicates = [newest[fingerprint] for fingerprint in firstSeen]
+
+    logging.info("Descriptor deduplication finished.")
+    logging.info("Number of duplicates: %d" % len(duplicates))
+    logging.info("Number of non-duplicates: %d" % len(nonDuplicates))
+    return nonDuplicates
+
+
+def parseBridgeExtraInfoFiles(*filenames, **kwargs):
+    """Parse files which contain ``@type bridge-extrainfo-descriptor``s.
+
+    Every file is parsed with Stem's ``parse_file()`` as descriptor type
+    ``bridge-extra-info 1.1``. The descriptors from all files are pooled and
+    the pool is passed through :func:`deduplicate`.
+
+    :param filenames: The files to parse descriptors from.
+    :kwargs: If there is a ``'validate'`` keyword argument and its value is
+        exactly ``True``, validation is requested from ``parse_file()``;
+        any other value, or its absence, leaves validation off.
+    :returns: Whatever :func:`deduplicate` returns for the pooled
+        descriptors.
+    """
+    descriptorType = 'bridge-extra-info 1.1'
+
+    # Only the literal ``True`` enables validation; other truthy values
+    # do not.
+    validate = kwargs.get('validate') is True
+
+    pooled = []
+    for filename in filenames:
+        logging.info("Parsing %s descriptors with Stem: %s"
+                     % (descriptorType, filename))
+        pooled.extend(parse_file(filename, descriptorType, validate=validate))
+
+    return deduplicate(pooled)
More information about the tor-commits
mailing list