[tor-commits] [bridgedb/master] Support handing out decoy bridges to bots.

phw at torproject.org phw at torproject.org
Tue Aug 20 16:56:57 UTC 2019


commit 7ceb25e306a5af456c4a4ba1f1f5b2a72d6eb77c
Author: Philipp Winter <phw at nymity.ch>
Date:   Wed Aug 14 15:00:59 2019 -0700

    Support handing out decoy bridges to bots.
    
    This patch makes it possible to identify bots by inspecting HTTP request
    headers.  A CSV file, specified by BLACKLISTED_REQUEST_HEADERS_FILE,
    maps each request header to a regular expression that is matched
    against the header's value, e.g.:
    
      Accept-Language,[Kk]lingon
      User-Agent,Spa+ce
      ...
    
    Once a regular expression matches a client's request, we probably caught
    a bot.  This patch also makes it possible to respond to bot requests
    with a decoy bridge, e.g., to study what the owners of the bot intend to
    do with the bridge.  Decoy bridges are configured in the CSV file
    DECOY_BRIDGES_FILE.  The file maps a transport type and its IP address
    version to bridge lines, e.g.:
    
      vanillav4,1.2.3.4:1234 FINGERPRINT
      obfs4v4,obfs4 1.2.3.4:1234 FINGERPRINT ARGS
      ...
    
    This fixes <https://bugs.torproject.org/31252>
---
 CHANGELOG                             |   9 +++
 bridgedb.conf                         |  19 ++++++
 bridgedb/antibot.py                   | 123 ++++++++++++++++++++++++++++++++++
 bridgedb/distributors/https/server.py |  10 +++
 bridgedb/distributors/moat/server.py  |   6 ++
 bridgedb/main.py                      |   6 ++
 bridgedb/test/test_antibot.py         | 108 +++++++++++++++++++++++++++++
 7 files changed, 281 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 32e6fe5..03390d6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -16,6 +16,15 @@ Changes in version 0.8.0 - YYYY-MM-DD
         Use stem instead of leekspin to create test descriptors.  We now don't
         need to depend on leekspin anymore.
 
+        * FIXES #31252 https://bugs.torproject.org/31252
+        Add an anti-bot mechanism that allows us to detect bots by matching
+        HTTP request headers against blacklisted patterns.  For example, bots
+        may have their Accept-Language set to "Klingon".  Blacklisted patterns
+        are configured in BLACKLISTED_REQUEST_HEADERS_FILE.  When BridgeDB
+        detects a bot request, we can answer it with a decoy bridge that's
+        only handed out to bots.  Decoy bridges are configured in
+        DECOY_BRIDGES_FILE.
+
 Changes in version 0.7.1 - 2019-06-07
 
         * FIXES #28496 https://bugs.torproject.org/28496
diff --git a/bridgedb.conf b/bridgedb.conf
index ba43bb6..a0e00a8 100644
--- a/bridgedb.conf
+++ b/bridgedb.conf
@@ -301,6 +301,25 @@ PROBING_RESISTANT_TRANSPORTS = ['scramblesuit', 'obfs4']
 # menu).
 DEFAULT_TRANSPORT = 'obfs4'
 
+# HTTP headers that suggest that a request was issued by a bot.  The CSV
+# file must have the following format:
+#   <HEADER>,<REGEXP>
+#   ...
+# For example:
+#   Accept-Language,[Kk]lingon
+BLACKLISTED_REQUEST_HEADERS_FILE="blacklisted-request-headers.csv"
+
+# Decoy bridges that we are handing out to bots that we detected using the
+# regular expressions in BLACKLISTED_REQUEST_HEADERS_FILE.  The CSV file must
+# have the following format:
+#   <TRANSPORT>v<IP_VERSION>,<BRIDGE_LINE>
+#   ...
+# For example:
+#   vanillav4,1.2.3.4:1234 0123456789ABCDEF0123456789ABCDEF01234567
+#   vanillav6,[::1]:1234 0123456789ABCDEF0123456789ABCDEF01234567
+#   obfs4v4,obfs4 1.2.3.4:1234 public-key=... node-id=... iat-mode=...
+DECOY_BRIDGES_FILE="decoy-bridges.csv"
+
 #-------------------------------
 # Moat Distribution Options  \
 #------------------------------------------------------------------------------
diff --git a/bridgedb/antibot.py b/bridgedb/antibot.py
new file mode 100644
index 0000000..e724c68
--- /dev/null
+++ b/bridgedb/antibot.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 ; test-case-name: bridgedb.test.test_antibot ; -*-
+# _____________________________________________________________________________
+#
+# This file is part of BridgeDB, a Tor bridge distribution system.
+#
+# :authors: please see included AUTHORS file
+# :copyright: (c) 2019, The Tor Project, Inc.
+#             (c) 2019, Philipp Winter
+# :license: see LICENSE for licensing information
+# _____________________________________________________________________________
+
+"""Functions for dealing with bot requests."""
+
+import re
+import logging
+
+# Maps transport types and IP version (e.g., "obfs4v4", "vanillav4", or
+# "vanillav6") to bridge lines (e.g., "1.2.3.4:1234 ...").
+DECOY_BRIDGES = {}
+
+# Maps HTTP request headers (e.g., "Accept-Language") to regular expressions
+# that suggest that the request was issued by a bot (e.g., "[Kk]lingon").
+BLACKLISTED_REQUEST_HEADERS = {}
+
+
+def _loadCSV(filename):
+    """Load and return the content of the given CSV file.
+
+    :param str filename: The filename to read.
+    :rtype: dict
+    :returns: A dictionary mapping keys (first column) to values (second
+        column).
+    """
+
+    csv = dict()
+    try:
+        with open(filename) as fh:
+            for line in fh.readlines():
+                if line.count(",") != 1:
+                    logging.warning("Line must have exactly one comma: %s" %
+                                    line)
+                    continue
+                key, value = line.split(",")
+                csv[key.strip()] = value.strip()
+    except IOError as err:
+        logging.warning("I/O error while reading from file %s: %s" %
+                        (filename, err))
+
+    return csv
+
+
+def loadBlacklistedRequestHeaders(filename):
+    """Load and globally set a dictionary of blacklisted request headers.
+
+    :param str filename: The filename to read.
+    """
+
+    content = _loadCSV(filename)
+    blacklisted = dict()
+    # Turn dictionary values into compiled regular expressions.
+    for header, regexp in content.items():
+        try:
+            blacklisted[header] = re.compile(regexp)
+        except Exception as err:
+            logging.warning("Skipping regexp %s because we couldn't compile "
+                            "it: %s" % (regexp, err))
+
+    global BLACKLISTED_REQUEST_HEADERS
+    BLACKLISTED_REQUEST_HEADERS = blacklisted
+
+
+def loadDecoyBridges(filename):
+    """Load and globally set a dictionary of decoy bridges.
+
+    :param str filename: The filename to read.
+    """
+
+    d = _loadCSV(filename)
+    # Turn our bridge lines (which are strings) into lists.
+    decoyBridges = {ttype: [line] for ttype, line in d.items()}
+
+    global DECOY_BRIDGES
+    DECOY_BRIDGES = decoyBridges
+
+
+def getDecoyBridge(transport, ipVersion):
+    """Return a decoy bridge or, if none is available, None.
+
+    :param str transport: The desired transport, e.g., "vanilla" or "obfs4".
+    :param int ipVersion: The IP version, which must be either 4 or 6.
+    :rtype: list
+    :returns: Return a list of bridge lines or, if we don't have any, None.
+    """
+
+    if ipVersion not in [4, 6]:
+        return None
+
+    logging.info("Returning IPv%d decoy bridge for transport %s." %
+                 (ipVersion, transport))
+    return DECOY_BRIDGES.get("%sv%d" % (transport, ipVersion), None)
+
+
+def isRequestFromBot(request):
+    """Determine if the given request is coming from a bot.
+
+    :type request: :api:`twisted.web.http.Request`
+    :param request: A ``Request`` object whose headers are checked against
+        the patterns in ``BLACKLISTED_REQUEST_HEADERS``.
+    :rtype: bool
+    :returns: True if the request is coming from a bot and False otherwise.
+    """
+
+    for header, badRegexp in BLACKLISTED_REQUEST_HEADERS.items():
+        value = request.getHeader(header)
+        if value is None:
+            continue
+
+        if badRegexp.search(value) is not None:
+            logging.info("Found bot request. Headers: %s" %
+                         request.requestHeaders)
+            return True
+
+    return False
diff --git a/bridgedb/distributors/https/server.py b/bridgedb/distributors/https/server.py
index 732f8bf..e5df7da 100644
--- a/bridgedb/distributors/https/server.py
+++ b/bridgedb/distributors/https/server.py
@@ -53,6 +53,7 @@ from bridgedb import strings
 from bridgedb import translations
 from bridgedb import txrecaptcha
 from bridgedb import metrics
+from bridgedb import antibot
 from bridgedb.distributors.common.http import setFQDN
 from bridgedb.distributors.common.http import getFQDN
 from bridgedb.distributors.common.http import getClientIP
@@ -916,6 +917,15 @@ class BridgesResource(CustomErrorHandlingResource, CSPResource):
             bridgeLines = [replaceControlChars(bridge.getBridgeLine(
                 bridgeRequest, self.includeFingerprints)) for bridge in bridges]
 
+            if antibot.isRequestFromBot(request):
+                transports = bridgeRequest.transports
+                # Return either a decoy bridge or no bridge.
+                if len(transports) > 2:
+                    logging.warning("More than one transport requested")
+                    return self.renderAnswer(request)
+                ttype = "vanilla" if len(transports) == 0 else transports[0]
+                return self.renderAnswer(request, antibot.getDecoyBridge(ttype, bridgeRequest.ipVersion))
+
         return self.renderAnswer(request, bridgeLines)
 
     def getResponseFormat(self, request):
diff --git a/bridgedb/distributors/moat/server.py b/bridgedb/distributors/moat/server.py
index 73d2423..10096e7 100644
--- a/bridgedb/distributors/moat/server.py
+++ b/bridgedb/distributors/moat/server.py
@@ -41,6 +41,7 @@ from twisted.web.server import Site
 from bridgedb import metrics
 from bridgedb import captcha
 from bridgedb import crypto
+from bridgedb import antibot
 from bridgedb.distributors.common.http import setFQDN
 from bridgedb.distributors.common.http import getFQDN
 from bridgedb.distributors.common.http import getClientIP
@@ -735,6 +736,11 @@ class CaptchaCheckResource(CaptchaResource):
                 logging.warn(("Not enough bridges of the type specified to "
                               "fulfill the following request: %s") % bridgeRequest)
 
+            if antibot.isRequestFromBot(request):
+                ttype = transport or "vanilla"
+                bridgeLines = antibot.getDecoyBridge(ttype,
+                                                     bridgeRequest.ipVersion)
+
             # If we have no bridges at all to give to the client, then
             # return a JSON API 404 error.
             if not bridgeLines:
diff --git a/bridgedb/main.py b/bridgedb/main.py
index 5d9b0c6..94f4921 100644
--- a/bridgedb/main.py
+++ b/bridgedb/main.py
@@ -26,6 +26,7 @@ from bridgedb import proxy
 from bridgedb import runner
 from bridgedb import util
 from bridgedb import metrics
+from bridgedb import antibot
 from bridgedb.bridges import MalformedBridgeInfo
 from bridgedb.bridges import MissingServerDescriptorDigest
 from bridgedb.bridges import ServerDescriptorDigestMismatch
@@ -417,6 +418,11 @@ def run(options, reactor=reactor):
             proxy.loadProxiesFromFile(proxyfile, proxies, removeStale=True)
         metrics.setProxies(proxies)
 
+        logging.info("Reloading blacklisted request headers...")
+        antibot.loadBlacklistedRequestHeaders(config.BLACKLISTED_REQUEST_HEADERS_FILE)
+        logging.info("Reloading decoy bridges...")
+        antibot.loadDecoyBridges(config.DECOY_BRIDGES_FILE)
+
         logging.info("Reparsing bridge descriptors...")
         (hashring,
          emailDistributorTmp,
diff --git a/bridgedb/test/test_antibot.py b/bridgedb/test/test_antibot.py
new file mode 100644
index 0000000..1cda86a
--- /dev/null
+++ b/bridgedb/test/test_antibot.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 ; test-case-name: bridgedb.test.test_antibot ; -*-
+# _____________________________________________________________________________
+#
+# This file is part of BridgeDB, a Tor bridge distribution system.
+#
+# :authors: please see included AUTHORS file
+# :copyright: (c) 2019, The Tor Project, Inc.
+#             (c) 2019, Philipp Winter
+# :license: see LICENSE for licensing information
+# _____________________________________________________________________________
+
+"""Tests for :mod:`bridgedb.antibot`."""
+
+import os
+import tempfile
+
+from twisted.trial import unittest
+from twisted.web.test.requesthelper import DummyRequest
+
+from bridgedb import antibot
+
+
+class AntiBot(unittest.TestCase):
+    """Unittests for :mod:`bridgedb.antibot`."""
+
+    def write_file(self, content):
+        """
+        Write the given content to a temporary file.
+
+        The caller is responsible for deleting the file once it is done.
+        """
+        fd, filename = tempfile.mkstemp(prefix="bridgedb")
+        fh = os.fdopen(fd, "w")
+        fh.write(content)
+        fh.close()
+        return filename
+
+    def test_load_csv(self):
+        """Load a valid CSV file."""
+        content = "foo,bar\nbar,foo\n"
+        filename = self.write_file(content)
+
+        csv = antibot._loadCSV(filename)
+        self.assertEqual(csv["foo"], "bar")
+        self.assertEqual(csv["bar"], "foo")
+
+        os.unlink(filename)
+
+    def test_load_invalid_csv(self):
+        """Load an invalid CSV file that has two commas in one line."""
+        content = "foo,bar,bad\nbar,foo\n"
+        filename = self.write_file(content)
+
+        csv = antibot._loadCSV(filename)
+        self.assertEqual(len(csv), 1)
+
+        os.unlink(filename)
+
+    def test_load_blacklisted_headers(self):
+        """Load valid blacklisted request headers."""
+        content = "accept-language,[Kk]lingon"
+        filename = self.write_file(content)
+
+        antibot.loadBlacklistedRequestHeaders(filename)
+
+        request = DummyRequest([''])
+        verdict = antibot.isRequestFromBot(request)
+        self.assertFalse(verdict)
+
+        request.requestHeaders.setRawHeaders("accept-language",
+                                             ["i speak kllingon"])
+        antibot.loadBlacklistedRequestHeaders(filename)
+        verdict = antibot.isRequestFromBot(request)
+        self.assertFalse(verdict)
+
+        request.requestHeaders.setRawHeaders("accept-language",
+                                             ["i speak klingon"])
+        antibot.loadBlacklistedRequestHeaders(filename)
+        verdict = antibot.isRequestFromBot(request)
+        self.assertTrue(verdict)
+
+        os.unlink(filename)
+
+    def test_load_invalid_blacklisted_headers(self):
+        """Load invalid blacklisted request headers with a broken regexp."""
+        content = "accept-language,[Klingon\nuser-agent,foo*"
+        filename = self.write_file(content)
+
+        antibot.loadBlacklistedRequestHeaders(filename)
+        self.assertEqual(len(antibot.BLACKLISTED_REQUEST_HEADERS), 1)
+
+        os.unlink(filename)
+
+    def test_load_decoy_bridges(self):
+        """Load decoy bridges."""
+        obfs4_line = "obfs4 1.2.3.4:1234 FINGERPRINT FOO BAR"
+        vanilla_line = "1.2.3.4:1234 FINGERPRINT"
+
+        content = "vanillav4,%s\nobfs4v4,%s" % (vanilla_line, obfs4_line)
+        filename = self.write_file(content)
+
+        antibot.loadDecoyBridges(filename)
+        self.assertEqual(antibot.getDecoyBridge("obfs4", 4), [obfs4_line])
+        self.assertEqual(antibot.getDecoyBridge("vanilla", 4), [vanilla_line])
+        self.assertEqual(antibot.getDecoyBridge("vanilla", 6), None)
+        self.assertEqual(antibot.getDecoyBridge("vanilla", 7), None)
+
+        os.unlink(filename)





More information about the tor-commits mailing list