[tor-commits] [tor/master] Select fallbacks by bandwidth rather than consensus weight
nickm at torproject.org
nickm at torproject.org
Tue Apr 26 23:28:11 UTC 2016
commit 78ec782f76d4e228761128f24589316276e80473
Author: teor (Tim Wilson-Brown) <teor2345 at gmail.com>
Date: Fri Apr 8 23:53:24 2016 +1000
Select fallbacks by bandwidth rather than consensus weight
But as advertised bandwidth is controlled by relays,
use consensus weight and median weight to bandwidth ratio
to approximate measured bandwidth.
Includes minor comment changes and parameter reordering.
---
scripts/maint/updateFallbackDirs.py | 208 +++++++++++++++++++++++++++---------
1 file changed, 156 insertions(+), 52 deletions(-)
diff --git a/scripts/maint/updateFallbackDirs.py b/scripts/maint/updateFallbackDirs.py
index 3120496..44a7318 100755
--- a/scripts/maint/updateFallbackDirs.py
+++ b/scripts/maint/updateFallbackDirs.py
@@ -112,17 +112,13 @@ CUTOFF_GUARD = .95
# .00 means no bad exits
PERMITTED_BADEXIT = .00
-# Clients will time out after 30 seconds trying to download a consensus
-# So allow fallback directories half that to deliver a consensus
-# The exact download times might change based on the network connection
-# running this script, but only by a few seconds
-# There is also about a second of python overhead
-CONSENSUS_DOWNLOAD_SPEED_MAX = 15.0
-# If the relay fails a consensus check, retry the download
-# This avoids delisting a relay due to transient network conditions
-CONSENSUS_DOWNLOAD_RETRY = True
+# older entries' weights are adjusted with ALPHA^(age in days)
+AGE_ALPHA = 0.99
+
+# this factor is used to scale OnionOO entries to [0,1]
+ONIONOO_SCALE_ONE = 999.
-## List Length Limits
+## Fallback Count Limits
# The target for these parameters is 20% of the guards in the network
# This is around 200 as of October 2015
@@ -130,37 +126,53 @@ _FB_POG = 0.2
FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else _FB_POG
# We want exactly 100 fallbacks for the initial release
-# Limit the number of fallbacks (eliminating lowest by weight)
+# This gives us scope to add extra fallbacks to the list as needed
+# Limit the number of fallbacks (eliminating lowest by advertised bandwidth)
MAX_FALLBACK_COUNT = None if OUTPUT_CANDIDATES else 100
# Emit a C #error if the number of fallbacks is below
MIN_FALLBACK_COUNT = 100
-## Fallback Weight Settings
+## Fallback Bandwidth Requirements
-# Any fallback with the Exit flag has its consensus weight multipled by this
-EXIT_WEIGHT_FRACTION = 1.0
+# Any fallback with the Exit flag has its bandwidth multiplied by this fraction
+# to make sure we aren't further overloading exits
+# (Set to 1.0, because we asked that only lightly loaded exits opt-in,
+# and the extra load really isn't that much for large relays.)
+EXIT_BANDWIDTH_FRACTION = 1.0
-# If a single fallback's consensus weight is too low, it's pointless adding it
+# If a single fallback's bandwidth is too low, it's pointless adding it
# We expect fallbacks to handle an extra 30 kilobytes per second of traffic
-# Make sure they support a hundred times that
-MIN_CONSENSUS_WEIGHT = 30.0 * 100.0
+# Make sure they can support a hundred times the expected extra load
+# (Use 102.4 to make it come out nicely in MB/s)
+# We convert this to a consensus weight before applying the filter,
+# because all the bandwidth amounts are specified by the relay
+MIN_BANDWIDTH = 102.4 * 30.0 * 1024.0
+
+# Clients will time out after 30 seconds trying to download a consensus
+# So allow fallback directories half that to deliver a consensus
+# The exact download times might change based on the network connection
+# running this script, but only by a few seconds
+# There is also about a second of python overhead
+CONSENSUS_DOWNLOAD_SPEED_MAX = 15.0
+# If the relay fails a consensus check, retry the download
+# This avoids delisting a relay due to transient network conditions
+CONSENSUS_DOWNLOAD_RETRY = True
+
+## Fallback Weights for Client Selection
# All fallback weights are equal, and set to the value below
# Authorities are weighted 1.0 by default
# Clients use these weights to select fallbacks and authorities at random
# If there are 100 fallbacks and 9 authorities:
-# - each fallback is chosen with probability 10/(1000 + 9) ~= 0.99%
-# - each authority is chosen with probability 1/(1000 + 9) ~= 0.09%
+# - each fallback is chosen with probability 10.0/(10.0*100 + 1.0*9) ~= 0.99%
+# - each authority is chosen with probability 1.0/(10.0*100 + 1.0*9) ~= 0.09%
+# A client choosing a bootstrap directory server will choose a fallback for
+# 10.0/(10.0*100 + 1.0*9) * 100 = 99.1% of attempts, and an authority for
+# 1.0/(10.0*100 + 1.0*9) * 9 = 0.9% of attempts.
+# (This disregards the bootstrap schedules, where clients start by choosing
+# from fallbacks & authorities, then later choose from only authorities.)
FALLBACK_OUTPUT_WEIGHT = 10.0
-## Other Configuration Parameters
-
-# older entries' weights are adjusted with ALPHA^(age in days)
-AGE_ALPHA = 0.99
-
-# this factor is used to scale OnionOO entries to [0,1]
-ONIONOO_SCALE_ONE = 999.
-
## Parsing Functions
def parse_ts(t):
@@ -448,6 +460,11 @@ class Candidate(object):
details['contact'] = None
if not 'flags' in details or details['flags'] is None:
details['flags'] = []
+ if (not 'advertised_bandwidth' in details
+ or details['advertised_bandwidth'] is None):
+ # relays without advertised bandwidth have it calculated from their
+ # consensus weight
+ details['advertised_bandwidth'] = 0
details['last_changed_address_or_port'] = parse_ts(
details['last_changed_address_or_port'])
self._data = details
@@ -462,10 +479,6 @@ class Candidate(object):
self._compute_ipv6addr()
if self.ipv6addr is None:
logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,))
- # Reduce the weight of exits to EXIT_WEIGHT_FRACTION * consensus_weight
- if self.is_exit():
- exit_weight = self._data['consensus_weight'] * EXIT_WEIGHT_FRACTION
- self._data['consensus_weight'] = exit_weight
def _stable_sort_or_addresses(self):
# replace self._data['or_addresses'] with a stable ordering,
@@ -754,11 +767,9 @@ class Candidate(object):
logging.info('%s not a candidate: guard avg too low (%lf)',
self._fpr, self._guard)
return False
- if (MIN_CONSENSUS_WEIGHT is not None
- and self._data['consensus_weight'] < MIN_CONSENSUS_WEIGHT):
- logging.info('%s not a candidate: consensus weight %.0f too low, must ' +
- 'be at least %.0f', self._fpr,
- self._data['consensus_weight'], MIN_CONSENSUS_WEIGHT)
+ if (not self._data.has_key('consensus_weight')
+ or self._data['consensus_weight'] < 1):
+ logging.info('%s not a candidate: consensus weight invalid', self._fpr)
return False
return True
@@ -889,6 +900,30 @@ class Candidate(object):
ipv6 if has_ipv6 else value)
return False
+ def cw_to_bw_factor(self):
+ # any relays with a missing or zero consensus weight are not candidates
+ # any relays with a missing advertised bandwidth have it set to zero
+ return self._data['advertised_bandwidth'] / self._data['consensus_weight']
+
+ # since advertised_bandwidth is reported by the relay, it can be gamed
+ # to avoid this, use the median consensus weight to bandwidth factor to
+ # estimate this relay's measured bandwidth, and make that the upper limit
+ def measured_bandwidth(self, median_cw_to_bw_factor):
+ cw_to_bw= median_cw_to_bw_factor
+ # Reduce exit bandwidth to make sure we're not overloading them
+ if self.is_exit():
+ cw_to_bw *= EXIT_BANDWIDTH_FRACTION
+ measured_bandwidth = self._data['consensus_weight'] * cw_to_bw
+ if self._data['advertised_bandwidth'] != 0:
+ # limit advertised bandwidth (if available) to measured bandwidth
+ return min(measured_bandwidth, self._data['advertised_bandwidth'])
+ else:
+ return measured_bandwidth
+
+ def set_measured_bandwidth(self, median_cw_to_bw_factor):
+ self._data['measured_bandwidth'] = self.measured_bandwidth(
+ median_cw_to_bw_factor)
+
def is_exit(self):
return 'Exit' in self._data['flags']
@@ -1056,8 +1091,8 @@ class CandidateList(dict):
logging.debug('Loading details document.')
d = fetch('details',
fields=('fingerprint,nickname,contact,last_changed_address_or_port,' +
- 'consensus_weight,or_addresses,dir_address,' +
- 'recommended_version,flags'))
+ 'consensus_weight,advertised_bandwidth,or_addresses,' +
+ 'dir_address,recommended_version,flags'))
logging.debug('Loading details document done.')
if not 'relays' in d: raise Exception("No relays found in document.")
@@ -1083,15 +1118,24 @@ class CandidateList(dict):
guard_count += 1
return guard_count
- # Find fallbacks that fit the uptime, stability, and flags criteria
+ # Find fallbacks that fit the uptime, stability, and flags criteria,
+ # and make an array of them in self.fallbacks
def compute_fallbacks(self):
self.fallbacks = map(lambda x: self[x],
- sorted(
- filter(lambda x: self[x].is_candidate(),
- self.keys()),
- key=lambda x: self[x]._data['consensus_weight'],
+ filter(lambda x: self[x].is_candidate(),
+ self.keys()))
+
+ # sort fallbacks by their consensus weight to advertised bandwidth factor,
+ # lowest to highest
+ # used to find the median cw_to_bw_factor()
+ def sort_fallbacks_by_cw_to_bw_factor(self):
+ self.fallbacks.sort(key=lambda x: self[x].cw_to_bw_factor())
+
+ # sort fallbacks by their measured bandwidth, highest to lowest
+ # calculate_measured_bandwidth before calling this
+ def sort_fallbacks_by_measured_bandwidth(self):
+ self.fallbacks.sort(key=lambda x: self[x].self._data['measured_bandwidth'],
reverse=True)
- )
@staticmethod
def load_relaylist(file_name):
@@ -1194,13 +1238,64 @@ class CandidateList(dict):
return '/* Whitelist & blacklist excluded %d of %d candidates. */'%(
excluded_count, initial_count)
- def fallback_min_weight(self):
+ # calculate each fallback's measured bandwidth based on the median
+ # consensus weight to advertised bandwidth ratio
+ def calculate_measured_bandwidth(self):
+ self.sort_fallbacks_by_cw_to_bw_factor()
+ median_fallback = self.fallback_median(True)
+ median_cw_to_bw_factor = median_fallback.cw_to_bw_factor()
+ for f in self.fallbacks:
+ f.set_measured_bandwidth(median_cw_to_bw_factor)
+
+ # remove relays with low measured bandwidth from the fallback list
+ # calculate_measured_bandwidth for each relay before calling this
+ def remove_low_bandwidth_relays(self):
+ if MIN_BANDWIDTH is None:
+ return
+ above_min_bw_fallbacks = []
+ for f in self.fallbacks:
+ if f._data['measured_bandwidth'] >= MIN_BANDWIDTH:
+ above_min_bw_fallbacks.append(f)
+ else:
+ # the bandwidth we log here is limited by the relay's consensus weight
+ # as well as its advertised bandwidth. See set_measured_bandwidth
+ # for details
+ logging.info('%s not a candidate: bandwidth %.1fMB/s too low, must ' +
+ 'be at least %.1fMB/s', f._fpr,
+ f._data['measured_bandwidth']/(1024.0*1024.0),
+ MIN_BANDWIDTH/(1024.0*1024.0))
+ self.fallbacks = above_min_bw_fallbacks
+
+ # the minimum fallback in the list
+ # call one of the sort_fallbacks_* functions before calling this
+ def fallback_min(self):
if len(self.fallbacks) > 0:
return self.fallbacks[-1]
else:
return None
- def fallback_max_weight(self):
+ # the median fallback in the list
+ # call one of the sort_fallbacks_* functions before calling this
+ def fallback_median(self, require_advertised_bandwidth):
+ # use the low-median when there are an even number of fallbacks,
+ # for consistency with the bandwidth authorities
+ if len(self.fallbacks) > 0:
+ median_position = (len(self.fallbacks) - 1) / 2
+ if not require_advertised_bandwidth:
+ return self.fallbacks[median_position]
+ # if we need advertised_bandwidth but this relay doesn't have it,
+ # move to a fallback with greater consensus weight until we find one
+ while not self.fallbacks[median_position]._data['advertised_bandwidth']:
+ median_position += 1
+ if median_position >= len(self.fallbacks):
+ return None
+ return self.fallbacks[median_position]
+ else:
+ return None
+
+ # the maximum fallback in the list
+ # call one of the sort_fallbacks_* functions before calling this
+ def fallback_max(self):
if len(self.fallbacks) > 0:
return self.fallbacks[0]
else:
@@ -1211,7 +1306,7 @@ class CandidateList(dict):
# Report:
# whether we checked consensus download times
# the number of fallback directories (and limits/exclusions, if relevant)
- # min & max fallback weights
+ # min & max fallback bandwidths
# #error if below minimum count
if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
s = '/* Checked %s%s%s DirPorts served a consensus within %.1fs. */'%(
@@ -1243,11 +1338,12 @@ class CandidateList(dict):
s += 'Excluded: %d (Eligible Count Exceeded Target Count)'%(
eligible_count - fallback_count)
s += '\n'
- min_fb = self.fallback_min_weight()
- min_weight = min_fb._data['consensus_weight']
- max_fb = self.fallback_max_weight()
- max_weight = max_fb._data['consensus_weight']
- s += 'Consensus Weight Range: %d - %d'%(min_weight, max_weight)
+ min_fb = self.fallback_min()
+ min_bw = min_fb._data['measured_bandwidth']
+ max_fb = self.fallback_max()
+ max_bw = max_fb._data['measured_bandwidth']
+ s += 'Bandwidth Range: %.1f - %.1f MB/s'%(min_bw/(1024.0*1024.0),
+ max_bw/(1024.0*1024.0))
s += '\n'
s += '*/'
if fallback_count < MIN_FALLBACK_COUNT:
@@ -1293,6 +1389,14 @@ def list_fallbacks():
print candidates.summarise_filters(initial_count, excluded_count)
eligible_count = len(candidates.fallbacks)
+ # calculate the measured bandwidth of each relay,
+ # then remove low-bandwidth relays
+ candidates.calculate_measured_bandwidth()
+ candidates.remove_low_bandwidth_relays()
+ # make sure the list is sorted by bandwidth when we output it
+ # so that we include the active fallbacks with the greatest bandwidth
+ candidates.sort_fallbacks_by_measured_bandwidth()
+
# print the raw fallback list
#for x in candidates.fallbacks:
# print x.fallbackdir_line(True)
More information about the tor-commits
mailing list