[tor-commits] [tor/master] Implement backbone of overload statistics.

asn at torproject.org asn at torproject.org
Wed Mar 17 16:24:24 UTC 2021


commit 0a5ecb334298187a64f58382231245111130aa76
Author: George Kadianakis <desnacked at riseup.net>
Date:   Tue Mar 9 15:36:40 2021 +0200

    Implement backbone of overload statistics.
    
    - Implement overload statistics structure.
    - Implement function that keeps track of overload statistics.
    - Implement function that writes overload statistics to descriptor.
    - Unittest for the whole logic.
---
 changes/bug40222               |   6 +++
 src/app/config/config.c        |   1 +
 src/app/config/or_options_st.h |   3 ++
 src/feature/relay/router.c     |   6 +++
 src/feature/stats/rephist.c    | 116 +++++++++++++++++++++++++++++++++++++++++
 src/feature/stats/rephist.h    |  18 +++++++
 src/test/test_stats.c          | 109 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 259 insertions(+)

diff --git a/changes/bug40222 b/changes/bug40222
new file mode 100644
index 0000000000..8338ea4a7b
--- /dev/null
+++ b/changes/bug40222
@@ -0,0 +1,6 @@
+  o Major features (metrics):
+    - Introduce a new class of extra-info statistics that allows relays to
+      expose details of how overloaded they are. This information is controlled
+      using the OverloadStatistics torrc option, and it will be used to make
+      more informed decisions about the network's load balancing. Implements
+      proposal 328; closes ticket 40222.
diff --git a/src/app/config/config.c b/src/app/config/config.c
index 1ac460bac0..156e90a549 100644
--- a/src/app/config/config.c
+++ b/src/app/config/config.c
@@ -352,6 +352,7 @@ static const config_var_t option_vars_[] = {
   V(CacheDirectoryGroupReadable, AUTOBOOL,     "auto"),
   V(CellStatistics,              BOOL,     "0"),
   V(PaddingStatistics,           BOOL,     "1"),
+  V(OverloadStatistics,          BOOL,     "1"),
   V(LearnCircuitBuildTimeout,    BOOL,     "1"),
   V(CircuitBuildTimeout,         INTERVAL, "0"),
   OBSOLETE("CircuitIdleTimeout"),
diff --git a/src/app/config/or_options_st.h b/src/app/config/or_options_st.h
index 90302eae7b..689adbc71b 100644
--- a/src/app/config/or_options_st.h
+++ b/src/app/config/or_options_st.h
@@ -674,6 +674,9 @@ struct or_options_t {
   /** If true, include statistics file contents in extra-info documents. */
   int ExtraInfoStatistics;
 
+  /** If true, include overload statistics in extra-info documents. */
+  int OverloadStatistics;
+
   /** If true, do not believe anybody who tells us that a domain resolves
    * to an internal address, or that an internal address has a PTR mapping.
    * Helps avoid some cross-site attacks. */
diff --git a/src/feature/relay/router.c b/src/feature/relay/router.c
index 73e8393761..88160bd1cb 100644
--- a/src/feature/relay/router.c
+++ b/src/feature/relay/router.c
@@ -3372,6 +3372,12 @@ extrainfo_dump_to_string_stats_helper(smartlist_t *chunks,
       if (contents)
         smartlist_add(chunks, contents);
     }
+    if (options->OverloadStatistics) {
+      contents = rep_hist_get_overload_stats_lines();
+      if (contents) {
+        smartlist_add(chunks, contents);
+      }
+    }
     /* bridge statistics */
     if (should_record_bridge_info(options)) {
       const char *bridge_stats = geoip_get_bridge_stats_extrainfo(now);
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c
index f7c9336bff..64bec000df 100644
--- a/src/feature/stats/rephist.c
+++ b/src/feature/stats/rephist.c
@@ -183,6 +183,122 @@ static time_t started_tracking_stability = 0;
 /** Map from hex OR identity digest to or_history_t. */
 static digestmap_t *history_map = NULL;
 
+/** Represents a state of overload stats.
+ *
+ *  All the timestamps in this structure have already been rounded down to the
+ *  nearest hour. */
+typedef struct {
+  /* When did we last experience a general overload? */
+  time_t overload_general_time;
+
+  /* When did we last experience a bandwidth-related overload? */
+  time_t overload_ratelimits_time;
+  /* How many times have we gone off the our read limits? */
+  uint64_t overload_read_count;
+  /* How many times have we gone off the our write limits? */
+  uint64_t overload_write_count;
+
+  /* When did we last experience a file descriptor exhaustion? */
+  time_t overload_fd_exhausted_time;
+  /* How many times have we experienced a file descriptor exhaustion? */
+  uint64_t overload_fd_exhausted;
+} overload_stats_t;
+
+/** Current state of overload stats */
+static overload_stats_t overload_stats;
+
+/** Return true if this overload happened within the last `n_hours`. */
+static bool
+overload_happened_recently(time_t overload_time, unsigned n_hours)
+{
+  /* An overload is relevant if it happened in the last 72 hours */
+  if (overload_time > approx_time() - 3600 * n_hours) {
+    return true;
+  }
+  return false;
+}
+
+/* The current version of the overload stats version */
+#define OVERLOAD_STATS_VERSION 1
+
+/** Returns an allocated string for extra-info documents for publishing
+ *  overload statistics. */
+char *
+rep_hist_get_overload_stats_lines(void)
+{
+  char *result = NULL;
+  smartlist_t *chunks = smartlist_new();
+  char tbuf[ISO_TIME_LEN+1];
+
+  /* First encode the general overload */
+  if (overload_happened_recently(overload_stats.overload_general_time, 72)) {
+    format_iso_time(tbuf, overload_stats.overload_general_time);
+    smartlist_add_asprintf(chunks, "overload-general %d %s",
+                           OVERLOAD_STATS_VERSION, tbuf);
+  }
+
+  /* Now do bandwidth-related overloads */
+  if (overload_happened_recently(overload_stats.overload_ratelimits_time,24)) {
+    const or_options_t *options = get_options();
+    format_iso_time(tbuf, overload_stats.overload_ratelimits_time);
+    smartlist_add_asprintf(chunks,
+                           "overload-ratelimits %d %s %" PRIu64 " %" PRIu64
+                           " %" PRIu64 " %" PRIu64,
+                           OVERLOAD_STATS_VERSION, tbuf,
+                           options->BandwidthRate, options->BandwidthBurst,
+                           overload_stats.overload_read_count,
+                           overload_stats.overload_write_count);
+  }
+
+  /* Finally file descriptor overloads */
+  if (overload_happened_recently(
+                              overload_stats.overload_fd_exhausted_time, 72)) {
+    format_iso_time(tbuf, overload_stats.overload_fd_exhausted_time);
+    smartlist_add_asprintf(chunks, "overload-fd-exhausted %d %s",
+                           OVERLOAD_STATS_VERSION, tbuf);
+  }
+
+  /* Bail early if we had nothing to write */
+  if (smartlist_len(chunks) == 0) {
+    goto done;
+  }
+
+  result = smartlist_join_strings(chunks, "\n", 0, NULL);
+
+ done:
+  SMARTLIST_FOREACH(chunks, char *, cp, tor_free(cp));
+  smartlist_free(chunks);
+  return result;
+}
+
+/** Round down the time in `a` to the beginning of the current hour */
+#define SET_TO_START_OF_HOUR(a) STMT_BEGIN \
+  (a) = approx_time() - (approx_time() % 3600); \
+STMT_END
+
+/** Note down an overload event of type `overload`. */
+void
+rep_hist_note_overload(overload_type_t overload)
+{
+  switch (overload) {
+  case OVERLOAD_GENERAL:
+    SET_TO_START_OF_HOUR(overload_stats.overload_general_time);
+    break;
+  case OVERLOAD_READ:
+    SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
+    overload_stats.overload_read_count++;
+    break;
+  case OVERLOAD_WRITE:
+    SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
+    overload_stats.overload_write_count++;
+    break;
+  case OVERLOAD_FD_EXHAUSTED:
+    SET_TO_START_OF_HOUR(overload_stats.overload_fd_exhausted_time);
+    overload_stats.overload_fd_exhausted++;
+    break;
+  }
+}
+
 /** Return the or_history_t for the OR with identity digest <b>id</b>,
  * creating it if necessary. */
 static or_history_t *
diff --git a/src/feature/stats/rephist.h b/src/feature/stats/rephist.h
index 7ecbfebfd7..45020d7967 100644
--- a/src/feature/stats/rephist.h
+++ b/src/feature/stats/rephist.h
@@ -140,6 +140,24 @@ void rep_hist_reset_padding_counts(void);
 void rep_hist_prep_published_padding_counts(time_t now);
 void rep_hist_padding_count_timers(uint64_t num_timers);
 
+/**
+ * Represents the various types of overload we keep track of and expose in our
+ * extra-info descriptor.
+*/
+typedef enum {
+  /* A general overload -- can have many different causes. */
+  OVERLOAD_GENERAL,
+  /* We went over our configured read rate/burst bandwidth limit */
+  OVERLOAD_READ,
+  /* We went over our configured write rate/burst bandwidth limit */
+  OVERLOAD_WRITE,
+  /* We exhausted the file descriptors in this system */
+  OVERLOAD_FD_EXHAUSTED,
+} overload_type_t;
+
+void rep_hist_note_overload(overload_type_t overload);
+char *rep_hist_get_overload_stats_lines(void);
+
 #ifdef TOR_UNIT_TESTS
 struct hs_v2_stats_t;
 const struct hs_v2_stats_t *rep_hist_get_hs_v2_stats(void);
diff --git a/src/test/test_stats.c b/src/test/test_stats.c
index 617a36faba..1f14255e31 100644
--- a/src/test/test_stats.c
+++ b/src/test/test_stats.c
@@ -703,6 +703,114 @@ test_load_stats_file(void *arg)
   tor_free(content);
 }
 
+/** Test the overload stats logic. */
+static void
+test_overload_stats(void *arg)
+{
+  time_t current_time = 1010101010;
+  char *stats_str = NULL;
+  (void) arg;
+
+  /* Change time to 03-01-2002 23:36 UTC */
+  /* This should make the extrainfo timestamp be "2002-01-03 23:00:00" */
+  update_approx_time(current_time);
+
+  /* With an empty rephist we shouldn't get anything back */
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_assert(!stats_str);
+
+  /* Note a DNS overload */
+  rep_hist_note_overload(OVERLOAD_GENERAL);
+
+  /* Move the time forward one hour */
+  current_time += 3600;
+  update_approx_time(current_time);
+
+  /* Now check the string */
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_str_op("overload-general 1 2002-01-03 23:00:00", OP_EQ, stats_str);
+  tor_free(stats_str);
+
+  /* Move the time forward 72 hours: see that the line has disappeared. */
+  current_time += 3600*72;
+  update_approx_time(current_time);
+
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_assert(!stats_str);
+
+  /* Now the time should be 2002-01-07 00:00:00 */
+
+  /* Note a DNS overload */
+  rep_hist_note_overload(OVERLOAD_GENERAL);
+
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_str_op("overload-general 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+  tor_free(stats_str);
+
+  /* Also note an fd exhaustion event */
+  rep_hist_note_overload(OVERLOAD_FD_EXHAUSTED);
+
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_str_op("overload-general 1 2002-01-07 00:00:00\n"
+            "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+  tor_free(stats_str);
+
+  /* Move the time forward. Register DNS overload. See that the time changed */
+  current_time += 3600*2;
+  update_approx_time(current_time);
+
+  rep_hist_note_overload(OVERLOAD_GENERAL);
+
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
+            "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+  tor_free(stats_str);
+
+  /* Move the time forward. Register a bandwidth ratelimit event. See that the
+     string is added */
+  current_time += 3600*2;
+  update_approx_time(current_time);
+
+  /* Register the rate limit event */
+  rep_hist_note_overload(OVERLOAD_READ);
+  /* Also set some rate limiting values that should be reflected on the log */
+  get_options_mutable()->BandwidthRate = 1000;
+  get_options_mutable()->BandwidthBurst = 2000;
+
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
+            "overload-ratelimits 1 2002-01-07 04:00:00 1000 2000 1 0\n"
+            "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+  tor_free(stats_str);
+
+  /* Move the time forward 24 hours: no rate limit line anymore. */
+  current_time += 3600*24;
+  update_approx_time(current_time);
+
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
+            "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+  tor_free(stats_str);
+
+  /* Move the time forward 44 hours: no fd exhausted line anymore. */
+  current_time += 3600*44;
+  update_approx_time(current_time);
+
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_str_op("overload-general 1 2002-01-07 02:00:00", OP_EQ, stats_str);
+  tor_free(stats_str);
+
+  /* Move the time forward 2 hours: there is nothing left. */
+  current_time += 3600*2;
+  update_approx_time(current_time);
+
+  stats_str = rep_hist_get_overload_stats_lines();
+  tt_assert(!stats_str);
+
+ done:
+  tor_free(stats_str);
+}
+
 #define ENT(name)                                                       \
   { #name, test_ ## name , 0, NULL, NULL }
 #define FORK(name)                                                      \
@@ -718,6 +826,7 @@ struct testcase_t stats_tests[] = {
   FORK(get_bandwidth_lines),
   FORK(rephist_v3_onions),
   FORK(load_stats_file),
+  FORK(overload_stats),
 
   END_OF_TESTCASES
 };





More information about the tor-commits mailing list