[tor-commits] [onionperf/develop] Move filters and filter metadata to analysis files

karsten at torproject.org
Wed Sep 16 15:15:08 UTC 2020


commit 95b749a8fc690825c0a828b8473c58faea7ad912
Author: Ana Custura <ana at netstat.org.uk>
Date:   Thu Sep 10 01:51:54 2020 +0100

    Move filters and filter metadata to analysis files
---
 onionperf/filtering.py     | 25 ++++++++++++++++++-------
 onionperf/onionperf        |  9 +--------
 onionperf/visualization.py | 14 +++++++++-----
 3 files changed, 28 insertions(+), 20 deletions(-)
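
For reference, the filter metadata that apply_filters() now records under the analysis file's top-level "filters" key has roughly the following shape. This is a sketch inferred from the patch; the file paths are placeholders, not values from this commit.

    # Hypothetical illustration of analysis.json_db["filters"] after filtering;
    # the paths below are placeholders.
    filters_metadata = {
        "tor/circuits": [
            {"name": "include_fingerprints", "filepath": "include-fingerprints.txt"},
            {"name": "exclude_fingerprints", "filepath": "exclude-fingerprints.txt"},
        ]
    }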

diff --git a/onionperf/filtering.py b/onionperf/filtering.py
index 1b614d6..c008c03 100644
--- a/onionperf/filtering.py
+++ b/onionperf/filtering.py
@@ -7,6 +7,7 @@
 
 import re
 from onionperf.analysis import OPAnalysis
+from collections import defaultdict
 
 class Filtering(object):
 
@@ -14,9 +15,11 @@ class Filtering(object):
         self.fingerprints_to_include = None
         self.fingerprints_to_exclude = None
         self.fingerprint_pattern = re.compile("\$?([0-9a-fA-F]{40})")
+        self.filters = defaultdict(list)
 
     def include_fingerprints(self, path):
         self.fingerprints_to_include = []
+        self.fingerprints_to_include_path = path
         with open(path, 'rt') as f:
             for line in f:
                 fingerprint_match = self.fingerprint_pattern.match(line)
@@ -26,6 +29,7 @@ class Filtering(object):
 
     def exclude_fingerprints(self, path):
         self.fingerprints_to_exclude = []
+        self.fingerprints_to_exclude_path = path
         with open(path, 'rt') as f:
             for line in f:
                 fingerprint_match = self.fingerprint_pattern.match(line)
@@ -33,12 +37,16 @@ class Filtering(object):
                     fingerprint = fingerprint_match.group(1).upper()
                     self.fingerprints_to_exclude.append(fingerprint)
 
-    def apply_filters(self, input_path, output_dir, output_file):
-        self.analysis = OPAnalysis.load(filename=input_path)
+    def filter_tor_circuits(self, analysis):
         if self.fingerprints_to_include is None and self.fingerprints_to_exclude is None:
             return
-        for source in self.analysis.get_nodes():
-            tor_circuits = self.analysis.get_tor_circuits(source)
+        self.filters["tor/circuits"] = []
+        if self.fingerprints_to_include:
+           self.filters["tor/circuits"].append({"name": "include_fingerprints", "filepath": self.fingerprints_to_include_path })
+        if self.fingerprints_to_exclude:
+           self.filters["tor/circuits"].append({"name": "exclude_fingerprints", "filepath": self.fingerprints_to_exclude_path })
+        for source in analysis.get_nodes():
+            tor_circuits = analysis.get_tor_circuits(source)
             filtered_circuit_ids = []
             for circuit_id, tor_circuit in tor_circuits.items():
                 keep = False
@@ -56,8 +64,11 @@ class Filtering(object):
                                 keep = False
                                 break
                 if not keep:
-                    filtered_circuit_ids.append(circuit_id)
-            for circuit_id in filtered_circuit_ids:
-                del(tor_circuits[circuit_id])
+                    tor_circuits[circuit_id]["filtered"] = True
+
+    def apply_filters(self, input_path, output_dir, output_file):
+        self.analysis = OPAnalysis.load(filename=input_path)
+        self.filter_tor_circuits(self.analysis)
+        self.analysis.json_db["filters"] = self.filters
         self.analysis.save(filename=output_file, output_prefix=output_dir, sort_keys=False)
 
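A minimal usage sketch of the updated Filtering class follows; only the class, method names, and signatures come from this patch, while the import path and file names are assumptions.

    from onionperf.filtering import Filtering

    # Hypothetical file names; substitute real fingerprint lists and an
    # existing OnionPerf analysis file.
    filtering = Filtering()
    filtering.include_fingerprints("include-fingerprints.txt")
    filtering.exclude_fingerprints("exclude-fingerprints.txt")

    # Non-matching circuits are now marked with "filtered": True instead of
    # being deleted, and the applied filters are recorded in the output file.
    filtering.apply_filters(input_path="onionperf.analysis.json.xz",
                            output_dir=".",
                            output_file="filtered.onionperf.analysis.json.xz")
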
diff --git a/onionperf/onionperf b/onionperf/onionperf
index 1efa8cb..108af4e 100755
--- a/onionperf/onionperf
+++ b/onionperf/onionperf
@@ -342,13 +342,6 @@ files generated by this script will be written""",
         required="True",
         action=PathStringArgsAction, dest="datasets")
 
-    visualize_parser.add_argument('--outer-join',
-        help="""Include measurements without an existing mapping between TGen
-                transfers/streams and Tor streams/circuits, which is the
-                equivalent of an outer join in the database sense""",
-        action="store_true", dest="outer_join",
-        default=False)
-
     visualize_parser.add_argument('-p', '--prefix',
         help="a STRING filename prefix for graphs we generate",
         metavar="STRING", type=str,
@@ -489,7 +482,7 @@ def visualize(args):
             if analysis is not None:
                analyses.append(analysis)
         tgen_viz.add_dataset(analyses, label)
-    tgen_viz.plot_all(args.prefix, outer_join=args.outer_join)
+    tgen_viz.plot_all(args.prefix)
 
 def type_nonnegative_integer(value):
     i = int(value)
diff --git a/onionperf/visualization.py b/onionperf/visualization.py
index 0f69879..f5bc03f 100644
--- a/onionperf/visualization.py
+++ b/onionperf/visualization.py
@@ -31,11 +31,11 @@ class Visualization(object, metaclass=ABCMeta):
 
 class TGenVisualization(Visualization):
 
-    def plot_all(self, output_prefix, outer_join=False):
+    def plot_all(self, output_prefix):
         if len(self.datasets) > 0:
             prefix = output_prefix + '.' if output_prefix is not None else ''
             ts = time.strftime("%Y-%m-%d_%H:%M:%S")
-            self.__extract_data_frame(outer_join)
+            self.__extract_data_frame()
             self.data.to_csv("{0}onionperf.viz.{1}.csv".format(prefix, ts))
             sns.set_context("paper")
             self.page = PdfPages("{0}onionperf.viz.{1}.pdf".format(prefix, ts))
@@ -51,7 +51,7 @@ class TGenVisualization(Visualization):
             self.__plot_errors_time()
             self.page.close()
 
-    def __extract_data_frame(self, outer_join=False):
+    def __extract_data_frame(self):
         streams = []
         for (analyses, label) in self.datasets:
             for analysis in analyses:
@@ -145,8 +145,12 @@ class TGenVisualization(Visualization):
                                     if "failure_reason_remote" in tor_stream:
                                         error_code_parts.append(tor_stream["failure_reason_remote"])
                             stream["error_code"] = "/".join(error_code_parts)
-                        if tor_circuit or outer_join:
-                            streams.append(stream)
+
+                        if "filters" in analysis.json_db.keys() and analysis.json_db["filters"]["tor/circuits"]:
+                           if tor_circuit and "filtered" not in tor_circuit.keys():
+                               streams.append(stream)
+                        else:
+                           streams.append(stream)
         self.data = pd.DataFrame.from_records(streams, index="id")
 
     def __plot_firstbyte_ecdf(self):
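
The new stream-selection rule in __extract_data_frame() can be restated as a small helper; this is a sketch of the condition above, not code from the patch.

    def keep_stream(analysis, tor_circuit):
        # Hypothetical helper restating the condition in __extract_data_frame():
        # when the analysis records circuit filters, keep only streams whose
        # Tor circuit exists and was not marked "filtered" by filtering.py.
        if analysis.json_db.get("filters", {}).get("tor/circuits"):
            return bool(tor_circuit) and "filtered" not in tor_circuit
        # Without recorded filters, keep every stream; the old --outer-join
        # flag has been removed.
        return True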
