[tor-commits] [torperf/master] Truncate .data and .extradata files to contain only the last 4 days.

karsten at torproject.org karsten at torproject.org
Tue Mar 6 17:20:15 UTC 2012


commit 178f753a451959cb78c6d0bf13a929ed1940a2e1
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Mar 5 16:33:41 2012 +0100

    Truncate .data and .extradata files to contain only the last 4 days.
---
 extra_stats.py     |   57 +++++++++++++++++++++++++++++++++++----
 measurements-HOWTO |   18 ++++++++----
 truncate-data.py   |   75 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 138 insertions(+), 12 deletions(-)

diff --git a/extra_stats.py b/extra_stats.py
index 69bb8a5..662cf43 100755
--- a/extra_stats.py
+++ b/extra_stats.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 
-import sys, time
+import os, re, sys, time
 import TorCtl.TorUtil as TorUtil
 import TorCtl.TorCtl as TorCtl
 
@@ -22,15 +22,18 @@ class Circuit:
     self.stream_fail_reason = None
 
 class WriteStats(TorCtl.PostEventListener):
-  def __init__(self, port, filename):
+  def __init__(self, port, filename, truncate):
     TorCtl.PostEventListener.__init__(self)
     self._port = int(port)
     self._filename = filename
+    self.truncate = truncate
+    self.first_launched = None
     self._conn = None
     self.all_circs = {}
     self.ignore_streams = {}
     self.current_timeout = None
     self.current_quantile = None
+    self.truncate_statsfile()
 
   def connect(self):
     self._conn = TorCtl.connect(HOST, self._port)
@@ -52,6 +55,7 @@ class WriteStats(TorCtl.PostEventListener):
     self.current_quantile = b.cutoff_quantile
     result = b.event_name + " " +b.body
     self.write_result(result)
+    self.truncate_statsfile()
 
   def circ_status_event(self, c):
     if c.status == "LAUNCHED":
@@ -94,6 +98,7 @@ class WriteStats(TorCtl.PostEventListener):
          (self.current_timeout, self.current_quantile)
 
     self.write_result(result)
+    self.truncate_statsfile()
 
   def stream_status_event(self, event):
     if event.status == "NEW":
@@ -140,19 +145,59 @@ class WriteStats(TorCtl.PostEventListener):
 
   def write_result(self, result):
     # XXX: hrmm. seems wasteful to keep opening+closing..
+    # XXX: When changing this, also change truncate_statsfile().
     statsfile = open(self._filename, 'a')
     statsfile.write(result+"\n")
     statsfile.close()
 
+  def truncate_statsfile(self):
+    """If truncation is enabled and the first LAUNCH= entry is over a week
+    old, drop all lines before the first entry launched in the last 4 days."""
+    if not self.truncate:
+      return
+    launched_re = re.compile("^.*LAUNCH=([\\d]*).*$")
+    if not self.first_launched:
+      if os.path.isfile(self._filename):
+        with open(self._filename) as statsfile:
+          for line in statsfile:
+            m = launched_re.match(line)
+            if m:
+              self.first_launched = int(m.group(1))
+              break
+      if not self.first_launched:
+        self.first_launched = time.time()
+    now = time.time()
+    if self.first_launched < now - 7 * 24 * 60 * 60:
+      copylines = False
+      statsfilebak_path = self._filename + ".bak"
+      with open(statsfilebak_path, "w") as statsfilebak_file:
+        with open(self._filename) as statsfile:
+          for line in statsfile:
+            if not copylines:
+              m = launched_re.match(line)
+              copylines = bool(m and int(m.group(1)) >= now - 4 * 24 * 60 * 60)
+            if copylines:
+              statsfilebak_file.write(line)
+      os.rename(statsfilebak_path, self._filename)
+      # Forget the cached timestamp so the next call re-reads the first
+      # remaining entry instead of rewriting the file on every event.
+      self.first_launched = None
+
 def main():
-  if len(sys.argv) < 3:
+  if len(sys.argv) < 3 or len(sys.argv) > 4:
     print "Bad arguments"
     sys.exit(1)
 
-  port = sys.argv[1]
-  filename = sys.argv[2]
+  truncate = False
+  if (sys.argv[1] == "--truncate"):
+    truncate = True
+    port = sys.argv[2]
+    filename = sys.argv[3]
+  else:
+    port = sys.argv[1]
+    filename = sys.argv[2]
 
-  stats = WriteStats(port, filename)
+  stats = WriteStats(port, filename, truncate)
   stats.connect()
   stats.setup_listener()
   try:
diff --git a/measurements-HOWTO b/measurements-HOWTO
index 67de0c5..2cefb68 100644
--- a/measurements-HOWTO
+++ b/measurements-HOWTO
@@ -103,14 +103,17 @@ cd ~/torperf/torclient50kb && tor -f ~/torperf/torclient50kb/torrc
 cd ~/torperf/torclient1mb && tor -f ~/torperf/torclient1mb/torrc
 cd ~/torperf/torclient5mb && tor -f ~/torperf/torclient5mb/torrc
 sleep 5
-cd ~/torperf/torclient50kb && python ../extra_stats.py 10020
+cd ~/torperf/torclient50kb && python ../extra_stats.py --truncate 10020
   ../50kb.extradata &
-cd ~/torperf/torclient1mb && python ../extra_stats.py 10021
+cd ~/torperf/torclient1mb && python ../extra_stats.py --truncate 10021
   ../1mb.extradata &
-cd ~/torperf/torclient5mb && python ../extra_stats.py 10022
+cd ~/torperf/torclient5mb && python ../extra_stats.py --truncate 10022
   ../5mb.extradata &
 EOF
 
+(Omit the --truncate switch if you don't want .extradata files to be
+truncated once per week to contain only the last 4 days of data.)
+
 $ chmod a+x start-tors
 $ ./start-tors
 
@@ -125,13 +128,16 @@ $ crontab -e
 
 */5 * * * * timeout -s2 295 ~/torperf/trivsocks-client
   torperf.torproject.org 127.0.0.1:9020 /.50kbfile >> ~/torperf/50kb.data
-  2>/dev/null
+  2>/dev/null; ~/torperf/truncate-data.py ~/torperf/50kb.data
 2,32 * * * * timeout -s2 1795 ~/torperf/trivsocks-client
   torperf.torproject.org 127.0.0.1:9021 /.1mbfile >> ~/torperf/1mb.data
-  2>/dev/null
+  2>/dev/null; ~/torperf/truncate-data.py ~/torperf/1mb.data
 8 * * * * timeout -s2 3595 ~/torperf/trivsocks-client
   torperf.torproject.org 127.0.0.1:9022 /.5mbfile >> ~/torperf/5mb.data
-  2>/dev/null
+  2>/dev/null; ~/torperf/truncate-data.py ~/torperf/5mb.data
+
+(Omit the truncate-data.py command if you don't want .data files to be
+truncated once per week to contain only the last 4 days of data.)
 
 From now on, the three files 50kb.data, 1mb.data, and 5mb.data should
 accumulate lines like this (50kb.data shown here; line breaks are only for
diff --git a/truncate-data.py b/truncate-data.py
new file mode 100755
index 0000000..a70e3fc
--- /dev/null
+++ b/truncate-data.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python
+import os
+import re
+import sys
+import time
+
+# Truncate Torperf .data file by deleting lines older than 4 days, but
+# only if its first line is older than a week, i.e., about once a week.
+def main():
+  # Check usage.
+  if len(sys.argv) != 2:
+    print "Usage: ./truncate-data.py <.data file>"
+    return
+  data_path = sys.argv[1]
+  if not os.path.isfile(data_path):
+    print "%s is not a .data file." % data_path
+    return
+
+  # Prepare for parsing.
+  parselines = False
+  copylines = False
+  databak_path = data_path + ".bak"
+  databak_file = None
+  started_re = re.compile('(^[\\d]+) .*')
+  now = time.time()
+
+  # Parse the .data file line by line, possibly stopping early if the
+  # first timestamp we find isn't older than a week.
+  with open(data_path) as data_file:
+    for line in data_file:
+      # Once the first line from the past 4 days has been found, copy it
+      # and every line after it without parsing.
+      if copylines:
+        databak_file.write(line)
+        continue
+
+      # Skip empty lines.
+      if line.strip() == "":
+        continue
+
+      # Extract the first timestamp; if missing, drop any partial .bak file.
+      m = started_re.match(line)
+      if not m:
+        print "%s is not a valid .data file." % data_path
+        if databak_file:
+          databak_file.close()
+          os.remove(databak_path)
+        return
+      started_ts = int(m.group(1))
+
+      # We have decided to truncate; start copying at the first line
+      # written in the past 4 days.
+      if parselines:
+        if started_ts >= now - 4 * 24 * 60 * 60:
+          databak_file.write(line)
+          copylines = True
+        continue
+
+      # Decide whether to truncate this file at all.
+      if started_ts >= now - 7 * 24 * 60 * 60:
+        return
+
+      # Open a .bak file to write into and start parsing lines to copy in
+      # the next iteration.
+      databak_file = open(databak_path, "w")
+      parselines = True
+
+  # Close the .bak file and replace the original .data file with it.
+  if databak_file:
+    databak_file.close()
+    os.rename(databak_path, data_path)
+
+if __name__ == "__main__":
+  main()
+



More information about the tor-commits mailing list