[tor-commits] [sbws/master] Refactor to clean v3bw files too

pastly at torproject.org pastly at torproject.org
Thu Aug 9 14:21:19 UTC 2018

commit 9d6019f0921dcb4090e06d3489d0860d588c9f57
Author: juga0 <juga at riseup.net>
Date:   Thu Jul 12 15:46:14 2018 +0000

    Refactor to clean v3bw files too
 sbws/core/cleanup.py | 87 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 69 insertions(+), 18 deletions(-)

diff --git a/sbws/core/cleanup.py b/sbws/core/cleanup.py
index 149cdd2..014b85d 100644
--- a/sbws/core/cleanup.py
+++ b/sbws/core/cleanup.py
@@ -10,6 +10,8 @@ import shutil
 import logging
 import time
+from sbws.util.timestamp import unixts_to_dt_obj
 log = logging.getLogger(__name__)
@@ -27,17 +29,21 @@ def gen_parser(sub):
     p.add_argument('--dry-run', action='store_true',
                    help='Don\'t actually compress or delete anything')
+    p.add_argument('--v3bw', action='store_true', help='Clean also v3bw files')
-def _get_older_files_than(dname, num_days_ago, extensions):
+def _get_older_files_than(dname, time_delta, extensions, is_v3bw=False):
     assert os.path.isdir(dname)
-    assert isinstance(num_days_ago, int)
+    assert isinstance(time_delta, int)
     assert isinstance(extensions, list)
     for ext in extensions:
         assert isinstance(ext, str)
         assert ext[0] == '.'
     # Determine oldest allowed date
     today = datetime.utcfromtimestamp(time.time())
-    oldest_day = today - timedelta(days=num_days_ago)
+    oldest_day = today - timedelta(days=time_delta)
+    if is_v3bw:
+        oldest = today - timedelta(minutes=time_delta)
     # Compile a regex that can extract a date from a file name that looks like
     # /path/to/foo/YYYY-MM-DD*.extension
     extensions = [re.escape(e) for e in extensions]
@@ -50,38 +56,52 @@ def _get_older_files_than(dname, num_days_ago, extensions):
     for root, dirs, files in os.walk(dname):
         for f in files:
             fname = os.path.join(root, f)
-            match = regex.match(fname)
-            if not match:
-                log.debug('Ignoring %s because it doesn\'t look like '
-                          'YYYY-MM-DD', fname)
-                continue
-            d = datetime(*[int(n) for n in match.group(1).split('-')])
-            if d < oldest_day:
-                yield fname
-def _remove_rotten_files(datadir, rotten_days, dry_run=True):
+            if is_v3bw:  # or (v3bw_ext not in fname)
+                # not forcing files to have correct names just the extension
+                _, ext = os.path.splitext(fname)
+                if ext not in ['.v3bw']:
+                    log.debug('Ignoring %s because it doesn\'t have extension '
+                              '%s', fname, ext)
+                    continue
+                dt = unixts_to_dt_obj(os.path.getmtime(fname))
+                if dt < oldest and os.path.splitext:
+                    yield fname
+            else:
+                match = regex.match(fname)
+                if not match:
+                    log.debug('Ignoring %s because it doesn\'t look like '
+                              'YYYY-MM-DD', fname)
+                    continue
+                d = datetime(*[int(n) for n in match.group(1).split('-')])
+                if d < oldest_day:
+                    yield fname
+def _remove_rotten_files(datadir, rotten_days, dry_run=True, is_v3bw=False):
     assert os.path.isdir(datadir)
     assert isinstance(rotten_days, int)
     # Hold the lock for basically the entire time just in case someone else
     # moves files between when we get the list of files and when we try to
     # delete them.
+    exts = ['.txt', '.txt.gz'] if not is_v3bw else ['.v3bw']
     with DirectoryLock(datadir):
-        for fname in _get_older_files_than(datadir, rotten_days,
-                                           ['.txt', '.txt.gz']):
+        for fname in _get_older_files_than(datadir, rotten_days, exts,
+                                           is_v3bw):
             log.info('Deleting %s', fname)
             if not dry_run:
-def _compress_stale_files(datadir, stale_days, dry_run=True):
+def _compress_stale_files(datadir, stale_days, dry_run=True, is_v3bw=False):
     assert os.path.isdir(datadir)
     assert isinstance(stale_days, int)
     # Hold the lock for basically the entire time just in case someone else
     # moves files between when we get the list of files and when we try to
     # compress them.
+    exts = ['.txt', '.txt.gz'] if not is_v3bw else ['.v3bw']
     with DirectoryLock(datadir):
-        for fname in _get_older_files_than(datadir, stale_days, ['.txt']):
+        for fname in _get_older_files_than(datadir, stale_days, exts,
+                                           is_v3bw):
             log.info('Compressing %s', fname)
             if dry_run:
@@ -92,6 +112,24 @@ def _compress_stale_files(datadir, stale_days, dry_run=True):
+def _check_validity_periods(valid, stale, rotten):
+    """Sanity-check the configured valid, stale and rotten periods.
+
+    The three values are compared with each other directly, so they must
+    all be expressed in the same time unit.
+
+    - Calls fail_hard() if stale is less than valid + 2.
+    - Calls fail_hard() if rotten is less than stale.
+    - Logs a warning if stale is less than twice valid.
+
+    :param int valid: value of general/data_period or general/valid_*
+    :param int stale: value of cleanup/stale_*; files older than this are
+        handed to _compress_stale_files by main()
+    :param int rotten: value of cleanup/rotten_*; files older than this are
+        handed to _remove_rotten_files by main()
+    """
+    # NOTE(review): the message below says "days", but main() also calls
+    # this function with the *_mins_v3bw_files settings, in which case the
+    # margin of 2 is in minutes -- confirm whether that is intended.
+    if stale - 2 < valid:
+        fail_hard('For safety, cleanup/stale_* (%d) must be at least 2 '
+                  'days larger than general/data_period or general/valid_* '
+                  '(%d)', stale, valid)
+    if rotten < stale:
+        fail_hard('cleanup/rotten_* (%d) must be the same or larger than '
+                  'cleanup/stale_* (%d)', rotten, stale)
+    # Not fatal: only recommend keeping uncompressed data for at least two
+    # validity periods.
+    if stale / 2 < valid:
+        log.warning(
+            'cleanup/stale_* (%d) is less than twice '
+            'general/data_period or general/valid_* (%d). '
+            'For ease of parsing older results '
+            'if necessary, it is recommended to make stale at least '
+            'twice the data_period.', stale, valid)
 def main(args, conf):
     Main entry point in to the cleanup command.
@@ -126,3 +164,16 @@ def main(args, conf):
     _remove_rotten_files(datadir, rotten_days, dry_run=args.dry_run)
     _compress_stale_files(datadir, stale_days, dry_run=args.dry_run)
+    if args.v3bw:
+        v3bw_dir = conf['paths']['v3bw_dname']
+        if not os.path.isdir(datadir):
+            fail_hard('%s does not exist', v3bw_dir)
+        valid = conf.getint('general', 'valid_mins_v3bw_files')
+        stale = conf.getint('cleanup', 'stale_mins_v3bw_files')
+        rotten = conf.getint('cleanup', 'rotten_mins_v3bw_files')
+        _check_validity_periods(valid, stale, rotten)
+        _remove_rotten_files(v3bw_dir, rotten, dry_run=args.dry_run,
+                             is_v3bw=True)
+        _compress_stale_files(v3bw_dir, stale, dry_run=args.dry_run,
+                              is_v3bw=True)

More information about the tor-commits mailing list