[tor-commits] [sbws/master] Use juga's code to cleanup result files too
pastly at torproject.org
pastly at torproject.org
Thu Aug 9 14:21:19 UTC 2018
commit 3a1789bddf7aecd74a9ed784f38ebb1a9f45f2de
Author: Matt Traudt <sirmatt at ksu.edu>
Date: Wed Aug 1 19:35:53 2018 -0400
Use juga's code to cleanup result files too
---
sbws/core/cleanup.py | 132 +++++++++++++++++----------------------------------
1 file changed, 43 insertions(+), 89 deletions(-)
diff --git a/sbws/core/cleanup.py b/sbws/core/cleanup.py
index ad5b286..7dc7e78 100644
--- a/sbws/core/cleanup.py
+++ b/sbws/core/cleanup.py
@@ -7,7 +7,6 @@ from sbws.util.timestamp import unixts_to_dt_obj
from argparse import ArgumentDefaultsHelpFormatter
from datetime import datetime
from datetime import timedelta
-import re
import os
import gzip
import shutil
@@ -64,38 +63,6 @@ def _get_files_mtime_older_than(dname, days_delta, extensions):
yield fname
-def _get_older_files_than(dname, num_days_ago, extensions):
- assert os.path.isdir(dname)
- assert isinstance(num_days_ago, int)
- assert isinstance(extensions, list)
- for ext in extensions:
- assert isinstance(ext, str)
- assert ext[0] == '.'
- # Determine oldest allowed date
- today = datetime.utcfromtimestamp(time.time())
- oldest_day = today - timedelta(days=num_days_ago)
- # Compile a regex that can extract a date from a file name that looks like
- # /path/to/foo/YYYY-MM-DD*.extension
- extensions = [re.escape(e) for e in extensions]
- day_part = '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]'
- regex = re.compile(r'^.*/({}).*({})$'
- .format(day_part, '|'.join(extensions)))
- # Walk through all files in the given dname, find files that match the
- # regex, and yield the ones that contain a date in the file name that is
- # too old.
- for root, dirs, files in os.walk(dname):
- for f in files:
- fname = os.path.join(root, f)
- match = regex.match(fname)
- if not match:
- log.debug('Ignoring %s because it doesn\'t look like '
- 'YYYY-MM-DD', fname)
- continue
- d = datetime(*[int(n) for n in match.group(1).split('-')])
- if d < oldest_day:
- yield fname
-
-
def _delete_files(dname, files, dry_run=True):
"""Delete the files passed as argument."""
assert os.path.isdir(dname)
@@ -108,20 +75,6 @@ def _delete_files(dname, files, dry_run=True):
os.remove(fname)
-def _remove_rotten_files(datadir, rotten_days, dry_run=True):
- assert os.path.isdir(datadir)
- assert isinstance(rotten_days, int)
- # Hold the lock for basically the entire time just in case someone else
- # moves files between when we get the list of files and when we try to
- # delete them.
- with DirectoryLock(datadir):
- for fname in _get_older_files_than(datadir, rotten_days,
- ['.txt', '.txt.gz']):
- log.info('Deleting %s', fname)
- if not dry_run:
- os.remove(fname)
-
-
def _compress_files(dname, files, dry_run=True):
"""Compress the files passed as argument."""
assert os.path.isdir(dname)
@@ -139,24 +92,6 @@ def _compress_files(dname, files, dry_run=True):
os.remove(fname)
-def _compress_stale_files(datadir, stale_days, dry_run=True):
- assert os.path.isdir(datadir)
- assert isinstance(stale_days, int)
- # Hold the lock for basically the entire time just in case someone else
- # moves files between when we get the list of files and when we try to
- # compress them.
- with DirectoryLock(datadir):
- for fname in _get_older_files_than(datadir, stale_days, ['.txt']):
- log.info('Compressing %s', fname)
- if dry_run:
- continue
- with open(fname, 'rt') as in_fd:
- out_fname = fname + '.gz'
- with gzip.open(out_fname, 'wt') as out_fd:
- shutil.copyfileobj(in_fd, out_fd)
- os.remove(fname)
-
-
def _check_validity_periods_v3bw(compress_after_days, delete_after_days):
if 1 <= compress_after_days and compress_after_days < delete_after_days:
return True
@@ -164,6 +99,26 @@ def _check_validity_periods_v3bw(compress_after_days, delete_after_days):
"after a bigger number of days.")
+def _check_validity_periods_results(
+ data_period, compress_after_days, delete_after_days):
+ if compress_after_days - 2 < data_period:
+ fail_hard(
+ 'For safetly, cleanup/stale_days (%d) must be at least 2 days '
+ 'larger than general/data_period (%d)', compress_after_days,
+ data_period)
+ if delete_after_days < compress_after_days:
+ fail_hard(
+ 'cleanup/rotten_days (%d) must be the same or larger than '
+ 'cleanup/stale_days (%d)', delete_after_days, compress_after_days)
+ if compress_after_days / 2 < data_period:
+ log.warning(
+ 'cleanup/stale_days (%d) is less than twice '
+ 'general/data_period (%d). For ease of parsing older results '
+ 'if necessary, it is recommended to make stale_days at least '
+ 'twice the data_period.', compress_after_days, data_period)
+ return True
+
+
def _clean_v3bw_files(args, conf):
v3bw_dname = conf['paths']['v3bw_dname']
if not os.path.isdir(v3bw_dname):
@@ -186,6 +141,28 @@ def _clean_v3bw_files(args, conf):
_compress_files(v3bw_dname, files_to_compress, dry_run=args.dry_run)
+def _clean_result_files(args, conf):
+ datadir = conf['paths']['datadir']
+ if not os.path.isdir(datadir):
+ fail_hard('%s does not exist', datadir)
+ data_period = conf.getint('general', 'data_period')
+ compress_after_days = conf.getint('cleanup', 'stale_days')
+ delete_after_days = conf.getint('cleanup', 'rotten_days')
+ _check_validity_periods_results(
+ data_period, compress_after_days, delete_after_days)
+
+ # first delete so that the files to be deleted are not compressed first
+ files_to_delete = _get_files_mtime_older_than(
+ datadir, delete_after_days, ['.txt', '.gz'])
+ _delete_files(datadir, files_to_delete, dry_run=args.dry_run)
+
+ # when dry_run is true, compress will also show all the files that
+ # would have been deleted, since they are not really deleted
+ files_to_compress = _get_files_mtime_older_than(
+ datadir, compress_after_days, ['.txt'])
+ _compress_files(datadir, files_to_compress, dry_run=args.dry_run)
+
+
def main(args, conf):
'''
Main entry point in to the cleanup command.
@@ -200,30 +177,7 @@ def main(args, conf):
fail_hard('Nothing to clean.')
if not args.no_results:
- datadir = conf['paths']['datadir']
- if not os.path.isdir(datadir):
- fail_hard('%s does not exist', datadir)
-
- fresh_days = conf.getint('general', 'data_period')
- stale_days = conf.getint('cleanup', 'stale_days')
- rotten_days = conf.getint('cleanup', 'rotten_days')
- if stale_days - 2 < fresh_days:
- fail_hard('For safetly, cleanup/stale_days (%d) must be at least '
- '2 days larger than general/data_period (%d)',
- stale_days, fresh_days)
- if rotten_days < stale_days:
- fail_hard('cleanup/rotten_days (%d) must be the same or larger '
- 'than cleanup/stale_days (%d)', rotten_days, stale_days)
-
- if stale_days / 2 < fresh_days:
- log.warning(
- 'cleanup/stale_days (%d) is less than twice '
- 'general/data_period (%d). For ease of parsing older results '
- 'if necessary, it is recommended to make stale_days at least '
- 'twice the data_period.', stale_days, fresh_days)
-
- _remove_rotten_files(datadir, rotten_days, dry_run=args.dry_run)
- _compress_stale_files(datadir, stale_days, dry_run=args.dry_run)
+ _clean_result_files(args, conf)
if not args.no_v3bw:
_clean_v3bw_files(args, conf)
More information about the tor-commits mailing list