[tor-commits] [webstats/master] Don't delete parsed files from in/.
runa at torproject.org
runa at torproject.org
Tue Nov 13 15:27:44 UTC 2012
commit b306f7fc0f0285ea1d008dbbe271495bc6d92b3e
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Wed Nov 7 17:18:43 2012 -0500
Don't delete parsed files from in/.
---
src/org/torproject/webstats/Main.java | 63 ++++++++++++--------------------
1 files changed, 24 insertions(+), 39 deletions(-)
diff --git a/src/org/torproject/webstats/Main.java b/src/org/torproject/webstats/Main.java
index 1c7780c..7241d68 100644
--- a/src/org/torproject/webstats/Main.java
+++ b/src/org/torproject/webstats/Main.java
@@ -25,7 +25,7 @@ import org.apache.commons.compress.compressors.gzip.*;
*
* The main operation is to parse Apache web log files from the in/
* directory and write sanitized web log files to the out/ directory.
- * Files in the in/ directory are assumed to never change and will be
+ * Files in the in/ directory are assumed to never change and may be
* deleted after processing by this program. Files in the out/ directory
* are guaranteed to never change and may be deleted by a subsequently
* running program.
@@ -34,8 +34,8 @@ import org.apache.commons.compress.compressors.gzip.*;
* in/ are not parsed more than once and that files in out/ do not need to
* be changed:
* - state/lock prevents concurrent executions of this program.
- * - state/in-history contains file names of previously read and deleted
- * files in the in/ directory.
+ * - state/in-history contains file names of previously read and possibly
+ * deleted files in the in/ directory.
* - state/in-history.new is the file written in the current execution
* that will replace state/in-history during the execution.
* - state/temp/ contains new or updated output files parsed in the
@@ -56,12 +56,12 @@ import org.apache.commons.compress.compressors.gzip.*;
* 2. Read the contents from state/in-history and state/out-history and
* the directory listings of in/ to memory.
* 3. For each file in in/:
- * a. Append the file name to state/in-history.new.
- * b. Check that the file name is not contained in state/in-history.
- * If it is, print out a warning and skip the file.
- * c. Parse and sanitize the file in chunks of 250,000 lines to reduce
+ * a. Append the file name to state/in-history.new if it was not
+ * contained in state/in-history. If it was contained, skip the
+ * file.
+ * b. Parse and sanitize the file in chunks of 250,000 lines to reduce
* writes.
- * d. When writing sanitized chunks to output files, for each output
+ * c. When writing sanitized chunks to output files, for each output
* file, check in the following order if there is already such a
* file in
* i. state/temp/,
@@ -73,8 +73,7 @@ import org.apache.commons.compress.compressors.gzip.*;
* 4. Rename state/in-history to state/in-history.old and rename
* state/in-history.new to state/in-history. Delete
* state/in-history.old.
- * 5. Delete files in in/ that have been parsed in this execution.
- * 6. For each file in state/temp/:
+ * 5. For each file in state/temp/:
* a. Check if there's a corresponding line in state/out-history. If
* so, check whether there is a file in state/full/ or out/. If
* so, move the file to state/full/. Otherwise move the file to
@@ -84,13 +83,13 @@ import org.apache.commons.compress.compressors.gzip.*;
* days old, move the file to state/full/.
* c. If b. does not apply, append a line to out-history.new and move
* the file to out/.
- * 7. For each file in state/full/, check whether the sanitized log is at
+ * 6. For each file in state/full/, check whether the sanitized log is at
* least four (4) days old and not contained in state/out-history. If
* so, append a line to out-history.new and move the file to out/.
- * 8. Rename state/out-history to state/out-history.old and rename
+ * 7. Rename state/out-history to state/out-history.old and rename
* state/out-history.new to state/out-history. Delete
* state/out-history.old.
- * 9. Delete state/lock and exit.
+ * 8. Delete state/lock and exit.
*
* If the program is interrupted and leaves a lock file in state/lock, it
* requires an operator to fix the state/ directory and make it work
@@ -133,20 +132,19 @@ public class Main {
readHistoryFiles(); /* Step 2 */
readInDirectoryListing();
for (File inFile : inFiles) { /* Step 3 */
- appendToInHistory(inFile);
+ appendToInHistoryIfNotContained(inFile);
if (!checkFileName(inFile) || checkParsedBefore(inFile)) {
continue;
}
sanitizeInFile(inFile);
}
overwriteInHistoryFile(); /* Step 4 */
- deleteParsedInFiles(); /* Step 5 */
- for (String outputFileName : updatedOutputFiles) { /* Step 6 */
+ for (String outputFileName : updatedOutputFiles) { /* Step 5 */
moveOutputFile(outputFileName);
}
- moveFullFilesToOut(); /* Step 7 */
- overwriteOutHistoryFile(); /* Step 8 */
- deleteLockFile(); /* Step 9 */
+ moveFullFilesToOut(); /* Step 6 */
+ overwriteOutHistoryFile(); /* Step 7 */
+ deleteLockFile(); /* Step 8 */
}
/* Define file and directory names. */
@@ -262,10 +260,12 @@ public class Main {
}
return result;
}
- private static void appendToInHistory(File inFile) {
- inHistoryNewFiles.add(inFile.getAbsolutePath());
- String line = inFile.getAbsolutePath();
- appendToHistoryFile(stateInHistoryNewFile, line);
+ private static void appendToInHistoryIfNotContained(File inFile) {
+ if (!inHistoryNewFiles.contains(inFile.getAbsolutePath())) {
+ inHistoryNewFiles.add(inFile.getAbsolutePath());
+ String line = inFile.getAbsolutePath();
+ appendToHistoryFile(stateInHistoryNewFile, line);
+ }
}
private static void appendToHistoryFile(File historyFile, String line) {
try {
@@ -286,14 +286,7 @@ public class Main {
inFile.getName().endsWith(".gz");
}
private static boolean checkParsedBefore(File inFile) {
- if (inHistoryFiles.contains(inFile.getAbsolutePath())) {
- System.err.println("Parsed and subsequently deleted input file '"
- + inFile.getAbsolutePath() + "' before. It shouldn't be "
- + "there again. Skipping it now and not deleting it later.");
- return true;
- } else {
- return false;
- }
+ return inHistoryFiles.contains(inFile.getAbsolutePath());
}
private static void sanitizeInFile(File inFile) {
try {
@@ -452,14 +445,6 @@ public class Main {
stateInHistoryNewFile.renameTo(stateInHistoryFile);
stateInHistoryOldFile.delete();
}
- private static void deleteParsedInFiles() {
- Set<String> filesToDelete = new HashSet<String>();
- filesToDelete.addAll(inHistoryNewFiles);
- filesToDelete.removeAll(inHistoryFiles);
- for (String file : filesToDelete) {
- new File(file).delete();
- }
- }
private static void moveOutputFile(String outputFileName) {
File outFile = new File(outDirectory, outputFileName);
File stateTempFile = new File(stateTempDirectory, outputFileName);
More information about the tor-commits mailing list