[tor-commits] [metrics-lib/master] Add new descriptor type for web server access logs.
karsten at torproject.org
karsten at torproject.org
Wed Jan 31 12:30:03 UTC 2018
commit 3cd814d8481c87ee3609783d66ae4e2eec81d290
Author: iwakeh <iwakeh at torproject.org>
Date: Fri Sep 15 14:07:08 2017 +0000
Add new descriptor type for web server access logs.
Implements task-22983 and is based on the log-descriptor
specification.
---
CHANGELOG.md | 7 +
.../org/torproject/descriptor/LogDescriptor.java | 47 ++++++
.../torproject/descriptor/WebServerAccessLog.java | 65 ++++++++
.../descriptor/impl/DescriptorParserImpl.java | 12 +-
.../torproject/descriptor/index/package-info.java | 6 +-
.../torproject/descriptor/internal/FileType.java | 53 +++++-
.../descriptor/internal/package-info.java | 10 +-
.../descriptor/log/InternalLogDescriptor.java | 63 ++++++++
.../descriptor/log/InternalWebServerAccessLog.java | 17 ++
.../descriptor/log/LogDescriptorImpl.java | 163 +++++++++++++++++++
.../descriptor/log/WebServerAccessLogImpl.java | 119 ++++++++++++++
.../descriptor/log/WebServerAccessLogLine.java | 135 ++++++++++++++++
.../torproject/descriptor/log/package-info.java | 14 ++
.../org/torproject/descriptor/package-info.java | 6 +-
.../descriptor/log/LogDescriptorTest.java | 178 +++++++++++++++++++++
.../descriptor/log/WebServerAccessLogLineTest.java | 140 ++++++++++++++++
.../descriptor/log/WebServerAccessLogTest.java | 94 +++++++++++
.../descriptor/log/WebServerModuleTest.java | 113 +++++++++++++
...eotrichon.torproject.org_access.log_20151007.xz | Bin 0 -> 4056 bytes
...rver.org_dummy.host.net_access.log_20111111.bz2 | Bin 0 -> 76 bytes
...meronense.torproject.org_access.log_20170530.gz | Bin 0 -> 388 bytes
...meronense.torproject.org_access.log_20170531.gz | Bin 0 -> 388 bytes
...eronense.torproject.org_access.log_20170530.log | 26 +++
...eronense.torproject.org_access.log_20170607.log | 26 +++
24 files changed, 1280 insertions(+), 14 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cd0dc6a..42e0e09 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+# Changes in version 2.2.0 - 2018-01-??
+
+ * Major changes
+ - Add new descriptor type WebServerAccessLog to parse web server
+ access logs.
+
+
# Changes in version 2.1.1 - 2017-10-09
* Minor changes
diff --git a/src/main/java/org/torproject/descriptor/LogDescriptor.java b/src/main/java/org/torproject/descriptor/LogDescriptor.java
new file mode 100644
index 0000000..ff02cae
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/LogDescriptor.java
@@ -0,0 +1,47 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor;
+
+import java.util.List;
+
+/**
+ * Contains a log file.
+ *
+ * <p>Unlike other descriptors, logs can get very large and are typically stored
+ * on disk in compressed form. However, all access to log contents through this
+ * interface and its subinterfaces is made available in uncompressed form.</p>
+ *
+ * @since 2.2.0
+ */
+public interface LogDescriptor extends Descriptor {
+
+ /**
+ * Returns the decompressed raw descriptor bytes of the log.
+ *
+ * @since 2.2.0
+ */
+ @Override
+ public byte[] getRawDescriptorBytes();
+
+ /**
+ * Returns annotations found in the log file, which may be an empty List if a
+ * log format does not support adding annotations.
+ *
+ * @since 2.2.0
+ */
+ @Override
+ public List<String> getAnnotations();
+
+ /**
+ * Returns unrecognized lines encountered while parsing the log, which may be
+ * an empty list or a fixed-size list with only a few entries, depending on
+ * the log type.
+ *
+ * @since 2.2.0
+ */
+ @Override
+ public List<String> getUnrecognizedLines();
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/WebServerAccessLog.java b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java
new file mode 100644
index 0000000..b94bc30
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java
@@ -0,0 +1,65 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor;
+
+import java.time.LocalDate;
+import java.util.List;
+
+/**
+ * Contains a sanitized web server access log file from a {@code torproject.org}
+ * web server.
+ *
+ * <p>Parsing non-sanitized web server access logs from {@code torproject.org}
+ * web servers or other web servers is not explicitly supported, but may work
+ * anyway.</p>
+ *
+ * @since 2.2.0
+ */
+public interface WebServerAccessLog extends LogDescriptor {
+
+ /**
+ * Returns the date when requests contained in the log have been started,
+ * which is parsed from the log file path.
+ *
+ * <p>Typical web server access logs may contain date information in their
+ * file path, too, but that would be the date when the log file was rotated,
+ * which is not necessarily the same date as the date in contained request
+ * lines.</p>
+ *
+ * @since 2.2.0
+ */
+ public LocalDate getLogDate();
+
+ /**
+ * Returns the hostname of the physical host writing this log file, which is
+ * parsed from the log file path.
+ *
+ * <p>A physical host can serve multiple virtual hosts, and a virtual host can
+ * be served by multiple physical hosts.</p>
+ *
+ * @since 2.2.0
+ */
+ public String getPhysicalHost();
+
+ /**
+ * Returns the hostname of the virtual host that this log file was written
+ * for, which is parsed from the log file path.
+ *
+ * <p>A physical host can serve multiple virtual hosts, and a virtual host can
+ * be served by multiple physical hosts.</p>
+ *
+ * @since 2.2.0
+ */
+ public String getVirtualHost();
+
+ /**
+ * Returns at most three unrecognized lines encountered while parsing the log.
+ *
+ * @since 2.2.0
+ */
+ @Override
+ public List<String> getUnrecognizedLines();
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java b/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java
index d32c031..f244abb 100644
--- a/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java
+++ b/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java
@@ -9,6 +9,10 @@ import static org.torproject.descriptor.impl.DescriptorImpl.SP;
import org.torproject.descriptor.Descriptor;
import org.torproject.descriptor.DescriptorParseException;
import org.torproject.descriptor.DescriptorParser;
+import org.torproject.descriptor.log.LogDescriptorImpl;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.File;
import java.lang.reflect.Constructor;
@@ -19,6 +23,9 @@ import java.util.List;
public class DescriptorParserImpl implements DescriptorParser {
+ private static final Logger log
+ = LoggerFactory.getLogger(DescriptorParserImpl.class);
+
@Override
public Iterable<Descriptor> parseDescriptors(byte[] rawDescriptorBytes,
File descriptorFile, String fileName) {
@@ -26,8 +33,7 @@ public class DescriptorParserImpl implements DescriptorParser {
return this.detectTypeAndParseDescriptors(rawDescriptorBytes,
descriptorFile, fileName);
} catch (DescriptorParseException e) {
- /* Looks like we attempted to parse the whole raw descriptor bytes at once
- * below and ran into a parse issue. */
+ log.debug("Cannot parse descriptor file ’{}’.", descriptorFile, e);
List<Descriptor> parsedDescriptors = new ArrayList<>();
parsedDescriptors.add(new UnparseableDescriptorImpl(rawDescriptorBytes,
new int[] { 0, rawDescriptorBytes.length }, descriptorFile, e));
@@ -124,6 +130,8 @@ public class DescriptorParserImpl implements DescriptorParser {
} else if (firstLines.startsWith("@type torperf 1.")) {
return TorperfResultImpl.parseTorperfResults(rawDescriptorBytes,
descriptorFile);
+ } else if (descriptorFile.getName().contains(LogDescriptorImpl.MARKER)) {
+ return LogDescriptorImpl.parse(rawDescriptorBytes, descriptorFile);
} else {
throw new DescriptorParseException("Could not detect descriptor "
+ "type in descriptor starting with '" + firstLines + "'.");
diff --git a/src/main/java/org/torproject/descriptor/index/package-info.java b/src/main/java/org/torproject/descriptor/index/package-info.java
index c685c63..021cbea 100644
--- a/src/main/java/org/torproject/descriptor/index/package-info.java
+++ b/src/main/java/org/torproject/descriptor/index/package-info.java
@@ -2,14 +2,12 @@
* See LICENSE for licensing information */
/**
- * <h1>This package is still in alpha stage.</h1>
- * <p>The public interface might still change in unexpected ways.</p>
+ * <h1>This package is part of the implementation not the public API.</h1>
+ * <p>The public interface might change in unexpected ways.</p>
*
* <p>Interfaces and essential classes for obtaining and processing
* CollecTor's index.json file.</p>
*
- * <p>Interfaces and classes make the content of index.json available.</p>
- *
*
* @since 1.4.0
*/
diff --git a/src/main/java/org/torproject/descriptor/internal/FileType.java b/src/main/java/org/torproject/descriptor/internal/FileType.java
index 36b5df8..353f0bb 100644
--- a/src/main/java/org/torproject/descriptor/internal/FileType.java
+++ b/src/main/java/org/torproject/descriptor/internal/FileType.java
@@ -12,6 +12,8 @@ import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
@@ -43,6 +45,8 @@ public enum FileType {
/**
* Returns <code>valueOf</code> or the default enum {@link #PLAIN}, i.e.,
* this method doesn't throw any exceptions and allways returns a valid enum.
+ *
+ * @since 2.1.0
*/
public static FileType findType(String ext) {
FileType res = null;
@@ -54,16 +58,61 @@ public enum FileType {
}
}
- /** Return the appropriate input stream. */
+ /**
+ * Return the appropriate input stream.
+ *
+ * @since 1.4.0
+ */
public InputStream inputStream(InputStream is) throws Exception {
return this.inClass.getConstructor(new Class[]{InputStream.class})
.newInstance(is);
}
- /** Return the appropriate output stream. */
+ /**
+ * Return the appropriate output stream.
+ *
+ * @since 1.4.0
+ */
public OutputStream outputStream(OutputStream os) throws Exception {
return this.outClass.getConstructor(new Class[]{OutputStream.class})
.newInstance(os);
}
+
+ /**
+ * Compresses the given bytes in memory and returns the compressed bytes.
+ *
+ * @since 2.2.0
+ */
+ public byte[] compress(byte[] bytes) throws Exception {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ try (OutputStream os = this.outputStream(baos)) {
+ os.write(bytes);
+ os.flush();
+ }
+ return baos.toByteArray();
+ }
+
+  /**
+   * Decompresses the given bytes in memory and returns the decompressed bytes.
+   *
+   * <p>An empty input array is returned as is, without creating a
+   * decompressing stream.</p>
+   *
+   * @param bytes the compressed bytes
+   * @return the decompressed bytes
+   * @throws Exception if the decompressing stream cannot be created or
+   *     reading from it fails
+   *
+   * @since 2.2.0
+   */
+  public byte[] decompress(byte[] bytes) throws Exception {
+    if (0 == bytes.length) {
+      return bytes;
+    }
+    try (InputStream is
+        = this.inputStream(new ByteArrayInputStream(bytes));
+        ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+      /* End of stream is signalled by -1 only.  Testing for "> 0" on single
+       * bytes would stop at the first 0x00 byte and truncate the output. */
+      byte[] buffer = new byte[8192];
+      int read = is.read(buffer);
+      while (read != -1) {
+        baos.write(buffer, 0, read);
+        read = is.read(buffer);
+      }
+      baos.flush();
+      return baos.toByteArray();
+    }
+  }
+
}
diff --git a/src/main/java/org/torproject/descriptor/internal/package-info.java b/src/main/java/org/torproject/descriptor/internal/package-info.java
index 5bc7bcb..b845921 100644
--- a/src/main/java/org/torproject/descriptor/internal/package-info.java
+++ b/src/main/java/org/torproject/descriptor/internal/package-info.java
@@ -2,11 +2,13 @@
* See LICENSE for licensing information */
/**
- * <h1>This package is part of the implementation not the public API.</h1>
- * <p>The public interface might change in unexpected ways.</p>
+ * Interfaces and essential classes for obtaining and processing
+ * descriptors.
+ *
+ * <p><strong>This package is part of the implementation not the
+ * public API.</strong></p>
*
- * <p>Interfaces and essential classes for obtaining and processing
- * descriptors.</p>
+ * <p>The public interface might change in unexpected ways.</p>
*
* @since 2.1.0
*/
diff --git a/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java b/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java
new file mode 100644
index 0000000..3c0039b
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java
@@ -0,0 +1,63 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.LogDescriptor;
+
+/**
+ * This interface provides methods for internal use only.
+ *
+ * @since 2.2.0
+ */
+public interface InternalLogDescriptor extends LogDescriptor {
+
+ /** Logfile name parts separator. */
+ public static final String SEP = "_";
+
+ /**
+ * Validate log lines.
+ *
+ * @since 2.2.0
+ */
+ public void validate() throws DescriptorParseException;
+
+ /**
+ * Set the <code>Validator</code> that will perform the validation on log
+ * lines.
+ *
+ * <p>Usually set by the implementing class.</p>
+ *
+ * @since 2.2.0
+ */
+ public void setValidator(Validator validator);
+
+ /**
+ * Set the descriptor's bytes.
+ *
+ * @since 2.2.0
+ */
+ public void setRawDescriptorBytes(byte[] bytes);
+
+ /** Return the descriptor's preferred compression. */
+ public String getCompressionType();
+
+ /**
+ * Provides a single function for validating a single log line.
+ *
+ * @since 2.2.0
+ */
+ public interface Validator {
+
+ /**
+ * Verifies a log line.
+ *
+ * @since 2.2.0
+ */
+ public boolean validate(String line);
+
+ }
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java b/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java
new file mode 100644
index 0000000..540f25d
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java
@@ -0,0 +1,17 @@
+/* Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+/**
+ * This interface provides methods for internal use only.
+ *
+ * @since 2.2.0
+ */
+public interface InternalWebServerAccessLog extends InternalLogDescriptor {
+
+ /** The log's name should include this string. */
+ public static final String MARKER = "access.log";
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java b/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java
new file mode 100644
index 0000000..97854e4
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java
@@ -0,0 +1,163 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import org.torproject.descriptor.Descriptor;
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.LogDescriptor;
+import org.torproject.descriptor.internal.FileType;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Base class for log descriptors.
+ *
+ * @since 2.2.0
+ */
+public abstract class LogDescriptorImpl
+ implements LogDescriptor, InternalLogDescriptor {
+
+ /** The log's file name should contain this string. */
+ public static final String MARKER = ".log";
+
+ private static final int unrecognizedLinesLimit = 3;
+
+ private static final Logger log
+ = LoggerFactory.getLogger(LogDescriptorImpl.class);
+
+ private static Pattern filenamePattern = Pattern.compile(
+ "(?:\\S*)" + MARKER + SEP + "(?:[0-9a-zA-Z]*)(?:\\.?)([a-zA-Z2]*)");
+
+ private final File descriptorFile;
+
+ /** Byte array for plain, i.e. uncompressed, log data. */
+ private byte[] logBytes;
+
+ private FileType fileType;
+
+ private List<String> unrecognizedLines = new ArrayList<>();
+
+ private Validator validator = (String line) -> true;
+
+ /**
+ * This constructor performs basic operations on the given bytes.
+ *
+ * <p>An unknown compression type (see {@link #getCompressionType})
+ * is interpreted as missing compression. In this case the bytes
+ * will be compressed to the given compression type.</p>
+ *
+ * @since 2.2.0
+ */
+ protected LogDescriptorImpl(byte[] logBytes, File descriptorFile,
+ FileType defaultCompression) throws DescriptorParseException {
+ this.logBytes = logBytes;
+ this.descriptorFile = descriptorFile;
+ try {
+ Matcher mat = filenamePattern.matcher(descriptorFile.getName());
+ if (!mat.find()) {
+ throw new DescriptorParseException(
+ "Log file name doesn't comply to standard: " + descriptorFile);
+ }
+ this.fileType = FileType.findType(mat.group(1).toUpperCase());
+ if (FileType.PLAIN == this.fileType) {
+ this.fileType = defaultCompression;
+ } else {
+ this.logBytes = this.fileType.decompress(this.logBytes);
+ }
+ } catch (Exception ex) {
+ throw new DescriptorParseException("Cannot parse file "
+ + descriptorFile.getName(), ex);
+ }
+ }
+
+ @Override
+ public void validate() throws DescriptorParseException {
+ try (BufferedReader br
+ = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(
+ this.logBytes)))) {
+ this.unrecognizedLines.addAll(br.lines().parallel().filter((line)
+ -> null != line && !line.isEmpty() && !validator.validate(line))
+ .limit(unrecognizedLinesLimit).collect(Collectors.toList()));
+ } catch (Exception ex) {
+ throw new DescriptorParseException("Cannot validate log lines.", ex);
+ }
+ }
+
+ /**
+ * Assemble a LogDescriptor.
+ *
+ * @since 2.2.0
+ */
+ public static List<Descriptor> parse(byte[] logBytes,
+ File descriptorFile) throws DescriptorParseException {
+ if (descriptorFile.getName().contains(InternalWebServerAccessLog.MARKER)) {
+ return Arrays.asList(new Descriptor[]{
+ new WebServerAccessLogImpl(logBytes, descriptorFile)});
+ } else {
+ throw new DescriptorParseException("Cannot parse file "
+ + descriptorFile.getName());
+ }
+ }
+
+ public static byte[] collectionToBytes(Collection<String> lines) {
+ return lines.stream().collect(Collectors.joining("\n", "", "\n"))
+ .getBytes();
+ }
+
+ @Override
+ public void setValidator(Validator validator) {
+ this.validator = validator;
+ }
+
+ @Override
+ public String getCompressionType() {
+ return this.fileType.name().toLowerCase();
+ }
+
+ @Override
+ public byte[] getRawDescriptorBytes() {
+ return this.logBytes;
+ }
+
+ @Override
+ public void setRawDescriptorBytes(byte[] bytes) {
+ this.logBytes = bytes;
+ }
+
+ @Override
+ public int getRawDescriptorLength() {
+ return this.logBytes.length;
+ }
+
+ @Override
+ public List<String> getAnnotations() {
+ return Collections.emptyList();
+ }
+
+ @Override
+ public List<String> getUnrecognizedLines() {
+ return this.unrecognizedLines;
+ }
+
+ @Override
+ public File getDescriptorFile() {
+ return descriptorFile;
+ }
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java
new file mode 100644
index 0000000..6708c3a
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java
@@ -0,0 +1,119 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.WebServerAccessLog;
+import org.torproject.descriptor.internal.FileType;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.Collection;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Implementation of web server access log descriptors.
+ *
+ * <p>Defines sanitization and validation for web server access logs.</p>
+ *
+ * @since 2.2.0
+ */
+public class WebServerAccessLogImpl extends LogDescriptorImpl
+ implements InternalWebServerAccessLog, WebServerAccessLog {
+
+ private static final Logger log
+ = LoggerFactory.getLogger(WebServerAccessLogImpl.class);
+
+ /** The log's name should include this string. */
+ public static final String MARKER = InternalWebServerAccessLog.MARKER;
+
+ /** The mandatory web server log descriptor file name pattern. */
+ public static final Pattern filenamePattern
+ = Pattern.compile("(\\S*)" + SEP + "(\\S*)" + SEP + "" + MARKER
+ + SEP + "(\\d*)(?:\\.?)([a-zA-Z]*)");
+
+ private final String physicalHost;
+
+ private final String virtualHost;
+
+ private final LocalDate logDate;
+
+ /**
+ * Creates a WebServerAccessLog from the given bytes and filename.
+ *
+ * <p>The given bytes are read, whereas the file is not read.</p>
+ *
+ * <p>The name of the given file has to comply with the following
+ * naming pattern
+ * {@code
+ * <virtualHost>_<physicalHost>_access.log_<yyyymmdd>.<compression>},
+ * where an unknown compression type (see {@link #getCompressionType})
+ * is interpreted as missing compression. In this case the bytes
+ * will be compressed to the default compression type.
+ * Virtual host, physical host, and log date are all taken from the
+ * file name.</p>
+ */
+ protected WebServerAccessLogImpl(byte[] logBytes, File file)
+ throws DescriptorParseException {
+ this(logBytes, file, FileType.XZ);
+ }
+
+ /** For internal use only. */
+ public WebServerAccessLogImpl(Collection<String> lines, String filename)
+ throws DescriptorParseException {
+ this(LogDescriptorImpl.collectionToBytes(lines), new File(filename));
+ }
+
+ private WebServerAccessLogImpl(byte[] logBytes, File file,
+ FileType defaultCompression) throws DescriptorParseException {
+ super(logBytes, file, defaultCompression);
+ try {
+ String fn = file.toPath().getFileName().toString();
+ Matcher mat = filenamePattern.matcher(fn);
+ if (!mat.find()) {
+ throw new DescriptorParseException(
+ "WebServerAccessLog file name doesn't comply to standard: " + fn);
+ }
+ this.virtualHost = mat.group(1);
+ this.physicalHost = mat.group(2);
+ if (null == this.virtualHost || null == this.physicalHost
+ || this.virtualHost.isEmpty() || this.physicalHost.isEmpty()) {
+ throw new DescriptorParseException(
+ "WebServerAccessLog file name doesn't comply to standard: " + fn);
+ }
+ String ymd = mat.group(3);
+ this.logDate = LocalDate.parse(ymd, DateTimeFormatter.BASIC_ISO_DATE);
+ this.setValidator((line)
+ -> WebServerAccessLogLine.makeLine(line).isValid());
+ this.validate();
+ } catch (DescriptorParseException dpe) {
+ throw dpe; // escalate
+ } catch (Exception pe) {
+ throw new DescriptorParseException(
+ "Cannot parse WebServerAccessLog file: " + file, pe);
+ }
+ }
+
+ @Override
+ public String getPhysicalHost() {
+ return this.physicalHost;
+ }
+
+ @Override
+ public String getVirtualHost() {
+ return this.virtualHost;
+ }
+
+ @Override
+ public LocalDate getLogDate() {
+ return this.logDate;
+ }
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java
new file mode 100644
index 0000000..ab20dd2
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java
@@ -0,0 +1,135 @@
+/* Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.LocalDate;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.Locale;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Represents one line of a web server access log and gives access to its
+ * parsed parts.
+ *
+ * <p>Instances are obtained from {@link #makeLine(String)}.  A line that
+ * cannot be parsed yields an instance for which {@link #isValid()} returns
+ * {@code false}.</p>
+ */
+public class WebServerAccessLogLine {
+
+  private static final Logger log = LoggerFactory
+      .getLogger(WebServerAccessLogLine.class);
+
+  private static final String DATE_PATTERN = "dd/MMM/yyyy";
+
+  /* Both formatters are pinned to an English locale.  Without an explicit
+   * locale "MMM" is resolved against the JVM's default locale, so month
+   * abbreviations such as "May" or "Oct" (the form expected in access logs)
+   * would fail to parse, or be written differently, on non-English
+   * systems. */
+  private static final DateTimeFormatter dateTimeFormatter
+      = DateTimeFormatter.ofPattern(DATE_PATTERN + ":HH:mm:ss xxxx",
+      Locale.US);
+
+  /** Formatter for {@link #getDateString()}, created once and reused. */
+  private static final DateTimeFormatter dateFormatter
+      = DateTimeFormatter.ofPattern(DATE_PATTERN, Locale.US);
+
+  private static final Pattern logLinePattern = Pattern.compile(
+      "^((?:\\d{1,3}\\.){3}\\d{1,3}) (\\S+) (\\S+) "
+      + "\\[([\\w/]+)([\\w:]+)(\\s[+\\-]\\d{4})\\] "
+      + "\"([A-Z]+) ([^\"]+) ([A-Z]+/\\d\\.\\d)\" "
+      + "(\\d{3}) (\\d+|-)(.*)");
+
+  private String ip;
+  private int response;
+  private String request;
+  private String method;
+  private String dateString;
+  private LocalDate date;
+  private String protocol;
+  /* Never null, so that toString() is safe on lines that did not parse. */
+  private Optional<Long> size = Optional.empty();
+  private boolean valid = false;
+  private String type;
+
+  /** Returns a log line string. Possibly empty. */
+  public String toLogString() {
+    if (!this.valid) {
+      return "";
+    }
+    return toString();
+  }
+
+  /**
+   * Returns the line in access log notation with the time of day fixed to
+   * {@code 00:00:00 +0000} and a missing size written as {@code -}.
+   */
+  @Override
+  public String toString() {
+    return String.format("%s - - [%s:00:00:00 +0000] \"%s %s %s\" %d %s",
+        this.ip, this.dateString, this.method, this.request, this.type,
+        this.response, this.size.map(String::valueOf).orElse("-"));
+  }
+
+  /** Returns the string of the date using 'dd/MMM/yyyy' format. */
+  public String getDateString() {
+    return dateString;
+  }
+
+  /** Returns a string containing the ip. */
+  public String getIp() {
+    return this.ip;
+  }
+
+  /** Only used internally during sanitization. */
+  public void setIp(String ip) {
+    this.ip = ip;
+  }
+
+  /** Returns the request method, i.e., the first word of the request. */
+  public String getMethod() {
+    return this.method;
+  }
+
+  /** Returns the protocol given at the end of the request. */
+  public String getProtocol() {
+    return this.protocol;
+  }
+
+  /** Returns the part of the request between method and protocol. */
+  public String getRequest() {
+    return this.request;
+  }
+
+  /** Returns the three-digit response code. */
+  public int getResponse() {
+    return this.response;
+  }
+
+  /** Only used internally during sanitization. */
+  public void setRequest(String request) {
+    this.request = request;
+  }
+
+  /** Returns the date of the request after conversion to UTC. */
+  public LocalDate getDate() {
+    return this.date;
+  }
+
+  /** Returns whether the line could be parsed. */
+  public boolean isValid() {
+    return this.valid;
+  }
+
+  /**
+   * Creates a Line from a string.
+   *
+   * <p>This method does not throw on malformed input: a line that does not
+   * match the expected format, or whose numbers or date cannot be parsed,
+   * results in an instance that is not valid.</p>
+   */
+  public static WebServerAccessLogLine makeLine(String line) {
+    WebServerAccessLogLine res = new WebServerAccessLogLine();
+    try {
+      Matcher mat = logLinePattern.matcher(line);
+      if (mat.find()) {
+        res.response = Integer.parseInt(mat.group(10));
+        res.method = mat.group(7);
+        res.protocol = mat.group(9);
+        String dateTimeString = mat.group(4) + mat.group(5) + mat.group(6);
+        res.date = ZonedDateTime.parse(dateTimeString,
+            dateTimeFormatter).withZoneSameInstant(ZoneOffset.UTC)
+            .toLocalDate();
+        res.dateString = res.date.format(dateFormatter);
+        res.ip = mat.group(1);
+        res.request = mat.group(8);
+        res.type = mat.group(9);
+        if ("-".equals(mat.group(11))) {
+          res.size = Optional.empty();
+        } else {
+          /* Parsed as long, because responses of 2 GiB and more do not fit
+           * into an int and would otherwise make the line invalid. */
+          res.size = Optional.of(Long.valueOf(mat.group(11)));
+        }
+        res.valid = true;
+      }
+    } catch (RuntimeException ex) {
+      /* Covers null input as well as number and date parse failures.  JVM
+       * errors are deliberately not caught here. */
+      log.debug("Unmatchable line: '{}'.", line, ex);
+      return new WebServerAccessLogLine();
+    }
+    return res;
+  }
+
+}
diff --git a/src/main/java/org/torproject/descriptor/log/package-info.java b/src/main/java/org/torproject/descriptor/log/package-info.java
new file mode 100644
index 0000000..68bcfa1
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/package-info.java
@@ -0,0 +1,14 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+/**
+ * <h1>This package is part of the implementation not the public API.</h1>
+ * <p>The public interface might change in unexpected ways.</p>
+ *
+ * <p>Interfaces and essential classes for obtaining and processing
+ * log descriptors.</p>
+ *
+ * @since 2.2.0
+ */
+package org.torproject.descriptor.log;
+
diff --git a/src/main/java/org/torproject/descriptor/package-info.java b/src/main/java/org/torproject/descriptor/package-info.java
index 0410bac..d844d40 100644
--- a/src/main/java/org/torproject/descriptor/package-info.java
+++ b/src/main/java/org/torproject/descriptor/package-info.java
@@ -65,9 +65,11 @@
* connected to the Tor network rather than by the Tor software. This
* group comprises descriptors by the bridge distribution service BridgeDB
* ({@link org.torproject.descriptor.BridgePoolAssignment}), the exit list
- * service TorDNSEL ({@link org.torproject.descriptor.ExitList}), and the
+ * service TorDNSEL ({@link org.torproject.descriptor.ExitList}), the
* performance measurement service Torperf
- * ({@link org.torproject.descriptor.TorperfResult}).</li>
+ * ({@link org.torproject.descriptor.TorperfResult}), and sanitized access logs
+ * of Tor's web servers
+ * ({@link org.torproject.descriptor.WebServerAccessLog}).</li>
* </ol>
*
* @since 1.0.0
diff --git a/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java
new file mode 100644
index 0000000..b12cfc0
--- /dev/null
+++ b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java
@@ -0,0 +1,178 @@
+
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.torproject.descriptor.Descriptor;
+import org.torproject.descriptor.DescriptorParser;
+import org.torproject.descriptor.DescriptorReader;
+import org.torproject.descriptor.DescriptorSourceFactory;
+import org.torproject.descriptor.LogDescriptor;
+import org.torproject.descriptor.UnparseableDescriptor;
+import org.torproject.descriptor.WebServerAccessLog;
+
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+
+@RunWith(Parameterized.class)
+public class LogDescriptorTest {
+
+ /** Temporary folder containing all files for this test. */
+ @Rule
+ public TemporaryFolder temp = new TemporaryFolder();
+
+  /** Directory containing the input log file of the current test run. */
+ protected File indir;
+
+ /** Descriptor reader used in this test. */
+ protected DescriptorReader reader
+ = DescriptorSourceFactory.createDescriptorReader();
+
+ protected int size;
+ protected String[] pan;
+ protected Class<LogDescriptor> type;
+ protected boolean isDecompressionTest;
+
+  /** Test data: decompression flag, size, file details, descriptor type. */
+ @Parameters
+ public static Collection<Object[]> pathAndName() {
+ return Arrays.asList(new Object[][] {
+ {Boolean.TRUE, 1878, new String[]{"meronense.torproject.org",
+ "metrics.torproject.org_meronense.torproject.org_access.log"
+ + "_20170530.gz",
+ "metrics.torproject.org", "20170530", "gz"},
+ WebServerAccessLog.class},
+ {Boolean.FALSE, 1878, new String[]{"meronense.torproject.org",
+ "xy.host.org_meronense.torproject.org_access.log_20170530.log",
+ "metrics.torproject.org", "20170530", "xz"},
+ WebServerAccessLog.class},
+ {Boolean.TRUE, 70730, new String[]{"archeotrichon.torproject.org",
+ "archive.torproject.org_archeotrichon.torproject.org_access.log_"
+ + "20151007.xz",
+ "archive.torproject.org", "20151007", "xz"},
+ WebServerAccessLog.class},
+ {Boolean.TRUE, 0, new String[]{"dummy.host.net",
+ "nix.server.org_dummy.host.net_access.log_20111111.bz2",
+ "nix.server.org", "20111111", "bz2"},
+ WebServerAccessLog.class}});
+ }
+
+  /** Stores one parameter set as produced by {@link #pathAndName()}. */
+  public LogDescriptorTest(boolean decompression, int size, String[] pan,
+      Class<LogDescriptor> type) {
+    this.isDecompressionTest = decompression;
+    this.size = size;
+    this.pan = pan;
+    this.type = type;
+  }
+
+  /**
+   * Creates a fresh input directory below the temporary folder and copies
+   * the test resource {@code pan[0]/pan[1]} into a sub-directory named
+   * {@code pan[0]}.
+   *
+   * @throws IOException if the folder cannot be created or the resource
+   *     cannot be copied.
+   */
+  private void createTemporaryFolderAndContents() throws IOException {
+    this.indir = this.temp.newFolder();
+    String path = this.pan[0];
+    String name = this.pan[1];
+    File logdir = new File(indir, path);
+    logdir.mkdir();
+    File accessLogFile = new File(logdir, name);
+    // Files.copy(InputStream, Path) leaves the stream open, so close it here.
+    try (InputStream in = getClass().getClassLoader()
+        .getResource(path + "/" + name).openStream()) {
+      Files.copy(in, accessLogFile.toPath());
+    }
+  }
+
+  /** Sets up the input directory and lets the reader consume all of it. */
+  @Before
+  public void readAll() throws IOException {
+    createTemporaryFolderAndContents();
+    Iterator<Descriptor> remaining
+        = this.reader.readDescriptors(this.indir).iterator();
+    while (remaining.hasNext()) {
+      remaining.next(); // descriptors are discarded; tests query the reader
+    }
+  }
+
+  /**
+   * Parses the single file recorded by the reader once more, this time
+   * directly with a descriptor parser, and returns all resulting descriptors.
+   */
+  protected List<Descriptor> retrieve() throws Exception {
+    assertEquals(1, this.reader.getParsedFiles().size());
+    File parsedFile = new File(this.reader.getParsedFiles().firstKey());
+    byte[] content = Files.readAllBytes(parsedFile.toPath());
+    DescriptorParser parser = DescriptorSourceFactory.createDescriptorParser();
+    List<Descriptor> result = new ArrayList<>();
+    for (Descriptor descriptor : parser.parseDescriptors(content, parsedFile,
+        parsedFile.getName())) {
+      result.add(descriptor);
+    }
+    return result;
+  }
+
+  /** Checks the parsed descriptor's type and its compression type string. */
+  @Test
+  public void testParsing() throws Exception {
+    Descriptor first = retrieve().get(0);
+    assertTrue("Wrong type. " + dataUsed(), first instanceof LogDescriptor);
+    InternalLogDescriptor logDescriptor = (InternalLogDescriptor) first;
+    assertEquals("Wrong compression type string. " + dataUsed(),
+        pan[4], logDescriptor.getCompressionType());
+  }
+
+  /** Renders the current parameter set for use in assertion messages. */
+  private String dataUsed() {
+    String parameters = Arrays.toString(pan);
+    return "Used data: " + parameters;
+  }
+
+  /**
+   * Replaces "access" by "-" in the parsed file's path and expects the
+   * parser to return an unparseable descriptor for the resulting file.
+   */
+  @Test
+  public void testUnknownLogType() throws Exception {
+    assertEquals(dataUsed(), 1, this.reader.getParsedFiles().size());
+    String parsedPath = this.reader.getParsedFiles().firstKey();
+    File logFile = new File(parsedPath);
+    DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
+    File invalidFile = new File(parsedPath.replace("access", "-"));
+    List<Descriptor> descs = new ArrayList<>();
+    // note: only 'invalidFile' is used by LogDescriptor
+    for (Descriptor desc
+        : dp.parseDescriptors(new byte[]{}, invalidFile, logFile.getName())) {
+      descs.add(desc);
+    }
+    Descriptor first = descs.get(0);
+    assertTrue(dataUsed() + "\nWrong type: "
+        + Arrays.toString(first.getClass().getInterfaces()),
+        first instanceof UnparseableDescriptor);
+  }
+
+  /**
+   * Damages the first three bytes of the test file and expects the parser to
+   * hand back an unparseable descriptor carrying exactly these damaged raw
+   * bytes.  Does nothing for parameter sets that are not decompression tests.
+   */
+  @Test
+  public void testCompressionInvalid() throws Exception {
+    if (!isDecompressionTest) {
+      return;
+    }
+    assertEquals(1, this.reader.getParsedFiles().size());
+    File logFile = new File(this.reader.getParsedFiles().firstKey());
+    byte[] raw = Files.readAllBytes(logFile.toPath());
+    // Overwrite each of the leading three bytes, not just the first one.
+    for (int i = 0; i < 3; i++) {
+      raw[i] = (byte) i;
+    }
+    DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
+    List<Descriptor> descs = new ArrayList<>();
+    for (Descriptor desc
+        : dp.parseDescriptors(raw, logFile, logFile.getName())) {
+      descs.add(desc);
+    }
+    assertTrue(dataUsed() + "\nWrong type: "
+        + Arrays.toString(descs.get(0).getClass().getInterfaces()),
+        (descs.get(0) instanceof UnparseableDescriptor));
+    assertArrayEquals(dataUsed(), raw, descs.get(0).getRawDescriptorBytes());
+  }
+}
+
diff --git a/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java
new file mode 100644
index 0000000..ec23b61
--- /dev/null
+++ b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java
@@ -0,0 +1,140 @@
+/* Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class WebServerAccessLogLineTest {
+
+ /** Test data structure:
+   * real log line, cleaned line, is valid.
+ */
+ @Parameters
+ public static Collection<Object[]> logData() {
+ return Arrays.asList(new Object[][] {
+ { "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] \"GET "
+ + "/collector/archive HTTP/1.1\" 301 -",
+ "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] \"GET "
+ + "/collector/archive HTTP/1.1\" 301 -", Boolean.TRUE},
+ { "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] \"GET "
+ + "/collector/archive HTTP/1.1\" 301 X \"ccc\"",
+ "", Boolean.FALSE},
+ { "123.98.100.23 xyz xyz [22/Jan/2018:01:20:03 +0000] \"GET "
+ + "/collector/archive HTTP/1.1\" 301 - xyz abc xxxXXXXXXXX",
+ "123.98.100.23 - - [22/Jan/2018:00:00:00 +0000] \"GET "
+ + "/collector/archive HTTP/1.1\" 301 -", Boolean.TRUE},
+ { "127.0.0.1 abc xyz [03/May/2017:06:07:08 +0000] "
+ + "\"GET /server-status HTTP/1.1\" 303 294 "
+ + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+ "127.0.0.1 - - [03/May/2017:00:00:00 +0000] \"GET /server-status"
+ + " HTTP/1.1\" 303 294", Boolean.TRUE},
+ { "127.0.0.1 abc xyz [03/May/2017:06:07:08 +0000] "
+ + "\"GET /server-status?auto HTTP/1.1\" 303 294 "
+ + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+ "127.0.0.1 - - [03/May/2017:00:00:00 +0000] \"GET /server-status"
+ + "?auto HTTP/1.1\" 303 294", Boolean.TRUE},
+ { "42.41.40.39 - - [04/May/2017:06:07:08 +0000] "
+ + "\"HEAD /server-status?auto HTTP/1.1\" 200 294 "
+ + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+ "42.41.40.39 - - [04/May/2017:00:00:00 +0000] \"HEAD /server-status"
+ + "?auto HTTP/1.1\" 200 294", Boolean.TRUE},
+ { "42.41.39 - - [04/May/2017:06:07:08 +0000] "
+ + "\"HEAD /server-status?auto HTTP/1.1\" 200 294 "
+ + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+ "", Boolean.FALSE},
+ { "42.41.40.1039 - - [04/May/2017:06:07:08 +0000] "
+ + "\"HEAD /server-status?auto HTTP/1.1\" 200 294 "
+ + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+ "", Boolean.FALSE},
+ { "42.41.40_039 - - [04/May/2017:06:07:08 +0000] "
+ + "\"HEAD /server-status?auto HTTP/1.1\" 200 294 "
+ + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+ "", Boolean.FALSE},
+ { "0.0.0.2 - - [05/May/2017:15:16:17 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+ + "404 536 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;"
+ + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;\"",
+ "0.0.0.2 - - [05/May/2017:00:00:00 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+ + "404 536",
+ Boolean.TRUE},
+ { "0.0.0.99 - - [05/June/2017:15:16:17 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico FTP/1.0\" "
+ + "300 536 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;"
+ + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;\"",
+ "", Boolean.FALSE},
+ { "0.0.0.99 - - [05/Jun/2017:15:16:17 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico FTP/1.0\" "
+ + "300 536 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;"
+ + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;\"",
+ "0.0.0.99 - - [05/Jun/2017:00:00:00 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico FTP/1.0\" 300 536",
+ Boolean.TRUE},
+ { "0.0.0.7 - - [06/May/2017:00:16:17 +0100] "
+ + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+ + "333 536 \"-\" \"Mozilla/4.0 (compatible; Opera 7.0; Windows 6.0;"
+ + " funky values ; \"",
+ "0.0.0.7 - - [05/May/2017:00:00:00 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+ + "333 536", Boolean.TRUE},
+ { "0.0.0.1 - - [07/Dec/2016:20:16:18 -1000] "
+ + "\"GET http://t3.torproject.org/?query=what HTTP/1.1\" "
+ + "200 777 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows 10;"
+ + " SLCC1; .NET CLR 2.0; Media Center PC 5.0; .NET CLR 3.5.2)\"",
+ "0.0.0.1 - - [08/Dec/2016:00:00:00 +0000] "
+ + "\"GET http://t3.torproject.org/?query=what HTTP/1.1\" 200 777",
+ Boolean.TRUE},
+ { "abcdefghijklmnop1234567890", "", Boolean.FALSE},
+ { "", "", Boolean.FALSE},
+ { "0.0.0.7 - - [06/May/2017:00:16:17 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+ + "333 536 \"-\" \"Mozilla/4.0 (compatible; Opera 7.0; Windows 8.0;",
+ "0.0.0.7 - - [06/May/2017:00:00:00 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+ + "333 536", Boolean.TRUE},
+ { "0.0.0.7 - - [06/May/2017:00:16:17 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+ + "333 536 \"-\" \"Mozilla/4.0 (compatible; Opera 7.0; Windows XT;",
+ "0.0.0.7 - - [06/May/2017:00:00:00 +0000] "
+ + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+ + "333 536", Boolean.TRUE},
+ { "0.0.0.0 - - [08/May/2017:00:00:00 +0000] "
+ + "\"GET /server-status HTTP/1.1\" 200 1294",
+ "0.0.0.0 - - [08/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1294", Boolean.TRUE}
+ });
+ }
+
+ @Parameter(0)
+ public String real;
+
+ @Parameter(1)
+ public String clean;
+
+ @Parameter(2)
+ public boolean valid;
+
+  /**
+   * Checks validity and cleaned form of the given line and, for valid lines
+   * with a non-empty cleaned form, that cleaning the cleaned line once more
+   * leaves it unchanged.
+   */
+  @Test
+  public void testValidation() {
+    final String message = "Failed on line: " + real;
+    WebServerAccessLogLine parsed = WebServerAccessLogLine.makeLine(real);
+    assertEquals(message, valid, parsed.isValid());
+    assertEquals(message, clean, parsed.toLogString());
+    if (!valid || "".equals(clean)) {
+      return;
+    }
+    // A cleaned, accepted line is valid.
+    WebServerAccessLogLine reparsed = WebServerAccessLogLine.makeLine(clean);
+    assertEquals("Failed on line: " + clean, clean, reparsed.toLogString());
+  }
+
+}
+
diff --git a/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java
new file mode 100644
index 0000000..3e98f13
--- /dev/null
+++ b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java
@@ -0,0 +1,94 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class WebServerAccessLogTest {
+
+ /** Test data structure: given line, cleaned line, valid, filename. */
+ @Parameters
+ public static Collection<Object[]> logData() {
+ return Arrays.asList(new Object[][] {
+ { "0.0.0.0 - - [20/Sep/2017:00:00:00 +0000] "
+ + "\"GET /fonts/WOFF/OTF/SourceSansPro-It.otf.woff HTTP/1.1\" "
+ + "200 50556 \"https://metrics.torproject.org/\" \"-\" -",
+ "0.0.0.0 - - [20/Sep/2017:00:00:00 +0000] \"GET "
+ + "/fonts/WOFF/OTF/SourceSansPro-It.otf.woff HTTP/1.1\" 200 50556\n",
+ Boolean.TRUE, "virt.host0_phys.host1a_access.log_20170920"},
+ { "127.0.0.1 qwer 123 [30/May/2017:06:07:08 +0000] "
+ + "\"GET /server-status?auto HTTP/1.1\" 333 294 "
+ + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET /server-status"
+ + " HTTP/1.1\" 333 294\n", Boolean.TRUE,
+ "virt.host1_phys.host2a_access.log_20170530"},
+ { "0.0.0.3 abc 567 [30/May/2017:06:07:08 +0000] "
+ + "\"GET /server-status?auto HTTP/1.1\" 333 294 "
+ + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+ "0.0.0.3 - - [30/May/2017:00:00:00 +0000] \"GET /server-status"
+ + " HTTP/1.1\" 333 294\n", Boolean.TRUE,
+ "virt-host1_phys.host2a_access.log_20170530"},
+ { "11.22.33.44 - - [30/Jul/2017:15:16:17 +0000] "
+ + "\"GET http://www.torproject.org/favicon.ico HTTP/1.1\" "
+ + "100 536 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;"
+ + " SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.5.2; "
+ + ".NET CLR 3.5.30729; .NET CLR 3.0.30618)\"",
+ "0.0.0.0 - - [30/Jul/2017:00:00:00 +0000] "
+ + "\"GET http://www.torproject.org/favicon.ico HTTP/1.1\" "
+ + "100 536\n", Boolean.TRUE,
+ "virt.host1_phys.host2b_access.log_20170730"},
+ { "abcdefghijklmnop1234567890", "", Boolean.FALSE,
+ "vhost1_phys.host2c_access.log_20170731.log"},
+ { "", "", Boolean.FALSE, "host2d_host1_access.log_20170731.log"},
+ { "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "
+ + "\"GET /server-status HTTP/1.1\" 200 1294 \"-\" \"-\" -",
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1294\n", Boolean.TRUE,
+ "some/other/path/virtual_physical_access.log_20170530.log"}
+ });
+ }
+
+ private String real;
+ private String clean;
+ private int count;
+ private boolean valid;
+ private String fn;
+ private File file;
+
+  /** Takes over one row of the test data returned by {@link #logData()}. */
+  public WebServerAccessLogTest(String in, String out, boolean valid,
+      String filename) {
+    this.fn = filename;
+    this.file = new File(filename);
+    this.real = in;
+    this.clean = out;
+    this.valid = valid;
+  }
+
+  /**
+   * Validates the given log content: valid input must leave no unrecognized
+   * lines, whereas non-empty invalid input must show up as the first
+   * unrecognized line.
+   */
+  @Test
+  public void testValidation() throws Exception {
+    WebServerAccessLogImpl accessLog
+        = new WebServerAccessLogImpl(real.getBytes(), file);
+    accessLog.validate();
+    if (valid) {
+      assertEquals(0, accessLog.getUnrecognizedLines().size());
+    } else if (!real.isEmpty()) {
+      assertEquals(real, accessLog.getUnrecognizedLines().get(0));
+    }
+  }
+
+}
+
diff --git a/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java b/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java
new file mode 100644
index 0000000..a11bc30
--- /dev/null
+++ b/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java
@@ -0,0 +1,113 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import static org.junit.Assert.assertEquals;
+
+import org.torproject.descriptor.DescriptorParseException;
+
+import org.hamcrest.Matchers;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+/** This class contains various tests for the webstats module. */
+public class WebServerModuleTest {
+
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
+
+ @Test
+ public void testWrongFormat() throws Exception {
+ String filename = "h1_phys1_access.log_no-date.log";
+ thrown.expect(DescriptorParseException.class);
+ thrown.expectMessage(Matchers
+ .containsString("Cannot parse WebServerAccessLog file: "
+ + filename));
+ new WebServerAccessLogImpl(new byte[0],
+ new File(filename));
+ }
+
+ @Test
+ public void testDateFormat() throws Exception {
+ String filename = "h2_phys2_access.log_05001713";
+ thrown.expect(DescriptorParseException.class);
+ thrown.expectMessage(Matchers
+ .containsString("Cannot parse WebServerAccessLog file: "
+ + filename));
+ new WebServerAccessLogImpl(new byte[0],
+ new File(filename));
+ }
+
+ @Test
+ public void testNoParentPathRoot() throws Exception {
+ String filename = "h3_access.log_05001213";
+ thrown.expect(DescriptorParseException.class);
+ thrown.expectMessage(Matchers
+ .containsString("WebServerAccessLog "
+ + "file name doesn't comply to standard: " + filename));
+ new WebServerAccessLogImpl(new byte[0],
+ new File(filename));
+ }
+
+ @Test
+ public void testNoParentPathThis() throws Exception {
+ String filename = "_h3_access.log_05001213";
+ thrown.expect(DescriptorParseException.class);
+ thrown.expectMessage(Matchers
+ .containsString("WebServerAccessLog "
+ + "file name doesn't comply to standard: " + filename));
+ new WebServerAccessLogImpl(new byte[0],
+ new File(filename));
+ }
+
+ @Test
+ public void testNoParentPathParent() throws Exception {
+ String filename = "h3__access.log_05001213";
+ thrown.expect(DescriptorParseException.class);
+ thrown.expectMessage(Matchers
+ .containsString("WebServerAccessLog "
+ + "file name doesn't comply to standard: " + filename));
+ new WebServerAccessLogImpl(new byte[0],
+ new File(filename));
+ }
+
+ private static String[] logLines = {
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1205",
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1203",
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1207",
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1204",
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1202",
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1206",
+ "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+ + "/server-status HTTP/1.1\" 200 1201"
+ };
+
+  /** All log lines, each extended by extra content, joined by newlines. */
+  private static String logText = Arrays.stream(logLines)
+      .map(line -> line + " some content")
+      .collect(Collectors.joining("\n"));
+
+  /**
+   * Creates an access log from the sample text and checks its annotations,
+   * its raw bytes, and the physical and virtual host names it reports for
+   * the file name {@code vhost_host7_access.log_20170530}.
+   */
+  @Test
+  public void testBasics() throws Exception {
+    WebServerAccessLogImpl wsal = new WebServerAccessLogImpl(logText.getBytes(),
+        new File("vhost_host7_access.log_20170530"));
+    // JUnit's assertEquals takes the expected value first.
+    assertEquals(0, wsal.getAnnotations().size());
+    assertEquals(logText, new String(wsal.getRawDescriptorBytes()));
+    assertEquals("host7", wsal.getPhysicalHost());
+    assertEquals("vhost", wsal.getVirtualHost());
+  }
+
+}
+
diff --git a/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz
new file mode 100644
index 0000000..b459742
Binary files /dev/null and b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz differ
diff --git a/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2 b/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2
new file mode 100644
index 0000000..17f335d
Binary files /dev/null and b/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2 differ
diff --git a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz
new file mode 100644
index 0000000..8c2333b
Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz differ
diff --git a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz
new file mode 100644
index 0000000..8c2333b
Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz differ
diff --git a/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log
new file mode 100644
index 0000000..eee478b
--- /dev/null
+++ b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log
@@ -0,0 +1,26 @@
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2873 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2877 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:39:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:42:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:49:50 +0000] "GET / HTTP/1.1" 200 868 "-" "Wget/1.15 (linux-gnu)"
+123.456.789.0 - - [30/May/2017:06:52:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2872 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:54:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)"
+123.456.789.0 - - [30/May/2017:06:56:54 +0000] "-" 408 0 "-" "-"
+123.456.789.0 - - [30/May/2017:06:57:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2876 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+
diff --git a/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log
new file mode 100644
index 0000000..eee478b
--- /dev/null
+++ b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log
@@ -0,0 +1,26 @@
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2873 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2877 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:39:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:42:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:49:50 +0000] "GET / HTTP/1.1" 200 868 "-" "Wget/1.15 (linux-gnu)"
+123.456.789.0 - - [30/May/2017:06:52:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2872 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:54:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)"
+123.456.789.0 - - [30/May/2017:06:56:54 +0000] "-" 408 0 "-" "-"
+123.456.789.0 - - [30/May/2017:06:57:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2876 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+
More information about the tor-commits
mailing list