[tor-commits] [metrics-tasks/master] Add code for web logs analysis (#20008).
karsten at torproject.org
karsten at torproject.org
Sun Aug 28 12:38:42 UTC 2016
commit ff63b91cdb0383b4c02ac6c82912a02e195c8b25
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Sun Aug 28 14:19:35 2016 +0200
Add code for web logs analysis (#20008).
---
task-20008/.gitignore | 9 ++
task-20008/LICENSE | 30 ++++++
task-20008/README.md | 46 ++++++++++
task-20008/run.sh | 8 ++
task-20008/src/Importer.java | 211 +++++++++++++++++++++++++++++++++++++++++++
task-20008/webstats.sql | 28 ++++++
6 files changed, 332 insertions(+)
diff --git a/task-20008/.gitignore b/task-20008/.gitignore
new file mode 100644
index 0000000..fb5bbfa
--- /dev/null
+++ b/task-20008/.gitignore
@@ -0,0 +1,9 @@
+/.classpath
+/.project
+/.vagrant/
+/Vagrantfile
+/bin/
+/lib/
+/log
+/webstats.torproject.org/
+
diff --git a/task-20008/LICENSE b/task-20008/LICENSE
new file mode 100644
index 0000000..9dbb66d
--- /dev/null
+++ b/task-20008/LICENSE
@@ -0,0 +1,30 @@
+Copyright 2016 The Tor Project
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+* Neither the names of the copyright owners nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/task-20008/README.md b/task-20008/README.md
new file mode 100644
index 0000000..5f7e254
--- /dev/null
+++ b/task-20008/README.md
@@ -0,0 +1,46 @@
+README: Perform an ad-hoc analysis of Tor's sanitized web logs
+==============================================================
+
+Sanitized versions of Tor's Apache web logs are available at
+https://webstats.torproject.org/. Let's perform an ad-hoc analysis of
+these logs to decide what graphs we'll want to put on Tor Metrics. And
+let's perform this analysis by throwing everything into a PostgreSQL
+database that we might later want to re-use for Tor Metrics.
+
+Steps to import sanitized web logs into the database:
+
+Create PostgreSQL database user webstats and database of same name:
+
+$ sudo -u postgres createuser -P webstats
+$ sudo -u postgres createdb -O webstats webstats
+
+Import database schema:
+
+$ psql -f webstats.sql webstats
+
+Fetch sanitized web logs and put them under webstats.torproject.org/:
+
+$ wget --recursive --reject "index.html*" --no-parent \
+ --accept "*201608*" https://webstats.torproject.org/
+
+Fetch the following required libraries and put them in the lib/
+folder:
+
+ - lib/commons-compress-1.9.jar
+ - lib/postgresql-jdbc3-9.2.jar
+ - lib/xz-1.5.jar
+
+Run the importer:
+
+$ ./run.sh
+
+Log into database and run a simple query:
+
+$ psql webstats
+
+webstats=> SELECT log_date, SUM(count) AS hits FROM requests
+ NATURAL JOIN resources NATURAL JOIN files
+ WHERE method = 'GET' AND response_code = 200
+ AND (resource_string = '/' OR resource_string LIKE '/index%')
+ AND site = 'www.torproject.org' GROUP BY log_date ORDER BY log_date;
+
diff --git a/task-20008/run.sh b/task-20008/run.sh
new file mode 100755
index 0000000..4039ee6
--- /dev/null
+++ b/task-20008/run.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+date
+rm -rf bin
+mkdir -p bin/
+javac -d bin/ -cp lib/commons-compress-1.9.jar:lib/postgresql-jdbc3-9.2.jar:lib/xz-1.5.jar src/Importer.java
+java -cp bin:lib/commons-compress-1.9.jar:lib/postgresql-jdbc3-9.2.jar:lib/xz-1.5.jar Importer webstats.torproject.org >> log
+date
+
diff --git a/task-20008/src/Importer.java b/task-20008/src/Importer.java
new file mode 100644
index 0000000..6834bf0
--- /dev/null
+++ b/task-20008/src/Importer.java
@@ -0,0 +1,211 @@
+/* Copyright 2016 The Tor Project
+ * See LICENSE for licensing information */
+
+import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.sql.Connection;
+import java.sql.Date;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class Importer {
+
+ public static void main(String[] args) throws Exception,
+ SQLException {
+ new Importer().importFiles(new File(args[0]));
+ }
+
+ private void importFiles(File directory) throws Exception,
+ SQLException {
+ this.connect();
+ List<File> files = new ArrayList<>();
+ files.add(directory);
+ while (!files.isEmpty()) {
+ File file = files.remove(0);
+ if (file.isDirectory()) {
+ files.addAll(Arrays.asList(file.listFiles()));
+ } else {
+ this.importFile(file);
+ }
+ }
+ this.disconnect();
+ }
+
+ private Connection connection;
+
+ PreparedStatement psFiles;
+
+ PreparedStatement psResourcesSelect;
+
+ PreparedStatement psResourcesInsert;
+
+ PreparedStatement psRequests;
+
+ void connect() throws SQLException {
+ this.connection = DriverManager.getConnection(
+ "jdbc:postgresql://localhost/webstats?user=vagrant&password=vagrant");
+ this.connection.setAutoCommit(false);
+ this.psFiles = this.connection.prepareStatement("INSERT INTO files "
+ + "(server, site, log_date) VALUES (?, ?, ?)",
+ Statement.RETURN_GENERATED_KEYS);
+ this.psResourcesSelect = this.connection.prepareStatement(
+ "SELECT resource_id FROM resources WHERE resource_string = ?");
+ this.psResourcesInsert = this.connection.prepareStatement(
+ "INSERT INTO resources (resource_string) VALUES (?)",
+ Statement.RETURN_GENERATED_KEYS);
+ this.psRequests = this.connection.prepareStatement("INSERT INTO requests "
+ + "(file_id, method, resource_id, response_code, count,"
+ + " total_bytes_sent) VALUES (?, CAST(? AS method), ?, ?, ?, ?)");
+ }
+
+ void disconnect() throws SQLException {
+ this.connection.close();
+ }
+
+ final Pattern fileNamePattern =
+ Pattern.compile("^(.+)-access.log-(\\d{8}).xz$");
+
+ DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd");
+
+ final Pattern logLinePattern = Pattern.compile("^0.0.0.[01] - - "
+ + "\\[\\d{2}/\\w{3}/\\d{4}:00:00:00 \\+0000\\] "
+ + "\"(GET|HEAD) ([^ ]+) HTTP[^ ]+\" (\\d+) (-|\\d+) \"-\" \"-\" -$");
+
+ private void importFile(File file) throws Exception { // TODO catch exceptions!
+ String server = file.getParentFile().getName();
+ if (file.getName().contains("-ssl-access.log-")) {
+ System.out.println("Skipping file: " + file.getAbsolutePath());
+ return;
+ }
+ Matcher fileNameMatcher = this.fileNamePattern.matcher(file.getName());
+ if (!fileNameMatcher.matches()) {
+ System.err.println("Invalid file name: " + file.getAbsolutePath());
+ return;
+ }
+ String site = fileNameMatcher.group(1);
+ long logDateMillis = this.dateFormat.parse(fileNameMatcher.group(2))
+ .getTime();
+ int fileId = writeFile(server, site, logDateMillis);
+ BufferedReader br = new BufferedReader(new InputStreamReader(
+ new XZCompressorInputStream(new FileInputStream(file))));
+ String line;
+ SortedMap<String, long[]> requests = new TreeMap<>();
+ while ((line = br.readLine()) != null) {
+ Matcher logLineMatcher = this.logLinePattern.matcher(line);
+ if (!logLineMatcher.matches()) {
+ System.err.println("Invalid line: " + line);
+ this.connection.rollback(); // TODO reconsider
+ br.close();
+ return;
+ }
+ String method = logLineMatcher.group(1);
+ String resource = this.truncateString(logLineMatcher.group(2), 2048);
+ String responseCodeString = logLineMatcher.group(3);
+ String combined = String.format("%s %s %s", method, resource,
+ responseCodeString);
+ long bytesSent = logLineMatcher.group(4).equals("-") ? 0L
+ : Long.parseLong(logLineMatcher.group(4));
+ long[] request = requests.get(combined);
+ if (request == null) {
+ request = new long[] { 1L, bytesSent };
+ } else {
+ request[0]++;
+ request[1] += bytesSent;
+ }
+ requests.put(combined, request);
+ }
+ for (Map.Entry<String, long[]> request : requests.entrySet()) {
+ String[] keyParts = request.getKey().split(" ");
+ String method = keyParts[0];
+ String resource = keyParts[1];
+ int responseCode = Integer.parseInt(keyParts[2]);
+ long[] valueParts = request.getValue();
+ int count = (int) valueParts[0];
+ long totalBytesSent = valueParts[1];
+ this.writeRequest(fileId, method, resource, responseCode, count,
+ totalBytesSent);
+ }
+ br.close();
+ this.connection.commit();
+ }
+
+ int writeFile(String server, String site, long logDateMillis)
+ throws Exception {
+ int fileId = -1;
+ this.psFiles.clearParameters();
+ server = this.truncateString(server, 32);
+ this.psFiles.setString(1, server);
+ site = this.truncateString(site, 128);
+ this.psFiles.setString(2, site);
+ this.psFiles.setDate(3, new Date(logDateMillis));
+ this.psFiles.execute();
+ ResultSet resultSet = psFiles.getGeneratedKeys();
+ if (resultSet.next()) {
+ fileId = resultSet.getInt(1);
+ }
+ resultSet.close();
+ return fileId;
+ }
+
+ void writeRequest(int fileId, String method, String resource,
+ int responseCode, int count, long totalBytesSent) throws Exception {
+ System.out.printf("Writing request to database: %d, %s, %s, %d, %d, %d%n",
+ fileId, method, resource, responseCode, count, totalBytesSent);
+ int resourceId = -1;
+ this.psResourcesSelect.clearParameters();
+ this.psResourcesSelect.setString(1, resource);
+ ResultSet rs = this.psResourcesSelect.executeQuery();
+ if (rs.next()) {
+ resourceId = rs.getInt(1);
+ } else {
+ /* There's a small potential for a race condition between the previous
+ * SELECT and this INSERT INTO, but that will be resolved by the UNIQUE
+ * constraint when committing the transaction. */
+ this.psResourcesInsert.clearParameters();
+ this.psResourcesInsert.setString(1, resource);
+ this.psResourcesInsert.execute();
+ ResultSet resultSet = psResourcesInsert.getGeneratedKeys();
+ if (resultSet.next()) {
+ resourceId = resultSet.getInt(1);
+ } else {
+ throw new Exception("Could not retrieve auto-generated key for new "
+ + "resources entry."); // TODO better error handling
+ }
+ resultSet.close();
+ }
+ this.psRequests.clearParameters();
+ this.psRequests.setInt(1, fileId);
+ this.psRequests.setString(2, method);
+ this.psRequests.setInt(3, resourceId);
+ this.psRequests.setInt(4, responseCode);
+ this.psRequests.setInt(5, count);
+ this.psRequests.setLong(6, totalBytesSent);
+ this.psRequests.execute();
+ }
+
+ private String truncateString(String originalString, int truncateAfter) {
+ if (originalString.length() > truncateAfter) {
+ System.err.println("String too long, truncating: " + originalString);
+ originalString = originalString.substring(0, truncateAfter);
+ }
+ return originalString;
+ }
+}
+
diff --git a/task-20008/webstats.sql b/task-20008/webstats.sql
new file mode 100644
index 0000000..91a36b6
--- /dev/null
+++ b/task-20008/webstats.sql
@@ -0,0 +1,28 @@
+-- Copyright 2016 The Tor Project
+-- See LICENSE for licensing information
+
+CREATE TYPE method AS ENUM ('GET', 'HEAD');
+
+CREATE TABLE files (
+ file_id serial PRIMARY KEY,
+ server character varying(32) NOT NULL,
+ site character varying(128) NOT NULL,
+ log_date date NOT NULL,
+ UNIQUE (server, site, log_date)
+);
+
+CREATE TABLE resources (
+ resource_id serial PRIMARY KEY,
+ resource_string character varying(2048) UNIQUE NOT NULL
+);
+
+CREATE TABLE requests (
+ file_id integer REFERENCES files (file_id),
+ method method NOT NULL,
+ resource_id integer REFERENCES resources (resource_id),
+ response_code smallint NOT NULL,
+ count integer NOT NULL,
+ total_bytes_sent bigint NOT NULL,
+ UNIQUE (file_id, method, resource_id, response_code)
+);
+
More information about the tor-commits
mailing list