[tor-commits] [metrics-web/master] Provide all aggregate statistics on the website.
karsten at torproject.org
karsten at torproject.org
Sun Dec 8 16:37:54 UTC 2013
commit 7a824614474316b6f85656987eb93a920e345bb5
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Sun Dec 8 17:30:55 2013 +0100
Provide all aggregate statistics on the website.
This commit adds an intermediate layer between aggregating data in
cronjobs and visualizing results in graphs and tables on the website. All
data that is graphed or otherwise presented now comes from 6 new .csv files
that are publicly available and not from the database that is only locally
available on the metrics server. A major advantage of this change is that
people can easily plot their own graphs or even develop a prettier metrics
website without writing their own data aggregation code.
---
db/tordir.sql | 122 ++++++
etc/web.xml | 18 +
rserve/csv.R | 291 ++++++--------
rserve/graphs.R | 404 ++++++++------------
rserve/rserve-init.R | 6 -
rserve/tables.R | 32 +-
.../ernie/web/research/ResearchStatsServlet.java | 132 +++++++
web/WEB-INF/banner.jsp | 6 +-
web/WEB-INF/error.jsp | 1 +
web/WEB-INF/stats.jsp | 288 ++++++++++++++
10 files changed, 857 insertions(+), 443 deletions(-)
diff --git a/db/tordir.sql b/db/tordir.sql
index 6b31aee..2a8533d 100644
--- a/db/tordir.sql
+++ b/db/tordir.sql
@@ -953,3 +953,125 @@ CREATE OR REPLACE FUNCTION refresh_all() RETURNS INTEGER AS $$
END;
$$ LANGUAGE plpgsql;
+-- View for exporting server statistics.
+CREATE VIEW stats_servers AS
+ (SELECT date, NULL AS flag, NULL AS country, NULL AS version,
+ NULL AS platform, TRUE AS ec2bridge, NULL AS relays,
+ avg_running_ec2 AS bridges FROM bridge_network_size
+ WHERE date < current_date - 1)
+UNION ALL
+ (SELECT COALESCE(network_size.date, bridge_network_size.date) AS date,
+ NULL AS flag, NULL AS country, NULL AS version, NULL AS platform,
+ NULL AS ec2bridge, network_size.avg_running AS relays,
+ bridge_network_size.avg_running AS bridges FROM network_size
+ FULL OUTER JOIN bridge_network_size
+ ON network_size.date = bridge_network_size.date
+ WHERE COALESCE(network_size.date, bridge_network_size.date) <
+ current_date - 1)
+UNION ALL
+ (SELECT date, 'Exit' AS flag, NULL AS country, NULL AS version,
+ NULL AS platform, NULL AS ec2bridge, avg_exit AS relays,
+ NULL AS bridges FROM network_size WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, 'Guard' AS flag, NULL AS country, NULL AS version,
+ NULL AS platform, NULL AS ec2bridge, avg_guard AS relays,
+ NULL AS bridges FROM network_size WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, 'Fast' AS flag, NULL AS country, NULL AS version,
+ NULL AS platform, NULL AS ec2bridge, avg_fast AS relays,
+ NULL AS bridges FROM network_size WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, 'Stable' AS flag, NULL AS country, NULL AS version,
+ NULL AS platform, NULL AS ec2bridge, avg_stable AS relays,
+ NULL AS bridges FROM network_size WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, 'HSDir' AS flag, NULL AS country, NULL AS version,
+ NULL AS platform, NULL AS ec2bridge, avg_hsdir AS relays,
+ NULL AS bridges FROM network_size WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, NULL AS flag, CASE WHEN country != 'zz' THEN country
+ ELSE '??' END AS country, NULL AS version, NULL AS platform,
+ NULL AS ec2bridge, relays, NULL AS bridges FROM relay_countries
+ WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, NULL AS flag, NULL AS country, version, NULL AS platform,
+ NULL AS ec2bridge, relays, NULL AS bridges FROM relay_versions
+ WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, NULL AS flag, NULL AS country, NULL AS version,
+ 'Linux' AS platform, NULL AS ec2bridge, avg_linux AS relays,
+ NULL AS bridges FROM relay_platforms WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, NULL AS flag, NULL AS country, NULL AS version,
+ 'Darwin' AS platform, NULL AS ec2bridge, avg_darwin AS relays,
+ NULL AS bridges FROM relay_platforms WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, NULL AS flag, NULL AS country, NULL AS version,
+ 'FreeBSD' AS platform, NULL AS ec2bridge, avg_bsd AS relays,
+ NULL AS bridges FROM relay_platforms WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, NULL AS flag, NULL AS country, NULL AS version,
+ 'Windows' AS platform, NULL AS ec2bridge, avg_windows AS relays,
+ NULL AS bridges FROM relay_platforms WHERE date < current_date - 1)
+UNION ALL
+ (SELECT date, NULL AS flag, NULL AS country, NULL AS version,
+ 'Other' AS platform, NULL AS ec2bridge, avg_other AS relays,
+ NULL AS bridges FROM relay_platforms WHERE date < current_date - 1)
+ORDER BY 1, 2, 3, 4, 5, 6;
+
+-- View for exporting bandwidth statistics.
+CREATE VIEW stats_bandwidth AS
+ (SELECT COALESCE(bandwidth_flags.date, bwhist_flags.date) AS date,
+ COALESCE(bandwidth_flags.isexit, bwhist_flags.isexit) AS isexit,
+ COALESCE(bandwidth_flags.isguard, bwhist_flags.isguard) AS isguard,
+ bandwidth_flags.bwadvertised AS advbw,
+ CASE WHEN bwhist_flags.read IS NOT NULL
+ THEN bwhist_flags.read / 86400 END AS bwread,
+ CASE WHEN bwhist_flags.written IS NOT NULL
+ THEN bwhist_flags.written / 86400 END AS bwwrite,
+ NULL AS dirread, NULL AS dirwrite
+ FROM bandwidth_flags FULL OUTER JOIN bwhist_flags
+ ON bandwidth_flags.date = bwhist_flags.date
+ AND bandwidth_flags.isexit = bwhist_flags.isexit
+ AND bandwidth_flags.isguard = bwhist_flags.isguard
+ WHERE COALESCE(bandwidth_flags.date, bwhist_flags.date) <
+ current_date - 3)
+UNION ALL
+ (SELECT COALESCE(total_bandwidth.date, total_bwhist.date, u.date)
+ AS date, NULL AS isexit, NULL AS isguard,
+ total_bandwidth.bwadvertised AS advbw,
+ CASE WHEN total_bwhist.read IS NOT NULL
+ THEN total_bwhist.read / 86400 END AS bwread,
+ CASE WHEN total_bwhist.written IS NOT NULL
+ THEN total_bwhist.written / 86400 END AS bwwrite,
+ CASE WHEN u.date IS NOT NULL
+ THEN FLOOR(CAST(u.dr AS NUMERIC) * CAST(u.brp AS NUMERIC) /
+ CAST(u.brd AS NUMERIC) / CAST(86400 AS NUMERIC)) END AS dirread,
+ CASE WHEN u.date IS NOT NULL
+ THEN FLOOR(CAST(u.dw AS NUMERIC) * CAST(u.bwp AS NUMERIC) /
+ CAST(u.bwd AS NUMERIC) / CAST(86400 AS NUMERIC)) END AS dirwrite
+ FROM total_bandwidth FULL OUTER JOIN total_bwhist
+ ON total_bandwidth.date = total_bwhist.date
+ FULL OUTER JOIN (SELECT * FROM user_stats WHERE country = 'zy'
+ AND bwp / bwd <= 3) u
+ ON COALESCE(total_bandwidth.date, total_bwhist.date) = u.date
+ WHERE COALESCE(total_bandwidth.date, total_bwhist.date, u.date) <
+ current_date - 3)
+ORDER BY 1, 2, 3;
+
+-- View for exporting torperf statistics.
+CREATE VIEW stats_torperf AS
+SELECT date, CASE WHEN source LIKE '%-50kb' THEN 50 * 1024
+ WHEN source LIKE '%-1mb' THEN 1024 * 1024
+ WHEN source LIKE '%-5mb' THEN 5 * 1024 * 1024 END AS size,
+ CASE WHEN source NOT LIKE 'all-%'
+ THEN split_part(source, '-', 1) END AS source, q1, md, q3, timeouts,
+ failures, requests FROM torperf_stats WHERE date < current_date - 1
+ ORDER BY 1, 2, 3;
+
+-- View for exporting connbidirect statistics.
+CREATE VIEW stats_connbidirect AS
+SELECT DATE(statsend) AS date, source, belownum AS below, readnum AS read,
+ writenum AS write, bothnum AS "both" FROM connbidirect
+ WHERE DATE(statsend) < current_date - 1 ORDER BY 1, 2;
+
diff --git a/etc/web.xml b/etc/web.xml
index 3f49001..866f427 100644
--- a/etc/web.xml
+++ b/etc/web.xml
@@ -128,6 +128,24 @@
</servlet-mapping>
<servlet>
+ <servlet-name>ResearchStats</servlet-name>
+ <servlet-class>
+ org.torproject.ernie.web.research.ResearchStatsServlet
+ </servlet-class>
+ <init-param>
+ <param-name>statsDir</param-name>
+ <param-value>
+ /srv/metrics.torproject.org/web/stats/
+ </param-value>
+ </init-param>
+ </servlet>
+ <servlet-mapping>
+ <servlet-name>ResearchStats</servlet-name>
+ <url-pattern>/stats/*</url-pattern>
+ <url-pattern>/stats.html</url-pattern>
+ </servlet-mapping>
+
+ <servlet>
<servlet-name>Status</servlet-name>
<servlet-class>
org.torproject.ernie.web.StatusServlet
diff --git a/rserve/csv.R b/rserve/csv.R
index 8150b17..2ec6e7b 100644
--- a/rserve/csv.R
+++ b/rserve/csv.R
@@ -1,223 +1,177 @@
options(scipen = 15)
export_networksize <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, avg_running AS relays FROM network_size",
- "WHERE date < current_date - 1")
- rs <- dbSendQuery(con, q)
- relays <- fetch(rs, n = -1)
- q <- paste("SELECT date, avg_running AS bridges",
- "FROM bridge_network_size WHERE date < current_date - 1")
- rs <- dbSendQuery(con, q)
- bridges <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- networksize <- rbind(melt(relays, "date"), melt(bridges, "date"))
- networksize <- cast(networksize, date ~ variable)
- networksize <- networksize[order(networksize$date), ]
- write.csv(networksize, path, quote = FALSE, row.names = FALSE)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$flag == '' & s$country == '' & s$version == '' &
+ s$platform == '' & s$ec2bridge == '',
+ c("date", "relays", "bridges")]
+ write.csv(s, path, quote = FALSE, row.names = FALSE)
}
export_cloudbridges <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, avg_running_ec2 AS cloudbridges",
- "FROM bridge_network_size WHERE date < current_date - 1",
- "ORDER BY date")
- rs <- dbSendQuery(con, q)
- cloudbridges <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$flag == '' & s$country == '' & s$version == '' &
+ s$platform == '' & s$ec2bridge == 't', ]
+ cloudbridges <- data.frame(date = s$date, cloudbridges = s$bridges)
write.csv(cloudbridges, path, quote = FALSE, row.names = FALSE)
}
export_relaycountries <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, country, relays FROM relay_countries",
- "WHERE date < current_date - 1 ORDER BY date, country")
- rs <- dbSendQuery(con, q)
- relays <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- write.csv(relays, path, quote = FALSE, row.names = FALSE)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$flag == '' & s$country != '' & s$version == '' &
+ s$platform == '' & s$ec2bridge == '',
+ c("date", "country", "relays")]
+ write.csv(s, path, quote = FALSE, row.names = FALSE)
}
export_versions <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, version, relays FROM relay_versions",
- "WHERE date < current_date - 1")
- rs <- dbSendQuery(con, q)
- versions <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- versions <- cast(versions, date ~ version, value = "relays")
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$flag == '' & s$country == '' & s$version != '' &
+ s$platform == '' & s$ec2bridge == '',
+ c("date", "version", "relays")]
+ versions <- cast(s, date ~ version, value = "relays")
versions <- versions[order(versions$date), ]
write.csv(versions, path, quote = FALSE, row.names = FALSE)
}
export_platforms <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, avg_linux AS linux, avg_darwin AS darwin,",
- "avg_bsd AS bsd, avg_windows AS windows, avg_other AS other",
- "FROM relay_platforms WHERE date < current_date - 1 ORDER BY date")
- rs <- dbSendQuery(con, q)
- platforms <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$flag == '' & s$country == '' & s$version == '' &
+ s$platform != '' & s$ec2bridge == '',
+ c("date", "platform", "relays")]
+ s <- data.frame(date = s$date,
+ platform = ifelse(s$platform == 'FreeBSD', 'bsd',
+ tolower(s$platform)), relays = s$relays)
+ s <- cast(s, date ~ platform, value = "relays")
+ platforms <- s[order(s$date), ]
write.csv(platforms, path, quote = FALSE, row.names = FALSE)
}
export_bandwidth <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, bwadvertised FROM total_bandwidth",
- "WHERE date < current_date - 3")
- rs <- dbSendQuery(con, q)
- bw_desc <- fetch(rs, n = -1)
- q <- paste("SELECT date, read, written FROM total_bwhist",
- "WHERE date < current_date - 3")
- rs <- dbSendQuery(con, q)
- bw_hist <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- bandwidth <- rbind(data.frame(date = bw_desc$date,
- value = bw_desc$bwadvertised, variable = "bwadv"),
- data.frame(date = bw_hist$date, value = floor((bw_hist$read +
- bw_hist$written) / (2 * 86400)), variable = "bwhist"))
- bandwidth <- cast(bandwidth, date ~ variable, value = "value")
- bandwidth <- bandwidth[order(bandwidth$date), ]
- write.csv(bandwidth, path, quote = FALSE, row.names = FALSE)
+ b <- read.csv("/srv/metrics.torproject.org/web/stats/bandwidth.csv",
+ stringsAsFactors = FALSE)
+ b <- b[b$isexit == '' & b$isguard == '', ]
+ b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+ bwadv = b$advbw,
+ bwhist = floor((b$bwread + b$bwwrite) / 2))
+ b <- b[order(b$date), ]
+ write.csv(b, path, quote = FALSE, row.names = FALSE)
}
export_bwhist_flags <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, isexit, isguard, read / 86400 AS read,",
- "written / 86400 AS written",
- "FROM bwhist_flags WHERE date < current_date - 3",
- "ORDER BY date, isexit, isguard")
- rs <- dbSendQuery(con, q)
- bw <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- write.csv(bw, path, quote = FALSE, row.names = FALSE)
+ b <- read.csv("/srv/metrics.torproject.org/web/stats/bandwidth.csv",
+ stringsAsFactors = FALSE)
+ b <- b[b$isexit != '' & b$isguard != '' & !is.na(b$bwread) &
+ !is.na(b$bwwrite), ]
+ b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+ isexit = b$isexit == 't', isguard = b$isguard == 't',
+ read = b$bwread, written = b$bwwrite)
+ write.csv(b, path, quote = FALSE, row.names = FALSE)
}
export_dirbytes <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, dr, dw, brp, bwp, brd, bwd FROM user_stats",
- "WHERE country = 'zy' AND bwp / bwd <= 3",
- "AND date < current_date - 3 ORDER BY date")
- rs <- dbSendQuery(con, q)
- dir <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- dir <- data.frame(date = dir$date,
- dirread = floor(dir$dr * dir$brp / dir$brd / 86400),
- dirwrite = floor(dir$dw * dir$bwp / dir$bwd / 86400))
- dir <- na.omit(dir)
- write.csv(dir, path, quote = FALSE, row.names = FALSE)
+ b <- read.csv("/srv/metrics.torproject.org/web/stats/bandwidth.csv",
+ stringsAsFactors = FALSE)
+ b <- b[b$isexit == '' & b$isguard == '' & !is.na(b$dirread) &
+ !is.na(b$dirwrite), ]
+ b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+ dirread = b$dirread, dirwrite = b$dirwrite)
+ b <- b[order(b$date), ]
+ write.csv(b, path, quote = FALSE, row.names = FALSE)
}
export_relayflags <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, avg_running AS running, avg_exit AS exit,",
- "avg_guard AS guard, avg_fast AS fast, avg_stable AS stable,",
- "avg_hsdir AS hsdir",
- "FROM network_size WHERE date < current_date - 1 ORDER BY date")
- rs <- dbSendQuery(con, q)
- relayflags <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$country == '' & s$version == '' & s$platform == '' &
+ s$ec2bridge == '', ]
+ s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"),
+ flag = ifelse(s$flag == '', 'running', tolower(s$flag)),
+ relays = s$relays)
+ s <- cast(s, date ~ flag, value = "relays")
+ relayflags <- s[order(s$date), ]
write.csv(relayflags, path, quote = FALSE, row.names = FALSE)
}
export_torperf <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT source, date, q1, md, q3 FROM torperf_stats",
- "WHERE date < current_date - 1 ORDER BY source, date")
- rs <- dbSendQuery(con, q)
- torperf <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ t <- read.csv("/srv/metrics.torproject.org/web/stats/torperf.csv",
+ stringsAsFactors = FALSE)
+ t <- data.frame(
+ source = paste(ifelse(t$source == '', 'all', t$source),
+ ifelse(t$size == 50 * 1024, '50kb',
+ ifelse(t$size == 1024 * 1024, '1mb', '5mb')),
+ sep = '-'),
+ date = as.Date(t$date, "%Y-%m-%d"),
+ q1 = t$q1, md = t$md, q3 = t$q3)
+ torperf <- t[order(t$source, t$date), ]
write.csv(torperf, path, quote = FALSE, row.names = FALSE)
}
export_torperf_failures <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT source, date, timeouts, failures, requests",
- "FROM torperf_stats WHERE date < current_date - 1",
- "ORDER BY source, date")
- rs <- dbSendQuery(con, q)
- torperf <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ t <- read.csv("/srv/metrics.torproject.org/web/stats/torperf.csv",
+ stringsAsFactors = FALSE)
+ t <- data.frame(
+ source = paste(ifelse(t$source == '', 'all', t$source),
+ ifelse(t$size == 50 * 1024, '50kb',
+ ifelse(t$size == 1024 * 1024, '1mb', '5mb')),
+ sep = '-'),
+ date = as.Date(t$date, "%Y-%m-%d"),
+ timeouts = t$timeouts, failures = t$failures, requests = t$requests)
+ torperf <- t[order(t$source, t$date), ]
write.csv(torperf, path, quote = FALSE, row.names = FALSE)
}
export_connbidirect <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT DATE(statsend) AS date, source, belownum AS below,",
- "readnum AS read, writenum AS write, bothnum AS \"both\"",
- "FROM connbidirect WHERE DATE(statsend) < current_date - 1",
- "ORDER BY 1, 2")
- rs <- dbSendQuery(con, q)
- c <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- write.csv(format(c, trim = TRUE, scientific = FALSE), path,
+ c <- read.csv("/srv/metrics.torproject.org/web/stats/connbidirect.csv",
+ stringsAsFactors = FALSE)
+ write.csv(format(c, trim = TRUE, scientific = FALSE), path,
quote = FALSE, row.names = FALSE)
}
export_bandwidth_flags <- function(path) {
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, isexit, isguard, bwadvertised AS value",
- "FROM bandwidth_flags WHERE date < current_date - 3")
- rs <- dbSendQuery(con, q)
- bw_desc <- fetch(rs, n = -1)
- q <- paste("SELECT date, isexit, isguard,",
- "(read + written) / (2 * 86400) AS value",
- "FROM bwhist_flags WHERE date < current_date - 3")
- rs <- dbSendQuery(con, q)
- bw_hist <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- bandwidth <- rbind(data.frame(bw_desc, type = "advbw"),
- data.frame(bw_hist, type = "bwhist"))
- bandwidth <- rbind(
- data.frame(bandwidth[bandwidth$isguard == TRUE, ], flag = "guard"),
- data.frame(bandwidth[bandwidth$isexit == TRUE, ], flag = "exit"))
- bandwidth <- aggregate(list(value = bandwidth$value),
- by = list(date = bandwidth$date, type = bandwidth$type,
- flag = bandwidth$flag), FUN = sum)
- write.csv(format(bandwidth, trim = TRUE, scientific = FALSE), path,
- quote = FALSE, row.names = FALSE)
+ b <- read.csv("/srv/metrics.torproject.org/web/stats/bandwidth.csv",
+ stringsAsFactors = FALSE)
+ b <- b[b$isexit != '' & b$isguard != '', ]
+ b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+ isexit = b$isexit == 't', isguard = b$isguard == 't',
+ advbw = b$advbw,
+ bwhist = floor((b$bwread + b$bwwrite) / 2))
+ b <- rbind(
+ data.frame(b[b$isguard == TRUE, ], flag = "guard"),
+ data.frame(b[b$isexit == TRUE, ], flag = "exit"))
+ b <- data.frame(date = b$date, advbw = b$advbw, bwhist = b$bwhist,
+ flag = b$flag)
+ b <- aggregate(list(advbw = b$advbw, bwhist = b$bwhist),
+ by = list(date = b$date, flag = b$flag), FUN = sum,
+ na.rm = TRUE, na.action = NULL)
+ b <- melt(b, id.vars = c("date", "flag"))
+ b <- data.frame(date = b$date, type = b$variable, flag = b$flag,
+ value = b$value)
+ b <- b[b$value > 0, ]
+ write.csv(b, path, quote = FALSE, row.names = FALSE)
}
export_userstats <- function(path) {
- u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
- "task-8462/userstats.csv", sep = ""),
+ c <- read.csv("/srv/metrics.torproject.org/web/stats/clients.csv",
stringsAsFactors = FALSE)
- write.csv(format(u, trim = TRUE, scientific = FALSE), path,
+ c <- data.frame(date = c$date, node = c$node, country = c$country,
+ transport = c$transport, version = c$version,
+ frac = c$frac, users = c$clients)
+ write.csv(format(c, trim = TRUE, scientific = FALSE), path,
quote = FALSE, row.names = FALSE)
}
help_export_monthly_userstats <- function(path, aggr_fun) {
- u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
- "task-8462/userstats.csv", sep = ""),
+ c <- read.csv("/srv/metrics.torproject.org/web/stats/clients.csv",
stringsAsFactors = FALSE)
- u <- u[u$country != '' & u$transport == '' & u$version == '',
- c("date", "country", "users")]
+ c <- c[c$country != '' & c$transport == '' & c$version == '', ]
+ u <- data.frame(date = c$date, country = c$country, users = c$clients,
+ stringsAsFactors = FALSE)
u <- aggregate(list(users = u$users),
by = list(date = u$date, country = u$country), sum)
u <- aggregate(list(users = u$users),
@@ -241,11 +195,12 @@ export_monthly_userstats_average <- function(path) {
}
export_userstats_detector <- function(path) {
- u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
- "task-8462/userstats.csv", sep = ""),
+ c <- read.csv("/srv/metrics.torproject.org/web/stats/clients.csv",
stringsAsFactors = FALSE)
- u <- u[u$country != '' & u$transport == '' & u$version == '' &
- u$node == 'relay', c("country", "date", "users")]
+ c <- c[c$country != '' & c$transport == '' & c$version == '' &
+ c$node == 'relay', ]
+ u <- data.frame(country = c$country, date = c$date, users = c$clients,
+ stringsAsFactors = FALSE)
u <- rbind(u, data.frame(country = "zy",
aggregate(list(users = u$users),
by = list(date = u$date), sum)))
diff --git a/rserve/graphs.R b/rserve/graphs.R
index 8157d89..b862584 100644
--- a/rserve/graphs.R
+++ b/rserve/graphs.R
@@ -279,34 +279,21 @@ date_breaks <- function(days) {
plot_networksize <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, avg_running AS relays FROM network_size ",
- "WHERE date >= '", start, "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- relays <- fetch(rs, n = -1)
- q <- paste("SELECT date, avg_running AS bridges ",
- "FROM bridge_network_size WHERE date >= '", start,
- "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- bridges <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$date >= start & s$date <= end & s$flag == '' &
+ s$country == '' & s$version == '' & s$platform == '' &
+ s$ec2bridge == '', ]
+ s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), relays = s$relays,
+ bridges = s$bridges)
dates <- seq(from = as.Date(start, "%Y-%m-%d"),
to = as.Date(end, "%Y-%m-%d"), by="1 day")
- missing <- setdiff(dates, as.Date(relays$date, origin = "1970-01-01"))
- if (length(missing) > 0)
- relays <- rbind(relays,
- data.frame(date = as.Date(missing, origin = "1970-01-01"),
- relays = NA))
- missing <- setdiff(dates, bridges$date)
+ missing <- setdiff(dates, as.Date(s$date, origin = "1970-01-01"))
if (length(missing) > 0)
- bridges <- rbind(bridges,
+ s <- rbind(s,
data.frame(date = as.Date(missing, origin = "1970-01-01"),
- bridges = NA))
- relays <- melt(relays, id = "date")
- bridges <- melt(bridges, id = "date")
- networksize <- rbind(relays, bridges)
+ relays = NA, bridges = NA))
+ networksize <- melt(s, id = "date")
date_breaks <- date_breaks(
as.numeric(max(as.Date(networksize$date, "%Y-%m-%d")) -
min(as.Date(networksize$date, "%Y-%m-%d"))))
@@ -326,61 +313,47 @@ plot_networksize <- function(start, end, path) {
plot_cloudbridges <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, avg_running_ec2 ",
- "FROM bridge_network_size WHERE date >= '", start,
- "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- bridges <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$date >= start & s$date <= end & s$flag == '' &
+ s$country == '' & s$version == '' & s$platform == '' &
+ s$ec2bridge == 't', ]
+ s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), bridges = s$bridges)
dates <- seq(from = as.Date(start, "%Y-%m-%d"),
to = as.Date(end, "%Y-%m-%d"), by="1 day")
- missing <- setdiff(dates, bridges$date)
+ missing <- setdiff(dates, s$date)
if (length(missing) > 0)
- bridges <- rbind(bridges,
+ s <- rbind(s,
data.frame(date = as.Date(missing, origin = "1970-01-01"),
- avg_running_ec2 = NA))
+ bridges = NA))
date_breaks <- date_breaks(
- as.numeric(max(as.Date(bridges$date, "%Y-%m-%d")) -
- min(as.Date(bridges$date, "%Y-%m-%d"))))
- ggplot(bridges, aes(x = as.Date(date, "%Y-%m-%d"),
- y = avg_running_ec2)) +
+ as.numeric(max(as.Date(s$date, "%Y-%m-%d")) -
+ min(as.Date(s$date, "%Y-%m-%d"))))
+ ggplot(s, aes(x = as.Date(date, "%Y-%m-%d"), y = bridges)) +
geom_line(size = 1, colour = "green3") +
scale_x_date(name = paste("\nThe Tor Project - ",
"https://metrics.torproject.org/", sep = ""),
format = date_breaks$format, major = date_breaks$major,
minor = date_breaks$minor) +
scale_y_continuous(name = "", limits = c(0,
- max(bridges$avg_running_ec2, na.rm = TRUE))) +
+ max(s$bridges, na.rm = TRUE))) +
opts(title = "Number of Tor Cloud bridges\n")
ggsave(filename = path, width = 8, height = 5, dpi = 72)
}
plot_relaycountries <- function(start, end, country, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- if (country == "all") {
- q <- paste("SELECT date, avg_running AS relays FROM network_size ",
- "WHERE date >= '", start, "' AND date <= '", end, "'", sep = "")
- } else {
- q <- paste("SELECT date, relays FROM relay_countries ",
- "WHERE date >= '", start, "' AND date <= '", end,
- "' AND country = '", country, "'", sep = "")
- }
- rs <- dbSendQuery(con, q)
- u <- fetch(rs, n = -1)
- if (length(u$date) == 0)
- u <- data.frame(date = as.Date(start), relays = 0)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$date >= start & s$date <= end & s$flag == '' &
+ s$country == ifelse(country == "all", '', country) &
+ s$version == '' & s$platform == '' & s$ec2bridge == '', ]
+ s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), relays = s$relays)
dates <- seq(from = as.Date(start, "%Y-%m-%d"),
to = as.Date(end, "%Y-%m-%d"), by="1 day")
- missing <- setdiff(dates, u$date)
+ missing <- setdiff(dates, s$date)
if (length(missing) > 0)
- u <- rbind(u,
+ s <- rbind(s,
data.frame(date = as.Date(missing, origin = "1970-01-01"),
relays = NA))
title <- ifelse(country == "all",
@@ -388,15 +361,15 @@ plot_relaycountries <- function(start, end, country, path) {
paste("Number of relays in ", countryname(country), "\n", sep = ""))
formatter <- function(x, ...) { format(x, scientific = FALSE, ...) }
date_breaks <- date_breaks(
- as.numeric(max(as.Date(u$date, "%Y-%m-%d")) -
- min(as.Date(u$date, "%Y-%m-%d"))))
- ggplot(u, aes(x = as.Date(date, "%Y-%m-%d"), y = relays)) +
+ as.numeric(max(as.Date(s$date, "%Y-%m-%d")) -
+ min(as.Date(s$date, "%Y-%m-%d"))))
+ ggplot(s, aes(x = as.Date(date, "%Y-%m-%d"), y = relays)) +
geom_line(size = 1) +
scale_x_date(name = paste("\nThe Tor Project - ",
"https://metrics.torproject.org/", sep = ""),
format = date_breaks$format, major = date_breaks$major,
minor = date_breaks$minor) +
- scale_y_continuous(name = "", limits = c(0, max(u$relays,
+ scale_y_continuous(name = "", limits = c(0, max(s$relays,
na.rm = TRUE)), formatter = formatter) +
opts(title = title)
ggsave(filename = path, width = 8, height = 5, dpi = 72)
@@ -404,20 +377,19 @@ plot_relaycountries <- function(start, end, country, path) {
plot_versions <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, version, relays FROM relay_versions ",
- "WHERE date >= '", start, "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- versions <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$date >= start & s$date <= end & s$flag == '' &
+ s$country == '' & s$version != '' & s$platform == '' &
+ s$ec2bridge == '', ]
+ s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), version = s$version,
+ relays = s$relays)
known_versions <- c("0.1.0", "0.1.1", "0.1.2", "0.2.0", "0.2.1",
"0.2.2", "0.2.3", "0.2.4")
colours <- data.frame(breaks = known_versions,
values = brewer.pal(length(known_versions), "Accent"),
stringsAsFactors = FALSE)
- versions <- versions[versions$version %in% known_versions, ]
+ versions <- s[s$version %in% known_versions, ]
visible_versions <- sort(unique(versions$version))
date_breaks <- date_breaks(
as.numeric(max(as.Date(versions$date, "%Y-%m-%d")) -
@@ -440,16 +412,13 @@ plot_versions <- function(start, end, path) {
plot_platforms <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user=dbuser, password=dbpassword, dbname=db)
- q <- paste("SELECT date, avg_linux, avg_darwin, avg_bsd, avg_windows, ",
- "avg_other FROM relay_platforms WHERE date >= '", start,
- "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- platforms <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- platforms <- melt(platforms, id = "date")
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$date >= start & s$date <= end & s$flag == '' &
+ s$country == '' & s$version == '' & s$platform != '' &
+ s$ec2bridge == '', ]
+ platforms <- data.frame(date = as.Date(s$date, "%Y-%m-%d"),
+ variable = s$platform, value = s$relays)
date_breaks <- date_breaks(
as.numeric(max(as.Date(platforms$date, "%Y-%m-%d")) -
min(as.Date(platforms$date, "%Y-%m-%d"))))
@@ -463,32 +432,22 @@ plot_platforms <- function(start, end, path) {
scale_y_continuous(name = "",
limits = c(0, max(platforms$value, na.rm = TRUE))) +
scale_colour_manual(name = "Platform",
- breaks = c("avg_linux", "avg_darwin", "avg_bsd", "avg_windows",
- "avg_other"),
- values = c("#E69F00", "#56B4E9", "#009E73", "#0072B2", "#333333"),
- labels = c("Linux", "Darwin", "FreeBSD", "Windows", "Other")) +
+ breaks = c("Linux", "Darwin", "FreeBSD", "Windows", "Other"),
+ values = c("#E69F00", "#56B4E9", "#009E73", "#0072B2", "#333333")) +
opts(title = "Relay platforms\n")
ggsave(filename = path, width = 8, height = 5, dpi = 72)
}
plot_bandwidth <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 4))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, bwadvertised FROM total_bandwidth ",
- "WHERE date >= '", start, "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- bw_desc <- fetch(rs, n = -1)
- q <- paste("SELECT date, read, written FROM total_bwhist ",
- "WHERE date >= '", start, "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- bw_hist <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- bandwidth <- rbind(data.frame(date = bw_desc$date,
- value = bw_desc$bwadvertised, variable = "bwadv"),
- data.frame(date = bw_hist$date, value = (bw_hist$read +
- bw_hist$written) / (2 * 86400), variable = "bwhist"))
+ b <- read.csv("/srv/metrics.torproject.org/web/stats/bandwidth.csv",
+ stringsAsFactors = FALSE)
+ b <- b[b$date >= start & b$date <= end & b$isexit == '' &
+ b$isguard == '', ]
+ b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+ bwadv = b$advbw,
+ bwhist = (b$bwread + b$bwwrite) / 2)
+ bandwidth <- melt(b, id = "date")
date_breaks <- date_breaks(
as.numeric(max(as.Date(bandwidth$date, "%Y-%m-%d")) -
min(as.Date(bandwidth$date, "%Y-%m-%d"))))
@@ -510,15 +469,13 @@ plot_bandwidth <- function(start, end, path) {
plot_bwhist_flags <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 4))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, isexit, isguard, read, written ",
- "FROM bwhist_flags WHERE date >= '", start, "' AND date <= '", end,
- "'", sep = "")
- rs <- dbSendQuery(con, q)
- bw <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ b <- read.csv("/srv/metrics.torproject.org/web/stats/bandwidth.csv",
+ stringsAsFactors = FALSE)
+ b <- b[b$date >= start & b$date <= end & b$isexit != '' &
+ b$isguard != '', ]
+ bw <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+ isexit = b$isexit == 't', isguard = b$isguard == 't',
+ read = b$bwread, written = b$bwwrite)
dates <- seq(from = as.Date(start, "%Y-%m-%d"),
to = as.Date(end, "%Y-%m-%d"), by = "1 day")
missing <- setdiff(dates, as.Date(bw$date, origin = "1970-01-01"))
@@ -539,7 +496,7 @@ plot_bwhist_flags <- function(start, end, path) {
date_breaks <- date_breaks(
as.numeric(max(as.Date(bw$date, "%Y-%m-%d")) -
min(as.Date(bw$date, "%Y-%m-%d"))))
- ggplot(bw, aes(x = as.Date(date, "%Y-%m-%d"), y = value / 2^20 / 86400,
+ ggplot(bw, aes(x = as.Date(date, "%Y-%m-%d"), y = value / 2^20,
colour = variable)) +
geom_line(size = 1) +
scale_x_date(name = paste("\nThe Tor Project - ",
@@ -547,7 +504,7 @@ plot_bwhist_flags <- function(start, end, path) {
format = date_breaks$format, major = date_breaks$major,
minor = date_breaks$minor) +
scale_y_continuous(name="Bandwidth (MiB/s)",
- limits = c(0, max(bw$value, na.rm = TRUE) / 2^20 / 86400)) +
+ limits = c(0, max(bw$value, na.rm = TRUE) / 2^20)) +
scale_colour_manual(name = "",
values = c("#E69F00", "#56B4E9", "#009E73", "#0072B2")) +
opts(title = "Bandwidth history by relay flags",
@@ -557,20 +514,13 @@ plot_bwhist_flags <- function(start, end, path) {
plot_dirbytes <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 4))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, dr, dw, brp, bwp, brd, bwd FROM user_stats ",
- "WHERE country = 'zy' AND bwp / bwd <= 3 AND date >= '", start,
- "' AND date <= '", end, "' ORDER BY date", sep = "")
- rs <- dbSendQuery(con, q)
- dir <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- dir <- data.frame(date = dir$date,
- dirread = floor(dir$dr * dir$brp / dir$brd / 86400),
- dirwrite = floor(dir$dw * dir$bwp / dir$bwd / 86400))
- dir <- na.omit(dir)
- dir <- melt(dir, id = "date")
+ b <- read.csv("/srv/metrics.torproject.org/web/stats/bandwidth.csv",
+ stringsAsFactors = FALSE)
+ b <- b[b$date >= start & b$date <= end & b$isexit == '' &
+ b$isguard == '', ]
+ b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+ dirread = b$dirread, dirwrite = b$dirwrite)
+ dir <- melt(b, id = "date")
date_breaks <- date_breaks(
as.numeric(max(as.Date(dir$date, "%Y-%m-%d")) -
min(as.Date(dir$date, "%Y-%m-%d"))))
@@ -593,20 +543,17 @@ plot_dirbytes <- function(start, end, path) {
plot_relayflags <- function(start, end, flags, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- columns <- paste("avg_", tolower(flags), sep = "", collapse = ", ")
- q <- paste("SELECT date, ", columns, " FROM network_size ",
- "WHERE date >= '", start, "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- networksize <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- networksize <- melt(networksize, id = "date")
+ s <- read.csv("/srv/metrics.torproject.org/web/stats/servers.csv",
+ stringsAsFactors = FALSE)
+ s <- s[s$date >= start & s$date <= end & s$country == '' &
+ s$version == '' & s$platform == '' & s$ec2bridge == '', ]
+ s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"),
+ variable = ifelse(s$flag == '', 'Running', s$flag),
+ value = s$relays)
+ networksize <- s[s$variable %in% flags, ]
networksize <- rbind(data.frame(
date = as.Date(end) + 1,
- variable = paste("avg_", c("running", "exit", "guard", "fast",
- "stable", "hsdir"), sep = ""),
+ variable = c("Running", "Exit", "Guard", "Fast", "Stable", "HSDir"),
value = NA), networksize)
dates <- seq(from = as.Date(start, "%Y-%m-%d"),
to = as.Date(end, "%Y-%m-%d"), by="1 day")
@@ -614,8 +561,7 @@ plot_relayflags <- function(start, end, flags, path) {
if (length(missing) > 0)
networksize <- rbind(data.frame(
date = as.Date(rep(missing, 6), origin = "1970-01-01"),
- variable = paste("avg_", c("running", "exit", "guard", "fast",
- "stable", "hsdir"), sep = ""),
+ variable = c("Running", "Exit", "Guard", "Fast", "Stable", "HSDir"),
value = rep(NA, length(missing) * 6)), networksize)
date_breaks <- date_breaks(
as.numeric(max(as.Date(end, "%Y-%m-%d")) -
@@ -630,23 +576,21 @@ plot_relayflags <- function(start, end, flags, path) {
na.rm = TRUE))) +
scale_colour_manual(name = "Relay flags", values = c("#E69F00",
"#56B4E9", "#009E73", "#EE6A50", "#000000", "#0072B2"),
- breaks = paste("avg_", tolower(flags), sep = ""),
- labels = flags) +
+ breaks = flags, labels = flags) +
opts(title = "Number of relays with relay flags assigned\n")
ggsave(filename = path, width = 8, height = 5, dpi = 72)
}
plot_torperf <- function(start, end, source, filesize, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, q1, md, q3 FROM torperf_stats ",
- "WHERE source = '", paste(source, filesize, sep = "-"),
- "' AND date >= '", start, "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- torperf <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ size <- ifelse(filesize == '50kb', 50 * 1024,
+ ifelse(filesize == '1mb', 1024 * 1024, 5 * 1024 * 1024))
+ t <- read.csv("/srv/metrics.torproject.org/web/stats/torperf.csv",
+ stringsAsFactors = FALSE)
+ t <- t[t$date >= start & t$date <= end & t$size == size &
+ t$source == ifelse(source == 'all', '', source), ]
+ torperf <- data.frame(date = as.Date(t$date, "%Y-%m-%d"),
+ q1 = t$q1, md = t$md, q3 = t$q3)
dates <- seq(from = as.Date(start, "%Y-%m-%d"),
to = as.Date(end, "%Y-%m-%d"), by="1 day")
missing <- setdiff(dates, torperf$date)
@@ -687,16 +631,15 @@ plot_torperf <- function(start, end, source, filesize, path) {
plot_torperf_failures <- function(start, end, source, filesize, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, timeouts, failures, requests ",
- "FROM torperf_stats WHERE source = '",
- paste(source, filesize, sep = "-"),
- "' AND date >= '", start, "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- torperf <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ size <- ifelse(filesize == '50kb', 50 * 1024,
+ ifelse(filesize == '1mb', 1024 * 1024, 5 * 1024 * 1024))
+ t <- read.csv("/srv/metrics.torproject.org/web/stats/torperf.csv",
+ stringsAsFactors = FALSE)
+ t <- t[t$date >= start & t$date <= end & t$size == size &
+ t$source == ifelse(source == 'all', '', source), ]
+ torperf <- data.frame(date = as.Date(t$date, "%Y-%m-%d"),
+ timeouts = t$timeouts, failures = t$failures,
+ requests = t$requests)
dates <- seq(from = as.Date(start, "%Y-%m-%d"),
to = as.Date(end, "%Y-%m-%d"), by="1 day")
missing <- setdiff(dates, torperf$date)
@@ -741,15 +684,11 @@ plot_torperf_failures <- function(start, end, source, filesize, path) {
plot_connbidirect <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 2))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT DATE(statsend) AS date, readnum, writenum, bothnum ",
- "FROM connbidirect WHERE DATE(statsend) >= '", start,
- "' AND DATE(statsend) <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- c <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
+ c <- read.csv("/srv/metrics.torproject.org/web/stats/connbidirect.csv",
+ stringsAsFactors = FALSE)
+ c <- c[c$date >= start & c$date <= end, ]
+ c <- data.frame(date = as.Date(c$date, "%Y-%m-%d"),
+ readnum = c$read, writenum = c$write, bothnum = c$both)
connbidirect <- data.frame(date = c$date, c[, 2:4] /
(c$readnum + c$writenum + c$bothnum))
connbidirect <- melt(connbidirect, id = "date")
@@ -773,18 +712,12 @@ plot_connbidirect <- function(start, end, path) {
}
plot_fast_exits <- function(start, end, path) {
- r <- read.csv(paste("/srv/metrics.torproject.org/task-6498-graphs/",
- "task-6498/task-6498-results.csv", sep = ""),
+ f <- read.csv("/srv/metrics.torproject.org/web/stats/fast-exits.csv",
stringsAsFactors = FALSE)
- r <- r[r$valid_after >= paste(start, "00:00:00") &
- r$valid_after <= paste(end, "23:59:59") &
- r$valid_after < paste(Sys.Date() - 1, "23:59:59"), ]
- r <- r[r$min_rate == 11875 & r$ports == "80-443-554-1755" &
- r$min_advbw == 5000 & r$same_network == TRUE, ]
- r <- aggregate(list(relays = r$relays, P_exit = 100 * r$exit_prob),
- by = list(date = as.Date(cut.Date(as.Date(r$valid_after), "day"))),
- FUN = median)
- r <- melt(r, id.vars = c("date"))
+ f <- f[f$date >= start & f$date <= end, ]
+ f <- data.frame(date = as.Date(f$date, "%Y-%m-%d"),
+ relays = f$fastnum, P_exit = f$fastprob)
+ r <- melt(f, id.vars = c("date"))
r <- data.frame(r, type = ifelse(r$variable == "P_exit",
"Total exit probability (in %)", "Number of relays"))
ggplot(r, aes(x = date, y = value)) +
@@ -801,31 +734,16 @@ plot_fast_exits <- function(start, end, path) {
}
plot_almost_fast_exits <- function(start, end, path) {
- t <- read.csv(paste("/srv/metrics.torproject.org/task-6498-graphs/",
- "task-6498/task-6498-results.csv", sep = ""),
+ f <- read.csv("/srv/metrics.torproject.org/web/stats/fast-exits.csv",
stringsAsFactors = FALSE)
- t <- t[t$valid_after >= paste(start, "00:00:00") &
- t$valid_after <= paste(end, "23:59:59") &
- t$valid_after < paste(Sys.Date() - 1, "23:59:59"), ]
- t1 <- t[t$min_rate == 11875 & t$ports == "80-443-554-1755" &
- t$min_advbw == 5000 & t$same_network == TRUE, ]
- t2 <- t[t$min_rate == 10000 & t$ports == "80-443" &
- t$min_advbw == 2000 & t$same_network == FALSE, ]
- t <- rbind(data.frame(t1, var = "fast"),
- data.frame(t2, var = "almost_fast"))
- r <- cast(t, valid_after ~ var, value = "relays", fun.aggregate = max)
- r <- data.frame(valid_after = r$valid_after, fast = r$fast,
- almost = r$almost_fast - r$fast, var = "relays")
- e <- cast(t, valid_after ~ var, value = "exit_prob",
- fun.aggregate = max)
- e <- data.frame(valid_after = e$valid_after, fast = 100 * e$fast,
- almost = 100 * (e$almost_fast - e$fast), var = "exit_prob")
- t <- rbind(r, e)
- t <- aggregate(list(fast = t$fast, almost = t$almost),
- by = list(date = as.Date(cut.Date(as.Date(t$valid_after), "day")),
- var = ifelse(t$var == "exit_prob", "Total exit probability (in %)",
- "Number of relays")), FUN = median)
- t <- melt(t, id.vars = c("date", "var"))
+ f <- f[f$date >= start & f$date <= end, ]
+ f <- melt(f, id.vars = c("date"))
+ t <- data.frame(date = as.Date(f$date, "%Y-%m-%d"),
+ var = ifelse(f$variable == 'fastnum' | f$variable == 'almostnum',
+ "Number of relays", "Total exit probability (in %)"),
+ variable = ifelse(f$variable == 'fastnum' |
+ f$variable == 'fastprob', "fast", "almost fast"),
+ value = floor(f$value))
t <- data.frame(t, type = ifelse(t$variable == "fast",
"fast exits (95+ Mbit/s, 5000+ KB/s, 80/443/554/1755, 2- per /24",
paste("almost fast exits (80+ Mbit/s, 2000+ KB/s, 80/443,",
@@ -843,29 +761,28 @@ plot_almost_fast_exits <- function(start, end, path) {
plot_bandwidth_flags <- function(start, end, path) {
end <- min(end, as.character(Sys.Date() - 4))
- drv <- dbDriver("PostgreSQL")
- con <- dbConnect(drv, user = dbuser, password = dbpassword, dbname = db)
- q <- paste("SELECT date, isexit, isguard, bwadvertised AS value ",
- "FROM bandwidth_flags WHERE date >= '", start, "' AND date <= '",
- end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- bw_desc <- fetch(rs, n = -1)
- q <- paste("SELECT date, isexit, isguard, ",
- "(read + written) / (2 * 86400) ",
- "AS value FROM bwhist_flags WHERE date >= '", start,
- "' AND date <= '", end, "'", sep = "")
- rs <- dbSendQuery(con, q)
- bw_hist <- fetch(rs, n = -1)
- dbDisconnect(con)
- dbUnloadDriver(drv)
- bandwidth <- rbind(data.frame(bw_desc, type = "advertised bandwidth"),
- data.frame(bw_hist, type = "bandwidth history"))
- bandwidth <- rbind(
- data.frame(bandwidth[bandwidth$isguard == TRUE, ], flag = "Guard"),
- data.frame(bandwidth[bandwidth$isexit == TRUE, ], flag = "Exit"))
- bandwidth <- aggregate(list(value = bandwidth$value),
- by = list(date = bandwidth$date, type = bandwidth$type,
- flag = bandwidth$flag), FUN = sum)
+ b <- read.csv("/srv/metrics.torproject.org/web/stats/bandwidth.csv",
+ stringsAsFactors = FALSE)
+ b <- b[b$date >= start & b$date <= end & b$isexit != '' &
+ b$isguard != '', ]
+ b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+ isexit = b$isexit == 't', isguard = b$isguard == 't',
+ advbw = b$advbw,
+ bwhist = floor((b$bwread + b$bwwrite) / 2))
+ b <- rbind(
+ data.frame(b[b$isguard == TRUE, ], flag = "Guard"),
+ data.frame(b[b$isexit == TRUE, ], flag = "Exit"))
+ b <- data.frame(date = b$date, advbw = b$advbw, bwhist = b$bwhist,
+ flag = b$flag)
+ b <- aggregate(list(advbw = b$advbw, bwhist = b$bwhist),
+ by = list(date = b$date, flag = b$flag), FUN = sum,
+ na.rm = TRUE, na.action = NULL)
+ b <- melt(b, id.vars = c("date", "flag"))
+ b <- data.frame(date = b$date,
+ type = ifelse(b$variable == 'advbw', 'advertised bandwidth',
+ 'bandwidth history'),
+ flag = b$flag, value = b$value)
+ bandwidth <- b[b$value > 0, ]
date_breaks <- date_breaks(
as.numeric(max(as.Date(bandwidth$date, "%Y-%m-%d")) -
min(as.Date(bandwidth$date, "%Y-%m-%d"))))
@@ -908,10 +825,9 @@ plot_bandwidth_flags <- function(start, end, path) {
plot_userstats <- function(start, end, node, variable, value, events,
path) {
end <- min(end, as.character(Sys.Date() - 2))
- u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
- "task-8462/userstats.csv", sep = ""),
+ c <- read.csv("/srv/metrics.torproject.org/web/stats/clients.csv",
stringsAsFactors = FALSE)
- u <- u[u$date >= start & u$date <= end, ]
+ u <- c[c$date >= start & c$date <= end, ]
if (node == 'relay') {
if (value != 'all') {
u <- u[u$country == value & u$node == 'relay', ]
@@ -940,14 +856,15 @@ plot_userstats <- function(start, end, node, variable, value, events,
title <- "Bridge users\n"
}
}
- u <- data.frame(date = as.Date(u$date, "%Y-%m-%d"), users = u$users)
+ u <- data.frame(date = as.Date(u$date, "%Y-%m-%d"), users = u$clients,
+ lower = u$lower, upper = u$upper)
dates <- seq(from = as.Date(start, "%Y-%m-%d"),
to = as.Date(end, "%Y-%m-%d"), by="1 day")
missing <- setdiff(dates, u$date)
if (length(missing) > 0) {
u <- rbind(u,
data.frame(date = as.Date(missing, origin = "1970-01-01"),
- users = NA))
+ users = NA, lower = NA, upper = NA))
}
formatter <- function(x, ...) { format(x, scientific = FALSE, ...) }
date_breaks <- date_breaks(
@@ -957,21 +874,14 @@ plot_userstats <- function(start, end, node, variable, value, events,
plot <- ggplot(u, aes(x = date, y = users))
if (length(na.omit(u$users)) > 0 & events != "off" &
variable == 'country' & value != "all") {
- r <- read.csv(
- "/srv/metrics.torproject.org/web/detector/userstats-ranges.csv",
- stringsAsFactors = FALSE)
- r <- r[r$date >= start & r$date <= end & r$country == value,
- c("date", "minusers", "maxusers")]
- r <- cast(rbind(melt(u, id.vars = "date"), melt(r, id.vars = "date")))
- upturns <- r[r$users > r$maxusers, 1:2]
- downturns <- r[r$users < r$minusers, 1:2]
+ upturns <- u[u$users > u$upper, c("date", "users")]
+ downturns <- u[u$users <= u$lower, c("date", "users")]
if (events == "on") {
- if (length(r$maxusers) > 0)
- max_y <- max(max_y, max(r$maxusers, na.rm = TRUE))
- r[r$minusers < 0, "minusers"] <- 0
+ if (length(u$upper) > 0)
+ max_y <- max(max_y, max(u$upper, na.rm = TRUE))
+ u[u$lower < 0, "lower"] <- 0
plot <- plot +
- geom_ribbon(data = r, aes(ymin = minusers,
- ymax = maxusers), fill = "gray")
+ geom_ribbon(aes(ymin = lower, ymax = upper), fill = "gray")
}
if (length(upturns$date) > 0)
plot <- plot +
diff --git a/rserve/rserve-init.R b/rserve/rserve-init.R
index cb1f7a6..7a87b16 100644
--- a/rserve/rserve-init.R
+++ b/rserve/rserve-init.R
@@ -1,7 +1,5 @@
##Pre-loaded libraries and graphing functions to speed things up
-library("RPostgreSQL")
-library("DBI")
library("ggplot2")
library("proto")
library("grid")
@@ -10,10 +8,6 @@ library("plyr")
library("digest")
library("RColorBrewer")
-db = "tordir"
-dbuser = "metrics"
-dbpassword= ""
-
source('graphs.R')
source('csv.R')
source('tables.R')
diff --git a/rserve/tables.R b/rserve/tables.R
index 24de947..091a4de 100644
--- a/rserve/tables.R
+++ b/rserve/tables.R
@@ -4,12 +4,12 @@ countrynames <- function(countries) {
write_userstats <- function(start, end, node, path) {
end <- min(end, as.character(Sys.Date()))
- u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
- "task-8462/userstats.csv", sep = ""),
+ c <- read.csv("/srv/metrics.torproject.org/web/stats/clients.csv",
stringsAsFactors = FALSE)
- u <- u[u$date >= start & u$date <= end & u$country != '' &
- u$transport == '' & u$version == '' & u$node == node,
- c("country", "users")]
+ c <- c[c$date >= start & c$date <= end & c$country != '' &
+ c$transport == '' & c$version == '' & c$node == node, ]
+ u <- data.frame(country = c$country, users = c$clients,
+ stringsAsFactors = FALSE)
u <- aggregate(list(users = u$users), by = list(country = u$country),
mean)
total <- sum(u$users)
@@ -34,23 +34,13 @@ write_userstats_bridge <- function(start, end, path) {
write_userstats_censorship_events <- function(start, end, path) {
end <- min(end, as.character(Sys.Date()))
- u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
- "task-8462/userstats.csv", sep = ""),
+ c <- read.csv("/srv/metrics.torproject.org/web/stats/clients.csv",
stringsAsFactors = FALSE)
- u <- u[u$date >= start & u$date <= end & u$country != '' &
- u$transport == '' & u$version == '' & u$node == 'relay',
- c("date", "country", "users")]
- r <- read.csv(
- "/srv/metrics.torproject.org/web/detector/userstats-ranges.csv",
- stringsAsFactors = FALSE)
- r <- r[r$date >= start & r$date <= end,
- c("date", "country", "minusers", "maxusers")]
- r <- cast(rbind(melt(u, id.vars = c("date", "country")),
- melt(r, id.vars = c("date", "country"))))
- r <- na.omit(r[r$users < r$minusers | r$users > r$maxusers, ])
- r <- data.frame(date = r$date, country = r$country,
- upturn = ifelse(r$users > r$maxusers, 1, 0),
- downturn = ifelse(r$users < r$minusers, 1, 0))
+ c <- c[c$date >= start & c$date <= end & c$country != '' &
+ c$transport == '' & c$version == '' & c$node == 'relay', ]
+ r <- data.frame(date = c$date, country = c$country,
+ upturn = ifelse(c$clients > c$upper, 1, 0),
+ downturn = ifelse(c$clients <= c$lower, 1, 0))
r <- aggregate(r[, c("upturn", "downturn")],
by = list(country = r$country), sum)
r <- r[!(r$country %in% c("zy", "??", "a1", "a2", "o1", "ap", "eu")), ]
diff --git a/src/org/torproject/ernie/web/research/ResearchStatsServlet.java b/src/org/torproject/ernie/web/research/ResearchStatsServlet.java
new file mode 100644
index 0000000..ab1c231
--- /dev/null
+++ b/src/org/torproject/ernie/web/research/ResearchStatsServlet.java
@@ -0,0 +1,132 @@
+/* Copyright 2013 The Tor Project
+ * See LICENSE for licensing information */
+package org.torproject.ernie.web.research;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+public class ResearchStatsServlet extends HttpServlet {
+
+ private static final long serialVersionUID = 3346710354297653810L;
+
+ private File statsDir;
+
+ private SortedSet<String> availableStatisticsFiles;
+
+ public void init(ServletConfig config) throws ServletException {
+ super.init(config);
+ this.statsDir = new File(config.getInitParameter("statsDir"));
+ this.availableStatisticsFiles = new TreeSet<String>();
+ this.availableStatisticsFiles.add("servers");
+ this.availableStatisticsFiles.add("bandwidth");
+ this.availableStatisticsFiles.add("fast-exits");
+ this.availableStatisticsFiles.add("clients");
+ this.availableStatisticsFiles.add("torperf");
+ this.availableStatisticsFiles.add("connbidirect");
+ }
+
+ public long getLastModified(HttpServletRequest request) {
+ File statsFile = this.determineStatsFile(request);
+ if (statsFile == null || !statsFile.exists()) {
+ return 0L;
+ } else {
+ return statsFile.lastModified();
+ }
+ }
+
+ public void doGet(HttpServletRequest request,
+ HttpServletResponse response) throws IOException, ServletException {
+ String requestURI = request.getRequestURI();
+ if (requestURI.equals("/ernie/stats/")) {
+ this.writeDirectoryListing(request, response);
+ } else if (requestURI.equals("/ernie/stats.html")) {
+ this.writeStatisticsPage(request, response);
+ } else {
+ File statsFile = this.determineStatsFile(request);
+ if (statsFile == null) {
+ response.sendError(HttpServletResponse.SC_NOT_FOUND);
+ return;
+ } else if (!this.writeStatsFile(statsFile, response)) {
+ response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
+ }
+ }
+ }
+
+ private void writeDirectoryListing(HttpServletRequest request,
+ HttpServletResponse response) throws IOException, ServletException {
+ request.setAttribute("directory", "/stats");
+ request.setAttribute("extension", ".csv");
+ request.setAttribute("files", this.availableStatisticsFiles);
+ request.getRequestDispatcher("/WEB-INF/dir.jsp").forward(request,
+ response);
+ }
+
+ private void writeStatisticsPage(HttpServletRequest request,
+ HttpServletResponse response) throws IOException, ServletException {
+ request.getRequestDispatcher("/WEB-INF/stats.jsp").forward(request,
+ response);
+ }
+
+ private File determineStatsFile(HttpServletRequest request) {
+ String requestedStatsFile = request.getRequestURI();
+ if (requestedStatsFile.equals("/ernie/stats/") ||
+ requestedStatsFile.equals("/ernie/stats.html")) {
+ return null;
+ }
+ if (requestedStatsFile.endsWith(".csv")) {
+ requestedStatsFile = requestedStatsFile.substring(0,
+ requestedStatsFile.length() - ".csv".length());
+ }
+ if (requestedStatsFile.contains("/")) {
+ requestedStatsFile = requestedStatsFile.substring(
+ requestedStatsFile.lastIndexOf("/") + 1);
+ }
+ if (!availableStatisticsFiles.contains(requestedStatsFile)) {
+ return null;
+ } else {
+ return new File(this.statsDir, requestedStatsFile + ".csv");
+ }
+ }
+
+ private boolean writeStatsFile(File statsFile,
+ HttpServletResponse response) throws IOException, ServletException {
+ if (!statsFile.exists()) {
+ return false;
+ }
+ byte[] statsFileBytes;
+ try {
+ BufferedInputStream bis = new BufferedInputStream(
+ new FileInputStream(statsFile), 1024);
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ byte[] buffer = new byte[1024];
+ int length;
+ while ((length = bis.read(buffer)) > 0) {
+ baos.write(buffer, 0, length);
+ }
+ bis.close();
+ statsFileBytes = baos.toByteArray();
+ } catch (IOException e) {
+ return false;
+ }
+ String statsFileContent = new String(statsFileBytes);
+ response.setContentType("text/csv");
+ response.setHeader("Content-Length", String.valueOf(
+ statsFileContent.length()));
+ response.setHeader("Content-Disposition",
+ "inline; filename=\"" + statsFile.getName() + "\"");
+ response.getWriter().print(statsFileContent);
+ return true;
+ }
+}
+
diff --git a/web/WEB-INF/banner.jsp b/web/WEB-INF/banner.jsp
index 8c3c33a..6bff272 100644
--- a/web/WEB-INF/banner.jsp
+++ b/web/WEB-INF/banner.jsp
@@ -57,7 +57,8 @@
<%} else if (currentPage.endsWith("research.jsp") ||
currentPage.endsWith("data.jsp") ||
currentPage.endsWith("formats.jsp") ||
- currentPage.endsWith("tools.jsp")) {
+ currentPage.endsWith("tools.jsp") ||
+ currentPage.endsWith("stats.jsp")) {
%><br>
<font size="2">
<a <%if (currentPage.endsWith("data.jsp")) {
@@ -69,6 +70,9 @@
<a <%if (currentPage.endsWith("tools.jsp")) {
%>class="current"<%} else {%> href="/tools.html"<%}
%>>Tools</a>
+ <a <%if (currentPage.endsWith("stats.jsp")) {
+ %>class="current"<%} else {%> href="/stats.html"<%}
+ %>>Statistics</a>
</font>
<%}%>
</td>
diff --git a/web/WEB-INF/error.jsp b/web/WEB-INF/error.jsp
index a010309..9c5150e 100644
--- a/web/WEB-INF/error.jsp
+++ b/web/WEB-INF/error.jsp
@@ -54,6 +54,7 @@ Maybe you find what you're looking for on our sitemap:
<li><a href="data.html">Data</a></li>
<li><a href="formats.html">Formats</a></li>
<li><a href="tools.html">Tools</a></li>
+<li><a href="stats.html">Statistics</a></li>
</ul></li>
<li><a href="status.html">Status</a>
<ul>
diff --git a/web/WEB-INF/stats.jsp b/web/WEB-INF/stats.jsp
new file mode 100644
index 0000000..eac4b57
--- /dev/null
+++ b/web/WEB-INF/stats.jsp
@@ -0,0 +1,288 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<html>
+<head>
+ <title>Tor Metrics Portal: Statistics</title>
+ <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
+ <link href="/css/stylesheet-ltr.css" type="text/css" rel="stylesheet">
+ <link href="/images/favicon.ico" type="image/x-icon" rel="shortcut icon">
+</head>
+<body>
+ <div class="center">
+ <%@ include file="banner.jsp"%>
+ <div class="main-column">
+<h2>Tor Metrics Portal: Statistics</h2>
+<br>
+
+<p>The metrics portal aggregates large amounts of Tor network
+<a href="data.html">data</a> and visualizes results in customizable
+<a href="graphs.html">graphs</a> and tables.
+All aggregated data are also available for download, so that people can
+easily plot their own graphs or even develop a prettier metrics website
+without writing their own data aggregation code.
+Data formats of aggregate statistics are specified below.</p>
+
+<h3>Number of relays and bridges</h3>
+
+<p>Statistics file <a href="stats/servers.csv">servers.csv</a> contains
+the average number of relays and bridges in the Tor network.
+All averages are calculated per day by evaluating the relay and bridge
+lists published by the directory authorities.
+Statistics include subsets of relays or bridges by relay flag (only
+relays), country code (only relays, only until February 2013), Tor
+software version (only relays), operating system (only relays), and EC2
+cloud (only bridges).
+The statistics file contains the following columns:</p>
+
+<ul>
+<li><b>date:</b> UTC date (YYYY-MM-DD) when relays or bridges have been
+listed as running.</li>
+<li><b>flag:</b> Relay flag assigned by the directory authorities.
+Examples are <b>"Exit"</b>, <b>"Guard"</b>, <b>"Fast"</b>,
+<b>"Stable"</b>, and <b>"HSDir"</b>.
+Relays can have none, some, or all these relay flags assigned.
+Relays that don't have the <b>"Running"</b> flag are not included in these
+statistics regardless of their other flags.
+If this column contains the empty string, all running relays are included,
+regardless of assigned flags.
+There are no statistics on the number of bridges by relay flag.</li>
+<li><b>country:</b> Two-letter lower-case country code as found in a GeoIP
+database by resolving the relay's first onion-routing IP address, or
+<b>"??"</b> if the IP address could not be resolved.
+If this column contains the empty string, all running relays are included,
+regardless of their resolved country code.
+Statistics on relays by country code are only available until January 31,
+2013.
+There are no statistics on the number of bridges by country code.</li>
+<li><b>version:</b> First three dotted numbers of the Tor software version
+as reported by the relay.
+An example is <b>"0.2.5"</b>.
+If this column contains the empty string, all running relays are included,
+regardless of the Tor software version they run.
+There are no statistics on the number of bridges by Tor software
+version.</li>
+<li><b>platform:</b> Operating system as reported by the relay.
+Examples are <b>"Linux"</b>, <b>"Darwin"</b> (Mac OS X), <b>"FreeBSD"</b>,
+<b>"Windows"</b>, and <b>"Other"</b>.
+If this column contains the empty string, all running relays are included,
+regardless of the operating system they run on.
+There are no statistics on the number of bridges by operating system.</li>
+<li><b>ec2bridge:</b> Whether bridges are running in the EC2 cloud or not.
+More precisely, bridges in the EC2 cloud running an image provided by Tor
+by default set their nickname to <b>"ec2bridger"</b> plus 8 random hex
+characters.
+This column either contains <b>"t"</b> for bridges matching this naming
+scheme, or the empty string for all bridges regardless of their nickname.
+There are no statistics on the number of relays running in the EC2
+cloud.</li>
+<li><b>relays:</b> The average number of relays matching the criteria in
+the previous columns.
+If the values in previous columns are specific to bridges only, this
+column contains the empty string.</li>
+<li><b>bridges:</b> The average number of bridges matching the criteria in
+the previous columns.
+If the values in previous columns are specific to relays only, this column
+contains the empty string.</li>
+</ul>
+
+<h3>Bandwidth provided and consumed by relays</h3>
+
+Statistics on bandwidth provided and consumed by relays are contained in
+file <a href="stats/bandwidth.csv">bandwidth.csv</a>.
+This file contains three different bandwidth metrics:
+(1) bandwidth that relays are capable of providing, and bandwidth that
+relays report to have consumed, either (2) for any traffic, or (3) only
+for traffic from serving directory data.
+Relays providing bandwidth statistics are categorized by whether they have
+the <b>"Exit"</b> relay flag, the <b>"Guard"</b> relay flag, both, or
+neither.
+The statistics file contains the following columns:
+
+<ul>
+<li><b>date:</b> UTC date (YYYY-MM-DD) that relays reported bandwidth data
+for.</li>
+<li><b>isexit:</b> Whether relays included in this line have the
+<b>"Exit"</b> relay flag or not, which can be <b>"t"</b> or <b>"f"</b>.
+If this column contains the empty string, bandwidth data from all running
+relays are included, regardless of assigned relay flags.</li>
+<li><b>isguard:</b> Whether relays included in this line have the
+<b>"Guard"</b> relay flag or not, which can be <b>"t"</b> or <b>"f"</b>.
+If this column contains the empty string, bandwidth data from all running
+relays are included, regardless of assigned relay flags.</li>
+<li><b>advbw:</b> Total advertised bandwidth in bytes per second that
+relays are capable of providing.</li>
+<li><b>bwread:</b> Total bandwidth in bytes per second that relays have
+read.
+This metric includes any kind of traffic.</li>
+<li><b>bwwrite:</b> Similar to <b>bwread</b>, but for traffic written by
+relays.</li>
+<li><b>dirread:</b> Bandwidth in bytes per second that relays have read
+when serving directory data.
+Not all relays report how many bytes they read when serving directory
+data, which is why this value is an estimate from the available data.
+This metric is not available for subsets of relays with certain relay
+flags, so this column will contain the empty string if either
+<b>isexit</b> or <b>isguard</b> is non-empty.</li>
+<li><b>dirwrite:</b> Similar to <b>dirread</b>, but for traffic written by
+relays when serving directory data.</li>
+</ul>
+
+<h3>Relays meeting or almost meeting fast-exit requirements</h3>
+
+Statistics file <a href="stats/fast-exits.csv">fast-exits.csv</a> contains
+the number of relays meeting or almost meeting fast-exit requirements.
+These requirements originate from a Tor sponsor contract and are defined as
+follows:
+a Tor relay is fast if it has at least 95 Mbit/s configured bandwidth
+rate, at least 5000 KB/s advertised bandwidth capacity, and permits
+exiting to ports 80, 443, 554, and 1755; furthermore, there may be at most
+2 relays per /24 network in the set of fast exits.
+Similarly, an almost fast exit is one that almost meets the fast-exit
+requirements, but fails at least one of them.
+In particular, an almost fast exit is one that has at least 80 Mbit/s
+configured bandwidth rate, at least 2000 KB/s advertised bandwidth
+capacity, and permits exiting to ports 80 and 443; also, if there are more
+than 2 relays per /24 network meeting fast-exit requirements, all but two
+are considered almost fast.
+The statistics file contains the following columns:
+
+<ul>
+<li><b>date:</b> UTC date (YYYY-MM-DD) when relays have been listed as
+running.</li>
+<li><b>fastnum:</b> Average number of relays matching fast-exit
+requirements.</li>
+<li><b>almostnum:</b> Average number of relays almost matching
+fast-exit requirements.</li>
+<li><b>fastprob:</b> Total exit probability of all relays matching
+fast-exit requirements.</li>
+<li><b>almostprob:</b> Total exit probability of all relays almost
+matching fast-exit requirements.</li>
+</ul>
+
+<h3>Estimated number of clients in the Tor network</h3>
+
+Statistics file <a href="stats/clients.csv">clients.csv</a> contains
+estimates on the number of clients in the Tor network.
+These estimates are based on the number of directory requests counted on
+directory mirrors and bridges.
+Statistics are available for clients connecting directly to the Tor
+network and clients connecting via bridges.
+For relays, there exist statistics on the number of clients by country,
+and for bridges, statistics are available by country, by transport, and by
+IP version.
+Statistics further include expected client numbers from past observations
+which can be used to detect censorship or release of censorship.
+The statistics file contains the following columns:
+
+<ul>
+<li><b>date:</b> UTC date (YYYY-MM-DD) for which client numbers are
+estimated.</li>
+<li><b>node:</b> The node type to which clients connect first, which can
+be either <b>"relay"</b> or <b>"bridge"</b>.</li>
+<li><b>country:</b> Two-letter lower-case country code as found in a GeoIP
+database by resolving clients' IP addresses, or <b>"??"</b> if client IP
+addresses could not be resolved.
+If this column contains the empty string, all clients are included,
+regardless of their country code.</li>
+<li><b>transport:</b> Transport name used by clients to connect to the Tor
+network using bridges.
+Examples are <b>"obfs2"</b>, <b>"obfs3"</b>, <b>"websocket"</b>, or
+<b>"&lt;OR&gt;"</b> (original onion routing protocol).
+If this column contains the empty string, all clients are included,
+regardless of their transport.
+There are no statistics on the number of clients by transport that connect
+to the Tor network via relays.</li>
+<li><b>version:</b> IP version used by clients to connect to the Tor
+network using bridges.
+Examples are <b>"v4"</b> and <b>"v6"</b>.
+If this column contains the empty string, all clients are included,
+regardless of their IP version.
+There are no statistics on the number of clients by IP version that connect
+directly to the Tor network using relays.</li>
+<li><b>lower:</b> Lower number of expected clients under the assumption
+that there has been no censorship event.
+If this column contains the empty string, there are no expectations on the
+number of clients.</li>
+<li><b>upper:</b> Upper number of expected clients under the assumption
+that there has been no release of censorship.
+If this column contains the empty string, there are no expectations on the
+number of clients.</li>
+<li><b>clients:</b> Estimated number of clients.</li>
+<li><b>frac:</b> Fraction of relays or bridges in percent that the
+estimate is based on.
+The higher this value, the more reliable is the estimate.
+Values above 50 can be considered reliable enough for most purposes,
+lower values should be handled with more care.</li>
+</ul>
+
+<h3>Performance of downloading static files over Tor</h3>
+
+Statistics file <a href="stats/torperf.csv">torperf.csv</a> contains
+aggregate statistics on download performance over time.
+These statistics come from the Torperf service that periodically downloads
+static files over Tor.
+The statistics file contains the following columns:
+
+<ul>
+<li><b>date:</b> UTC date (YYYY-MM-DD) when download performance was
+measured.</li>
+<li><b>size:</b> Size of the downloaded file in bytes.</li>
+<li><b>source:</b> Name of the Torperf service performing measurements.
+If this column contains the empty string, all measurements are included,
+regardless of which Torperf service performed them.
+Examples are <b>"moria"</b>, <b>"siv"</b>, and <b>"torperf"</b>.</li>
+<li><b>q1:</b> First quartile of time until receiving the last byte in
+milliseconds.</li>
+<li><b>md:</b> Median of time until receiving the last byte in
+milliseconds.</li>
+<li><b>q3:</b> Third quartile of time until receiving the last byte in
+milliseconds.</li>
+<li><b>timeouts:</b> Number of timeouts that occurred when attempting to
+download the static file over Tor.</li>
+<li><b>failures:</b> Number of failures that occurred when attempting to
+download the static file over Tor.</li>
+<li><b>requests:</b> Total number of requests made to download the static
+file over Tor.</li>
+</ul>
+
+<h3>Fraction of connections used uni-/bidirectionally</h3>
+
+Statistics file <a href="stats/connbidirect.csv">connbidirect.csv</a>
+contains statistics on the fraction of connections that is used uni- or
+bidirectionally.
+Every 10 seconds, relays determine for every connection whether they read
+and wrote less than a threshold of 20 KiB.
+For the remaining connections, relays report whether they read/wrote at
+least 10 times as many bytes as they wrote/read.
+If so, they classify a connection as "mostly reading" or "mostly writing,"
+respectively.
+All other connections are classified as "both reading and writing."
+After classifying connections, read and write counters are reset for the
+next 10-second interval.
+Statistics are aggregated over 24 hours.
+The statistics file contains the following columns:
+
+<ul>
+<li><b>date:</b> UTC date (YYYY-MM-DD) for which statistics on
+uni-/bidirectional connection usage were reported.</li>
+<li><b>source:</b> Fingerprint of the relay reporting statistics.</li>
+<li><b>below:</b> Number of 10-second intervals of connections with less
+than 20 KiB read and written data.</li>
+<li><b>read:</b> Number of 10-second intervals of connections with 10
+times as many read bytes as written bytes.</li>
+<li><b>write:</b> Number of 10-second intervals of connections with 10
+times as many written bytes as read bytes.</li>
+<li><b>both:</b> Number of 10-second intervals of connections with less
+than 10 times as many written or read bytes as in the other
+direction.</li>
+</ul>
+ </div>
+ </div>
+ <div class="bottom" id="bottom">
+ <%@ include file="footer.jsp"%>
+ </div>
+</body>
+</html>
+
More information about the tor-commits
mailing list