[tor-commits] [metrics-web/release] Use readr to speed up drawing graphs.

karsten at torproject.org karsten at torproject.org
Sat Nov 9 21:45:06 UTC 2019


commit 2c44721c9ab903183558b92d7a4e17674fcb79be
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Dec 17 21:03:16 2018 +0100

    Use readr to speed up drawing graphs.
    
    Over two years ago, in commit 1f90b72 from October 2016, we made our
    user graphs faster by no longer reading the large .csv file on demand.
    Instead we read it once as part of the daily update, saved it to disk
    as an .RData file using R's save() function, and loaded it back into
    memory using R's load() function when drawing a graph.
    
    This approach worked okay. It just had two disadvantages:
    
     1. We had to write a small amount of R code for each graph type,
        which is why we only did it for graphs with large .csv files.
     2. Running these small R scripts as part of the daily update made it
        harder to move away from Ant towards a Java-only execution model.
    
    The new approach implemented in this commit uses read_csv() from the
    readr package which reads CSV files several times faster than
    read.csv().
    
    Requires installing the readr package from CRAN, which is available on
    Debian in stretch-backports and later as r-cran-readr.
    
    Implements #28799.
---
 build.xml                          |  14 ---
 src/main/R/clients/split-clients.R |  12 ---
 src/main/R/rserver/graphs.R        | 169 +++++++++++++++++++++++++++++--------
 src/main/R/rserver/rserve-init.R   |   1 +
 src/main/R/webstats/write-RData.R  |  16 ----
 5 files changed, 136 insertions(+), 76 deletions(-)

diff --git a/build.xml b/build.xml
index 89c8b31..250417e 100644
--- a/build.xml
+++ b/build.xml
@@ -362,8 +362,6 @@
     <property name="module.name" value="clients" />
     <property name="localmoddir" value="${modulebase}/${module.name}" />
 
-    <property name="rdatadir" value="${localmoddir}/RData" />
-    <mkdir dir="${rdatadir}" />
     <property name="statsdir"
               value="${localmoddir}/stats" />
     <mkdir dir="${statsdir}" />
@@ -410,10 +408,6 @@
 
     <copy file="${localmoddir}/clients.csv" todir="${statsdir}" />
     <copy file="${localmoddir}/userstats-combined.csv" todir="${statsdir}" />
-
-    <antcall target="run-R" >
-      <param name="module.Rscript" value="split-clients.R" />
-    </antcall>
   </target>
 
   <target name="servers" >
@@ -426,13 +420,7 @@
 
   <target name="webstats" >
     <property name="module.name" value="webstats" />
-    <property name="rdatadir" value="${modulebase}/${module.name}/RData" />
-    <mkdir dir="${rdatadir}" />
-
     <antcall target="run-java" />
-    <antcall target="run-R" >
-      <param name="module.Rscript" value="write-RData.R" />
-    </antcall>
   </target>
 
   <target name="totalcw" >
@@ -482,8 +470,6 @@
       <fileset dir="${modulebase}/totalcw/stats" includes="totalcw.csv" />
     </copy>
     <copy todir="${rdatadir}" >
-      <fileset dir="${modulebase}/clients/RData" includes="*.RData" />
-      <fileset dir="${modulebase}/webstats/RData" includes="*.RData" />
       <fileset dir="${resources}/web/images/" includes="no-data-available.*" />
     </copy>
   </target>
diff --git a/src/main/R/clients/split-clients.R b/src/main/R/clients/split-clients.R
deleted file mode 100644
index 9f80902..0000000
--- a/src/main/R/clients/split-clients.R
+++ /dev/null
@@ -1,12 +0,0 @@
-dir.create("RData", showWarnings = FALSE)
-
-c <- read.csv("clients.csv", stringsAsFactors = FALSE)
-data <- c[c$node == 'relay', !(names(c) %in% c("node"))]
-save(data, file = "RData/clients-relay.RData")
-data <- c[c$node == 'bridge', !(names(c) %in% c("node"))]
-save(data, file = "RData/clients-bridge.RData")
-
-u <- read.csv("userstats-combined.csv", stringsAsFactors = FALSE)
-data <- u[, !(names(u) %in% c("node", "version"))]
-save(data, file = "RData/userstats-bridge-combined.RData")
-
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index 7501a95..e541c30 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -348,6 +348,9 @@ robust_call <- function(wrappee, filename) {
        })
 }
 
+# Disable readr's automatic progress bar.
+options(readr.show_progress = FALSE)
+
 prepare_networksize <- function(start_p, end_p) {
   read.csv(paste(stats_dir, "networksize.csv", sep = ""),
     colClasses = c("date" = "Date")) %>%
@@ -863,8 +866,19 @@ write_bandwidth_flags <- function(start_p = NULL, end_p = NULL, path_p) {
 
 plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p,
     events_p, path_p) {
-  load(paste(rdata_dir, "clients-", node_p, ".RData", sep = ""))
-  c <- data
+  c <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        node = col_character(),
+        country = col_character(),
+        transport = col_character(),
+        version = col_character(),
+        lower = col_double(),
+        upper = col_double(),
+        clients = col_double(),
+        frac = col_skip()),
+      na = character()) %>%
+    filter(node == node_p)
   u <- c[c$date >= start_p & c$date <= end_p, c("date", "country", "transport",
       "version", "lower", "upper", "clients")]
   u <- rbind(u, data.frame(date = start_p,
@@ -1011,14 +1025,24 @@ plot_userstats_bridge_version <- function(start_p, end_p, version_p, path_p) {
 
 write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
     country_p = NULL, events_p = NULL, path_p) {
-  load(paste(rdata_dir, "clients-relay.RData", sep = ""))
-  u <- data %>%
+  read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        node = col_character(),
+        country = col_character(),
+        transport = col_character(),
+        version = col_character(),
+        lower = col_double(),
+        upper = col_double(),
+        clients = col_double(),
+        frac = col_double())) %>%
+    filter(node == "relay") %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(country_p))
       country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
-    filter(transport == "") %>%
-    filter(version == "") %>%
+    filter(is.na(transport)) %>%
+    filter(is.na(version)) %>%
     select(date, country, clients, lower, upper, frac) %>%
     rename(users = clients) %>%
     write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
@@ -1026,14 +1050,24 @@ write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
 
 write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
     country_p = NULL, path_p) {
-  load(paste(rdata_dir, "clients-bridge.RData", sep = ""))
-  data %>%
+  read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        node = col_character(),
+        country = col_character(),
+        transport = col_character(),
+        version = col_character(),
+        lower = col_double(),
+        upper = col_double(),
+        clients = col_double(),
+        frac = col_double())) %>%
+    filter(node == "bridge") %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(country_p))
       country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
-    filter(transport == "") %>%
-    filter(version == "") %>%
+    filter(is.na(transport)) %>%
+    filter(is.na(version)) %>%
     select(date, country, clients, frac) %>%
     rename(users = clients) %>%
     write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
@@ -1041,13 +1075,23 @@ write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
 
 write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
     transport_p = NULL, path_p) {
-  load(paste(rdata_dir, "clients-bridge.RData", sep = ""))
-  u <- data %>%
+  u <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        node = col_character(),
+        country = col_character(),
+        transport = col_character(),
+        version = col_character(),
+        lower = col_double(),
+        upper = col_double(),
+        clients = col_double(),
+        frac = col_double())) %>%
+    filter(node == "bridge") %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
-    filter(country == "") %>%
-    filter(version == "") %>%
-    filter(transport != "") %>%
+    filter(is.na(country)) %>%
+    filter(is.na(version)) %>%
+    filter(!is.na(transport)) %>%
     select(date, transport, clients, frac)
   if (is.null(transport_p) || "!<OR>" %in% transport_p) {
     n <- u %>%
@@ -1068,12 +1112,22 @@ write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
 
 write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
     version_p = NULL, path_p) {
-  load(paste(rdata_dir, "clients-bridge.RData", sep = ""))
-  data %>%
+  read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        node = col_character(),
+        country = col_character(),
+        transport = col_character(),
+        version = col_character(),
+        lower = col_double(),
+        upper = col_double(),
+        clients = col_double(),
+        frac = col_double())) %>%
+    filter(node == "bridge") %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
-    filter(country == "") %>%
-    filter(transport == "") %>%
+    filter(is.na(country)) %>%
+    filter(is.na(transport)) %>%
     filter(if (!is.null(version_p)) version == version_p else TRUE) %>%
     select(date, version, clients, frac) %>%
     rename(users = clients) %>%
@@ -1081,8 +1135,16 @@ write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
 }
 
 prepare_userstats_bridge_combined <- function(start_p, end_p, country_p) {
-  load(paste(rdata_dir, "userstats-bridge-combined.RData", sep = ""))
-  data %>%
+  read_csv(file = paste(stats_dir, "userstats-combined.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        node = col_skip(),
+        country = col_character(),
+        transport = col_character(),
+        version = col_skip(),
+        frac = col_double(),
+        low = col_double(),
+        high = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(country_p)) country == country_p else TRUE)
@@ -1135,7 +1197,7 @@ prepare_advbwdist_perc <- function(start_p, end_p, p_p) {
     filter(if (!is.null(p_p)) percentile %in% as.numeric(p_p) else
       percentile != "") %>%
     transmute(date, percentile = as.factor(percentile),
-      variable = ifelse(isexit != "t", "all", "exits"),
+      variable = ifelse(is.na(isexit), "all", "exits"),
       advbw = advbw * 8 / 1e9)
 }
 
@@ -1258,11 +1320,20 @@ write_hidserv_rend_relayed_cells <- function(start_p = NULL, end_p = NULL,
 }
 
 prepare_webstats_tb <- function(start_p, end_p) {
-  load(paste(rdata_dir, "webstats-tb.RData", sep = ""))
-  data %>%
+  read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
+      col_types = cols(
+        log_date = col_date(format = ""),
+        request_type = col_factor(),
+        platform = col_skip(),
+        channel = col_skip(),
+        locale = col_skip(),
+        incremental = col_skip(),
+        count = col_double())) %>%
     filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
-    mutate(request_type = factor(request_type))
+    filter(request_type %in% c("tbid", "tbsd", "tbup", "tbur")) %>%
+    group_by(log_date, request_type) %>%
+    summarize(count = sum(count))
 }
 
 plot_webstats_tb <- function(start_p, end_p, path_p) {
@@ -1296,8 +1367,15 @@ write_webstats_tb <- function(start_p = NULL, end_p = NULL, path_p) {
 }
 
 prepare_webstats_tb_platform <- function(start_p, end_p) {
-  read.csv(paste(stats_dir, "webstats.csv", sep = ""),
-    colClasses = c("log_date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
+      col_types = cols(
+        log_date = col_date(format = ""),
+        request_type = col_factor(),
+        platform = col_factor(),
+        channel = col_skip(),
+        locale = col_skip(),
+        incremental = col_skip(),
+        count = col_double())) %>%
     filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
     filter(request_type %in% c("tbid", "tbup")) %>%
@@ -1337,8 +1415,15 @@ write_webstats_tb_platform <- function(start_p = NULL, end_p = NULL, path_p) {
 }
 
 plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
-  d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""),
-    colClasses = c("log_date" = "Date", "locale" = "character"))
+  d <- read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
+      col_types = cols(
+        log_date = col_date(format = ""),
+        request_type = col_factor(),
+        platform = col_skip(),
+        channel = col_skip(),
+        locale = col_factor(),
+        incremental = col_skip(),
+        count = col_double()))
   d <- d[d$log_date >= start_p & d$log_date <= end_p &
          d$request_type %in% c("tbid", "tbup"), ]
   levels(d$request_type) <- list(
@@ -1375,8 +1460,15 @@ plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
 # plot_webstats_tb_locale needs the preliminary data frame e for its
 # breaks and labels. Left as future work.
 write_webstats_tb_locale <- function(start_p = NULL, end_p = NULL, path_p) {
-  read.csv(paste(stats_dir, "webstats.csv", sep = ""),
-    colClasses = c("log_date" = "Date", "locale" = "character")) %>%
+  read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
+      col_types = cols(
+        log_date = col_date(format = ""),
+        request_type = col_factor(),
+        platform = col_skip(),
+        channel = col_skip(),
+        locale = col_factor(),
+        incremental = col_skip(),
+        count = col_double())) %>%
     filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
     filter(request_type %in% c("tbid", "tbup")) %>%
@@ -1390,11 +1482,20 @@ write_webstats_tb_locale <- function(start_p = NULL, end_p = NULL, path_p) {
 }
 
 prepare_webstats_tm <- function(start_p, end_p) {
-  load(paste(rdata_dir, "webstats-tm.RData", sep = ""))
-  data %>%
+  read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
+      col_types = cols(
+        log_date = col_date(format = ""),
+        request_type = col_factor(),
+        platform = col_skip(),
+        channel = col_skip(),
+        locale = col_skip(),
+        incremental = col_skip(),
+        count = col_double())) %>%
     filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
-    mutate(request_type = factor(request_type))
+    filter(request_type %in% c("tmid", "tmup")) %>%
+    group_by(log_date, request_type) %>%
+    summarize(count = sum(count))
 }
 
 plot_webstats_tm <- function(start_p, end_p, path_p) {
diff --git a/src/main/R/rserver/rserve-init.R b/src/main/R/rserver/rserve-init.R
index b9a1d3b..f160698 100644
--- a/src/main/R/rserver/rserve-init.R
+++ b/src/main/R/rserver/rserve-init.R
@@ -5,6 +5,7 @@ library("RColorBrewer")
 library("scales")
 library(dplyr)
 library(tidyr)
+library(readr)
 
 source('graphs.R')
 source('tables.R')
diff --git a/src/main/R/webstats/write-RData.R b/src/main/R/webstats/write-RData.R
deleted file mode 100644
index 96cc840..0000000
--- a/src/main/R/webstats/write-RData.R
+++ /dev/null
@@ -1,16 +0,0 @@
-dir.create("RData", showWarnings = FALSE)
-
-d <- read.csv("stats/webstats.csv", stringsAsFactors = FALSE)
-d <- d[d$request_type %in% c('tbid', 'tbsd', 'tbup', 'tbur'), ]
-data <- aggregate(list(count = d$count),
-    by = list(log_date = as.Date(d$log_date), request_type = d$request_type),
-    FUN = sum)
-save(data, file = "RData/webstats-tb.RData")
-
-d <- read.csv("stats/webstats.csv", stringsAsFactors = FALSE)
-d <- d[d$request_type %in% c('tmid', 'tmup'), ]
-data <- aggregate(list(count = d$count),
-    by = list(log_date = as.Date(d$log_date), request_type = d$request_type),
-    FUN = sum)
-save(data, file = "RData/webstats-tm.RData")
-





More information about the tor-commits mailing list