[tor-commits] [onionoo/master] Only unescape valid UTF.

karsten at torproject.org karsten at torproject.org
Tue Mar 27 14:22:13 UTC 2018


commit 532ef3479a576733934edb45f588f2a074061f62
Author: iwakeh <iwakeh at torproject.org>
Date:   Thu Mar 15 13:58:17 2018 +0000

    Only unescape valid UTF.
    
    Add a utility method for only un-escaping valid utf and supply a test
    as well as test data for this issue.
    
    Fixes task-22594.
---
 CHANGELOG.md                                       |  8 ++++
 .../org/torproject/onionoo/docs/DocumentStore.java |  4 +-
 .../torproject/onionoo/server/ResponseBuilder.java |  5 +--
 .../torproject/onionoo/util/FormattingUtils.java   | 34 +++++++++++++++++
 .../onionoo/util/FormattingUtilsTest.java          | 43 ++++++++++++++++++++++
 src/test/resources/lines-for-escape-tests.txt      | 16 ++++++++
 6 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6fe389b..3a3c468 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+# Changes in version 5.1-1.12.0 - 2018-??-??
+
+ * Minor changes
+   - Don't attempt to un-escape character sequences in contact lines
+     (like "\uk") that only happen to start like escaped utf-8 characters
+     (like "\u0055").
+
+
 # Changes in version 5.1-1.11.0 - 2018-03-14
 
  * Medium changes
diff --git a/src/main/java/org/torproject/onionoo/docs/DocumentStore.java b/src/main/java/org/torproject/onionoo/docs/DocumentStore.java
index 4622a34..f1f3803 100644
--- a/src/main/java/org/torproject/onionoo/docs/DocumentStore.java
+++ b/src/main/java/org/torproject/onionoo/docs/DocumentStore.java
@@ -9,7 +9,6 @@ import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.google.gson.JsonParseException;
 
-import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -318,8 +317,7 @@ public class DocumentStore {
        * objects are escaped JSON, e.g., \u00F2.  When Gson serlializes
        * this string, it escapes the \ to \\, hence writes \\u00F2.  We
        * need to undo this and change \\u00F2 back to \u00F2. */
-      documentString = StringUtils.replace(gson.toJson(document),
-          "\\\\u", "\\u");
+      documentString = FormattingUtils.replaceValidUtf(gson.toJson(document));
       /* Existing details statuses don't contain opening and closing curly
        * brackets, so we should remove them from new details statuses,
        * too. */
diff --git a/src/main/java/org/torproject/onionoo/server/ResponseBuilder.java b/src/main/java/org/torproject/onionoo/server/ResponseBuilder.java
index bb36a2c..e2bdf82 100644
--- a/src/main/java/org/torproject/onionoo/server/ResponseBuilder.java
+++ b/src/main/java/org/torproject/onionoo/server/ResponseBuilder.java
@@ -12,12 +12,11 @@ import org.torproject.onionoo.docs.DocumentStoreFactory;
 import org.torproject.onionoo.docs.SummaryDocument;
 import org.torproject.onionoo.docs.UptimeDocument;
 import org.torproject.onionoo.docs.WeightsDocument;
+import org.torproject.onionoo.util.FormattingUtils;
 
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 
-import org.apache.commons.lang3.StringUtils;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -348,7 +347,7 @@ public class ResponseBuilder {
         /* Whenever we provide Gson with a string containing an escaped
          * non-ASCII character like \u00F2, it escapes the \ to \\, which
          * we need to undo before including the string in a response. */
-        return StringUtils.replace(gson.toJson(dd), "\\\\u", "\\u");
+        return FormattingUtils.replaceValidUtf(gson.toJson(dd));
       } else {
         // TODO We should probably log that we didn't find a details
         // document that we expected to exist.
diff --git a/src/main/java/org/torproject/onionoo/util/FormattingUtils.java b/src/main/java/org/torproject/onionoo/util/FormattingUtils.java
index 7ed1377..3d16f5a 100644
--- a/src/main/java/org/torproject/onionoo/util/FormattingUtils.java
+++ b/src/main/java/org/torproject/onionoo/util/FormattingUtils.java
@@ -3,8 +3,18 @@
 
 package org.torproject.onionoo.util;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Static helper methods for string processing etc. */
 public class FormattingUtils {
 
+  private static Logger log = LoggerFactory.getLogger(
+      FormattingUtils.class);
+
   private FormattingUtils() {
   }
 
@@ -35,5 +45,29 @@ public class FormattingUtils {
   public static String formatDecimalNumber(long decimalNumber) {
     return String.format("%,d", decimalNumber);
   }
+
+  private static Pattern escapePattern = Pattern.compile(
+       "(\\\\{4}u[0-9a-fA-F]{4})");
+
+  /** De-escape only valid UTF and leave anything else escaped. */
+  public static String replaceValidUtf(String text) {
+    if (null == text || text.isEmpty()) {
+      return text;
+    }
+    try {
+      StringBuffer sb = new StringBuffer();
+      Matcher mat = escapePattern.matcher(text);
+      while (mat.find()) {
+        String unescaped = mat.group(1);
+        mat.appendReplacement(sb, unescaped);
+      }
+      mat.appendTail(sb);
+      return sb.toString();
+    } catch (Throwable ex) {
+      log.debug("Couldn't process input '{}'.", text, ex);
+      return text;
+    }
+  }
+
 }
 
diff --git a/src/test/java/org/torproject/onionoo/util/FormattingUtilsTest.java b/src/test/java/org/torproject/onionoo/util/FormattingUtilsTest.java
new file mode 100644
index 0000000..8744696
--- /dev/null
+++ b/src/test/java/org/torproject/onionoo/util/FormattingUtilsTest.java
@@ -0,0 +1,43 @@
+package org.torproject.onionoo.util;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+@RunWith(Parameterized.class)
+public class FormattingUtilsTest {
+
+  /** Provide test data. */
+  @Parameters
+  public static Collection<String[]> data() throws Exception {
+    List<String> lines = Files.readAllLines((new File(ClassLoader
+        .getSystemResource("lines-for-escape-tests.txt").toURI()))
+        .toPath());
+    List<String[]> testData = new ArrayList<>();
+    for (int i = 0; i < lines.size(); i += 2) {
+      testData.add(new String[]{lines.get(i), lines.get(i + 1)});
+    }
+    return testData;
+  }
+
+  @Parameter(0)
+  public String in;
+
+  @Parameter(1)
+  public String out;
+
+  @Test
+  public void testReplaceUtf() {
+    assertEquals(out, new String(FormattingUtils.replaceValidUtf(in)));
+  }
+}
diff --git a/src/test/resources/lines-for-escape-tests.txt b/src/test/resources/lines-for-escape-tests.txt
new file mode 100644
index 0000000..4fb5895
--- /dev/null
+++ b/src/test/resources/lines-for-escape-tests.txt
@@ -0,0 +1,16 @@
+
+
+abc
+abc
+\\\\u
+\\\\u
+Haha/\\\\@/\\\\live/\\\\./\\\\co/\\\\./\\\uk
+Haha/\\\\@/\\\\live/\\\\./\\\\co/\\\\./\\\uk
+\\\\u20ac
+\\u20ac
+\\\\u0024
+\\u0024
+some \\\\u20ac other string \\\\u0024 to unescape
+some \\u20ac other string \\u0024 to unescape
+abcd efg\\\\u0024xyz\\\\uxxxx
+abcd efg\\u0024xyz\\\\uxxxx



More information about the tor-commits mailing list