[tor-commits] [snowflake/main] amp package.
dcf at torproject.org
dcf at torproject.org
Thu Aug 5 22:18:28 UTC 2021
commit c9e0dd287f30b2acb0145a7efc326c881792138a
Author: David Fifield <david at bamsoftware.com>
Date: Sun Jul 18 15:22:03 2021 -0600
amp package.
This package contains a CacheURL function that modifies a URL to be
accessed through an AMP cache, and the "AMP armor" data encoding scheme
for encoding data into the AMP subset of HTML.
---
common/amp/armor_decoder.go | 136 +++++++++++++++++++
common/amp/armor_encoder.go | 176 ++++++++++++++++++++++++
common/amp/armor_test.go | 227 +++++++++++++++++++++++++++++++
common/amp/cache.go | 178 ++++++++++++++++++++++++
common/amp/cache_test.go | 320 ++++++++++++++++++++++++++++++++++++++++++++
common/amp/doc.go | 88 ++++++++++++
common/amp/path.go | 44 ++++++
common/amp/path_test.go | 54 ++++++++
8 files changed, 1223 insertions(+)
diff --git a/common/amp/armor_decoder.go b/common/amp/armor_decoder.go
new file mode 100644
index 0000000..fed44a6
--- /dev/null
+++ b/common/amp/armor_decoder.go
@@ -0,0 +1,136 @@
+package amp
+
+import (
+ "bufio"
+ "bytes"
+ "encoding/base64"
+ "fmt"
+ "io"
+
+ "golang.org/x/net/html"
+)
+
+// ErrUnknownVersion is the error returned by NewArmorDecoder when the first byte inside
+// the element encoding (but outside the base64 encoding) is not '0'.
+type ErrUnknownVersion byte // the value is the unrecognized version byte itself
+
+func (err ErrUnknownVersion) Error() string { // implements the error interface
+ return fmt.Sprintf("unknown armor version indicator %+q", byte(err))
+}
+
+func isASCIIWhitespace(b byte) bool { // reports whether b is ASCII whitespace per the WHATWG Infra Standard
+ switch b {
+ // https://infra.spec.whatwg.org/#ascii-whitespace
+ case '\x09', '\x0a', '\x0c', '\x0d', '\x20': // TAB, LF, FF, CR, SPACE
+ return true
+ default:
+ return false
+ }
+}
+
+func splitASCIIWhitespace(data []byte, atEOF bool) (advance int, token []byte, err error) { // split function passed to bufio.Scanner.Split; tokens are runs of non-whitespace bytes
+ var i, j int
+ // Skip initial whitespace; afterwards i is the index of the first token byte (or len(data)).
+ for i = 0; i < len(data); i++ {
+ if !isASCIIWhitespace(data[i]) {
+ break
+ }
+ }
+ // Look for the next whitespace byte, which terminates the token.
+ for j = i; j < len(data); j++ {
+ if isASCIIWhitespace(data[j]) {
+ return j + 1, data[i:j], nil // consume the token and the single whitespace byte after it
+ }
+ }
+ // We reached the end of data without finding more whitespace. Only
+ // consider it a token if we are at EOF.
+ if atEOF && i < j {
+ return j, data[i:j], nil
+ }
+ // Otherwise, request more data.
+ return i, nil, nil // the leading whitespace already scanned is still discarded
+}
+
+func decodeToWriter(w io.Writer, r io.Reader) (int64, error) { // copies the text inside <pre> elements of the HTML in r to w, minus ASCII whitespace; returns bytes written
+ tokenizer := html.NewTokenizer(r)
+ // Set a memory limit on token sizes, otherwise the tokenizer will
+ // buffer text indefinitely if it is not broken up by other token types.
+ tokenizer.SetMaxBuf(elementSizeLimit)
+ active := false // true while inside a <pre> element
+ total := int64(0)
+ for {
+ tt := tokenizer.Next()
+ switch tt {
+ case html.ErrorToken:
+ err := tokenizer.Err()
+ if err == io.EOF { // a clean end of input is not an error by itself
+ err = nil
+ }
+ if err == nil && active {
+ return total, fmt.Errorf("missing </pre> tag")
+ }
+ return total, err
+ case html.TextToken: // text outside <pre> elements is ignored
+ if active {
+ // Strip ASCII whitespace from the text, writing each
+ // whitespace-separated piece to w in order.
+ scanner := bufio.NewScanner(bytes.NewReader(tokenizer.Text()))
+ scanner.Split(splitASCIIWhitespace)
+ for scanner.Scan() {
+ n, err := w.Write(scanner.Bytes())
+ total += int64(n)
+ if err != nil {
+ return total, err
+ }
+ }
+ if err := scanner.Err(); err != nil {
+ return total, err
+ }
+ }
+ case html.StartTagToken: // tags other than "pre" fall through without effect
+ tn, _ := tokenizer.TagName()
+ if string(tn) == "pre" {
+ if active {
+ // nesting not allowed
+ return total, fmt.Errorf("unexpected %s", tokenizer.Token())
+ }
+ active = true
+ }
+ case html.EndTagToken:
+ tn, _ := tokenizer.TagName()
+ if string(tn) == "pre" {
+ if !active {
+ // stray end tag
+ return total, fmt.Errorf("unexpected %s", tokenizer.Token())
+ }
+ active = false
+ }
+ }
+ }
+}
+
+// NewArmorDecoder returns a new AMP armor decoder that reads armored HTML from r; it reads the version byte before returning.
+func NewArmorDecoder(r io.Reader) (io.Reader, error) {
+ pr, pw := io.Pipe()
+ go func() { // feeds the pipe until decodeToWriter returns (end of r, a decode error, or a failed pipe write)
+ _, err := decodeToWriter(pw, r)
+ pw.CloseWithError(err)
+ }()
+
+ // The first byte inside the element encoding is a server–client
+ // protocol version indicator.
+ var version [1]byte
+ _, err := pr.Read(version[:])
+ if err != nil {
+ pr.CloseWithError(err) // makes further writes by the goroutine fail, so it can exit
+ return nil, err
+ }
+ switch version[0] {
+ case '0': // the only version handled here: the rest of the stream is standard base64
+ return base64.NewDecoder(base64.StdEncoding, pr), nil
+ default:
+ err := ErrUnknownVersion(version[0])
+ pr.CloseWithError(err)
+ return nil, err
+ }
+}
diff --git a/common/amp/armor_encoder.go b/common/amp/armor_encoder.go
new file mode 100644
index 0000000..5d6b0ae
--- /dev/null
+++ b/common/amp/armor_encoder.go
@@ -0,0 +1,176 @@
+package amp
+
+import (
+ "encoding/base64"
+ "io"
+)
+
+// https://amp.dev/boilerplate/
+// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amp-boilerplate/?format=websites
+// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/?format=websites#the-amp-html-format
+const (
+ boilerplateStart = `<!doctype html>
+<html amp>
+<head>
+<meta charset="utf-8">
+<script async src="https://cdn.ampproject.org/v0.js"></script>
+<link rel="canonical" href="#">
+<meta name="viewport" content="width=device-width">
+<style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript>
+</head>
+<body>
+`
+ boilerplateEnd = `</body>
+</html>`
+)
+
+const (
+ // We restrict the amount of text that may go inside an HTML element, in
+ // order to limit the amount a decoder may have to buffer.
+ elementSizeLimit = 32 * 1024
+
+ // The payload is conceptually a long base64-encoded string, but we
+ // break the string into short chunks separated by whitespace. This is
+ // to protect against modification by AMP caches, which reportedly may
+ // truncate long words in text:
+ // https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985#note_2592348
+ bytesPerChunk = 32
+
+ // We set the number of chunks per element so as to stay under
+ // elementSizeLimit. Here, we assume that there is 1 byte of whitespace
+ // after each chunk (with an additional whitespace byte at the beginning
+ // of the element).
+ chunksPerElement = (elementSizeLimit - 1) / (bytesPerChunk + 1) // 992 with the values above
+)
+
+// The AMP armor encoder is a chain of a base64 encoder (base64.NewEncoder) and
+// an HTML element encoder (elementEncoder). A top-level encoder (armorEncoder)
+// coordinates these two, and handles prepending and appending the AMP
+// boilerplate. armorEncoder's Write method writes data into the base64 encoder,
+// where it makes its way through the chain.
+
+// NewArmorEncoder returns a new AMP armor encoder. Anything written to the
+// returned io.WriteCloser will be encoded and written to w. The caller must
+// call Close to flush any partially written data and output the AMP boilerplate
+// trailer. The boilerplate header and version byte are written to w before returning.
+func NewArmorEncoder(w io.Writer) (io.WriteCloser, error) {
+ // Immediately write the AMP boilerplate header.
+ _, err := w.Write([]byte(boilerplateStart))
+ if err != nil {
+ return nil, err
+ }
+
+ element := &elementEncoder{w: w}
+ // Write a server–client protocol version indicator, outside the base64
+ // layer.
+ _, err = element.Write([]byte{'0'})
+ if err != nil {
+ return nil, err
+ }
+
+ base64 := base64.NewEncoder(base64.StdEncoding, element) // the local name shadows the base64 package from here on
+ return &armorEncoder{
+ w: w,
+ element: element,
+ base64: base64,
+ }, nil
+}
+
+type armorEncoder struct { // top-level AMP armor encoder, as constructed by NewArmorEncoder
+ base64 io.WriteCloser // base64 encoder whose output goes into element
+ element *elementEncoder // wraps the base64 text in <pre> elements written to w
+ w io.Writer // final destination; also receives the boilerplate directly
+}
+
+func (enc *armorEncoder) Write(p []byte) (int, error) { // encodes p; output may be held back until later writes or Close
+ // Write into the chain base64 | element | w.
+ return enc.base64.Write(p)
+}
+
+func (enc *armorEncoder) Close() error { // finishes the document; stops at the first error, leaving later stages unwritten
+ // Close the base64 encoder first, to flush out any buffered data and
+ // the final padding.
+ err := enc.base64.Close()
+ if err != nil {
+ return err
+ }
+
+ // Next, close the element encoder, to close any open elements.
+ err = enc.element.Close()
+ if err != nil {
+ return err
+ }
+
+ // Finally, output the AMP boilerplate trailer.
+ _, err = enc.w.Write([]byte(boilerplateEnd))
+ if err != nil {
+ return err
+ }
+
+ return nil // note: the underlying writer w itself is not closed
+}
+
+// elementEncoder arranges written data into pre elements, with the text within
+// separated into chunks. It does no HTML encoding, so data written must not
+// contain any bytes that are meaningful in HTML.
+type elementEncoder struct {
+ w io.Writer // destination for the framed output
+ chunkCounter int // bytes written so far in the current chunk; reset at bytesPerChunk
+ elementCounter int // complete chunks written so far in the current element; reset at chunksPerElement
+}
+
+// Write splits p into whitespace-separated chunks inside <pre> elements and
+// returns how many bytes of p were consumed (markup bytes are not counted).
+func (enc *elementEncoder) Write(p []byte) (n int, err error) {
+	total := 0
+	for len(p) > 0 {
+		// Open a new element when none is in progress.
+		if enc.elementCounter == 0 && enc.chunkCounter == 0 {
+			if _, err := enc.w.Write([]byte("<pre>\n")); err != nil {
+				return total, err
+			}
+		}
+
+		// Take only as many bytes as still fit into the current chunk.
+		take := bytesPerChunk - enc.chunkCounter
+		if take > len(p) {
+			take = len(p)
+		}
+		nn, err := enc.w.Write(p[:take])
+		total += nn // count a partial write before reporting its error
+		if err != nil {
+			return total, err
+		}
+		p = p[take:]
+
+		enc.chunkCounter += take
+		if enc.chunkCounter >= bytesPerChunk {
+			enc.chunkCounter = 0
+			enc.elementCounter++
+			// The separator is framing, not payload: keep it out of total.
+			if _, err := enc.w.Write([]byte("\n")); err != nil {
+				return total, err
+			}
+		}
+
+		if enc.elementCounter >= chunksPerElement {
+			enc.elementCounter = 0
+			if _, err := enc.w.Write([]byte("</pre>\n")); err != nil {
+				return total, err
+			}
+		}
+	}
+	return total, nil
+}
+
+func (enc *elementEncoder) Close() error { // terminates a <pre> element left open by Write, if any; does not close w
+ var err error
+ if !(enc.elementCounter == 0 && enc.chunkCounter == 0) {
+ if enc.chunkCounter == 0 {
+ _, err = enc.w.Write([]byte("</pre>\n")) // the last chunk already ended with "\n"
+ } else {
+ _, err = enc.w.Write([]byte("\n</pre>\n")) // end the partial chunk's line first
+ }
+ }
+ return err
+}
diff --git a/common/amp/armor_test.go b/common/amp/armor_test.go
new file mode 100644
index 0000000..594ae65
--- /dev/null
+++ b/common/amp/armor_test.go
@@ -0,0 +1,227 @@
+package amp
+
+import (
+ "crypto/rand"
+ "io"
+ "io/ioutil"
+ "strings"
+ "testing"
+)
+
+func armorDecodeToString(src string) (string, error) {
+ dec, err := NewArmorDecoder(strings.NewReader(src))
+ if err != nil {
+ return "", err
+ }
+ p, err := ioutil.ReadAll(dec)
+ return string(p), err
+}
+
+func TestArmorDecoder(t *testing.T) {
+ for _, test := range []struct {
+ input string
+ expectedOutput string
+ expectedErr bool
+ }{
+ {`
+<pre>
+0
+</pre>
+`,
+ "",
+ false,
+ },
+ {`
+<pre>
+0aGVsbG8gd29ybGQK
+</pre>
+`,
+ "hello world\n",
+ false,
+ },
+ // bad version indicator
+ {`
+<pre>
+1aGVsbG8gd29ybGQK
+</pre>
+`,
+ "",
+ true,
+ },
+ // text outside <pre> elements
+ {`
+0aGVsbG8gd29ybGQK
+blah blah blah
+<pre>
+0aGVsbG8gd29ybGQK
+</pre>
+0aGVsbG8gd29ybGQK
+blah blah blah
+`,
+ "hello world\n",
+ false,
+ },
+ {`
+<pre>
+0QUJDREV
+GR0hJSkt
+MTU5PUFF
+SU1RVVld
+</pre>
+junk
+<pre>
+YWVowMTI
+zNDU2Nzg
+5Cg
+=
+</pre>
+<pre>
+=
+</pre>
+`,
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n",
+ false,
+ },
+ // no <pre> elements, hence no version indicator
+ {`
+aGVsbG8gd29ybGQK
+blah blah blah
+aGVsbG8gd29ybGQK
+aGVsbG8gd29ybGQK
+blah blah blah
+`,
+ "",
+ true,
+ },
+ // empty <pre> elements, hence no version indicator
+ {`
+aGVsbG8gd29ybGQK
+blah blah blah
+<pre> </pre>
+aGVsbG8gd29ybGQK
+aGVsbG8gd29ybGQK<pre></pre>
+blah blah blah
+`,
+ "",
+ true,
+ },
+ // other elements inside <pre>
+ {
+ "blah <pre>0aGVsb<p>G8gd29</p>ybGQK</pre>",
+ "hello world\n",
+ false,
+ },
+ // HTML comment
+ {
+ "blah <!-- <pre>aGVsbG8gd29ybGQK</pre> -->",
+ "",
+ true,
+ },
+ // all kinds of ASCII whitespace
+ {
+ "blah <pre>\x200\x09aG\x0aV\x0csb\x0dG8\x20gd29ybGQK</pre>",
+ "hello world\n",
+ false,
+ },
+
+ // bad padding
+ {`
+<pre>
+0QUJDREV
+GR0hJSkt
+MTU5PUFF
+SU1RVVld
+</pre>
+junk
+<pre>
+YWVowMTI
+zNDU2Nzg
+5Cg
+=
+</pre>
+`,
+ "",
+ true,
+ },
+ /*
+ // per-chunk base64
+ // test disabled because Go stdlib handles this incorrectly:
+ // https://github.com/golang/go/issues/31626
+ {
+ "<pre>QQ==</pre><pre>Qg==</pre>",
+ "",
+ true,
+ },
+ */
+ // missing </pre>
+ {
+ "blah <pre></pre><pre>0aGVsbG8gd29ybGQK",
+ "",
+ true,
+ },
+ // nested <pre>
+ {
+ "blah <pre>0aGVsb<pre>G8gd29</pre>ybGQK</pre>",
+ "",
+ true,
+ },
+ } {
+ output, err := armorDecodeToString(test.input)
+ if test.expectedErr && err == nil {
+ t.Errorf("%+q → (%+q, %v), expected error", test.input, output, err)
+ continue
+ }
+ if !test.expectedErr && err != nil {
+ t.Errorf("%+q → (%+q, %v), expected no error", test.input, output, err)
+ continue
+ }
+ if !test.expectedErr && output != test.expectedOutput {
+ t.Errorf("%+q → (%+q, %v), expected (%+q, %v)",
+ test.input, output, err, test.expectedOutput, nil)
+ continue
+ }
+ }
+}
+
+func armorRoundTrip(s string) (string, error) {
+ var encoded strings.Builder
+ enc, err := NewArmorEncoder(&encoded)
+ if err != nil {
+ return "", err
+ }
+ _, err = io.Copy(enc, strings.NewReader(s))
+ if err != nil {
+ return "", err
+ }
+ err = enc.Close()
+ if err != nil {
+ return "", err
+ }
+ return armorDecodeToString(encoded.String())
+}
+
+func TestArmorRoundTrip(t *testing.T) {
+ lengths := make([]int, 0)
+ // Test short strings and lengths around elementSizeLimit thresholds.
+ for i := 0; i < bytesPerChunk*2; i++ {
+ lengths = append(lengths, i)
+ }
+ for i := -10; i < +10; i++ {
+ lengths = append(lengths, elementSizeLimit+i)
+ lengths = append(lengths, 2*elementSizeLimit+i)
+ }
+ for _, n := range lengths {
+ buf := make([]byte, n)
+ rand.Read(buf)
+ input := string(buf)
+ output, err := armorRoundTrip(input)
+ if err != nil {
+ t.Errorf("length %d → error %v", n, err)
+ continue
+ }
+ if output != input {
+ t.Errorf("length %d → %+q", n, output)
+ continue
+ }
+ }
+}
diff --git a/common/amp/cache.go b/common/amp/cache.go
new file mode 100644
index 0000000..102993f
--- /dev/null
+++ b/common/amp/cache.go
@@ -0,0 +1,178 @@
+package amp
+
+import (
+ "crypto/sha256"
+ "encoding/base32"
+ "fmt"
+ "net"
+ "net/url"
+ "path"
+ "strings"
+
+ "golang.org/x/net/idna"
+)
+
+// domainPrefixBasic does the basic domain prefix conversion. Does not do any
+// IDNA mapping, such as https://www.unicode.org/reports/tr46/.
+//
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#basic-algorithm
+func domainPrefixBasic(domain string) (string, error) {
+ // 1. Punycode Decode the publisher domain.
+ prefix, err := idna.ToUnicode(domain)
+ if err != nil {
+ return "", err
+ }
+
+ // 2. Replace any "-" (hyphen) character in the output of step 1 with
+ // "--" (two hyphens).
+ prefix = strings.Replace(prefix, "-", "--", -1)
+
+ // 3. Replace any "." (dot) character in the output of step 2 with "-"
+ // (hyphen).
+ prefix = strings.Replace(prefix, ".", "-", -1)
+
+ // 4. If the output of step 3 has a "-" (hyphen) at both positions 3 and
+ // 4, then to the output of step 3, add a prefix of "0-" and add a
+ // suffix of "-0".
+ if len(prefix) >= 4 && prefix[2] == '-' && prefix[3] == '-' { // NOTE(review): these are byte offsets; they differ from character positions when the first characters are non-ASCII — confirm intended
+ prefix = "0-" + prefix + "-0"
+ }
+
+ // 5. Punycode Encode the output of step 3 (including any step 4 wrapping).
+ return idna.ToASCII(prefix)
+}
+
+// Lower-case base32 without padding.
+var fallbackBase32Encoding = base32.NewEncoding("abcdefghijklmnopqrstuvwxyz234567").WithPadding(base32.NoPadding)
+
+// domainPrefixFallback does the fallback domain prefix conversion. The returned
+// base32 domain uses lower-case letters.
+//
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#fallback-algorithm
+func domainPrefixFallback(domain string) string {
+ // The algorithm specification does not say what, exactly, we are to
+ // take the SHA-256 of. domain is notionally an abstract Unicode
+ // string, not a byte sequence. While
+ // https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L62
+ // says "Take the SHA256 of the punycode view of the domain," in reality
+ // it hashes the UTF-8 encoding of the domain, without Punycode:
+ // https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L141
+ // https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/browser/Sha256.js#L24
+ // We do the same here, hashing the raw bytes of domain, presumed to be
+ // UTF-8.
+
+ // 1. Hash the publisher's domain using SHA256.
+ h := sha256.Sum256([]byte(domain))
+
+ // 2. Base32 Escape the output of step 1.
+ // 3. Remove the last 4 characters from the output of step 2, which are
+ // always "=" (equals) characters.
+ return fallbackBase32Encoding.EncodeToString(h[:]) // the encoding is built with NoPadding, so step 3 needs no separate trimming
+}
+
+// domainPrefix computes the domain prefix of an AMP cache URL.
+//
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#domain-name-prefix
+func domainPrefix(domain string) string {
+ // https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#combined-algorithm
+ // 1. Run the Basic Algorithm. If the output is a valid DNS label,
+ // [append the Cache domain suffix and] return. Otherwise continue to
+ // step 2.
+ prefix, err := domainPrefixBasic(domain)
+ // "A domain prefix is not a valid DNS label if it is longer than 63
+ // characters"
+ if err == nil && len(prefix) <= 63 { // length is the only validity check applied here
+ return prefix
+ }
+ // 2. Run the Fallback Algorithm. [Append the Cache domain suffix and]
+ // return.
+ return domainPrefixFallback(domain) // 52 base32 characters for a SHA-256 digest, so within the 63 limit
+}
+
+// CacheURL computes the AMP cache URL for the publisher URL pubURL, using the
+// AMP cache at cacheURL. contentType is a string such as "c" or "i" that
+// indicates what type of serving the AMP cache is to perform. The Scheme of
+// pubURL must be "http" or "https". The Port of pubURL, if any, must match the
+// default for the scheme. cacheURL may not have RawQuery, Fragment, or
+// RawFragment set, because the resulting URL's query and fragment are taken
+// from the publisher URL. An error is returned when any of these conditions is violated.
+//
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/
+func CacheURL(pubURL, cacheURL *url.URL, contentType string) (*url.URL, error) {
+ // The cache URL subdomain, including the domain prefix corresponding to
+ // the publisher URL's domain.
+ resultHost := domainPrefix(pubURL.Hostname()) + "." + cacheURL.Hostname()
+ if cacheURL.Port() != "" { // keep an explicit cache port
+ resultHost = net.JoinHostPort(resultHost, cacheURL.Port())
+ }
+
+ // https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#url-path
+ // The first part of the path is the cache URL's own path, if any.
+ pathComponents := []string{cacheURL.EscapedPath()}
+ // The next path component is the content type. We cannot encode an
+ // empty content type, because it would result in consecutive path
+ // separators, which would semantically combine into a single separator.
+ if contentType == "" {
+ return nil, fmt.Errorf("invalid content type %+q", contentType)
+ }
+ pathComponents = append(pathComponents, url.PathEscape(contentType))
+ // Then, we add an "s" path component, if the publisher URL scheme is
+ // "https".
+ switch pubURL.Scheme {
+ case "http":
+ // Do nothing.
+ case "https":
+ pathComponents = append(pathComponents, "s")
+ default: // includes an empty scheme
+ return nil, fmt.Errorf("invalid scheme %+q in publisher URL", pubURL.Scheme)
+ }
+ // The next path component is the publisher URL's host. The AMP cache
+ // URL format specification is not clear about whether other
+ // subcomponents of the authority (namely userinfo and port) may appear
+ // here. We adopt a policy of forbidding userinfo, and requiring that
+ // the port be the default for the scheme (and then we omit the port
+ // entirely from the returned URL).
+ if pubURL.User != nil {
+ return nil, fmt.Errorf("publisher URL may not contain userinfo")
+ }
+ if port := pubURL.Port(); port != "" {
+ if !((pubURL.Scheme == "http" && port == "80") || (pubURL.Scheme == "https" && port == "443")) {
+ return nil, fmt.Errorf("publisher URL port %+q is not the default for scheme %+q", port, pubURL.Scheme)
+ }
+ }
+ // As with the content type, we cannot encode an empty host, because
+ // that would result in an empty path component.
+ if pubURL.Hostname() == "" {
+ return nil, fmt.Errorf("invalid host %+q in publisher URL", pubURL.Hostname())
+ }
+ pathComponents = append(pathComponents, url.PathEscape(pubURL.Hostname()))
+ // Finally, we append the remainder of the original escaped path from
+ // the publisher URL.
+ pathComponents = append(pathComponents, pubURL.EscapedPath())
+
+ resultRawPath := path.Join(pathComponents...) // path.Join also cleans the result ("." / ".." elements, repeated and trailing slashes)
+ resultPath, err := url.PathUnescape(resultRawPath) // decoded form for URL.Path; RawPath keeps the escaped form
+ if err != nil {
+ return nil, err
+ }
+
+ // The query and fragment of the returned URL always come from pubURL.
+ // Any query or fragment of cacheURL would be ignored. Return an error
+ // if either is set.
+ if cacheURL.RawQuery != "" {
+ return nil, fmt.Errorf("cache URL may not contain a query")
+ }
+ if cacheURL.Fragment != "" {
+ return nil, fmt.Errorf("cache URL may not contain a fragment")
+ }
+
+ return &url.URL{
+ Scheme: cacheURL.Scheme,
+ User: cacheURL.User,
+ Host: resultHost,
+ Path: resultPath,
+ RawPath: resultRawPath,
+ RawQuery: pubURL.RawQuery,
+ Fragment: pubURL.Fragment,
+ }, nil
+}
diff --git a/common/amp/cache_test.go b/common/amp/cache_test.go
new file mode 100644
index 0000000..45950fd
--- /dev/null
+++ b/common/amp/cache_test.go
@@ -0,0 +1,320 @@
+package amp
+
+import (
+ "bytes"
+ "net/url"
+ "testing"
+
+ "golang.org/x/net/idna"
+)
+
+func TestDomainPrefixBasic(t *testing.T) {
+ // Tests expecting no error.
+ for _, test := range []struct {
+ domain, expected string
+ }{
+ {"", ""},
+ {"xn--", ""},
+ {"...", "---"},
+
+ // Should not apply mappings such as case folding and
+ // normalization.
+ {"b\u00fccher.de", "xn--bcher-de-65a"},
+ {"B\u00fccher.de", "xn--Bcher-de-65a"},
+ {"bu\u0308cher.de", "xn--bucher-de-hkf"},
+
+ // Check some that differ between IDNA 2003 and IDNA 2008.
+ // https://unicode.org/reports/tr46/#Deviations
+ // https://util.unicode.org/UnicodeJsps/idna.jsp
+ {"faß.de", "xn--fa-de-mqa"},
+ {"βόλος.com", "xn---com-4ld8c2a6a8e"},
+
+ // Lengths of 63 and 64. 64 is too long for a DNS label, but
+ // domainPrefixBasic is not expected to check for that.
+ {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"},
+ {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"},
+
+ // https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#basic-algorithm
+ {"example.com", "example-com"},
+ {"foo.example.com", "foo-example-com"},
+ {"foo-example.com", "foo--example-com"},
+ {"xn--57hw060o.com", "xn---com-p33b41770a"},
+ {"\u26a1\U0001f60a.com", "xn---com-p33b41770a"},
+ {"en-us.example.com", "0-en--us-example-com-0"},
+ } {
+ output, err := domainPrefixBasic(test.domain)
+ if err != nil || output != test.expected {
+ t.Errorf("%+q → (%+q, %v), expected (%+q, %v)",
+ test.domain, output, err, test.expected, nil)
+ }
+ }
+
+ // Tests expecting an error.
+ for _, domain := range []string{
+ "xn---",
+ } {
+ output, err := domainPrefixBasic(domain)
+ if err == nil || output != "" {
+ t.Errorf("%+q → (%+q, %v), expected (%+q, non-nil)",
+ domain, output, err, "")
+ }
+ }
+}
+
+func TestDomainPrefixFallback(t *testing.T) {
+ for _, test := range []struct {
+ domain, expected string
+ }{
+ {
+ "",
+ "4oymiquy7qobjgx36tejs35zeqt24qpemsnzgtfeswmrw6csxbkq",
+ },
+ {
+ "example.com",
+ "un42n5xov642kxrxrqiyanhcoupgql5lt4wtbkyt2ijflbwodfdq",
+ },
+
+ // These checked against the output of
+ // https://github.com/ampproject/amp-toolbox/tree/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url,
+ // using the widget at
+ // https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#url-format.
+ {
+ "000000000000000000000000000000000000000000000000000000000000.com",
+ "stejanx4hsijaoj4secyecy4nvqodk56kw72whwcmvdbtucibf5a",
+ },
+ {
+ "00000000000000000000000000000000000000000000000000000000000a.com",
+ "jdcvbsorpnc3hcjrhst56nfm6ymdpovlawdbm2efyxpvlt4cpbya",
+ },
+ {
+ "00000000000000000000000000000000000000000000000000000000000\u03bb.com",
+ "qhzqeumjkfpcpuic3vqruyjswcr7y7gcm3crqyhhywvn3xrhchfa",
+ },
+ } {
+ output := domainPrefixFallback(test.domain)
+ if output != test.expected {
+ t.Errorf("%+q → %+q, expected %+q",
+ test.domain, output, test.expected)
+ }
+ }
+}
+
+// Checks that domainPrefix chooses domainPrefixBasic or domainPrefixFallback as
+// appropriate; i.e., always returns string that is a valid DNS label and is
+// IDNA-decodable.
+func TestDomainPrefix(t *testing.T) {
+ // A validating IDNA profile, which checks label length and that the
+ // label contains only certain ASCII characters. It does not do the
+ // ValidateLabels check, because that depends on the input having
+ // certain properties.
+ profile := idna.New(
+ idna.VerifyDNSLength(true),
+ idna.StrictDomainName(true),
+ )
+ for _, domain := range []string{
+ "example.com",
+ "\u0314example.com",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 63 bytes
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 64 bytes
+ "xn--57hw060o.com",
+ "a b c",
+ } {
+ output := domainPrefix(domain)
+ if bytes.IndexByte([]byte(output), '.') != -1 {
+ t.Errorf("%+q → %+q contains a dot", domain, output)
+ }
+ _, err := profile.ToUnicode(output)
+ if err != nil {
+ t.Errorf("%+q → error %v", domain, err)
+ }
+ }
+}
+
+func mustParseURL(rawurl string) *url.URL {
+ u, err := url.Parse(rawurl)
+ if err != nil {
+ panic(err)
+ }
+ return u
+}
+
+func TestCacheURL(t *testing.T) {
+ // Tests expecting no error.
+ for _, test := range []struct {
+ pub string
+ cache string
+ contentType string
+ expected string
+ }{
+ // With or without trailing slash on pubURL.
+ {
+ "http://example.com/",
+ "https://amp.cache/",
+ "c",
+ "https://example-com.amp.cache/c/example.com",
+ },
+ {
+ "http://example.com",
+ "https://amp.cache/",
+ "c",
+ "https://example-com.amp.cache/c/example.com",
+ },
+ // https pubURL.
+ {
+ "https://example.com/",
+ "https://amp.cache/",
+ "c",
+ "https://example-com.amp.cache/c/s/example.com",
+ },
+ // The content type should be escaped if necessary.
+ {
+ "http://example.com/",
+ "https://amp.cache/",
+ "/",
+ "https://example-com.amp.cache/%2F/example.com",
+ },
+ // Retain pubURL path, query, and fragment, including escaping.
+ {
+ "http://example.com/my%2Fpath/index.html?a=1#fragment",
+ "https://amp.cache/",
+ "c",
+ "https://example-com.amp.cache/c/example.com/my%2Fpath/index.html?a=1#fragment",
+ },
+ // Retain scheme, userinfo, port, and path of cacheURL, escaping
+ // whatever is necessary.
+ {
+ "http://example.com",
+ "http://cache%2Fuser:cache%40pass@amp.cache:123/with/../../path/..%2f../",
+ "c",
+ "http://cache%2Fuser:cache%40pass@example-com.amp.cache:123/path/..%2f../c/example.com",
+ },
+ // Port numbers in pubURL are allowed, if they're the default
+ // for scheme.
+ {
+ "http://example.com:80/",
+ "https://amp.cache/",
+ "c",
+ "https://example-com.amp.cache/c/example.com",
+ },
+ {
+ "https://example.com:443/",
+ "https://amp.cache/",
+ "c",
+ "https://example-com.amp.cache/c/s/example.com",
+ },
+ // "?" at the end of cacheURL is okay, as long as the query is
+ // empty.
+ {
+ "http://example.com/",
+ "https://amp.cache/?",
+ "c",
+ "https://example-com.amp.cache/c/example.com",
+ },
+
+ // https://developers.google.com/amp/cache/overview#example-requesting-document-using-tls
+ {
+ "https://example.com/amp_document.html",
+ "https://cdn.ampproject.org/",
+ "c",
+ "https://example-com.cdn.ampproject.org/c/s/example.com/amp_document.html",
+ },
+ // https://developers.google.com/amp/cache/overview#example-requesting-image-using-plain-http
+ {
+ "http://example.com/logo.png",
+ "https://cdn.ampproject.org/",
+ "i",
+ "https://example-com.cdn.ampproject.org/i/example.com/logo.png",
+ },
+ // https://developers.google.com/amp/cache/overview#query-parameter-example
+ {
+ "https://example.com/g?value=Hello%20World",
+ "https://cdn.ampproject.org/",
+ "c",
+ "https://example-com.cdn.ampproject.org/c/s/example.com/g?value=Hello%20World",
+ },
+ } {
+ pubURL := mustParseURL(test.pub)
+ cacheURL := mustParseURL(test.cache)
+ outputURL, err := CacheURL(pubURL, cacheURL, test.contentType)
+ if err != nil {
+ t.Errorf("%+q %+q %+q → error %v",
+ test.pub, test.cache, test.contentType, err)
+ continue
+ }
+ if outputURL.String() != test.expected {
+ t.Errorf("%+q %+q %+q → %+q, expected %+q",
+ test.pub, test.cache, test.contentType, outputURL, test.expected)
+ continue
+ }
+ }
+
+ // Tests expecting an error.
+ for _, test := range []struct {
+ pub string
+ cache string
+ contentType string
+ }{
+ // Empty content type.
+ {
+ "http://example.com/",
+ "https://amp.cache/",
+ "",
+ },
+ // Empty host.
+ {
+ "http:///index.html",
+ "https://amp.cache/",
+ "c",
+ },
+ // Empty scheme.
+ {
+ "//example.com/",
+ "https://amp.cache/",
+ "c",
+ },
+ // Unrecognized scheme.
+ {
+ "ftp://example.com/",
+ "https://amp.cache/",
+ "c",
+ },
+ // Wrong port number for scheme.
+ {
+ "http://example.com:443/",
+ "https://amp.cache/",
+ "c",
+ },
+ // userinfo in pubURL.
+ {
+ "http://user@example.com/",
+ "https://amp.cache/",
+ "c",
+ },
+ {
+ "http://user:pass@example.com/",
+ "https://amp.cache/",
+ "c",
+ },
+ // cacheURL may not contain a query.
+ {
+ "http://example.com/",
+ "https://amp.cache/?a=1",
+ "c",
+ },
+ // cacheURL may not contain a fragment.
+ {
+ "http://example.com/",
+ "https://amp.cache/#fragment",
+ "c",
+ },
+ } {
+ pubURL := mustParseURL(test.pub)
+ cacheURL := mustParseURL(test.cache)
+ outputURL, err := CacheURL(pubURL, cacheURL, test.contentType)
+ if err == nil {
+ t.Errorf("%+q %+q %+q → %+q, expected error",
+ test.pub, test.cache, test.contentType, outputURL)
+ continue
+ }
+ }
+}
diff --git a/common/amp/doc.go b/common/amp/doc.go
new file mode 100644
index 0000000..1387114
--- /dev/null
+++ b/common/amp/doc.go
@@ -0,0 +1,88 @@
+/*
+Package amp provides functions for working with the AMP (Accelerated Mobile
+Pages) subset of HTML, and conveying binary data through an AMP cache.
+
+AMP cache
+
+The CacheURL function takes a plain URL and converts it to be accessed through a
+given AMP cache.
+
+The EncodePath and DecodePath functions provide a way to encode data into the
+suffix of a URL path. AMP caches do not support HTTP POST, but encoding data
+into a URL path with GET is an alternative means of sending data to the server.
+The format of an encoded path is:
+ 0<0 or more bytes, including slash>/<base64 of data>
+That is:
+* "0", a format version number, which controls the interpretation of the rest of
+the path. Only the first byte matters as a version indicator (not the whole
+first path component).
+* Any number of slash or non-slash bytes. These may be used as padding or to
+prevent cache collisions in the AMP cache.
+* A final slash.
+* base64 encoding of the data, using the URL-safe alphabet (which does not
+include slash).
+
+For example, an encoding of the string "This is path-encoded data." is the
+following. The "lgWHcwhXFjUm" following the format version number is random
+padding that will be ignored on decoding.
+ 0lgWHcwhXFjUm/VGhpcyBpcyBwYXRoLWVuY29kZWQgZGF0YS4
+
+It is the caller's responsibility to add or remove any directory path prefix
+before calling EncodePath or DecodePath.
+
+AMP armor
+
+AMP armor is a data encoding scheme that satisfies the requirements of the
+AMP (Accelerated Mobile Pages) subset of HTML, and survives modification by an
+AMP cache. For the requirements of AMP HTML, see
+https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/.
+For modifications that may be made by an AMP cache, see
+https://github.com/ampproject/amphtml/blob/main/docs/spec/amp-cache-modifications.md.
+
+The encoding is based on ones created by Ivan Markin. See codec/amp/ in
+https://github.com/nogoegst/amper and discussion at
+https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985.
+
+The encoding algorithm works as follows. Base64-encode the input. Prepend the
+base64 with the byte '0'; this is a protocol version indicator that the decoder
+can use to determine how to interpret the bytes that follow. Split the base64
+into fixed-size chunks separated by whitespace. Take up to 1024 chunks at a
+time, and wrap them in a pre element. Then, situate the markup so far within the
+body of the AMP HTML boilerplate. The decoding algorithm is to scan the HTML for
+pre elements, split their text contents on whitespace and concatenate, then
+base64 decode. The base64 encoding uses the standard alphabet, with normal "="
+padding (https://tools.ietf.org/html/rfc4648#section-4).
+
+The reason for splitting the base64 into chunks is that AMP caches reportedly
+truncate long strings that are not broken by whitespace:
+https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985#note_2592348.
+The characters that may separate the chunks are the ASCII whitespace characters
+(https://infra.spec.whatwg.org/#ascii-whitespace) "\x09", "\x0a", "\x0c",
+"\x0d", and "\x20". The reason for separating the chunks into pre elements is to
+limit the amount of text a decoder may have to buffer while parsing the HTML.
+Each pre element may contain at most 64 KB of text. pre elements may not be
+nested.
+
+Example
+
+The following is the result of encoding the string
+"This was encoded with AMP armor.":
+
+ <!doctype html>
+ <html amp>
+ <head>
+ <meta charset="utf-8">
+ <script async src="https://cdn.ampproject.org/v0.js"></script>
+ <link rel="canonical" href="#">
+ <meta name="viewport" content="width=device-width">
+ <style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript>
+ </head>
+ <body>
+ <pre>
+ 0VGhpcyB3YXMgZW5jb2RlZCB3aXRoIEF
+ NUCBhcm1vci4=
+ </pre>
+ </body>
+ </html>
+*/
+package amp
diff --git a/common/amp/path.go b/common/amp/path.go
new file mode 100644
index 0000000..5903694
--- /dev/null
+++ b/common/amp/path.go
@@ -0,0 +1,44 @@
+package amp
+
+import (
+ "crypto/rand"
+ "encoding/base64"
+ "fmt"
+ "strings"
+)
+
+// EncodePath encodes data in a way that is suitable for the suffix of an AMP
+// cache URL.
+func EncodePath(data []byte) string {
+ var cacheBreaker [9]byte
+ _, err := rand.Read(cacheBreaker[:])
+ if err != nil {
+ panic(err)
+ }
+ b64 := base64.RawURLEncoding.EncodeToString
+ return "0" + b64(cacheBreaker[:]) + "/" + b64(data)
+}
+
+// DecodePath decodes data from a path suffix as encoded by EncodePath. The path
+// must have already been trimmed of any directory prefix (as might be present
+// in, e.g., an HTTP request). That is, the first character of path should be
+// the "0" message format indicator.
+func DecodePath(path string) ([]byte, error) {
+ if len(path) < 1 {
+ return nil, fmt.Errorf("missing format indicator")
+ }
+ version := path[0]
+ rest := path[1:]
+ switch version {
+ case '0':
+ // Ignore everything else up to and including the final slash
+ // (there must be at least one slash).
+ i := strings.LastIndexByte(rest, '/')
+ if i == -1 {
+ return nil, fmt.Errorf("missing data")
+ }
+ return base64.RawURLEncoding.DecodeString(rest[i+1:])
+ default:
+ return nil, fmt.Errorf("unknown format indicator %q", version)
+ }
+}
diff --git a/common/amp/path_test.go b/common/amp/path_test.go
new file mode 100644
index 0000000..20e4ccf
--- /dev/null
+++ b/common/amp/path_test.go
@@ -0,0 +1,54 @@
+package amp
+
+import (
+ "testing"
+)
+
+func TestDecodePath(t *testing.T) {
+ for _, test := range []struct {
+ path string
+ expectedData string
+ expectedErrStr string
+ }{
+ {"", "", "missing format indicator"},
+ {"0", "", "missing data"},
+ {"0foobar", "", "missing data"},
+ {"/0/YWJj", "", "unknown format indicator '/'"},
+
+ {"0/", "", ""},
+ {"0foobar/", "", ""},
+ {"0/YWJj", "abc", ""},
+ {"0///YWJj", "abc", ""},
+ {"0foobar/YWJj", "abc", ""},
+ {"0/foobar/YWJj", "abc", ""},
+ } {
+ data, err := DecodePath(test.path)
+ if test.expectedErrStr != "" {
+ if err == nil || err.Error() != test.expectedErrStr {
+ t.Errorf("%+q expected error %+q, got %+q",
+ test.path, test.expectedErrStr, err)
+ }
+ } else if err != nil {
+ t.Errorf("%+q expected no error, got %+q", test.path, err)
+ } else if string(data) != test.expectedData {
+ t.Errorf("%+q expected data %+q, got %+q",
+ test.path, test.expectedData, data)
+ }
+ }
+}
+
+func TestPathRoundTrip(t *testing.T) {
+ for _, data := range []string{
+ "",
+ "\x00",
+ "/",
+ "hello world",
+ } {
+ decoded, err := DecodePath(EncodePath([]byte(data)))
+ if err != nil {
+ t.Errorf("%+q roundtripped with error %v", data, err)
+ } else if string(decoded) != data {
+ t.Errorf("%+q roundtripped to %+q", data, decoded)
+ }
+ }
+}
More information about the tor-commits
mailing list