[tor-commits] [snowflake/main] amp package.

dcf at torproject.org dcf at torproject.org
Thu Aug 5 22:18:28 UTC 2021


commit c9e0dd287f30b2acb0145a7efc326c881792138a
Author: David Fifield <david at bamsoftware.com>
Date:   Sun Jul 18 15:22:03 2021 -0600

    amp package.
    
    This package contains a CacheURL function that modifies a URL to be
    accessed through an AMP cache, and the "AMP armor" data encoding scheme
    for encoding data into the AMP subset of HTML.
---
 common/amp/armor_decoder.go | 136 +++++++++++++++++++
 common/amp/armor_encoder.go | 176 ++++++++++++++++++++++++
 common/amp/armor_test.go    | 227 +++++++++++++++++++++++++++++++
 common/amp/cache.go         | 178 ++++++++++++++++++++++++
 common/amp/cache_test.go    | 320 ++++++++++++++++++++++++++++++++++++++++++++
 common/amp/doc.go           |  88 ++++++++++++
 common/amp/path.go          |  44 ++++++
 common/amp/path_test.go     |  54 ++++++++
 8 files changed, 1223 insertions(+)

diff --git a/common/amp/armor_decoder.go b/common/amp/armor_decoder.go
new file mode 100644
index 0000000..fed44a6
--- /dev/null
+++ b/common/amp/armor_decoder.go
@@ -0,0 +1,136 @@
+package amp
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/base64"
+	"fmt"
+	"io"
+
+	"golang.org/x/net/html"
+)
+
+// ErrUnknownVersion is the error returned when the version indicator, the first
+// byte of the element encoding (outside the base64 encoding), is not '0'.
+type ErrUnknownVersion byte
+
+func (v ErrUnknownVersion) Error() string {
+	return fmt.Sprintf("unknown armor version indicator %+q", uint8(v))
+}
+
+// isASCIIWhitespace reports whether b is one of the five ASCII whitespace bytes
+// named by the WHATWG Infra Standard: tab (0x09), line feed (0x0a), form feed
+// (0x0c), carriage return (0x0d), or space (0x20). Vertical tab (0x0b) is not
+// included.
+//
+// https://infra.spec.whatwg.org/#ascii-whitespace
+func isASCIIWhitespace(b byte) bool {
+	return b == '\t' || b == '\n' || b == '\f' || b == '\r' || b == ' '
+}
+
+// splitASCIIWhitespace is a bufio.SplitFunc that yields maximal runs of
+// non-whitespace bytes, where whitespace is as defined by isASCIIWhitespace.
+func splitASCIIWhitespace(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	// Find the start of the next token by skipping leading whitespace.
+	start := 0
+	for start < len(data) && isASCIIWhitespace(data[start]) {
+		start++
+	}
+	// Scan for the whitespace byte that terminates the token.
+	end := start
+	for ; end < len(data); end++ {
+		if isASCIIWhitespace(data[end]) {
+			return end + 1, data[start:end], nil
+		}
+	}
+	// The data ran out before any terminator. At EOF a non-empty run is
+	// the final token; otherwise drop the whitespace and ask for more.
+	if atEOF && start < end {
+		return end, data[start:end], nil
+	}
+	return start, nil, nil
+}
+
+// decodeToWriter tokenizes r as HTML and writes to w the text inside pre
+// elements, stripped of ASCII whitespace; everything else is ignored. It
+// returns the number of bytes written. Nested pre elements, a stray </pre>,
+// and input that ends inside a pre element are errors.
+func decodeToWriter(w io.Writer, r io.Reader) (int64, error) {
+	tokenizer := html.NewTokenizer(r)
+	// Set a memory limit on token sizes, otherwise the tokenizer will
+	// buffer text indefinitely if it is not broken up by other token types.
+	tokenizer.SetMaxBuf(elementSizeLimit)
+	active, total := false, int64(0)
+	for {
+		switch tokenizer.Next() {
+		case html.ErrorToken:
+			err := tokenizer.Err()
+			if err == io.EOF {
+				err = nil
+			}
+			if err == nil && active {
+				return total, fmt.Errorf("missing </pre> tag")
+			}
+			return total, err
+		case html.TextToken:
+			if active {
+				// Drop the whitespace and write what remains.
+				scanner := bufio.NewScanner(bytes.NewReader(tokenizer.Text()))
+				scanner.Split(splitASCIIWhitespace)
+				for scanner.Scan() {
+					n, err := w.Write(scanner.Bytes())
+					total += int64(n)
+					if err != nil {
+						return total, err
+					}
+				}
+				if err := scanner.Err(); err != nil {
+					return total, err
+				}
+			}
+		case html.StartTagToken:
+			if tn, _ := tokenizer.TagName(); string(tn) == "pre" {
+				if active {
+					// Nesting is not allowed. TagName has consumed the
+					// name, so tokenizer.Token() would print an empty tag.
+					return total, fmt.Errorf("unexpected <pre>")
+				}
+				active = true
+			}
+		case html.EndTagToken:
+			if tn, _ := tokenizer.TagName(); string(tn) == "pre" {
+				if !active {
+					// Stray end tag; named literally, as above.
+					return total, fmt.Errorf("unexpected </pre>")
+				}
+				active = false
+			}
+		}
+	}
+}
+
+// NewArmorDecoder returns a reader of the data encoded in the AMP armor read
+// from r. It reads from r until it has the version indicator, and returns an
+// error if the version indicator cannot be read or is not '0'.
+func NewArmorDecoder(r io.Reader) (io.Reader, error) {
+	pipeReader, pipeWriter := io.Pipe()
+	// Feed the text of pre elements into the pipe in the background.
+	go func() {
+		_, decodeErr := decodeToWriter(pipeWriter, r)
+		pipeWriter.CloseWithError(decodeErr)
+	}()
+
+	// The first byte inside the element encoding is a server–client
+	// protocol version indicator. It is not part of the base64 encoding.
+	indicator := make([]byte, 1)
+	if _, err := pipeReader.Read(indicator); err != nil {
+		pipeReader.CloseWithError(err)
+		return nil, err
+	}
+	if indicator[0] != '0' {
+		err := ErrUnknownVersion(indicator[0])
+		pipeReader.CloseWithError(err)
+		return nil, err
+	}
+	return base64.NewDecoder(base64.StdEncoding, pipeReader), nil
+}
diff --git a/common/amp/armor_encoder.go b/common/amp/armor_encoder.go
new file mode 100644
index 0000000..5d6b0ae
--- /dev/null
+++ b/common/amp/armor_encoder.go
@@ -0,0 +1,176 @@
+package amp
+
+import (
+	"encoding/base64"
+	"io"
+)
+
+// https://amp.dev/boilerplate/
+// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amp-boilerplate/?format=websites
+// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/?format=websites#the-amp-html-format
+const (
+	boilerplateStart = `<!doctype html>
+<html amp>
+<head>
+<meta charset="utf-8">
+<script async src="https://cdn.ampproject.org/v0.js"></script>
+<link rel="canonical" href="#">
+<meta name="viewport" content="width=device-width">
+<style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript>
+</head>
+<body>
+`
+	boilerplateEnd = `</body>
+</html>`
+)
+
+const (
+	// We restrict the amount of text that may go inside an HTML element, in
+	// order to limit the amount a decoder may have to buffer.
+	elementSizeLimit = 32 * 1024
+
+	// The payload is conceptually a long base64-encoded string, but we
+	// break the string into short chunks separated by whitespace. This is
+	// to protect against modification by AMP caches, which reportedly may
+	// truncate long words in text:
+	// https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985#note_2592348
+	bytesPerChunk = 32
+
+	// We set the number of chunks per element so as to stay under
+	// elementSizeLimit. Here, we assume that there is 1 byte of whitespace
+	// after each chunk (with an additional whitespace byte at the beginning
+	// of the element).
+	chunksPerElement = (elementSizeLimit - 1) / (bytesPerChunk + 1)
+)
+
+// The AMP armor encoder is a chain of a base64 encoder (base64.NewEncoder) and
+// an HTML element encoder (elementEncoder). A top-level encoder (armorEncoder)
+// coordinates these two, and handles prepending and appending the AMP
+// boilerplate. armorEncoder's Write method writes data into the base64 encoder,
+// where it makes its way through the chain.
+
+// NewArmorEncoder returns a new AMP armor encoder. Anything written to the
+// returned io.WriteCloser is encoded and written to w. NewArmorEncoder itself
+// writes the AMP boilerplate header and the version indicator to w before it
+// returns. The caller must call Close to flush any partially written data and
+// output the AMP boilerplate trailer.
+func NewArmorEncoder(w io.Writer) (io.WriteCloser, error) {
+	// The AMP boilerplate header goes out right away.
+	if _, err := w.Write([]byte(boilerplateStart)); err != nil {
+		return nil, err
+	}
+
+	// Next comes the server–client protocol version indicator, which is
+	// written to the element encoder directly, bypassing the base64 layer.
+	elementEnc := &elementEncoder{w: w}
+	if _, err := elementEnc.Write([]byte{'0'}); err != nil {
+		return nil, err
+	}
+
+	// Everything written after this point is base64-encoded first.
+	base64Enc := base64.NewEncoder(base64.StdEncoding, elementEnc)
+	return &armorEncoder{
+		w:       w,
+		element: elementEnc,
+		base64:  base64Enc,
+	}, nil
+}
+
+// armorEncoder is the top-level AMP armor encoder. Data written to it goes
+// into base64, which writes into element, which writes into w. armorEncoder
+// also holds w directly, in order to write the AMP boilerplate trailer to it
+// on Close.
+type armorEncoder struct {
+	base64  io.WriteCloser
+	element *elementEncoder
+	w       io.Writer
+}
+
+// Write passes p to the head of the chain base64 | element | w.
+func (enc *armorEncoder) Write(p []byte) (int, error) {
+	return enc.base64.Write(p)
+}
+
+// Close finishes the encoding: it closes the base64 encoder, then the element
+// encoder, then writes the AMP boilerplate trailer, stopping at the first error.
+func (enc *armorEncoder) Close() error {
+	// The base64 encoder goes first, so that it flushes out any buffered
+	// data and the final padding.
+	if err := enc.base64.Close(); err != nil {
+		return err
+	}
+
+	// The element encoder is next, to close any element that is open.
+	if err := enc.element.Close(); err != nil {
+		return err
+	}
+
+	// The AMP boilerplate trailer comes last.
+	_, err := enc.w.Write([]byte(boilerplateEnd))
+	return err
+}
+
+// elementEncoder arranges written data into pre elements, with the text within
+// separated into chunks. It does no HTML encoding, so data written must not
+// contain any bytes that are meaningful in HTML.
+type elementEncoder struct {
+	w              io.Writer // destination of the encoded output
+	chunkCounter   int       // bytes written so far in the current chunk
+	elementCounter int       // chunks completed so far in the current element
+}
+
+// Write arranges p into chunks of bytesPerChunk bytes, each followed by a
+// newline, inside pre elements of chunksPerElement chunks each, opening and
+// closing elements as needed. As io.Writer requires, the returned count covers
+// only bytes of p; the tags and newlines that Write inserts are not counted.
+func (enc *elementEncoder) Write(p []byte) (n int, err error) {
+	total := 0
+	for len(p) > 0 {
+		if enc.elementCounter == 0 && enc.chunkCounter == 0 {
+			if _, err := enc.w.Write([]byte("<pre>\n")); err != nil {
+				return total, err
+			}
+		}
+
+		// Write no more of p than what remains of the current chunk.
+		size := bytesPerChunk - enc.chunkCounter
+		if size > len(p) {
+			size = len(p)
+		}
+		nn, err := enc.w.Write(p[:size])
+		// Record what was actually written, even if there was an error.
+		total += nn
+		enc.chunkCounter += nn
+		if err != nil {
+			return total, err
+		}
+		p = p[size:]
+
+		if enc.chunkCounter >= bytesPerChunk {
+			enc.chunkCounter = 0
+			enc.elementCounter++
+			if _, err := enc.w.Write([]byte("\n")); err != nil {
+				return total, err
+			}
+		}
+		if enc.elementCounter >= chunksPerElement {
+			enc.elementCounter = 0
+			if _, err := enc.w.Write([]byte("</pre>\n")); err != nil {
+				return total, err
+			}
+		}
+	}
+	return total, nil
+}
+
+// Close ends the open pre element, if any, after terminating a partial chunk.
+func (enc *elementEncoder) Close() error {
+	var err error
+	switch {
+	case enc.chunkCounter != 0:
+		_, err = enc.w.Write([]byte("\n</pre>\n"))
+	case enc.elementCounter != 0:
+		_, err = enc.w.Write([]byte("</pre>\n"))
+	}
+	return err
+}
diff --git a/common/amp/armor_test.go b/common/amp/armor_test.go
new file mode 100644
index 0000000..594ae65
--- /dev/null
+++ b/common/amp/armor_test.go
@@ -0,0 +1,227 @@
+package amp
+
+import (
+	"crypto/rand"
+	"io"
+	"io/ioutil"
+	"strings"
+	"testing"
+)
+
+func armorDecodeToString(src string) (string, error) {
+	var decoded []byte
+	decoder, err := NewArmorDecoder(strings.NewReader(src))
+	if err == nil {
+		decoded, err = ioutil.ReadAll(decoder)
+	}
+	return string(decoded), err
+}
+
+// TestArmorDecoder runs NewArmorDecoder on hand-made inputs, good and bad.
+func TestArmorDecoder(t *testing.T) {
+	for _, tc := range []struct {
+		input          string
+		expectedOutput string
+		expectedErr    bool
+	}{
+		// version indicator only, no payload
+		{`
+<pre>
+0
+</pre>
+`,
+			"",
+			false,
+		},
+		// a single element
+		{`
+<pre>
+0aGVsbG8gd29ybGQK
+</pre>
+`,
+			"hello world\n",
+			false,
+		},
+		// bad version indicator
+		{`
+<pre>
+1aGVsbG8gd29ybGQK
+</pre>
+`,
+			"",
+			true,
+		},
+		// text outside <pre> elements
+		{`
+0aGVsbG8gd29ybGQK
+blah blah blah
+<pre>
+0aGVsbG8gd29ybGQK
+</pre>
+0aGVsbG8gd29ybGQK
+blah blah blah
+`,
+			"hello world\n",
+			false,
+		},
+		// payload spread over several elements, with junk in between
+		{`
+<pre>
+0QUJDREV
+GR0hJSkt
+MTU5PUFF
+SU1RVVld
+</pre>
+junk
+<pre>
+YWVowMTI
+zNDU2Nzg
+5Cg
+=
+</pre>
+<pre>
+=
+</pre>
+`,
+			"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n",
+			false,
+		},
+		// no <pre> elements, hence no version indicator
+		{`
+aGVsbG8gd29ybGQK
+blah blah blah
+aGVsbG8gd29ybGQK
+aGVsbG8gd29ybGQK
+blah blah blah
+`,
+			"",
+			true,
+		},
+		// empty <pre> elements, hence no version indicator
+		{`
+aGVsbG8gd29ybGQK
+blah blah blah
+<pre>   </pre>
+aGVsbG8gd29ybGQK
+aGVsbG8gd29ybGQK<pre></pre>
+blah blah blah
+`,
+			"",
+			true,
+		},
+		// other elements inside <pre>
+		{
+			"blah <pre>0aGVsb<p>G8gd29</p>ybGQK</pre>",
+			"hello world\n",
+			false,
+		},
+		// HTML comment
+		{
+			"blah <!-- <pre>aGVsbG8gd29ybGQK</pre> -->",
+			"",
+			true,
+		},
+		// all kinds of ASCII whitespace
+		{
+			"blah <pre>\x200\x09aG\x0aV\x0csb\x0dG8\x20gd29ybGQK</pre>",
+			"hello world\n",
+			false,
+		},
+
+		// bad padding
+		{`
+<pre>
+0QUJDREV
+GR0hJSkt
+MTU5PUFF
+SU1RVVld
+</pre>
+junk
+<pre>
+YWVowMTI
+zNDU2Nzg
+5Cg
+=
+</pre>
+`,
+			"",
+			true,
+		},
+		/*
+			// per-chunk base64
+			// test disabled because Go stdlib handles this incorrectly:
+			// https://github.com/golang/go/issues/31626
+			{
+				"<pre>QQ==</pre><pre>Qg==</pre>",
+				"",
+				true,
+			},
+		*/
+		// missing </pre>
+		{
+			"blah <pre></pre><pre>0aGVsbG8gd29ybGQK",
+			"",
+			true,
+		},
+		// nested <pre>
+		{
+			"blah <pre>0aGVsb<pre>G8gd29</pre>ybGQK</pre>",
+			"",
+			true,
+		},
+	} {
+		output, err := armorDecodeToString(tc.input)
+		switch {
+		case tc.expectedErr && err == nil:
+			t.Errorf("%+q → (%+q, %v), expected error", tc.input, output, err)
+		case !tc.expectedErr && err != nil:
+			t.Errorf("%+q → (%+q, %v), expected no error", tc.input, output, err)
+		case !tc.expectedErr && output != tc.expectedOutput:
+			t.Errorf("%+q → (%+q, %v), expected (%+q, %v)",
+				tc.input, output, err, tc.expectedOutput, nil)
+		}
+	}
+}
+
+// armorRoundTrip AMP-armor-encodes s, then decodes the result, and returns
+// what comes out of the decoder.
+func armorRoundTrip(s string) (string, error) {
+	var armored strings.Builder
+	encoder, err := NewArmorEncoder(&armored)
+	if err != nil {
+		return "", err
+	}
+	if _, err := io.Copy(encoder, strings.NewReader(s)); err != nil {
+		return "", err
+	}
+	if err := encoder.Close(); err != nil {
+		return "", err
+	}
+	return armorDecodeToString(armored.String())
+}
+
+// TestArmorRoundTrip checks that random data of various lengths survives being
+// encoded by NewArmorEncoder and then decoded by NewArmorDecoder.
+func TestArmorRoundTrip(t *testing.T) {
+	// Test short strings and lengths around elementSizeLimit thresholds.
+	var lengths []int
+	for i := 0; i < bytesPerChunk*2; i++ {
+		lengths = append(lengths, i)
+	}
+	for i := -10; i < +10; i++ {
+		lengths = append(lengths, elementSizeLimit+i, 2*elementSizeLimit+i)
+	}
+	for _, n := range lengths {
+		buf := make([]byte, n)
+		if _, err := rand.Read(buf); err != nil {
+			t.Fatalf("length %d → cannot generate random input: %v", n, err)
+		}
+		input := string(buf)
+		output, err := armorRoundTrip(input)
+		if err != nil {
+			t.Errorf("length %d → error %v", n, err)
+		} else if output != input {
+			t.Errorf("length %d → %+q", n, output)
+		}
+	}
+}
diff --git a/common/amp/cache.go b/common/amp/cache.go
new file mode 100644
index 0000000..102993f
--- /dev/null
+++ b/common/amp/cache.go
@@ -0,0 +1,178 @@
+package amp
+
+import (
+	"crypto/sha256"
+	"encoding/base32"
+	"fmt"
+	"net"
+	"net/url"
+	"path"
+	"strings"
+
+	"golang.org/x/net/idna"
+)
+
+// domainPrefixBasic runs the "basic algorithm" that converts a publisher domain
+// into an AMP cache domain prefix. It does not apply any IDNA mapping, such as
+// https://www.unicode.org/reports/tr46/, to domain.
+//
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#basic-algorithm
+func domainPrefixBasic(domain string) (string, error) {
+	// Step 1: Punycode-decode the publisher domain.
+	decoded, err := idna.ToUnicode(domain)
+	if err != nil {
+		return "", err
+	}
+
+	// Step 2: replace each "-" (hyphen) in the output of step 1 with "--"
+	// (two hyphens).
+	escaped := strings.ReplaceAll(decoded, "-", "--")
+
+	// Step 3: replace each "." (dot) in the output of step 2 with "-"
+	// (hyphen).
+	escaped = strings.ReplaceAll(escaped, ".", "-")
+
+	// Step 4: if the output of step 3 has "-" (hyphen) at both positions 3
+	// and 4, wrap it in a prefix of "0-" and a suffix of "-0".
+	if len(escaped) >= 4 && escaped[2:4] == "--" {
+		escaped = "0-" + escaped + "-0"
+	}
+
+	// Step 5: Punycode-encode the result.
+	return idna.ToASCII(escaped)
+}
+
+// fallbackBase32Encoding is base32 with a lower-case alphabet and no padding.
+var fallbackBase32Encoding = base32.NewEncoding("abcdefghijklmnopqrstuvwxyz234567").WithPadding(base32.NoPadding)
+
+// domainPrefixFallback does the fallback domain prefix conversion: it returns
+// the SHA-256 hash of domain, encoded as lower-case base32 without padding.
+//
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#fallback-algorithm
+func domainPrefixFallback(domain string) string {
+	// The algorithm specification leaves open what, exactly, is to be
+	// hashed, domain being notionally an abstract Unicode string rather
+	// than a byte sequence. A comment in amp-toolbox,
+	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L62
+	// says "Take the SHA256 of the punycode view of the domain," but what
+	// the code really hashes is the UTF-8 encoding of the domain, with no
+	// Punycode conversion:
+	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L141
+	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/browser/Sha256.js#L24
+	// We follow the code: we hash the raw bytes of domain, presumed to be
+	// UTF-8.
+
+	// Step 1: hash the publisher's domain using SHA256.
+	digest := sha256.Sum256([]byte(domain))
+
+	// Steps 2 and 3: base32-encode the hash, and remove the "=" (equals)
+	// padding from the end. The encoding never emits padding to begin with.
+	return fallbackBase32Encoding.EncodeToString(digest[:])
+}
+
+// domainPrefix returns the domain prefix to use for domain in an AMP cache
+// URL, following the "combined algorithm" of the AMP cache URL format.
+//
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#domain-name-prefix
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#combined-algorithm
+func domainPrefix(domain string) string {
+	// Step 1: run the Basic Algorithm, and use its output if it is a valid
+	// DNS label. The only validity rule applied here is the length limit:
+	// "A domain prefix is not a valid DNS label if it is longer than 63
+	// characters"
+	basic, err := domainPrefixBasic(domain)
+	if err != nil || len(basic) > 63 {
+		// Step 2: otherwise, run the Fallback Algorithm.
+		return domainPrefixFallback(domain)
+	}
+	// Appending the cache domain suffix is left to the caller.
+	return basic
+}
+
+// CacheURL computes the AMP cache URL for the publisher URL pubURL, using the
+// AMP cache at cacheURL. contentType is a string such as "c" or "i" that
+// indicates what type of serving the AMP cache is to perform; it may not be
+// empty, ".", or "..". The Scheme of pubURL must be "http" or "https", and its
+// Port, if any, must be the default for the scheme. cacheURL may not have
+// RawQuery, Fragment, or RawFragment set, because the resulting URL's query
+// and fragment are taken from the publisher URL.
+//
+// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/
+func CacheURL(pubURL, cacheURL *url.URL, contentType string) (*url.URL, error) {
+	// The cache URL subdomain, including the domain prefix corresponding to
+	// the publisher URL's domain.
+	resultHost := domainPrefix(pubURL.Hostname()) + "." + cacheURL.Hostname()
+	if cacheURL.Port() != "" {
+		resultHost = net.JoinHostPort(resultHost, cacheURL.Port())
+	}
+
+	// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#url-path
+	// The first part of the path is the cache URL's own path, if any.
+	pathComponents := []string{cacheURL.EscapedPath()}
+	// The next path component is the content type. We cannot encode an
+	// empty content type, nor "." or "..", because path.Join (below) would
+	// drop the component or cancel the one before it.
+	switch contentType {
+	case "", ".", "..":
+		return nil, fmt.Errorf("invalid content type %+q", contentType)
+	}
+	pathComponents = append(pathComponents, url.PathEscape(contentType))
+	// Then, we add an "s" path component, if the publisher URL scheme is
+	// "https". The scheme also determines the default port.
+	var defaultPort string
+	switch pubURL.Scheme {
+	case "http":
+		defaultPort = "80"
+	case "https":
+		defaultPort = "443"
+		pathComponents = append(pathComponents, "s")
+	default:
+		return nil, fmt.Errorf("invalid scheme %+q in publisher URL", pubURL.Scheme)
+	}
+	// The next path component is the publisher URL's host. The AMP cache
+	// URL format specification does not say whether the authority's other
+	// subcomponents (userinfo and port) may appear here. Our policy is to
+	// forbid userinfo, and to require that the port be the default for the
+	// scheme (in which case we omit it from the returned URL).
+	if pubURL.User != nil {
+		return nil, fmt.Errorf("publisher URL may not contain userinfo")
+	}
+	if port := pubURL.Port(); port != "" && port != defaultPort {
+		return nil, fmt.Errorf("publisher URL port %+q is not the default for scheme %+q", port, pubURL.Scheme)
+	}
+	// As with the content type, the host may not be empty, ".", or "..".
+	switch pubURL.Hostname() {
+	case "", ".", "..":
+		return nil, fmt.Errorf("invalid host %+q in publisher URL", pubURL.Hostname())
+	}
+	pathComponents = append(pathComponents, url.PathEscape(pubURL.Hostname()))
+	// Finally, we append the escaped path of the publisher URL. It is first
+	// cleaned relative to "/", so that its ".." segments cannot climb out
+	// and cancel the components appended above.
+	pathComponents = append(pathComponents, path.Join("/", pubURL.EscapedPath()))
+
+	resultRawPath := path.Join(pathComponents...)
+	resultPath, err := url.PathUnescape(resultRawPath)
+	if err != nil {
+		return nil, err
+	}
+
+	// The query and fragment of the returned URL always come from pubURL,
+	// so rather than silently ignore those of cacheURL, we reject them.
+	if cacheURL.RawQuery != "" {
+		return nil, fmt.Errorf("cache URL may not contain a query")
+	}
+	if cacheURL.Fragment != "" {
+		return nil, fmt.Errorf("cache URL may not contain a fragment")
+	}
+
+	return &url.URL{
+		Scheme:   cacheURL.Scheme,
+		User:     cacheURL.User,
+		Host:     resultHost,
+		Path:     resultPath,
+		RawPath:  resultRawPath,
+		RawQuery: pubURL.RawQuery,
+		Fragment: pubURL.Fragment,
+	}, nil
+}
diff --git a/common/amp/cache_test.go b/common/amp/cache_test.go
new file mode 100644
index 0000000..45950fd
--- /dev/null
+++ b/common/amp/cache_test.go
@@ -0,0 +1,320 @@
+package amp
+
+import (
+	"bytes"
+	"net/url"
+	"testing"
+
+	"golang.org/x/net/idna"
+)
+
+// TestDomainPrefixBasic checks domainPrefixBasic against known outputs, and
+// against an input for which it must return an error.
+func TestDomainPrefixBasic(t *testing.T) {
+	// Tests expecting no error.
+	for _, tc := range []struct {
+		domain, expected string
+	}{
+		{"", ""},
+		{"xn--", ""},
+		{"...", "---"},
+
+		// Should not apply mappings such as case folding and
+		// normalization.
+		{"b\u00fccher.de", "xn--bcher-de-65a"},
+		{"B\u00fccher.de", "xn--Bcher-de-65a"},
+		{"bu\u0308cher.de", "xn--bucher-de-hkf"},
+
+		// Check some that differ between IDNA 2003 and IDNA 2008.
+		// https://unicode.org/reports/tr46/#Deviations
+		// https://util.unicode.org/UnicodeJsps/idna.jsp
+		{"faß.de", "xn--fa-de-mqa"},
+		{"βόλοσ.com", "xn---com-4ld8c2a6a8e"},
+
+		// Lengths of 63 and 64. 64 is too long for a DNS label, but
+		// domainPrefixBasic is not expected to check for that.
+		{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"},
+		{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"},
+
+		// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#basic-algorithm
+		{"example.com", "example-com"},
+		{"foo.example.com", "foo-example-com"},
+		{"foo-example.com", "foo--example-com"},
+		{"xn--57hw060o.com", "xn---com-p33b41770a"},
+		{"\u26a1\U0001f60a.com", "xn---com-p33b41770a"},
+		{"en-us.example.com", "0-en--us-example-com-0"},
+	} {
+		got, err := domainPrefixBasic(tc.domain)
+		if err != nil || got != tc.expected {
+			t.Errorf("%+q → (%+q, %v), expected (%+q, %v)", tc.domain, got, err, tc.expected, nil)
+		}
+	}
+
+	// Tests expecting an error.
+	for _, domain := range []string{
+		"xn---",
+	} {
+		got, err := domainPrefixBasic(domain)
+		if err == nil || got != "" {
+			t.Errorf("%+q → (%+q, %v), expected (%+q, non-nil)", domain, got, err, "")
+		}
+	}
+}
+
+// TestDomainPrefixFallback checks domainPrefixFallback against known outputs,
+// some of them taken from another implementation of the algorithm.
+func TestDomainPrefixFallback(t *testing.T) {
+	for _, tc := range []struct {
+		domain, expected string
+	}{
+		{
+			"",
+			"4oymiquy7qobjgx36tejs35zeqt24qpemsnzgtfeswmrw6csxbkq",
+		},
+		{
+			"example.com",
+			"un42n5xov642kxrxrqiyanhcoupgql5lt4wtbkyt2ijflbwodfdq",
+		},
+
+		// These checked against the output of
+		// https://github.com/ampproject/amp-toolbox/tree/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url,
+		// using the widget at
+		// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#url-format.
+		{
+			"000000000000000000000000000000000000000000000000000000000000.com",
+			"stejanx4hsijaoj4secyecy4nvqodk56kw72whwcmvdbtucibf5a",
+		},
+		{
+			"00000000000000000000000000000000000000000000000000000000000a.com",
+			"jdcvbsorpnc3hcjrhst56nfm6ymdpovlawdbm2efyxpvlt4cpbya",
+		},
+		{
+			"00000000000000000000000000000000000000000000000000000000000\u03bb.com",
+			"qhzqeumjkfpcpuic3vqruyjswcr7y7gcm3crqyhhywvn3xrhchfa",
+		},
+	} {
+		if got := domainPrefixFallback(tc.domain); got != tc.expected {
+			t.Errorf("%+q → %+q, expected %+q", tc.domain, got, tc.expected)
+		}
+	}
+}
+
+// TestDomainPrefix checks that domainPrefix chooses domainPrefixBasic or
+// domainPrefixFallback as appropriate; i.e., that it always returns a string
+// that is a valid DNS label and is IDNA-decodable.
+func TestDomainPrefix(t *testing.T) {
+	// A validating IDNA profile, which checks label length and that the
+	// label contains only certain ASCII characters. It does not do the
+	// ValidateLabels check, because that depends on the input having
+	// certain properties.
+	validating := idna.New(
+		idna.VerifyDNSLength(true),
+		idna.StrictDomainName(true),
+	)
+	for _, domain := range []string{
+		"example.com",
+		"\u0314example.com",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",  // 63 bytes
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 64 bytes
+		"xn--57hw060o.com",
+		"a b c",
+	} {
+		prefix := domainPrefix(domain)
+		// The prefix is to be a single label, so it may not contain a dot.
+		if bytes.Contains([]byte(prefix), []byte(".")) {
+			t.Errorf("%+q → %+q contains a dot", domain, prefix)
+		}
+		if _, err := validating.ToUnicode(prefix); err != nil {
+			t.Errorf("%+q → error %v", domain, err)
+		}
+	}
+}
+
+// mustParseURL parses rawurl with url.Parse, and panics if that fails.
+func mustParseURL(rawurl string) *url.URL {
+	parsed, err := url.Parse(rawurl)
+	if err == nil {
+		return parsed
+	}
+	panic(err)
+}
+
+// TestCacheURL checks CacheURL against publisher URL, cache URL, and content
+// type combinations that must produce a specific URL, and then against
+// combinations that must produce an error.
+func TestCacheURL(t *testing.T) {
+	// Tests expecting no error.
+	for _, tc := range []struct {
+		pub         string
+		cache       string
+		contentType string
+		expected    string
+	}{
+		// With or without trailing slash on pubURL.
+		{
+			"http://example.com/",
+			"https://amp.cache/",
+			"c",
+			"https://example-com.amp.cache/c/example.com",
+		},
+		{
+			"http://example.com",
+			"https://amp.cache/",
+			"c",
+			"https://example-com.amp.cache/c/example.com",
+		},
+		// https pubURL.
+		{
+			"https://example.com/",
+			"https://amp.cache/",
+			"c",
+			"https://example-com.amp.cache/c/s/example.com",
+		},
+		// The content type should be escaped if necessary.
+		{
+			"http://example.com/",
+			"https://amp.cache/",
+			"/",
+			"https://example-com.amp.cache/%2F/example.com",
+		},
+		// Retain pubURL path, query, and fragment, including escaping.
+		{
+			"http://example.com/my%2Fpath/index.html?a=1#fragment",
+			"https://amp.cache/",
+			"c",
+			"https://example-com.amp.cache/c/example.com/my%2Fpath/index.html?a=1#fragment",
+		},
+		// Retain scheme, userinfo, port, and path of cacheURL, escaping
+		// whatever is necessary.
+		{
+			"http://example.com",
+			"http://cache%2Fuser:cache%40pass@amp.cache:123/with/../../path/..%2f../",
+			"c",
+			"http://cache%2Fuser:cache%40pass@example-com.amp.cache:123/path/..%2f../c/example.com",
+		},
+		// Port numbers in pubURL are allowed, if they're the default
+		// for scheme.
+		{
+			"http://example.com:80/",
+			"https://amp.cache/",
+			"c",
+			"https://example-com.amp.cache/c/example.com",
+		},
+		{
+			"https://example.com:443/",
+			"https://amp.cache/",
+			"c",
+			"https://example-com.amp.cache/c/s/example.com",
+		},
+		// "?" at the end of cacheURL is okay, as long as the query is
+		// empty.
+		{
+			"http://example.com/",
+			"https://amp.cache/?",
+			"c",
+			"https://example-com.amp.cache/c/example.com",
+		},
+
+		// Examples from the Google AMP Cache overview.
+		// https://developers.google.com/amp/cache/overview#example-requesting-document-using-tls
+		{
+			"https://example.com/amp_document.html",
+			"https://cdn.ampproject.org/",
+			"c",
+			"https://example-com.cdn.ampproject.org/c/s/example.com/amp_document.html",
+		},
+		// https://developers.google.com/amp/cache/overview#example-requesting-image-using-plain-http
+		{
+			"http://example.com/logo.png",
+			"https://cdn.ampproject.org/",
+			"i",
+			"https://example-com.cdn.ampproject.org/i/example.com/logo.png",
+		},
+		// https://developers.google.com/amp/cache/overview#query-parameter-example
+		{
+			"https://example.com/g?value=Hello%20World",
+			"https://cdn.ampproject.org/",
+			"c",
+			"https://example-com.cdn.ampproject.org/c/s/example.com/g?value=Hello%20World",
+		},
+	} {
+		pub := mustParseURL(tc.pub)
+		cache := mustParseURL(tc.cache)
+		got, err := CacheURL(pub, cache, tc.contentType)
+		if err != nil {
+			t.Errorf("%+q %+q %+q → error %v",
+				tc.pub, tc.cache, tc.contentType, err)
+		} else if got.String() != tc.expected {
+			t.Errorf("%+q %+q %+q → %+q, expected %+q",
+				tc.pub, tc.cache, tc.contentType, got, tc.expected)
+		}
+	}
+
+	// Tests expecting an error.
+	for _, tc := range []struct {
+		pub         string
+		cache       string
+		contentType string
+	}{
+		// Empty content type.
+		{
+			"http://example.com/",
+			"https://amp.cache/",
+			"",
+		},
+		// Empty host.
+		{
+			"http:///index.html",
+			"https://amp.cache/",
+			"c",
+		},
+		// Empty scheme.
+		{
+			"//example.com/",
+			"https://amp.cache/",
+			"c",
+		},
+		// Unrecognized scheme.
+		{
+			"ftp://example.com/",
+			"https://amp.cache/",
+			"c",
+		},
+		// Wrong port number for scheme.
+		{
+			"http://example.com:443/",
+			"https://amp.cache/",
+			"c",
+		},
+		// userinfo in pubURL.
+		{
+			"http://user@example.com/",
+			"https://amp.cache/",
+			"c",
+		},
+		{
+			"http://user:pass@example.com/",
+			"https://amp.cache/",
+			"c",
+		},
+		// cacheURL may not contain a query.
+		{
+			"http://example.com/",
+			"https://amp.cache/?a=1",
+			"c",
+		},
+		// cacheURL may not contain a fragment.
+		{
+			"http://example.com/",
+			"https://amp.cache/#fragment",
+			"c",
+		},
+	} {
+		pub := mustParseURL(tc.pub)
+		cache := mustParseURL(tc.cache)
+		got, err := CacheURL(pub, cache, tc.contentType)
+		if err == nil {
+			t.Errorf("%+q %+q %+q → %+q, expected error",
+				tc.pub, tc.cache, tc.contentType, got)
+		}
+	}
+}
diff --git a/common/amp/doc.go b/common/amp/doc.go
new file mode 100644
index 0000000..1387114
--- /dev/null
+++ b/common/amp/doc.go
@@ -0,0 +1,88 @@
+/*
+Package amp provides functions for working with the AMP (Accelerated Mobile
+Pages) subset of HTML, and conveying binary data through an AMP cache.
+
+AMP cache
+
+The CacheURL function takes a plain URL and converts it to be accessed through a
+given AMP cache.
+
+The EncodePath and DecodePath functions provide a way to encode data into the
+suffix of a URL path. AMP caches do not support HTTP POST, but encoding data
+into a URL path with GET is an alternative means of sending data to the server.
+The format of an encoded path is:
+	0<0 or more bytes, including slash>/<base64 of data>
+That is:
+* "0", a format version number, which controls the interpretation of the rest of
+the path. Only the first byte matters as a version indicator (not the whole
+first path component).
+* Any number of slash or non-slash bytes. These may be used as padding or to
+prevent cache collisions in the AMP cache.
+* A final slash.
+* base64 encoding of the data, using the URL-safe alphabet (which does not
+include slash).
+
+For example, an encoding of the string "This is path-encoded data." is the
+following. The "lgWHcwhXFjUm" following the format version number is random
+padding that will be ignored on decoding.
+	0lgWHcwhXFjUm/VGhpcyBpcyBwYXRoLWVuY29kZWQgZGF0YS4
+
+It is the caller's responsibility to add or remove any directory path prefix
+before calling EncodePath or DecodePath.
+
+AMP armor
+
+AMP armor is a data encoding scheme that satisfies the requirements of the
+AMP (Accelerated Mobile Pages) subset of HTML, and survives modification by an
+AMP cache. For the requirements of AMP HTML, see
+https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/.
+For modifications that may be made by an AMP cache, see
+https://github.com/ampproject/amphtml/blob/main/docs/spec/amp-cache-modifications.md.
+
+The encoding is based on ones created by Ivan Markin. See codec/amp/ in
+https://github.com/nogoegst/amper and discussion at
+https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985.
+
+The encoding algorithm works as follows. Base64-encode the input. Prepend the
+base64 with the byte '0'; this is a protocol version indicator that the decoder
+can use to determine how to interpret the bytes that follow. Split the result
+into fixed-size chunks separated by whitespace. Take up to 1024 chunks at a
+time, and wrap them in a pre element. Then, situate the markup so far within the
+body of the AMP HTML boilerplate. The decoding algorithm is to scan the HTML for
+pre elements, split their text contents on whitespace and concatenate, then
+base64 decode. The base64 encoding uses the standard alphabet, with normal "="
+padding (https://tools.ietf.org/html/rfc4648#section-4).
+
+The reason for splitting the base64 into chunks is that AMP caches reportedly
+truncate long strings that are not broken by whitespace:
+https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985#note_2592348.
+The characters that may separate the chunks are the ASCII whitespace characters
+(https://infra.spec.whatwg.org/#ascii-whitespace) "\x09", "\x0a", "\x0c",
+"\x0d", and "\x20". The reason for separating the chunks into pre elements is to
+limit the amount of text a decoder may have to buffer while parsing the HTML.
+Each pre element may contain at most 64 KB of text. pre elements may not be
+nested.
+
+Example
+
+The following is the result of encoding the string
+"This was encoded with AMP armor.":
+
+	<!doctype html>
+	<html amp>
+	<head>
+	<meta charset="utf-8">
+	<script async src="https://cdn.ampproject.org/v0.js"></script>
+	<link rel="canonical" href="#">
+	<meta name="viewport" content="width=device-width">
+	<style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript>
+	</head>
+	<body>
+	<pre>
+	0VGhpcyB3YXMgZW5jb2RlZCB3aXRoIEF
+	NUCBhcm1vci4=
+	</pre>
+	</body>
+	</html>
+*/
+package amp
diff --git a/common/amp/path.go b/common/amp/path.go
new file mode 100644
index 0000000..5903694
--- /dev/null
+++ b/common/amp/path.go
@@ -0,0 +1,44 @@
+package amp
+
+import (
+	"crypto/rand"
+	"encoding/base64"
+	"fmt"
+	"strings"
+)
+
+// EncodePath encodes data as the suffix of an AMP cache URL path: "0", then 12
+// random base64 characters (a cache breaker), "/", and unpadded base64url data.
+func EncodePath(data []byte) string {
+	var cacheBreaker [9]byte // 9 random bytes encode to exactly 12 characters
+	_, err := rand.Read(cacheBreaker[:])
+	if err != nil {
+		panic(err) // no fallback when the random source fails
+	}
+	b64 := base64.RawURLEncoding.EncodeToString
+	return "0" + b64(cacheBreaker[:]) + "/" + b64(data)
+}
+
+// DecodePath decodes data from a path suffix as encoded by EncodePath. Any
+// directory prefix (e.g., from an HTTP request) must already be trimmed, so
+// path starts with the "0" format indicator. It returns an error when path is
+// empty, has an unknown indicator, lacks a slash, or holds invalid base64.
+func DecodePath(path string) ([]byte, error) {
+	if len(path) < 1 {
+		return nil, fmt.Errorf("missing format indicator")
+	}
+	version := path[0] // only the first byte selects the format
+	rest := path[1:]
+	switch version {
+	case '0':
+		// Ignore everything else up to and including the final slash
+		// (there must be at least one slash).
+		i := strings.LastIndexByte(rest, '/')
+		if i == -1 {
+			return nil, fmt.Errorf("missing data")
+		}
+		return base64.RawURLEncoding.DecodeString(rest[i+1:])
+	default:
+		return nil, fmt.Errorf("unknown format indicator %q", version)
+	}
+}
diff --git a/common/amp/path_test.go b/common/amp/path_test.go
new file mode 100644
index 0000000..20e4ccf
--- /dev/null
+++ b/common/amp/path_test.go
@@ -0,0 +1,54 @@
+package amp
+
+import (
+	"testing"
+)
+
+func TestDecodePath(t *testing.T) {
+	for _, test := range []struct {
+		path           string
+		expectedData   string
+		expectedErrStr string
+	}{
+		{"", "", "missing format indicator"},            // empty path
+		{"0", "", "missing data"},                       // indicator only, no slash
+		{"0foobar", "", "missing data"},                 // padding but no slash
+		{"/0/YWJj", "", "unknown format indicator '/'"}, // untrimmed leading slash
+
+		{"0/", "", ""},               // empty data
+		{"0foobar/", "", ""},         // padding, empty data
+		{"0/YWJj", "abc", ""},        // no padding
+		{"0///YWJj", "abc", ""},      // padding made only of slashes
+		{"0foobar/YWJj", "abc", ""},  // padding without slashes
+		{"0/foobar/YWJj", "abc", ""}, // padding containing a slash
+	} {
+		data, err := DecodePath(test.path)
+		if test.expectedErrStr != "" {
+			if err == nil || err.Error() != test.expectedErrStr {
+				t.Errorf("%+q expected error %+q, got %+q",
+					test.path, test.expectedErrStr, err)
+			}
+		} else if err != nil {
+			t.Errorf("%+q expected no error, got %+q", test.path, err)
+		} else if string(data) != test.expectedData {
+			t.Errorf("%+q expected data %+q, got %+q",
+				test.path, test.expectedData, data)
+		}
+	}
+}
+
+func TestPathRoundTrip(t *testing.T) {
+	for _, data := range []string{
+		"",            // empty input
+		"\x00",        // a single NUL byte
+		"/",           // the byte that also delimits the encoded path
+		"hello world", // ordinary text
+	} {
+		decoded, err := DecodePath(EncodePath([]byte(data)))
+		if err != nil {
+			t.Errorf("%+q roundtripped with error %v", data, err)
+		} else if string(decoded) != data {
+			t.Errorf("%+q roundtripped to %+q", data, decoded)
+		}
+	}
+}





More information about the tor-commits mailing list