[tor-commits] [ooni-probe/master] Improve charset detection regexp
art at torproject.org
art at torproject.org
Mon May 30 16:28:33 UTC 2016
commit 55cd85351c02d7a7333f1c39e9993820b05b2b6d
Author: Arturo Filastò <arturo at filasto.net>
Date: Sun May 8 19:08:22 2016 +0200
Improve charset detection regexp
---
ooni/templates/httpt.py | 12 ++++++++++--
ooni/tests/test_templates.py | 3 +++
2 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/ooni/templates/httpt.py b/ooni/templates/httpt.py
index 51fba1a..6b4c4b9 100644
--- a/ooni/templates/httpt.py
+++ b/ooni/templates/httpt.py
@@ -1,4 +1,5 @@
import re
+import codecs
import random
from txtorcon.interface import StreamListenerMixin
@@ -17,7 +18,7 @@ from ooni.utils.net import StringProducer, userAgents
from ooni.utils.trueheaders import TrueHeaders
from ooni.errors import handleAllFailures
-META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]+)')
+META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>!;]+)')
class InvalidSocksProxyOption(Exception):
pass
@@ -56,7 +57,14 @@ def _representBody(body):
# try to decode using that one first
charset = META_CHARSET_REGEXP.search(body, re.IGNORECASE)
if charset:
- charsets.insert(0, charset.group(1))
+ try:
+ encoding = charset.group(1).lower()
+ codecs.lookup(encoding)
+ charsets.insert(0, encoding)
+ except (LookupError, IndexError):
+ # Skip invalid codecs and partial regexp match
+ pass
+
for encoding in charsets:
try:
body = unicode(body, encoding)
diff --git a/ooni/tests/test_templates.py b/ooni/tests/test_templates.py
index e8fe636..ebd5b2e 100644
--- a/ooni/tests/test_templates.py
+++ b/ooni/tests/test_templates.py
@@ -55,8 +55,11 @@ class TestHTTPT(unittest.TestCase):
"""
with_charset_html = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
with_empty_charset = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=">'
+ with_two_charsets = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8;charset=utf-8">'
self.assertEqual(httpt.META_CHARSET_REGEXP.search(no_charset_html), None)
self.assertEqual(httpt.META_CHARSET_REGEXP.search(with_charset_html).group(1), 'iso-8859-1')
+ self.assertEqual(httpt.META_CHARSET_REGEXP.search(
+ with_two_charsets).group(1), 'UTF-8')
self.assertEqual(httpt.META_CHARSET_REGEXP.search(with_empty_charset), None)
class TestDNST(unittest.TestCase):
More information about the tor-commits mailing list