[tor-commits] [gettor/master] Make locale parser more robust
cohosh at torproject.org
cohosh at torproject.org
Fri Jan 31 14:27:36 UTC 2020
commit 277ba71b466332b4b8bd4752a2c46c5bcb9cf71c
Author: Cecylia Bocovich <cohosh at torproject.org>
Date: Mon Jan 27 10:17:29 2020 -0500
Make locale parser more robust
This change expands the locale parser to have the following properties:
- if only the language code is given, chooses the regionalization that
occurs first in the locale list (e.g., "en" --> "en-US")
- if the regionalization for the language is *not* present, chooses the
generalized language or a different regionalization (e.g., "pt-PT" -->
"pt-BR")
- parses both the subject and body looking for the most specific
regionalization
- defaults to en-US if no available language is found
---
gettor/parse/email.py | 15 ++++++++---
tests/test_email_service.py | 66 +++++++++++++++++++++++++++++++++++++--------
2 files changed, 66 insertions(+), 15 deletions(-)
diff --git a/gettor/parse/email.py b/gettor/parse/email.py
index d487684..874b1cd 100644
--- a/gettor/parse/email.py
+++ b/gettor/parse/email.py
@@ -116,8 +116,12 @@ class EmailParser(object):
def parse_keywords(self, text, request):
for word in re.split(r"\s+", text.strip()):
- if word.lower() in self.locales:
- request["language"] = word.lower()
+ for locale in self.locales:
+ if word.lower() == locale.lower():
+ request["language"] = locale
+ elif (not request["language"]) and (word.lower()[:2] ==
+ locale.lower()[:2]):
+ request["language"] = locale
if word.lower() in self.platforms:
request["command"] = "links"
request["platform"] = word.lower()
@@ -143,8 +147,11 @@ class EmailParser(object):
subject = subject.group(1)
request = self.parse_keywords(subject, request)
- if not request["command"] or not request["language"]:
- request = self.parse_keywords(msg_str, request)
+ # Always parse the body too, to see if there's more specific information
+ request = self.parse_keywords(msg_str, request)
+
+ if not request["language"]:
+ request["language"] = "en-US"
return request
diff --git a/tests/test_email_service.py b/tests/test_email_service.py
index 407937c..00795c1 100644
--- a/tests/test_email_service.py
+++ b/tests/test_email_service.py
@@ -82,17 +82,61 @@ class EmailServiceTests(unittest.TestCase):
def test_language_email_parser(self):
ep = conftests.EmailParser(self.settings, "gettor at torproject.org")
- ep.locales = ["en", "ru"]
- request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n Subject: \r\n Reply-To: hiro at torproject.org \nTo: gettor at torproject.org\n osx en")
- self.assertEqual(request["command"], "links")
- self.assertEqual(request["platform"], "osx")
- self.assertEqual(request["language"], "en")
-
- request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n Subject: \r\n Reply-To: hiro at torproject.org \nTo: gettor at torproject.org\n linux ru")
- self.assertEqual(request["command"], "links")
- self.assertEqual(request["platform"], "linux")
- self.assertEqual(request["language"], "ru")
-
+ ep.locales = ["en-US", "es-ES", "es-AR", "pt-BR", "fa"]
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n osx en")
+ self.assertEqual(request["language"], "en-US")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n osx ES")
+ self.assertEqual(request["language"], "es-ES")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n osx en-US")
+ self.assertEqual(request["language"], "en-US")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n linux fa")
+ self.assertEqual(request["language"], "fa")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n osx es")
+ self.assertEqual(request["language"], "es-ES")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n linux zz")
+ self.assertEqual(request["language"], "en-US")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n linux pt-PT")
+ self.assertEqual(request["language"], "pt-BR")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n linux es-AR")
+ self.assertEqual(request["language"], "es-AR")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: linux es\r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n linux es-AR")
+ self.assertEqual(request["language"], "es-AR")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: linux es\r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n linux")
+ self.assertEqual(request["language"], "es-ES")
+
+ request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+ "Subject: linux es-AR\r\n Reply-To: hiro at torproject.org \nTo:"
+ "gettor at torproject.org\n linux es")
+ self.assertEqual(request["language"], "es-AR")
def test_sent_links_message(self):
ep = self.sm_client
More information about the tor-commits
mailing list