Try both utf-8 and windows-1252 for decoding email

Recent submissions from Cirrus were classified as spam by the lore
analysis robot script.  This is because Cirrus used windows-1252 for
the encoding, so the message bodies failed to decode as utf-8.

Try both encodings when decoding email.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
This commit is contained in:
Mario Limonciello 2023-11-16 10:42:10 -06:00
parent 58ec43257c
commit 8228c2222f

View File

@ -34,6 +34,8 @@ content_types = {
def classify_content(content): def classify_content(content):
# load content into the email library # load content into the email library
msg = email.message_from_string(content) msg = email.message_from_string(content)
decoded = None
body = None
# check the subject # check the subject
subject = msg["Subject"] subject = msg["Subject"]
@ -42,17 +44,28 @@ def classify_content(content):
if "PATCH" in subject: if "PATCH" in subject:
return ContentType.PATCH return ContentType.PATCH
if msg.is_multipart():
for part in msg.walk(): for part in msg.walk():
if part.get_content_type() == "text/plain": if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True)
else:
body = msg.get_payload(decode=True)
if body:
for encoding in ["utf-8", "windows-1252"]:
try: try:
body = part.get_payload(decode=True).decode("utf-8") decoded = body.decode(encoding)
break
except UnicodeDecodeError:
pass
if decoded:
for key in content_types.keys(): for key in content_types.keys():
if key in body: if key in decoded:
return content_types[key] return content_types[key]
break else:
except UnicodeDecodeError as e: logging.warning("Failed to decode email: %s, treating as SPAM", body)
logging.warning("Failed to decode email: %s, treating as SPAM" % e)
break
return ContentType.SPAM return ContentType.SPAM