Try both utf-8 and windows-1252 for decoding email

Recent submissions from Cirrus were classified as spam by the lore
analysis robot script.  This is because Cirrus used windows-1252 for
the encoding, which failed to decode as utf-8.

Try both encodings when decoding email.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
This commit is contained in:
Mario Limonciello 2023-11-16 10:42:10 -06:00
parent 58ec43257c
commit 8228c2222f
1 changed file with 22 additions and 9 deletions

View File

@@ -34,6 +34,8 @@ content_types = {
def classify_content(content):
# load content into the email library
msg = email.message_from_string(content)
decoded = None
body = None
# check the subject
subject = msg["Subject"]
@@ -42,17 +44,28 @@ def classify_content(content):
if "PATCH" in subject:
return ContentType.PATCH
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True)
else:
body = msg.get_payload(decode=True)
if body:
for encoding in ["utf-8", "windows-1252"]:
try:
body = part.get_payload(decode=True).decode("utf-8")
decoded = body.decode(encoding)
break
except UnicodeDecodeError:
pass
if decoded:
for key in content_types.keys():
if key in body:
if key in decoded:
return content_types[key]
break
except UnicodeDecodeError as e:
logging.warning("Failed to decode email: %s, treating as SPAM" % e)
break
else:
logging.warning("Failed to decode email: %s, treating as SPAM", body)
return ContentType.SPAM