Try both utf-8 and windows-1252 for decoding email
Recent submissions from Cirrus were classified as spam by the lore analysis robot script. This is because Cirrus used windows-1252 for the encoding, which failed to decode as utf-8. Try both encodings when decoding email. Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
This commit is contained in:
parent
58ec43257c
commit
8228c2222f
|
@ -34,6 +34,8 @@ content_types = {
|
|||
def classify_content(content):
|
||||
# load content into the email library
|
||||
msg = email.message_from_string(content)
|
||||
decoded = None
|
||||
body = None
|
||||
|
||||
# check the subject
|
||||
subject = msg["Subject"]
|
||||
|
@ -42,17 +44,28 @@ def classify_content(content):
|
|||
if "PATCH" in subject:
|
||||
return ContentType.PATCH
|
||||
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if part.get_content_type() == "text/plain":
|
||||
body = part.get_payload(decode=True)
|
||||
else:
|
||||
body = msg.get_payload(decode=True)
|
||||
|
||||
if body:
|
||||
for encoding in ["utf-8", "windows-1252"]:
|
||||
try:
|
||||
body = part.get_payload(decode=True).decode("utf-8")
|
||||
decoded = body.decode(encoding)
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
|
||||
if decoded:
|
||||
for key in content_types.keys():
|
||||
if key in body:
|
||||
if key in decoded:
|
||||
return content_types[key]
|
||||
break
|
||||
except UnicodeDecodeError as e:
|
||||
logging.warning("Failed to decode email: %s, treating as SPAM" % e)
|
||||
break
|
||||
else:
|
||||
logging.warning("Failed to decode email: %s, treating as SPAM", body)
|
||||
|
||||
return ContentType.SPAM
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue