Try both utf-8 and windows-1252 for decoding email

Recent submissions from Cirrus were classified as spam by the lore analysis robot script. This is because Cirrus used windows-1252 for the encoding, which failed to decode as utf-8. Try both encodings when decoding email.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
2023-11-16 10:42:10 -06:00 · 2023-11-16 10:42:10 -06:00 · 8228c2222f
parent 58ec43257c
commit 8228c2222f
1 changed file with 22 additions and 9 deletions
--- a/contrib/process_linux_firmware.py
+++ b/contrib/process_linux_firmware.py
@@ -34,6 +34,8 @@ content_types = {
 def classify_content(content):
    # load content into the email library
    msg = email.message_from_string(content)
+    decoded = None
+    body = None

    # check the subject
    subject = msg["Subject"]
@@ -42,17 +44,28 @@ def classify_content(content):
    if "PATCH" in subject:
        return ContentType.PATCH

+    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
+                body = part.get_payload(decode=True)
+    else:
+        body = msg.get_payload(decode=True)
+
+    if body:
+        for encoding in ["utf-8", "windows-1252"]:
            try:
-                body = part.get_payload(decode=True).decode("utf-8")
+                decoded = body.decode(encoding)
+                break
+            except UnicodeDecodeError:
+                pass
+
+    if decoded:
        for key in content_types.keys():
-                    if key in body:
+            if key in decoded:
                return content_types[key]
-                break
-            except UnicodeDecodeError as e:
-                logging.warning("Failed to decode email: %s, treating as SPAM" % e)
-                break
+    else:
+        logging.warning("Failed to decode email: %s, treating as SPAM", body)
+
    return ContentType.SPAM