Try both utf-8 and windows-1252 for decoding email

Recent submissions from Cirrus were classified as spam by the lore analysis robot script. This happened because Cirrus sent the email in the Windows-1252 encoding, which failed to decode as UTF-8. Try both encodings when decoding email.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
2025-02-17 11:57:00 +00:00 · 2023-11-16 10:42:10 -06:00 · 2023-11-16 10:42:10 -06:00 · 8228c2222f
commit 8228c2222f
parent 58ec43257c
1 changed files with 22 additions and 9 deletions
--- a/contrib/process_linux_firmware.py
+++ b/contrib/process_linux_firmware.py
@@ -34,6 +34,8 @@ content_types = {
 def classify_content(content):
    # load content into the email library
    msg = email.message_from_string(content)
    decoded = None
    body = None
    # check the subject
    subject = msg["Subject"]
@@ -42,17 +44,28 @@ def classify_content(content):
    if "PATCH" in subject:
        return ContentType.PATCH
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                body = part.get_payload(decode=True)
    else:
        body = msg.get_payload(decode=True)
    if body:
        for encoding in ["utf-8", "windows-1252"]:
            try:
-                body = part.get_payload(decode=True).decode("utf-8")
+                decoded = body.decode(encoding)
                break
            except UnicodeDecodeError:
                pass
    if decoded:
        for key in content_types.keys():
-                    if key in body:
+            if key in decoded:
                return content_types[key]
-                break
+    else:
-            except UnicodeDecodeError as e:
+        logging.warning("Failed to decode email: %s, treating as SPAM", body)
-                logging.warning("Failed to decode email: %s, treating as SPAM" % e)
+
                break
    return ContentType.SPAM