cue: tolerate NBSP as whitespace

Apparently such .cue files exist. They fail both probing and parsing. To make it worse, the sample at hand was encoded as Latin1. One part of this is replacing bstr_lstrip() with a version that supports NBSP. One could argue that bstr_lstrip() should always do this, but I don't want to overdo it. There are many more Unicode abominations which it could be said it's supposed to handle, so it will stay ASCII instead of going down this rabbit hole. I'm just assuming this cue sheet was generated by some stupid software that inexplicably liked NBSPs (which is how we justify a one-off fix). The new lstrip_whitespace() doesn't look particularly efficient, but it doesn't have to be. The second part is dealing with the fact that the charset is not necessarily UTF-8. We don't want to do conversion before probing thinks it knows it's a cue sheet (would probably make it more fragile all around), so just make it work with Latin1 by assuming invalid code points are Latin1. This fallback is part of why lstrip_whitespace() is sort of roundabout. (You could still rewrite it as a much more efficient state machine, instead of using a slow and validating UTF-8 parser that is called per codepoint. Starting to overthink this.) Multimedia is terrible. Legacy charsets are terrible. Everything is terrible. Fixes: #7429
2025-03-22 11:18:32 +00:00 · 2020-02-03 19:13:44 +01:00 · 2020-02-03 19:13:44 +01:00 · cbee577d0a
commit cbee577d0a
parent 13624b5c7a
1 changed files with 31 additions and 5 deletions
--- a/demux/cue.c
+++ b/demux/cue.c
@ -62,20 +62,46 @@ static const struct {
    { -1 },
 };

+static const uint8_t spaces[] = {' ', '\f', '\n', '\r', '\t', '\v', 0xA0}; // Characters stripped as whitespace: ASCII whitespace plus 0xA0 (NBSP).
+
+static struct bstr lstrip_whitespace(struct bstr data) // Returns data with leading whitespace (any entry of spaces[]) removed.
+{
+    while (data.len) { // Examine one character per iteration until data is exhausted.
+        bstr rest = data;
+        int code = bstr_decode_utf8(data, &rest); // Negative means decoding failed; on success rest presumably points past the decoded code point (confirm against bstr_decode_utf8).
+        if (code < 0) {
+            // Tolerate Latin1 => probing works (which doesn't convert charsets).
+            code = data.start[0]; // Fall back to the raw first byte as the character value.
+            rest.start += 1; // Consume exactly that one byte.
+            rest.len -= 1;
+        }
+        for (size_t n = 0; n < MP_ARRAY_SIZE(spaces); n++) {
+            if (spaces[n] == code) {
+                data = rest; // Whitespace: drop it and keep scanning the remainder.
+                goto next;
+            }
+        }
+        break; // First non-whitespace character: stop stripping.
+    next: ; // Continue point for the outer loop when a whitespace character was consumed.
+    }
+    return data;
+}
+
 static enum cue_command read_cmd(struct bstr *data, struct bstr *out_params)
 {
    struct bstr line = bstr_strip_linebreaks(bstr_getline(*data, data));
-    line = bstr_lstrip(line);
+    line = lstrip_whitespace(line);
    if (line.len == 0)
        return CUE_EMPTY;
    for (int n = 0; cue_command_strings[n].command != -1; n++) {
        struct bstr name = bstr0(cue_command_strings[n].text);
        if (bstr_case_startswith(line, name)) {
            struct bstr rest = bstr_cut(line, name.len);
-            if (rest.len && !strchr(WHITESPACE, rest.start[0]))
+            struct bstr par = lstrip_whitespace(rest);
+            if (rest.len && par.len == rest.len)
                continue;
            if (out_params)
-                *out_params = bstr_lstrip(rest);
+                *out_params = par;
            return cue_command_strings[n].command;
        }
    }
@ -94,7 +120,7 @@ static bool eat_char(struct bstr *data, char ch)

 static char *read_quoted(void *talloc_ctx, struct bstr *data)
 {
-    *data = bstr_lstrip(*data);
+    *data = lstrip_whitespace(*data);
    if (!eat_char(data, '"'))
        return NULL;
    int end = bstrchr(*data, '"');
@ -118,7 +144,7 @@ static struct bstr strip_quotes(struct bstr data)
 // Return -1 on failure.
 static int read_int(struct bstr *data, bool two_digit)
 {
-    *data = bstr_lstrip(*data);
+    *data = lstrip_whitespace(*data);
    if (data->len && data->start[0] == '-')
        return -1;
    struct bstr s = *data;