mpv/sub/filter_sdh.c

/*
 * This file is part of mpv.
 *
 * mpv is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * mpv is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <stddef.h>

#include "misc/ctype.h"
#include "common/common.h"
#include "common/msg.h"
#include "options/options.h"
#include "sd.h"

// Filter for removing subtitle additions for deaf or hard-of-hearing (SDH)
// This is for English, but may in part work for others too.
// The intention is that it can always be active so may not remove
// all SDH parts.
// It is for filtering ASS encoded subtitles

// Enclosure characters whose closing counterpart differs from the opening
// one. Each row is {left, right}; the table ends with an all-NULL row.
static const char *const enclosure_pair[][2] = {
    { "(",      ")"      },
    { "[",      "]"      },
    { "\uFF08", "\uFF09" },  // fullwidth parentheses
    { NULL,     NULL     },  // terminator
};

// Bounded output buffer written through append().
struct buffer {
    char *string;   // storage, allocated by init_buf() with talloc_size()
    int length;     // capacity of string in bytes
    int pos;        // index at which the next byte will be written
};

static void init_buf(struct buffer *buf, int length)
{
    buf->string = talloc_size(NULL, length);
    buf->pos = 0;
    buf->length = length;
}

// Write one byte at the current write position of buf and advance it.
//
// When the write position is outside the buffer the byte is dropped, with
// one exception: a '\0' is stored in the last byte of the buffer so that
// the content can always be terminated.
//
// sd is not used. Returns c.
static inline int append(struct sd_filter *sd, struct buffer *buf, char c)
{
    if (buf->pos < 0 || buf->pos >= buf->length) {
        // no room left - only the terminator is still honoured
        if (c == '\0')
            buf->string[buf->length - 1] = '\0';
        return c;
    }

    buf->string[buf->pos] = c;
    buf->pos++;
    return c;
}

// Return the number of bytes occupied by the UTF-8 encoded character at the
// start of str.
//
// The length is taken from the lead byte (1 to 4 bytes) and then limited to
// the continuation bytes (10xxxxxx) that actually follow, so a sequence cut
// short by the end of the string or by an unrelated byte never extends past
// it.
//
// Returns 0 if str is NULL or empty, or if the first byte cannot start a
// character (a stray continuation byte or an invalid lead byte).
static int get_char_bytes(char *str)
{
    if (!str || !str[0])
        return 0;

    unsigned char lead = (unsigned char)str[0];
    int expected;
    if (lead < 0x80) {
        expected = 1;   // 0xxxxxxx - ASCII
    } else if ((lead & 0xE0) == 0xC0) {
        expected = 2;   // 110xxxxx
    } else if ((lead & 0xF0) == 0xE0) {
        expected = 3;   // 1110xxxx
    } else if ((lead & 0xF8) == 0xF0) {
        expected = 4;   // 11110xxx
    } else {
        return 0;       // 10xxxxxx or 11111xxx - not a lead byte
    }

    // '\0' is not a continuation byte, so this also stops at end of string
    int bytes = 1;
    while (bytes < expected && ((unsigned char)str[bytes] & 0xC0) == 0x80)
        bytes++;
    return bytes;
}

// Look up the closing character that belongs to the opening character
// `left` in enclosure_pair. Characters without an entry close with
// themselves, so `left` is returned in that case.
static const char *get_right_enclosure(char *left)
{
    for (const char *const (*row)[2] = enclosure_pair; (*row)[0]; row++) {
        const char *candidate = (*row)[0];
        if (strcmp(left, candidate) == 0)
            return (*row)[1];
    }
    return left;
}

// Check whether str starts with one of the left hand enclosure characters
// listed in the sub_filter_SDH_enclosures option.
//
// The option is a string in which every (UTF-8) character is a valid left
// enclosure. Whole characters are compared: matching on single bytes would
// make any text that merely shares a lead or continuation byte with a
// multi-byte enclosure look like an enclosure.
//
// Returns false if str is NULL or empty.
static bool valid_left_enclosure(struct sd_filter *sd, char *str)
{
    if (!str || !str[0])
        return false;

    const char *enclosures = sd->opts->sub_filter_SDH_enclosures;
    if (!enclosures)
        return false;

    for (const char *e = enclosures; *e; ) {
        // one enclosure character: first byte plus its continuation bytes
        size_t n = 1;
        while (((unsigned char)e[n] & 0xC0) == 0x80)
            n++;
        // strncmp stops at '\0', so a shorter str can never match
        if (strncmp(str, e, n) == 0)
            return true;
        e += n;
    }
    return false;
}


// Copy ASS override tags, if there are any at the current position, from
// the source string to the destination buffer. Copying stops at the first
// character following the last consecutive '{text}' sequence.
// An unterminated '{' is copied up to the end of the string.
//
// Parameters:
//     rpp       read pointer pointer to source string, updated on return
//     buf       write buffer
//
// On return the read pointer is positioned after the tags.
static void copy_ass(struct sd_filter *sd, char **rpp, struct buffer *buf)
{
    char *cur = *rpp;

    while (cur[0] == '{') {
        // cur[0] is known to be non-zero here, so at least one byte is copied
        do {
            char ch = *cur++;
            append(sd, buf, ch);
            if (ch == '}')
                break;
        } while (*cur);
    }

    *rpp = cur;
}

static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
                          const char *left, const char *right);

// Tell whether c is acceptable inside a speaker label.
// Digits, space and a few punctuation characters always are. Letters must
// be upper case (or lower case l, which for some looks like upper case I)
// unless harder is set, in which case any letter and parentheses are
// accepted too.
static bool speaker_label_char(char c, bool harder)
{
    switch (c) {
    case ' ':
    case '\'':
    case '#':
    case '.':
    case ',':
        return true;
    case '(':
    case ')':
        return harder;
    }
    if (mp_isdigit(c))
        return true;
    return mp_isalpha(c) && (harder || mp_isupper(c) || c == 'l');
}

// Check for a speaker label, like MAN:
// Normal subtitles may include mixed case text followed by ':' so only
// upper case (plus lower case l) is accepted, unless filter_harder is set -
// then lower case is acceptable as well.
//
// Parameters:
//     rpp       read pointer pointer to source string, updated on return
//     buf       write buffer
//
// Scans the source string, copying ass tags (and a leading "- ") to the
// destination buffer while skipping the speaker label if there is one.
//
// If no label was found the read pointer and the write position in the
// buffer are unchanged, otherwise they point to the next position after the
// label and the next write position.
static void skip_speaker_label(struct sd_filter *sd, char **rpp, struct buffer *buf)
{
    const bool harder = sd->opts->sub_filter_SDH_harder;
    const int saved_pos = buf->pos;
    char *cur = *rpp;

    // keep leading tags, an optional '-' and any spaces/tags following it
    copy_ass(sd, &cur, buf);
    if (*cur == '-')
        append(sd, buf, *cur++);
    copy_ass(sd, &cur, buf);
    while (*cur == ' ') {
        append(sd, buf, *cur++);
        copy_ass(sd, &cur, buf);
    }

    // walk over the label candidate until ':' is reached
    for (;;) {
        char c = *cur;
        if (c == ':')
            break;
        if (c == '\0')
            goto no_label;  // : was not found

        if (c == '{') {
            copy_ass(sd, &cur, buf);
        } else if (c == '[') {
            // not uncommon with [xxxx]: which should also be skipped
            if (!skip_enclosed(sd, &cur, buf, "[", "]"))
                goto no_label;
        } else if (speaker_label_char(c, harder)) {
            cur++;
        } else {
            goto no_label;
        }
    }

    cur++; // skip :
    copy_ass(sd, &cur, buf);

    // only end of data, spaces or a line end may follow the ':'
    if (*cur) {
        bool at_line_end = cur[0] == '\\' && cur[1] == 'N';
        if (!at_line_end && cur[0] != ' ')
            goto no_label;  // non space follows - no speaker label
        while (cur[0] == ' ')
            cur++;
        // line end follows - skip it as line is empty
        if (cur[0] == '\\' && cur[1] == 'N')
            cur += 2;
    }

    *rpp = cur;
    return;

no_label:
    // undo everything written so far; *rpp was never touched
    buf->pos = saved_pos;
}

// Check for text enclosed in symbols, like (SOUND)
// and skip it while preserving ass tags.
// Parentheses are a special case since normal subtitles may have
// them so only upper case is accepted and lower case l which for
// some looks like upper case I. If sub_filter_SDH_harder is used,
// both upper and lower case is accepted. Text consisting of digits
// only (or nothing at all) within parentheses is not treated as SDH.
//
// For other symbols, all text in between is removed.
//
// Parameters:
//     rpp       read pointer pointer to source string, updated on return
//     buf       write buffer
//     left      the opening symbol (one UTF-8 character)
//     right     the closing symbol (one UTF-8 character)
//
// The first character in the source string must be the left symbol.
// The source is scanned for the right symbol; ass tags found on the way
// are copied to the destination buffer and the enclosed text is skipped
// if it looks like SDH. Spaces directly after the right symbol are
// skipped too.
//
// The right symbol is matched as a whole byte sequence, not by its first
// byte, so that other multi-byte characters sharing the same lead byte
// are not mistaken for it.
//
// Returns true if enclosed text was removed.
// If not valid SDH the read pointer and write buffer position will be
// unchanged, otherwise they point to the next position after the text and
// the next write position.
static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
                          const char *left, const char *right)
{
    bool filter_harder = sd->opts->sub_filter_SDH_harder;
    char *rp = *rpp;
    int old_pos = buf->pos;
    bool parenthesis = strcmp(left, "(") == 0 || strcmp(left, "\uFF08") == 0;
    size_t right_len = strlen(right);

    // without a closing symbol there is nothing that could be found
    if (right_len == 0)
        return false;

    // skip past the left character
    rp += get_char_bytes(rp);
    // skip past valid data searching for the right character
    bool only_digits = parenthesis;
    while (*rp && strncmp(rp, right, right_len) != 0) {
        if (rp[0] == '{') {
            copy_ass(sd, &rp, buf);
        } else if (parenthesis) {
            bool allowed =
                (mp_isalpha(rp[0]) &&
                 (filter_harder || mp_isupper(rp[0]) || rp[0] == 'l')) ||
                mp_isdigit(rp[0]) ||
                rp[0] == ' ' || rp[0] == '\'' || rp[0] == '#' ||
                rp[0] == '.' || rp[0] == ',' ||
                rp[0] == '-' || rp[0] == '"' || rp[0] == '\\';
            if (!allowed) {
                // looks like normal text in parentheses - keep it
                buf->pos = old_pos;
                return false;
            }
            if (!mp_isdigit(rp[0]))
                only_digits = false;
            rp++;
        } else {
            rp++;
        }
    }
    if (!*rp) {
        // right character was not found
        buf->pos = old_pos;
        return false;
    }
    if (only_digits) {
        // number within parentheses is probably not SDH
        buf->pos = old_pos;
        return false;
    }
    // skip past the right character
    rp += right_len;
    // skip trailing spaces
    while (rp[0] == ' ') {
        rp++;
    }
    *rpp = rp;

    return true;
}

// Remove a leading hyphen and the spaces following it from the write buffer.
//
// Parameters:
//     start_pos start position in buffer
//     buf       buffer to remove in
//
// Ass tags in front of, and in between, the removed characters are kept.
// Everything after the removed characters is moved back to close the gap
// and the write position ends up after the moved data.
// If the text at start_pos (after any tags) does not begin with '-', the
// write position is left unchanged.
static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos,
                                        struct buffer *buf)
{
    const int saved_pos = buf->pos;
    if (start_pos < 0 || start_pos >= saved_pos)
        return;
    append(sd, buf, '\0');  // \0 terminate for reading

    char *text = buf->string;
    int idx = start_pos;

    // move past leading ass tags
    while (text[idx] == '{') {
        while (text[idx] && text[idx] != '}')
            idx++;
        if (text[idx])
            idx++; // skip past '}'
    }

    // if there is not a leading '-' no removing will be done
    if (text[idx] != '-') {
        buf->pos = saved_pos;
        return;
    }

    // Compact in place: the write position starts at the '-' and therefore
    // always stays behind the read pointer.
    buf->pos = idx;
    char *src = text + idx + 1;  // read from the character after '-'
    copy_ass(sd, &src, buf);
    while (*src == ' ') {
        src++; // skip ' '
        copy_ass(sd, &src, buf);
    }
    // copy the rest
    for (; *src; src++)
        append(sd, buf, *src);
}

// Filter ASS formatted string for SDH
//
// Parameters:
//     data         ASS line
//     length       length of ASS line
//     toff         Text offset from data. required: 0 <= toff <= length
//
// The first toff bytes (everything before the text) are copied unchanged.
// The text is then processed line by line (lines are separated by \N):
// speaker labels and enclosed SDH parts are dropped, and lines that are
// left without normal text are removed altogether.
//
// Returns  a talloc allocated string with filtered ASS data (may be the same
// content as original if no SDH was found) which must be released
// by caller using talloc_free.
//
// Returns NULL if filtering resulted in all of ASS data being removed so no
// subtitle should be output
static char *filter_SDH(struct sd_filter *sd, char *data, int length, ptrdiff_t toff)
{
    struct buffer writebuf;
    struct buffer *buf = &writebuf;
    init_buf(buf, length + 1); // with room for terminating '\0'

    // pre-text headers into buf, rp is the (null-terminated) remaining text
    // The copy ends at the first '\0' in data, which may come before toff,
    // so the terminator has to be checked as well to stay inside the copy.
    char *ass = talloc_strndup(NULL, data, length), *rp = ass;
    while (*rp && rp - ass < toff)
        append(sd, buf, *rp++);

    bool contains_text = false;  // true if non SDH text was found
    bool line_with_text = false; // if last line contained text
    int wp_line_start = buf->pos; // write pos to start of last line
    int wp_line_end   = buf->pos; // write pos to end of previous line with text (\N)

    // go through the lines in the text
    // they are separated by \N
    while (*rp) {
        line_with_text = false;
        wp_line_start = buf->pos;

        // skip any speaker label
        skip_speaker_label(sd, &rp, buf);

        // go through the rest of the line looking for SDH in () or []
        while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) {
            copy_ass(sd, &rp, buf);
            // left holds the (up to 4 byte) enclosure character found at rp
            char left[5] = {0};
            const char *right = NULL;
            if (valid_left_enclosure(sd, rp)) {
                int bytes = get_char_bytes(rp);
                for (int i = 0; i < bytes; i++)
                    left[i] = rp[i];
                left[bytes] = '\0';
                right = get_right_enclosure(left);
            }
            if (left[0] && right && right[0]) {
                if (!skip_enclosed(sd, &rp, buf, left, right)) {
                    // not SDH - the enclosure character is normal text
                    append(sd, buf, rp[0]);
                    rp++;
                    line_with_text = true;
                }
            } else if (*rp && rp[0] != '\\') {
                // printable ASCII other than '-' and UTF-8 lead bytes
                // count as text; spaces and hyphens alone do not
                if ((rp[0] > 32 && rp[0] < 127 && rp[0] != '-') ||
                    (unsigned char)rp[0] >= 0xC0)
                {
                    line_with_text = true;
                }
                append(sd, buf, rp[0]);
                rp++;
            } else if (rp[0] == '\\' && rp[1] != 'N') {
                // backslash that does not start a line end
                append(sd, buf, rp[0]);
                rp++;
            }
        }
        // either end of data or ASS line end defined by separating \N
        if (*rp) {
            // ASS line end
            if (line_with_text) {
                contains_text = true;
                wp_line_end = buf->pos;
                append(sd, buf, rp[0]); // copy backslash
                append(sd, buf, rp[1]); // copy N
                rp += 2; // move read pointer past \N
            } else {
                // no text in line, remove leading hyphen and spaces
                remove_leading_hyphen_space(sd, wp_line_start, buf);
                // and join with next line
                rp += 2; // move read pointer past \N
            }
        }
    }
    // if no normal text in last line - remove last line
    // by moving write pointer to the end of the previous line with text
    if (!line_with_text) {
        buf->pos = wp_line_end;
    } else {
        contains_text = true;
    }
    talloc_free(ass);

    if (contains_text) {
        // the ASS data contained normal text after filtering
        append(sd, buf, '\0'); // '\0' terminate
        return buf->string;
    } else {
        // all data removed by filtering
        talloc_free(buf->string);
        return NULL;
    }
}

// Decide whether the SDH filter should be used for this subtitle stream.
// Returns true only for the "ass" codec, with the sub_filter_SDH option
// enabled and an event format available.
static bool sdh_init(struct sd_filter *ft)
{
    bool is_ass = strcmp(ft->codec, "ass") == 0;
    if (!is_ass || !ft->opts->sub_filter_SDH)
        return false;

    if (ft->event_format)
        return true;

    // the event format is needed to locate the text within a packet
    MP_VERBOSE(ft, "SDH filtering not possible - format missing\n");
    return false;
}

// Run the SDH filter on one packet.
//
// Returns pkt itself if it has no text, is too large, or is left unchanged
// by the filter. Returns NULL if filtering removed all of the text.
// Otherwise returns a new packet holding the filtered data with the
// attributes of pkt copied over (NULL if that packet could not be created).
static struct demux_packet *sdh_filter(struct sd_filter *ft,
                                       struct demux_packet *pkt)
{
    bstr text = sd_ass_pkt_text(ft, pkt, sd_ass_fmt_offset(ft->event_format));
    bool has_text = text.start && text.len;
    // the length is passed on as int, so it has to fit
    if (!has_text || pkt->len >= INT_MAX)
        return pkt;  // we don't touch it

    ptrdiff_t toff = text.start - pkt->buffer;
    char *line = filter_SDH(ft, (char *)pkt->buffer, (int)pkt->len, toff);
    if (!line)
        return NULL;

    // unmodified data needs no new packet
    struct demux_packet *result = pkt;
    if (bstrcmp0((bstr){(char *)pkt->buffer, pkt->len}, line) != 0) {
        // Stupidly, this copies it again. One could possibly allocate the
        // packet for writing in the first place (new_demux_packet()) and use
        // demux_packet_shorten().
        result = new_demux_packet_from(line, strlen(line));
        if (result)
            demux_packet_copy_attribs(result, pkt);
    }

    talloc_free(line);
    return result;
}

// Function table of the SDH filter: sdh_init() decides whether the filter
// applies, sdh_filter() processes a single packet.
const struct sd_filter_functions sd_filter_sdh = {
    .init   = sdh_init,
    .filter = sdh_filter,
};