filter_sdh: add --sub-filter-sdh-enclosures option

This filter is a bit complicated, but one of its essential parts is
removing text enclosed by a particular set of characters (e.g. text
in between []). This was previously hardcoded to only take parentheses
and brackets into account, but people may want to filter more things, so
make this customizable. The option only takes "left hand characters", so
the right hand character of the pair is mapped internally if applicable.
If not, then we just use the same character. Fixes #8268 since the unicode
character in question can just be passed to this option.
This commit is contained in:
Dudemanguy 2023-11-05 12:51:43 -06:00
parent b7d85f0d4a
commit ce958b7742
5 changed files with 80 additions and 12 deletions

View File

@ -34,6 +34,7 @@ Interface changes
- `--screenshot-avif-pixfmt` no longer defaults to yuv420p
- `--screenshot-avif-opts` defaults to lossless screenshot
- rename key `MP_KEY_BACK` to `MP_KEY_GO_BACK`
- add `--sub-filter-sdh-enclosures` option
--- mpv 0.37.0 ---
- `--save-position-on-quit` and its associated commands now store state files
in %LOCALAPPDATA% instead of %APPDATA% directory by default on Windows.

View File

@ -2898,8 +2898,11 @@ Subtitles
This is intended for English, but may in part work for other languages too.
The intention is that it can be always enabled so may not remove
all parts added.
It removes speaker labels (like MAN:), upper case text in parentheses and
any text in brackets.
It removes speaker labels (like MAN:) and any text enclosed within symbols like
parentheses or brackets as specified by the ``--sub-filter-sdh-enclosures`` option.
Note that parentheses are a special case and only upper case text is removed. For
more filtering, you can use the ``--sub-filter-sdh-harder`` option.
Default: ``no``.
@ -2910,6 +2913,15 @@ Subtitles
Default: ``no``.
``--sub-filter-sdh-enclosures=<string>``
Specify a string of characters that ``--sub-filter-sdh`` will use to potentially
remove text. Text that is enclosed within characters specified by this string will
be removed. Note that bracket characters with known pairs (such as ``(`` or ``[``)
will be mapped internally to their matching right hand character, so you only need
to specify left hand characters.
Default: ``([``.
``--sub-filter-regex-...=...``
Set a list of regular expressions to match on text subtitles, and remove any
lines that match (default: empty). This is a string list option. See

View File

@ -262,6 +262,7 @@ const struct m_sub_options mp_sub_filter_opts = {
.opts = (const struct m_option[]){
{"sub-filter-sdh", OPT_BOOL(sub_filter_SDH)},
{"sub-filter-sdh-harder", OPT_BOOL(sub_filter_SDH_harder)},
{"sub-filter-sdh-enclosures", OPT_STRING(sub_filter_SDH_enclosures)},
{"sub-filter-regex-enable", OPT_BOOL(rf_enable)},
{"sub-filter-regex-plain", OPT_BOOL(rf_plain)},
{"sub-filter-regex", OPT_STRINGLIST(rf_items)},
@ -271,6 +272,7 @@ const struct m_sub_options mp_sub_filter_opts = {
},
.size = sizeof(OPT_BASE_STRUCT),
.defaults = &(OPT_BASE_STRUCT){
.sub_filter_SDH_enclosures = "([",
.rf_enable = true,
},
.change_flags = UPDATE_SUB_FILT,

View File

@ -123,6 +123,7 @@ struct mp_subtitle_opts {
struct mp_sub_filter_opts {
bool sub_filter_SDH;
bool sub_filter_SDH_harder;
char *sub_filter_SDH_enclosures;
bool rf_enable;
bool rf_plain;
char **rf_items;

View File

@ -33,6 +33,12 @@
// all SDH parts.
// It is for filtering ASS encoded subtitles
// Known left/right enclosure pairs. Column 0 holds the left hand string and
// column 1 the right hand string that closes it. The table is terminated by
// an entry whose left hand string is NULL.
static const char *const enclosure_pair[][2] = {
    { "(", ")" },
    { "[", "]" },
    { NULL, NULL }, // sentinel
};
struct buffer {
char *string;
int length;
@ -58,6 +64,47 @@ static inline int append(struct sd_filter *sd, struct buffer *buf, char c)
return c;
}
// Return the number of bytes occupied by the UTF-8 character starting at str.
//
// The length is derived from the lead byte (1 for ASCII, 2-4 for multi-byte
// sequences) and then clamped to the number of bytes actually present before
// the terminating NUL, so that a caller advancing its pointer by the result
// can never step past the end of a truncated string.
//
// Returns 0 if str is NULL, empty, or does not start with a valid UTF-8 lead
// byte (i.e. it starts with a continuation byte 10xxxxxx or with 0xF8..0xFF).
// Only UTF-8 is supported.
static int get_char_bytes(char *str)
{
    if (!str || !str[0])
        return 0;

    // Look at the lead byte as unsigned so that the bit tests do not depend
    // on whether plain char is signed on this platform.
    unsigned char lead = (unsigned char)str[0];
    int expected;
    if (lead < 0x80) {
        expected = 1;               // 0xxxxxxx
    } else if ((lead & 0xE0) == 0xC0) {
        expected = 2;               // 110xxxxx
    } else if ((lead & 0xF0) == 0xE0) {
        expected = 3;               // 1110xxxx
    } else if ((lead & 0xF8) == 0xF0) {
        expected = 4;               // 11110xxx
    } else {
        return 0;                   // continuation byte or invalid lead byte
    }

    // Do not report more bytes than exist before the NUL terminator.
    int available = 1;
    while (available < expected && str[available])
        available++;
    return available;
}
// Look up the right hand enclosure string that closes `left`.
// Walks enclosure_pair until its NULL terminator; on an exact match of the
// left hand string the mapped right hand string is returned. When no entry
// matches, `left` itself is returned, i.e. the same string is used on both
// sides.
static const char *get_right_enclosure(char *left)
{
    size_t idx = 0;
    while (enclosure_pair[idx][0] != NULL) {
        const char *known_left = enclosure_pair[idx][0];
        if (!strcmp(known_left, left))
            return enclosure_pair[idx][1];
        idx++;
    }
    // Unmapped: the enclosure closes with itself.
    return left;
}
// Check whether the text at str begins with one of the left hand enclosure
// characters listed in sd->opts->sub_filter_SDH_enclosures.
//
// The option string is walked one UTF-8 character at a time (a byte followed
// by any 10xxxxxx continuation bytes) and each character is compared against
// the start of str as a whole. Comparing only single bytes would let any
// character that merely shares a lead or continuation byte with a configured
// multi-byte enclosure be treated as an enclosure.
//
// Returns false if str is NULL or empty, or if the option string is NULL.
static bool valid_left_enclosure(struct sd_filter *sd, char *str)
{
    const char *enclosures = sd->opts->sub_filter_SDH_enclosures;
    if (!enclosures || !str || !str[0])
        return false;

    const unsigned char *ch = (const unsigned char *)enclosures;
    while (*ch) {
        // Length of the current character: first byte plus continuation
        // bytes. The NUL terminator is not a continuation byte, so this
        // never runs past the end of the option string.
        size_t len = 1;
        while ((ch[len] & 0xC0) == 0x80)
            len++;
        // strncmp stops at a NUL in str, so a shorter str cannot match.
        if (strncmp(str, (const char *)ch, len) == 0)
            return true;
        ch += len;
    }
    return false;
}
// copy ass override tags, if they exist at the current position,
// from source string to destination buffer stopping at first
@ -203,7 +250,8 @@ static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
char *rp = *rpp;
int old_pos = buf->pos;
rp++; // skip past the left character
// skip past the left character
rp += get_char_bytes(rp);
// skip past valid data searching for the right character
bool only_digits = strcmp(left, "(") == 0;
while (*rp && rp[0] != right[0]) {
@ -235,7 +283,8 @@ static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
buf->pos = old_pos;
return false;
}
rp++; // skip right character
// skip past the right character
rp += get_char_bytes(rp);
// skip trailing spaces
while (rp[0] == ' ') {
rp++;
@ -332,14 +381,17 @@ static char *filter_SDH(struct sd_filter *sd, char *data, int length, ptrdiff_t
// go through the rest of the line looking for SDH in () or []
while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) {
copy_ass(sd, &rp, buf);
if (rp[0] == '[') {
if (!skip_enclosed(sd, &rp, buf, "[", "]")) {
append(sd, buf, rp[0]);
rp++;
line_with_text = true;
}
} else if (rp[0] == '(') {
if (!skip_enclosed(sd, &rp, buf, "(", ")")) {
char left[5] = {0};
const char *right = NULL;
if (valid_left_enclosure(sd, rp)) {
int bytes = get_char_bytes(rp);
for (int i = 0; i < bytes; i++)
left[i] = rp[i];
left[bytes + 1] = '\0';
right = get_right_enclosure(left);
}
if (left[0] && right && right[0]) {
if (!skip_enclosed(sd, &rp, buf, left, right)) {
append(sd, buf, rp[0]);
rp++;
line_with_text = true;