mirror of https://github.com/mpv-player/mpv
filter_sdh: add --sub-filter-sdh-enclosures option
This filter is a bit complicated, but one of its essential parts is removing text enclosed by a particular set of characters (e.g. text in between []). This was previously hardcoded to only take parentheses and brackets into account, but people may want to filter more things, so make this customizable. The option only takes "left hand characters"; the right hand character of a pair is mapped internally if applicable. If not, we just use the same character. Fixes #8268 since the unicode character in question can just be passed to this option.
This commit is contained in:
parent
b7d85f0d4a
commit
ce958b7742
|
@ -34,6 +34,7 @@ Interface changes
|
|||
- `--screenshot-avif-pixfmt` no longer defaults to yuv420p
|
||||
- `--screenshot-avif-opts` defaults to lossless screenshot
|
||||
- rename key `MP_KEY_BACK` to `MP_KEY_GO_BACK`
|
||||
- add `--sub-filter-sdh-enclosures` option
|
||||
--- mpv 0.37.0 ---
|
||||
- `--save-position-on-quit` and its associated commands now store state files
|
||||
in %LOCALAPPDATA% instead of %APPDATA% directory by default on Windows.
|
||||
|
|
|
@ -2898,8 +2898,11 @@ Subtitles
|
|||
This is intended for English, but may in part work for other languages too.
|
||||
The intention is that it can be always enabled so may not remove
|
||||
all parts added.
|
||||
It removes speaker labels (like MAN:), upper case text in parentheses and
|
||||
any text in brackets.
|
||||
|
||||
It removes speaker labels (like MAN:) and any text enclosed within symbols like
|
||||
parentheses or brackets as specified by the ``--sub-filter-sdh-enclosures`` option.
|
||||
Note that parentheses are a special case: only upper case text inside them is removed. For
|
||||
more filtering, you can use the ``--sub-filter-sdh-harder`` option.
|
||||
|
||||
Default: ``no``.
|
||||
|
||||
|
@ -2910,6 +2913,15 @@ Subtitles
|
|||
|
||||
Default: ``no``.
|
||||
|
||||
``--sub-filter-sdh-enclosures=<string>``
|
||||
Specify a string of characters that ``--sub-filter-sdh`` will use to potentially
|
||||
remove text. Text that is enclosed within characters specified by this string will
|
||||
be removed. Note that bracket characters with known pairs (such as ``(`` or ``[``)
|
||||
will be mapped internally to their matching right hand character, so you only need
|
||||
to specify left hand characters.
|
||||
|
||||
Default: ``([``.
|
||||
|
||||
``--sub-filter-regex-...=...``
|
||||
Set a list of regular expressions to match on text subtitles, and remove any
|
||||
lines that match (default: empty). This is a string list option. See
|
||||
|
|
|
@ -262,6 +262,7 @@ const struct m_sub_options mp_sub_filter_opts = {
|
|||
.opts = (const struct m_option[]){
|
||||
{"sub-filter-sdh", OPT_BOOL(sub_filter_SDH)},
|
||||
{"sub-filter-sdh-harder", OPT_BOOL(sub_filter_SDH_harder)},
|
||||
{"sub-filter-sdh-enclosures", OPT_STRING(sub_filter_SDH_enclosures)},
|
||||
{"sub-filter-regex-enable", OPT_BOOL(rf_enable)},
|
||||
{"sub-filter-regex-plain", OPT_BOOL(rf_plain)},
|
||||
{"sub-filter-regex", OPT_STRINGLIST(rf_items)},
|
||||
|
@ -271,6 +272,7 @@ const struct m_sub_options mp_sub_filter_opts = {
|
|||
},
|
||||
.size = sizeof(OPT_BASE_STRUCT),
|
||||
.defaults = &(OPT_BASE_STRUCT){
|
||||
.sub_filter_SDH_enclosures = "([",
|
||||
.rf_enable = true,
|
||||
},
|
||||
.change_flags = UPDATE_SUB_FILT,
|
||||
|
|
|
@ -123,6 +123,7 @@ struct mp_subtitle_opts {
|
|||
struct mp_sub_filter_opts {
|
||||
bool sub_filter_SDH;
|
||||
bool sub_filter_SDH_harder;
|
||||
char *sub_filter_SDH_enclosures;
|
||||
bool rf_enable;
|
||||
bool rf_plain;
|
||||
char **rf_items;
|
||||
|
|
|
@ -33,6 +33,12 @@
|
|||
// all SDH parts.
|
||||
// It is for filtering ASS encoded subtitles
|
||||
|
||||
// Known left/right enclosure pairs. Column 0 holds the left hand character,
// column 1 the right hand character that closes it. The table is terminated
// by a row whose left entry is NULL.
static const char *const enclosure_pair[][2] = {
    { "(", ")" },
    { "[", "]" },
    { NULL, NULL },
};
|
||||
|
||||
struct buffer {
|
||||
char *string;
|
||||
int length;
|
||||
|
@ -58,6 +64,47 @@ static inline int append(struct sd_filter *sd, struct buffer *buf, char c)
|
|||
return c;
|
||||
}
|
||||
|
||||
// Return the number of bytes occupied by the UTF-8 character that starts at
// str, so callers can copy or skip a whole character at once.
//
// Returns 0 if str is NULL, points at the terminating NUL, or does not point
// at a valid UTF-8 lead byte (i.e. it is a continuation byte or 0xF8..0xFF).
//
// The result never extends past the end of the string: if the sequence is
// cut short by the NUL terminator or by a byte that is not a continuation
// byte, only the bytes that are actually part of it are counted. This keeps
// "ptr += get_char_bytes(ptr)" from stepping over the terminator on
// malformed input.
// Will only work with UTF-8 but you shouldn't be using anything else anyway.
static int get_char_bytes(char *str)
{
    if (!str || !str[0])
        return 0;

    // Inspect the byte as unsigned so the bit tests do not depend on whether
    // plain char is signed on this platform.
    unsigned char lead = (unsigned char)str[0];
    int expected;
    if (lead < 0x80) {
        return 1;                       // 0xxxxxxx: ASCII
    } else if ((lead & 0xE0) == 0xC0) {
        expected = 2;                   // 110xxxxx
    } else if ((lead & 0xF0) == 0xE0) {
        expected = 3;                   // 1110xxxx
    } else if ((lead & 0xF8) == 0xF0) {
        expected = 4;                   // 11110xxx
    } else {
        return 0;                       // continuation byte or invalid lead
    }

    // Count only continuation bytes (10xxxxxx) that are really present.
    // A NUL byte fails this test too, so the scan stops at the terminator.
    int bytes = 1;
    while (bytes < expected && ((unsigned char)str[bytes] & 0xC0) == 0x80)
        bytes++;
    return bytes;
}
|
||||
|
||||
static const char *get_right_enclosure(char *left)
|
||||
{
|
||||
// See if the right hand character is mapped. If not, just return the same thing.
|
||||
for (int i = 0; enclosure_pair[i][0]; i++) {
|
||||
if (strcmp(left, enclosure_pair[i][0]) == 0)
|
||||
return enclosure_pair[i][1];
|
||||
}
|
||||
return left;
|
||||
}
|
||||
|
||||
static bool valid_left_enclosure(struct sd_filter *sd, char *str)
|
||||
{
|
||||
// All characters in this string are valid left hand enclosure characters.
|
||||
char *enclosures = sd->opts->sub_filter_SDH_enclosures;
|
||||
int len = strlen(enclosures);
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (str && str[0] && str[0] == enclosures[i])
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// copy ass override tags, if they exist at current position,
|
||||
// from source string to destination buffer stopping at first
|
||||
|
@ -203,7 +250,8 @@ static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
|
|||
char *rp = *rpp;
|
||||
int old_pos = buf->pos;
|
||||
|
||||
rp++; // skip past the left character
|
||||
// skip past the left character
|
||||
rp += get_char_bytes(rp);
|
||||
// skip past valid data searching for the right character
|
||||
bool only_digits = strcmp(left, "(") == 0;
|
||||
while (*rp && rp[0] != right[0]) {
|
||||
|
@ -235,7 +283,8 @@ static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
|
|||
buf->pos = old_pos;
|
||||
return false;
|
||||
}
|
||||
rp++; // skip right character
|
||||
// skip past the right character
|
||||
rp += get_char_bytes(rp);
|
||||
// skip trailing spaces
|
||||
while (rp[0] == ' ') {
|
||||
rp++;
|
||||
|
@ -332,14 +381,17 @@ static char *filter_SDH(struct sd_filter *sd, char *data, int length, ptrdiff_t
|
|||
// go through the rest of the line looking for SDH in () or []
|
||||
while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) {
|
||||
copy_ass(sd, &rp, buf);
|
||||
if (rp[0] == '[') {
|
||||
if (!skip_enclosed(sd, &rp, buf, "[", "]")) {
|
||||
append(sd, buf, rp[0]);
|
||||
rp++;
|
||||
line_with_text = true;
|
||||
}
|
||||
} else if (rp[0] == '(') {
|
||||
if (!skip_enclosed(sd, &rp, buf, "(", ")")) {
|
||||
char left[5] = {0};
|
||||
const char *right = NULL;
|
||||
if (valid_left_enclosure(sd, rp)) {
|
||||
int bytes = get_char_bytes(rp);
|
||||
for (int i = 0; i < bytes; i++)
|
||||
left[i] = rp[i];
|
||||
left[bytes + 1] = '\0';
|
||||
right = get_right_enclosure(left);
|
||||
}
|
||||
if (left[0] && right && right[0]) {
|
||||
if (!skip_enclosed(sd, &rp, buf, left, right)) {
|
||||
append(sd, buf, rp[0]);
|
||||
rp++;
|
||||
line_with_text = true;
|
||||
|
|
Loading…
Reference in New Issue