mirror of
https://github.com/mpv-player/mpv
synced 2025-01-08 16:10:29 +00:00
misc: add language-matching utilities
This commit is contained in:
parent
1669c4698d
commit
8c8d97c26c
@ -131,6 +131,7 @@ sources = files(
|
||||
'misc/charset_conv.c',
|
||||
'misc/dispatch.c',
|
||||
'misc/json.c',
|
||||
'misc/language.c',
|
||||
'misc/natural_sort.c',
|
||||
'misc/node.c',
|
||||
'misc/random.c',
|
||||
|
362
misc/language.c
Normal file
362
misc/language.c
Normal file
@ -0,0 +1,362 @@
|
||||
/*
|
||||
* Language code utility functions
|
||||
*
|
||||
* This file is part of mpv.
|
||||
*
|
||||
* mpv is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* mpv is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "language.h"
|
||||
|
||||
#include "common/common.h"
|
||||
#include "osdep/strnlen.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
|
||||
/*
 * Lookup table mapping alternative language codes to one canonical code.
 *
 * `match` holds a 2- or 3-letter code as it may appear in a tag; `canonical`
 * holds the 3-letter code it is rewritten to (the canonicalization comment in
 * mp_match_lang_single() names this ISO 639-2B). Codes that are already
 * canonical have no entry of their own.
 *
 * The table is searched with bsearch(), so entries MUST stay sorted by `match`
 * in ascending (case-insensitive) order, with a shorter code sorting before
 * any longer code it is a prefix of (e.g. "de" before "deu").
 */
static const struct lang {
    char match[4];      // code to look up, NUL-terminated
    char canonical[4];  // replacement code, NUL-terminated
} langmap[] = {
    // a
    {"aa", "aar"}, {"ab", "abk"}, {"ae", "ave"}, {"af", "afr"},
    {"ak", "aka"}, {"am", "amh"}, {"an", "arg"}, {"ar", "ara"},
    {"as", "asm"}, {"av", "ava"}, {"ay", "aym"}, {"az", "aze"},
    // b
    {"ba", "bak"}, {"be", "bel"}, {"bg", "bul"}, {"bh", "bih"},
    {"bi", "bis"}, {"bm", "bam"}, {"bn", "ben"}, {"bo", "tib"},
    {"bod", "tib"}, {"br", "bre"}, {"bs", "bos"},
    // c
    {"ca", "cat"}, {"ce", "che"}, {"ces", "cze"}, {"ch", "cha"},
    {"co", "cos"}, {"cr", "cre"}, {"cs", "cze"}, {"cu", "chu"},
    {"cv", "chv"}, {"cy", "wel"}, {"cym", "wel"},
    // d
    {"da", "dan"}, {"de", "ger"}, {"deu", "ger"}, {"dv", "div"},
    {"dz", "dzo"},
    // e
    {"ee", "ewe"}, {"el", "gre"}, {"ell", "gre"}, {"en", "eng"},
    {"eo", "epo"}, {"es", "spa"}, {"et", "est"}, {"eu", "baq"},
    {"eus", "baq"},
    // f
    {"fa", "per"}, {"fas", "per"}, {"ff", "ful"}, {"fi", "fin"},
    {"fj", "fij"}, {"fo", "fao"}, {"fr", "fre"}, {"fra", "fre"},
    {"fy", "fry"},
    // g
    {"ga", "gle"}, {"gd", "gla"}, {"gl", "glg"}, {"gn", "grn"},
    {"gu", "guj"}, {"gv", "glv"},
    // h
    {"ha", "hau"}, {"he", "heb"}, {"hi", "hin"}, {"ho", "hmo"},
    {"hr", "hrv"}, {"ht", "hat"}, {"hu", "hun"}, {"hy", "arm"},
    {"hye", "arm"}, {"hz", "her"},
    // i
    {"ia", "ina"}, {"id", "ind"}, {"ie", "ile"}, {"ig", "ibo"},
    {"ii", "iii"}, {"ik", "ipk"}, {"io", "ido"}, {"is", "ice"},
    {"isl", "ice"}, {"it", "ita"}, {"iu", "iku"},
    // j
    {"ja", "jpn"}, {"jv", "jav"},
    // k
    {"ka", "geo"}, {"kat", "geo"}, {"kg", "kon"}, {"ki", "kik"},
    {"kj", "kua"}, {"kk", "kaz"}, {"kl", "kal"}, {"km", "khm"},
    {"kn", "kan"}, {"ko", "kor"}, {"kr", "kau"}, {"ks", "kas"},
    {"ku", "kur"}, {"kv", "kom"}, {"kw", "cor"}, {"ky", "kir"},
    // l
    {"la", "lat"}, {"lb", "ltz"}, {"lg", "lug"}, {"li", "lim"},
    {"ln", "lin"}, {"lo", "lao"}, {"lt", "lit"}, {"lu", "lub"},
    {"lv", "lav"},
    // m
    {"mg", "mlg"}, {"mh", "mah"}, {"mi", "mao"}, {"mk", "mac"},
    {"mkd", "mac"}, {"ml", "mal"}, {"mn", "mon"}, {"mr", "mar"},
    {"mri", "mao"}, {"ms", "may"}, {"msa", "may"}, {"mt", "mlt"},
    {"my", "bur"}, {"mya", "bur"},
    // n
    {"na", "nau"}, {"nb", "nob"}, {"nd", "nde"}, {"ne", "nep"},
    {"ng", "ndo"}, {"nl", "dut"}, {"nld", "dut"}, {"nn", "nno"},
    {"no", "nor"}, {"nr", "nbl"}, {"nv", "nav"}, {"ny", "nya"},
    // o
    {"oc", "oci"}, {"oj", "oji"}, {"om", "orm"}, {"or", "ori"},
    {"os", "oss"},
    // p
    {"pa", "pan"}, {"pi", "pli"}, {"pl", "pol"}, {"ps", "pus"},
    {"pt", "por"},
    // q
    {"qu", "que"},
    // r
    {"rm", "roh"}, {"rn", "run"}, {"ro", "rum"}, {"ron", "rum"},
    {"ru", "rus"}, {"rw", "kin"},
    // s
    {"sa", "san"}, {"sc", "srd"}, {"sd", "snd"}, {"se", "sme"},
    {"sg", "sag"}, {"si", "sin"}, {"sk", "slo"}, {"sl", "slv"},
    {"slk", "slo"}, {"sm", "smo"}, {"sn", "sna"}, {"so", "som"},
    {"sq", "alb"}, {"sqi", "alb"}, {"sr", "srp"}, {"ss", "ssw"},
    {"st", "sot"}, {"su", "sun"}, {"sv", "swe"}, {"sw", "swa"},
    // t
    {"ta", "tam"}, {"te", "tel"}, {"tg", "tgk"}, {"th", "tha"},
    {"ti", "tir"}, {"tk", "tuk"}, {"tl", "tgl"}, {"tn", "tsn"},
    {"to", "ton"}, {"tr", "tur"}, {"ts", "tso"}, {"tt", "tat"},
    {"tw", "twi"}, {"ty", "tah"},
    // u
    {"ug", "uig"}, {"uk", "ukr"}, {"ur", "urd"}, {"uz", "uzb"},
    // v
    {"ve", "ven"}, {"vi", "vie"}, {"vo", "vol"},
    // w
    {"wa", "wln"}, {"wo", "wol"},
    // x
    {"xh", "xho"},
    // y
    {"yi", "yid"}, {"yo", "yor"},
    // z
    {"za", "zha"}, {"zh", "chi"}, {"zho", "chi"}, {"zu", "zul"},
};
|
||||
|
||||
/*
 * Search key handed to bsearch() when looking a tag up in langmap.
 *
 * The tag is described by pointer + length rather than as a C string because
 * it is usually the leading subtag of a longer string and is therefore not
 * NUL-terminated at `size`.
 *
 * Field order matters: canonicalize() initializes this struct positionally.
 */
struct langsearch {
    const char *str;    // first character of the tag
    size_t size;        // number of characters belonging to the tag
};
|
||||
|
||||
static int lang_compare(const void *s, const void *k)
|
||||
{
|
||||
const struct langsearch *search = s;
|
||||
const struct lang *key = k;
|
||||
|
||||
int ret = strncasecmp(search->str, key->match, search->size);
|
||||
if (!ret && search->size < sizeof(key->match) && key->match[search->size])
|
||||
return 1;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void canonicalize(const char **lang, size_t *size)
|
||||
{
|
||||
if (*size > sizeof(langmap[0].match))
|
||||
return;
|
||||
|
||||
struct langsearch search = {*lang, *size};
|
||||
struct lang *l = bsearch(&search, langmap, MP_ARRAY_SIZE(langmap), sizeof(langmap[0]),
|
||||
&lang_compare);
|
||||
|
||||
if (l) {
|
||||
*lang = l->canonical;
|
||||
*size = strnlen(l->canonical, sizeof(l->canonical));
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Case-insensitive equality test for two length-delimited subtags.
 *
 * l1/s1 and l2/s2 each describe a subtag by start pointer and length; the
 * subtags need not be NUL-terminated at that length. Returns true only if
 * both have the same length and the same characters, ignoring case.
 */
static bool tag_matches(const char *l1, size_t s1, const char *l2, size_t s2)
{
    if (s1 != s2)
        return false;

    return strncasecmp(l1, l2, s1) == 0;
}
|
||||
|
||||
int mp_match_lang_single(const char *l1, const char *l2)
|
||||
{
|
||||
// We never consider null or empty strings to match
|
||||
if (!l1 || !l2 || !*l1 || !*l2)
|
||||
return 0;
|
||||
|
||||
// The first subtag should always be a language; canonicalize to 3-letter ISO 639-2B (arbitrarily chosen)
|
||||
size_t s1 = strcspn(l1, "-_");
|
||||
size_t s2 = strcspn(l2, "-_");
|
||||
|
||||
const char *l1c = l1;
|
||||
const char *l2c = l2;
|
||||
size_t s1c = s1;
|
||||
size_t s2c = s2;
|
||||
|
||||
canonicalize(&l1c, &s1c);
|
||||
canonicalize(&l2c, &s2c);
|
||||
|
||||
// If the first subtags don't match, we have no match at all
|
||||
if (!tag_matches(l1c, s1c, l2c, s2c))
|
||||
return 0;
|
||||
|
||||
// Attempt to match each subtag in each string against each in the other
|
||||
int score = 1;
|
||||
bool x1 = false;
|
||||
int count = 0;
|
||||
for (;;) {
|
||||
l1 += s1;
|
||||
|
||||
while (*l1 == '-' || *l1 == '_')
|
||||
l1++;
|
||||
|
||||
if (!*l1)
|
||||
break;
|
||||
|
||||
s1 = strcspn(l1, "-_");
|
||||
if (tag_matches(l1, s1, "x", 1)) {
|
||||
x1 = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
const char *l2o = l2;
|
||||
size_t s2o = s2;
|
||||
bool x2 = false;
|
||||
for (;;) {
|
||||
l2 += s2;
|
||||
|
||||
while (*l2 == '-' || *l2 == '_')
|
||||
l2++;
|
||||
|
||||
if (!*l2)
|
||||
break;
|
||||
|
||||
s2 = strcspn(l2, "-_");
|
||||
if (tag_matches(l2, s2, "x", 1)) {
|
||||
x2 = true;
|
||||
if (!x1)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Private-use subtags only match against other private-use subtags
|
||||
if (x1 && !x2)
|
||||
continue;
|
||||
|
||||
if (tag_matches(l1c, s1c, l2c, s2c)) {
|
||||
// Matches for subtags earlier in the user's string take priority over later ones,
|
||||
// for up to LANGUAGE_SCORE_BITS subtags
|
||||
int shift = (LANGUAGE_SCORE_BITS - count - 1);
|
||||
if (shift < 0)
|
||||
shift = 0;
|
||||
score += (1 << shift);
|
||||
|
||||
if (score >= LANGUAGE_SCORE_MAX)
|
||||
return LANGUAGE_SCORE_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
l2 = l2o;
|
||||
s2 = s2o;
|
||||
|
||||
count++;
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
29
misc/language.h
Normal file
29
misc/language.h
Normal file
@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Language code utility functions
|
||||
*
|
||||
* This file is part of mpv.
|
||||
*
|
||||
* mpv is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* mpv is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef MP_LANGUAGE_H
#define MP_LANGUAGE_H

/*
 * Number of subtags after the language subtag that receive their own,
 * decreasing weight in a match score; matches on later subtags all get the
 * lowest weight.
 */
#define LANGUAGE_SCORE_BITS 16

/* Largest score mp_match_lang_single() returns. */
#define LANGUAGE_SCORE_MAX (1 << LANGUAGE_SCORE_BITS)

/*
 * Compare two language tags whose subtags are separated by '-' or '_'.
 *
 * Where applicable, l1 is the user-specified code and l2 is the code being
 * checked against it.
 *
 * Returns 0 if the tags do not match (including when either one is NULL or
 * empty), otherwise a positive score of at most LANGUAGE_SCORE_MAX.
 */
int mp_match_lang_single(const char *l1, const char *l2);

#endif /* MP_LANGUAGE_H */
|
@ -355,6 +355,7 @@ def build(ctx):
|
||||
( "misc/dispatch.c" ),
|
||||
( "misc/jni.c", "android" ),
|
||||
( "misc/json.c" ),
|
||||
( "misc/language.c" ),
|
||||
( "misc/natural_sort.c" ),
|
||||
( "misc/node.c" ),
|
||||
( "misc/rendezvous.c" ),
|
||||
|
Loading…
Reference in New Issue
Block a user