mpv/misc/language.c

/*
 * Language code utility functions
 *
 * This file is part of mpv.
 *
 * mpv is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * mpv is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "language.h"

#include "common/common.h"
#include "osdep/strnlen.h"

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

/*
 * Language code lookup table.
 *
 * Each entry maps an alternative spelling of a language code (`match`: a
 * two-letter code or an alternative three-letter code) to the three-letter
 * form used for comparisons (`canonical`, e.g. "de" and "deu" -> "ger").
 *
 * The table is searched with bsearch(), so entries MUST stay sorted by
 * `match` in ascending byte order, with a shorter code placed before any
 * longer code it is a prefix of ("de" before "deu").
 */
static const struct lang {
    char match[4];
    char canonical[4];
} langmap[] = {
    {"aa", "aar"},
    {"ab", "abk"},
    {"ae", "ave"},
    {"af", "afr"},
    {"ak", "aka"},
    {"am", "amh"},
    {"an", "arg"},
    {"ar", "ara"},
    {"as", "asm"},
    {"av", "ava"},
    {"ay", "aym"},
    {"az", "aze"},
    {"ba", "bak"},
    {"be", "bel"},
    {"bg", "bul"},
    {"bh", "bih"},
    {"bi", "bis"},
    {"bm", "bam"},
    {"bn", "ben"},
    {"bo", "tib"},
    {"bod", "tib"},
    {"br", "bre"},
    {"bs", "bos"},
    {"ca", "cat"},
    {"ce", "che"},
    {"ces", "cze"},
    {"ch", "cha"},
    {"co", "cos"},
    {"cr", "cre"},
    {"cs", "cze"},
    {"cu", "chu"},
    {"cv", "chv"},
    {"cy", "wel"},
    {"cym", "wel"},
    {"da", "dan"},
    {"de", "ger"},
    {"deu", "ger"},
    {"dv", "div"},
    {"dz", "dzo"},
    {"ee", "ewe"},
    {"el", "gre"},
    {"ell", "gre"},
    {"en", "eng"},
    {"eo", "epo"},
    {"es", "spa"},
    {"et", "est"},
    {"eu", "baq"},
    {"eus", "baq"},
    {"fa", "per"},
    {"fas", "per"},
    {"ff", "ful"},
    {"fi", "fin"},
    {"fj", "fij"},
    {"fo", "fao"},
    {"fr", "fre"},
    {"fra", "fre"},
    {"fy", "fry"},
    {"ga", "gle"},
    {"gd", "gla"},
    {"gl", "glg"},
    {"gn", "grn"},
    {"gu", "guj"},
    {"gv", "glv"},
    {"ha", "hau"},
    {"he", "heb"},
    {"hi", "hin"},
    {"ho", "hmo"},
    {"hr", "hrv"},
    {"ht", "hat"},
    {"hu", "hun"},
    {"hy", "arm"},
    {"hye", "arm"},
    {"hz", "her"},
    {"ia", "ina"},
    {"id", "ind"},
    {"ie", "ile"},
    {"ig", "ibo"},
    {"ii", "iii"},
    {"ik", "ipk"},
    {"io", "ido"},
    {"is", "ice"},
    {"isl", "ice"},
    {"it", "ita"},
    {"iu", "iku"},
    {"ja", "jpn"},
    {"jv", "jav"},
    {"ka", "geo"},
    {"kat", "geo"},
    {"kg", "kon"},
    {"ki", "kik"},
    {"kj", "kua"},
    {"kk", "kaz"},
    {"kl", "kal"},
    {"km", "khm"},
    {"kn", "kan"},
    {"ko", "kor"},
    {"kr", "kau"},
    {"ks", "kas"},
    {"ku", "kur"},
    {"kv", "kom"},
    {"kw", "cor"},
    {"ky", "kir"},
    {"la", "lat"},
    {"lb", "ltz"},
    {"lg", "lug"},
    {"li", "lim"},
    {"ln", "lin"},
    {"lo", "lao"},
    {"lt", "lit"},
    {"lu", "lub"},
    {"lv", "lav"},
    {"mg", "mlg"},
    {"mh", "mah"},
    {"mi", "mao"},
    {"mk", "mac"},
    {"mkd", "mac"},
    {"ml", "mal"},
    {"mn", "mon"},
    {"mr", "mar"},
    {"mri", "mao"},
    {"ms", "may"},
    {"msa", "may"},
    {"mt", "mlt"},
    {"my", "bur"},
    {"mya", "bur"},
    {"na", "nau"},
    {"nb", "nob"},
    {"nd", "nde"},
    {"ne", "nep"},
    {"ng", "ndo"},
    {"nl", "dut"},
    {"nld", "dut"},
    {"nn", "nno"},
    {"no", "nor"},
    {"nr", "nbl"},
    {"nv", "nav"},
    {"ny", "nya"},
    {"oc", "oci"},
    {"oj", "oji"},
    {"om", "orm"},
    {"or", "ori"},
    {"os", "oss"},
    {"pa", "pan"},
    {"pi", "pli"},
    {"pl", "pol"},
    {"ps", "pus"},
    {"pt", "por"},
    {"qu", "que"},
    {"rm", "roh"},
    {"rn", "run"},
    {"ro", "rum"},
    {"ron", "rum"},
    {"ru", "rus"},
    {"rw", "kin"},
    {"sa", "san"},
    {"sc", "srd"},
    {"sd", "snd"},
    {"se", "sme"},
    {"sg", "sag"},
    {"si", "sin"},
    {"sk", "slo"},
    {"sl", "slv"},
    {"slk", "slo"},
    {"sm", "smo"},
    {"sn", "sna"},
    {"so", "som"},
    {"sq", "alb"},
    {"sqi", "alb"},
    {"sr", "srp"},
    {"ss", "ssw"},
    {"st", "sot"},
    {"su", "sun"},
    {"sv", "swe"},
    {"sw", "swa"},
    {"ta", "tam"},
    {"te", "tel"},
    {"tg", "tgk"},
    {"th", "tha"},
    {"ti", "tir"},
    {"tk", "tuk"},
    {"tl", "tgl"},
    {"tn", "tsn"},
    {"to", "ton"},
    {"tr", "tur"},
    {"ts", "tso"},
    {"tt", "tat"},
    {"tw", "twi"},
    {"ty", "tah"},
    {"ug", "uig"},
    {"uk", "ukr"},
    {"ur", "urd"},
    {"uz", "uzb"},
    {"ve", "ven"},
    {"vi", "vie"},
    {"vo", "vol"},
    {"wa", "wln"},
    {"wo", "wol"},
    {"xh", "xho"},
    {"yi", "yid"},
    {"yo", "yor"},
    {"za", "zha"},
    {"zh", "chi"},
    {"zho", "chi"},
    {"zu", "zul"},
};

/*
 * bsearch() probe: a language subtag given as pointer + length. The subtag is
 * usually a slice of a longer tag, so it is not NUL-terminated at `size`.
 */
struct langsearch {
    const char *str;
    size_t size;
};

/*
 * bsearch() comparator: orders a struct langsearch probe (s) against a
 * struct lang table entry (k), comparing case-insensitively against
 * the entry's `match` field.
 *
 * Returns <0, 0 or >0 if the probe sorts before, equal to, or after the entry.
 */
static int lang_compare(const void *s, const void *k)
{
    const struct langsearch *search = s;
    const struct lang *key = k;

    int ret = strncasecmp(search->str, key->match, search->size);
    if (ret)
        return ret;

    // The first search->size bytes agree. If the entry still has characters
    // left, the probe is a proper prefix of it ("fr" vs "fra") and so sorts
    // BEFORE the entry, matching the table order. Reporting "after" here
    // would steer bsearch() away from the shorter entry and miss it.
    if (search->size < sizeof(key->match) && key->match[search->size])
        return -1;

    return 0;
}

static void canonicalize(const char **lang, size_t *size)
{
    if (*size > sizeof(langmap[0].match))
        return;

    struct langsearch search = {*lang, *size};
    struct lang *l = bsearch(&search, langmap, MP_ARRAY_SIZE(langmap), sizeof(langmap[0]),
                             &lang_compare);

    if (l) {
        *lang = l->canonical;
        *size = strnlen(l->canonical, sizeof(l->canonical));
    }
}

/*
 * Case-insensitive equality of two length-delimited subtags.
 *
 * l1/s1 and l2/s2 are pointer + length pairs. Returns true only when both
 * lengths are equal and the first s1 bytes compare equal ignoring case.
 */
static bool tag_matches(const char *l1, size_t s1, const char *l2, size_t s2)
{
    if (s1 != s2)
        return false;

    return strncasecmp(l1, l2, s1) == 0;
}

/*
 * Score how well language tag l2 matches language tag l1.
 *
 * Both arguments are tags made of subtags separated by '-' or '_'
 * (e.g. "en-US", "pt_BR"). The first subtag of each is canonicalized before
 * comparison, so different spellings of one language ("de", "deu", "ger")
 * are treated as equal. All comparisons ignore case.
 *
 * Returns 0 when either tag is NULL or empty, or when the first subtags
 * differ. Otherwise the result is 1 plus a bonus for each later subtag of l1
 * that also appears among the later subtags of l2. Bonuses are weighted so
 * that subtags earlier in l1 count more, and the total is capped at
 * LANGUAGE_SCORE_MAX. Subtags following an "x" subtag (private use) are only
 * compared against subtags that follow an "x" in the other tag.
 */
int mp_match_lang_single(const char *l1, const char *l2)
{
    // We never consider null or empty strings to match
    if (!l1 || !l2 || !*l1 || !*l2)
        return 0;

    // The first subtag should always be a language; canonicalize to 3-letter ISO 639-2B (arbitrarily chosen)
    size_t s1 = strcspn(l1, "-_");
    size_t s2 = strcspn(l2, "-_");

    const char *l1c = l1;
    const char *l2c = l2;
    size_t s1c = s1;
    size_t s2c = s2;

    canonicalize(&l1c, &s1c);
    canonicalize(&l2c, &s2c);

    // If the first subtags don't match, we have no match at all
    if (!tag_matches(l1c, s1c, l2c, s2c))
        return 0;

    // Attempt to match each subtag in each string against each in the other
    int score = 1;
    bool x1 = false;    // set once l1 has entered its private-use ("x") section
    int count = 0;      // index of the current non-"x" subtag of l1 (after the first)
    for (;;) {
        // Step past the previous subtag of l1 and any separators after it
        l1 += s1;

        while (*l1 == '-' || *l1 == '_')
            l1++;

        if (!*l1)
            break;

        s1 = strcspn(l1, "-_");
        if (tag_matches(l1, s1, "x", 1)) {
            x1 = true;
            continue;
        }

        // Remember where l2's first subtag is so the scan can restart from
        // there for the next subtag of l1
        const char *l2o = l2;
        size_t s2o = s2;
        bool x2 = false;    // set once l2 has entered its private-use section
        for (;;) {
            l2 += s2;

            while (*l2 == '-' || *l2 == '_')
                l2++;

            if (!*l2)
                break;

            s2 = strcspn(l2, "-_");
            if (tag_matches(l2, s2, "x", 1)) {
                x2 = true;
                // A regular subtag of l1 cannot match anything past l2's "x"
                if (!x1)
                    break;
                continue;
            }

            // Private-use subtags only match against other private-use subtags
            if (x1 && !x2)
                continue;

            // Compare the CURRENT subtags. (Comparing the canonicalized first
            // subtags here would always succeed, since they were already
            // found equal above, and would reward every pair of subtags.)
            if (tag_matches(l1, s1, l2, s2)) {
                // Matches for subtags earlier in the user's string take priority over later ones,
                // for up to LANGUAGE_SCORE_BITS subtags
                int shift = (LANGUAGE_SCORE_BITS - count - 1);
                if (shift < 0)
                    shift = 0;
                score += (1 << shift);

                if (score >= LANGUAGE_SCORE_MAX)
                    return LANGUAGE_SCORE_MAX;
            }
        }

        l2 = l2o;
        s2 = s2o;

        count++;
    }

    return score;
}
misc: add language-matching utilities 2021-05-26 22:46:56 +00:00			`/*`
			`* Language code utility functions`
			`*`
			`* This file is part of mpv.`
			`*`
			`* mpv is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* mpv is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with mpv. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#include "language.h"`

			`#include "common/common.h"`
			`#include "osdep/strnlen.h"`

			`#include <stdbool.h>`
			`#include <stddef.h>`
			`#include <stdlib.h>`
			`#include <string.h>`
			`#include <strings.h>`

			`static const struct lang {`
			`char match[4];`
			`char canonical[4];`
			`} langmap[] = {`
			`{"aa", "aar"},`
			`{"ab", "abk"},`
			`{"ae", "ave"},`
			`{"af", "afr"},`
			`{"ak", "aka"},`
			`{"am", "amh"},`
			`{"an", "arg"},`
			`{"ar", "ara"},`
			`{"as", "asm"},`
			`{"av", "ava"},`
			`{"ay", "aym"},`
			`{"az", "aze"},`
			`{"ba", "bak"},`
			`{"be", "bel"},`
			`{"bg", "bul"},`
			`{"bh", "bih"},`
			`{"bi", "bis"},`
			`{"bm", "bam"},`
			`{"bn", "ben"},`
			`{"bo", "tib"},`
			`{"bod", "tib"},`
			`{"br", "bre"},`
			`{"bs", "bos"},`
			`{"ca", "cat"},`
			`{"ce", "che"},`
			`{"ces", "cze"},`
			`{"ch", "cha"},`
			`{"co", "cos"},`
			`{"cr", "cre"},`
			`{"cs", "cze"},`
			`{"cu", "chu"},`
			`{"cv", "chv"},`
			`{"cy", "wel"},`
			`{"cym", "wel"},`
			`{"da", "dan"},`
			`{"de", "ger"},`
			`{"deu", "ger"},`
			`{"dv", "div"},`
			`{"dz", "dzo"},`
			`{"ee", "ewe"},`
			`{"el", "gre"},`
			`{"ell", "gre"},`
			`{"en", "eng"},`
			`{"eo", "epo"},`
			`{"es", "spa"},`
			`{"et", "est"},`
			`{"eu", "baq"},`
			`{"eus", "baq"},`
			`{"fa", "per"},`
			`{"fas", "per"},`
			`{"ff", "ful"},`
			`{"fi", "fin"},`
			`{"fj", "fij"},`
			`{"fo", "fao"},`
			`{"fr", "fre"},`
			`{"fra", "fre"},`
			`{"fy", "fry"},`
			`{"ga", "gle"},`
			`{"gd", "gla"},`
			`{"gl", "glg"},`
			`{"gn", "grn"},`
			`{"gu", "guj"},`
			`{"gv", "glv"},`
			`{"ha", "hau"},`
			`{"he", "heb"},`
			`{"hi", "hin"},`
			`{"ho", "hmo"},`
			`{"hr", "hrv"},`
			`{"ht", "hat"},`
			`{"hu", "hun"},`
			`{"hy", "arm"},`
			`{"hye", "arm"},`
			`{"hz", "her"},`
			`{"ia", "ina"},`
			`{"id", "ind"},`
			`{"ie", "ile"},`
			`{"ig", "ibo"},`
			`{"ii", "iii"},`
			`{"ik", "ipk"},`
			`{"io", "ido"},`
			`{"is", "ice"},`
			`{"isl", "ice"},`
			`{"it", "ita"},`
			`{"iu", "iku"},`
			`{"ja", "jpn"},`
			`{"jv", "jav"},`
			`{"ka", "geo"},`
			`{"kat", "geo"},`
			`{"kg", "kon"},`
			`{"ki", "kik"},`
			`{"kj", "kua"},`
			`{"kk", "kaz"},`
			`{"kl", "kal"},`
			`{"km", "khm"},`
			`{"kn", "kan"},`
			`{"ko", "kor"},`
			`{"kr", "kau"},`
			`{"ks", "kas"},`
			`{"ku", "kur"},`
			`{"kv", "kom"},`
			`{"kw", "cor"},`
			`{"ky", "kir"},`
			`{"la", "lat"},`
			`{"lb", "ltz"},`
			`{"lg", "lug"},`
			`{"li", "lim"},`
			`{"ln", "lin"},`
			`{"lo", "lao"},`
			`{"lt", "lit"},`
			`{"lu", "lub"},`
			`{"lv", "lav"},`
			`{"mg", "mlg"},`
			`{"mh", "mah"},`
			`{"mi", "mao"},`
			`{"mk", "mac"},`
			`{"mkd", "mac"},`
			`{"ml", "mal"},`
			`{"mn", "mon"},`
			`{"mr", "mar"},`
			`{"mri", "mao"},`
			`{"ms", "may"},`
			`{"msa", "may"},`
			`{"mt", "mlt"},`
			`{"my", "bur"},`
			`{"mya", "bur"},`
			`{"na", "nau"},`
			`{"nb", "nob"},`
			`{"nd", "nde"},`
			`{"ne", "nep"},`
			`{"ng", "ndo"},`
			`{"nl", "dut"},`
			`{"nld", "dut"},`
			`{"nn", "nno"},`
			`{"no", "nor"},`
			`{"nr", "nbl"},`
			`{"nv", "nav"},`
			`{"ny", "nya"},`
			`{"oc", "oci"},`
			`{"oj", "oji"},`
			`{"om", "orm"},`
			`{"or", "ori"},`
			`{"os", "oss"},`
			`{"pa", "pan"},`
			`{"pi", "pli"},`
			`{"pl", "pol"},`
			`{"ps", "pus"},`
			`{"pt", "por"},`
			`{"qu", "que"},`
			`{"rm", "roh"},`
			`{"rn", "run"},`
			`{"ro", "rum"},`
			`{"ron", "rum"},`
			`{"ru", "rus"},`
			`{"rw", "kin"},`
			`{"sa", "san"},`
			`{"sc", "srd"},`
			`{"sd", "snd"},`
			`{"se", "sme"},`
			`{"sg", "sag"},`
			`{"si", "sin"},`
			`{"sk", "slo"},`
			`{"sl", "slv"},`
			`{"slk", "slo"},`
			`{"sm", "smo"},`
			`{"sn", "sna"},`
			`{"so", "som"},`
			`{"sq", "alb"},`
			`{"sqi", "alb"},`
			`{"sr", "srp"},`
			`{"ss", "ssw"},`
			`{"st", "sot"},`
			`{"su", "sun"},`
			`{"sv", "swe"},`
			`{"sw", "swa"},`
			`{"ta", "tam"},`
			`{"te", "tel"},`
			`{"tg", "tgk"},`
			`{"th", "tha"},`
			`{"ti", "tir"},`
			`{"tk", "tuk"},`
			`{"tl", "tgl"},`
			`{"tn", "tsn"},`
			`{"to", "ton"},`
			`{"tr", "tur"},`
			`{"ts", "tso"},`
			`{"tt", "tat"},`
			`{"tw", "twi"},`
			`{"ty", "tah"},`
			`{"ug", "uig"},`
			`{"uk", "ukr"},`
			`{"ur", "urd"},`
			`{"uz", "uzb"},`
			`{"ve", "ven"},`
			`{"vi", "vie"},`
			`{"vo", "vol"},`
			`{"wa", "wln"},`
			`{"wo", "wol"},`
			`{"xh", "xho"},`
			`{"yi", "yid"},`
			`{"yo", "yor"},`
			`{"za", "zha"},`
			`{"zh", "chi"},`
			`{"zho", "chi"},`
			`{"zu", "zul"},`
			`};`

			`struct langsearch {`
			`const char *str;`
			`size_t size;`
			`};`

			`static int lang_compare(const void *s, const void *k)`
			`{`
			`const struct langsearch *search = s;`
			`const struct lang *key = k;`

			`int ret = strncasecmp(search->str, key->match, search->size);`
			`if (!ret && search->size < sizeof(key->match) && key->match[search->size])`
			`return 1;`
			`return ret;`
			`}`

			`static void canonicalize(const char **lang, size_t *size)`
			`{`
			`if (*size > sizeof(langmap[0].match))`
			`return;`

			`struct langsearch search = {*lang, *size};`
			`struct lang *l = bsearch(&search, langmap, MP_ARRAY_SIZE(langmap), sizeof(langmap[0]),`
			`&lang_compare);`

			`if (l) {`
			`*lang = l->canonical;`
			`*size = strnlen(l->canonical, sizeof(l->canonical));`
			`}`
			`}`

			`static bool tag_matches(const char *l1, size_t s1, const char *l2, size_t s2)`
			`{`
			`return s1 == s2 && !strncasecmp(l1, l2, s1);`
			`}`

			`int mp_match_lang_single(const char *l1, const char *l2)`
			`{`
			`// We never consider null or empty strings to match`
			`if (!l1 \|\| !l2 \|\| !*l1 \|\| !*l2)`
			`return 0;`

			`// The first subtag should always be a language; canonicalize to 3-letter ISO 639-2B (arbitrarily chosen)`
			`size_t s1 = strcspn(l1, "-_");`
			`size_t s2 = strcspn(l2, "-_");`

			`const char *l1c = l1;`
			`const char *l2c = l2;`
			`size_t s1c = s1;`
			`size_t s2c = s2;`

			`canonicalize(&l1c, &s1c);`
			`canonicalize(&l2c, &s2c);`

			`// If the first subtags don't match, we have no match at all`
			`if (!tag_matches(l1c, s1c, l2c, s2c))`
			`return 0;`

			`// Attempt to match each subtag in each string against each in the other`
			`int score = 1;`
			`bool x1 = false;`
			`int count = 0;`
			`for (;;) {`
			`l1 += s1;`

			`while (*l1 == '-' \|\| *l1 == '_')`
			`l1++;`

			`if (!*l1)`
			`break;`

			`s1 = strcspn(l1, "-_");`
			`if (tag_matches(l1, s1, "x", 1)) {`
			`x1 = true;`
			`continue;`
			`}`

			`const char *l2o = l2;`
			`size_t s2o = s2;`
			`bool x2 = false;`
			`for (;;) {`
			`l2 += s2;`

			`while (*l2 == '-' \|\| *l2 == '_')`
			`l2++;`

			`if (!*l2)`
			`break;`

			`s2 = strcspn(l2, "-_");`
			`if (tag_matches(l2, s2, "x", 1)) {`
			`x2 = true;`
			`if (!x1)`
			`break;`
			`continue;`
			`}`

			`// Private-use subtags only match against other private-use subtags`
			`if (x1 && !x2)`
			`continue;`

			`if (tag_matches(l1c, s1c, l2c, s2c)) {`
			`// Matches for subtags earlier in the user's string take priority over later ones,`
			`// for up to LANGUAGE_SCORE_BITS subtags`
			`int shift = (LANGUAGE_SCORE_BITS - count - 1);`
			`if (shift < 0)`
			`shift = 0;`
			`score += (1 << shift);`

			`if (score >= LANGUAGE_SCORE_MAX)`
			`return LANGUAGE_SCORE_MAX;`
			`}`
			`}`

			`l2 = l2o;`
			`s2 = s2o;`

			`count++;`
			`}`

			`return score;`
			`}`