mpv/TOOLS/matroska.py

#!/usr/bin/env python3
"""
Generate C definitions for parsing Matroska files.
Can also be used to directly parse Matroska files and display their contents.
"""

import struct
import sys
from binascii import hexlify
from math import ldexp

#
# This file is part of mpv.
#
# mpv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# mpv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
#


# Element table for the EBML header, consumed by parse_elems().
# Each string entry has the form "Name, hex ID, value type". A "*" on the
# name marks the element as "multiple" (parse_elems() records this flag and
# strips the "*"). A nested tuple lists the subelements of the entry that
# immediately precedes it.
elements_ebml = (
    "EBML, 1a45dfa3, sub", (
        "EBMLVersion, 4286, uint",
        "EBMLReadVersion, 42f7, uint",
        "EBMLMaxIDLength, 42f2, uint",
        "EBMLMaxSizeLength, 42f3, uint",
        "DocType, 4282, str",
        "DocTypeVersion, 4287, uint",
        "DocTypeReadVersion, 4285, uint",
    ),

    "CRC32, bf, binary",
    "Void, ec, binary",
)

# Element table for the Matroska "Segment" tree, consumed by parse_elems().
# Each string entry has the form "Name, hex ID, value type", where the value
# type is one of: sub, uint, sint, float, str, binary, ebml_id. A "*" on the
# name marks the element as "multiple" (it becomes a pointer field in the
# generated C structs). A nested tuple lists the subelements of the entry
# that immediately precedes it. Hex IDs may be written in either case; they
# are lowercased when parsed.
elements_matroska = (
    "Segment, 18538067, sub", (

        "SeekHead*, 114d9b74, sub", (
            "Seek*, 4dbb, sub", (
                "SeekID, 53ab, ebml_id",
                "SeekPosition, 53ac, uint",
            ),
        ),

        "Info*, 1549a966, sub", (
            "SegmentUID, 73a4, binary",
            "PrevUID, 3cb923, binary",
            "NextUID, 3eb923, binary",
            "TimecodeScale, 2ad7b1, uint",
            "DateUTC, 4461, sint",
            "Title, 7ba9, str",
            "MuxingApp, 4d80, str",
            "WritingApp, 5741, str",
            "Duration, 4489, float",
        ),

        "Cluster*, 1f43b675, sub", (
            "Timecode, e7, uint",
            "BlockGroup*, a0, sub", (
                "Block, a1, binary",
                "BlockDuration, 9b, uint",
                "ReferenceBlock*, fb, sint",
                "DiscardPadding,  75A2, sint",
                "BlockAdditions, 75A1, sub", (
                    "BlockMore*, A6, sub", (
                        "BlockAddID, EE, uint",
                        "BlockAdditional, A5, binary",
                    ),
                ),
            ),
            "SimpleBlock*, a3, binary",
        ),

        "Tracks*, 1654ae6b, sub", (
            "TrackEntry*, ae, sub", (
                "TrackNumber, d7, uint",
                "TrackUID, 73c5, uint",
                "TrackType, 83, uint",
                "FlagEnabled, b9, uint",
                "FlagDefault, 88, uint",
                "FlagForced, 55aa, uint",
                "FlagLacing, 9c, uint",
                "MinCache, 6de7, uint",
                "MaxCache, 6df8, uint",
                "DefaultDuration, 23e383, uint",
                "TrackTimecodeScale, 23314f, float",
                "MaxBlockAdditionID, 55ee, uint",
                "Name, 536e, str",
                "Language, 22b59c, str",
                "LanguageBCP47, 22b59d, str",
                "CodecID, 86, str",
                "CodecPrivate, 63a2, binary",
                "CodecName, 258688, str",
                "CodecDecodeAll, aa, uint",
                "CodecDelay, 56aa, uint",
                "SeekPreRoll, 56bb, uint",
                "Video, e0, sub", (
                    "FlagInterlaced, 9a, uint",
                    "PixelWidth, b0, uint",
                    "PixelHeight, ba, uint",
                    "DisplayWidth, 54b0, uint",
                    "DisplayHeight, 54ba, uint",
                    "DisplayUnit, 54b2, uint",
                    "PixelCropTop, 54bb, uint",
                    "PixelCropLeft, 54cc, uint",
                    "PixelCropRight, 54dd, uint",
                    "PixelCropBottom, 54aa, uint",
                    "FrameRate, 2383e3, float",
                    "ColourSpace, 2eb524, binary",
                    "StereoMode, 53b8, uint",
                    "Colour, 55b0, sub", (
                        "MatrixCoefficients,      55B1, uint",
                        "BitsPerChannel,          55B2, uint",
                        "ChromaSubsamplingHorz,   55B3, uint",
                        "ChromaSubsamplingVert,   55B4, uint",
                        "CbSubsamplingHorz,       55B5, uint",
                        "CbSubsamplingVert,       55B6, uint",
                        "ChromaSitingHorz,        55B7, uint",
                        "ChromaSitingVert,        55B8, uint",
                        "Range,                   55B9, uint",
                        "TransferCharacteristics, 55BA, uint",
                        "Primaries,               55BB, uint",
                        "MaxCLL,                  55BC, uint",
                        "MaxFALL,                 55BD, uint",
                        "MasteringMetadata,       55D0, sub", (
                            "PrimaryRChromaticityX,   55D1, float",
                            "PrimaryRChromaticityY,   55D2, float",
                            "PrimaryGChromaticityX,   55D3, float",
                            "PrimaryGChromaticityY,   55D4, float",
                            "PrimaryBChromaticityX,   55D5, float",
                            "PrimaryBChromaticityY,   55D6, float",
                            "WhitePointChromaticityX, 55D7, float",
                            "WhitePointChromaticityY, 55D8, float",
                            "LuminanceMax,            55D9, float",
                            "LuminanceMin,            55DA, float",
                        ),
                    ),
                    "Projection, 7670, sub", (
                        "ProjectionType, 7671, uint",
                        "ProjectionPrivate, 7672, binary",
                        "ProjectionPoseYaw, 7673, float",
                        "ProjectionPosePitch, 7674, float",
                        "ProjectionPoseRoll, 7675, float",
                    ),
                ),
                "Audio, e1, sub", (
                    "SamplingFrequency, b5, float",
                    "OutputSamplingFrequency, 78b5, float",
                    "Channels, 9f, uint",
                    "BitDepth, 6264, uint",
                ),
                "ContentEncodings, 6d80, sub", (
                    "ContentEncoding*, 6240, sub", (
                        "ContentEncodingOrder, 5031, uint",
                        "ContentEncodingScope, 5032, uint",
                        "ContentEncodingType, 5033, uint",
                        "ContentCompression, 5034, sub", (
                            "ContentCompAlgo, 4254, uint",
                            "ContentCompSettings, 4255, binary",
                        ),
                    ),
                ),
                "BlockAdditionMapping*, 41e4, sub", (
                    "BlockAddIDValue, 41f0, uint",
                    "BlockAddIDName, 41a4, str",
                    "BlockAddIDType, 41e7, uint",
                    "BlockAddIDExtraData, 41ed, binary",
                ),
            ),
        ),

        "Cues, 1c53bb6b, sub", (
            "CuePoint*, bb, sub", (
                "CueTime, b3, uint",
                "CueTrackPositions*, b7, sub", (
                    "CueTrack, f7, uint",
                    "CueClusterPosition, f1, uint",
                    "CueRelativePosition, f0, uint",
                    "CueDuration, b2, uint",
                ),
            ),
        ),

        "Attachments, 1941a469, sub", (
            "AttachedFile*, 61a7, sub", (
                "FileDescription, 467e, str",
                "FileName, 466e, str",
                "FileMimeType, 4660, str",
                "FileData, 465c, binary",
                "FileUID, 46ae, uint",
            ),
        ),

        "Chapters, 1043a770, sub", (
            "EditionEntry*, 45b9, sub", (
                "EditionUID, 45bc, uint",
                "EditionFlagHidden, 45bd, uint",
                "EditionFlagDefault, 45db, uint",
                "EditionFlagOrdered, 45dd, uint",
                "ChapterAtom*, b6, sub", (
                    "ChapterUID, 73c4, uint",
                    "ChapterTimeStart, 91, uint",
                    "ChapterTimeEnd, 92, uint",
                    "ChapterFlagHidden, 98, uint",
                    "ChapterFlagEnabled, 4598, uint",
                    "ChapterSegmentUID, 6e67, binary",
                    "ChapterSegmentEditionUID, 6ebc, uint",
                    "ChapterDisplay*, 80, sub", (
                        "ChapString, 85, str",
                        "ChapLanguage*, 437c, str",
                        "ChapLanguageBCP47*, 437d, str",
                        "ChapCountry*, 437e, str",
                    ),
                ),
            ),
        ),
        "Tags*, 1254c367, sub", (
            "Tag*, 7373, sub", (
                "Targets, 63c0, sub", (
                    "TargetTypeValue, 68ca, uint",
                    "TargetType, 63ca, str",
                    "TargetTrackUID, 63c5, uint",
                    "TargetEditionUID, 63c9, uint",
                    "TargetChapterUID, 63c4, uint",
                    "TargetAttachmentUID, 63c6, uint",
                 ),
                "SimpleTag*, 67c8, sub", (
                    "TagName, 45a3, str",
                    "TagLanguage, 447a, str",
                    "TagLanguageBCP47, 447b, str",
                    "TagString, 4487, str",
                    "TagDefault, 4484, uint",
                ),
            ),
        ),
    ),
)


def byte2num(s):
    """Interpret the bytes *s* as a big-endian unsigned integer.

    An empty byte string yields 0, so zero-length EBML integer payloads
    decode to 0 instead of raising ValueError (which the previous
    hexlify()-based conversion did for empty input).
    """
    return int.from_bytes(s, "big")

def camelcase_to_words(name):
    """Convert a CamelCase element name to lower_case_with_underscores.

    A new word starts at every uppercase letter (other than the first
    character) that either follows a lowercase letter or is followed by
    one, so runs of capitals such as "EBML" or "UID" stay together.
    """
    boundaries = [0]
    for pos in range(1, len(name)):
        if not name[pos].isupper():
            continue
        follows_lower = name[pos - 1].islower()
        # The slice is empty at the end of the string; "".islower() is False.
        precedes_lower = name[pos + 1:pos + 2].islower()
        if follows_lower or precedes_lower:
            boundaries.append(pos)
    boundaries.append(len(name))
    words = [name[a:b] for a, b in zip(boundaries, boundaries[1:])]
    return "_".join(words).lower()

class MatroskaElement:
    """One element definition from the element tables.

    Attributes set by the constructor:
      name        -- element name as written in the table (without "*")
      definename  -- C macro name, "<namespace>_ID_<NAME>"
      fieldname   -- lower_case_with_underscores form of the name
      structname  -- "ebml_" + fieldname
      elid        -- element ID as passed in (a hex string in this file)
      valtype     -- value type string ("sub", "uint", "str", ...)
      ebmltype    -- matching EBML_TYPE_* constant name
      valname     -- C type used for the value
      subelements -- sequence of (MatroskaElement, multiple) pairs
      subids      -- set of the subelements' IDs
    """

    # C type used for each non-"sub" value type.
    _C_TYPES = {
        "uint": "uint64_t",
        "str": "char *",
        "binary": "bstr",
        "ebml_id": "uint32_t",
        "float": "double",
        "sint": "int64_t",
    }

    def __init__(self, name, elid, valtype, namespace):
        """Build the element; raise SyntaxError for an unknown valtype."""
        self.name = name
        self.definename = f"{namespace}_ID_{name.upper()}"
        self.fieldname = camelcase_to_words(name)
        self.structname = "ebml_" + self.fieldname
        self.elid = elid
        self.valtype = valtype
        if valtype == "sub":
            self.ebmltype = "EBML_TYPE_SUBELEMENTS"
            self.valname = "struct " + self.structname
        elif valtype in self._C_TYPES:
            self.ebmltype = "EBML_TYPE_" + valtype.upper()
            self.valname = self._C_TYPES[valtype]
        else:
            # Raised outside any except block so the error is not chained
            # to an unrelated KeyError.
            raise SyntaxError("Unrecognized value type " + valtype)
        self.subelements = ()
        # Initialised here so that elements without children also have the
        # attribute; add_subelements() replaces it.
        self.subids = set()

    def add_subelements(self, subelements):
        """Attach (element, multiple) pairs as children and index their IDs."""
        self.subelements = subelements
        self.subids = {x[0].elid for x in subelements}

# Registry of every known element: by lowercase hex ID, and in definition
# order (a parent is always registered before its children).
elementd = {}
elementlist = []


def parse_elems(elements, namespace):
    """Turn one level of an element table into MatroskaElement objects.

    String entries ("Name[*], hexid, type") create and register an element.
    Any other entry is treated as the nested table of the element created
    just before it. Returns the list of (element, multiple) pairs for this
    level.
    """
    parsed = []
    for entry in elements:
        if not isinstance(entry, str):
            # Nested table: children of the most recently created element.
            current.add_subelements(parse_elems(entry, namespace))
            continue
        name, hexid, eltype = [field.strip() for field in entry.split(",")]
        hexid = hexid.lower()
        multiple = name.endswith("*")
        current = MatroskaElement(name.strip("*"), hexid, eltype, namespace)
        elementd[hexid] = current
        elementlist.append(current)
        parsed.append((current, multiple))
    return parsed


# Populate the registries from both tables at import time.
parse_elems(elements_ebml, "EBML")
parse_elems(elements_matroska, "MATROSKA")

def printf(out, *args):
    """Write *args* to *out*, space-separated, followed by a newline."""
    line = " ".join(map(str, args))
    out.write(line)
    out.write("\n")

def generate_c_header(out):
    """Write the generated C header to the text stream *out*.

    Emits, in order: an ID #define per element, a struct per element that
    has subelements, an extern descriptor declaration per such element, and
    MAX_EBML_SUBELEMENTS (the largest subelement count of any element).
    """
    printf(out, "// Generated by TOOLS/matroska.py, do not edit manually")
    printf(out)

    for el in elementlist:
        printf(out, f"#define {el.definename:40} 0x{el.elid}")

    printf(out)

    # Only elements with children get a struct and a descriptor.
    containers = [el for el in elementlist if el.subelements]

    # elementlist holds parents before their children, so walking it
    # backwards emits each child struct before the struct that embeds it.
    for el in reversed(containers):
        printf(out)
        printf(out, f"struct {el.structname} {{")
        # Column width for the type names: the longest one plus a space.
        length = 1 + max(len(child.valname) for child, _ in el.subelements)
        for child, multiple in el.subelements:
            star = "*" if multiple else " "
            printf(out, "    {e.valname:{length}} {star}{e.fieldname};".format(
                        e=child, length=length, star=star))
        printf(out)
        for child, _ in el.subelements:
            printf(out, f"    int  n_{child.fieldname};")
        printf(out, "};")

    for el in containers:
        printf(out, f"extern const struct ebml_elem_desc {el.structname}_desc;")

    printf(out)
    widest = max(len(el.subelements) for el in elementlist)
    printf(out, "#define MAX_EBML_SUBELEMENTS", widest)


def generate_c_definitions(out):
    """Write the generated macro invocations to the text stream *out*.

    Walks elementlist in reverse. An element without subelements becomes a
    single E(...) line; an element with subelements becomes an E_S(...)
    line, one F(...) line per subelement, and a closing "}};", all wrapped
    in "#define N <fieldname>" / "#undef N".
    """
    printf(out, "// Generated by TOOLS/matroska.py, do not edit manually")
    printf(out)
    for el in reversed(elementlist):
        printf(out)
        if not el.subelements:
            printf(out, f'E("{el.name}", {el.fieldname}, {el.ebmltype})')
            continue
        printf(out, "#define N", el.fieldname)
        printf(out, f'E_S("{el.name}", {len(el.subelements)})')
        for child, multiple in el.subelements:
            entry = f"F({child.definename}, {child.fieldname}, {int(multiple)})"
            printf(out, entry)
        printf(out, "}};")
        printf(out, "#undef N")

def read(s, length):
    """Read exactly *length* bytes from *s*.

    Raises EOFError if the stream returns a different number of bytes.
    """
    data = s.read(length)
    if len(data) == length:
        return data
    raise EOFError

def read_id(s):
    """Read one element ID from *s* and return its raw bytes.

    The number of leading zero bits in the first byte gives the number of
    additional bytes that follow. The marker bit is kept in the result.
    Raises SyntaxError if the first byte is zero.
    """
    first = read(s, 1)
    lead = ord(first)
    if lead == 0:
        raise SyntaxError
    # For a non-zero byte, 8 - bit_length() is its count of leading zeros.
    extra = 8 - lead.bit_length()
    return first + read(s, extra)

def read_vint(s):
    """Read one variable-length integer from *s*.

    Returns a pair (number of bytes consumed, decoded value). The length
    marker bit of the first byte is masked out of the value.
    Raises SyntaxError if the first byte is zero.
    """
    lead = ord(read(s, 1))
    if lead == 0:
        raise SyntaxError
    # Leading zero bits of the first byte = number of following bytes.
    extra = 8 - lead.bit_length()
    marker = 1 << (7 - extra)
    payload = bytes([lead & (marker - 1)]) + read(s, extra)
    return extra + 1, byte2num(payload)

def read_str(s, length):
    """Return the next *length* bytes of *s* as raw, undecoded bytes."""
    raw = read(s, length)
    return raw

def read_uint(s, length):
    """Read *length* bytes from *s* as a big-endian unsigned integer."""
    return byte2num(read(s, length))

def read_sint(s, length):
    """Read *length* bytes from *s* as a big-endian two's-complement integer.

    A zero-length payload decodes to 0. The previous implementation
    computed ``1 << (length * 8 - 1)``, which is a negative shift for
    length 0 and raised ValueError.

    Raises EOFError (from read()) if the stream ends early.
    """
    return int.from_bytes(read(s, length), "big", signed=True)

def read_float(s, length):
    """Read a big-endian IEEE 754 float of *length* bytes from *s*.

    Supported lengths are 4 (single), 8 (double) and 0, which decodes to
    0.0. Any other length raises SyntaxError after the payload has been
    consumed. Raises EOFError (from read()) if the stream ends early.

    Decoding uses struct, so zero, denormals, infinities and NaN are
    handled. The previous manual ldexp() decoding always added the implicit
    leading 1 bit, so those values came out wrong (0.0 became 2**-127).
    """
    t = read(s, length)
    if length == 0:
        return 0.0
    if length == 4:
        return struct.unpack(">f", t)[0]
    if length == 8:
        return struct.unpack(">d", t)[0]
    raise SyntaxError

def parse_one(s, depth, parent, maxlen):
    """Parse one element from *s*, print it, and return its total size.

    s      -- binary stream positioned at the start of an element ID
    depth  -- nesting level, used only for indenting the output
    parent -- unused; kept for interface compatibility
    maxlen -- unused; kept for interface compatibility

    Returns the number of bytes the element occupies (ID + size field +
    payload) as an int. Elements of type "sub" are parsed recursively.

    Raises SyntaxError if the children of a "sub" element overrun its
    declared length, EOFError if the stream ends early, and
    NotImplementedError for a value type this function does not handle.
    """
    elid = hexlify(read_id(s)).decode("ascii")
    elem = elementd.get(elid)
    size, length = read_vint(s)
    # elid is a hex string, two characters per ID byte. Floor division
    # keeps the byte count (and the caller's remaining-length counter) an
    # int; "/" would turn both into floats.
    this_length = len(elid) // 2 + size + length
    if elem is not None:
        if elem.valtype != "skip":
            indent = "    " * depth
            print(f"{indent} [{elid}] {elem.name} size: {length} value:", end=" ")
        if elem.valtype == "sub":
            print("subelements:")
            # Consume children until the declared payload length is used up.
            while length > 0:
                length -= parse_one(s, depth + 1, elem, length)
            if length < 0:
                # The last child extended past the end of this element.
                raise SyntaxError
        elif elem.valtype == "str":
            print("string", repr(read_str(s, length).decode("utf8", "replace")))
        elif elem.valtype in ("binary", "ebml_id"):
            t = read_str(s, length)
            dec = ""
            if elem.valtype == "ebml_id":
                # The payload is itself an element ID; show its name if known.
                idelem = elementd.get(hexlify(t).decode("ascii"))
                if idelem is None:
                    dec = "(UNKNOWN)"
                else:
                    dec = f"({idelem.name})"
            # Short payloads are shown as hex, long ones only by size.
            if len(t) < 20:
                t = hexlify(t).decode("ascii")
            else:
                t = f"<{len(t)} bytes>"
            print("binary", t, dec)
        elif elem.valtype == "uint":
            print("uint", read_uint(s, length))
        elif elem.valtype == "sint":
            print("sint", read_sint(s, length))
        elif elem.valtype == "float":
            print("float", read_float(s, length))
        elif elem.valtype == "skip":
            read(s, length)
        else:
            raise NotImplementedError
    else:
        print("    " * depth, "[" + elid + "] Unknown element! size:", length)
        read(s, length)
    return this_length

if __name__ == "__main__":
    # Usage:
    #   matroska.py --generate-header OUTFILE
    #   matroska.py --generate-definitions OUTFILE
    #   matroska.py FILE      (parse FILE and print its element tree)
    def parse_toplevel(s):
        parse_one(s, 0, None, 1 << 63)

    # All files are opened with "with" so they are flushed and closed
    # deterministically, including when parsing raises.
    if sys.argv[1] == "--generate-header":
        with open(sys.argv[2], "w") as out:
            generate_c_header(out)
    elif sys.argv[1] == "--generate-definitions":
        with open(sys.argv[2], "w") as out:
            generate_c_definitions(out)
    else:
        with open(sys.argv[1], "rb") as s:
            while True:
                start = s.tell()
                try:
                    parse_toplevel(s)
                except EOFError:
                    # EOF exactly at an element boundary is the normal end
                    # of input; anywhere else the file is truncated.
                    if s.tell() != start:
                        raise Exception("Unexpected end of file")
                    break