#!/usr/bin/env python3
"""
Generate C definitions for parsing Matroska files.
Can also be used to directly parse Matroska files and display their contents.
"""

import struct
import sys
from binascii import hexlify
from math import ldexp

#
# This file is part of mpv.
#
# mpv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# mpv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with mpv.  If not, see <https://www.gnu.org/licenses/>.
#

# Element tables.  Each string is "Name[*], hex id, value type"; a trailing
# "*" on the name marks an element that may occur multiple times.  A tuple
# that directly follows a string lists the subelements of that element.

elements_ebml = (
    "EBML, 1a45dfa3, sub", (
        "EBMLVersion, 4286, uint",
        "EBMLReadVersion, 42f7, uint",
        "EBMLMaxIDLength, 42f2, uint",
        "EBMLMaxSizeLength, 42f3, uint",
        "DocType, 4282, str",
        "DocTypeVersion, 4287, uint",
        "DocTypeReadVersion, 4285, uint",
    ),

    "CRC32, bf, binary",
    "Void, ec, binary",
)

elements_matroska = (
    "Segment, 18538067, sub", (

        "SeekHead*, 114d9b74, sub", (
            "Seek*, 4dbb, sub", (
                "SeekID, 53ab, ebml_id",
                "SeekPosition, 53ac, uint",
            ),
        ),

        "Info*, 1549a966, sub", (
            "SegmentUID, 73a4, binary",
            "PrevUID, 3cb923, binary",
            "NextUID, 3eb923, binary",
            "TimecodeScale, 2ad7b1, uint",
            "DateUTC, 4461, sint",
            "Title, 7ba9, str",
            "MuxingApp, 4d80, str",
            "WritingApp, 5741, str",
            "Duration, 4489, float",
        ),

        "Cluster*, 1f43b675, sub", (
            "Timecode, e7, uint",
            "BlockGroup*, a0, sub", (
                "Block, a1, binary",
                "BlockDuration, 9b, uint",
                "ReferenceBlock*, fb, sint",
                "DiscardPadding, 75A2, sint",
                "BlockAdditions, 75A1, sub", (
                    "BlockMore*, A6, sub", (
                        "BlockAddID, EE, uint",
                        "BlockAdditional, A5, binary",
                    ),
                ),
            ),
            "SimpleBlock*, a3, binary",
        ),

        "Tracks*, 1654ae6b, sub", (
            "TrackEntry*, ae, sub", (
                "TrackNumber, d7, uint",
                "TrackUID, 73c5, uint",
                "TrackType, 83, uint",
                "FlagEnabled, b9, uint",
                "FlagDefault, 88, uint",
                "FlagForced, 55aa, uint",
                "FlagLacing, 9c, uint",
                "MinCache, 6de7, uint",
                "MaxCache, 6df8, uint",
                "DefaultDuration, 23e383, uint",
                "TrackTimecodeScale, 23314f, float",
                "MaxBlockAdditionID, 55ee, uint",
                "Name, 536e, str",
                "Language, 22b59c, str",
                "LanguageBCP47, 22b59d, str",
                "CodecID, 86, str",
                "CodecPrivate, 63a2, binary",
                "CodecName, 258688, str",
                "CodecDecodeAll, aa, uint",
                "CodecDelay, 56aa, uint",
                "SeekPreRoll, 56bb, uint",
                "Video, e0, sub", (
                    "FlagInterlaced, 9a, uint",
                    "PixelWidth, b0, uint",
                    "PixelHeight, ba, uint",
                    "DisplayWidth, 54b0, uint",
                    "DisplayHeight, 54ba, uint",
                    "DisplayUnit, 54b2, uint",
                    "PixelCropTop, 54bb, uint",
                    "PixelCropLeft, 54cc, uint",
                    "PixelCropRight, 54dd, uint",
                    "PixelCropBottom, 54aa, uint",
                    "FrameRate, 2383e3, float",
                    "ColourSpace, 2eb524, binary",
                    "StereoMode, 53b8, uint",
                    "Colour, 55b0, sub", (
                        "MatrixCoefficients, 55B1, uint",
                        "BitsPerChannel, 55B2, uint",
                        "ChromaSubsamplingHorz, 55B3, uint",
                        "ChromaSubsamplingVert, 55B4, uint",
                        "CbSubsamplingHorz, 55B5, uint",
                        "CbSubsamplingVert, 55B6, uint",
                        "ChromaSitingHorz, 55B7, uint",
                        "ChromaSitingVert, 55B8, uint",
                        "Range, 55B9, uint",
                        "TransferCharacteristics, 55BA, uint",
                        "Primaries, 55BB, uint",
                        "MaxCLL, 55BC, uint",
                        "MaxFALL, 55BD, uint",
                        "MasteringMetadata, 55D0, sub", (
                            "PrimaryRChromaticityX, 55D1, float",
                            "PrimaryRChromaticityY, 55D2, float",
                            "PrimaryGChromaticityX, 55D3, float",
                            "PrimaryGChromaticityY, 55D4, float",
                            "PrimaryBChromaticityX, 55D5, float",
                            "PrimaryBChromaticityY, 55D6, float",
                            "WhitePointChromaticityX, 55D7, float",
                            "WhitePointChromaticityY, 55D8, float",
                            "LuminanceMax, 55D9, float",
                            "LuminanceMin, 55DA, float",
                        ),
                    ),
                    "Projection, 7670, sub", (
                        "ProjectionType, 7671, uint",
                        "ProjectionPrivate, 7672, binary",
                        "ProjectionPoseYaw, 7673, float",
                        "ProjectionPosePitch, 7674, float",
                        "ProjectionPoseRoll, 7675, float",
                    ),
                ),
                "Audio, e1, sub", (
                    "SamplingFrequency, b5, float",
                    "OutputSamplingFrequency, 78b5, float",
                    "Channels, 9f, uint",
                    "BitDepth, 6264, uint",
                ),
                "ContentEncodings, 6d80, sub", (
                    "ContentEncoding*, 6240, sub", (
                        "ContentEncodingOrder, 5031, uint",
                        "ContentEncodingScope, 5032, uint",
                        "ContentEncodingType, 5033, uint",
                        "ContentCompression, 5034, sub", (
                            "ContentCompAlgo, 4254, uint",
                            "ContentCompSettings, 4255, binary",
                        ),
                    ),
                ),
                "BlockAdditionMapping*, 41e4, sub", (
                    "BlockAddIDValue, 41f0, uint",
                    "BlockAddIDName, 41a4, str",
                    "BlockAddIDType, 41e7, uint",
                    "BlockAddIDExtraData, 41ed, binary",
                ),
            ),
        ),

        "Cues, 1c53bb6b, sub", (
            "CuePoint*, bb, sub", (
                "CueTime, b3, uint",
                "CueTrackPositions*, b7, sub", (
                    "CueTrack, f7, uint",
                    "CueClusterPosition, f1, uint",
                    "CueRelativePosition, f0, uint",
                    "CueDuration, b2, uint",
                ),
            ),
        ),

        "Attachments, 1941a469, sub", (
            "AttachedFile*, 61a7, sub", (
                "FileDescription, 467e, str",
                "FileName, 466e, str",
                "FileMimeType, 4660, str",
                "FileData, 465c, binary",
                "FileUID, 46ae, uint",
            ),
        ),

        "Chapters, 1043a770, sub", (
            "EditionEntry*, 45b9, sub", (
                "EditionUID, 45bc, uint",
                "EditionFlagHidden, 45bd, uint",
                "EditionFlagDefault, 45db, uint",
                "EditionFlagOrdered, 45dd, uint",
                "ChapterAtom*, b6, sub", (
                    "ChapterUID, 73c4, uint",
                    "ChapterTimeStart, 91, uint",
                    "ChapterTimeEnd, 92, uint",
                    "ChapterFlagHidden, 98, uint",
                    "ChapterFlagEnabled, 4598, uint",
                    "ChapterSegmentUID, 6e67, binary",
                    "ChapterSegmentEditionUID, 6ebc, uint",
                    "ChapterDisplay*, 80, sub", (
                        "ChapString, 85, str",
                        "ChapLanguage*, 437c, str",
                        "ChapLanguageBCP47*, 437d, str",
                        "ChapCountry*, 437e, str",
                    ),
                ),
            ),
        ),

        "Tags*, 1254c367, sub", (
            "Tag*, 7373, sub", (
                "Targets, 63c0, sub", (
                    "TargetTypeValue, 68ca, uint",
                    "TargetType, 63ca, str",
                    "TargetTrackUID, 63c5, uint",
                    "TargetEditionUID, 63c9, uint",
                    "TargetChapterUID, 63c4, uint",
                    "TargetAttachmentUID, 63c6, uint",
                ),
                "SimpleTag*, 67c8, sub", (
                    "TagName, 45a3, str",
                    "TagLanguage, 447a, str",
                    "TagLanguageBCP47, 447b, str",
                    "TagString, 4487, str",
                    "TagDefault, 4484, uint",
                ),
            ),
        ),
    ),
)


def byte2num(s):
    """Interpret the bytes *s* as a big-endian unsigned integer.

    An empty byte string yields 0, so zero-length integer elements do not
    raise.
    """
    return int.from_bytes(s, "big")


def camelcase_to_words(name):
    """Convert a CamelCase element name to lower-case words joined by "_".

    A new word starts at an upper-case letter that either follows a
    lower-case letter or is itself followed by a lower-case letter, so runs
    of capitals stay together ("EBMLVersion" -> "ebml_version").
    """
    parts = []
    start = 0
    for i in range(1, len(name)):
        if name[i].isupper() and (name[i-1].islower() or
                                  name[i+1:i+2].islower()):
            parts.append(name[start:i])
            start = i
    parts.append(name[start:])
    return "_".join(parts).lower()


class MatroskaElement:
    """Description of one EBML/Matroska element taken from the tables above.

    Besides the raw table fields (name, hex id, value type) it stores the
    identifiers used by the generated C code: the ``#define`` name, the
    struct field name, the struct name, the ``EBML_TYPE_*`` constant and the
    C type of the value.
    """

    # C type used for each non-"sub" value type.
    _C_TYPES = {
        "uint": "uint64_t",
        "str": "char *",
        "binary": "bstr",
        "ebml_id": "uint32_t",
        "float": "double",
        "sint": "int64_t",
    }

    def __init__(self, name, elid, valtype, namespace):
        """Build the element; raises SyntaxError for an unknown *valtype*."""
        self.name = name
        self.definename = f"{namespace}_ID_{name.upper()}"
        self.fieldname = camelcase_to_words(name)
        self.structname = "ebml_" + self.fieldname
        self.elid = elid
        self.valtype = valtype
        if valtype == "sub":
            self.ebmltype = "EBML_TYPE_SUBELEMENTS"
            self.valname = "struct " + self.structname
        else:
            self.ebmltype = "EBML_TYPE_" + valtype.upper()
            try:
                self.valname = self._C_TYPES[valtype]
            except KeyError:
                raise SyntaxError("Unrecognized value type " + valtype) from None
        self.subelements = ()
        # Always present so that leaf elements can be queried as well.
        self.subids = set()

    def add_subelements(self, subelements):
        """Attach a list of ``(element, multiple)`` pairs as children."""
        self.subelements = subelements
        self.subids = {x[0].elid for x in subelements}


# Lower-case hex id -> MatroskaElement, for every element in both tables.
elementd = {}
# All elements in table order (parents before their children).
elementlist = []


def parse_elems(elements, namespace):
    """Turn one nesting level of an element table into MatroskaElements.

    Every created element is also registered in ``elementd`` and
    ``elementlist``.  A nested tuple is parsed recursively and attached to
    the element described by the string immediately preceding it.

    Returns the list of ``(element, multiple)`` pairs of this level.
    """
    subelements = []
    new = None
    for el in elements:
        if isinstance(el, str):
            name, hexid, eltype = (x.strip() for x in el.split(","))
            hexid = hexid.lower()
            multiple = name.endswith("*")
            name = name.strip("*")
            new = MatroskaElement(name, hexid, eltype, namespace)
            elementd[hexid] = new
            elementlist.append(new)
            subelements.append((new, multiple))
        else:
            if new is None:
                raise SyntaxError("Subelement list without a parent element")
            new.add_subelements(parse_elems(el, namespace))
    return subelements


parse_elems(elements_ebml, "EBML")
parse_elems(elements_matroska, "MATROSKA")


def printf(out, *args):
    """Write *args* to *out*, separated by spaces and followed by a newline."""
    out.write(" ".join(str(x) for x in args))
    out.write("\n")


def generate_c_header(out):
    """Write the C header (ID defines, structs, descriptor externs) to *out*."""
    printf(out, "// Generated by TOOLS/matroska.py, do not edit manually")
    printf(out)

    for el in elementlist:
        printf(out, f"#define {el.definename:40} 0x{el.elid}")

    printf(out)

    # Reversed table order puts children before the parents that embed them.
    for el in reversed(elementlist):
        if not el.subelements:
            continue
        printf(out)
        printf(out, f"struct {el.structname} {{")
        length = max(len(subel.valname) for subel, _ in el.subelements) + 1
        for subel, multiple in el.subelements:
            star = "*" if multiple else " "
            printf(out, f" {subel.valname:{length}} {star}{subel.fieldname};")
        printf(out)
        for subel, _ in el.subelements:
            printf(out, f" int n_{subel.fieldname};")
        printf(out, "};")

    for el in elementlist:
        if not el.subelements:
            continue
        printf(out, f"extern const struct ebml_elem_desc {el.structname}_desc;")

    printf(out)
    printf(out, "#define MAX_EBML_SUBELEMENTS",
           max(len(el.subelements) for el in elementlist))


def generate_c_definitions(out):
    """Write the element descriptor definitions (E/E_S/F macro calls) to *out*."""
    printf(out, "// Generated by TOOLS/matroska.py, do not edit manually")
    printf(out)
    for el in reversed(elementlist):
        printf(out)
        if el.subelements:
            printf(out, "#define N", el.fieldname)
            printf(out, f'E_S("{el.name}", {len(el.subelements)})')
            for subel, multiple in el.subelements:
                msg = f"F({subel.definename}, {subel.fieldname}, {int(multiple)})"
                printf(out, msg)
            printf(out, "}};")
            printf(out, "#undef N")
        else:
            printf(out, f'E("{el.name}", {el.fieldname}, {el.ebmltype})')


def read(s, length):
    """Read exactly *length* bytes from *s*; raise EOFError on a short read."""
    t = s.read(length)
    if len(t) != length:
        raise EOFError
    return t


def _extra_octets(first):
    """Return ``(extra, mask)`` for the first byte of an ID or vint.

    *extra* is the number of leading zero bits (= number of bytes that
    follow) and *mask* is the first set bit.  A zero first byte raises
    SyntaxError.
    """
    value = first[0]
    if value == 0:
        raise SyntaxError
    extra = 0
    mask = 128
    while not value & mask:
        extra += 1
        mask >>= 1
    return extra, mask


def read_id(s):
    """Read an element ID from *s* and return its raw bytes (marker bit kept)."""
    t = read(s, 1)
    extra, _ = _extra_octets(t)
    return t + read(s, extra)


def read_vint(s):
    """Read a variable-length integer from *s*.

    Returns ``(encoded size in bytes, value)``; the length marker bit is
    cleared from the value.
    """
    t = read(s, 1)
    extra, mask = _extra_octets(t)
    t = bytes((t[0] & (mask - 1),)) + read(s, extra)
    return extra + 1, byte2num(t)


def read_str(s, length):
    """Read *length* raw bytes from *s* (the caller decodes them if needed)."""
    return read(s, length)


def read_uint(s, length):
    """Read a big-endian unsigned integer of *length* bytes from *s*."""
    return byte2num(read(s, length))


def read_sint(s, length):
    """Read a big-endian two's-complement integer of *length* bytes from *s*.

    A zero-length integer is 0.
    """
    return int.from_bytes(read(s, length), "big", signed=True)


def read_float(s, length):
    """Read a big-endian IEEE 754 float of 4 or 8 bytes from *s*.

    A zero-length float is 0.0; any other length raises SyntaxError.
    Decoding goes through ``struct`` so that zero, subnormals, infinities
    and NaN are handled correctly.
    """
    if length == 0:
        return 0.0
    if length == 4:
        fmt = ">f"
    elif length == 8:
        fmt = ">d"
    else:
        raise SyntaxError
    return struct.unpack(fmt, read(s, length))[0]


def parse_one(s, depth, parent, maxlen):
    """Parse one element from *s*, print it, and recurse into subelements.

    *depth* controls the indentation of the printed line.  *parent* and
    *maxlen* are accepted for the recursive call signature but are not used.

    Returns the total number of bytes the element occupies (ID + size field
    + payload) as an int.  Raises SyntaxError when the subelements of a
    master element overrun its declared size, and EOFError on a short read.
    """
    elid = hexlify(read_id(s)).decode("ascii")
    elem = elementd.get(elid)
    size, length = read_vint(s)
    # elid is a hex string, two characters per byte.  Integer division keeps
    # the byte count (and the callers' remaining-length counters) exact.
    this_length = len(elid) // 2 + size + length
    if elem is not None:
        if elem.valtype != "skip":
            indent = " " * depth
            print(f"{indent} [{elid}] {elem.name} size: {length} value:", end=" ")
        if elem.valtype == "sub":
            print("subelements:")
            while length > 0:
                length -= parse_one(s, depth + 1, elem, length)
            if length < 0:
                raise SyntaxError
        elif elem.valtype == "str":
            print("string", repr(read_str(s, length).decode("utf8", "replace")))
        elif elem.valtype in ("binary", "ebml_id"):
            t = read_str(s, length)
            dec = ""
            if elem.valtype == "ebml_id":
                idelem = elementd.get(hexlify(t).decode("ascii"))
                if idelem is None:
                    dec = "(UNKNOWN)"
                else:
                    dec = f"({idelem.name})"
            # Short payloads are shown in hex, long ones only by size.
            if len(t) < 20:
                t = hexlify(t).decode("ascii")
            else:
                t = f"<{len(t)} bytes>"
            print("binary", t, dec)
        elif elem.valtype == "uint":
            print("uint", read_uint(s, length))
        elif elem.valtype == "sint":
            print("sint", read_sint(s, length))
        elif elem.valtype == "float":
            print("float", read_float(s, length))
        elif elem.valtype == "skip":
            read(s, length)
        else:
            raise NotImplementedError
    else:
        print(" " * depth, "[" + elid + "] Unknown element! size:", length)
        read(s, length)
    return this_length


def parse_toplevel(s):
    """Parse and print one top-level element from *s*."""
    parse_one(s, 0, None, 1 << 63)


def main(argv):
    """Run the tool with the command line *argv*; returns the exit status.

    ``--generate-header FILE`` and ``--generate-definitions FILE`` write the
    generated C code to FILE; any other first argument is taken as the path
    of a Matroska file to dump.
    """
    generators = {
        "--generate-header": generate_c_header,
        "--generate-definitions": generate_c_definitions,
    }
    mode = argv[1] if len(argv) > 1 else None
    if mode is None or (mode in generators and len(argv) < 3):
        prog = argv[0] if argv else "matroska.py"
        sys.stderr.write(
            f"usage: {prog} --generate-header OUTFILE\n"
            f"       {prog} --generate-definitions OUTFILE\n"
            f"       {prog} MATROSKA_FILE\n")
        return 1

    if mode in generators:
        with open(argv[2], "w") as out:
            generators[mode](out)
        return 0

    with open(mode, "rb") as s:
        while True:
            start = s.tell()
            try:
                parse_toplevel(s)
            except EOFError:
                # EOF exactly at an element boundary is the normal end of
                # the file; anywhere else the file is truncated.
                if s.tell() != start:
                    raise Exception("Unexpected end of file")
                break
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))