mpv/TOOLS/matroska.py

488 lines
17 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Generate C definitions for parsing Matroska files.
Can also be used to directly parse Matroska files and display their contents.
"""
import struct
import sys
from binascii import hexlify
from math import ldexp
#
# This file is part of mpv.
#
# mpv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# mpv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with mpv. If not, see <http://www.gnu.org/licenses/>.
#
# Element definition table for the EBML header.
#
# Each string has the form "Name, hexid, type".  A trailing "*" on the name
# marks an element that may occur more than once.  A tuple that directly
# follows an entry lists that entry's child elements (see parse_elems()).
elements_ebml = (
    "EBML, 1a45dfa3, sub", (
        "EBMLVersion, 4286, uint",
        "EBMLReadVersion, 42f7, uint",
        "EBMLMaxIDLength, 42f2, uint",
        "EBMLMaxSizeLength, 42f3, uint",
        "DocType, 4282, str",
        "DocTypeVersion, 4287, uint",
        "DocTypeReadVersion, 4285, uint",
    ),

    "CRC32, bf, binary",
    "Void, ec, binary",
)
# Element definition table for the Matroska Segment tree.
#
# Same format as the EBML table: "Name, hexid, type" strings, a trailing "*"
# on the name for elements that may repeat, and a tuple after an entry that
# holds its children.  Hex IDs are written in mixed case here; parse_elems()
# lowercases them before use.
elements_matroska = (
    "Segment, 18538067, sub", (

        "SeekHead*, 114d9b74, sub", (
            "Seek*, 4dbb, sub", (
                "SeekID, 53ab, ebml_id",
                "SeekPosition, 53ac, uint",
            ),
        ),

        "Info*, 1549a966, sub", (
            "SegmentUID, 73a4, binary",
            "PrevUID, 3cb923, binary",
            "NextUID, 3eb923, binary",
            "TimecodeScale, 2ad7b1, uint",
            "DateUTC, 4461, sint",
            "Title, 7ba9, str",
            "MuxingApp, 4d80, str",
            "WritingApp, 5741, str",
            "Duration, 4489, float",
        ),

        "Cluster*, 1f43b675, sub", (
            "Timecode, e7, uint",
            "BlockGroup*, a0, sub", (
                "Block, a1, binary",
                "BlockDuration, 9b, uint",
                "ReferenceBlock*, fb, sint",
                "DiscardPadding, 75A2, sint",
                "BlockAdditions, 75A1, sub", (
                    "BlockMore*, A6, sub", (
                        "BlockAddID, EE, uint",
                        "BlockAdditional, A5, binary",
                    ),
                ),
            ),
            "SimpleBlock*, a3, binary",
        ),

        "Tracks*, 1654ae6b, sub", (
            "TrackEntry*, ae, sub", (
                "TrackNumber, d7, uint",
                "TrackUID, 73c5, uint",
                "TrackType, 83, uint",
                "FlagEnabled, b9, uint",
                "FlagDefault, 88, uint",
                "FlagForced, 55aa, uint",
                "FlagLacing, 9c, uint",
                "MinCache, 6de7, uint",
                "MaxCache, 6df8, uint",
                "DefaultDuration, 23e383, uint",
                "TrackTimecodeScale, 23314f, float",
                "MaxBlockAdditionID, 55ee, uint",
                "Name, 536e, str",
                "Language, 22b59c, str",
                "LanguageBCP47, 22b59d, str",
                "CodecID, 86, str",
                "CodecPrivate, 63a2, binary",
                "CodecName, 258688, str",
                "CodecDecodeAll, aa, uint",
                "CodecDelay, 56aa, uint",
                "SeekPreRoll, 56bb, uint",
                "Video, e0, sub", (
                    "FlagInterlaced, 9a, uint",
                    "PixelWidth, b0, uint",
                    "PixelHeight, ba, uint",
                    "DisplayWidth, 54b0, uint",
                    "DisplayHeight, 54ba, uint",
                    "DisplayUnit, 54b2, uint",
                    "PixelCropTop, 54bb, uint",
                    "PixelCropLeft, 54cc, uint",
                    "PixelCropRight, 54dd, uint",
                    "PixelCropBottom, 54aa, uint",
                    "FrameRate, 2383e3, float",
                    "ColourSpace, 2eb524, binary",
                    "StereoMode, 53b8, uint",
                    "Colour, 55b0, sub", (
                        "MatrixCoefficients, 55B1, uint",
                        "BitsPerChannel, 55B2, uint",
                        "ChromaSubsamplingHorz, 55B3, uint",
                        "ChromaSubsamplingVert, 55B4, uint",
                        "CbSubsamplingHorz, 55B5, uint",
                        "CbSubsamplingVert, 55B6, uint",
                        "ChromaSitingHorz, 55B7, uint",
                        "ChromaSitingVert, 55B8, uint",
                        "Range, 55B9, uint",
                        "TransferCharacteristics, 55BA, uint",
                        "Primaries, 55BB, uint",
                        "MaxCLL, 55BC, uint",
                        "MaxFALL, 55BD, uint",
                        "MasteringMetadata, 55D0, sub", (
                            "PrimaryRChromaticityX, 55D1, float",
                            "PrimaryRChromaticityY, 55D2, float",
                            "PrimaryGChromaticityX, 55D3, float",
                            "PrimaryGChromaticityY, 55D4, float",
                            "PrimaryBChromaticityX, 55D5, float",
                            "PrimaryBChromaticityY, 55D6, float",
                            "WhitePointChromaticityX, 55D7, float",
                            "WhitePointChromaticityY, 55D8, float",
                            "LuminanceMax, 55D9, float",
                            "LuminanceMin, 55DA, float",
                        ),
                    ),
                    "Projection, 7670, sub", (
                        "ProjectionType, 7671, uint",
                        "ProjectionPrivate, 7672, binary",
                        "ProjectionPoseYaw, 7673, float",
                        "ProjectionPosePitch, 7674, float",
                        "ProjectionPoseRoll, 7675, float",
                    ),
                ),
                "Audio, e1, sub", (
                    "SamplingFrequency, b5, float",
                    "OutputSamplingFrequency, 78b5, float",
                    "Channels, 9f, uint",
                    "BitDepth, 6264, uint",
                ),
                "ContentEncodings, 6d80, sub", (
                    "ContentEncoding*, 6240, sub", (
                        "ContentEncodingOrder, 5031, uint",
                        "ContentEncodingScope, 5032, uint",
                        "ContentEncodingType, 5033, uint",
                        "ContentCompression, 5034, sub", (
                            "ContentCompAlgo, 4254, uint",
                            "ContentCompSettings, 4255, binary",
                        ),
                    ),
                ),
                "BlockAdditionMapping*, 41e4, sub", (
                    "BlockAddIDValue, 41f0, uint",
                    "BlockAddIDName, 41a4, str",
                    "BlockAddIDType, 41e7, uint",
                    "BlockAddIDExtraData, 41ed, binary",
                ),
            ),
        ),

        "Cues, 1c53bb6b, sub", (
            "CuePoint*, bb, sub", (
                "CueTime, b3, uint",
                "CueTrackPositions*, b7, sub", (
                    "CueTrack, f7, uint",
                    "CueClusterPosition, f1, uint",
                    "CueRelativePosition, f0, uint",
                    "CueDuration, b2, uint",
                ),
            ),
        ),

        "Attachments, 1941a469, sub", (
            "AttachedFile*, 61a7, sub", (
                "FileDescription, 467e, str",
                "FileName, 466e, str",
                "FileMimeType, 4660, str",
                "FileData, 465c, binary",
                "FileUID, 46ae, uint",
            ),
        ),

        "Chapters, 1043a770, sub", (
            "EditionEntry*, 45b9, sub", (
                "EditionUID, 45bc, uint",
                "EditionFlagHidden, 45bd, uint",
                "EditionFlagDefault, 45db, uint",
                "EditionFlagOrdered, 45dd, uint",
                "ChapterAtom*, b6, sub", (
                    "ChapterUID, 73c4, uint",
                    "ChapterTimeStart, 91, uint",
                    "ChapterTimeEnd, 92, uint",
                    "ChapterFlagHidden, 98, uint",
                    "ChapterFlagEnabled, 4598, uint",
                    "ChapterSegmentUID, 6e67, binary",
                    "ChapterSegmentEditionUID, 6ebc, uint",
                    "ChapterDisplay*, 80, sub", (
                        "ChapString, 85, str",
                        "ChapLanguage*, 437c, str",
                        "ChapLanguageBCP47*, 437d, str",
                        "ChapCountry*, 437e, str",
                    ),
                ),
            ),
        ),

        "Tags*, 1254c367, sub", (
            "Tag*, 7373, sub", (
                "Targets, 63c0, sub", (
                    "TargetTypeValue, 68ca, uint",
                    "TargetType, 63ca, str",
                    "TargetTrackUID, 63c5, uint",
                    "TargetEditionUID, 63c9, uint",
                    "TargetChapterUID, 63c4, uint",
                    "TargetAttachmentUID, 63c6, uint",
                ),
                "SimpleTag*, 67c8, sub", (
                    "TagName, 45a3, str",
                    "TagLanguage, 447a, str",
                    "TagLanguageBCP47, 447b, str",
                    "TagString, 4487, str",
                    "TagDefault, 4484, uint",
                ),
            ),
        ),
    ),
)
def byte2num(s):
    """Interpret the bytes-like object *s* as a big-endian unsigned integer.

    An empty input yields 0.  The previous hexlify-based conversion raised
    ValueError on empty input, which made zero-length integer payloads
    (allowed by EBML and meaning the value 0) crash the parser.
    """
    return int.from_bytes(s, "big")
def camelcase_to_words(name):
    """Convert a CamelCase identifier to lowercase words joined by "_".

    A new word starts at every uppercase letter that either follows a
    lowercase letter or is followed by one, so acronym runs stay together
    ("EBMLVersion" -> "ebml_version", "TrackUID" -> "track_uid").
    """
    cuts = [0]
    for pos in range(1, len(name)):
        if not name[pos].isupper():
            continue
        follows_lower = name[pos - 1].islower()
        # Slicing (instead of indexing) yields "" past the end of the string,
        # and "".islower() is False, so the last character needs no special
        # case.
        precedes_lower = name[pos + 1:pos + 2].islower()
        if follows_lower or precedes_lower:
            cuts.append(pos)
    cuts.append(len(name))
    words = [name[begin:end] for begin, end in zip(cuts, cuts[1:])]
    return "_".join(words).lower()
class MatroskaElement:
    """One element definition together with the C identifiers derived from it.

    Attributes set by the constructor:
      name        -- element name as given
      definename  -- "<namespace>_ID_<NAME IN UPPERCASE>"
      fieldname   -- camelcase_to_words(name)
      structname  -- "ebml_" + fieldname
      elid        -- element ID as given
      valtype     -- value type keyword as given
      ebmltype    -- name of the matching EBML_TYPE_* constant
      valname     -- C type used to hold the value
      subelements -- empty tuple until add_subelements() is called
    """

    def __init__(self, name, elid, valtype, namespace):
        """Derive all C-side names for the element.

        Raises SyntaxError when *valtype* is neither "sub" nor one of the
        known scalar types.
        """
        self.name = name
        self.definename = f"{namespace}_ID_{name.upper()}"
        field = camelcase_to_words(name)
        self.fieldname = field
        self.structname = "ebml_" + field
        self.elid = elid
        self.valtype = valtype

        if valtype != "sub":
            self.ebmltype = "EBML_TYPE_" + valtype.upper()
            c_types = {
                "uint": "uint64_t",
                "str": "char *",
                "binary": "bstr",
                "ebml_id": "uint32_t",
                "float": "double",
                "sint": "int64_t",
            }
            try:
                self.valname = c_types[valtype]
            except KeyError:
                raise SyntaxError("Unrecognized value type " + valtype)
        else:
            # Container elements are stored as a struct of their own.
            self.ebmltype = "EBML_TYPE_SUBELEMENTS"
            self.valname = "struct " + self.structname

        self.subelements = ()

    def add_subelements(self, subelements):
        """Attach the child list and record the set of child IDs.

        *subelements* is a sequence of pairs whose first item is a
        MatroskaElement; the IDs of those elements are stored in self.subids.
        """
        self.subelements = subelements
        self.subids = {pair[0].elid for pair in subelements}
# Every parsed element, keyed by its lowercased hex ID string
# (filled in by parse_elems()).
elementd = {}
# The same elements in the order parse_elems() created them; a container
# element is appended before its children.
elementlist = []
def parse_elems(elements, namespace):
    """Turn one level of a definition table into MatroskaElement objects.

    *elements* is a sequence mixing "Name, hexid, type" strings with nested
    sequences; a nested sequence holds the children of the element string
    that precedes it.  A trailing "*" on the name marks the element as
    allowed to occur multiple times.  Every created element is also
    registered in the module-level elementd (by lowercased hex ID) and
    elementlist.

    Returns a list of (element, multiple) pairs for this level.

    Raises SyntaxError if a nested sequence appears before any element
    string (previously this surfaced as an UnboundLocalError), or if an
    element uses an unknown value type (raised by MatroskaElement).
    """
    subelements = []
    new = None  # most recently created element at this level
    for el in elements:
        if isinstance(el, str):
            name, hexid, eltype = (x.strip() for x in el.split(","))
            hexid = hexid.lower()
            multiple = name.endswith("*")
            name = name.strip("*")
            new = MatroskaElement(name, hexid, eltype, namespace)
            elementd[hexid] = new
            elementlist.append(new)
            subelements.append((new, multiple))
        else:
            if new is None:
                raise SyntaxError(
                    "Subelement list without a preceding parent element")
            new.add_subelements(parse_elems(el, namespace))
    return subelements
# Fill elementd/elementlist from both definition tables when the module is
# loaded.  The second argument becomes the prefix of each element's
# definename ("EBML_ID_..." / "MATROSKA_ID_...").
parse_elems(elements_ebml, "EBML")
parse_elems(elements_matroska, "MATROSKA")
def printf(out, *args):
    """Write *args* to *out* as one line.

    The arguments are converted with str(), separated by single spaces and
    terminated by a newline; with no arguments an empty line is written.
    """
    out.write(" ".join(map(str, args)) + "\n")
def generate_c_header(out):
    """Write the generated C header to the text stream *out*.

    Output, in order: one ID #define per element; one struct per container
    element (walking elementlist backwards) with a value field and an
    "n_" counter per child; an extern ebml_elem_desc declaration per
    container element; and MAX_EBML_SUBELEMENTS, the largest child count.
    """
    containers = [el for el in elementlist if el.subelements]

    printf(out, "// Generated by TOOLS/matroska.py, do not edit manually")
    printf(out)

    for element in elementlist:
        printf(out, f"#define {element.definename:40} 0x{element.elid}")
    printf(out)

    for parent in reversed(containers):
        printf(out)
        printf(out, f"struct {parent.structname} {{")
        # Pad every C type to a common width so the field names line up.
        width = 1 + max(len(child.valname) for child, _ in parent.subelements)
        for child, multiple in parent.subelements:
            # Repeatable children are emitted with a "*" before the name.
            star = "*" if multiple else " "
            printf(out, f" {child.valname:{width}} {star}{child.fieldname};")
        printf(out)
        for child, _ in parent.subelements:
            printf(out, f" int n_{child.fieldname};")
        printf(out, "};")

    for parent in containers:
        printf(out, f"extern const struct ebml_elem_desc {parent.structname}_desc;")
    printf(out)

    largest = max(len(element.subelements) for element in elementlist)
    printf(out, "#define MAX_EBML_SUBELEMENTS", largest)
def generate_c_definitions(out):
    """Write the generated C element descriptions to the text stream *out*.

    Walks elementlist backwards.  A container element produces a
    "#define N" / E_S(...) / F(...) per child / "#undef N" group; any other
    element produces a single E(...) line.
    """
    printf(out, "// Generated by TOOLS/matroska.py, do not edit manually")
    printf(out)
    for element in reversed(elementlist):
        printf(out)
        children = element.subelements
        if not children:
            printf(out, f'E("{element.name}", {element.fieldname}, {element.ebmltype})')
            continue
        printf(out, "#define N", element.fieldname)
        printf(out, f'E_S("{element.name}", {len(children)})')
        for child, multiple in children:
            # The third macro argument is 1 for repeatable children, else 0.
            printf(out, f"F({child.definename}, {child.fieldname}, {int(multiple)})")
        printf(out, "}};")
        printf(out, "#undef N")
def read(s, length):
    """Read exactly *length* bytes from stream *s*.

    Raises EOFError if the stream returns a different number of bytes.
    """
    data = s.read(length)
    if len(data) == length:
        return data
    raise EOFError
def read_id(s):
    """Read one element ID from *s* and return its raw bytes.

    The number of leading zero bits in the first byte gives the number of
    additional bytes that belong to the ID.  The length-marker bit is kept,
    so the returned bytes are the ID exactly as stored.

    Raises SyntaxError if the first byte is zero, and EOFError (from read())
    if the stream ends early.
    """
    head = read(s, 1)
    first = head[0]
    if first == 0:
        raise SyntaxError
    # Leading zero bits of an 8-bit value == 8 - bit_length.
    extra = 8 - first.bit_length()
    return head + read(s, extra)
def read_vint(s):
    """Read one variable-length integer from *s*.

    The number of leading zero bits in the first byte gives the number of
    additional bytes.  The length-marker bit and the zero bits before it are
    masked off; the remaining bits, followed by the additional bytes, form a
    big-endian value.

    Returns (number of bytes consumed, decoded value).

    Raises SyntaxError if the first byte is zero, and EOFError (from read())
    if the stream ends early.
    """
    first = read(s, 1)[0]
    if first == 0:
        raise SyntaxError
    extra = 8 - first.bit_length()
    marker = 0x80 >> extra
    payload = bytes((first & (marker - 1),)) + read(s, extra)
    return extra + 1, byte2num(payload)
def read_str(s, length):
    """Read *length* bytes from *s* and return them undecoded.

    Raises EOFError (from read()) if the stream ends early.
    """
    raw = read(s, length)
    return raw
def read_uint(s, length):
    """Read *length* bytes from *s* as a big-endian unsigned integer.

    Raises EOFError (from read()) if the stream ends early.
    """
    return byte2num(read(s, length))
def read_sint(s, length):
    """Read *length* bytes from *s* as a big-endian two's-complement integer.

    A zero-length payload decodes to 0.  The previous hand-written sign
    handling computed ``1 << (length * 8 - 1)``, which raises ValueError
    for length 0 even though EBML permits zero-length signed integers.

    Raises EOFError (from read()) if the stream ends early.
    """
    return int.from_bytes(read(s, length), "big", signed=True)
def read_float(s, length):
    """Read a big-endian IEEE 754 float of *length* bytes from *s*.

    Supported lengths are 4 (single precision), 8 (double precision) and
    0, which EBML defines as positive zero.  Decoding is done by the struct
    module, so zero, subnormals, infinities and NaN come out correctly; the
    previous manual ldexp() decoding always added the implicit mantissa bit
    and therefore reported e.g. 0.0 as a tiny non-zero number.

    Raises SyntaxError for any other length (after consuming the payload),
    and EOFError (from read()) if the stream ends early.
    """
    t = read(s, length)
    if length == 0:
        return 0.0
    if length == 4:
        return struct.unpack(">f", t)[0]
    if length == 8:
        return struct.unpack(">d", t)[0]
    raise SyntaxError
def parse_one(s, depth, parent, maxlen):
    """Parse one element from *s*, print it, and recurse into containers.

    Parameters:
      s      -- binary stream positioned at the start of an element ID
      depth  -- nesting level, used only to indent the printed output
      parent -- enclosing element (not used by this function)
      maxlen -- bytes remaining in the enclosing element (not used by this
                function)

    Returns the total number of bytes the element occupies (ID bytes + size
    field bytes + payload bytes) as an int.

    Raises SyntaxError when the children of a container overrun its declared
    size (or from the low-level readers), NotImplementedError for a value
    type without a printer, and EOFError when the stream ends early.
    """
    elid = hexlify(read_id(s)).decode("ascii")
    elem = elementd.get(elid)
    size, length = read_vint(s)
    # elid is a hex string, two characters per ID byte.  Integer division
    # keeps the byte count an int; "/" would turn this and every length
    # derived from it into a float.
    this_length = len(elid) // 2 + size + length
    if elem is not None:
        if elem.valtype != "skip":
            indent = " " * depth
            print(f"{indent} [{elid}] {elem.name} size: {length} value:", end=" ")
        if elem.valtype == "sub":
            print("subelements:")
            # Consume children until the declared payload size is used up.
            while length > 0:
                length -= parse_one(s, depth + 1, elem, length)
            if length < 0:
                # The last child extended past the end of this container.
                raise SyntaxError
        elif elem.valtype == "str":
            print("string", repr(read_str(s, length).decode("utf8", "replace")))
        elif elem.valtype in ("binary", "ebml_id"):
            t = read_str(s, length)
            dec = ""
            if elem.valtype == "ebml_id":
                # The payload is itself an element ID; show its name if known.
                idelem = elementd.get(hexlify(t).decode("ascii"))
                if idelem is None:
                    dec = "(UNKNOWN)"
                else:
                    dec = f"({idelem.name})"
            # Short payloads are shown as hex, long ones only by size.
            if len(t) < 20:
                t = hexlify(t).decode("ascii")
            else:
                t = f"<{len(t)} bytes>"
            print("binary", t, dec)
        elif elem.valtype == "uint":
            print("uint", read_uint(s, length))
        elif elem.valtype == "sint":
            print("sint", read_sint(s, length))
        elif elem.valtype == "float":
            print("float", read_float(s, length))
        elif elem.valtype == "skip":
            read(s, length)
        else:
            raise NotImplementedError
    else:
        print(" " * depth, "[" + elid + "] Unknown element! size:", length)
        # Skip the payload so parsing can continue with the next element.
        read(s, length)
    return this_length
if __name__ == "__main__":
    def parse_toplevel(s):
        """Parse one top-level element (and everything nested in it)."""
        parse_one(s, 0, None, 1 << 63)

    # Modes that write generated C code to the file named by the next
    # argument.
    generators = {
        "--generate-header": generate_c_header,
        "--generate-definitions": generate_c_definitions,
    }
    args = sys.argv[1:]
    if not args or (args[0] in generators and len(args) < 2):
        sys.exit(
            f"usage: {sys.argv[0]} "
            "(--generate-header OUTFILE | --generate-definitions OUTFILE"
            " | MATROSKA_FILE)"
        )

    if args[0] in generators:
        # "with" guarantees the generated file is flushed and closed.
        with open(args[1], "w") as out:
            generators[args[0]](out)
    else:
        with open(args[0], "rb") as s:
            while True:
                start = s.tell()
                try:
                    parse_toplevel(s)
                except EOFError:
                    # EOF exactly on an element boundary is the normal end
                    # of the file; anywhere else the file is truncated.
                    if s.tell() != start:
                        raise Exception("Unexpected end of file")
                    break