1
0
mirror of https://github.com/mpv-player/mpv synced 2025-01-07 07:30:09 +00:00
mpv/TOOLS/matroska.py
wm4 f8cc184134 TOOLS: change license of some scripts involved in build to LGPL
wscript calls them directly, and thus are probably part of the build
system. They seem to be fully covered by relicensing agreements.
2017-06-24 10:08:33 +02:00

471 lines
16 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Generate C definitions for parsing Matroska files.
Can also be used to directly parse Matroska files and display their contents.
"""
#
# This file is part of mpv.
#
# mpv is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# mpv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with mpv. If not, see <http://www.gnu.org/licenses/>.
#
# for compatibility with Python 2.x
from __future__ import print_function
elements_ebml = (
'EBML, 1a45dfa3, sub', (
'EBMLVersion, 4286, uint',
'EBMLReadVersion, 42f7, uint',
'EBMLMaxIDLength, 42f2, uint',
'EBMLMaxSizeLength, 42f3, uint',
'DocType, 4282, str',
'DocTypeVersion, 4287, uint',
'DocTypeReadVersion, 4285, uint',
),
'CRC32, bf, binary',
'Void, ec, binary',
)
elements_matroska = (
'Segment, 18538067, sub', (
'SeekHead*, 114d9b74, sub', (
'Seek*, 4dbb, sub', (
'SeekID, 53ab, ebml_id',
'SeekPosition, 53ac, uint',
),
),
'Info*, 1549a966, sub', (
'SegmentUID, 73a4, binary',
'PrevUID, 3cb923, binary',
'NextUID, 3eb923, binary',
'TimecodeScale, 2ad7b1, uint',
'DateUTC, 4461, sint',
'Title, 7ba9, str',
'MuxingApp, 4d80, str',
'WritingApp, 5741, str',
'Duration, 4489, float',
),
'Cluster*, 1f43b675, sub', (
'Timecode, e7, uint',
'BlockGroup*, a0, sub', (
'Block, a1, binary',
'BlockDuration, 9b, uint',
'ReferenceBlock*, fb, sint',
'DiscardPadding, 75A2, sint',
'BlockAdditions, 75A1, sub', (
'BlockMore*, A6, sub', (
'BlockAddID, EE, uint',
'BlockAdditional, A5, binary',
),
),
),
'SimpleBlock*, a3, binary',
),
'Tracks*, 1654ae6b, sub', (
'TrackEntry*, ae, sub', (
'TrackNumber, d7, uint',
'TrackUID, 73c5, uint',
'TrackType, 83, uint',
'FlagEnabled, b9, uint',
'FlagDefault, 88, uint',
'FlagForced, 55aa, uint',
'FlagLacing, 9c, uint',
'MinCache, 6de7, uint',
'MaxCache, 6df8, uint',
'DefaultDuration, 23e383, uint',
'TrackTimecodeScale, 23314f, float',
'MaxBlockAdditionID, 55ee, uint',
'Name, 536e, str',
'Language, 22b59c, str',
'CodecID, 86, str',
'CodecPrivate, 63a2, binary',
'CodecName, 258688, str',
'CodecDecodeAll, aa, uint',
'CodecDelay, 56aa, uint',
'SeekPreRoll, 56bb, uint',
'Video, e0, sub', (
'FlagInterlaced, 9a, uint',
'PixelWidth, b0, uint',
'PixelHeight, ba, uint',
'DisplayWidth, 54b0, uint',
'DisplayHeight, 54ba, uint',
'DisplayUnit, 54b2, uint',
'FrameRate, 2383e3, float',
'ColourSpace, 2eb524, binary',
'StereoMode, 53b8, uint',
'Colour, 55b0, sub', (
'MatrixCoefficients, 55B1, uint',
'BitsPerChannel, 55B2, uint',
'ChromaSubsamplingHorz, 55B3, uint',
'ChromaSubsamplingVert, 55B4, uint',
'CbSubsamplingHorz, 55B5, uint',
'CbSubsamplingVert, 55B6, uint',
'ChromaSitingHorz, 55B7, uint',
'ChromaSitingVert, 55B8, uint',
'Range, 55B9, uint',
'TransferCharacteristics, 55BA, uint',
'Primaries, 55BB, uint',
'MaxCLL, 55BC, uint',
'MaxFALL, 55BD, uint',
'MasteringMetadata, 55D0, sub', (
'PrimaryRChromaticityX, 55D1, float',
'PrimaryRChromaticityY, 55D2, float',
'PrimaryGChromaticityX, 55D3, float',
'PrimaryGChromaticityY, 55D4, float',
'PrimaryBChromaticityX, 55D5, float',
'PrimaryBChromaticityY, 55D6, float',
'WhitePointChromaticityX, 55D7, float',
'WhitePointChromaticityY, 55D8, float',
'LuminanceMax, 55D9, float',
'LuminanceMin, 55DA, float',
),
),
),
'Audio, e1, sub', (
'SamplingFrequency, b5, float',
'OutputSamplingFrequency, 78b5, float',
'Channels, 9f, uint',
'BitDepth, 6264, uint',
),
'ContentEncodings, 6d80, sub', (
'ContentEncoding*, 6240, sub', (
'ContentEncodingOrder, 5031, uint',
'ContentEncodingScope, 5032, uint',
'ContentEncodingType, 5033, uint',
'ContentCompression, 5034, sub', (
'ContentCompAlgo, 4254, uint',
'ContentCompSettings, 4255, binary',
),
),
),
),
),
'Cues, 1c53bb6b, sub', (
'CuePoint*, bb, sub', (
'CueTime, b3, uint',
'CueTrackPositions*, b7, sub', (
'CueTrack, f7, uint',
'CueClusterPosition, f1, uint',
'CueRelativePosition, f0, uint',
'CueDuration, b2, uint',
),
),
),
'Attachments, 1941a469, sub', (
'AttachedFile*, 61a7, sub', (
'FileDescription, 467e, str',
'FileName, 466e, str',
'FileMimeType, 4660, str',
'FileData, 465c, binary',
'FileUID, 46ae, uint',
),
),
'Chapters, 1043a770, sub', (
'EditionEntry*, 45b9, sub', (
'EditionUID, 45bc, uint',
'EditionFlagHidden, 45bd, uint',
'EditionFlagDefault, 45db, uint',
'EditionFlagOrdered, 45dd, uint',
'ChapterAtom*, b6, sub', (
'ChapterUID, 73c4, uint',
'ChapterTimeStart, 91, uint',
'ChapterTimeEnd, 92, uint',
'ChapterFlagHidden, 98, uint',
'ChapterFlagEnabled, 4598, uint',
'ChapterSegmentUID, 6e67, binary',
'ChapterSegmentEditionUID, 6ebc, uint',
'ChapterDisplay*, 80, sub', (
'ChapString, 85, str',
'ChapLanguage*, 437c, str',
'ChapCountry*, 437e, str',
),
),
),
),
'Tags*, 1254c367, sub', (
'Tag*, 7373, sub', (
'Targets, 63c0, sub', (
'TargetTypeValue, 68ca, uint',
'TargetTrackUID, 63c5, uint',
'TargetEditionUID, 63c9, uint',
'TargetChapterUID, 63c4, uint',
'TargetAttachmentUID, 63c6, uint',
),
'SimpleTag*, 67c8, sub', (
'TagName, 45a3, str',
'TagLanguage, 447a, str',
'TagString, 4487, str'
),
),
),
),
)
import sys
from math import ldexp
from binascii import hexlify
def byte2num(s):
return int(hexlify(s), 16)
class EOF(Exception): pass
def camelcase_to_words(name):
parts = []
start = 0
for i in range(1, len(name)):
if name[i].isupper() and (name[i-1].islower() or
name[i+1:i+2].islower()):
parts.append(name[start:i])
start = i
parts.append(name[start:])
return '_'.join(parts).lower()
class MatroskaElement(object):
def __init__(self, name, elid, valtype, namespace):
self.name = name
self.definename = '{0}_ID_{1}'.format(namespace, name.upper())
self.fieldname = camelcase_to_words(name)
self.structname = 'ebml_' + self.fieldname
self.elid = elid
self.valtype = valtype
if valtype == 'sub':
self.ebmltype = 'EBML_TYPE_SUBELEMENTS'
self.valname = 'struct ' + self.structname
else:
self.ebmltype = 'EBML_TYPE_' + valtype.upper()
try:
self.valname = {'uint': 'uint64_t', 'str': 'char *',
'binary': 'bstr', 'ebml_id': 'uint32_t',
'float': 'double', 'sint': 'int64_t',
}[valtype]
except KeyError:
raise SyntaxError('Unrecognized value type ' + valtype)
self.subelements = ()
def add_subelements(self, subelements):
self.subelements = subelements
self.subids = set(x[0].elid for x in subelements)
elementd = {}
elementlist = []
def parse_elems(l, namespace):
subelements = []
for el in l:
if isinstance(el, str):
name, hexid, eltype = [x.strip() for x in el.split(',')]
hexid = hexid.lower()
multiple = name.endswith('*')
name = name.strip('*')
new = MatroskaElement(name, hexid, eltype, namespace)
elementd[hexid] = new
elementlist.append(new)
subelements.append((new, multiple))
else:
new.add_subelements(parse_elems(el, namespace))
return subelements
parse_elems(elements_ebml, 'EBML')
parse_elems(elements_matroska, 'MATROSKA')
def printf(out, *args):
out.write(u' '.join([str(x) for x in args]))
out.write(u'\n')
def generate_C_header(out):
printf(out, '// Generated by TOOLS/matroska.py, do not edit manually')
printf(out)
for el in elementlist:
printf(out, '#define {0.definename:40} 0x{0.elid}'.format(el))
printf(out)
for el in reversed(elementlist):
if not el.subelements:
continue
printf(out)
printf(out, 'struct {0.structname} {{'.format(el))
l = max(len(subel.valname) for subel, multiple in el.subelements)+1
for subel, multiple in el.subelements:
printf(out, ' {e.valname:{l}} {star}{e.fieldname};'.format(
e=subel, l=l, star=' *'[multiple]))
printf(out)
for subel, multiple in el.subelements:
printf(out, ' int n_{0.fieldname};'.format(subel))
printf(out, '};')
for el in elementlist:
if not el.subelements:
continue
printf(out, 'extern const struct ebml_elem_desc {0.structname}_desc;'.format(el))
printf(out)
printf(out, '#define MAX_EBML_SUBELEMENTS', max(len(el.subelements)
for el in elementlist))
def generate_C_definitions(out):
printf(out, '// Generated by TOOLS/matroska.py, do not edit manually')
printf(out)
for el in reversed(elementlist):
printf(out)
if el.subelements:
printf(out, '#define N', el.fieldname)
printf(out, 'E_S("{0}", {1})'.format(el.name, len(el.subelements)))
for subel, multiple in el.subelements:
printf(out, 'F({0.definename}, {0.fieldname}, {1})'.format(
subel, int(multiple)))
printf(out, '}};')
printf(out, '#undef N')
else:
printf(out, 'E("{0.name}", {0.fieldname}, {0.ebmltype})'.format(el))
def read(s, length):
t = s.read(length)
if len(t) != length:
raise EOF
return t
def read_id(s):
t = read(s, 1)
i = 0
mask = 128
if ord(t) == 0:
raise SyntaxError
while not ord(t) & mask:
i += 1
mask >>= 1
t += read(s, i)
return t
def read_vint(s):
t = read(s, 1)
i = 0
mask = 128
if ord(t) == 0:
raise SyntaxError
while not ord(t) & mask:
i += 1
mask >>= 1
t = bytes((ord(t) & (mask - 1),))
t += read(s, i)
return i+1, byte2num(t)
def read_str(s, length):
return read(s, length)
def read_uint(s, length):
t = read(s, length)
return byte2num(t)
def read_sint(s, length):
i = read_uint(s, length)
mask = 1 << (length * 8 - 1)
if i & mask:
i -= 2 * mask
return i
def read_float(s, length):
t = read(s, length)
i = byte2num(t)
if length == 4:
f = ldexp((i & 0x7fffff) + (1 << 23), (i >> 23 & 0xff) - 150)
if i & (1 << 31):
f = -f
elif length == 8:
f = ldexp((i & ((1 << 52) - 1)) + (1 << 52), (i >> 52 & 0x7ff) - 1075)
if i & (1 << 63):
f = -f
else:
raise SyntaxError
return f
def parse_one(s, depth, parent, maxlen):
elid = hexlify(read_id(s)).decode('ascii')
elem = elementd.get(elid)
size, length = read_vint(s)
this_length = len(elid) / 2 + size + length
if elem is not None:
if elem.valtype != 'skip':
print(" " * depth, '[' + elid + ']', elem.name, 'size:', length, 'value:', end=' ')
if elem.valtype == 'sub':
print('subelements:')
while length > 0:
length -= parse_one(s, depth + 1, elem, length)
if length < 0:
raise SyntaxError
elif elem.valtype == 'str':
print('string', repr(read_str(s, length).decode('utf8', 'replace')))
elif elem.valtype in ('binary', 'ebml_id'):
t = read_str(s, length)
dec = ''
if elem.valtype == 'ebml_id':
idelem = elementd.get(hexlify(t).decode('ascii'))
if idelem is None:
dec = '(UNKNOWN)'
else:
dec = '({0.name})'.format(idelem)
if len(t) < 20:
t = hexlify(t).decode('ascii')
else:
t = '<{0} bytes>'.format(len(t))
print('binary', t, dec)
elif elem.valtype == 'uint':
print('uint', read_uint(s, length))
elif elem.valtype == 'sint':
print('sint', read_sint(s, length))
elif elem.valtype == 'float':
print('float', read_float(s, length))
elif elem.valtype == 'skip':
read(s, length)
else:
raise NotImplementedError
else:
print(" " * depth, '[' + elid + '] Unknown element! size:', length)
read(s, length)
return this_length
if __name__ == "__main__":
def parse_toplevel(s):
parse_one(s, 0, None, 1 << 63)
if sys.argv[1] == '--generate-header':
generate_C_header(sys.stdout)
elif sys.argv[1] == '--generate-definitions':
generate_C_definitions(sys.stdout)
else:
if sys.version_info.major < 3:
raise Exception("Dumping requires Python 3.")
s = open(sys.argv[1], "rb")
while 1:
start = s.tell()
try:
parse_toplevel(s)
except EOF:
if s.tell() != start:
raise Exception("Unexpected end of file")
break