mirror of https://github.com/mpv-player/mpv synced 2025-03-11 08:37:59 +00:00
Uoti Urpala 5f631d1c08 matroska: add new parsing code
Add a new EBML parser implementation that should allow significant
improvements to the Matroska demuxer. The new parsing code is not
actually used yet by the demuxer. The only changes to existing code in
this commit are to generate the MATROSKA_ID_* / EBML_ID_* macro
definitions from the new implementation and to rename some of them
(the new implementation uses names matching the official Matroska spec).

The main parser implementation is added in ebml.c. There are two new
generated files, ebml_defs.c and ebml_types.h, that contain
definitions of EBML elements. Those are generated by the new script
TOOLS/matroska.py. There's a new Makefile target "generated_ebml" that
runs the script to refresh the content of the generated files.
2010-01-27 14:26:43 +02:00

398 lines
12 KiB
Executable File

"""
Generate C definitions for parsing Matroska files.
Can also be used to directly parse Matroska files and display their contents.
"""
# This file is part of MPlayer.
# MPlayer is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# MPlayer is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along
# with MPlayer; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# EBML header element table.
#
# Each entry is a 'Name, hexid, type' string.  A trailing '*' on the name
# marks an element that may occur more than once in its parent.  An entry
# whose type is 'sub' is immediately followed by a tuple listing its child
# elements in the same format.
elements_ebml = (
    'EBML, 1a45dfa3, sub', (
        'EBMLVersion, 4286, uint',
        'EBMLReadVersion, 42f7, uint',
        'EBMLMaxIDLength, 42f2, uint',
        'EBMLMaxSizeLength, 42f3, uint',
        'DocType, 4282, str',
        'DocTypeVersion, 4287, uint',
        'DocTypeReadVersion, 4285, uint',
    ),
    # Global elements: the stream parser accepts these IDs ('bf', 'ec')
    # under any parent.
    'CRC32, bf, binary',
    'Void, ec, binary',
)
# Matroska element table.
#
# Same format as the EBML table: 'Name, hexid, type' strings, a trailing '*'
# on the name for elements that may repeat, and a tuple of children directly
# after every entry of type 'sub'.
elements_matroska = (
    'Segment, 18538067, sub', (

        'SeekHead*, 114d9b74, sub', (
            'Seek*, 4dbb, sub', (
                'SeekID, 53ab, ebml_id',
                'SeekPosition, 53ac, uint',
            ),
        ),

        'Info*, 1549a966, sub', (
            'SegmentUID, 73a4, binary',
            'PrevUID, 3cb923, binary',
            'NextUID, 3eb923, binary',
            'TimecodeScale, 2ad7b1, uint',
            'DateUTC, 4461, sint',
            'Title, 7ba9, str',
            'MuxingApp, 4d80, str',
            'WritingApp, 5741, str',
            'Duration, 4489, float',
        ),

        'Cluster*, 1f43b675, sub', (
            'Timecode, e7, uint',
            'BlockGroup*, a0, sub', (
                'Block, a1, binary',
                'BlockDuration, 9b, uint',
                'ReferenceBlock*, fb, sint',
            ),
            'SimpleBlock*, a3, binary',
        ),

        'Tracks*, 1654ae6b, sub', (
            'TrackEntry*, ae, sub', (
                'TrackNumber, d7, uint',
                'TrackUID, 73c5, uint',
                'TrackType, 83, uint',
                'FlagEnabled, b9, uint',
                'FlagDefault, 88, uint',
                'FlagForced, 55aa, uint',
                'FlagLacing, 9c, uint',
                'MinCache, 6de7, uint',
                'DefaultDuration, 23e383, uint',
                'TrackTimecodeScale, 23314f, float',
                'MaxBlockAdditionID, 55ee, uint',
                'Name, 536e, str',
                'Language, 22b59c, str',
                'CodecID, 86, str',
                'CodecPrivate, 63a2, binary',
                'CodecDecodeAll, aa, uint',
                'Video, e0, sub', (
                    'FlagInterlaced, 9a, uint',
                    'PixelWidth, b0, uint',
                    'PixelHeight, ba, uint',
                    'DisplayWidth, 54b0, uint',
                    'DisplayHeight, 54ba, uint',
                    'FrameRate, 2383e3, float',
                ),
                'Audio, e1, sub', (
                    'SamplingFrequency, b5, float',
                    'Channels, 9f, uint',
                    'BitDepth, 6264, uint',
                ),
                'ContentEncodings, 6d80, sub', (
                    'ContentEncoding*, 6240, sub', (
                        'ContentEncodingOrder, 5031, uint',
                        'ContentEncodingScope, 5032, uint',
                        'ContentEncodingType, 5033, uint',
                        'ContentCompression, 5034, sub', (
                            'ContentCompAlgo, 4254, uint',
                            'ContentCompSettings, 4255, binary',
                        ),
                    ),
                ),
            ),
        ),

        'Cues, 1c53bb6b, sub', (
            'CuePoint*, bb, sub', (
                'CueTime, b3, uint',
                'CueTrackPositions*, b7, sub', (
                    'CueTrack, f7, uint',
                    'CueClusterPosition, f1, uint',
                ),
            ),
        ),

        'Attachments, 1941a469, sub', (
            'AttachedFile*, 61a7, sub', (
                'FileName, 466e, str',
                'FileMimeType, 4660, str',
                'FileData, 465c, binary',
                'FileUID, 46ae, uint',
            ),
        ),

        'Chapters, 1043a770, sub', (
            'EditionEntry*, 45b9, sub', (
                'EditionUID, 45bc, uint',
                'EditionFlagHidden, 45bd, uint',
                'EditionFlagDefault, 45db, uint',
                'EditionFlagOrdered, 45dd, uint',
                'ChapterAtom*, b6, sub', (
                    'ChapterUID, 73c4, uint',
                    'ChapterTimeStart, 91, uint',
                    'ChapterTimeEnd, 92, uint',
                    'ChapterFlagHidden, 98, uint',
                    'ChapterFlagEnabled, 4598, uint',
                    'ChapterSegmentUID, 6e67, binary',
                    'ChapterSegmentEditionUID, 6ebc, uint',
                    'ChapterDisplay*, 80, sub', (
                        'ChapString, 85, str',
                        'ChapLanguage*, 437c, str',
                    ),
                ),
            ),
        ),

        'Tags*, 1254c367, sub', (
            'Tag*, 7373, sub', (
                'Targets, 63c0, sub', (
                    'TargetTypeValue, 68ca, uint',
                ),
            ),
        ),
    ),
)
import binascii
import struct
import sys
from math import ldexp
def byte2num(s):
    """Return the byte string *s* interpreted as a big-endian unsigned int.

    An empty string yields 0 instead of raising ValueError, so zero-length
    integer payloads decode cleanly.
    """
    if not s:
        return 0
    # binascii.hexlify works on Python 2 str and Python 3 bytes alike,
    # unlike the Python-2-only s.encode('hex').
    return int(binascii.hexlify(s), 16)
def camelcase_to_words(name):
    """Convert a CamelCase element name to lower_case_with_underscores.

    A new word starts at an uppercase letter that either follows a lowercase
    letter or is itself followed by a lowercase letter, so runs of capitals
    such as 'EBML' or 'ID' stay together:
    'EBMLMaxIDLength' -> 'ebml_max_id_length'.
    """
    parts = []
    start = 0
    for i in range(1, len(name)):
        # name[i+1:i+2] is '' at the end of the string, and ''.islower() is
        # False, so no bounds check is needed.
        if name[i].isupper() and (name[i-1].islower() or
                                  name[i+1:i+2].islower()):
            parts.append(name[start:i])
            start = i
    parts.append(name[start:])
    return '_'.join(parts).lower()
class MatroskaElement(object):
    """Description of one EBML/Matroska element type.

    Attributes:
        name: element name as given in the element table.
        definename: name of the C ID macro, '<namespace>_ID_<NAME>'.
        fieldname: lower_case_with_underscores form of the name.
        structname: 'ebml_' + fieldname.
        elid: element ID as a hex string.
        valtype: value type keyword from the element table.
        ebmltype: name of the matching EBML_TYPE_* constant.
        valname: C type used to hold the element's value.
        subelements: sequence of (MatroskaElement, multiple) pairs.
        subids: set of the hex IDs of the direct subelements.
    """

    def __init__(self, name, elid, valtype, namespace):
        """Build the element description.

        Raises:
            SyntaxError: if *valtype* is not a recognized type keyword.
        """
        self.name = name
        self.definename = '%s_ID_%s' % (namespace, name.upper())
        self.fieldname = camelcase_to_words(name)
        self.structname = 'ebml_' + self.fieldname
        self.elid = elid
        self.valtype = valtype
        if valtype == 'sub':
            self.ebmltype = 'EBML_TYPE_SUBELEMENTS'
            self.valname = 'struct %s' % self.structname
        else:
            self.ebmltype = 'EBML_TYPE_' + valtype.upper()
            try:
                self.valname = {'uint': 'uint64_t', 'str': 'struct bstr',
                                'binary': 'struct bstr', 'ebml_id': 'uint32_t',
                                'float': 'double', 'sint': 'int64_t',
                                }[valtype]
            except KeyError:
                raise SyntaxError('Unrecognized value type ' + valtype)
        self.subelements = ()
        # Keep subids defined even when add_subelements() is never called.
        self.subids = set()

    def add_subelements(self, subelements):
        """Attach *subelements*, a sequence of (element, multiple) pairs."""
        self.subelements = subelements
        self.subids = set(x[0].elid for x in subelements)
# Maps an element's hex ID string to its MatroskaElement; filled in by
# parse_elems() and consulted when decoding a stream.
elementd = {}
# Flat list of MatroskaElement objects, iterated by the generate_C_*
# functions to emit the C definitions.
elementlist = []
def parse_elems(l, namespace):
    """Turn a nested element table into MatroskaElement objects.

    *l* is a sequence of 'Name, hexid, type' strings, where a tuple entry
    holds the children of the element described by the string just before
    it.  Every element is registered in the module-level ``elementd`` and
    ``elementlist``.

    Args:
        l: the element table (or a nested sub-table).
        namespace: prefix for the generated ID macro names.

    Returns:
        A list of (MatroskaElement, multiple) pairs for this level, where
        ``multiple`` is True when the name carried a trailing '*'.

    Raises:
        SyntaxError: if a sub-table appears before any element, or an
            element has an unrecognized value type.
    """
    subelements = []
    new = None
    for el in l:
        if isinstance(el, str):
            name, hexid, eltype = [x.strip() for x in el.split(',')]
            multiple = name.endswith('*')
            name = name.strip('*')
            new = MatroskaElement(name, hexid, eltype, namespace)
            elementd[hexid] = new
            elementlist.append(new)
            subelements.append((new, multiple))
        else:
            if new is None:
                # A child tuple needs a parent entry in front of it.
                raise SyntaxError('Subelement list without a parent element')
            new.add_subelements(parse_elems(el, namespace))
    return subelements
# Register both element tables at import time; the second argument becomes
# the prefix of each element's '<namespace>_ID_<NAME>' macro name.
parse_elems(elements_ebml, 'EBML')
parse_elems(elements_matroska, 'MATROSKA')
def generate_C_header():
    """Print the generated C header to standard output.

    Output, in order: one ID #define per element, one struct per element
    that has subelements, an extern descriptor declaration for each of those
    elements, and the MAX_EBML_SUBELEMENTS constant.
    """
    print('// Generated by TOOLS/matroska.py, do not edit manually')

    for el in elementlist:
        print('#define %-40s 0x%s' % (el.definename, el.elid))

    # Walk the list in reverse so that a nested struct is emitted before the
    # parent struct that embeds it.
    for el in reversed(elementlist):
        if not el.subelements:
            continue
        print('struct %s {' % el.structname)
        # Width of the type column: longest C type name plus one.
        width = max(len(subel.valname)
                    for subel, multiple in el.subelements) + 1
        for subel, multiple in el.subelements:
            # Repeatable elements are declared as pointers.
            print(' %-*s %s%s;' % (width, subel.valname,
                                   (' ', '*')[multiple], subel.fieldname))
        for subel, multiple in el.subelements:
            print(' int n_%s;' % (subel.fieldname))
        print('};')

    for el in elementlist:
        if not el.subelements:
            continue
        print('extern const struct ebml_elem_desc %s_desc;' % el.structname)

    print('#define MAX_EBML_SUBELEMENTS %d' % max(len(el.subelements)
                                                  for el in elementlist))
def generate_C_definitions():
    """Print the generated C element descriptors to standard output.

    An element with subelements gets an E_S(...) entry followed by one
    F(...) line per subelement, wrapped in '#define N' / '#undef N'.  Any
    other element gets a single E(...) entry.
    """
    print('// Generated by TOOLS/matroska.py, do not edit manually')
    for el in reversed(elementlist):
        if el.subelements:
            print('#define N %s' % el.fieldname)
            print('E_S("%s", %d)' % (el.name, len(el.subelements)))
            for subel, multiple in el.subelements:
                print('F(%s, %s, %d)' % (subel.definename, subel.fieldname,
                                         multiple))
            # NOTE(review): presumes the E_S macro opens a descriptor plus a
            # field-array initializer that must be closed here -- confirm
            # against the macro definitions in ebml.c.
            print('}};')
            print('#undef N')
        else:
            print('E("%s", %s, %s)' % (el.name, el.fieldname, el.ebmltype))
def read(s, length):
    """Read exactly *length* bytes from stream *s*.

    Raises:
        IOError: if the stream ends before *length* bytes are available.
    """
    data = s.read(length)
    if len(data) != length:
        raise IOError('Unexpected end of data: wanted %d bytes, got %d'
                      % (length, len(data)))
    return data
def read_id(s):
    """Read one EBML element ID from stream *s* and return its raw bytes.

    The number of leading zero bits in the first byte gives the number of
    bytes that follow.  The length marker bit stays part of the returned ID.

    Raises:
        SyntaxError: if the first byte is zero (no length marker bit).
        IOError: if the stream ends early.
    """
    t = read(s, 1)
    first = ord(t)
    if first == 0:
        raise SyntaxError
    extra = 0
    mask = 128
    while not first & mask:
        extra += 1
        mask >>= 1
    return t + read(s, extra)
def read_vint(s):
    """Read one EBML variable-length integer from stream *s*.

    Returns:
        A tuple (size, value): the number of bytes the encoded integer
        occupied, and its value with the length marker bit removed.

    Raises:
        SyntaxError: if the first byte is zero (no length marker bit).
        IOError: if the stream ends early.
    """
    first = ord(read(s, 1))
    if first == 0:
        raise SyntaxError
    extra = 0
    mask = 128
    while not first & mask:
        extra += 1
        mask >>= 1
    # Drop the marker bit, then shift in the remaining bytes.  Working on
    # integers avoids mixing chr() text with stream bytes, which fails on
    # Python 3.
    value = first & (mask - 1)
    for byte in bytearray(read(s, extra)):
        value = (value << 8) | byte
    return extra + 1, value
def read_str(s, length):
    """Read a string payload of *length* bytes from *s*; returned undecoded.

    Raises:
        IOError: if the stream ends early.
    """
    return read(s, length)
def read_uint(s, length):
    """Read a big-endian unsigned integer of *length* bytes from *s*.

    Raises:
        IOError: if the stream ends early.
    """
    return byte2num(read(s, length))
def read_sint(s, length):
    """Read a big-endian two's-complement integer of *length* bytes from *s*.

    A zero-length payload decodes as 0.

    Raises:
        IOError: if the stream ends early.
    """
    if length == 0:
        # Without this guard the sign mask below would be 1 << -1, which
        # raises ValueError.
        return 0
    value = read_uint(s, length)
    sign_bit = 1 << (length * 8 - 1)
    if value & sign_bit:
        value -= 2 * sign_bit
    return value
def read_float(s, length):
    """Read a big-endian IEEE 754 float of *length* bytes from *s*.

    Supports 4-byte single and 8-byte double precision values; a
    zero-length payload decodes as 0.0.

    Raises:
        SyntaxError: if *length* is not 0, 4 or 8 (the payload has already
            been consumed from the stream at that point).
        IOError: if the stream ends early.
    """
    t = read(s, length)
    if length == 0:
        return 0.0
    # struct decodes zero, denormals, infinities and NaN correctly, which a
    # manual mantissa/exponent calculation with an implicit leading 1 does
    # not.
    if length == 4:
        return struct.unpack('>f', t)[0]
    if length == 8:
        return struct.unpack('>d', t)[0]
    raise SyntaxError
def parse_one(s, depth, parent, maxlen):
    """Parse one element from stream *s* and print a description of it.

    Subelements are parsed recursively with ``depth + 1``.

    Args:
        s: binary stream positioned at the start of an element ID.
        depth: nesting level, printed at the start of each line.
        parent: MatroskaElement of the enclosing element, or None at top
            level; used to check that this element is allowed here.
        maxlen: not used by this function.

    Returns:
        Total number of bytes the element occupied (ID, size field and
        payload).

    Raises:
        NotImplementedError: if the element is not a known child of
            *parent*, or its value type has no handler.
        SyntaxError: on malformed data, including subelements that overrun
            the size of their parent.
        IOError: if the stream ends early.
    """
    elid = binascii.hexlify(read_id(s)).decode('ascii')
    elem = elementd.get(elid)
    # 'ec' (Void) and 'bf' (CRC32) are accepted under any parent.
    if (parent is not None and elid not in parent.subids
            and elid not in ('ec', 'bf')):
        print('Unexpected: %s' % elid)
        raise NotImplementedError
    size, length = read_vint(s)
    # Two hex digits per ID byte; integer division keeps the count an int.
    this_length = len(elid) // 2 + size + length

    if elem is None:
        print('%s Unknown element: %s size: %s' % (depth, elid, length))
        read(s, length)
        return this_length

    valtype = elem.valtype
    if valtype != 'skip':
        # Write the line prefix before reading the value so that it is
        # already visible if decoding the value fails.
        sys.stdout.write('%s %s %s size: %s value: '
                         % (depth, elid, elem.name, length))

    if valtype == 'sub':
        # End the header line; each child prints its own line.
        print('')
        while length > 0:
            length -= parse_one(s, depth + 1, elem, length)
        if length < 0:
            raise SyntaxError
    elif valtype == 'str':
        print('string %r' % (read_str(s, length),))
    elif valtype in ('binary', 'ebml_id'):
        t = read_str(s, length)
        dec = ''
        if valtype == 'ebml_id':
            idelem = elementd.get(binascii.hexlify(t).decode('ascii'))
            if idelem is None:
                dec = '(UNKNOWN)'
            else:
                dec = '(%s)' % idelem.name
        if len(t) < 20:
            shown = binascii.hexlify(t).decode('ascii')
        else:
            shown = '<skipped %d bytes>' % len(t)
        print('binary %s %s' % (shown, dec))
    elif valtype == 'uint':
        print('uint %s' % read_uint(s, length))
    elif valtype == 'sint':
        print('sint %s' % read_sint(s, length))
    elif valtype == 'float':
        print('float %s' % read_float(s, length))
    elif valtype == 'skip':
        read(s, length)
    else:
        raise NotImplementedError
    return this_length
def parse_toplevel(s):
    """Parse and print one top-level element (no parent) from stream *s*."""
    parse_one(s, 0, None, 1 << 63)
if sys.argv[1] == '--generate-header':
elif sys.argv[1] == '--generate-definitions':
s = open(sys.argv[1])
while 1: