hydrus/include/matroska.py

# EBML/Matroska parser
# Copyright (C) 2010  Johannes Sasongko <sasongko@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#
# The developers of the Exaile media player hereby grant permission
# for non-GPL compatible GStreamer and Exaile plugins to be used and
# distributed together with GStreamer and Exaile. This permission is
# above and beyond the permissions granted by the GPL license by which
# Exaile is covered. If you modify this code, you may extend this
# exception to your version of the code, but you are not obligated to
# do so. If you do not wish to do so, delete this exception statement
# from your version.


# This code is heavily based on public domain code by "Omion" (from the
# Hydrogenaudio forums), as obtained from Matroska's Subversion repository at
# revision 858 (2004-10-03), under "/trunk/Perl.Parser/MatroskaParser.pm".


import sys
from struct import unpack

SINT, UINT, FLOAT, STRING, UTF8, DATE, MASTER, BINARY = range(8)

class EbmlException(Exception): pass

class BinaryData(str): pass
class UnknownData: pass

class Ebml:
    """EBML parser.

    Usage: Ebml(location, tags).parse()
    tags is a dictionary of the form { id: (name, type) }.
    """

    ## Constructor and destructor

    def __init__(self, location, tags):
        self.tags = tags
        self.open(location)

    def __del__(self):
        self.close()

    ## File access.
    ## These can be overridden to provide network support.

    def open(self, location):
        """Open a location and set self.size."""
        self.file = f = open(location, 'rb')
        f = self.file
        f.seek(0, 2)
        self.size = f.tell()
        f.seek(0, 0)

    def seek(self, offset, mode):
        self.file.seek(offset, mode)

    def tell(self):
        return self.file.tell()

    def read(self, length):
        return self.file.read(length)

    def close(self):
        self.file.close()

    ## Element reading

    def readSize(self):
        b1 = self.read(1)
        b1b = ord(b1)
        if b1b & 0x80:
            # 1 byte
            return b1b & 0x7f
        elif b1b & 0x40:
            # 2 bytes
            # JS: BE-ushort
            return unpack(">H", chr(0x40 ^ b1b) + self.read(1))[0]
        elif b1b & 0x20:
            # 3 bytes
            # JS: BE-ulong
            return unpack(">L", "\0" + chr(0x20 ^ b1b) + self.read(2))[0]
        elif b1b & 0x10:
            # 4 bytes
            # JS: BE-ulong
            return unpack(">L", chr(0x10 ^ b1b) + self.read(3))[0]
        elif b1b & 0x08:
            # 5 bytes
            # JS: uchar BE-ulong. We change this to BE uchar ulong.
            high, low = unpack(">BL", chr(0x08 ^ b1b) + self.read(4))
            return high * 4294967296 + low
        elif b1b & 0x04:
            # 6 bytes
            # JS: BE-slong BE-ulong
            high, low = unpack(">HL", chr(0x04 ^ b1b) + self.read(5))
            return high * 4294967296 + low
        elif b1b & 0x02:
            # 7 bytes
            # JS: BE-ulong BE-ulong
            high, low = unpack(">LL",
                    "\0" + chr(0x02 ^ b1b) + self.read(6))
            return high * 4294967296 + low
        elif b1b & 0x01:
            # 8 bytes
            # JS: BE-ulong BE-ulong
            high, low = unpack(">LL", chr(0x01 ^ b1b) + self.read(7))
            return high * 4294967296 + low
        else:
            raise EbmlException(
                    "invalid element size with leading byte 0x%X" % b1b)

    def readInteger(self, length):
        if length == 1:
            # 1 byte
            return ord(self.read(1))
        elif length == 2:
            # 2 bytes
            return unpack(">H", self.read(2))[0]
        elif length == 3:
            # 3 bytes
            return unpack(">L", "\0" + self.read(3))[0]
        elif length == 4:
            # 4 bytes
            return unpack(">L", self.read(4))[0]
        elif length == 5:
            # 5 bytes
            high, low = unpack(">BL", self.read(5))
            return high * 4294967296 + low
        elif length == 6:
            # 6 bytes
            high, low = unpack(">HL", self.read(6))
            return high * 4294967296 + low
        elif length == 7:
            # 7 bytes
            high, low = unpack(">LL", "\0" + (self.read(7)))
            return high * 4294967296 + low
        elif length == 8:
            # 8 bytes
            high, low = unpack(">LL", self.read(8))
            return high * 4294967296 + low
        else:
            raise EbmlException(
                    "don't know how to read %r-byte integer" % length)

    def readFloat(self, length):
        # Need to reverse the bytes for little-endian machines
        if length == 4:
            # single
            return unpack('@f', self.read(4)[::-1])[0]
        elif length == 8:
            # double
            return unpack('@d', self.read(8)[::-1])[0]
        elif length == 10:
            # extended (don't know how to handle it)
            return 'EXTENDED'
        else:
            raise EbmlException("don't know how to read %r-byte float" % length)

    def readID(self):
        b1 = self.read(1)
        b1b = ord(b1)
        if b1b & 0x80:
            # 1 byte
            return b1b & 0x7f
        elif b1b & 0x40:
            # 2 bytes
            return unpack(">H", chr(0x40 ^ b1b) + self.read(1))[0]
        elif b1b & 0x20:
            # 3 bytes
            return unpack(">L", "\0" + chr(0x20 ^ b1b) + self.read(2))[0]
        elif b1b & 0x10:
            # 4 bytes
            return unpack(">L", chr(0x10 ^ b1b) + self.read(3))[0]
        else:
            raise EbmlException(
                    "invalid element ID with leading byte 0x%X" % b1b)

    ## Parsing

    def parse(self, from_=0, to=None):
        """Parses EBML from `from_` to `to`.

        Note that not all streams support seeking backwards, so prepare to handle
        an exception if you try to parse from arbitrary position.
        """
        if to is None:
            to = self.size
        self.seek(from_, 0)
        node = {}
        # Iterate over current node's children.
        while self.tell() < to:
            try:
                id = self.readID()
            except EbmlException, e:
                # Invalid EBML header. We can't reliably get any more data from
                # this level, so just return anything we have.
                print >>sys.stderr, "ERROR:", e
                return node
            size = self.readSize()
            try:
                key, type_ = self.tags[id]
            except KeyError:
                self.seek(size, 1)
            else:
                try:
                    if type_ is MASTER:
                        tell = self.tell()
                        value = self.parse(tell, tell + size)
                    elif type_ in (SINT, UINT, DATE):
                        value = self.readInteger(size)
                    elif type_ is FLOAT:
                        value = self.readFloat(size)
                    elif type_ is STRING:
                        value = unicode(self.read(size), 'ascii')
                    elif type_ is UTF8:
                        value = unicode(self.read(size), 'utf-8')
                    elif type_ is BINARY:
                        value = BinaryData(self.read(size))
                    else:
                        assert False
                except (EbmlException, UnicodeDecodeError), e:
                    print >>sys.stderr, "WARNING:", e
                try:
                    parentval = node[key]
                except KeyError:
                    parentval = node[key] = []
                parentval.append(value)
        return node

'''Hydrus Dev deleted this!
## GIO-specific code

import gio

class GioEbml(Ebml):
    # NOTE: All seeks are faked using InputStream.skip because we need to use
    # BufferedInputStream but it does not implement Seekable.

    def open(self, location):
        f = gio.File(location)
        self.buffer = gio.BufferedInputStream(f.read())
        self._tell = 0

        self.size = f.query_info('standard::size').get_size()

    def seek(self, offset, mode):
        if mode == 0:
            skip = offset - self._tell
        elif mode == 1:
            skip = offset
        elif mode == 2:
            skip = self.size - self._tell + offset
        else:
            raise ValueError("invalid seek mode: %r" % offset)
        if skip < 0:
            raise gio.Error("cannot seek backwards from %d" % self._tell)
        self._tell += skip
        self.buffer.skip(skip)

    def tell(self):
        return self._tell

    def read(self, length):
        result = self.buffer.read(length)
        self._tell += len(result)
        return result

    def close(self):
        self.buffer.close()
'''

## Matroska-specific code

# Interesting Matroska tags.
# Tags not defined here are skipped while parsing.
MatroskaTags = {
    0xa45dfa3: ('EBML', MASTER ),
    0x0282: ('DocType', STRING), # hydrus dev added this
    # Segment
    0x08538067: ('Segment', MASTER),
    # Segment Information
    0x0549A966: ('Info', MASTER),
    0x3384: ('SegmentFilename', UTF8),
    0x0AD7B1: ('TimecodeScale', UINT),
    0x0489: ('Duration', FLOAT),
    0x0461: ('DateUTC', DATE),
    0x3BA9: ('Title', UTF8),
    0x0D80: ('MuxingApp', UTF8),
    0x1741: ('WritingApp', UTF8),
    # Track
    0x0654AE6B: ('Tracks', MASTER),
    0x2E: ('TrackEntry', MASTER),
    0x57: ('TrackNumber', UINT),
    0x03: ('TrackType', UINT),
    0x29: ('FlagEnabled', UINT),
    0x08: ('FlagDefault', UINT),
    0x03E383: ('DefaultDuration', UINT),
    0x03314F: ('TrackTimecodeScale', FLOAT),
    0x137F: ('TrackOffset', SINT),
    0x136E: ('Name', UTF8),
    0x02B59C: ('Language', STRING),
    0x06: ('CodecID', STRING),
    0x058688: ('CodecName', UTF8),
    0x1A9697: ('CodecSettings', UTF8),
    0x1B4040: ('CodecInfoURL', STRING),
    0x06B240: ('CodecDownloadURL', STRING),
    0x2A: ('CodecDecodeAll', UINT),
    0x2FAB: ('TrackOverlay', UINT),
    # Video
    0x60: ('Video', MASTER),
    0x30: ('PixelWidth', UINT), # hydrus dev added this
    0x3A: ('PixelHeight', UINT), # hydrus dev added this
    # Audio
    0x61: ('Audio', MASTER),
    0x35: ('SamplingFrequency', UINT),
    0x38B5: ('OutputSamplingFrequency', UINT),
    0x1F: ('Channels', UINT),
    0x3D7B: ('ChannelPositions', BINARY),
    0x2264: ('BitDepth', UINT),
    # Content Encoding
    0x2D80: ('ContentEncodings', MASTER),
    0x2240: ('ContentEncoding', MASTER),
    0x1031: ('ContentEncodingOrder', UINT),
    0x1032: ('ContentEncodingScope', UINT),
    0x1033: ('ContentEncodingType', UINT),
    0x1034: ('ContentCompression', MASTER),
    0x0254: ('ContentCompAlgo', UINT),
    0x0255: ('ContentCompSettings', BINARY),
    # Chapters
    0x0043A770: ('Chapters', MASTER),
    0x05B9: ('EditionEntry', MASTER),
    0x05BC: ('EditionUID', UINT),
    0x05BD: ('EditionFlagHidden', UINT),
    0x05DB: ('EditionFlagDefault', UINT),
    0x05DD: ('EditionManaged', UINT),
    0x36: ('ChapterAtom', MASTER),
    0x33C4: ('ChapterUID', UINT),
    0x11: ('ChapterTimeStart', UINT),
    0x12: ('ChapterTimeEnd', UINT),
    0x18: ('ChapterFlagHidden', UINT),
    0x0598: ('ChapterFlagEnabled', UINT),
    0x23C3: ('ChapterPhysicalEquiv', UINT),
    0x0F: ('ChapterTrack', MASTER),
    0x09: ('ChapterTrackNumber', UINT),
    0x00: ('ChapterDisplay', MASTER),
    0x05: ('ChapString', UTF8),
    0x037C: ('ChapLanguage', STRING),
    0x037E: ('ChapCountry', STRING),
    # Tagging
    0x0254C367: ('Tags', MASTER),
    0x3373: ('Tag', MASTER),
    0x23C0: ('Targets', MASTER),
    0x28CA: ('TargetTypevalue', UINT),
    0x23CA: ('TargetType', STRING),
    0x23C9: ('EditionUID', UINT),
    0x23C4: ('ChapterUID', UINT),
    0x23C5: ('TrackUID', UINT),
    0x23C6: ('AttachmentUID', UINT),
    0x27C8: ('SimpleTag', MASTER),
    0x05A3: ('TagName', UTF8),
    0x047A: ('TagLanguage', STRING),
    0x0484: ('TagDefault', UINT),
    0x0487: ('TagString', UTF8),
    0x0485: ('TagBinary', BINARY),
}

def parse(location):
    return Ebml(location, MatroskaTags).parse()

def dump(location):
    from pprint import pprint
    pprint(parse(location))

def dump_tags(location):
    from pprint import pprint
    mka = parse(location)
    segment = mka['Segment'][0]
    info = segment['Info'][0]
    length = info['Duration'][0] * info['TimecodeScale'][0] / 1e9
    print "Length = %f seconds" % length
    pprint(segment['Tags'][0]['Tag'])

if __name__ == '__main__':
    import sys
    location = sys.argv[1]
    if sys.platform == 'win32' and '://' not in location:
        # XXX: This is most likely a bug in the Win32 GIO port; it converts
        # paths into UTF-8 and requires them to be specified in UTF-8 as well.
        # Here we decode the path according to the FS encoding to get the
        # Unicode representation first. If the path is in a different encoding,
        # this step will fail.
        location = location.decode(sys.getfilesystemencoding()).encode('utf-8')
    dump_tags(location)


# vi: et sts=4 sw=4 ts=4