mpv/TOOLS/lib/Parse/Matroska/Reader.pm

use 5.008;
use strict;
use warnings;

# ABSTRACT: a low-level reader for EBML files
package Parse::Matroska::Reader;

use Parse::Matroska::Definitions qw{elem_by_hexid};
use Parse::Matroska::Element;

use Carp;
use Scalar::Util qw{openhandle weaken};
use IO::Handle;
use IO::File;
use List::Util qw{first};
use Encode;

use constant BIGINT_TRY => 'Pari,GMP,FastCalc';
use Math::BigInt try => BIGINT_TRY;
use Math::BigRat try => BIGINT_TRY;

=head1 SYNOPSIS

    use Parse::Matroska::Reader;
    my $reader = Parse::Matroska::Reader->new($path);
    $reader->close;
    $reader->open(\$string_with_matroska_data);

    my $elem = $reader->read_element;
    print "Element ID: $elem->{elid}\n";
    print "Element name: $elem->{name}\n";
    if ($elem->{type} ne 'sub') {
        print "Element value: $elem->get_value\n";
    } else {
        while (my $child = $elem->next_child) {
            print "Child element: $child->{name}\n";
        }
    }
    $reader->close;

=head1 DESCRIPTION

Reads EBML data, which is used in Matroska files.
This is a low-level reader which is meant to be used as a backend
for higher level readers. TODO: write the high level readers :)

=head1 NOTE

The API of this module is not yet considered stable.

=method new

Creates a new reader.
Calls L</open($arg)> with its arguments if provided.

=cut
sub new {
    my $class = shift;
    my $self = {};
    bless $self, $class;

    $self->open(@_) if @_;
    return $self;
}

=method open($arg)

Creates the internal filehandle. The argument can be:

=for :list
* An open filehandle or L<IO::Handle> object.
The filehandle is not C<dup()>ed, so calling L</close> in this
object will close the given filehandle as well.
* A scalar containing a path to a file.
* On perl v5.14 or newer, a scalarref pointing to EBML data.
For similar functionality in older perls, give an L<IO::String> object
or the handle to an already C<open>ed scalarref.

=cut
sub open {
    my ($self, $arg) = @_;
    $self->{fh} = openhandle($arg) || IO::File->new($arg, "<:raw")
        or croak "Can't open $arg: $!";
}

=method close

Closes the internal filehandle.

=cut
sub close {
    my ($self) = @_;
    $self->{fh}->close;
    delete $self->{fh};
}

# equivalent to $self->readlen(1), possibly faster
sub _getc {
    my ($self) = @_;
    my $c = $self->{fh}->getc;
    croak "Can't do read of length 1: $!" if !defined $c && $!;
    return $c;
}

=method readlen($length)

Reads C<$length> bytes from the internal filehandle.

=cut
sub readlen {
    my ($self, $len) = @_;
    my $data;
    my $readlen = $self->{fh}->read($data, $len);
    croak "Can't do read of length $len: $!"
                 unless defined $readlen;
    return $data;
}

# converts a byte string into an integer
# we do so by converting the integer into a hex string (big-endian)
# and then reading the hex-string into an integer
sub _bin2int($) {
    my ($bin) = @_;
    # if the length is larger than 3
    # the resulting integer might be larger than INT_MAX
    if (length($bin) > 3) {
        return Math::BigInt->from_hex(unpack("H*", $bin));
    }
    return hex(unpack("H*", $bin));
}

# creates a floating-point number with the given mantissa and exponent
sub _ldexp {
    my ($mantissa, $exponent) = @_;
    my $r = new Math::BigRat($mantissa);
    return $r * Math::BigRat->new(2)**$exponent;
}

# NOTE: the read_* functions are hard to read because they're ports
# of even harder to read python functions.
# TODO: make them readable

=method read_id

Reads an EBML ID atom in hexadecimal string format, suitable
for passing to L<Parse::Matroska::Definitions/elem_by_hexid($id)>.

=cut
sub read_id {
    my ($self) = @_;
    my $t = $self->_getc;
    return undef unless defined $t;
    my $i = 0;
    my $mask = 1<<7;

    if (ord($t) == 0) {
        croak "Matroska Syntax error: first byte of ID was \\0"
    }
    until (ord($t) & $mask) {
        ++$i;
        $mask >>= 1;
    }
    # return hex string of the bytes we just read
    return unpack "H*", ($t . $self->readlen($i));
}

=method read_size

Reads an EBML Data Size atom, which immediately follows
an EBML ID atom.

This returns an array consisting of:

=for :list
0. The length of the Data Size atom.
1. The value encoded in the Data Size atom, which is the length of all the data following it.

=cut
sub read_size {
    my ($self) = @_;
    my $t = $self->_getc;
    my $i = 0;
    my $mask = 1<<7;

    if (ord($t) == 0) {
        croak "Matroska Syntax error: first byte of data size was \\0"
    }
    until (ord($t) & $mask) {
        ++$i;
        $mask >>= 1;
    }
    $t = $t & chr($mask-1); # strip length bits (keep only significant bits)
    return ($i+1, _bin2int $t . $self->readlen($i));
}

=method read_str($length)

Reads a string of length C<$length> bytes from the internal filehandle.
The string is already L<Encode/decode>d from C<UTF-8>, which is the
standard Matroska string encoding.

=cut
{
    my $utf8 = find_encoding("UTF-8");
    sub read_str {
        my ($self, $length) = @_;
        return $utf8->decode($self->readlen($length));
    }
}

=method read_uint($length)

Reads an unsigned integer of length C<$length> bytes
from the internal filehandle.

Returns a L<Math::BigInt> object if C<$length> is greater
than 4.

=cut
sub read_uint {
    my ($self, $length) = @_;
    return _bin2int $self->readlen($length);
}

=method read_sint($length)

Reads a signed integer of length C<$length> bytes
from the internal filehandle.

Returns a L<Math::BigInt> object if C<$length> is greater
than 4.

=cut
sub read_sint {
    my ($self, $length) = @_;
    my $i = $self->read_uint($length);

    # Apply 2's complement to the unsigned int
    my $mask = int(2 ** ($length * 8 - 1));
    # if the most significant bit is set...
    if ($i & $mask) {
        # subtract the MSB twice
        $i -= 2 * $mask;
    }
    return $i;
}

=method read_float($length)

Reads an IEEE floating point number of length C<$length>
bytes from the internal filehandle.

Only lengths C<4> and C<8> are supported (C C<float> and C<double>).

=cut
{
    my $b1 = new Math::BigInt 1;

    sub read_float {
        my ($self, $length) = @_;
        my $i = new Math::BigInt $self->read_uint($length)->bstr;
        my $f;

        # These evil expressions reinterpret an unsigned int as IEEE binary floats
        if ($length == 4) {
            $f = _ldexp(($i & ((1<<23) - 1)) + (1<<23), ($i>>23 & ((1<<8) - 1)) - 150);
            $f = -$f if $i & ($b1<<31);
        } elsif ($length == 8) {
            $f = _ldexp(($i & (($b1<<52) - 1)) + ($b1<<52), ($i>>52 & ((1<<12) - 1)) - 1075);
            $f = -$f if $i & ($b1<<63);
        } else {
            croak "Matroska Syntax error: unsupported IEEE float byte size $length";
        }

        return $f;
    }
}

=method read_ebml_id($length)

Reads an EBML ID when it's encoded as the data inside another
EBML element, that is, when the enclosing element's C<type> is
C<ebml_id>.

This returns a hashref with the EBML element description as
defined in L<Parse::Matroska::Definitions>.

=cut
sub read_ebml_id {
    my ($self, $length) = @_;
    return elem_by_hexid(unpack("H*", $self->readlen($length)));
}

=method skip($length)

Skips C<$length> bytes in the internal filehandle.

=cut
sub skip {
    my ($self, $len) = @_;
    return if $self->{fh}->can('seek') && $self->{fh}->seek($len, 1);
    $self->readlen($len);
    return;
}

=method getpos

Wrapper for L<IO::Seekable/$io-E<gt>getpos> in the internal filehandle.

Returns undef if the internal filehandle can't C<getpos>.

=cut
sub getpos {
    my ($self) = @_;
    return undef unless $self->{fh}->can('getpos');
    return $self->{fh}->getpos;
}

=method setpos($pos)

Wrapper for L<IO::Seekable/$io-E<gt>setpos> in the internal filehandle.

Returns C<undef> if the internal filehandle can't C<setpos>.

Croaks if C<setpos> does not seek to the requested position,
that is, if calling C<getpos> does not yield the same object
as the C<$pos> argument.

=cut
sub setpos {
    my ($self, $pos) = @_;
    return undef unless $pos && $self->{fh}->can('setpos');

    my $ret = $self->{fh}->setpos($pos);
    croak "Cannot seek to correct position"
        unless $self->getpos eq $pos;
    return $ret;
}

=method read_element($read_bin)

Reads a full EBML element from the internal filehandle.

Returns a L<Parse::Matroska::Element> object initialized with
the read data. If C<read_bin> is not present or is false, will
delay-load the contents of C<binary> type elements, that is,
they will only be loaded when calling C<get_value> on the
returned L<Parse::Matroska::Element> object.

Does not read the children of the element if its type is
C<sub>. Look into the L<Parse::Matroska::Element> interface
for details in how to read children elements.

Pass a true C<$read_bin> if the stream being read is not
seekable (C<getpos> is undef) and the contents of C<binary>
elements is desired, otherwise seeking errors or internal
filehandle corruption might occur.

=cut
sub read_element {
    my ($self, $read_bin) = @_;
    return undef if $self->{fh}->eof;

    my $elem_pos = $self->getpos;

    my $elid = $self->read_id;
    my $elem_def = elem_by_hexid($elid);
    my ($size_len, $content_len) = $self->read_size;
    my $full_len = length($elid)/2 + $size_len + $content_len;

    my $elem = Parse::Matroska::Element->new(
        elid => $elid,
        name => $elem_def && $elem_def->{name},
        type => $elem_def && $elem_def->{valtype},
        size_len => $size_len,
        content_len => $content_len,
        full_len => $full_len,
        reader => $self,
        elem_pos => $elem_pos,
        data_pos => $self->getpos,
        );
    weaken($elem->{reader});

    if (defined $elem_def) {
        if ($elem->{type} eq 'sub') {
            $elem->{value} = [];
        } elsif ($elem->{type} eq 'str') {
            $elem->{value} = $self->read_str($content_len);
        } elsif ($elem->{type} eq 'ebml_id') {
            $elem->{value} = $self->read_ebml_id($content_len);
        } elsif ($elem->{type} eq 'uint') {
            $elem->{value} = $self->read_uint($content_len);
        } elsif ($elem->{type} eq 'sint') {
            $elem->{value} = $self->read_sint($content_len);
        } elsif ($elem->{type} eq 'float') {
            $elem->{value} = $self->read_float($content_len);
        } elsif ($elem->{type} eq 'skip') {
            $self->skip($content_len);
        } elsif ($elem->{type} eq 'binary') {
            if ($read_bin) {
                $elem->{value} = $self->readlen($content_len);
            } else {
                $self->skip($content_len);
            }
        } else {
            die "Matroska Definition error: type $elem->{valtype} unknown"
        }
    } else {
        $self->skip($content_len);
    }
    return $elem;
}

1;

=head1 CAVEATS

Children elements have to be processed as soon as an element
with children is found, or their children ignored with
L<Parse::Matroska::Element/skip>. Not doing so doesn't cause
errors but results in an invalid structure, with constant C<0>
depth.

To work correctly in unseekable streams, either the contents
of C<binary>-type elements has to be ignored or the C<read_bin>
flag to C<read_element> has to be true.