mpv/TOOLS/lib/Parse/Matroska/Reader.pm

427 lines
11 KiB
Perl

use 5.008;
use strict;
use warnings;
# ABSTRACT: a low-level reader for EBML files
package Parse::Matroska::Reader;
use Parse::Matroska::Definitions qw{elem_by_hexid};
use Parse::Matroska::Element;
use Carp;
use Scalar::Util qw{openhandle weaken};
use IO::Handle;
use IO::File;
use List::Util qw{first};
use Encode;
use constant BIGINT_TRY => 'Pari,GMP,FastCalc';
use Math::BigInt try => BIGINT_TRY;
use Math::BigRat try => BIGINT_TRY;
=head1 SYNOPSIS
use Parse::Matroska::Reader;
my $reader = Parse::Matroska::Reader->new($path);
$reader->close;
$reader->open(\$string_with_matroska_data);
my $elem = $reader->read_element;
print "Element ID: $elem->{elid}\n";
print "Element name: $elem->{name}\n";
if ($elem->{type} ne 'sub') {
print "Element value: $elem->get_value\n";
} else {
while (my $child = $elem->next_child) {
print "Child element: $child->{name}\n";
}
}
$reader->close;
=head1 DESCRIPTION
Reads EBML data, which is used in Matroska files.
This is a low-level reader which is meant to be used as a backend
for higher level readers. TODO: write the high level readers :)
=head1 NOTE
The API of this module is not yet considered stable.
=method new
Creates a new reader.
Calls L</open($arg)> with its arguments if provided.
=cut
sub new {
my $class = shift;
my $self = {};
bless $self, $class;
$self->open(@_) if @_;
return $self;
}
=method open($arg)
Creates the internal filehandle. The argument can be:
=for :list
* An open filehandle or L<IO::Handle> object.
The filehandle is not C<dup()>ed, so calling L</close> in this
object will close the given filehandle as well.
* A scalar containing a path to a file.
* On perl v5.14 or newer, a scalarref pointing to EBML data.
For similar functionality in older perls, give an L<IO::String> object
or the handle to an already C<open>ed scalarref.
=cut
sub open {
my ($self, $arg) = @_;
$self->{fh} = openhandle($arg) || IO::File->new($arg, "<:raw")
or croak "Can't open $arg: $!";
}
=method close
Closes the internal filehandle.
=cut
sub close {
my ($self) = @_;
$self->{fh}->close;
delete $self->{fh};
}
# equivalent to $self->readlen(1), possibly faster
sub _getc {
my ($self) = @_;
my $c = $self->{fh}->getc;
croak "Can't do read of length 1: $!" if !defined $c && $!;
return $c;
}
=method readlen($length)
Reads C<$length> bytes from the internal filehandle.
=cut
sub readlen {
my ($self, $len) = @_;
my $data;
my $readlen = $self->{fh}->read($data, $len);
croak "Can't do read of length $len: $!"
unless defined $readlen;
return $data;
}
# converts a byte string into an integer
# we do so by converting the integer into a hex string (big-endian)
# and then reading the hex-string into an integer
sub _bin2int($) {
my ($bin) = @_;
# if the length is larger than 3
# the resulting integer might be larger than INT_MAX
if (length($bin) > 3) {
return Math::BigInt->from_hex(unpack("H*", $bin));
}
return hex(unpack("H*", $bin));
}
# creates a floating-point number with the given mantissa and exponent
sub _ldexp {
my ($mantissa, $exponent) = @_;
my $r = new Math::BigRat($mantissa);
return $r * Math::BigRat->new(2)**$exponent;
}
# NOTE: the read_* functions are hard to read because they're ports
# of even harder to read python functions.
# TODO: make them readable
=method read_id
Reads an EBML ID atom in hexadecimal string format, suitable
for passing to L<Parse::Matroska::Definitions/elem_by_hexid($id)>.
=cut
sub read_id {
my ($self) = @_;
my $t = $self->_getc;
return undef unless defined $t;
my $i = 0;
my $mask = 1<<7;
if (ord($t) == 0) {
croak "Matroska Syntax error: first byte of ID was \\0"
}
until (ord($t) & $mask) {
++$i;
$mask >>= 1;
}
# return hex string of the bytes we just read
return unpack "H*", ($t . $self->readlen($i));
}
=method read_size
Reads an EBML Data Size atom, which immediately follows
an EBML ID atom.
This returns an array consisting of:
=for :list
0. The length of the Data Size atom.
1. The value encoded in the Data Size atom, which is the length of all the data following it.
=cut
sub read_size {
my ($self) = @_;
my $t = $self->_getc;
my $i = 0;
my $mask = 1<<7;
if (ord($t) == 0) {
croak "Matroska Syntax error: first byte of data size was \\0"
}
until (ord($t) & $mask) {
++$i;
$mask >>= 1;
}
$t = $t & chr($mask-1); # strip length bits (keep only significant bits)
return ($i+1, _bin2int $t . $self->readlen($i));
}
=method read_str($length)
Reads a string of length C<$length> bytes from the internal filehandle.
The string is already L<Encode/decode>d from C<UTF-8>, which is the
standard Matroska string encoding.
=cut
{
my $utf8 = find_encoding("UTF-8");
sub read_str {
my ($self, $length) = @_;
return $utf8->decode($self->readlen($length));
}
}
=method read_uint($length)
Reads an unsigned integer of length C<$length> bytes
from the internal filehandle.
Returns a L<Math::BigInt> object if C<$length> is greater
than 4.
=cut
sub read_uint {
my ($self, $length) = @_;
return _bin2int $self->readlen($length);
}
=method read_sint($length)
Reads a signed integer of length C<$length> bytes
from the internal filehandle.
Returns a L<Math::BigInt> object if C<$length> is greater
than 4.
=cut
sub read_sint {
my ($self, $length) = @_;
my $i = $self->read_uint($length);
# Apply 2's complement to the unsigned int
my $mask = int(2 ** ($length * 8 - 1));
# if the most significant bit is set...
if ($i & $mask) {
# subtract the MSB twice
$i -= 2 * $mask;
}
return $i;
}
=method read_float($length)
Reads an IEEE floating point number of length C<$length>
bytes from the internal filehandle.
Only lengths C<4> and C<8> are supported (C C<float> and C<double>).
=cut
{
my $b1 = new Math::BigInt 1;
sub read_float {
my ($self, $length) = @_;
my $i = new Math::BigInt $self->read_uint($length)->bstr;
my $f;
# These evil expressions reinterpret an unsigned int as IEEE binary floats
if ($length == 4) {
$f = _ldexp(($i & ((1<<23) - 1)) + (1<<23), ($i>>23 & ((1<<8) - 1)) - 150);
$f = -$f if $i & ($b1<<31);
} elsif ($length == 8) {
$f = _ldexp(($i & (($b1<<52) - 1)) + ($b1<<52), ($i>>52 & ((1<<12) - 1)) - 1075);
$f = -$f if $i & ($b1<<63);
} else {
croak "Matroska Syntax error: unsupported IEEE float byte size $length";
}
return $f;
}
}
=method read_ebml_id($length)
Reads an EBML ID when it's encoded as the data inside another
EBML element, that is, when the enclosing element's C<type> is
C<ebml_id>.
This returns a hashref with the EBML element description as
defined in L<Parse::Matroska::Definitions>.
=cut
sub read_ebml_id {
my ($self, $length) = @_;
return elem_by_hexid(unpack("H*", $self->readlen($length)));
}
=method skip($length)
Skips C<$length> bytes in the internal filehandle.
=cut
sub skip {
my ($self, $len) = @_;
return if $self->{fh}->can('seek') && $self->{fh}->seek($len, 1);
$self->readlen($len);
return;
}
=method getpos
Wrapper for L<IO::Seekable/$io-E<gt>getpos> in the internal filehandle.
Returns undef if the internal filehandle can't C<getpos>.
=cut
sub getpos {
my ($self) = @_;
return undef unless $self->{fh}->can('getpos');
return $self->{fh}->getpos;
}
=method setpos($pos)
Wrapper for L<IO::Seekable/$io-E<gt>setpos> in the internal filehandle.
Returns C<undef> if the internal filehandle can't C<setpos>.
Croaks if C<setpos> does not seek to the requested position,
that is, if calling C<getpos> does not yield the same object
as the C<$pos> argument.
=cut
sub setpos {
my ($self, $pos) = @_;
return undef unless $pos && $self->{fh}->can('setpos');
my $ret = $self->{fh}->setpos($pos);
croak "Cannot seek to correct position"
unless $self->getpos eq $pos;
return $ret;
}
=method read_element($read_bin)
Reads a full EBML element from the internal filehandle.
Returns a L<Parse::Matroska::Element> object initialized with
the read data. If C<read_bin> is not present or is false, will
delay-load the contents of C<binary> type elements, that is,
they will only be loaded when calling C<get_value> on the
returned L<Parse::Matroska::Element> object.
Does not read the children of the element if its type is
C<sub>. Look into the L<Parse::Matroska::Element> interface
for details in how to read children elements.
Pass a true C<$read_bin> if the stream being read is not
seekable (C<getpos> is undef) and the contents of C<binary>
elements is desired, otherwise seeking errors or internal
filehandle corruption might occur.
=cut
sub read_element {
my ($self, $read_bin) = @_;
return undef if $self->{fh}->eof;
my $elem_pos = $self->getpos;
my $elid = $self->read_id;
my $elem_def = elem_by_hexid($elid);
my ($size_len, $content_len) = $self->read_size;
my $full_len = length($elid)/2 + $size_len + $content_len;
my $elem = Parse::Matroska::Element->new(
elid => $elid,
name => $elem_def && $elem_def->{name},
type => $elem_def && $elem_def->{valtype},
size_len => $size_len,
content_len => $content_len,
full_len => $full_len,
reader => $self,
elem_pos => $elem_pos,
data_pos => $self->getpos,
);
weaken($elem->{reader});
if (defined $elem_def) {
if ($elem->{type} eq 'sub') {
$elem->{value} = [];
} elsif ($elem->{type} eq 'str') {
$elem->{value} = $self->read_str($content_len);
} elsif ($elem->{type} eq 'ebml_id') {
$elem->{value} = $self->read_ebml_id($content_len);
} elsif ($elem->{type} eq 'uint') {
$elem->{value} = $self->read_uint($content_len);
} elsif ($elem->{type} eq 'sint') {
$elem->{value} = $self->read_sint($content_len);
} elsif ($elem->{type} eq 'float') {
$elem->{value} = $self->read_float($content_len);
} elsif ($elem->{type} eq 'skip') {
$self->skip($content_len);
} elsif ($elem->{type} eq 'binary') {
if ($read_bin) {
$elem->{value} = $self->readlen($content_len);
} else {
$self->skip($content_len);
}
} else {
die "Matroska Definition error: type $elem->{valtype} unknown"
}
} else {
$self->skip($content_len);
}
return $elem;
}
1;
=head1 CAVEATS
Children elements have to be processed as soon as an element
with children is found, or their children ignored with
L<Parse::Matroska::Element/skip>. Not doing so doesn't cause
errors but results in an invalid structure, with constant C<0>
depth.
To work correctly in unseekable streams, either the contents
of C<binary>-type elements has to be ignored or the C<read_bin>
flag to C<read_element> has to be true.