1
0
mirror of https://github.com/mpv-player/mpv synced 2024-12-09 08:29:42 +00:00
mpv/TOOLS/lib/Parse/Matroska/Reader.pm
Diogo Franco (Kovensky) 4a66fce7d3 Parse::Matroska::Reader: get rid of bigrat
It seems that it was causing issues with certain perl setups (such as
the one on issue #549). It also turns out that it was not behaving correctly
(not all constants were being promoted to big nums as they should), so we
use explicit objects to derive the constants.

There were also precedence issues. I wonder if this even worked right to
begin with.

The 'double' path (8-byte floats) is untested, as I couldn't easily find
a file with such a field.

Closes #549.
2014-02-15 19:43:22 -03:00

427 lines
11 KiB
Perl

use 5.008;
use strict;
use warnings;
# ABSTRACT: a low-level reader for EBML files
package Parse::Matroska::Reader;
use Parse::Matroska::Definitions qw{elem_by_hexid};
use Parse::Matroska::Element;
use Carp;
use Scalar::Util qw{openhandle weaken};
use IO::Handle;
use IO::File;
use List::Util qw{first};
use Encode;
use constant BIGINT_TRY => 'Pari,GMP,FastCalc';
use Math::BigInt try => BIGINT_TRY;
use Math::BigRat try => BIGINT_TRY;
=head1 SYNOPSIS
use Parse::Matroska::Reader;
my $reader = Parse::Matroska::Reader->new($path);
$reader->close;
$reader->open(\$string_with_matroska_data);
my $elem = $reader->read_element;
print "Element ID: $elem->{elid}\n";
print "Element name: $elem->{name}\n";
if ($elem->{type} ne 'sub') {
print "Element value: $elem->get_value\n";
} else {
while (my $child = $elem->next_child) {
print "Child element: $child->{name}\n";
}
}
$reader->close;
=head1 DESCRIPTION
Reads EBML data, which is used in Matroska files.
This is a low-level reader which is meant to be used as a backend
for higher level readers. TODO: write the high level readers :)
=head1 NOTE
The API of this module is not yet considered stable.
=method new
Creates a new reader.
Calls L</open($arg)> with its arguments if provided.
=cut
sub new {
my $class = shift;
my $self = {};
bless $self, $class;
$self->open(@_) if @_;
return $self;
}
=method open($arg)
Creates the internal filehandle. The argument can be:
=for :list
* An open filehandle or L<IO::Handle> object.
The filehandle is not C<dup()>ed, so calling L</close> in this
object will close the given filehandle as well.
* A scalar containing a path to a file.
* On perl v5.14 or newer, a scalarref pointing to EBML data.
For similar functionality in older perls, give an L<IO::String> object
or the handle to an already C<open>ed scalarref.
=cut
sub open {
my ($self, $arg) = @_;
$self->{fh} = openhandle($arg) || IO::File->new($arg, "<:raw")
or croak "Can't open $arg: $!";
}
=method close
Closes the internal filehandle.
=cut
sub close {
my ($self) = @_;
$self->{fh}->close;
delete $self->{fh};
}
# equivalent to $self->readlen(1), possibly faster
sub _getc {
my ($self) = @_;
my $c = $self->{fh}->getc;
croak "Can't do read of length 1: $!" if !defined $c && $!;
return $c;
}
=method readlen($length)
Reads C<$length> bytes from the internal filehandle.
=cut
sub readlen {
my ($self, $len) = @_;
my $data;
my $readlen = $self->{fh}->read($data, $len);
croak "Can't do read of length $len: $!"
unless defined $readlen;
return $data;
}
# converts a byte string into an integer
# we do so by converting the integer into a hex string (big-endian)
# and then reading the hex-string into an integer
sub _bin2int($) {
my ($bin) = @_;
# if the length is larger than 3
# the resulting integer might be larger than INT_MAX
if (length($bin) > 3) {
return Math::BigInt->from_hex(unpack("H*", $bin));
}
return hex(unpack("H*", $bin));
}
# creates a floating-point number with the given mantissa and exponent
sub _ldexp {
my ($mantissa, $exponent) = @_;
my $r = new Math::BigRat($mantissa);
return $r * Math::BigRat->new(2)**$exponent;
}
# NOTE: the read_* functions are hard to read because they're ports
# of even harder to read python functions.
# TODO: make them readable
=method read_id
Reads an EBML ID atom in hexadecimal string format, suitable
for passing to L<Parse::Matroska::Definitions/elem_by_hexid($id)>.
=cut
sub read_id {
my ($self) = @_;
my $t = $self->_getc;
return undef unless defined $t;
my $i = 0;
my $mask = 1<<7;
if (ord($t) == 0) {
croak "Matroska Syntax error: first byte of ID was \\0"
}
until (ord($t) & $mask) {
++$i;
$mask >>= 1;
}
# return hex string of the bytes we just read
return unpack "H*", ($t . $self->readlen($i));
}
=method read_size
Reads an EBML Data Size atom, which immediately follows
an EBML ID atom.
This returns an array consisting of:
=for :list
0. The length of the Data Size atom.
1. The value encoded in the Data Size atom, which is the length of all the data following it.
=cut
sub read_size {
my ($self) = @_;
my $t = $self->_getc;
my $i = 0;
my $mask = 1<<7;
if (ord($t) == 0) {
croak "Matroska Syntax error: first byte of data size was \\0"
}
until (ord($t) & $mask) {
++$i;
$mask >>= 1;
}
$t = $t & chr($mask-1); # strip length bits (keep only significant bits)
return ($i+1, _bin2int $t . $self->readlen($i));
}
=method read_str($length)
Reads a string of length C<$length> bytes from the internal filehandle.
The string is already L<Encode/decode>d from C<UTF-8>, which is the
standard Matroska string encoding.
=cut
{
my $utf8 = find_encoding("UTF-8");
sub read_str {
my ($self, $length) = @_;
return $utf8->decode($self->readlen($length));
}
}
=method read_uint($length)
Reads an unsigned integer of length C<$length> bytes
from the internal filehandle.
Returns a L<Math::BigInt> object if C<$length> is greater
than 4.
=cut
sub read_uint {
my ($self, $length) = @_;
return _bin2int $self->readlen($length);
}
=method read_sint($length)
Reads a signed integer of length C<$length> bytes
from the internal filehandle.
Returns a L<Math::BigInt> object if C<$length> is greater
than 4.
=cut
sub read_sint {
my ($self, $length) = @_;
my $i = $self->read_uint($length);
# Apply 2's complement to the unsigned int
my $mask = int(2 ** ($length * 8 - 1));
# if the most significant bit is set...
if ($i & $mask) {
# subtract the MSB twice
$i -= 2 * $mask;
}
return $i;
}
=method read_float($length)
Reads an IEEE floating point number of length C<$length>
bytes from the internal filehandle.
Only lengths C<4> and C<8> are supported (C C<float> and C<double>).
=cut
{
my $b1 = new Math::BigInt 1;
sub read_float {
my ($self, $length) = @_;
my $i = new Math::BigInt $self->read_uint($length)->bstr;
my $f;
# These evil expressions reinterpret an unsigned int as IEEE binary floats
if ($length == 4) {
$f = _ldexp(($i & ((1<<23) - 1)) + (1<<23), ($i>>23 & ((1<<8) - 1)) - 150);
$f = -$f if $i & ($b1<<31);
} elsif ($length == 8) {
$f = _ldexp(($i & (($b1<<52) - 1)) + ($b1<<52), ($i>>52 & ((1<<12) - 1)) - 1075);
$f = -$f if $i & ($b1<<63);
} else {
croak "Matroska Syntax error: unsupported IEEE float byte size $length";
}
return $f;
}
}
=method read_ebml_id($length)
Reads an EBML ID when it's encoded as the data inside another
EBML element, that is, when the enclosing element's C<type> is
C<ebml_id>.
This returns a hashref with the EBML element description as
defined in L<Parse::Matroska::Definitions>.
=cut
sub read_ebml_id {
my ($self, $length) = @_;
return elem_by_hexid(unpack("H*", $self->readlen($length)));
}
=method skip($length)
Skips C<$length> bytes in the internal filehandle.
=cut
sub skip {
my ($self, $len) = @_;
return if $self->{fh}->can('seek') && $self->{fh}->seek($len, 1);
$self->readlen($len);
return;
}
=method getpos
Wrapper for L<IO::Seekable/$io-E<gt>getpos> in the internal filehandle.
Returns undef if the internal filehandle can't C<getpos>.
=cut
sub getpos {
my ($self) = @_;
return undef unless $self->{fh}->can('getpos');
return $self->{fh}->getpos;
}
=method setpos($pos)
Wrapper for L<IO::Seekable/$io-E<gt>setpos> in the internal filehandle.
Returns C<undef> if the internal filehandle can't C<setpos>.
Croaks if C<setpos> does not seek to the requested position,
that is, if calling C<getpos> does not yield the same object
as the C<$pos> argument.
=cut
sub setpos {
my ($self, $pos) = @_;
return undef unless $pos && $self->{fh}->can('setpos');
my $ret = $self->{fh}->setpos($pos);
croak "Cannot seek to correct position"
unless $self->getpos eq $pos;
return $ret;
}
=method read_element($read_bin)
Reads a full EBML element from the internal filehandle.
Returns a L<Parse::Matroska::Element> object initialized with
the read data. If C<read_bin> is not present or is false, will
delay-load the contents of C<binary> type elements, that is,
they will only be loaded when calling C<get_value> on the
returned L<Parse::Matroska::Element> object.
Does not read the children of the element if its type is
C<sub>. Look into the L<Parse::Matroska::Element> interface
for details in how to read children elements.
Pass a true C<$read_bin> if the stream being read is not
seekable (C<getpos> is undef) and the contents of C<binary>
elements is desired, otherwise seeking errors or internal
filehandle corruption might occur.
=cut
sub read_element {
my ($self, $read_bin) = @_;
return undef if $self->{fh}->eof;
my $elem_pos = $self->getpos;
my $elid = $self->read_id;
my $elem_def = elem_by_hexid($elid);
my ($size_len, $content_len) = $self->read_size;
my $full_len = length($elid)/2 + $size_len + $content_len;
my $elem = Parse::Matroska::Element->new(
elid => $elid,
name => $elem_def && $elem_def->{name},
type => $elem_def && $elem_def->{valtype},
size_len => $size_len,
content_len => $content_len,
full_len => $full_len,
reader => $self,
elem_pos => $elem_pos,
data_pos => $self->getpos,
);
weaken($elem->{reader});
if (defined $elem_def) {
if ($elem->{type} eq 'sub') {
$elem->{value} = [];
} elsif ($elem->{type} eq 'str') {
$elem->{value} = $self->read_str($content_len);
} elsif ($elem->{type} eq 'ebml_id') {
$elem->{value} = $self->read_ebml_id($content_len);
} elsif ($elem->{type} eq 'uint') {
$elem->{value} = $self->read_uint($content_len);
} elsif ($elem->{type} eq 'sint') {
$elem->{value} = $self->read_sint($content_len);
} elsif ($elem->{type} eq 'float') {
$elem->{value} = $self->read_float($content_len);
} elsif ($elem->{type} eq 'skip') {
$self->skip($content_len);
} elsif ($elem->{type} eq 'binary') {
if ($read_bin) {
$elem->{value} = $self->readlen($content_len);
} else {
$self->skip($content_len);
}
} else {
die "Matroska Definition error: type $elem->{valtype} unknown"
}
} else {
$self->skip($content_len);
}
return $elem;
}
1;
=head1 CAVEATS
Children elements have to be processed as soon as an element
with children is found, or their children ignored with
L<Parse::Matroska::Element/skip>. Not doing so doesn't cause
errors but results in an invalid structure, with constant C<0>
depth.
To work correctly in unseekable streams, either the contents
of C<binary>-type elements has to be ignored or the C<read_bin>
flag to C<read_element> has to be true.