mpv/stream/stream_libarchive.c

624 lines
19 KiB
C
Raw Normal View History

/*
* This file is part of mpv.
*
* mpv is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* mpv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
*/
#include <archive.h>
#include <archive_entry.h>
#include "misc/bstr.h"
#include "common/common.h"
#include "misc/thread_tools.h"
#include "stream.h"
#include "stream_libarchive.h"
#define MP_ARCHIVE_FLAG_MAYBE_ZIP (MP_ARCHIVE_FLAG_PRIV << 0)
#define MP_ARCHIVE_FLAG_MAYBE_RAR (MP_ARCHIVE_FLAG_PRIV << 1)
#define MP_ARCHIVE_FLAG_MAYBE_VOLUMES (MP_ARCHIVE_FLAG_PRIV << 2)
struct mp_archive_volume {
struct mp_archive *mpa;
int index; // volume number (starting with 0, mp_archive.primary_src)
struct stream *src; // NULL => not current volume, or 0 sized dummy stream
int64_t seek_to;
char *url;
};
stream_libarchive: fix unnecessarily opening all volumes on opening Seems like I'm still not done with rar playback stuff... It turns out the reason for archive_read_open1() opening all volumes had nothing to do with libarchive's rar code, but was a consequence of how multi volume support is implemented in libarchive, and due to the fact that we enabled archive_read_support_format_zip_seekable() (through archive_read_support_format_zip()). The seekable zip format will seek to the end of the file and search for a zip "header" there. It could possibly be considered a libarchive bug that it does that even if it's fairly sure that it's a RAR file. We already do probing on a small buffer read from the start of the file (i.e. not giving libarchive a way to seek the stream before we think it's an archive), but that does not help, since libarchive needs to probe _again_. libarchive does not seem to provide a function to query the format (no archive_read_get_format()). Which seems quite strange, but at least I didn't find one. This commit works this around by doing some manual rar/zip probing. We could have gone only with rar probing. But detecting zip separately allows us to avoid that stream_libarchive seeks to the end during early probing. This is an additional bonus on top of "fixing" multi volume rar. The zip probing is from archive_read_format_zip_streamable_bid(). The rar signature is the common prefix of the rar and rar5 formats in libarchive (presumably the RAR fixed header parts without version). If the demuxer seeks to the end of the rar entry, this will still open all volumes; I'm not sure whether the old/removed rar code in mpv could handle this better. See: #7182
2020-01-09 01:25:13 +00:00
static bool probe_rar(struct stream *s)
{
static uint8_t rar_sig[] = {0x52, 0x61, 0x72, 0x21, 0x1a, 0x07};
uint8_t buf[6];
if (stream_read_peek(s, buf, sizeof(buf)) != sizeof(buf))
return false;
return memcmp(buf, rar_sig, 6) == 0;
}
static bool probe_multi_rar(struct stream *s)
{
uint8_t hdr[14];
if (stream_read_peek(s, hdr, sizeof(hdr)) == sizeof(hdr)) {
// Look for rar mark head & main head (assume they're in order).
if (hdr[6] == 0x00 && hdr[7 + 2] == 0x73) {
int rflags = hdr[7 + 3] | (hdr[7 + 4] << 8);
return rflags & 0x100;
}
}
return false;
}
stream_libarchive: fix unnecessarily opening all volumes on opening Seems like I'm still not done with rar playback stuff... It turns out the reason for archive_read_open1() opening all volumes had nothing to do with libarchive's rar code, but was a consequence of how multi volume support is implemented in libarchive, and due to the fact that we enabled archive_read_support_format_zip_seekable() (through archive_read_support_format_zip()). The seekable zip format will seek to the end of the file and search for a zip "header" there. It could possibly be considered a libarchive bug that it does that even if it's fairly sure that it's a RAR file. We already do probing on a small buffer read from the start of the file (i.e. not giving libarchive a way to seek the stream before we think it's an archive), but that does not help, since libarchive needs to probe _again_. libarchive does not seem to provide a function to query the format (no archive_read_get_format()). Which seems quite strange, but at least I didn't find one. This commit works this around by doing some manual rar/zip probing. We could have gone only with rar probing. But detecting zip separately allows us to avoid that stream_libarchive seeks to the end during early probing. This is an additional bonus on top of "fixing" multi volume rar. The zip probing is from archive_read_format_zip_streamable_bid(). The rar signature is the common prefix of the rar and rar5 formats in libarchive (presumably the RAR fixed header parts without version). If the demuxer seeks to the end of the rar entry, this will still open all volumes; I'm not sure whether the old/removed rar code in mpv could handle this better. See: #7182
2020-01-09 01:25:13 +00:00
static bool probe_zip(struct stream *s)
{
uint8_t p[4];
if (stream_read_peek(s, p, sizeof(p)) != sizeof(p))
return false;
// Lifted from libarchive, BSD license.
if (p[0] == 'P' && p[1] == 'K') {
if ((p[2] == '\001' && p[3] == '\002') ||
(p[2] == '\003' && p[3] == '\004') ||
(p[2] == '\005' && p[3] == '\006') ||
(p[2] == '\006' && p[3] == '\006') ||
(p[2] == '\007' && p[3] == '\010') ||
(p[2] == '0' && p[3] == '0'))
return true;
}
return false;
}
static int mp_archive_probe(struct stream *src)
{
int flags = 0;
assert(stream_tell(src) == 0);
if (probe_zip(src))
flags |= MP_ARCHIVE_FLAG_MAYBE_ZIP;
if (probe_rar(src)) {
flags |= MP_ARCHIVE_FLAG_MAYBE_RAR;
if (probe_multi_rar(src))
flags |= MP_ARCHIVE_FLAG_MAYBE_VOLUMES;
}
return flags;
}
static bool volume_seek(struct mp_archive_volume *vol)
{
if (!vol->src || vol->seek_to < 0)
return true;
bool r = stream_seek(vol->src, vol->seek_to);
vol->seek_to = -1;
return r;
}
static ssize_t read_cb(struct archive *arch, void *priv, const void **buffer)
{
struct mp_archive_volume *vol = priv;
if (!vol->src)
return 0;
if (!volume_seek(vol))
return -1;
int res = stream_read_partial(vol->src, vol->mpa->buffer,
sizeof(vol->mpa->buffer));
*buffer = vol->mpa->buffer;
return MPMAX(res, 0);
}
// lazy seek to avoid problems with end seeking over http
static int64_t seek_cb(struct archive *arch, void *priv,
int64_t offset, int whence)
{
struct mp_archive_volume *vol = priv;
if (!vol->src)
return 0;
switch (whence) {
case SEEK_SET:
vol->seek_to = offset;
break;
case SEEK_CUR:
if (vol->seek_to < 0)
vol->seek_to = stream_tell(vol->src);
vol->seek_to += offset;
break;
case SEEK_END: ;
int64_t size = stream_get_size(vol->src);
if (size < 0)
return -1;
vol->seek_to = size + offset;
break;
default:
return -1;
}
return vol->seek_to;
}
static int64_t skip_cb(struct archive *arch, void *priv, int64_t request)
{
struct mp_archive_volume *vol = priv;
if (!vol->src)
return request;
if (!volume_seek(vol))
return -1;
int64_t old = stream_tell(vol->src);
stream_seek_skip(vol->src, old + request);
return stream_tell(vol->src) - old;
}
static int open_cb(struct archive *arch, void *priv)
{
struct mp_archive_volume *vol = priv;
vol->seek_to = -1;
if (!vol->src) {
// Avoid annoying warnings/latency for known dummy volumes.
if (vol->index >= vol->mpa->num_volumes)
return ARCHIVE_OK;
MP_INFO(vol->mpa, "Opening volume '%s'...\n", vol->url);
stream, demux: redo origin policy thing mpv has a very weak and very annoying policy that determines whether a playlist should be used or not. For example, if you play a remote playlist, you usually don't want it to be able to read local filesystem entries. (Although for a media player the impact is small I guess.) It's weak and annoying as in that it does not prevent certain cases which could be interpreted as bad in some cases, such as allowing playlists on the local filesystem to reference remote URLs. It probably barely makes sense, but we just want to exclude some other "definitely not a good idea" things, all while playlists generally just work, so whatever. The policy is: - from the command line anything is played - local playlists can reference anything except "unsafe" streams ("unsafe" means special stream inputs like libavfilter graphs) - remote playlists can reference only remote URLs - things like "memory://" and archives are "transparent" to this This commit does... something. It replaces the weird stream flags with a slightly clearer "origin" value, which is now consequently passed down and used everywhere. It fixes some deviations from the described policy. I wanted to force archives to reference only content within them, but this would probably have been more complicated (or required different abstractions), and I'm too lazy to figure it out, so archives are now "transparent" (playlists within archives behave the same outside). There may be a lot of bugs in this. This is unfortunately a very noisy commit because: - every stream open call now needs to pass the origin - so does every demuxer open call (=> params param. gets mandatory) - most stream were changed to provide the "origin" value - the origin value needed to be passed along in a lot of places - I was too lazy to split the commit Fixes: #7274
2019-12-20 08:41:42 +00:00
vol->src = stream_create(vol->url,
STREAM_READ |
vol->mpa->primary_src->stream_origin,
vol->mpa->primary_src->cancel,
vol->mpa->primary_src->global);
// We pretend that failure to open a stream means it was not found,
// we assume in turn means that the volume doesn't exist (since
// libarchive builds volumes as some sort of abstraction on top of its
// stream layer, and its rar code cannot access volumes or signal
// anything related to this). libarchive also encounters a fatal error
// when a volume could not be opened. However, due to the way volume
// support works, it is fine with 0-sized volumes, which we simulate
// whenever vol->src==NULL for an opened volume.
if (!vol->src) {
vol->mpa->num_volumes = MPMIN(vol->mpa->num_volumes, vol->index);
MP_INFO(vol->mpa, "Assuming the volume above was not needed.\n");
}
return ARCHIVE_OK;
}
// just rewind the primary stream
return stream_seek(vol->src, 0) ? ARCHIVE_OK : ARCHIVE_FATAL;
}
static void volume_close(struct mp_archive_volume *vol)
{
// don't close the primary stream
if (vol->src && vol->src != vol->mpa->primary_src) {
free_stream(vol->src);
vol->src = NULL;
}
}
static int close_cb(struct archive *arch, void *priv)
{
struct mp_archive_volume *vol = priv;
volume_close(vol);
return ARCHIVE_OK;
}
static void mp_archive_close(struct mp_archive *mpa)
{
if (mpa && mpa->arch) {
archive_read_close(mpa->arch);
archive_read_free(mpa->arch);
mpa->arch = NULL;
}
}
// Supposedly we're not allowed to continue reading on FATAL returns. Otherwise
// crashes and other UB is possible. Assume calling the close/free functions is
// still ok. Return true if it was fatal and the archive was closed.
static bool mp_archive_check_fatal(struct mp_archive *mpa, int r)
{
if (r > ARCHIVE_FATAL)
return false;
MP_FATAL(mpa, "fatal error received - closing archive\n");
mp_archive_close(mpa);
return true;
}
void mp_archive_free(struct mp_archive *mpa)
{
mp_archive_close(mpa);
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
if (mpa && mpa->locale)
freelocale(mpa->locale);
talloc_free(mpa);
}
static bool add_volume(struct mp_archive *mpa, struct stream *src,
const char* url, int index)
{
struct mp_archive_volume *vol = talloc_zero(mpa, struct mp_archive_volume);
vol->index = index;
vol->mpa = mpa;
vol->src = src;
vol->url = talloc_strdup(vol, url);
locale_t oldlocale = uselocale(mpa->locale);
bool res = archive_read_append_callback_data(mpa->arch, vol) == ARCHIVE_OK;
uselocale(oldlocale);
return res;
}
static char *standard_volume_url(void *ctx, const char *format,
struct bstr base, int index)
{
return talloc_asprintf(ctx, format, BSTR_P(base), index);
}
static char *old_rar_volume_url(void *ctx, const char *format,
struct bstr base, int index)
{
return talloc_asprintf(ctx, format, BSTR_P(base),
'r' + index / 100, index % 100);
}
struct file_pattern {
const char *match;
const char *format;
char *(*volume_url)(void *ctx, const char *format,
struct bstr base, int index);
int start;
int stop;
bool legacy;
};
static const struct file_pattern patterns[] = {
{ ".part1.rar", "%.*s.part%.1d.rar", standard_volume_url, 2, 9 },
{ ".part01.rar", "%.*s.part%.2d.rar", standard_volume_url, 2, 99 },
{ ".part001.rar", "%.*s.part%.3d.rar", standard_volume_url, 2, 999 },
{ ".part0001.rar", "%.*s.part%.4d.rar", standard_volume_url, 2, 9999 },
{ ".rar", "%.*s.%c%.2d", old_rar_volume_url, 0, 99, true },
{ ".001", "%.*s.%.3d", standard_volume_url, 2, 9999 },
{ NULL, NULL, NULL, 0, 0 },
};
static bool find_volumes(struct mp_archive *mpa, int flags)
{
struct bstr primary_url = bstr0(mpa->primary_src->url);
const struct file_pattern *pattern = patterns;
while (pattern->match) {
if (bstr_endswith0(primary_url, pattern->match))
break;
pattern++;
}
if (!pattern->match)
return true;
if (pattern->legacy && !(flags & MP_ARCHIVE_FLAG_MAYBE_VOLUMES))
return true;
struct bstr base = bstr_splice(primary_url, 0, -(int)strlen(pattern->match));
for (int i = pattern->start; i <= pattern->stop; i++) {
char* url = pattern->volume_url(mpa, pattern->format, base, i);
if (!add_volume(mpa, NULL, url, i + 1))
return false;
}
MP_WARN(mpa, "This appears to be a multi-volume archive.\n"
"Support is not very good due to libarchive limitations.\n"
"There are known cases of libarchive crashing mpv on these.\n"
"This is also an excessively inefficient and stupid way to distribute\n"
"media files. People creating them should rethink this.\n");
return true;
}
static struct mp_archive *mp_archive_new_raw(struct mp_log *log,
struct stream *src,
int flags, int max_volumes)
{
struct mp_archive *mpa = talloc_zero(NULL, struct mp_archive);
mpa->log = log;
mpa->locale = newlocale(LC_CTYPE_MASK, "C.UTF-8", (locale_t)0);
if (!mpa->locale) {
mpa->locale = newlocale(LC_CTYPE_MASK, "", (locale_t)0);
if (!mpa->locale)
goto err;
}
mpa->arch = archive_read_new();
mpa->primary_src = src;
if (!mpa->arch)
goto err;
mpa->flags = flags;
mpa->num_volumes = max_volumes ? max_volumes : INT_MAX;
// first volume is the primary stream
if (!add_volume(mpa, src, src->url, 0))
goto err;
if (!(flags & MP_ARCHIVE_FLAG_NO_VOLUMES)) {
// try to open other volumes
if (!find_volumes(mpa, flags))
goto err;
}
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
locale_t oldlocale = uselocale(mpa->locale);
archive_read_support_format_rar(mpa->arch);
archive_read_support_format_rar5(mpa->arch);
stream_libarchive: fix unnecessarily opening all volumes on opening Seems like I'm still not done with rar playback stuff... It turns out the reason for archive_read_open1() opening all volumes had nothing to do with libarchive's rar code, but was a consequence of how multi volume support is implemented in libarchive, and due to the fact that we enabled archive_read_support_format_zip_seekable() (through archive_read_support_format_zip()). The seekable zip format will seek to the end of the file and search for a zip "header" there. It could possibly be considered a libarchive bug that it does that even if it's fairly sure that it's a RAR file. We already do probing on a small buffer read from the start of the file (i.e. not giving libarchive a way to seek the stream before we think it's an archive), but that does not help, since libarchive needs to probe _again_. libarchive does not seem to provide a function to query the format (no archive_read_get_format()). Which seems quite strange, but at least I didn't find one. This commit works this around by doing some manual rar/zip probing. We could have gone only with rar probing. But detecting zip separately allows us to avoid that stream_libarchive seeks to the end during early probing. This is an additional bonus on top of "fixing" multi volume rar. The zip probing is from archive_read_format_zip_streamable_bid(). The rar signature is the common prefix of the rar and rar5 formats in libarchive (presumably the RAR fixed header parts without version). If the demuxer seeks to the end of the rar entry, this will still open all volumes; I'm not sure whether the old/removed rar code in mpv could handle this better. See: #7182
2020-01-09 01:25:13 +00:00
// Exclude other formats if it's probably RAR, because other formats may
// behave suboptimal with multiple volumes exposed, such as opening every
// single volume by seeking at the end of the file.
if (!(flags & MP_ARCHIVE_FLAG_MAYBE_RAR)) {
stream_libarchive: fix unnecessarily opening all volumes on opening Seems like I'm still not done with rar playback stuff... It turns out the reason for archive_read_open1() opening all volumes had nothing to do with libarchive's rar code, but was a consequence of how multi volume support is implemented in libarchive, and due to the fact that we enabled archive_read_support_format_zip_seekable() (through archive_read_support_format_zip()). The seekable zip format will seek to the end of the file and search for a zip "header" there. It could possibly be considered a libarchive bug that it does that even if it's fairly sure that it's a RAR file. We already do probing on a small buffer read from the start of the file (i.e. not giving libarchive a way to seek the stream before we think it's an archive), but that does not help, since libarchive needs to probe _again_. libarchive does not seem to provide a function to query the format (no archive_read_get_format()). Which seems quite strange, but at least I didn't find one. This commit works this around by doing some manual rar/zip probing. We could have gone only with rar probing. But detecting zip separately allows us to avoid that stream_libarchive seeks to the end during early probing. This is an additional bonus on top of "fixing" multi volume rar. The zip probing is from archive_read_format_zip_streamable_bid(). The rar signature is the common prefix of the rar and rar5 formats in libarchive (presumably the RAR fixed header parts without version). If the demuxer seeks to the end of the rar entry, this will still open all volumes; I'm not sure whether the old/removed rar code in mpv could handle this better. See: #7182
2020-01-09 01:25:13 +00:00
archive_read_support_format_7zip(mpa->arch);
archive_read_support_format_iso9660(mpa->arch);
archive_read_support_filter_bzip2(mpa->arch);
archive_read_support_filter_gzip(mpa->arch);
archive_read_support_filter_xz(mpa->arch);
archive_read_support_format_zip_streamable(mpa->arch);
// This zip reader is normally preferable. However, it seeks to the end
// of the file, which may be annoying (HTTP reconnect, volume skipping),
// so use it only as last resort, or if it's relatively likely that it's
// really zip.
if (flags & (MP_ARCHIVE_FLAG_UNSAFE | MP_ARCHIVE_FLAG_MAYBE_ZIP))
stream_libarchive: fix unnecessarily opening all volumes on opening Seems like I'm still not done with rar playback stuff... It turns out the reason for archive_read_open1() opening all volumes had nothing to do with libarchive's rar code, but was a consequence of how multi volume support is implemented in libarchive, and due to the fact that we enabled archive_read_support_format_zip_seekable() (through archive_read_support_format_zip()). The seekable zip format will seek to the end of the file and search for a zip "header" there. It could possibly be considered a libarchive bug that it does that even if it's fairly sure that it's a RAR file. We already do probing on a small buffer read from the start of the file (i.e. not giving libarchive a way to seek the stream before we think it's an archive), but that does not help, since libarchive needs to probe _again_. libarchive does not seem to provide a function to query the format (no archive_read_get_format()). Which seems quite strange, but at least I didn't find one. This commit works this around by doing some manual rar/zip probing. We could have gone only with rar probing. But detecting zip separately allows us to avoid that stream_libarchive seeks to the end during early probing. This is an additional bonus on top of "fixing" multi volume rar. The zip probing is from archive_read_format_zip_streamable_bid(). The rar signature is the common prefix of the rar and rar5 formats in libarchive (presumably the RAR fixed header parts without version). If the demuxer seeks to the end of the rar entry, this will still open all volumes; I'm not sure whether the old/removed rar code in mpv could handle this better. See: #7182
2020-01-09 01:25:13 +00:00
archive_read_support_format_zip_seekable(mpa->arch);
}
archive_read_set_read_callback(mpa->arch, read_cb);
archive_read_set_skip_callback(mpa->arch, skip_cb);
archive_read_set_open_callback(mpa->arch, open_cb);
// Allow it to close a volume.
archive_read_set_close_callback(mpa->arch, close_cb);
if (mpa->primary_src->seekable)
archive_read_set_seek_callback(mpa->arch, seek_cb);
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
bool fail = archive_read_open1(mpa->arch) < ARCHIVE_OK;
uselocale(oldlocale);
if (fail)
goto err;
return mpa;
err:
mp_archive_free(mpa);
return NULL;
}
struct mp_archive *mp_archive_new(struct mp_log *log, struct stream *src,
int flags, int max_volumes)
{
flags |= mp_archive_probe(src);
return mp_archive_new_raw(log, src, flags, max_volumes);
}
// Iterate entries. The first call establishes the first entry. Returns false
// if no entry found, otherwise returns true and sets mpa->entry/entry_filename.
bool mp_archive_next_entry(struct mp_archive *mpa)
{
mpa->entry = NULL;
talloc_free(mpa->entry_filename);
mpa->entry_filename = NULL;
if (!mpa->arch)
return false;
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
locale_t oldlocale = uselocale(mpa->locale);
bool success = false;
while (!mp_cancel_test(mpa->primary_src->cancel)) {
struct archive_entry *entry;
int r = archive_read_next_header(mpa->arch, &entry);
if (r == ARCHIVE_EOF)
break;
if (r < ARCHIVE_OK)
MP_ERR(mpa, "%s\n", archive_error_string(mpa->arch));
if (r < ARCHIVE_WARN) {
MP_FATAL(mpa, "could not read archive entry\n");
mp_archive_check_fatal(mpa, r);
break;
}
if (archive_entry_filetype(entry) != AE_IFREG)
continue;
// Some archives may have no filenames, or libarchive won't return some.
const char *fn = archive_entry_pathname(entry);
char buf[64];
if (!fn || bstr_validate_utf8(bstr0(fn)) < 0) {
snprintf(buf, sizeof(buf), "mpv_unknown#%d", mpa->entry_num);
fn = buf;
}
mpa->entry = entry;
mpa->entry_filename = talloc_strdup(mpa, fn);
mpa->entry_num += 1;
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
success = true;
break;
}
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
uselocale(oldlocale);
return success;
}
struct priv {
struct mp_archive *mpa;
bool broken_seek;
struct stream *src;
int64_t entry_size;
char *entry_name;
};
static int reopen_archive(stream_t *s)
{
struct priv *p = s->priv;
s->pos = 0;
if (!p->mpa) {
p->mpa = mp_archive_new(s->log, p->src, MP_ARCHIVE_FLAG_UNSAFE, 0);
} else {
int flags = p->mpa->flags;
int num_volumes = p->mpa->num_volumes;
mp_archive_free(p->mpa);
p->mpa = mp_archive_new_raw(s->log, p->src, flags, num_volumes);
}
if (!p->mpa)
return STREAM_ERROR;
// Follows the same logic as demux_libarchive.c.
struct mp_archive *mpa = p->mpa;
while (mp_archive_next_entry(mpa)) {
if (strcmp(p->entry_name, mpa->entry_filename) == 0) {
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
locale_t oldlocale = uselocale(mpa->locale);
p->entry_size = -1;
if (archive_entry_size_is_set(mpa->entry))
p->entry_size = archive_entry_size(mpa->entry);
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
uselocale(oldlocale);
return STREAM_OK;
}
}
mp_archive_free(p->mpa);
p->mpa = NULL;
MP_ERR(s, "archive entry not found. '%s'\n", p->entry_name);
return STREAM_ERROR;
}
static int archive_entry_fill_buffer(stream_t *s, void *buffer, int max_len)
{
struct priv *p = s->priv;
if (!p->mpa)
return 0;
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
locale_t oldlocale = uselocale(p->mpa->locale);
int r = archive_read_data(p->mpa->arch, buffer, max_len);
if (r < 0) {
MP_ERR(s, "%s\n", archive_error_string(p->mpa->arch));
if (mp_archive_check_fatal(p->mpa, r)) {
mp_archive_free(p->mpa);
p->mpa = NULL;
}
}
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
uselocale(oldlocale);
return r;
}
static int archive_entry_seek(stream_t *s, int64_t newpos)
{
struct priv *p = s->priv;
if (p->mpa && !p->broken_seek) {
locale_t oldlocale = uselocale(p->mpa->locale);
int r = archive_seek_data(p->mpa->arch, newpos, SEEK_SET);
uselocale(oldlocale);
if (r >= 0)
return 1;
MP_WARN(s, "possibly unsupported seeking - switching to reopening\n");
p->broken_seek = true;
if (reopen_archive(s) < STREAM_OK)
return -1;
}
// libarchive can't seek in most formats.
if (newpos < s->pos) {
// Hack seeking backwards into working by reopening the archive and
// starting over.
MP_VERBOSE(s, "trying to reopen archive for performing seek\n");
if (reopen_archive(s) < STREAM_OK)
return -1;
}
if (newpos > s->pos) {
if (!p->mpa && reopen_archive(s) < STREAM_OK)
return -1;
// For seeking forwards, just keep reading data (there's no libarchive
// skip function either).
char buffer[4096];
while (newpos > s->pos) {
if (mp_cancel_test(s->cancel))
return -1;
int size = MPMIN(newpos - s->pos, sizeof(buffer));
locale_t oldlocale = uselocale(p->mpa->locale);
int r = archive_read_data(p->mpa->arch, buffer, size);
if (r <= 0) {
if (r == 0 && newpos > p->entry_size) {
MP_ERR(s, "demuxer trying to seek beyond end of archive "
"entry\n");
} else if (r == 0) {
MP_ERR(s, "end of archive entry reached while seeking\n");
} else {
MP_ERR(s, "%s\n", archive_error_string(p->mpa->arch));
}
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
uselocale(oldlocale);
if (mp_archive_check_fatal(p->mpa, r)) {
mp_archive_free(p->mpa);
p->mpa = NULL;
}
return -1;
}
stream_libarchive: workaround various types of locale braindeath Fix that libarchive fails to return filenames for UTF-8/UTF-16 entries. The reason is that it uses locales and all that garbage, and mpv does not set a locale. Both C locales and wchar_t are shitfucked retarded legacy braindeath. If the C/POSIX standard committee had actually competent members, these would have been deprecated or removed long ago. (I mean, they managed to remove gets().) To justify this emotional outbreak potentially insulting to unknown persons, I will write a lot of text. Those not comfortable with toxic language should pretend this is a religious text. C locales are supposed to be a way to support certain languages and cultures easier. One example are character codepages. Back when UTF-8 was not invented yet, there were only 255 possible characters, which is not enough for anything but English and some european languages. So they decided to make the meaning of a character dependent on the current codepage. The locale (LC_CTYPE specifically) determines what character encoding is currently used. Of course nowadays, this is legacy nonsense. Everything uses UTF-8 for "char", and what doesn't is broken and terrible anyway. But the old ways stayed with us, and the stupidity of it as well. C locales were utterly moronic even when they were invented. The locale (via setlocale()) is global state, and global state is not a reasonable way to do anything. It will break libraries, or well modularized code. (The latter would be forced to strictly guard all entrypoints set set/restore locales, assuming a single threaded world.) On top of that, setting a locale randomly changes the semantics of a bunch of standard functions. If a function respects locale, you suddenly can't rely on it to behave the same on all systems. Some behavior can come as a surprise, and of course it will be dependent on the region of the user (it doesn't help that most software is US-centric, and the US locale is almost like the C locale, i.e. almost what you expect). Idiotically, locales were not just used to define the current character encoding, but the concept was used for a whole lot of things, like e. g. whether numbers should use "," or "." as decimal separaror. The latter issue is actually much worse, because it breaks basic string conversion or parsing of numbers for the purpose of interacting with file formats and such. Much can be said about how retarded locales are, even beyond what I just wrote, or will wrote below. They are so hilariously misdesigned and insufficient, I can't even fathom how this shit was _standardized_. (In any case, that meant everyone was forced to implement it.) Many C functions can't even do it correctly. For example, the character set encoding can be a multibyte encoding (not just UTF-8, but awful garbage like Shift JIS (sometimes called SHIT JIZZ), yet functions like toupper() can return only 1 byte. Or just take the fact that the locale API tries to define standard paper sizes (LC_PAPER) or telephone number formatting (LC_TELEPHONE). Who the fuck uses this, or would ever use this? But the badness doesn't stop here. At some point, they invented threads. And they put absolutely no thought into how threads should interact with locales. So they kept locales as global state. Because obviously, you want to be able to change the semantics of basic string processing functions _while_ they're running, right? (Any thread can call setlocale() at any time, and it's supposed to change the locale of all other threads.) At this point, how the fuck are you supposed to do anything correctly? You can't even temporarily switch the locale with setlocale(), because it would asynchronously fuckup the other threads. All you can do is to enforce a convention not to set anything but the C local (this is what mpv does), or to duplicate standard functions using code that doesn't query locale (this is what e.g. libass does, a close dependency of mpv). Imagine they had done this for certain other things. Like errno, with all the brokenness of the locale API. This simply wouldn't have worked, shit would just have been too broken. So they didn't. But locales give a delicious sweet spot of brokenness, where things are broken enough to cause neverending pain, but not broken enough that enough effort would have spent to fix it completely. On that note, standard C11 actually can't stringify an error value. It does define strerror(), but it's not thread safe, even though C11 supports threads. The idiots could just have defined it to be thread safe. Even if your libc is horrible enough that it can't return string literals, it could just just some thread local buffer. Because C11 does define thread local variables. But hey, why care about details, if you can just create a shitty standard? (POSIX defines strerror_r(), which "solves" this problem, while still not making strerror() thread safe.) Anyway, back to threads. The interaction of locales and threads makes no sense. Why would you make locales process global? Who even wanted it to work this way? Who decided that it should keep working this way, despite being so broken (and certainly causing implementation difficulties in libc)? Was it just a fucked up psychopath? Several decades later, the moronic standard committees noticed that this was (still is) kind of a bad situation. Instead of fixing the situation, they added more garbage on top of it. (Probably for the sake of "compatibility"). Now there is a set of new functions, which allow you to override the locale for the current thread. This means you can temporarily override and restore the local on all entrypoints of your code (like you could with setlocale(), before threads were invented). And of course not all operating systems or libcs implement this. For example, I'm pretty sure Microsoft doesn't. (Microsoft got to fuck it up as usual, and only provides _configthreadlocale(). This is shitfucked on its own, because it's GLOBAL STATE to configure that GLOBAL STATE should not be GLOBAL STATE, i.e. completely broken garbage, because it requires agreement over all modules/libraries what behavior should be used. I mean, sure, makign setlocale() affect only the current thread would have been the reasonable behavior. Making this behavior configurable isn't, because you can't rely on what behavior is active.) POSIX showed some minor decency by at least introducing some variations of standard functions, which have a locale argument (e.g. toupper_l()). You just pass the locale which you want to be used, and don't have to do the set locale/call function/restore locale nonense. But OF COURSE they fucked this up too. In no less than 2 ways: - There is no statically available handle for the C locale, so you have to initialize and store it somewhere, which makes it harder to make utility functions safe, that call locale-affected standard functions and expect C semantics. The easy solution, using pthread_once() and a global variable with the created locale, will not be easily accepted by pedantic assholes, because they'll worry about allocation failure, or leaking the locale when using this in library code (and then unloading the library). Or you could have complicated library init/uninit functions, which bring a big load of their own mess. Same for automagic DLL constructors/destructors. - Not all functions have a variant that takes a locale argument, and they missed even some important ones, like snprintf() or strtod() WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK WHAT THE FUCK I would like to know why it took so long to standardize a half-assed solution, that, apart from being conceptually half-assed, is even incomplete and insufficient. The obvious way to fix this would have been: - deprecate the entire locale API and their use, and make it a NOP - make UTF-8 the standard character type - make the C locale behavior the default - add new APIs that explicitly take locale objects - provide an emulation layer, that can be used to transparently build legacy code without breaking them But this wouldn't have been "compatible", and the apparently incompetent standard committees would have never accepted this. As if anyone actually used this legacy garbage, except other legacy garbage. Oh yeah, and let's care a lot about legacy compatibility, and let's not care at all about modern code that either has to suffer from this, or subtly breaks when the wrong locales are active. Last but not least, the UTF-8 locale name is apparently not even standardized. At the moment I'm trying to use "C.UTF-8", which is apparently glibc _and_ Debian specific. Got to use every opportunity to make correct usage of UTF-8 harder. What luck that this commit is only for some optional relatively obscure mpv feature. Why is the C locale not UTF-8? Why did POSIX not standardize an UTF-8 locale? Well, according to something I heard a few years ago, they're considering disallowing UTF-8 as locale, because UTF-8 would violate certain ivnariants expected by C or POSIX. (But I'm not sure if I remember this correctly - probably better not to rage about it.) Now, on to libarchive. libarchive intentionally uses the locale API and all the broken crap around it to "convert" UTF-8 or UTF-16 (as contained in reasonably sane archive formats) to "char*". This is a good start! Since glibc does not think that the C locale uses UTF-8, this fails for mpv. So trying to use archive_entry_pathname() to get the archive entry name fails if the name contains non-ASCII characters. Maybe use archive_entry_pathname_utf8()? Surely that should return UTF-8, since its name seems to indicate that it returns UTF-8. But of fucking course it doesn't! libarchive's horribly convoluted code (that is full of locale API usage and other legacy shit, as well as ifdefs and OS specific code, including Windows and fucking Cygwin) somehow fucks up and fails if the locale is not set to UTF-8. I made a PR fixing this in libarchive almost 2 years ago, but it was ignored. So, would archive_entry_pathname_w() as fallback work? No, why would it? Of course this _also_ involves shitfucked code that calls shitfucked standard functions (or OS specific ifdeffed shitfuck). The truth is that at least glibc changes the meaning of wchar_t depending on the locale. Unlike most people think, wchar_t is not standardized to be an UTF variant (or even unicode) - it's an encoding that uses basic units that can be larger than 8 bit. It's an implementation defined thing. Windows defines it to 2 bytes and UTF-16, and glibc defines it to 4 bytes and UTF-32, but only if an UTF-8 locale is set (apparently). Yes. Every libarchive function dealing with strings has 3 variants: plain, _utf8, and _w. And none of these work if the locale is not set. I cannot fathom why they even have a wchar_t variant, because it's redundant and fucking useless for any modern code. Writing a UTF-16 to UTF-8 conversion routine is maybe 3 pages of code, or a few lines if you use iconv. But libarchive uses all this glorious bullshit, and ends up with 3 not working API functions, and with over 4000 lines of its own string abstraction code with gratuitous amounts of ifdefs and OS dependent code that breaks in a fairly common use case. So what we do is: - Use the idiotic POSIX 2008 API (uselocale() etc.) (Too bad for users who try to build this on a system that doesn't have these - hopefully none are left in 2017. But if there are, torturing them with obscure build errors is probably justified. Might be bad for Windows though, which is a very popular platform except on phones.) - Use the "C.UTF-8" locale, which is probably not 100% standards compliant, but works on my system, so it's fine. - Guard every libarchive call with uselocale() + restoring the locale. - Be lazy and skip some libarchive calls. Look forward to the unlikely and astonishingly stupid bugs this could produce. We could also just set a C UTF-8 local in main (since that would have no known negative effects on the rest of the code), but this won't work for libmpv. We assume that uselocale() never fails. In an unexplainable stroke of luck, POSIX made the semantics of uselocale() nice enough that user code can fail failures without introducing crash or security bugs, even if there should be an implementation fucked up enough where it's actually possible that uselocale() fails even with valid input. With all this shitty ugliness added, it finally works, without fucking up other parts of the player. This is still less bad than that time when libquivi fucked up OpenGL rendering, because calling a libquvi function would load some proxy abstraction library, which in turn loaded a KDE plugin (even if KDE was not used), which in turn called setlocale() because Qt does this, and consequently made the mpv GLSL shader generation code emit "," instead of "." for numbers, and of course only for users who had that KDE plugin installed, and lived in a part of the world where "." is not used as decimal separator. All in all, I believe this proves that software developers as a whole and as a culture produce worse results than drug addicted butt fucked monkeys randomly hacking on typewriters while inhaling the fumes of a radioactive dumpster fire fueled by chinese platsic toys for children and Elton John/Justin Bieber crossover CDs for all eternity.
2017-11-12 12:36:35 +00:00
uselocale(oldlocale);
s->pos += r;
}
}
return 1;
}
static void archive_entry_close(stream_t *s)
{
struct priv *p = s->priv;
mp_archive_free(p->mpa);
free_stream(p->src);
}
static int64_t archive_entry_get_size(stream_t *s)
{
struct priv *p = s->priv;
return p->entry_size;
}
static int archive_entry_open(stream_t *stream)
{
struct priv *p = talloc_zero(stream, struct priv);
stream->priv = p;
if (!strchr(stream->path, '|'))
return STREAM_ERROR;
char *base = talloc_strdup(p, stream->path);
char *name = strchr(base, '|');
2023-11-18 22:30:42 +00:00
if (!name)
return STREAM_ERROR;
*name++ = '\0';
if (name[0] == '/')
name += 1;
p->entry_name = name;
mp_url_unescape_inplace(base);
stream, demux: redo origin policy thing mpv has a very weak and very annoying policy that determines whether a playlist should be used or not. For example, if you play a remote playlist, you usually don't want it to be able to read local filesystem entries. (Although for a media player the impact is small I guess.) It's weak and annoying as in that it does not prevent certain cases which could be interpreted as bad in some cases, such as allowing playlists on the local filesystem to reference remote URLs. It probably barely makes sense, but we just want to exclude some other "definitely not a good idea" things, all while playlists generally just work, so whatever. The policy is: - from the command line anything is played - local playlists can reference anything except "unsafe" streams ("unsafe" means special stream inputs like libavfilter graphs) - remote playlists can reference only remote URLs - things like "memory://" and archives are "transparent" to this This commit does... something. It replaces the weird stream flags with a slightly clearer "origin" value, which is now consequently passed down and used everywhere. It fixes some deviations from the described policy. I wanted to force archives to reference only content within them, but this would probably have been more complicated (or required different abstractions), and I'm too lazy to figure it out, so archives are now "transparent" (playlists within archives behave the same outside). There may be a lot of bugs in this. This is unfortunately a very noisy commit because: - every stream open call now needs to pass the origin - so does every demuxer open call (=> params param. gets mandatory) - most stream were changed to provide the "origin" value - the origin value needed to be passed along in a lot of places - I was too lazy to split the commit Fixes: #7274
2019-12-20 08:41:42 +00:00
p->src = stream_create(base, STREAM_READ | stream->stream_origin,
stream->cancel, stream->global);
if (!p->src) {
archive_entry_close(stream);
return STREAM_ERROR;
}
int r = reopen_archive(stream);
if (r < STREAM_OK) {
archive_entry_close(stream);
return r;
}
stream->fill_buffer = archive_entry_fill_buffer;
if (p->src->seekable) {
stream->seek = archive_entry_seek;
stream->seekable = true;
}
stream->close = archive_entry_close;
stream->get_size = archive_entry_get_size;
stream->streaming = true;
return STREAM_OK;
}
const stream_info_t stream_info_libarchive = {
.name = "libarchive",
.open = archive_entry_open,
.protocols = (const char*const[]){ "archive", NULL },
};