diff --git a/doc/APIchanges b/doc/APIchanges index c0edb643f0..618df2aab2 100644 --- a/doc/APIchanges +++ b/doc/APIchanges @@ -15,6 +15,9 @@ libavutil: 2012-10-22 API changes, most recent first: +2013-11-XX - xxxxxxx - lavu 52.54.100 - avstring.h + Add av_utf8_decode() function. + 2013-11-xx - xxxxxxx - lavc 55.44.100 - avcodec.h Add av_packet_{un,}pack_dictionary() Add AV_PKT_METADATA_UPDATE side data type, used to transmit key/value diff --git a/libavutil/Makefile b/libavutil/Makefile index 9b5cd4e6b7..02dd728d66 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile @@ -157,6 +157,7 @@ TESTPROGS = adler32 \ sha \ sha512 \ tree \ + utf8 \ xtea \ TESTPROGS-$(HAVE_LZO1X_999_COMPRESS) += lzo diff --git a/libavutil/avstring.c b/libavutil/avstring.c index eed58fae1e..20931071b3 100644 --- a/libavutil/avstring.c +++ b/libavutil/avstring.c @@ -307,6 +307,70 @@ int av_isxdigit(int c) return av_isdigit(c) || (c >= 'a' && c <= 'f'); } +int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end, + unsigned int flags) +{ + const uint8_t *p = *bufp; + uint32_t top; + uint64_t code; + int ret = 0; + + if (p >= buf_end) + return 0; + + code = *p++; + + /* first sequence byte starts with 10, or is 1111-1110 or 1111-1111, + which is not admitted */ + if ((code & 0xc0) == 0x80 || code >= 0xFE) { + ret = AVERROR(EILSEQ); + goto end; + } + top = (code & 128) >> 1; + + while (code & top) { + int tmp; + if (p >= buf_end) { + ret = AVERROR(EILSEQ); /* incomplete sequence */ + goto end; + } + + /* we assume the byte to be in the form 10xx-xxxx */ + tmp = *p++ - 128; /* strip leading 1 */ + if (tmp>>6) { + ret = AVERROR(EILSEQ); + goto end; + } + code = (code<<6) + tmp; + top <<= 5; + } + code &= (top << 1) - 1; + + if (code >= 1<<31) { + ret = AVERROR(EILSEQ); /* out-of-range value */ + goto end; + } + + *codep = code; + + if (code > 0x10FFFF && + !(flags & AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES)) + ret = AVERROR(EILSEQ); + if (code < 0x20 && code != 0x9 && code != 0xA && code != 0xD && + flags & AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES) + ret = AVERROR(EILSEQ); + if (code >= 0xD800 && code <= 0xDFFF && + !(flags & AV_UTF8_FLAG_ACCEPT_SURROGATES)) + ret = AVERROR(EILSEQ); + if (code == 0xFFFE || code == 0xFFFF && + (!flags & AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS)) + ret = AVERROR(EILSEQ); + +end: + *bufp = p; + return ret; +} + #ifdef TEST int main(void) diff --git a/libavutil/avstring.h b/libavutil/avstring.h index 438ef799eb..882a2b57dc 100644 --- a/libavutil/avstring.h +++ b/libavutil/avstring.h @@ -22,6 +22,7 @@ #define AVUTIL_AVSTRING_H #include +#include #include "attributes.h" /** @@ -295,6 +296,45 @@ enum AVEscapeMode { int av_escape(char **dst, const char *src, const char *special_chars, enum AVEscapeMode mode, int flags); +#define AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES 1 ///< accept codepoints over 0x10FFFF +#define AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS 2 ///< accept non-characters - 0xFFFE and 0xFFFF +#define AV_UTF8_FLAG_ACCEPT_SURROGATES 4 ///< accept UTF-16 surrogates codes +#define AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES 8 ///< exclude control codes not accepted by XML + +#define AV_UTF8_FLAG_ACCEPT_ALL \ + AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES|AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS|AV_UTF8_FLAG_ACCEPT_SURROGATES + +/** + * Read and decode a single UTF-8 code point (character) from the + * buffer in *buf, and update *buf to point to the next byte to + * decode. + * + * In case of an invalid byte sequence, the pointer will be updated to + * the next byte after the invalid sequence and the function will + * return an error code. + * + * Depending on the specified flags, the function will also fail in + * case the decoded code point does not belong to a valid range. + * + * @note For speed-relevant code a carefully implemented use of + * GET_UTF8() may be preferred. + * + * @param codep pointer used to return the parsed code in case of success. + * The value in *codep is set even in case the range check fails. + * @param bufp pointer to the address the first byte of the sequence + * to decode, updated by the function to point to the + * byte next after the decoded sequence + * @param buf_end pointer to the end of the buffer, points to the next + * byte past the last in the buffer. This is used to + * avoid buffer overreads (in case of an unfinished + * UTF-8 sequence towards the end of the buffer). + * @param flags a collection of AV_UTF8_FLAG_* flags + * @return >= 0 in case a sequence was successfully read, a negative + * value in case of invalid sequence + */ +int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end, + unsigned int flags); + /** * @} */ diff --git a/libavutil/utf8.c b/libavutil/utf8.c new file mode 100644 index 0000000000..37a2802b5f --- /dev/null +++ b/libavutil/utf8.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2013 Stefano Sabatini + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/avstring.h" +#include "libavutil/file.h" + +static void print_sequence(const char *p, int l, int indent) +{ + int i; + for (i = 0; i < l; i++) + printf("%02X", (uint8_t)p[i]); + printf("%*s", indent-l*2, ""); +} + +int main(int argc, char **argv) +{ + int ret; + char *filename = argv[1]; + uint8_t *file_buf; + size_t file_buf_size; + uint32_t code; + const uint8_t *p, *endp; + + ret = av_file_map(filename, &file_buf, &file_buf_size, 0, NULL); + if (ret < 0) + return 1; + + p = file_buf; + endp = file_buf + file_buf_size; + while (p < endp) { + int l, r; + const uint8_t *p0 = p; + code = UINT32_MAX; + r = av_utf8_decode(&code, &p, endp, 0); + l = (int)(p-p0); + print_sequence(p0, l, 20); + if (code != UINT32_MAX) { + printf("%-10d 0x%-10X %-5d ", code, code, l); + if (r >= 0) { + if (*p0 == '\n') printf("\\n\n"); + else printf ("%.*s\n", l, p0); + } else { + printf("invalid code range\n"); + } + } else { + printf("invalid sequence\n"); + } + } + + av_file_unmap(file_buf, file_buf_size); + return 0; +} diff --git a/libavutil/version.h b/libavutil/version.h index 3e64a20dd8..c01da935b8 100644 --- a/libavutil/version.h +++ b/libavutil/version.h @@ -75,7 +75,7 @@ */ #define LIBAVUTIL_VERSION_MAJOR 52 -#define LIBAVUTIL_VERSION_MINOR 53 +#define LIBAVUTIL_VERSION_MINOR 54 #define LIBAVUTIL_VERSION_MICRO 100 #define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \