/*
 * Copyright (c) 2022 Ben Avison
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <string.h>

#include "checkasm.h"

#include "libavcodec/vc1dsp.h"

#include "libavutil/common.h"
#include "libavutil/internal.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"

#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },

typedef struct {
    const char *name;
    size_t offset;
    int width;
    int height;
} test;

typedef struct matrix {
    size_t width;
    size_t height;
    float d[];
} matrix;

static const matrix T8 = { 8, 8, {
        12,  12,  12,  12,  12,  12,  12,  12,
        16,  15,   9,   4,  -4,  -9, -15, -16,
        16,   6,  -6, -16, -16,  -6,   6,  16,
        15,  -4, -16,  -9,   9,  16,   4, -15,
        12, -12, -12,  12,  12, -12, -12,  12,
         9, -16,   4,  15, -15,  -4,  16,  -9,
         6, -16,  16,  -6,  -6,  16, -16,   6,
         4,  -9,  15, -16,  16, -15,   9,  -4
} };

static const matrix T4 = { 4, 4, {
        17,  17,  17,  17,
        22,  10, -10, -22,
        17, -17, -17,  17,
        10, -22,  22, -10
} };

static const matrix T8t = { 8, 8, {
        12,  16,  16,  15,  12,   9,   6,   4,
        12,  15,   6,  -4, -12, -16, -16,  -9,
        12,   9,  -6, -16, -12,   4,  16,  15,
        12,   4, -16,  -9,  12,  15,  -6, -16,
        12,  -4, -16,   9,  12, -15,  -6,  16,
        12,  -9,  -6,  16, -12,  -4,  16, -15,
        12, -15,   6,   4, -12,  16, -16,   9,
        12, -16,  16, -15,  12,  -9,   6,  -4
} };

static const matrix T4t = { 4, 4, {
        17,  22,  17,  10,
        17,  10, -17, -22,
        17, -10, -17,  22,
        17, -22,  17, -10
} };

static matrix *new_matrix(size_t width, size_t height)
{
    matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
    if (out == NULL) {
        fprintf(stderr, "Memory allocation failure\n");
        exit(EXIT_FAILURE);
    }
    out->width = width;
    out->height = height;
    return out;
}

static matrix *multiply(const matrix *a, const matrix *b)
{
    matrix *out;
    if (a->width != b->height) {
        fprintf(stderr, "Incompatible multiplication\n");
        exit(EXIT_FAILURE);
    }
    out = new_matrix(b->width, a->height);
    for (int j = 0; j < out->height; ++j)
        for (int i = 0; i < out->width; ++i) {
            float sum = 0;
            for (int k = 0; k < a->width; ++k)
                sum += a->d[j * a->width + k] * b->d[k * b->width + i];
            out->d[j * out->width + i] = sum;
        }
    return out;
}

static void normalise(matrix *a)
{
    for (int j = 0; j < a->height; ++j)
        for (int i = 0; i < a->width; ++i) {
            float *p = a->d + j * a->width + i;
            *p *= 64;
            if (a->height == 4)
                *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
            else
                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
            if (a->width == 4)
                *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
            else
                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
        }
}
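
/* An editorial note on the divisors in normalise() above (annotation for
 * this test, not text from the VC-1 spec): each appears to be a quarter of
 * the squared norm of the corresponding transform basis row. For the
 * 4-point rows, 17*17 == 289 and (22*22 + 10*10) / 2 == 292; for the
 * 8-point rows, 2 * 12*12 == 288, (16*16 + 15*15 + 9*9 + 4*4) / 2 == 289
 * and 16*16 + 6*6 == 292. Combined with the *= 64 above and the decoder's
 * >> 3 and >> 7 rounding stages, a forward transform normalised this way
 * should be, up to rounding, the exact inverse of the D -> E -> R pipeline
 * constructed in generate_inverse_quantized_transform_coefficients()
 * below. */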
static void divide_and_round_nearest(matrix *a, float by)
{
    for (int j = 0; j < a->height; ++j)
        for (int i = 0; i < a->width; ++i) {
            float *p = a->d + j * a->width + i;
            *p = rintf(*p / by);
        }
}

/* The second (column) pass of the VC-1 inverse transform rounds rows 4..7
 * with an extra +1 before the >> 7, so bias those rows by 1 ahead of
 * divide_and_round_nearest() to match. */
static void tweak(matrix *a)
{
    for (int j = 4; j < a->height; ++j)
        for (int i = 0; i < a->width; ++i) {
            float *p = a->d + j * a->width + i;
            *p += 1;
        }
}

/* The VC-1 spec places restrictions on the values permitted at three
 * different stages:
 * - D: the input coefficients in frequency domain
 * - E: the intermediate coefficients, inverse-transformed only horizontally
 * - R: the fully inverse-transformed coefficients
 *
 * To fully cater for the ranges specified requires various intermediate
 * values to be held to 17-bit precision; yet these conditions do not appear
 * to be utilised in real-world streams. At least some assembly
 * implementations have chosen to restrict these values to 16-bit precision,
 * to accelerate the decoding of real-world streams at the cost of strict
 * adherence to the spec. To avoid our test marking these as failures,
 * reduce our random inputs.
 */
#define ATTENUATION 4

static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
{
    matrix *raw, *tmp, *D, *E, *R;
    raw = new_matrix(width, height);
    for (int i = 0; i < width * height; ++i)
        raw->d[i] = (int) (rnd() % (1024 / ATTENUATION)) - 512 / ATTENUATION;
    tmp = multiply(height == 8 ? &T8 : &T4, raw);
    D = multiply(tmp, width == 8 ? &T8t : &T4t);
    normalise(D);
    divide_and_round_nearest(D, 1);
    for (int i = 0; i < width * height; ++i) {
        if (D->d[i] < -2048 / ATTENUATION || D->d[i] > 2048 / ATTENUATION - 1) {
            /* Rare, so simply try again */
            av_free(raw);
            av_free(tmp);
            av_free(D);
            return generate_inverse_quantized_transform_coefficients(width, height);
        }
    }
    E = multiply(D, width == 8 ? &T8 : &T4);
    divide_and_round_nearest(E, 8);
    for (int i = 0; i < width * height; ++i)
        if (E->d[i] < -4096 / ATTENUATION || E->d[i] > 4096 / ATTENUATION - 1) {
            /* Rare, so simply try again */
            av_free(raw);
            av_free(tmp);
            av_free(D);
            av_free(E);
            return generate_inverse_quantized_transform_coefficients(width, height);
        }
    R = multiply(height == 8 ? &T8t : &T4t, E);
    tweak(R);
    divide_and_round_nearest(R, 128);
    for (int i = 0; i < width * height; ++i)
        if (R->d[i] < -512 / ATTENUATION || R->d[i] > 512 / ATTENUATION - 1) {
            /* Rare, so simply try again */
            av_free(raw);
            av_free(tmp);
            av_free(D);
            av_free(E);
            av_free(R);
            return generate_inverse_quantized_transform_coefficients(width, height);
        }
    av_free(raw);
    av_free(tmp);
    av_free(E);
    av_free(R);
    return D;
}

#define RANDOMIZE_BUFFER16(name, size)  \
    do {                                \
        int i;                          \
        for (i = 0; i < size; ++i) {    \
            uint16_t r = rnd();         \
            AV_WN16A(name##0 + i, r);   \
            AV_WN16A(name##1 + i, r);   \
        }                               \
    } while (0)

#define RANDOMIZE_BUFFER8(name, size)   \
    do {                                \
        int i;                          \
        for (i = 0; i < size; ++i) {    \
            uint8_t r = rnd();          \
            name##0[i] = r;             \
            name##1[i] = r;             \
        }                               \
    } while (0)

#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
    do {                                            \
        uint8_t *p##0 = name##0, *p##1 = name##1;   \
        int i = (size);                             \
        while (i-- > 0) {                           \
            int x = 0x80 | (rnd() & 0x7F);          \
            x >>= rnd() % 9;                        \
            if (rnd() & 1)                          \
                x = -x;                             \
            *p##1++ = *p##0++ = 0x80 + x;           \
        }                                           \
    } while (0)
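
/* A gloss on RANDOMIZE_BUFFER8_MID_WEIGHTED (annotation for this test):
 * x starts in [0x80, 0xFF], the random right-shift of 0..8 bits makes
 * small magnitudes exponentially more likely than large ones, and the
 * random sign centres the result on 0x80. The deblocking filters tested
 * below only modify pixels when the local activity falls below thresholds
 * derived from pq, so samples clustered around mid-grey should exercise
 * the filtering paths far more often than uniformly random data would. */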
static void check_inv_trans_inplace(void)
{
    /* Inverse transform input coefficients are stored in a 16-bit buffer
     * with row stride of 8 coefficients irrespective of transform size.
     * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
     * are stored in column-major order, and the outputs are written back
     * to the input buffer, so we oversize it slightly to catch overruns. */
    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);

    VC1DSPContext h;

    ff_vc1dsp_init(&h);

    if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
        matrix *coeffs;
        declare_func(void, int16_t *);
        RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
        coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
        for (int j = 0; j < 8; ++j)
            for (int i = 0; i < 8; ++i) {
                int idx = 8 + i * 8 + j;
                inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
            }
        call_ref(inv_trans_in0 + 8);
        call_new(inv_trans_in1 + 8);
        if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t)))
            fail();
        bench_new(inv_trans_in1 + 8);
        av_free(coeffs);
    }
}

static void check_inv_trans_adding(void)
{
    /* Inverse transform input coefficients are stored in a 16-bit buffer
     * with row stride of 8 coefficients irrespective of transform size. */
    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);

    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
     * added with saturation to an array of unsigned 8-bit values. Oversize
     * this by 8 samples left and right and one row above and below. */
    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
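
    /* Layout of the 10 * 24 output buffers declared above (annotation for
     * this test; block size shown for the largest, 8x8 DC, case):
     *
     *      <-- 8 --><-- 8 --><-- 8 -->
     *     +--------+--------+--------+
     *     |  pad   |  pad   |  pad   |  1 row
     *     +--------+--------+--------+
     *     |  pad   | output |  pad   |  up to 8 rows, written at
     *     |        | block  |        |  inv_trans_outN + 24 + 8
     *     +--------+--------+--------+
     *     |  pad   |  pad   |  pad   |  1 row
     *     +--------+--------+--------+
     *
     * Comparing the buffers in full catches writes that stray outside the
     * nominal block in any direction. */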
    VC1DSPContext h;

    const test tests[] = {
        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
    };

    ff_vc1dsp_init(&h);

    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
        /* Fetch the function pointer recorded in the test table by its
         * byte offset within VC1DSPContext */
        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
            matrix *coeffs;
            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
            for (int j = 0; j < tests[t].height; ++j)
                for (int i = 0; i < tests[t].width; ++i) {
                    int idx = j * 8 + i;
                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
                }
            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
                fail();
            /* Benchmark with the same arguments that were just validated */
            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
            av_free(coeffs);
        }
    }
}

static void check_loop_filter(void)
{
    /* Deblocking filter buffers are big enough to hold a 16x16 block,
     * plus 16 columns left and 4 rows above to hold filter inputs
     * (depending on whether v or h neighbouring block edge, oversized
     * horizontally to maintain 16-byte alignment) plus 16 columns right
     * and 4 rows below to catch write overflows */
    LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
    LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);

    VC1DSPContext h;

    const test tests[] = {
        VC1DSP_TEST(vc1_v_loop_filter4)
        VC1DSP_TEST(vc1_h_loop_filter4)
        VC1DSP_TEST(vc1_v_loop_filter8)
        VC1DSP_TEST(vc1_h_loop_filter8)
        VC1DSP_TEST(vc1_v_loop_filter16)
        VC1DSP_TEST(vc1_h_loop_filter16)
    };

    ff_vc1dsp_init(&h);

    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
        declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
            for (int count = 1000; count > 0; --count) {
                int pq = rnd() % 31 + 1;
                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
                call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
                call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
                if (memcmp(filter_buf0, filter_buf1, 24 * 48))
                    fail();
            }
        }
        /* A sharp 0x40 step across the filtered edge: pq == 1 should make
         * the filter bail out almost immediately (best case), while
         * pq == 31 should take the full filtering path (worst case). */
        for (int j = 0; j < 24; ++j)
            for (int i = 0; i < 48; ++i)
                filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
        if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
            bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
        if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
            bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
    }
}

#define TEST_UNESCAPE                                                                               \
    do {                                                                                            \
        for (int count = 100; count > 0; --count) {                                                 \
            escaped_offset = rnd() & 7;                                                             \
            unescaped_offset = rnd() & 7;                                                           \
            escaped_len = (1u << ((rnd() % 8) + 3)) - (rnd() & 7);                                  \
            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
            if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE))                  \
                fail();                                                                             \
        }                                                                                           \
    } while (0)

static void check_unescape(void)
{
    /* This appears to be a typical length of buffer in use */
#define LOG2_UNESCAPE_BUF_SIZE 17
#define UNESCAPE_BUF_SIZE (1u << LOG2_UNESCAPE_BUF_SIZE)