diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index e5f318a72c..1191081440 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -150,6 +150,15 @@ void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
 void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 
+#if ARCH_X86_64
+void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                          const uint8_t *src, int width, int height,
+                          int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                         const uint8_t *src, int width, int height,
+                         int lumStride, int chromStride, int srcStride);
+#endif
+
 av_cold void rgb2rgb_init_x86(void)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -167,6 +176,11 @@ av_cold void rgb2rgb_init_x86(void)
         rgb2rgb_init_avx();
 #endif /* HAVE_INLINE_ASM */
 
+    if (EXTERNAL_SSE2(cpu_flags)) {
+#if ARCH_X86_64
+        uyvytoyuv422 = ff_uyvytoyuv422_sse2;
+#endif
+    }
     if (EXTERNAL_SSSE3(cpu_flags)) {
         shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
         shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
@@ -174,4 +188,9 @@ av_cold void rgb2rgb_init_x86(void)
         shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
         shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
     }
+    if (EXTERNAL_AVX(cpu_flags)) {
+#if ARCH_X86_64
+        uyvytoyuv422 = ff_uyvytoyuv422_avx;
+#endif
+    }
 }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index db45e313d8..156b4d2c74 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -32,6 +32,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
 SECTION .text
 
+%macro RSHIFT_COPY 3
+; %1 dst, %2 src, %3 shift: dst = src right-shifted by %3 bytes
+%if cpuflag(avx)
+    psrldq %1, %2, %3
+%else
+    mova   %1, %2
+    RSHIFT %1, %3
+%endif
+%endmacro
+
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
 ;------------------------------------------------------------------------------
@@ -84,3 +94,143 @@ SHUFFLE_BYTES 0, 3, 2, 1
 SHUFFLE_BYTES 1, 2, 3, 0
 SHUFFLE_BYTES 3, 0, 1, 2
 SHUFFLE_BYTES 3, 2, 1, 0
+
+;-----------------------------------------------------------------------------------------------
+; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+;              const uint8_t *src, int width, int height,
+;              int lumStride, int chromStride, int srcStride)
+;-----------------------------------------------------------------------------------------------
+%macro UYVY_TO_YUV422 0
+cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+    pxor         m0, m0
+    pcmpeqw      m1, m1
+    psrlw        m1, 8 ; m1 = 0x00ff words (low-byte mask)
+
+    movsxdifnidn            wq, wd
+    movsxdifnidn   lum_strideq, lum_strided
+    movsxdifnidn chrom_strideq, chrom_strided
+    movsxdifnidn   src_strideq, src_strided
+
+    mov     back_wq, wq
+    mov      whalfq, wq
+    shr      whalfq, 1 ; whalf = width / 2
+
+    lea srcq, [srcq + wq * 2]
+    add ydstq, wq
+    add udstq, whalfq
+    add vdstq, whalfq
+
+.loop_line:
+    mov          xq, wq
+    mov       wtwoq, wq
+    add       wtwoq, wtwoq ; wtwo = width * 2
+
+    neg          wq
+    neg       wtwoq
+    neg      whalfq
+
+    ; calc scalar loop count
+    and          xq, mmsize * 2 - 1
+    je .loop_simd
+
+    .loop_scalar: ; one macropixel (U Y V Y -> two Y, one U, one V) per iteration
+        mov tmpb, [srcq + wtwoq + 0]
+        mov [udstq + whalfq], tmpb
+
+        mov tmpb, [srcq + wtwoq + 1]
+        mov [ydstq + wq], tmpb
+
+        mov tmpb, [srcq + wtwoq + 2]
+        mov [vdstq + whalfq], tmpb
+
+        mov tmpb, [srcq + wtwoq + 3]
+        mov [ydstq + wq + 1], tmpb
+
+        add wq, 2
+        add wtwoq, 4
+        add whalfq, 1
+        sub xq, 2
+        jg .loop_scalar
+
+    ; check if simd loop is needed
+    cmp          wq, 0
+    jge .end_line
+
+    .loop_simd:
+        movu m2, [srcq + wtwoq             ]
+        movu m3, [srcq + wtwoq + mmsize    ]
+        movu m4, [srcq + wtwoq + mmsize * 2]
+        movu m5, [srcq + wtwoq + mmsize * 3]
+
+        ; extract y part 1
+        RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
+        pand        m6, m1 ; YxYx YxYx...
+
+        RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
+        pand        m7, m1 ; YxYx YxYx...
+
+        packuswb    m6, m7 ; YYYY YYYY...
+        movu [ydstq + wq], m6
+
+        ; extract y part 2
+        RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
+        pand        m6, m1 ; YxYx YxYx...
+
+        RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
+        pand        m7, m1 ; YxYx YxYx...
+
+        packuswb    m6, m7 ; YYYY YYYY...
+        movu [ydstq + wq + mmsize], m6
+
+        ; extract uv
+        pand        m2, m1 ; UxVx...
+        pand        m3, m1 ; UxVx...
+        pand        m4, m1 ; UxVx...
+        pand        m5, m1 ; UxVx...
+
+        packuswb    m2, m3 ; UVUV...
+        packuswb    m4, m5 ; UVUV...
+
+        ; U
+        pand        m6, m2, m1 ; UxUx...
+        pand        m7, m4, m1 ; UxUx...
+
+        packuswb    m6, m7 ; UUUU
+        movu [udstq + whalfq], m6
+
+
+        ; V
+        psrlw       m2, 8 ; VxVx...
+        psrlw       m4, 8 ; VxVx...
+        packuswb    m2, m4 ; VVVV
+        movu [vdstq + whalfq], m2
+
+        add whalfq, mmsize
+        add wtwoq, mmsize * 4
+        add wq, mmsize * 2
+        jl .loop_simd
+
+    .end_line:
+        add srcq, src_strideq
+        add ydstq, lum_strideq
+        add udstq, chrom_strideq
+        add vdstq, chrom_strideq
+
+        ; restore initial state of line variables
+        mov wq, back_wq
+        mov xq, wq
+        mov whalfq, wq
+        shr whalfq, 1 ; whalf = width / 2
+        sub hd, 1
+        jg .loop_line
+
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse2
+UYVY_TO_YUV422
+
+INIT_XMM avx
+UYVY_TO_YUV422
+%endif
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index 8fc2cfee9e..d2b211f7b4 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -35,8 +35,12 @@
 } while (0)
 
 static const uint8_t width[] = {12, 16, 20, 32, 36, 128};
+static const struct {uint8_t w, h, s;} planes[] = {
+    {12,16,12}, {16,16,16}, {20,23,25}, {32,18,48}, {8,128,16}, {128,128,128}
+};
 
 #define MAX_STRIDE 128
+#define MAX_HEIGHT 128
 
 static void check_shuffle_bytes(void * func, const char * report)
 {
@@ -64,6 +68,49 @@ static void check_shuffle_bytes(void * func, const char * report)
     }
 }
 
+static void check_uyvy_to_422p(void)
+{
+    int i;
+
+    LOCAL_ALIGNED_32(uint8_t, src0, [MAX_STRIDE * MAX_HEIGHT * 2]);
+    LOCAL_ALIGNED_32(uint8_t, src1, [MAX_STRIDE * MAX_HEIGHT * 2]);
+    LOCAL_ALIGNED_32(uint8_t, dst_y_0, [MAX_STRIDE * MAX_HEIGHT]);
+    LOCAL_ALIGNED_32(uint8_t, dst_y_1, [MAX_STRIDE * MAX_HEIGHT]);
+    LOCAL_ALIGNED_32(uint8_t, dst_u_0, [(MAX_STRIDE/2) * MAX_HEIGHT]);
+    LOCAL_ALIGNED_32(uint8_t, dst_u_1, [(MAX_STRIDE/2) * MAX_HEIGHT]);
+    LOCAL_ALIGNED_32(uint8_t, dst_v_0, [(MAX_STRIDE/2) * MAX_HEIGHT]);
+    LOCAL_ALIGNED_32(uint8_t, dst_v_1, [(MAX_STRIDE/2) * MAX_HEIGHT]);
+
+    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                      const uint8_t *src, int width, int height,
+                      int lumStride, int chromStride, int srcStride);
+
+    randomize_buffers(src0, MAX_STRIDE * MAX_HEIGHT * 2);
+    memcpy(src1, src0, MAX_STRIDE * MAX_HEIGHT * 2);
+
+    if (check_func(uyvytoyuv422, "uyvytoyuv422")) {
+        for (i = 0; i < 6; i++) {
+            memset(dst_y_0, 0, MAX_STRIDE * MAX_HEIGHT);
+            memset(dst_y_1, 0, MAX_STRIDE * MAX_HEIGHT);
+            memset(dst_u_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+            memset(dst_u_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+            memset(dst_v_0, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+            memset(dst_v_1, 0, (MAX_STRIDE/2) * MAX_HEIGHT);
+
+            call_ref(dst_y_0, dst_u_0, dst_v_0, src0, planes[i].w, planes[i].h,
+                     MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+            call_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[i].w, planes[i].h,
+                     MAX_STRIDE, MAX_STRIDE / 2, planes[i].s);
+            if (memcmp(dst_y_0, dst_y_1, MAX_STRIDE * MAX_HEIGHT) ||
+                memcmp(dst_u_0, dst_u_1, (MAX_STRIDE/2) * MAX_HEIGHT) ||
+                memcmp(dst_v_0, dst_v_1, (MAX_STRIDE/2) * MAX_HEIGHT))
+                fail();
+        }
+        bench_new(dst_y_1, dst_u_1, dst_v_1, src1, planes[5].w, planes[5].h,
+                  MAX_STRIDE, MAX_STRIDE / 2, planes[5].s);
+    }
+}
+
 void checkasm_check_sw_rgb(void)
 {
     ff_sws_rgb2rgb_init();
@@ -82,4 +129,7 @@ void checkasm_check_sw_rgb(void)
 
     check_shuffle_bytes(shuffle_bytes_3210, "shuffle_bytes_3210");
     report("shuffle_bytes_3210");
+
+    check_uyvy_to_422p();
+    report("uyvytoyuv422");
 }
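
Note (illustrative, not part of the patch): the scalar C sketch below models what the new kernels compute, which can help when reading the assembly or interpreting a checkasm failure. The name uyvytoyuv422_ref and the loop shape are assumptions for illustration only; the argument order, width in pixels, and byte strides follow the uyvytoyuv422 prototype above.

#include <stdint.h>

/* Each pair of horizontal pixels occupies 4 source bytes: U0 Y0 V0 Y1.
 * Y goes to ydst at full resolution, U and V to udst/vdst at half width. */
static void uyvytoyuv422_ref(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                             const uint8_t *src, int width, int height,
                             int lumStride, int chromStride, int srcStride)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width / 2; x++) {
            udst[x]         = src[4 * x + 0];
            ydst[2 * x]     = src[4 * x + 1];
            vdst[x]         = src[4 * x + 2];
            ydst[2 * x + 1] = src[4 * x + 3];
        }
        src  += srcStride;
        ydst += lumStride;
        udst += chromStride;
        vdst += chromStride;
    }
}

The SIMD path produces the same result 2 * mmsize pixels per iteration: Y bytes are isolated by shifting each register right one byte (RSHIFT_COPY) and masking with the 0x00ff word mask in m1 (built with pcmpeqw/psrlw), while U and V come from masking the even bytes and packing with packuswb; the scalar loop covers the width % (2 * mmsize) pixels the vector loop cannot.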