From 7fbbf9529397756a31850fe37036f026f34f80fc Mon Sep 17 00:00:00 2001
From: Kieran Kunhya
Date: Sun, 9 Oct 2011 16:20:48 +0100
Subject: [PATCH] yuv2planeX10 SIMD

Signed-off-by: Ronald S. Bultje
---
 libswscale/x86/scale.asm     | 77 +++++++++++++++++++++++++++++++++++-
 libswscale/x86/swscale_mmx.c | 15 +++++++
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index d35589419c..05e2d96fe6 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -1,6 +1,7 @@
 ;******************************************************************************
-;* x86-optimized horizontal line scaling functions
+;* x86-optimized horizontal/vertical line scaling functions
 ;* Copyright (c) 2011 Ronald S. Bultje
+;* Kieran Kunhya
 ;*
 ;* This file is part of Libav.
 ;*
@@ -28,6 +29,8 @@ max_19bit_int: times 4 dd 0x7ffff
 max_19bit_flt: times 4 dd 524287.0
 minshort: times 8 dw 0x8000
 unicoeff: times 4 dd 0x20000000
+yuv2yuvX_10_start: times 4 dd 0x10000
+yuv2yuvX_10_upper: times 8 dw 0x3ff
 
 SECTION .text
 
@@ -427,3 +430,75 @@ INIT_XMM
 SCALE_FUNCS2 sse2, 6, 7, 8
 SCALE_FUNCS2 ssse3, 6, 6, 8
 SCALE_FUNCS2 sse4, 6, 6, 8
+
+;-----------------------------------------------------------------------------
+; vertical line scaling
+;
+; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+; and
+; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
+;                                     const int16_t **src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+;
+; Scale one or $filterSize lines of source data to generate one line of output
+; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
+; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
+; of 2. $offset is either 0 or 3. $dither holds 8 values.
+;-----------------------------------------------------------------------------
+
+%macro yuv2planeX10 1
+; %1 is the suffix of the generated function name (sse4 and avx below).
+%ifdef ARCH_X86_32
+%define cntr_reg  r1
+%define cntr_load mov
+%else
+%define cntr_reg  r11
+%define cntr_load movsxd
+%endif
+; cntr_reg counts filter taps down to 0; on x86-32 it is the filterSize register.
+cglobal yuv2planeX10_%1, 7, 7
+    xor              r5, r5                    ; r5 = byte offset into the src lines and dst
+.pixelloop:
+    mova             m1, [yuv2yuvX_10_start]   ; both accumulators start at 1 << 16,
+    mova             m2, m1                    ; i.e. half of the final >> 17
+    cntr_load  cntr_reg, r1m                   ; restart at filterSize for every pixel group (x86-32: r1 was counted down, reload from the stack)
+.filterloop:
+    mov              r6, [r2+gprsize*cntr_reg-2*gprsize] ; r6 = src[cntr-2]
+    mova             m3, [r6+r5]
+
+    mov              r6, [r2+gprsize*cntr_reg-gprsize]   ; r6 = src[cntr-1]
+    mova             m4, [r6+r5]
+
+    punpcklwd        m5, m3, m4                ; interleave the two lines so that pmaddwd
+    punpckhwd        m3, m4                    ; applies one tap pair to each pixel
+
+    movd             m0, [r0+2*cntr_reg-4]     ; filter[cntr-2], filter[cntr-1]; movd clears the upper lanes
+    SPLATD           m0, m0
+
+    pmaddwd          m5, m0
+    pmaddwd          m3, m0
+
+    paddd            m2, m5
+    paddd            m1, m3
+
+    sub        cntr_reg, 2                     ; two taps consumed per iteration
+    jg .filterloop
+
+    psrad            m2, 17
+    psrad            m1, 17
+
+    packusdw         m2, m1                    ; unsigned saturation: negative sums become 0
+    pminsw           m2, [yuv2yuvX_10_upper]   ; clamp to 0x3ff
+    mova        [r3+r5], m2
+
+    add              r5, mmsize
+    sub             r4d, mmsize/2              ; mmsize/2 16-bit pixels written per iteration
+    jg .pixelloop
+    REP_RET
+%endmacro
+
+INIT_XMM
+yuv2planeX10 sse4
+INIT_AVX
+yuv2planeX10 avx
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index dd7aea1492..3c0632dca5 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -211,6 +211,14 @@ SCALE_FUNCS_SSE(sse2);
 SCALE_FUNCS_SSE(ssse3);
 SCALE_FUNCS_SSE(sse4);
 
+extern void ff_yuv2planeX10_sse4(const int16_t *filter, int filterSize,
+                                 const int16_t **src, uint8_t *dest, int dstW,
+                                 const uint8_t *dither, int offset);
+
+extern void ff_yuv2planeX10_avx(const int16_t *filter, int filterSize,
+                                const int16_t **src, uint8_t *dest, int dstW,
+                                const uint8_t *dither, int offset);
+
 void ff_sws_init_swScale_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -270,6 +278,13 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
         /* Xto15 don't need special sse4 functions */
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+        if (c->dstBpc == 10 && !isBE(c->dstFormat))
+            c->yuv2planeX = ff_yuv2planeX10_sse4;
+    }
+
+    if (cpu_flags & AV_CPU_FLAG_AVX) {
+        if (c->dstBpc == 10 && !isBE(c->dstFormat))
+            c->yuv2planeX = ff_yuv2planeX10_avx;
     }
 #endif
 }