mirror of https://git.ffmpeg.org/ffmpeg.git
yuv2planeX10 SIMD
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
109f62e8f8
commit
7fbbf95293
|
@ -1,6 +1,7 @@
|
|||
;******************************************************************************
|
||||
;* x86-optimized horizontal line scaling functions
|
||||
;* x86-optimized horizontal/vertical line scaling functions
|
||||
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
;* Kieran Kunhya <kieran@kunhya.com>
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
;*
|
||||
|
@ -28,6 +29,8 @@ max_19bit_int: times 4 dd 0x7ffff
|
|||
max_19bit_flt: times 4 dd 524287.0
|
||||
minshort: times 8 dw 0x8000
|
||||
unicoeff: times 4 dd 0x20000000
|
||||
yuv2yuvX_10_start: times 4 dd 0x10000
|
||||
yuv2yuvX_10_upper: times 8 dw 0x3ff
|
||||
|
||||
SECTION .text
|
||||
|
||||
|
@ -427,3 +430,75 @@ INIT_XMM
|
|||
SCALE_FUNCS2 sse2, 6, 7, 8
|
||||
SCALE_FUNCS2 ssse3, 6, 6, 8
|
||||
SCALE_FUNCS2 sse4, 6, 6, 8
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; vertical line scaling
|
||||
;
|
||||
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
|
||||
; const uint8_t *dither, int offset)
|
||||
; and
|
||||
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
|
||||
; const int16_t **src, uint8_t *dst, int dstW,
|
||||
; const uint8_t *dither, int offset)
|
||||
;
|
||||
; Scale one or $filterSize lines of source data to generate one line of output
|
||||
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
|
||||
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
|
||||
; of 2. $offset is either 0 or 3. $dither holds 8 values.
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%macro yuv2planeX10 1
|
||||
|
||||
%ifdef ARCH_X86_32
|
||||
%define cntr_reg r1
|
||||
%else
|
||||
%define cntr_reg r11
|
||||
%endif
|
||||
|
||||
cglobal yuv2planeX10_%1, 7, 7
|
||||
xor r5, r5
|
||||
.pixelloop
|
||||
mova m1, [yuv2yuvX_10_start]
|
||||
mova m2, m1
|
||||
movsxdifnidn cntr_reg, r1d
|
||||
.filterloop
|
||||
pxor m0, m0
|
||||
|
||||
mov r6, [r2+gprsize*cntr_reg-2*gprsize]
|
||||
mova m3, [r6+r5]
|
||||
|
||||
mov r6, [r2+gprsize*cntr_reg-gprsize]
|
||||
mova m4, [r6+r5]
|
||||
|
||||
punpcklwd m5, m3, m4
|
||||
punpckhwd m3, m4
|
||||
|
||||
movd m0, [r0+2*cntr_reg-4]
|
||||
SPLATD m0, m0
|
||||
|
||||
pmaddwd m5, m0
|
||||
pmaddwd m3, m0
|
||||
|
||||
paddd m2, m5
|
||||
paddd m1, m3
|
||||
|
||||
sub cntr_reg, 2
|
||||
jg .filterloop
|
||||
|
||||
psrad m2, 17
|
||||
psrad m1, 17
|
||||
|
||||
packusdw m2, m1
|
||||
pminsw m2, [yuv2yuvX_10_upper]
|
||||
mova [r3+r5], m2
|
||||
|
||||
add r5, mmsize
|
||||
sub r4d, mmsize/2
|
||||
jg .pixelloop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
yuv2planeX10 sse4
|
||||
INIT_AVX
|
||||
yuv2planeX10 avx
|
||||
|
|
|
@ -211,6 +211,14 @@ SCALE_FUNCS_SSE(sse2);
|
|||
SCALE_FUNCS_SSE(ssse3);
|
||||
SCALE_FUNCS_SSE(sse4);
|
||||
|
||||
extern void ff_yuv2planeX10_sse4(const int16_t *filter, int filterSize,
|
||||
const int16_t **src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
extern void ff_yuv2planeX10_avx(const int16_t *filter, int filterSize,
|
||||
const int16_t **src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
void ff_sws_init_swScale_mmx(SwsContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
@ -270,6 +278,13 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
|
|||
/* Xto15 don't need special sse4 functions */
|
||||
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
|
||||
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
|
||||
if (c->dstBpc == 10 && !isBE(c->dstFormat))
|
||||
c->yuv2planeX = ff_yuv2planeX10_sse4;
|
||||
}
|
||||
|
||||
if (cpu_flags & AV_CPU_FLAG_AVX) {
|
||||
if (c->dstBpc == 10 && !isBE(c->dstFormat))
|
||||
c->yuv2planeX = ff_yuv2planeX10_avx;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue