mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-28 02:02:46 +00:00
avfilter/x86/vf_blend: add SIMD for the 16-bit versions of
grainextract, grainmerge, average, extremity and negation
This commit is contained in:
parent
8eb0bb1108
commit
f3df42e81d
@ -27,6 +27,8 @@
|
||||
SECTION_RODATA
|
||||
|
||||
ps_255: times 4 dd 255.0
|
||||
pd_32768 : times 4 dd 32768
|
||||
pd_65535 : times 4 dd 65535
|
||||
pw_1: times 8 dw 1
|
||||
pw_128: times 8 dw 128
|
||||
pw_255: times 8 dw 255
|
||||
@ -79,26 +81,33 @@ BLEND_INIT %1, 2, %3
|
||||
BLEND_END
|
||||
%endmacro
|
||||
|
||||
%macro GRAINEXTRACT 0
|
||||
BLEND_INIT grainextract, 6
|
||||
; %1 name, %2 source element size (b or w), %3 intermediate element size (w or d), %4 (1 if 16-bit, not set if 8-bit)
|
||||
%macro GRAINEXTRACT 3-4
|
||||
BLEND_INIT %1, 6, %4
|
||||
pxor m4, m4
|
||||
%if %0 == 4 ; 16 bit
|
||||
VBROADCASTI128 m5, [pd_32768]
|
||||
%else
|
||||
VBROADCASTI128 m5, [pw_128]
|
||||
%endif
|
||||
.nextrow:
|
||||
mov xq, widthq
|
||||
.loop:
|
||||
movu m1, [topq + xq]
|
||||
movu m3, [bottomq + xq]
|
||||
punpcklbw m0, m1, m4
|
||||
punpckhbw m1, m4
|
||||
punpcklbw m2, m3, m4
|
||||
punpckhbw m3, m4
|
||||
|
||||
paddw m0, m5
|
||||
paddw m1, m5
|
||||
psubw m0, m2
|
||||
psubw m1, m3
|
||||
punpckl%2%3 m0, m1, m4
|
||||
punpckh%2%3 m1, m4
|
||||
punpckl%2%3 m2, m3, m4
|
||||
punpckh%2%3 m3, m4
|
||||
|
||||
padd%3 m0, m5
|
||||
padd%3 m1, m5
|
||||
psub%3 m0, m2
|
||||
psub%3 m1, m3
|
||||
|
||||
packus%3%2 m0, m1
|
||||
|
||||
packuswb m0, m1
|
||||
mova [dstq + xq], m0
|
||||
add xq, mmsize
|
||||
jl .loop
|
||||
@ -172,8 +181,9 @@ BLEND_INIT screen, 7
|
||||
BLEND_END
|
||||
%endmacro
|
||||
|
||||
%macro AVERAGE 0
|
||||
BLEND_INIT average, 3
|
||||
; %1 name, %2 element size suffix for pavg (b or w), %3 (set if 16-bit)
|
||||
%macro AVERAGE 2-3
|
||||
BLEND_INIT %1, 3, %3
|
||||
pcmpeqb m2, m2
|
||||
|
||||
.nextrow:
|
||||
@ -184,7 +194,7 @@ BLEND_INIT average, 3
|
||||
movu m1, [bottomq + xq]
|
||||
pxor m0, m2
|
||||
pxor m1, m2
|
||||
pavgb m0, m1
|
||||
pavg%2 m0, m1
|
||||
pxor m0, m2
|
||||
mova [dstq + xq], m0
|
||||
add xq, mmsize
|
||||
@ -192,29 +202,34 @@ BLEND_INIT average, 3
|
||||
BLEND_END
|
||||
%endmacro
|
||||
|
||||
|
||||
%macro GRAINMERGE 0
|
||||
BLEND_INIT grainmerge, 6
|
||||
; %1 name, %2 source element size (b or w), %3 intermediate element size (w or d), %4 (1 if 16-bit, not set if 8-bit)
|
||||
%macro GRAINMERGE 3-4
|
||||
BLEND_INIT %1, 6, %4
|
||||
pxor m4, m4
|
||||
|
||||
%if %0 == 4 ; 16 bit
|
||||
VBROADCASTI128 m5, [pd_32768]
|
||||
%else
|
||||
VBROADCASTI128 m5, [pw_128]
|
||||
%endif
|
||||
.nextrow:
|
||||
mov xq, widthq
|
||||
|
||||
.loop:
|
||||
movu m1, [topq + xq]
|
||||
movu m3, [bottomq + xq]
|
||||
punpcklbw m0, m1, m4
|
||||
punpckhbw m1, m4
|
||||
punpcklbw m2, m3, m4
|
||||
punpckhbw m3, m4
|
||||
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
psubw m0, m5
|
||||
psubw m1, m5
|
||||
punpckl%2%3 m0, m1, m4
|
||||
punpckh%2%3 m1, m4
|
||||
punpckl%2%3 m2, m3, m4
|
||||
punpckh%2%3 m3, m4
|
||||
|
||||
padd%3 m0, m2
|
||||
padd%3 m1, m3
|
||||
psub%3 m0, m5
|
||||
psub%3 m1, m5
|
||||
|
||||
packus%3%2 m0, m1
|
||||
|
||||
packuswb m0, m1
|
||||
mova [dstq + xq], m0
|
||||
add xq, mmsize
|
||||
jl .loop
|
||||
@ -324,52 +339,73 @@ BLEND_INIT %1, 5, %4
|
||||
BLEND_END
|
||||
%endmacro
|
||||
|
||||
%macro BLEND_ABS 0
|
||||
BLEND_INIT extremity, 8
|
||||
; %1 name, %2 source element size (b or w), %3 intermediate element size (w or d), %4 (1 if 16-bit, not set if 8-bit)
|
||||
%macro EXTREMITY 3-4
|
||||
BLEND_INIT %1, 8, %4
|
||||
pxor m2, m2
|
||||
%if %0 == 4; 16 bit
|
||||
VBROADCASTI128 m4, [pd_65535]
|
||||
%else
|
||||
VBROADCASTI128 m4, [pw_255]
|
||||
%endif
|
||||
.nextrow:
|
||||
mov xq, widthq
|
||||
|
||||
.loop:
|
||||
movu m0, [topq + xq]
|
||||
movu m1, [bottomq + xq]
|
||||
punpckhbw m5, m0, m2
|
||||
punpcklbw m0, m2
|
||||
punpckhbw m6, m1, m2
|
||||
punpcklbw m1, m2
|
||||
psubw m3, m4, m0
|
||||
psubw m7, m4, m5
|
||||
psubw m3, m1
|
||||
psubw m7, m6
|
||||
punpckh%2%3 m5, m0, m2
|
||||
punpckl%2%3 m0, m2
|
||||
punpckh%2%3 m6, m1, m2
|
||||
punpckl%2%3 m1, m2
|
||||
psub%3 m3, m4, m0
|
||||
psub%3 m7, m4, m5
|
||||
psub%3 m3, m1
|
||||
psub%3 m7, m6
|
||||
%if %0 == 4; 16 bit
|
||||
pabsd m3, m3
|
||||
pabsd m7, m7
|
||||
%else
|
||||
ABS2 m3, m7, m1, m6
|
||||
packuswb m3, m7
|
||||
%endif
|
||||
packus%3%2 m3, m7
|
||||
mova [dstq + xq], m3
|
||||
add xq, mmsize
|
||||
jl .loop
|
||||
BLEND_END
|
||||
%endmacro
|
||||
|
||||
BLEND_INIT negation, 8
|
||||
%macro NEGATION 3-4
|
||||
BLEND_INIT %1, 8, %4
|
||||
pxor m2, m2
|
||||
%if %0 == 4; 16 bit
|
||||
VBROADCASTI128 m4, [pd_65535]
|
||||
%else
|
||||
VBROADCASTI128 m4, [pw_255]
|
||||
%endif
|
||||
.nextrow:
|
||||
mov xq, widthq
|
||||
|
||||
.loop:
|
||||
movu m0, [topq + xq]
|
||||
movu m1, [bottomq + xq]
|
||||
punpckhbw m5, m0, m2
|
||||
punpcklbw m0, m2
|
||||
punpckhbw m6, m1, m2
|
||||
punpcklbw m1, m2
|
||||
psubw m3, m4, m0
|
||||
psubw m7, m4, m5
|
||||
psubw m3, m1
|
||||
psubw m7, m6
|
||||
punpckh%2%3 m5, m0, m2
|
||||
punpckl%2%3 m0, m2
|
||||
punpckh%2%3 m6, m1, m2
|
||||
punpckl%2%3 m1, m2
|
||||
psub%3 m3, m4, m0
|
||||
psub%3 m7, m4, m5
|
||||
psub%3 m3, m1
|
||||
psub%3 m7, m6
|
||||
%if %0 == 4; 16 bit
|
||||
pabsd m3, m3
|
||||
pabsd m7, m7
|
||||
%else
|
||||
ABS2 m3, m7, m1, m6
|
||||
psubw m0, m4, m3
|
||||
psubw m1, m4, m7
|
||||
packuswb m0, m1
|
||||
%endif
|
||||
psub%3 m0, m4, m3
|
||||
psub%3 m1, m4, m7
|
||||
packus%3%2 m0, m1
|
||||
mova [dstq + xq], m0
|
||||
add xq, mmsize
|
||||
jl .loop
|
||||
@ -384,17 +420,17 @@ BLEND_SIMPLE addition, addusb
|
||||
BLEND_SIMPLE subtract, subusb
|
||||
BLEND_SIMPLE darken, minub
|
||||
BLEND_SIMPLE lighten, maxub
|
||||
GRAINEXTRACT
|
||||
GRAINEXTRACT grainextract, b, w
|
||||
BLEND_MULTIPLY
|
||||
BLEND_SCREEN
|
||||
AVERAGE
|
||||
GRAINMERGE
|
||||
AVERAGE average, b
|
||||
GRAINMERGE grainmerge, b, w
|
||||
HARDMIX
|
||||
PHOENIX phoenix, b
|
||||
DIFFERENCE difference, b, w
|
||||
DIVIDE
|
||||
|
||||
BLEND_ABS
|
||||
EXTREMITY extremity, b, w
|
||||
NEGATION negation, b, w
|
||||
|
||||
%if ARCH_X86_64
|
||||
BLEND_SIMPLE addition_16, addusw, 1
|
||||
@ -402,18 +438,24 @@ BLEND_SIMPLE and_16, and, 1
|
||||
BLEND_SIMPLE or_16, or, 1
|
||||
BLEND_SIMPLE subtract_16, subusw, 1
|
||||
BLEND_SIMPLE xor_16, xor, 1
|
||||
AVERAGE average_16, w, 1
|
||||
%endif
|
||||
|
||||
INIT_XMM ssse3
|
||||
DIFFERENCE difference, b, w
|
||||
BLEND_ABS
|
||||
EXTREMITY extremity, b, w
|
||||
NEGATION negation, b, w
|
||||
|
||||
INIT_XMM sse4
|
||||
%if ARCH_X86_64
|
||||
BLEND_SIMPLE darken_16, minuw, 1
|
||||
BLEND_SIMPLE lighten_16, maxuw, 1
|
||||
GRAINEXTRACT grainextract_16, w, d, 1
|
||||
GRAINMERGE grainmerge_16, w, d, 1
|
||||
PHOENIX phoenix_16, w, 1
|
||||
DIFFERENCE difference_16, w, d, 1
|
||||
EXTREMITY extremity_16, w, d, 1
|
||||
NEGATION negation_16, w, d, 1
|
||||
%endif
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
@ -425,16 +467,17 @@ BLEND_SIMPLE addition, addusb
|
||||
BLEND_SIMPLE subtract, subusb
|
||||
BLEND_SIMPLE darken, minub
|
||||
BLEND_SIMPLE lighten, maxub
|
||||
GRAINEXTRACT
|
||||
GRAINEXTRACT grainextract, b, w
|
||||
BLEND_MULTIPLY
|
||||
BLEND_SCREEN
|
||||
AVERAGE
|
||||
GRAINMERGE
|
||||
AVERAGE average, b
|
||||
GRAINMERGE grainmerge, b, w
|
||||
HARDMIX
|
||||
PHOENIX phoenix, b
|
||||
|
||||
DIFFERENCE difference, b, w
|
||||
BLEND_ABS
|
||||
EXTREMITY extremity, b, w
|
||||
NEGATION negation, b, w
|
||||
|
||||
%if ARCH_X86_64
|
||||
BLEND_SIMPLE addition_16, addusw, 1
|
||||
@ -444,7 +487,12 @@ BLEND_SIMPLE lighten_16, maxuw, 1
|
||||
BLEND_SIMPLE or_16, or, 1
|
||||
BLEND_SIMPLE subtract_16, subusw, 1
|
||||
BLEND_SIMPLE xor_16, xor, 1
|
||||
GRAINEXTRACT grainextract_16, w, d, 1
|
||||
AVERAGE average_16, w, 1
|
||||
GRAINMERGE grainmerge_16, w, d, 1
|
||||
PHOENIX phoenix_16, w, 1
|
||||
DIFFERENCE difference_16, w, d, 1
|
||||
EXTREMITY extremity_16, w, d, 1
|
||||
NEGATION negation_16, w, d, 1
|
||||
%endif
|
||||
%endif
|
||||
|
@ -72,12 +72,22 @@ BLEND_FUNC(negation, avx2)
|
||||
#if ARCH_X86_64
|
||||
BLEND_FUNC(addition_16, sse2)
|
||||
BLEND_FUNC(addition_16, avx2)
|
||||
BLEND_FUNC(grainmerge_16, sse4)
|
||||
BLEND_FUNC(grainmerge_16, avx2)
|
||||
BLEND_FUNC(average_16, sse2)
|
||||
BLEND_FUNC(average_16, avx2)
|
||||
BLEND_FUNC(and_16, sse2)
|
||||
BLEND_FUNC(and_16, avx2)
|
||||
BLEND_FUNC(darken_16, sse4)
|
||||
BLEND_FUNC(darken_16, avx2)
|
||||
BLEND_FUNC(grainextract_16, sse4)
|
||||
BLEND_FUNC(grainextract_16, avx2)
|
||||
BLEND_FUNC(difference_16, sse4)
|
||||
BLEND_FUNC(difference_16, avx2)
|
||||
BLEND_FUNC(extremity_16, sse4)
|
||||
BLEND_FUNC(extremity_16, avx2)
|
||||
BLEND_FUNC(negation_16, sse4)
|
||||
BLEND_FUNC(negation_16, avx2)
|
||||
BLEND_FUNC(lighten_16, sse4)
|
||||
BLEND_FUNC(lighten_16, avx2)
|
||||
BLEND_FUNC(or_16, sse2)
|
||||
@ -152,6 +162,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
|
||||
switch (param->mode) {
|
||||
case BLEND_ADDITION: param->blend = ff_blend_addition_16_sse2; break;
|
||||
case BLEND_AND: param->blend = ff_blend_and_16_sse2; break;
|
||||
case BLEND_AVERAGE: param->blend = ff_blend_average_16_sse2; break;
|
||||
case BLEND_OR: param->blend = ff_blend_or_16_sse2; break;
|
||||
case BLEND_SUBTRACT: param->blend = ff_blend_subtract_16_sse2; break;
|
||||
case BLEND_XOR: param->blend = ff_blend_xor_16_sse2; break;
|
||||
@ -159,8 +170,12 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
|
||||
}
|
||||
if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1) {
|
||||
switch (param->mode) {
|
||||
case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_16_sse4; break;
|
||||
case BLEND_DARKEN: param->blend = ff_blend_darken_16_sse4; break;
|
||||
case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_16_sse4; break;
|
||||
case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_sse4; break;
|
||||
case BLEND_EXTREMITY: param->blend = ff_blend_extremity_16_sse4; break;
|
||||
case BLEND_NEGATION: param->blend = ff_blend_negation_16_sse4; break;
|
||||
case BLEND_LIGHTEN: param->blend = ff_blend_lighten_16_sse4; break;
|
||||
case BLEND_PHOENIX: param->blend = ff_blend_phoenix_16_sse4; break;
|
||||
}
|
||||
@ -168,9 +183,14 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
|
||||
if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1) {
|
||||
switch (param->mode) {
|
||||
case BLEND_ADDITION: param->blend = ff_blend_addition_16_avx2; break;
|
||||
case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_16_avx2; break;
|
||||
case BLEND_AND: param->blend = ff_blend_and_16_avx2; break;
|
||||
case BLEND_AVERAGE: param->blend = ff_blend_average_16_avx2; break;
|
||||
case BLEND_DARKEN: param->blend = ff_blend_darken_16_avx2; break;
|
||||
case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_16_avx2; break;
|
||||
case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_avx2; break;
|
||||
case BLEND_EXTREMITY: param->blend = ff_blend_extremity_16_avx2; break;
|
||||
case BLEND_NEGATION: param->blend = ff_blend_negation_16_avx2; break;
|
||||
case BLEND_LIGHTEN: param->blend = ff_blend_lighten_16_avx2; break;
|
||||
case BLEND_OR: param->blend = ff_blend_or_16_avx2; break;
|
||||
case BLEND_PHOENIX: param->blend = ff_blend_phoenix_16_avx2; break;
|
||||
|
Loading…
Reference in New Issue
Block a user