mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-03-11 06:58:18 +00:00
x86/vf_blend: Add SSE2 optimization for divide
4.5x faster than the C float version with autovectorization, 10x faster than the C int version, and 25x faster than the C float version without autovectorization.
This commit is contained in:
parent
1c9215e580
commit
222e6da605
@ -24,6 +24,7 @@
|
|||||||
|
|
||||||
SECTION_RODATA
|
SECTION_RODATA
|
||||||
|
|
||||||
|
ps_255: times 4 dd 255.0
|
||||||
pw_1: times 8 dw 1
|
pw_1: times 8 dw 1
|
||||||
pw_128: times 8 dw 128
|
pw_128: times 8 dw 128
|
||||||
pw_255: times 8 dw 255
|
pw_255: times 8 dw 255
|
||||||
@ -218,6 +219,35 @@ BLEND_INIT hardmix, 5
|
|||||||
jl .loop
|
jl .loop
|
||||||
BLEND_END
|
BLEND_END
|
||||||
|
|
||||||
|
; divide blend: per byte, dst = min(top / bottom * 255.0, 255.0), truncated to an integer.
; Each iteration handles 4 bytes: widen to 32-bit lanes, do the maths in single-precision
; float, then narrow back to bytes.
; NOTE(review): BLEND_INIT / BLEND_END are macros defined outside this view; the register
; names (topq, bottomq, dstq, widthq, xq) and the row-advance logic come from them - confirm there.
BLEND_INIT divide, 4
|
||||||
|
pxor m2, m2 ; m2 = 0, used as the zero half for the unpack (zero-extend) steps
|
||||||
|
mova m3, [ps_255] ; m3 = 4 x 255.0f, used both as scale factor and as upper clamp
|
||||||
|
.nextrow:
|
||||||
|
mov xq, widthq ; reset the column index for this row (presumably a negative offset counting up to 0 - confirm in BLEND_INIT)
|
||||||
|
|
||||||
|
.loop:
|
||||||
|
movd m0, [topq + xq] ; 000000xx: load 4 bytes from the top plane
|
||||||
|
||||||
|
movd m1, [bottomq + xq] ; load the matching 4 bytes from the bottom plane
|
||||||
|
punpcklbw m0, m2 ; 00000x0x: interleave with zero, bytes -> 16-bit words
|
||||||
|
punpcklbw m1, m2
|
||||||
|
punpcklwd m0, m2 ; 000x000x: interleave with zero, words -> 32-bit dwords
|
||||||
|
punpcklwd m1, m2
|
||||||
|
|
||||||
|
cvtdq2ps m0, m0 ; int32 lanes -> single-precision float
|
||||||
|
cvtdq2ps m1, m1
|
||||||
|
divps m0, m1 ; a / b (bottom == 0 yields +Inf, or NaN for 0 / 0)
|
||||||
|
mulps m0, m3 ; a / b * 255
|
||||||
|
minps m0, m3 ; clamp to 255.0; minps returns its second operand (m3) when a lane is NaN, so 0 / 0 also becomes 255
|
||||||
|
cvttps2dq m0, m0 ; float -> int32 with truncation toward zero
|
||||||
|
|
||||||
|
packssdw m0, m0 ; 00000x0x: dwords -> words (signed saturation)
|
||||||
|
||||||
|
packuswb m0, m0 ; 000000xx: words -> bytes (unsigned saturation)
|
||||||
|
movd [dstq + xq], m0 ; store the 4 result bytes
|
||||||
|
add xq, mmsize / 4 ; advance the index by mmsize / 4 (4 if mmsize is 16 - confirm); sets the flags tested by jl below
|
||||||
|
|
||||||
|
jl .loop ; repeat while the add above left xq negative (signed compare)
|
||||||
|
BLEND_END
|
||||||
|
|
||||||
BLEND_INIT phoenix, 4
|
BLEND_INIT phoenix, 4
|
||||||
mova m3, [pb_255]
|
mova m3, [pb_255]
|
||||||
.nextrow:
|
.nextrow:
|
||||||
|
@ -39,6 +39,7 @@ BLEND_FUNC(difference128, sse2)
|
|||||||
BLEND_FUNC(multiply, sse2)
|
BLEND_FUNC(multiply, sse2)
|
||||||
BLEND_FUNC(screen, sse2)
|
BLEND_FUNC(screen, sse2)
|
||||||
BLEND_FUNC(hardmix, sse2)
|
BLEND_FUNC(hardmix, sse2)
|
||||||
|
BLEND_FUNC(divide, sse2)
|
||||||
BLEND_FUNC(lighten, sse2)
|
BLEND_FUNC(lighten, sse2)
|
||||||
BLEND_FUNC(or, sse2)
|
BLEND_FUNC(or, sse2)
|
||||||
BLEND_FUNC(phoenix, sse2)
|
BLEND_FUNC(phoenix, sse2)
|
||||||
@ -61,6 +62,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
|
|||||||
case BLEND_AVERAGE: param->blend = ff_blend_average_sse2; break;
|
case BLEND_AVERAGE: param->blend = ff_blend_average_sse2; break;
|
||||||
case BLEND_DARKEN: param->blend = ff_blend_darken_sse2; break;
|
case BLEND_DARKEN: param->blend = ff_blend_darken_sse2; break;
|
||||||
case BLEND_DIFFERENCE128: param->blend = ff_blend_difference128_sse2; break;
|
case BLEND_DIFFERENCE128: param->blend = ff_blend_difference128_sse2; break;
|
||||||
|
case BLEND_DIVIDE: param->blend = ff_blend_divide_sse2; break;
|
||||||
case BLEND_HARDMIX: param->blend = ff_blend_hardmix_sse2; break;
|
case BLEND_HARDMIX: param->blend = ff_blend_hardmix_sse2; break;
|
||||||
case BLEND_LIGHTEN: param->blend = ff_blend_lighten_sse2; break;
|
case BLEND_LIGHTEN: param->blend = ff_blend_lighten_sse2; break;
|
||||||
case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break;
|
case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break;
|
||||||
|
Loading…
Reference in New Issue
Block a user