diff --git a/libswscale/arm/swscale_unscaled.c b/libswscale/arm/swscale_unscaled.c
index 1b50acd7e2..ac1e4a978b 100644
--- a/libswscale/arm/swscale_unscaled.c
+++ b/libswscale/arm/swscale_unscaled.c
@@ -69,8 +69,8 @@ static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[
     c->yuv2rgb_v2g_coeff / ((precision) == 16 ? 1 << 7 : 1),                      \
     c->yuv2rgb_u2b_coeff / ((precision) == 16 ? 1 << 7 : 1),                      \
 
-#define DECLARE_FF_YUV420P_TO_RGBX_FUNCS(ofmt, precision)                          \
-int ff_yuv420p_to_##ofmt##_neon_##precision(int w, int h,                          \
+#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt, precision)                       \
+int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h,                         \
                                 uint8_t *dst, int linesize,                        \
                                 const uint8_t *srcY, int linesizeY,                \
                                 const uint8_t *srcU, int linesizeU,                \
@@ -79,12 +79,12 @@ int ff_yuv420p_to_##ofmt##_neon_##precision(int w, int h,
                                 int y_offset,                                      \
                                 int y_coeff);                                      \
                                                                                    \
-static int yuv420p_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[],\
+static int ifmt##_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[], \
                                 int srcStride[], int srcSliceY, int srcSliceH,     \
                                 uint8_t *dst[], int dstStride[]) {                 \
     const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE(precision) };               \
                                                                                    \
-    ff_yuv420p_to_##ofmt##_neon_##precision(c->srcW, srcSliceH,                    \
+    ff_##ifmt##_to_##ofmt##_neon_##precision(c->srcW, srcSliceH,                   \
         dst[0] + srcSliceY * dstStride[0], dstStride[0],                           \
         src[0], srcStride[0],                                                      \
         src[1], srcStride[1],                                                      \
@@ -96,16 +96,17 @@ static int yuv420p_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uin
     return 0;                                                                      \
 }                                                                                  \
 
-#define DECLARE_FF_YUV420P_TO_ALL_RGBX_FUNCS(precision)                            \
-DECLARE_FF_YUV420P_TO_RGBX_FUNCS(argb, precision)                                  \
-DECLARE_FF_YUV420P_TO_RGBX_FUNCS(rgba, precision)                                  \
-DECLARE_FF_YUV420P_TO_RGBX_FUNCS(abgr, precision)                                  \
-DECLARE_FF_YUV420P_TO_RGBX_FUNCS(bgra, precision)                                  \
+#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx, precision)                         \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb, precision)                               \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba, precision)                               \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr, precision)                               \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra, precision)                               \
 
-#define DECLARE_FF_YUV420P_TO_ALL_RGBX_ALL_PRECISION_FUNCS                         \
-DECLARE_FF_YUV420P_TO_ALL_RGBX_FUNCS(16)                                           \
+#define DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuvx)                      \
+DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx, 16)                                        \
 
-DECLARE_FF_YUV420P_TO_ALL_RGBX_ALL_PRECISION_FUNCS
+DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuv420p)
+DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuv422p)
 
 #define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt, precision)                        \
 int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h,                         \
@@ -178,6 +179,7 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
     SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
     SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
     SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
+    SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
 }
 
 void ff_get_unscaled_swscale_arm(SwsContext *c)
diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S
index d497dd4050..829e1b65b8 100644
--- a/libswscale/arm/yuv2rgb_neon.S
+++ b/libswscale/arm/yuv2rgb_neon.S
@@ -159,7 +159,23 @@
     vst4.8          {q6, q7}, [\dst,:128]!
 .endm
 
-.macro process_16px_16 ofmt
+.macro process_1l_16px_16 ofmt
+    compute_premult_16 d28, d29, d30, d31
+    vld1.8          {q7}, [r4]!
+    compute_16px_16 r2, d14, d15, \ofmt
+.endm
+
+.macro process_1l_16px_32 ofmt
+    compute_premult_32 d28, d30
+    vld1.8          {q7}, [r4]!
+    vmov            d28, d15                                       @ save right of the line of luma for later use
+    compute_8px_32  r2, d14, \ofmt
+
+    compute_premult_32 d29, d31
+    compute_8px_32  r2, d28, \ofmt
+.endm
+
+.macro process_2l_16px_16 ofmt
     compute_premult_16 d28, d29, d30, d31
 
     vld1.8          {q7}, [r4]!                                    @ first line of luma
@@ -169,7 +185,7 @@
     compute_16px_16 r11, d14, d15, \ofmt
 .endm
 
-.macro process_16px_32 ofmt
+.macro process_2l_16px_32 ofmt
     compute_premult_32 d28, d30
 
     vld1.8          {q7}, [r4]!                                    @ first line of luma
@@ -228,6 +244,28 @@
     ldr             r10,[sp, #120]                                 @ r10 = srcV
 .endm
 
+.macro load_args_yuv422p
+    push            {r4-r12, lr}
+    vpush           {q4-q7}
+    ldr             r4, [sp, #104]                                 @ r4  = srcY
+    ldr             r5, [sp, #108]                                 @ r5  = linesizeY
+    ldr             r6, [sp, #112]                                 @ r6  = srcU
+    ldr             r7, [sp, #116]                                 @ r7  = linesizeU
+    ldr             r12,[sp, #124]                                 @ r12 = linesizeV
+    ldr             r8, [sp, #128]                                 @ r8  = table
+    ldr             r9, [sp, #132]                                 @ r9  = y_offset
+    ldr             r10,[sp, #136]                                 @ r10 = y_coeff
+    vdup.16         d0, r10                                        @ d0  = y_coeff
+    vld1.16         {d1}, [r8]                                     @ d1  = *table
+    add             r11, r2, r3                                    @ r11 = dst + linesize (dst2)
+    lsl             r8, r0, #2
+    sub             r3, r3, r8                                     @ r3  = linesize  - width * 4 (padding)
+    sub             r5, r5, r0                                     @ r5  = linesizeY - width     (paddingY)
+    sub             r7, r7, r0, lsr #1                             @ r7  = linesizeU - width / 2 (paddingU)
+    sub             r12,r12,r0, lsr #1                             @ r12 = linesizeV - width / 2 (paddingV)
+    ldr             r10,[sp, #120]                                 @ r10 = srcV
+.endm
+
 .macro declare_func ifmt ofmt precision
 function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
 
@@ -243,56 +281,89 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
     load_args_yuv420p
 .endif
 
+.ifc \ifmt,yuv422p
+    load_args_yuv422p
+.endif
+
 1:
     mov             r8, r0                                         @ r8 = width
 
 2:
     pld [r6, #64*3]
     pld [r4, #64*3]
-    pld [r12, #64*3]
 
     vmov.i8         d10, #128
 
 .ifc \ifmt,nv12
+    pld [r12, #64*3]
+
     vld2.8          {d2, d3}, [r6]!                                @ q1: interleaved chroma line
     vsubl.u8        q14, d2, d10                                   @ q14 = U - 128
     vsubl.u8        q15, d3, d10                                   @ q15 = V - 128
+
+    process_2l_16px_\precision \ofmt
 .endif
 
 .ifc \ifmt,nv21
+    pld [r12, #64*3]
+
    vld2.8           {d2, d3}, [r6]!                                @ q1: interleaved chroma line
     vsubl.u8        q14, d3, d10                                   @ q14 = U - 128
     vsubl.u8        q15, d2, d10                                   @ q15 = V - 128
+
+    process_2l_16px_\precision \ofmt
 .endif
 
 .ifc \ifmt,yuv420p
     pld [r10, #64*3]
+    pld [r12, #64*3]
+
+    vld1.8          d2, [r6]!                                      @ d2: chroma red line
+    vld1.8          d3, [r10]!                                     @ d3: chroma blue line
+    vsubl.u8        q14, d2, d10                                   @ q14 = U - 128
+    vsubl.u8        q15, d3, d10                                   @ q15 = V - 128
+
+    process_2l_16px_\precision \ofmt
+.endif
+
+.ifc \ifmt,yuv422p
+    pld [r10, #64*3]
 
     vld1.8          d2, [r6]!                                      @ d2: chroma red line
     vld1.8          d3, [r10]!                                     @ d3: chroma blue line
     vsubl.u8        q14, d2, d10                                   @ q14 = U - 128
     vsubl.u8        q15, d3, d10                                   @ q15 = V - 128
+
+    process_1l_16px_\precision \ofmt
 .endif
-
-    process_16px_\precision \ofmt
-
     subs            r8, r8, #16                                    @ width -= 16
     bgt             2b
 
     add             r2, r2, r3                                     @ dst  += padding
     add             r4, r4, r5                                     @ srcY += paddingY
+
+.ifc \ifmt,nv12
     add             r11, r11, r3                                   @ dst2  += padding
     add             r12, r12, r5                                   @ srcY2 += paddingY
 
-.ifc \ifmt,nv12
     add             r6, r6, r7                                     @ srcC += paddingC
+
+    subs            r1, r1, #2                                     @ height -= 2
 .endif
 
 .ifc \ifmt,nv21
+    add             r11, r11, r3                                   @ dst2  += padding
+    add             r12, r12, r5                                   @ srcY2 += paddingY
+
     add             r6, r6, r7                                     @ srcC += paddingC
+
+    subs            r1, r1, #2                                     @ height -= 2
 .endif
 
 .ifc \ifmt,yuv420p
+    add             r11, r11, r3                                   @ dst2  += padding
+    add             r12, r12, r5                                   @ srcY2 += paddingY
+
     ldr             r7, [sp, #116]                                 @ r7 = linesizeU
     sub             r7, r7, r0, lsr #1                             @ r7 = linesizeU - width / 2 (paddingU)
     add             r6, r6, r7                                     @ srcU += paddingU
@@ -300,9 +371,17 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
     ldr             r7, [sp, #124]                                 @ r7 = linesizeV
     sub             r7, r7, r0, lsr #1                             @ r7 = linesizeV - width / 2 (paddingV)
     add             r10, r10, r7                                   @ srcV += paddingV
-.endif
 
     subs            r1, r1, #2                                     @ height -= 2
+.endif
+
+.ifc \ifmt,yuv422p
+    add             r6, r6, r7                                     @ srcU += paddingU
+    add             r10,r10,r12                                    @ srcV += paddingV
+
+    subs            r1, r1, #1                                     @ height -= 1
+.endif
+
     bgt             1b
 
     vpop            {q4-q7}
@@ -324,3 +403,5 @@ declare_rgb_funcs nv12, 32
 declare_rgb_funcs nv21, 32
 declare_rgb_funcs yuv420p, 16
 declare_rgb_funcs yuv420p, 32
+declare_rgb_funcs yuv422p, 16
+declare_rgb_funcs yuv422p, 32
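
The structural point of the patch: the yuv420p path (process_2l_*) consumes one chroma line for every two luma lines and steps the height by 2, while the new yuv422p path (process_1l_*) consumes one chroma line per luma line and steps the height by 1, with chroma still subsampled horizontally by a factor of 2. The following is a rough scalar model of that yuv422p loop structure, not code from the patch: the function name is invented, the output order is fixed to RGBA, and the coefficients are rounded limited-range BT.601 constants chosen for illustration, whereas the NEON code reads its coefficients from the SwsContext table built by YUV_TO_RGB_TABLE().

#include <stdint.h>

/* Illustrative scalar model (assumption, not the patch's code): yuv422p to
 * RGBA, one luma line and one chroma line per output row, one U/V sample
 * shared by two horizontal pixels. */
static uint8_t clamp8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void yuv422p_to_rgba_scalar(int w, int h,
                                   uint8_t *dst, int linesize,
                                   const uint8_t *srcY, int linesizeY,
                                   const uint8_t *srcU, int linesizeU,
                                   const uint8_t *srcV, int linesizeV)
{
    for (int y = 0; y < h; y++) {                    /* height -= 1 per row */
        const uint8_t *py  = srcY + y * linesizeY;
        const uint8_t *pu  = srcU + y * linesizeU;   /* full vertical chroma resolution */
        const uint8_t *pv  = srcV + y * linesizeV;
        uint8_t       *out = dst  + y * linesize;

        for (int x = 0; x < w; x++) {
            int Y = py[x]      -  16;
            int U = pu[x >> 1] - 128;                /* 4:2:2: one chroma sample per 2 pixels */
            int V = pv[x >> 1] - 128;

            int R = (298 * Y           + 409 * V + 128) >> 8;
            int G = (298 * Y - 100 * U - 208 * V + 128) >> 8;
            int B = (298 * Y + 516 * U           + 128) >> 8;

            out[4 * x + 0] = clamp8(R);
            out[4 * x + 1] = clamp8(G);
            out[4 * x + 2] = clamp8(B);
            out[4 * x + 3] = 0xff;
        }
    }
}

The NEON loop in the patch handles 16 pixels per inner iteration (subs r8, r8, #16 above), so this sketch is only meant to clarify the per-row chroma handling, not to serve as a drop-in replacement.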