ARM: NEON optimised h264_idct_add

Originally committed as revision 16150 to svn://svn.ffmpeg.org/ffmpeg/trunk
2024-12-22 07:20:45 +00:00 · 2008-12-15 22:12:51 +00:00 · 2008-12-15 22:12:51 +00:00 · c598cf25f4
commit c598cf25f4
parent 5813e05d08
3 changed files with 82 additions and 0 deletions
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@ -451,6 +451,7 @@ OBJS-$(HAVE_IWMMXT)                    += armv4l/dsputil_iwmmxt.o       \
 OBJS-$(HAVE_NEON)                      += armv4l/dsputil_neon.o         \
                                          armv4l/dsputil_neon_s.o       \
                                          armv4l/h264dsp_neon.o         \
                                          armv4l/h264idct_neon.o        \
                                          armv4l/simple_idct_neon.o     \
 OBJS-$(ARCH_BFIN)                      += bfin/dsputil_bfin.o           \
--- a/libavcodec/armv4l/dsputil_neon.c
+++ b/libavcodec/armv4l/dsputil_neon.c
@ -92,6 +92,8 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                       int beta, int8_t *tc0);
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 {
    c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
@ -160,4 +162,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
    c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
    c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
    c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
    c->h264_idct_add = ff_h264_idct_add_neon;
 }
--- a/libavcodec/armv4l/h264idct_neon.S
+++ b/libavcodec/armv4l/h264idct_neon.S
@ -0,0 +1,77 @@
 /*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "asm.S"
        .fpu neon
        .text
 @ void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride)
 @
 @ Runs a two-pass 4x4 inverse integer transform over the sixteen 16-bit
 @ coefficients at block, shifts the result right by 6 (a rounding term of
 @ 32 is added up front), adds it to the 4x4 group of bytes at dst (rows
 @ are stride bytes apart) and writes the sums back to dst with unsigned
 @ 8-bit saturation.
 @
 @ In:    r0 = dst, r1 = block, r2 = stride (the three prototype arguments,
 @        in order).
 @        block is loaded with a 128-bit alignment hint and is only read.
 @        Every dst row is accessed with a 32-bit alignment hint.
 @ Out:   no register result; 4 bytes are stored to each of 4 dst rows.
 @ Clobb: r0 (left at dst + 4*stride), r3, q0-q3 (d0-d7), q8-q9 (d16-d19).
 @        d8-d15 and the stack are not touched.
 @ Note:  the transform arithmetic uses non-saturating 16-bit adds and
 @        subtracts; only the final narrowing to bytes saturates.
 @        The function macro and its export=1 argument come from asm.S,
 @        which is not shown here.
 function ff_h264_idct_add_neon, export=1
        @ Rounding term: 32 is half of the final divisor 64 (shift by 6 below).
        mov             r3,  #(1<<5)
        vmov.i16        d16, #0
        vmov.16         d16[0],   r3               @ d16 = { 32, 0, 0, 0 }
        vld1.64         {d0-d3},  [r1,:128]        @ d0..d3 = a0..a3, four 16-bit values each
        @ Only the first coefficient gets the rounding term; it enters every
        @ output with weight 1 through both passes, so all 16 results gain +32.
        vadd.i16        d0,  d0,  d16
        @ Pass 1: butterflies across registers, four lanes at a time.
        vswp            d1,  d2                    @ d1 = a2, d2 = a1
        vadd.i16        d4,  d0,  d1               @ d4 = a0 + a2
        vshr.s16        q8,  q1,  #1               @ d16 = a1 >> 1, d17 = a3 >> 1
        vsub.i16        d5,  d0,  d1               @ d5 = a0 - a2
        vadd.i16        d6,  d2,  d17              @ d6 = a1 + (a3 >> 1)
        vsub.i16        d7,  d16, d3               @ d7 = (a1 >> 1) - a3
        vadd.i16        q0,  q2,  q3               @ d0 = d4 + d6, d1 = d5 + d7
        vsub.i16        q1,  q2,  q3               @ d2 = d4 - d6, d3 = d5 - d7
        @ Transpose the 4x4 matrix of 16-bit values whose rows are d0, d1, d3, d2
        @ (in that order); the transposed rows t0..t3 land in d0, d1, d3, d2.
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        @ Pass 2: the same butterflies on the transposed data. The loads of the
        @ four dst rows are interleaved with the arithmetic (presumably to hide
        @ load latency - the instruction order should be kept as is).
        vadd.i16        d4,  d0,  d3               @ d4 = t0 + t2
        vld1.32         {d18[0]}, [r0,:32], r2     @ d18[0] = dst row 0, r0 += stride
        vswp            d1,  d3                    @ d1 = t2, d3 = t1
        vshr.s16        q8,  q1,  #1               @ d16 = t3 >> 1, d17 = t1 >> 1
        vld1.32         {d19[1]}, [r0,:32], r2     @ d19[1] = dst row 1, r0 += stride
        vsub.i16        d5,  d0,  d1               @ d5 = t0 - t2
        vld1.32         {d18[1]}, [r0,:32], r2     @ d18[1] = dst row 2, r0 += stride
        vadd.i16        d6,  d16, d3               @ d6 = (t3 >> 1) + t1
        vld1.32         {d19[0]}, [r0,:32], r2     @ d19[0] = dst row 3, r0 += stride
        vsub.i16        d7,  d2,  d17              @ d7 = t3 - (t1 >> 1)
        sub             r0,  r0,  r2, lsl #2       @ rewind r0 by 4*stride, back to dst row 0
        vadd.i16        q0,  q2,  q3               @ d0 = result for row 0, d1 = result for row 2
        vsub.i16        q1,  q2,  q3               @ d2 = result for row 3, d3 = result for row 1
        @ Scale down: arithmetic shift right by 6 on all 16 results.
        vshr.s16        q0,  q0,  #6
        vshr.s16        q1,  q1,  #6
        @ Widen the dst bytes to 16 bits (unsigned) and add them; the lane layout
        @ of d18/d19 (rows 0,2 / rows 3,1) matches the row order held in q0/q1.
        vaddw.u8        q0,  q0,  d18
        vaddw.u8        q1,  q1,  d19
        @ Narrow signed 16-bit to unsigned 8-bit with saturation.
        vqmovun.s16     d0,  q0                    @ d0 = { row 0, row 2 }
        vqmovun.s16     d1,  q1                    @ d1 = { row 3, row 1 }
        @ Store 4 bytes per row in row order 0, 1, 2, 3; r0 advances by stride each time.
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        bx              lr
        .endfunc