From 84e70ef004d763262ea4e795341ff240b8b10da3 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Tue, 21 Jun 2011 00:30:23 -0400 Subject: [PATCH] h264: Add x86 assembly for 10-bit weight/biweight H.264 functions. Mainly ported from 8-bit H.264 weight/biweight. Signed-off-by: Diego Biurrun --- libavcodec/x86/Makefile | 1 + libavcodec/x86/h264_weight_10bit.asm | 321 +++++++++++++++++++++++++++ libavcodec/x86/h264dsp_mmx.c | 61 +++++ 3 files changed, 383 insertions(+) create mode 100644 libavcodec/x86/h264_weight_10bit.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index ea57bd1db6..022ab27766 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -15,6 +15,7 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ x86/h264_idct.o \ x86/h264_idct_10bit.o \ x86/h264_weight.o \ + x86/h264_weight_10bit.o \ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ x86/h264_intrapred_10bit.o diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm new file mode 100644 index 0000000000..1c58d72d94 --- /dev/null +++ b/libavcodec/x86/h264_weight_10bit.asm @@ -0,0 +1,321 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Daniel Kang +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +pw_pixel_max: times 8 dw ((1 << 10)-1) +sq_1: dq 1 + dq 0 + +cextern pw_1 + +SECTION .text + +;----------------------------------------------------------------------------- +; void h264_weight(uint8_t *dst, int stride, int log2_denom, +; int weight, int offset); +;----------------------------------------------------------------------------- +%ifdef ARCH_X86_32 +DECLARE_REG_TMP 2 +%else +DECLARE_REG_TMP 10 +%endif + +%macro WEIGHT_PROLOGUE 1 + mov t0, %1 +.prologue + PROLOGUE 0,5,8 + movifnidn r0, r0mp + movifnidn r1d, r1m + movifnidn r3d, r3m + movifnidn r4d, r4m +%endmacro + +%macro WEIGHT_SETUP 1 + mova m0, [pw_1] + movd m2, r2m + pslld m0, m2 ; 1<h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; #endif + c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; + c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; + c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2; + c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2; + c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2; + c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2; + c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2; + c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2; + c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2; + c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2; + c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2; + c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; #if HAVE_ALIGNED_STACK @@ -463,6 +505,25 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; #endif } + if (mm_flags&AV_CPU_FLAG_SSE4) { + c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; + c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; + c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; + c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4; + c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4; + c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4; + c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4; + c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4; + c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4; + c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4; + c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4; + c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4; + c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4; + } #if HAVE_AVX if (mm_flags&AV_CPU_FLAG_AVX) { c->h264_idct_dc_add =