diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c
index ef0d6ab2a4..cb887887ee 100644
--- a/libavcodec/v210enc.c
+++ b/libavcodec/v210enc.c
@@ -24,84 +24,205 @@
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "v210enc.h"
+
+/* Clip samples to 4..1019 (10-bit input) or 1..254 (8-bit input). */
+#define CLIP(v) av_clip(v, 4, 1019)
+#define CLIP8(v) av_clip(v, 1, 254)
+
+/* Pack three clipped 10-bit samples into one little-endian 32-bit word
+ * (bits 0-9, 10-19, 20-29), store it at dst and advance dst by 4 bytes.
+ * Uses the caller's local variables val and dst. */
+#define WRITE_PIXELS(a, b, c)           \
+    do {                                \
+        val  =  CLIP(*a++);             \
+        val |= (CLIP(*b++) << 10) |     \
+               (CLIP(*c++) << 20);      \
+        AV_WL32(dst, val);              \
+        dst += 4;                       \
+    } while (0)
+
+/* Same as WRITE_PIXELS for 8-bit samples: each one is shifted left by
+ * two extra bits so that it fills the same 10-bit field. */
+#define WRITE_PIXELS8(a, b, c)          \
+    do {                                \
+        val  = (CLIP8(*a++) << 2);      \
+        val |= (CLIP8(*b++) << 12) |    \
+               (CLIP8(*c++) << 22);     \
+        AV_WL32(dst, val);              \
+        dst += 4;                       \
+    } while (0)
+
+/**
+ * Pack 8-bit planar 4:2:2 samples into v210, 12 pixels (32 output bytes)
+ * per iteration. Pixels beyond the last whole group of 12 are not written.
+ */
+static void v210_planar_pack_8_c(const uint8_t *y, const uint8_t *u,
+                                 const uint8_t *v, uint8_t *dst,
+                                 ptrdiff_t width)
+{
+    uint32_t val;
+    int i;
+
+    /* unroll this to match the assembly */
+    for (i = 0; i < width - 11; i += 12) {
+        WRITE_PIXELS8(u, y, v);
+        WRITE_PIXELS8(y, u, y);
+        WRITE_PIXELS8(v, y, u);
+        WRITE_PIXELS8(y, v, y);
+        WRITE_PIXELS8(u, y, v);
+        WRITE_PIXELS8(y, u, y);
+        WRITE_PIXELS8(v, y, u);
+        WRITE_PIXELS8(y, v, y);
+    }
+}
+
+/**
+ * Pack 10-bit planar 4:2:2 samples into v210, 6 pixels (16 output bytes)
+ * per iteration. Pixels beyond the last whole group of 6 are not written.
+ */
+static void v210_planar_pack_10_c(const uint16_t *y, const uint16_t *u,
+                                  const uint16_t *v, uint8_t *dst,
+                                  ptrdiff_t width)
+{
+    uint32_t val;
+    int i;
+
+    for (i = 0; i < width - 5; i += 6) {
+        WRITE_PIXELS(u, y, v);
+        WRITE_PIXELS(y, u, y);
+        WRITE_PIXELS(v, y, u);
+        WRITE_PIXELS(y, v, y);
+    }
+}
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
+    V210EncContext *s = avctx->priv_data;
+
     if (avctx->width & 1) {
         av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
         return AVERROR(EINVAL);
     }
 
-    if (avctx->bits_per_raw_sample != 10)
-        av_log(avctx, AV_LOG_WARNING, "bits per raw sample: %d != 10-bit\n",
-               avctx->bits_per_raw_sample);
-
     avctx->coded_frame = av_frame_alloc();
     if (!avctx->coded_frame)
         return AVERROR(ENOMEM);
 
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
 
+    s->pack_line_8  = v210_planar_pack_8_c;
+    s->pack_line_10 = v210_planar_pack_10_c;
+
+    if (ARCH_X86)
+        ff_v210enc_init_x86(s);
+
     return 0;
 }
 
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pic, int *got_packet)
 {
+    V210EncContext *s = avctx->priv_data;
     int aligned_width = ((avctx->width + 47) / 48) * 48;
     int stride = aligned_width * 8 / 3;
     int line_padding = stride - ((avctx->width * 8 + 11) / 12) * 4;
     int h, w, ret;
-    const uint16_t *y = (const uint16_t*)pic->data[0];
-    const uint16_t *u = (const uint16_t*)pic->data[1];
-    const uint16_t *v = (const uint16_t*)pic->data[2];
-    PutByteContext p;
+    uint8_t *dst;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->height * stride)) < 0) {
+    ret = ff_alloc_packet(pkt, avctx->height * stride);
+    if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
         return ret;
     }
+    dst = pkt->data;
 
-    bytestream2_init_writer(&p, pkt->data, pkt->size);
+    if (pic->format == AV_PIX_FMT_YUV422P10) {
+        const uint16_t *y = (const uint16_t *)pic->data[0];
+        const uint16_t *u = (const uint16_t *)pic->data[1];
+        const uint16_t *v = (const uint16_t *)pic->data[2];
+        for (h = 0; h < avctx->height; h++) {
+            uint32_t val;
+            w = (avctx->width / 6) * 6;
+            /* the SIMD packers always run at least one iteration */
+            if (w)
+                s->pack_line_10(y, u, v, dst, w);
 
-#define CLIP(v) av_clip(v, 4, 1019)
+            y += w;
+            u += w >> 1;
+            v += w >> 1;
+            dst += (w / 6) * 16;
+            if (w < avctx->width - 1) {
+                WRITE_PIXELS(u, y, v);
 
-#define WRITE_PIXELS(a, b, c)           \
-    do {                                \
-        val =   CLIP(*a++);             \
-        val |= (CLIP(*b++) << 10) |     \
-               (CLIP(*c++) << 20);      \
-        bytestream2_put_le32u(&p, val); \
-    } while (0)
+                val = CLIP(*y++);
+                if (w == avctx->width - 2) {
+                    AV_WL32(dst, val);
+                    dst += 4;
+                }
+            }
+            if (w < avctx->width - 3) {
+                val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20);
+                AV_WL32(dst, val);
+                dst += 4;
 
-    for (h = 0; h < avctx->height; h++) {
-        uint32_t val;
-        for (w = 0; w < avctx->width - 5; w += 6) {
-            WRITE_PIXELS(u, y, v);
-            WRITE_PIXELS(y, u, y);
-            WRITE_PIXELS(v, y, u);
-            WRITE_PIXELS(y, v, y);
+                val = CLIP(*v++) | (CLIP(*y++) << 10);
+                AV_WL32(dst, val);
+                dst += 4;
+            }
+
+            memset(dst, 0, line_padding);
+            dst += line_padding;
+            y += pic->linesize[0] / 2 - avctx->width;
+            u += pic->linesize[1] / 2 - avctx->width / 2;
+            v += pic->linesize[2] / 2 - avctx->width / 2;
         }
-        if (w < avctx->width - 1) {
-            WRITE_PIXELS(u, y, v);
+    } else if (pic->format == AV_PIX_FMT_YUV422P) {
+        const uint8_t *y = pic->data[0];
+        const uint8_t *u = pic->data[1];
+        const uint8_t *v = pic->data[2];
+        for (h = 0; h < avctx->height; h++) {
+            uint32_t val;
+            w = (avctx->width / 12) * 12;
+            /* the SIMD packers always run at least one iteration */
+            if (w)
+                s->pack_line_8(y, u, v, dst, w);
 
-            val = CLIP(*y++);
-            if (w == avctx->width - 2)
-                bytestream2_put_le32u(&p, val);
+            y += w;
+            u += w >> 1;
+            v += w >> 1;
+            dst += (w / 12) * 32;
+
+            for (; w < avctx->width - 5; w += 6) {
+                WRITE_PIXELS8(u, y, v);
+                WRITE_PIXELS8(y, u, y);
+                WRITE_PIXELS8(v, y, u);
+                WRITE_PIXELS8(y, v, y);
+            }
+            if (w < avctx->width - 1) {
+                WRITE_PIXELS8(u, y, v);
+
+                val = CLIP8(*y++) << 2;
+                if (w == avctx->width - 2) {
+                    AV_WL32(dst, val);
+                    dst += 4;
+                }
+            }
+            if (w < avctx->width - 3) {
+                val |= (CLIP8(*u++) << 12) | (CLIP8(*y++) << 22);
+                AV_WL32(dst, val);
+                dst += 4;
+
+                val = (CLIP8(*v++) << 2) | (CLIP8(*y++) << 12);
+                AV_WL32(dst, val);
+                dst += 4;
+            }
+            memset(dst, 0, line_padding);
+            dst += line_padding;
+
+            y += pic->linesize[0] - avctx->width;
+            u += pic->linesize[1] - avctx->width / 2;
+            v += pic->linesize[2] - avctx->width / 2;
         }
-        if (w < avctx->width - 3) {
-            val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20);
-            bytestream2_put_le32u(&p, val);
-
-            val = CLIP(*v++) | (CLIP(*y++) << 10);
-            bytestream2_put_le32u(&p, val);
-        }
-
-        bytestream2_set_buffer(&p, 0, line_padding);
-
-        y += pic->linesize[0] / 2 - avctx->width;
-        u += pic->linesize[1] / 2 - avctx->width / 2;
-        v += pic->linesize[2] / 2 - avctx->width / 2;
     }
 
     pkt->flags |= AV_PKT_FLAG_KEY;
@@ -121,8 +242,9 @@ AVCodec ff_v210_encoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_V210,
+    .priv_data_size = sizeof(V210EncContext),
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_close,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV422P10, AV_PIX_FMT_NONE },
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P, AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/v210enc.h b/libavcodec/v210enc.h
new file mode 100644
index 0000000000..f950328d30
--- /dev/null
+++ b/libavcodec/v210enc.h
@@ -0,0 +1,35 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V210ENC_H
+#define AVCODEC_V210ENC_H
+
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixfmt.h"
+
+typedef struct { /* v210 line packers; width is counted in luma samples */
+    void (*pack_line_8)(const uint8_t *y, const uint8_t *u,
+                        const uint8_t *v, uint8_t *dst, ptrdiff_t width);
+    void (*pack_line_10)(const uint16_t *y, const uint16_t *u,
+                         const uint16_t *v, uint8_t *dst, ptrdiff_t width);
+} V210EncContext;
+
+void ff_v210enc_init_x86(V210EncContext *s); /* installs x86 SIMD packers chosen from the CPU flags */
+
+#endif /* AVCODEC_V210ENC_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 3cec000e1d..7caa256c43 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -47,6 +47,7 @@ OBJS-$(CONFIG_RV40_DECODER)            += x86/rv34dsp_init.o            \
                                           x86/rv40dsp_init.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc.o
 OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
+OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
@@ -107,6 +108,7 @@ YASM-OBJS-$(CONFIG_PIXBLOCKDSP)        += x86/pixblockdsp.o
 YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
                                           x86/fpel.o                    \
                                           x86/qpel.o
+YASM-OBJS-$(CONFIG_V210_ENCODER)       += x86/v210enc.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
new file mode 100644
index 0000000000..595c8907b3
--- /dev/null
+++ b/libavcodec/x86/v210enc.asm
@@ -0,0 +1,145 @@
+;******************************************************************************
+;* V210 SIMD pack
+;* Copyright (c) 2014 Kieran Kunhya
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_enc_min_10: times 8 dw 0x4   ; lower clip bound for 10-bit samples
+v210_enc_max_10: times 8 dw 0x3fb ; upper clip bound (1019)
+
+v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0 ; x1/x4/x16 = shift by 0/2/4 bits; last two words dropped
+v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
+
+v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0 ; words 3 and 7 are dropped
+v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
+
+v210_enc_min_8: times 16 db 0x1  ; lower clip bound for 8-bit samples
+v210_enc_max_8: times 16 db 0xfe ; upper clip bound (254)
+
+v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 ; bytes 6..11 -> words
+v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0 ; the 10-bit multipliers times 4
+
+v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
+v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
+
+v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0 ; the 10-bit multipliers times 4
+
+SECTION .text
+
+%macro v210_planar_pack_10 0
+
+; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
+cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
+    lea     yq, [yq+2*widthq] ; y += width samples (2 bytes each)
+    add     uq, widthq        ; u += width bytes
+    add     vq, widthq        ; v += width bytes
+    neg     widthq            ; index counts up from -width towards 0
+
+    mova    m2, [v210_enc_min_10]
+    mova    m3, [v210_enc_max_10]
+
+.loop: ; bottom-tested: the body runs once even when width is 0
+    movu    m0, [yq+2*widthq] ; loads 8 luma words; the multiplier zeroes the last two
+    CLIPW   m0, m2, m3
+
+    movq    m1, [uq+widthq]   ; low half: 4 words of u
+    movhps  m1, [vq+widthq]   ; high half: 4 words of v
+    CLIPW   m1, m2, m3
+
+    pmullw  m0, [v210_enc_luma_mult_10]
+    pshufb  m0, [v210_enc_luma_shuf_10]
+
+    pmullw  m1, [v210_enc_chroma_mult_10]
+    pshufb  m1, [v210_enc_chroma_shuf_10]
+
+    por     m0, m1            ; merge luma and chroma bit fields
+
+    movu    [dstq], m0
+
+    add     dstq, mmsize
+    add     widthq, 6         ; 6 luma samples consumed per iteration
+    jl .loop
+
+    RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_pack_10
+
+%macro v210_planar_pack_8 0
+
+; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width)
+cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
+    add     yq, widthq        ; y += width bytes
+    shr     widthq, 1         ; from here on width counts chroma samples
+    add     uq, widthq
+    add     vq, widthq
+    neg     widthq            ; index counts up from -width/2 towards 0
+
+    mova    m4, [v210_enc_min_8]
+    mova    m5, [v210_enc_max_8]
+    pxor    m6, m6            ; zero register for the byte->word unpack
+
+.loop: ; bottom-tested: the body runs once even when width is 0
+    movu    m1, [yq+2*widthq] ; loads 16 luma bytes; 12 of them are used
+    CLIPUB  m1, m4, m5
+
+    punpcklbw m0, m1, m6      ; low 8 luma bytes -> words
+    ; can't unpack high bytes in the same way because we process
+    ; only six bytes at a time
+    pshufb  m1, [v210_enc_luma_shuf_8]
+
+    pmullw  m0, [v210_enc_luma_mult_8]
+    pmullw  m1, [v210_enc_luma_mult_8]
+    pshufb  m0, [v210_enc_luma_shuf_10]
+    pshufb  m1, [v210_enc_luma_shuf_10]
+
+    movq    m3, [uq+widthq]   ; low half: 8 bytes of u
+    movhps  m3, [vq+widthq]   ; high half: 8 bytes of v
+    CLIPUB  m3, m4, m5
+
+    ; shuffle and multiply to get the same packing as in 10-bit
+    pshufb  m2, m3, [v210_enc_chroma_shuf1_8]
+    pshufb  m3, [v210_enc_chroma_shuf2_8]
+
+    pmullw  m2, [v210_enc_chroma_mult_8]
+    pmullw  m3, [v210_enc_chroma_mult_8]
+    pshufb  m2, [v210_enc_chroma_shuf_10]
+    pshufb  m3, [v210_enc_chroma_shuf_10]
+
+    por     m0, m2
+    por     m1, m3
+
+    movu    [dstq], m0
+    movu    [dstq+mmsize], m1
+
+    add     dstq, 2*mmsize
+    add     widthq, 6         ; 6 chroma samples = 12 luma samples per iteration
+    jl .loop
+
+    RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_pack_8
+INIT_XMM avx
+v210_planar_pack_8
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
new file mode 100644
index 0000000000..95b999bc05
--- /dev/null
+++ b/libavcodec/x86/v210enc_init.c
@@ -0,0 +1,42 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/v210enc.h"
+
+void ff_v210_planar_pack_8_ssse3(const uint8_t *y, const uint8_t *u,
+                                 const uint8_t *v, uint8_t *dst,
+                                 ptrdiff_t width);
+void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u,
+                               const uint8_t *v, uint8_t *dst, ptrdiff_t width);
+void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
+                                  const uint16_t *v, uint8_t *dst,
+                                  ptrdiff_t width);
+/* Point the line packers in s at the SIMD versions the running CPU supports. */
+av_cold void ff_v210enc_init_x86(V210EncContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        s->pack_line_8 = ff_v210_planar_pack_8_ssse3;
+        s->pack_line_10 = ff_v210_planar_pack_10_ssse3;
+    }
+    /* Checked after SSSE3 so the AVX 8-bit packer wins; only pack_line_8 has an AVX version here. */
+    if (EXTERNAL_AVX(cpu_flags))
+        s->pack_line_8 = ff_v210_planar_pack_8_avx;
+}
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 11779cf07f..9f64dd13e1 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -584,6 +584,11 @@
 %endif
 %endmacro
 
+%macro CLIPUB 3 ;(dst, min, max)
+    pmaxub %1, %2 ; raise each unsigned byte to at least min
+    pminub %1, %3 ; lower each unsigned byte to at most max
+%endmacro
+
 %macro CLIPW 3 ;(dst, min, max)
     pmaxsw %1, %2
     pminsw %1, %3
diff --git a/tests/ref/vsynth/vsynth1-v210 b/tests/ref/vsynth/vsynth1-v210
index defd2f658b..e2f4588c54 100644
--- a/tests/ref/vsynth/vsynth1-v210
+++ b/tests/ref/vsynth/vsynth1-v210
@@ -1,4 +1,4 @@
-054d7e903fb4d9e68700c56b93d45916 *tests/data/fate/vsynth1-v210.avi
+a38de84bfef53ed69204480c979dd6c4 *tests/data/fate/vsynth1-v210.avi
 14752448 tests/data/fate/vsynth1-v210.avi
-50973792d3f1abe04a51ee0121f077f2 *tests/data/fate/vsynth1-v210.out.rawvideo
-stddev:    1.85 PSNR: 42.78 MAXDIFF:   29 bytes:  7603200/  7603200
+2ba7f4ca302f3c4147860b9dfb12b6e4 *tests/data/fate/vsynth1-v210.out.rawvideo
+stddev:    1.84 PSNR: 42.81 MAXDIFF:   29 bytes:  7603200/  7603200
diff --git a/tests/ref/vsynth/vsynth2-v210 b/tests/ref/vsynth/vsynth2-v210
index 5e934de41b..245e2822a9 100644
--- a/tests/ref/vsynth/vsynth2-v210
+++ b/tests/ref/vsynth/vsynth2-v210
@@ -1,4 +1,4 @@
-87bb634932b3f5cacd4d08142798db17 *tests/data/fate/vsynth2-v210.avi
+3aae4d5b25a4d91e4812a297c6a9ef8a *tests/data/fate/vsynth2-v210.avi
 14752448 tests/data/fate/vsynth2-v210.avi
-8bb1c449e1a2a94fd0d98841c04246bb *tests/data/fate/vsynth2-v210.out.rawvideo
-stddev:    0.39 PSNR: 56.17 MAXDIFF:    9 bytes:  7603200/  7603200
+99e367a50da75c2c187230889bee8e2e *tests/data/fate/vsynth2-v210.out.rawvideo
+stddev:    0.40 PSNR: 56.06 MAXDIFF:    9 bytes:  7603200/  7603200