Add gradfun filter, ported from MPlayer.

Patch by Nolan L nol888 <=> gmail >=< com. See thread: Subject: [FFmpeg-devel] [PATCH] Port gradfun to libavfilter (GCI) Date: Mon, 29 Nov 2010 07:18:14 -0500 Originally committed as revision 25942 to svn://svn.ffmpeg.org/ffmpeg/trunk
2010-12-12 17:59:10 +00:00 · 2010-12-12 17:59:10 +00:00 · d5f187fd33
parent 9d845ca40c
commit d5f187fd33
9 changed files with 498 additions and 2 deletions
--- a/1
+++ b/1
@ -64,6 +64,7 @@ version <next>:
 - hqdn3d filter added
 - RTP depacketization of QCELP
 - FLAC parser added
+- gradfun filter added


 version 0.6:
--- a/doc/filters.texi
+++ b/doc/filters.texi
@ -425,6 +425,35 @@ frei0r=perspective:0.2/0.2:0.8/0.2
 For more information see:
@url{http://piksel.org/frei0r}

+@section gradfun
+
+Fix the banding artifacts that are sometimes introduced into nearly flat
+regions by truncation to 8-bit color depth.
+Interpolate the gradients that should go where the bands are, and
+dither them.
+
+The filter takes two optional parameters, separated by ':':
+@var{strength}:@var{radius}
+
+@var{strength} is the maximum amount by which the filter will change
+any one pixel. It is also the threshold for detecting nearly flat
+regions. Acceptable values range from 0.51 to 255, the default value is
+1.2; out-of-range values will be clipped to the valid range.
+
+@var{radius} is the neighborhood to fit the gradient to. A larger
+radius makes for smoother gradients, but also prevents the filter from
+modifying the pixels near detailed regions. Acceptable values are
+4-32, the default value is 16; odd values are rounded up to the next
+even value and out-of-range values will be clipped to the valid range.
+
+@example
+# default parameters
+gradfun=1.2:16
+
+# omitting radius
+gradfun=1.2
+@end example
+
@section hflip

 Flip the input video horizontally.
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@ -26,6 +26,7 @@ OBJS-$(CONFIG_DRAWBOX_FILTER)                += vf_drawbox.o
 OBJS-$(CONFIG_FIFO_FILTER)                   += vf_fifo.o
 OBJS-$(CONFIG_FORMAT_FILTER)                 += vf_format.o
 OBJS-$(CONFIG_FREI0R_FILTER)                 += vf_frei0r.o
+OBJS-$(CONFIG_GRADFUN_FILTER)                += vf_gradfun.o
 OBJS-$(CONFIG_HFLIP_FILTER)                  += vf_hflip.o
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += vf_hqdn3d.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@ -47,6 +47,7 @@ void avfilter_register_all(void)
    REGISTER_FILTER (FIFO,        fifo,        vf);
    REGISTER_FILTER (FORMAT,      format,      vf);
    REGISTER_FILTER (FREI0R,      frei0r,      vf);
+    REGISTER_FILTER (GRADFUN,     gradfun,     vf);
    REGISTER_FILTER (HFLIP,       hflip,       vf);
    REGISTER_FILTER (HQDN3D,      hqdn3d,      vf);
    REGISTER_FILTER (NOFORMAT,    noformat,    vf);
--- a/libavfilter/avfilter.h
+++ b/libavfilter/avfilter.h
@ -27,8 +27,8 @@
 #include "libavcore/samplefmt.h"

 #define LIBAVFILTER_VERSION_MAJOR  1
-#define LIBAVFILTER_VERSION_MINOR 68
-#define LIBAVFILTER_VERSION_MICRO  1
+#define LIBAVFILTER_VERSION_MINOR 69
+#define LIBAVFILTER_VERSION_MICRO  0

 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
                                               LIBAVFILTER_VERSION_MINOR, \
--- a/libavfilter/gradfun.h
+++ b/libavfilter/gradfun.h
@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
+ * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_GRADFUN_H
+#define AVFILTER_GRADFUN_H
+
+#include "avfilter.h"
+
+/// Holds instance-specific information for gradfun.
+typedef struct {
+    int thresh;    ///< threshold for gradient algorithm, stored by init() as (1 << 15) / strength
+    int radius;    ///< blur radius used for the first (luma) plane
+    int chroma_w;  ///< width of the chroma planes
+    int chroma_h;  ///< height of the chroma planes
+    int chroma_r;  ///< blur radius for the chroma planes
+    uint16_t *buf; ///< scratch buffer for the blur, allocated in config_input() and freed in uninit()
+    /// DSP functions.
+    /// filter_line: turn one line of src into dst using the blurred values in dc and a row of dither offsets.
+    void (*filter_line) (uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+    /// blur_line: accumulate two source lines (src and src + src_linesize) into buf and write the per-column result to dc.
+    void (*blur_line) (uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
+} GradFunContext;
+
+/* Portable C implementations (defined in vf_gradfun.c); init() installs them as the defaults. */
+void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
+
+/* x86 inline-assembly implementations of filter_line (defined in x86/gradfun.c). */
+void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
+
+/* x86 inline-assembly implementation of blur_line (defined in x86/gradfun.c). */
+void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
+
+#endif /* AVFILTER_GRADFUN_H */
--- a/libavfilter/vf_gradfun.c
+++ b/libavfilter/vf_gradfun.c
@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
+ * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * gradfun debanding filter, ported from MPlayer
+ * libmpcodecs/vf_gradfun.c
+ *
+ * Apply a boxblur debanding algorithm (based on the gradfun2db
+ * Avisynth filter by prunedtree).
+ * For each pixel, if it's within threshold of the blurred value, make it closer.
+ * So now we have a smoothed and higher bitdepth version of all the shallow
+ * gradients, while leaving detailed areas untouched.
+ * Dither it back to 8bit.
+ */
+
+#include "libavcore/imgutils.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+#include "avfilter.h"
+#include "gradfun.h"
+
+/*
+ * 8x8 table of dither offsets added to the 7-bit fixed-point pixel value
+ * before it is shifted back down to 8 bits. filter() selects the row with
+ * (y & 7); the C filter_line selects the column with (x & 7). All entries are
+ * below 0x80, i.e. less than one output step.
+ */
+DECLARE_ALIGNED(16, static const uint16_t, dither)[8][8] = {
+    {0x00,0x60,0x18,0x78,0x06,0x66,0x1E,0x7E},
+    {0x40,0x20,0x58,0x38,0x46,0x26,0x5E,0x3E},
+    {0x10,0x70,0x08,0x68,0x16,0x76,0x0E,0x6E},
+    {0x50,0x30,0x48,0x28,0x56,0x36,0x4E,0x2E},
+    {0x04,0x64,0x1C,0x7C,0x02,0x62,0x1A,0x7A},
+    {0x44,0x24,0x5C,0x3C,0x42,0x22,0x5A,0x3A},
+    {0x14,0x74,0x0C,0x6C,0x12,0x72,0x0A,0x6A},
+    {0x54,0x34,0x4C,0x2C,0x52,0x32,0x4A,0x2A},
+};
+
+/**
+ * C implementation of the per-line filter step.
+ *
+ * Each source pixel is scaled up by 7 bits, pulled towards the blurred value
+ * from dc by an amount that shrinks as the difference grows, offset by a
+ * dither value and scaled back to 8 bits with clipping.
+ *
+ * @param dst     output line
+ * @param src     input line
+ * @param dc      blurred values; pixel i reads entry (i + 1) / 2
+ * @param width   number of pixels to process
+ * @param thresh  fixed-point threshold factor
+ * @param dithers eight dither offsets, indexed with (i & 7)
+ */
+void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{
+    int i;
+
+    for (i = 0; i < width; i++) {
+        /* one dc entry serves two neighbouring pixels */
+        const int blurred = dc[(i + 1) >> 1];
+        int value  = src[i] << 7;
+        int diff   = blurred - value;
+        int weight = abs(diff) * thresh >> 16;
+
+        /* the correction fades out as the difference approaches the threshold */
+        weight = FFMAX(0, 127 - weight);
+        weight = weight * weight * diff >> 14;
+
+        value += weight + dithers[i & 7];
+        dst[i] = av_clip_uint8(value >> 7);
+    }
+}
+
+/**
+ * C implementation of the per-line blur step.
+ *
+ * For every output column the 2x2 block of source pixels (two columns on the
+ * line at src and on the line at src + src_linesize) is added to the matching
+ * entry of buf1. The sum replaces the entry in buf, and the difference between
+ * the new sum and the value previously held in buf is written to dc.
+ *
+ * @param dc           receives new sum minus the old buf entry
+ * @param buf          accumulator line that is read and then overwritten
+ * @param buf1         accumulator line that is added to the 2x2 block
+ * @param src          first of the two source lines
+ * @param src_linesize byte distance from src to the second source line
+ * @param width        number of output columns (each covers two source pixels)
+ */
+void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++) {
+        const uint8_t *upper = src + 2 * i;
+        int sum  = buf1[i] + upper[0] + upper[1] + upper[src_linesize] + upper[1 + src_linesize];
+        int prev = buf[i];
+
+        buf[i] = sum;
+        dc[i]  = sum - prev;
+    }
+}
+
+/**
+ * Filter one plane.
+ *
+ * The scratch buffer ctx->buf is split into a line of blurred values (dc) and
+ * r accumulator lines (buf) that blur_line cycles through. Output rows are
+ * produced two at a time; each pair shares the same dc line.
+ *
+ * @param ctx          filter context (provides buf, thresh and the DSP functions)
+ * @param dst          destination plane
+ * @param src          source plane
+ * @param width        plane width in pixels
+ * @param height       plane height in pixels
+ * @param dst_linesize byte stride of dst
+ * @param src_linesize byte stride of src
+ * @param r            blur radius for this plane
+ *
+ * NOTE(review): the only caller, end_frame(), invokes this when
+ * FFMIN(width, height) > 2 * r; the priming loop below reads source rows
+ * 0 .. 2 * r - 1 and appears to rely on that.
+ */
+static void filter(GradFunContext *ctx, uint8_t *dst, uint8_t *src, int width, int height, int dst_linesize, int src_linesize, int r)
+{
+    int bstride = FFALIGN(width, 16) / 2;       // length, in uint16_t, of one accumulator line
+    int y;
+    uint32_t dc_factor = (1 << 21) / (r * r);   // scale applied to the summed values, followed by >> 16
+    uint16_t *dc = ctx->buf + 16;               // 16 entries of headroom so dc[-r / 2 .. -1] can be written
+    uint16_t *buf = ctx->buf + bstride + 32;    // accumulator lines start after the dc line
+    int thresh = ctx->thresh;
+
+    // clear the dc line and the gap before buf; buf + (0 - 1) * bstride below points into this zeroed area
+    memset(dc, 0, (bstride + 16) * sizeof(*buf));
+    // prime the accumulator lines with the first r pairs of source rows
+    for (y = 0; y < r; y++)
+        ctx->blur_line(dc, buf + y * bstride, buf + (y - 1) * bstride, src + 2 * y * src_linesize, src_linesize, width / 2);
+    // here y == r; every iteration below emits up to two output rows
+    for (;;) {
+        if (y < height - r) {
+            // accumulator lines are reused cyclically: buf0 is overwritten, buf1 is the line before it
+            int mod = ((y + r) / 2) % r;
+            uint16_t *buf0 = buf + mod * bstride;
+            uint16_t *buf1 = buf + (mod ? mod - 1 : r - 1) * bstride;
+            int x, v;
+            ctx->blur_line(dc, buf0, buf1, src + (y + r) * src_linesize, src_linesize, width / 2);
+            // horizontal running sum over r entries of dc, scaled and stored r entries to the left
+            for (x = v = 0; x < r; x++)
+                v += dc[x];
+            for (; x < width / 2; x++) {
+                v += dc[x] - dc[x-r];
+                dc[x-r] = v * dc_factor >> 16;
+            }
+            // right edge: repeat the last computed value
+            for (; x < (width + r + 1) / 2; x++)
+                dc[x-r] = v * dc_factor >> 16;
+            // left edge: replicate dc[0] into the r / 2 entries before it
+            for (x = -r / 2; x < 0; x++)
+                dc[x] = dc[0];
+        }
+        // first pass only: rows 0 .. r - 1 are all produced from the first dc line
+        if (y == r) {
+            for (y = 0; y < r; y++)
+                ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
+        }
+        ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
+        if (++y >= height) break;
+        ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
+        if (++y >= height) break;
+    }
+}
+
+/**
+ * Parse the "strength:radius" argument string and select the DSP functions.
+ *
+ * Missing values keep their defaults (1.2 and 16). The strength is clipped to
+ * [0.51, 255]; the radius is rounded up to an even number and clipped to
+ * [4, 32].
+ *
+ * @return 0 (always)
+ */
+static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
+{
+    av_unused int cpu_flags = av_get_cpu_flags();
+    GradFunContext *s = ctx->priv;
+    float strength = 1.2;
+    int r = 16;
+
+    if (args)
+        sscanf(args, "%f:%d", &strength, &r);
+
+    strength  = av_clipf(strength, 0.51, 255);
+    s->thresh = (1 << 15) / strength;
+    s->radius = av_clip((r + 1) & ~1, 4, 32);
+
+    /* start from the C versions, then let CPU-specific versions override them */
+    s->filter_line = ff_gradfun_filter_line_c;
+    s->blur_line   = ff_gradfun_blur_line_c;
+
+    if (HAVE_SSE && (cpu_flags & AV_CPU_FLAG_SSE2))
+        s->blur_line = ff_gradfun_blur_line_sse2;
+
+    /* checked in this order so that SSSE3 wins over MMX2 when both are available */
+    if (HAVE_MMX && (cpu_flags & AV_CPU_FLAG_MMX2))
+        s->filter_line = ff_gradfun_filter_line_mmx2;
+    if (HAVE_SSSE3 && (cpu_flags & AV_CPU_FLAG_SSSE3))
+        s->filter_line = ff_gradfun_filter_line_ssse3;
+
+    av_log(ctx, AV_LOG_INFO, "threshold:%.2f radius:%d\n", strength, s->radius);
+
+    return 0;
+}
+
+/** Release the scratch buffer owned by the filter instance. */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    GradFunContext *s = ctx->priv;
+
+    av_freep(&s->buf);
+}
+
+/**
+ * Advertise the pixel formats the filter can process.
+ *
+ * Only 8-bit gray and planar YUV layouts are listed. end_frame() treats every
+ * plane after the first as a chroma plane that is chroma_w bytes wide and
+ * holds a single component. That does not hold for the interleaved CbCr plane
+ * of NV12/NV21 (twice as many bytes per row, two components alternating), so
+ * those two formats are intentionally not offered.
+ *
+ * @return 0 (always)
+ */
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum PixelFormat pix_fmts[] = {
+        PIX_FMT_YUV410P,            PIX_FMT_YUV420P,
+        PIX_FMT_GRAY8,              PIX_FMT_YUV444P,
+        PIX_FMT_YUV422P,            PIX_FMT_YUV411P,
+        PIX_FMT_NONE
+    };
+
+    avfilter_set_common_formats(ctx, avfilter_make_format_list(pix_fmts));
+
+    return 0;
+}
+
+/**
+ * Size the scratch buffer and derive the chroma geometry for the negotiated
+ * input format.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if the scratch buffer cannot be
+ *         allocated
+ */
+static int config_input(AVFilterLink *inlink)
+{
+    GradFunContext *gf = inlink->dst->priv;
+    int hsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_w;
+    int vsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_h;
+
+    /* drop the buffer of a previous configuration so that configuring the
+     * link again does not leak it */
+    av_freep(&gf->buf);
+    gf->buf = av_mallocz((FFALIGN(inlink->w, 16) * (gf->radius + 1) / 2 + 32) * sizeof(uint16_t));
+    if (!gf->buf)
+        return AVERROR(ENOMEM);
+
+    /* -((-x) >> s) rounds the subsampled dimension up instead of down */
+    gf->chroma_w = -((-inlink->w) >> hsub);
+    gf->chroma_h = -((-inlink->h) >> vsub);
+    /* average of the horizontally and vertically subsampled radius, rounded up to even, kept in [4, 32] */
+    gf->chroma_r = av_clip(((((gf->radius >> hsub) + (gf->radius >> vsub)) / 2 ) + 1) & ~1, 4, 32);
+
+    return 0;
+}
+
+/**
+ * Choose the output buffer for the incoming frame and forward start_frame.
+ *
+ * The input buffer is reused as the output only when it is writable and the
+ * sender did not ask for its contents to be preserved. The input pad requires
+ * AV_PERM_READ only, so a buffer without AV_PERM_WRITE can arrive here; it
+ * must not be written to, and a fresh output buffer is requested instead.
+ */
+static void start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref)
+{
+    AVFilterLink *outlink = inlink->dst->outputs[0];
+    AVFilterBufferRef *outpicref;
+
+    if (!(inpicref->perms & AV_PERM_WRITE) || (inpicref->perms & AV_PERM_PRESERVE)) {
+        outpicref = avfilter_get_video_buffer(outlink, AV_PERM_WRITE, outlink->w, outlink->h);
+        avfilter_copy_buffer_ref_props(outpicref, inpicref);
+        outpicref->video->w = outlink->w;
+        outpicref->video->h = outlink->h;
+    } else
+        outpicref = inpicref;
+
+    outlink->out_buf = outpicref;
+    avfilter_start_frame(outlink, avfilter_ref_buffer(outpicref, ~0));
+}
+
+/* Slice callback that deliberately does nothing: end_frame() filters the whole
+ * frame and sends it downstream as a single slice. */
+static void null_draw_slice(AVFilterLink *link, int y, int h, int slice_dir)
+{
+}
+
+/**
+ * Filter every plane of the current frame, pass the result downstream and
+ * release the buffer references held by this filter.
+ *
+ * Planes that are too small for the blur radius are passed through unchanged
+ * (copied when input and output are different buffers).
+ */
+static void end_frame(AVFilterLink *inlink)
+{
+    GradFunContext *gf = inlink->dst->priv;
+    AVFilterBufferRef *inpic = inlink->cur_buf;
+    AVFilterLink *outlink = inlink->dst->outputs[0];
+    AVFilterBufferRef *outpic = outlink->out_buf;
+    int p;
+
+    for (p = 0; p < 4 && inpic->data[p]; p++) {
+        int w = inlink->w;
+        int h = inlink->h;
+        int r = gf->radius;
+        if (p) {
+            /* every plane after the first uses the chroma geometry */
+            w = gf->chroma_w;
+            h = gf->chroma_h;
+            r = gf->chroma_r;
+        }
+
+        if (FFMIN(w, h) > 2 * r)
+            filter(gf, outpic->data[p], inpic->data[p], w, h, outpic->linesize[p], inpic->linesize[p], r);
+        else if (outpic->data[p] != inpic->data[p])
+            av_image_copy_plane(outpic->data[p], outpic->linesize[p], inpic->data[p], inpic->linesize[p], w, h);
+    }
+
+    avfilter_draw_slice(outlink, 0, inlink->h, 1);
+    avfilter_end_frame(outlink);
+    avfilter_unref_buffer(inpic);
+    /* when start_frame() chose in-place filtering, outpic is the very same
+     * reference as inpic and has just been released; releasing it a second
+     * time would be a double free */
+    if (outpic != inpic)
+        avfilter_unref_buffer(outpic);
+}
+
+/* Filter definition: one video input, one video output. */
+AVFilter avfilter_vf_gradfun = {
+    .name          = "gradfun",
+    .description   = NULL_IF_CONFIG_SMALL("Debands video quickly using gradients."),
+    .priv_size     = sizeof(GradFunContext),
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+
+    /* The input pad does nothing per slice (null_draw_slice); the whole frame is filtered in end_frame. */
+    .inputs    = (AVFilterPad[]) {{ .name             = "default",
+                                    .type             = AVMEDIA_TYPE_VIDEO,
+                                    .config_props     = config_input,
+                                    .start_frame      = start_frame,
+                                    .draw_slice       = null_draw_slice,
+                                    .end_frame        = end_frame,
+                                    .min_perms        = AV_PERM_READ, },
+                                  { .name = NULL}},
+    .outputs   = (AVFilterPad[]) {{ .name             = "default",
+                                    .type             = AVMEDIA_TYPE_VIDEO, },
+                                  { .name = NULL}},
+};
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@ -1 +1,2 @@
 MMX-OBJS-$(CONFIG_YADIF_FILTER)              += x86/yadif.o
+MMX-OBJS-$(CONFIG_GRADFUN_FILTER)            += x86/gradfun.o
--- a/libavfilter/x86/gradfun.c
+++ b/libavfilter/x86/gradfun.c
@ -0,0 +1,162 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86_cpu.h"
+#include "libavfilter/gradfun.h"
+
+/* Eight 16-bit words of 0x7F (127); the filter_line kernels subtract it from m. */
+DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
+/* Eight 16-bit words of 0x00FF; the blur kernel uses it as a mask that keeps the low byte of every word. */
+DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
+
+/**
+ * MMX2 implementation of filter_line; same parameters as
+ * ff_gradfun_filter_line_c().
+ *
+ * Four pixels are handled per loop iteration. When width is not a multiple of
+ * four, the remaining pixels at the end of the line are first handed to the C
+ * function and width is rounded down.
+ *
+ * NOTE(review): the loop tests its condition after the body, so the body runs
+ * at least once even when the rounded width is 0; callers presumably always
+ * pass width >= 4 - confirm.
+ *
+ * The body is empty when HAVE_MMX is not set.
+ */
+void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{
+#if HAVE_MMX
+    intptr_t x;
+    if (width & 3) {
+        x = width & ~3;
+        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+        width = x;
+    }
+    // x counts up from -width to 0; the pointer operands are pre-advanced by width to compensate
+    x = -width;
+    __asm__ volatile(
+        "movd          %4, %%mm5 \n"
+        "pxor       %%mm7, %%mm7 \n"
+        "pshufw $0, %%mm5, %%mm5 \n" // mm5 = thresh in all four words
+        "movq          %6, %%mm6 \n" // mm6 = pw_7f
+        "movq          %5, %%mm4 \n" // mm4 = first four dither values
+        "1: \n"
+        "movd     (%2,%0), %%mm0 \n" // four source pixels
+        "movd     (%3,%0), %%mm1 \n" // two dc entries
+        "punpcklbw  %%mm7, %%mm0 \n"
+        "punpcklwd  %%mm1, %%mm1 \n" // each dc entry duplicated for two pixels
+        "psllw         $7, %%mm0 \n"
+        "pxor       %%mm2, %%mm2 \n"
+        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
+        "psubw      %%mm1, %%mm2 \n"
+        "pmaxsw     %%mm1, %%mm2 \n"
+        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
+        "psubw      %%mm6, %%mm2 \n"
+        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
+        "pmullw     %%mm2, %%mm2 \n"
+        "paddw      %%mm4, %%mm0 \n" // pix += dither
+        "pmulhw     %%mm2, %%mm1 \n"
+        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
+        "paddw      %%mm1, %%mm0 \n" // pix += m
+        "psraw         $7, %%mm0 \n"
+        "packuswb   %%mm0, %%mm0 \n"
+        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
+        "add           $4, %0 \n"
+        "jl 1b \n"
+        "emms \n"
+        :"+r"(x)
+        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
+         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
+        :"memory"
+    );
+#endif
+}
+
+/**
+ * SSSE3 implementation of filter_line; same parameters as
+ * ff_gradfun_filter_line_c().
+ *
+ * Eight pixels are handled per loop iteration. When width is not a multiple
+ * of eight, the remaining pixels at the end of the line are first handed to
+ * the C function and width is rounded down.
+ *
+ * The dither row and pw_7f are loaded with movdqa, which requires 16-byte
+ * aligned addresses.
+ *
+ * NOTE(review): the loop tests its condition after the body, so the body runs
+ * at least once even when the rounded width is 0; callers presumably always
+ * pass width >= 8 - confirm.
+ *
+ * The body is empty when HAVE_SSSE3 is not set.
+ */
+void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{
+#if HAVE_SSSE3
+    intptr_t x;
+    if (width & 7) {
+        // could be 10% faster if I somehow eliminated this
+        x = width & ~7;
+        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+        width = x;
+    }
+    // x counts up from -width to 0; the pointer operands are pre-advanced by width to compensate
+    x = -width;
+    __asm__ volatile(
+        "movd           %4, %%xmm5 \n"
+        "pxor       %%xmm7, %%xmm7 \n"
+        "pshuflw $0,%%xmm5, %%xmm5 \n"
+        "movdqa         %6, %%xmm6 \n" // xmm6 = pw_7f
+        "punpcklqdq %%xmm5, %%xmm5 \n" // xmm5 = thresh in all eight words
+        "movdqa         %5, %%xmm4 \n" // xmm4 = the eight dither values
+        "1: \n"
+        "movq      (%2,%0), %%xmm0 \n" // eight source pixels
+        "movq      (%3,%0), %%xmm1 \n" // four dc entries
+        "punpcklbw  %%xmm7, %%xmm0 \n"
+        "punpcklwd  %%xmm1, %%xmm1 \n" // each dc entry duplicated for two pixels
+        "psllw          $7, %%xmm0 \n"
+        "psubw      %%xmm0, %%xmm1 \n" // delta = dc - pix
+        "pabsw      %%xmm1, %%xmm2 \n"
+        "pmulhuw    %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
+        "psubw      %%xmm6, %%xmm2 \n"
+        "pminsw     %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
+        "pmullw     %%xmm2, %%xmm2 \n"
+        "psllw          $1, %%xmm2 \n"
+        "paddw      %%xmm4, %%xmm0 \n" // pix += dither
+        "pmulhrsw   %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
+        "paddw      %%xmm1, %%xmm0 \n" // pix += m
+        "psraw          $7, %%xmm0 \n"
+        "packuswb   %%xmm0, %%xmm0 \n"
+        "movq       %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
+        "add            $8, %0 \n"
+        "jl 1b \n"
+        :"+&r"(x)
+        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
+         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
+        :"memory"
+    );
+#endif // HAVE_SSSE3
+}
+
+/**
+ * SSE2 implementation of blur_line; same parameters as
+ * ff_gradfun_blur_line_c().
+ *
+ * Each loop iteration reads 16 bytes from both source lines and produces
+ * eight 16-bit results: psrlw $8 extracts the odd bytes, the pw_ff mask the
+ * even bytes, and the four values are added to the matching buf1 entries.
+ * The sum is stored to buf and the difference to the old buf contents is
+ * stored to dc.
+ *
+ * buf and dc are accessed with movdqa and buf1 is used as a memory operand of
+ * paddw, so all three must be 16-byte aligned. The source lines are loaded
+ * with movdqu when src or src_linesize is not a multiple of 16, and with
+ * movdqa otherwise.
+ *
+ * NOTE(review): the loop advances in steps of eight results and tests its
+ * condition after the body, so it may read and write past width when width is
+ * not a multiple of eight; presumably the buffers are padded for this -
+ * confirm against the allocation in config_input().
+ *
+ * The body is empty when HAVE_SSE is not set.
+ */
+void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
+{
+#if HAVE_SSE
+/* BLURV(load): the whole kernel, parameterized by the instruction used to load the source lines. */
+#define BLURV(load)\
+    intptr_t x = -2*width;\
+    __asm__ volatile(\
+        "movdqa %6, %%xmm7 \n"\
+        "1: \n"\
+        load"   (%4,%0), %%xmm0 \n"\
+        load"   (%5,%0), %%xmm1 \n"\
+        "movdqa  %%xmm0, %%xmm2 \n"\
+        "movdqa  %%xmm1, %%xmm3 \n"\
+        "psrlw       $8, %%xmm0 \n"\
+        "psrlw       $8, %%xmm1 \n"\
+        "pand    %%xmm7, %%xmm2 \n"\
+        "pand    %%xmm7, %%xmm3 \n"\
+        "paddw   %%xmm1, %%xmm0 \n"\
+        "paddw   %%xmm3, %%xmm2 \n"\
+        "paddw   %%xmm2, %%xmm0 \n"\
+        "paddw  (%2,%0), %%xmm0 \n"\
+        "movdqa (%1,%0), %%xmm1 \n"\
+        "movdqa  %%xmm0, (%1,%0) \n"\
+        "psubw   %%xmm1, %%xmm0 \n"\
+        "movdqa  %%xmm0, (%3,%0) \n"\
+        "add        $16, %0 \n"\
+        "jl 1b \n"\
+        :"+&r"(x)\
+        :"r"(buf+width),\
+         "r"(buf1+width),\
+         "r"(dc+width),\
+         "r"(src+width*2),\
+         "r"(src+width*2+src_linesize),\
+         "m"(*pw_ff)\
+        :"memory"\
+    );
+    // pick the aligned load only when both source lines are 16-byte aligned
+    if (((intptr_t) src | src_linesize) & 15) {
+        BLURV("movdqu");
+    } else {
+        BLURV("movdqa");
+    }
+#endif // HAVE_SSE
+}