From f30ff542000490d6044df0403dd8f0aee78b0106 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Sat, 21 Jul 2012 21:17:30 +0000
Subject: [PATCH 01/22] doc: Clarify licensing issues arising from external
 libraries

---
 LICENSE | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/LICENSE b/LICENSE
index e6d25f299e..1266627f39 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,5 @@
 Libav:
-------
+======
 
 Most files in Libav are under the GNU Lesser General Public License version 2.1
 or later (LGPL v2.1+). Read the file COPYING.LGPLv2.1 for details. Some other
@@ -38,18 +38,29 @@ for you. Read the file COPYING.LGPLv3 or, if you have enabled GPL parts,
 COPYING.GPLv3 to learn the exact legal terms that apply in this case.
 
 
-external libraries:
--------------------
+external libraries
+==================
 
-Some external libraries, e.g. libx264, are under GPL and can be used in
-conjunction with Libav. They require --enable-gpl to be passed to configure
-as well.
+Libav can be combined with a number of external libraries, which sometimes
+affect the licensing of binaries resulting from the combination.
 
-The OpenCORE external libraries are under the Apache License 2.0. That license
-is incompatible with the LGPL v2.1 and the GPL v2, but not with version 3 of
-those licenses. So to combine the OpenCORE libraries with Libav, the license
-version needs to be upgraded by passing --enable-version3 to configure.
+compatible libraries
+--------------------
 
-The nonfree external library libfaac can be hooked up in Libav. You need to
-pass --enable-nonfree to configure to enable it. Employ this option with care
-as Libav then becomes nonfree and unredistributable.
+The libcdio, libx264, libxavs and libxvid libraries are under GPL. When
+combining them with Libav, Libav needs to be licensed as GPL as well by
+passing --enable-gpl to configure.
+
+The OpenCORE and VisualOn libraries are under the Apache License 2.0. That
+license is incompatible with the LGPL v2.1 and the GPL v2, but not with
+version 3 of those licenses. So to combine these libraries with Libav, the
+license version needs to be upgraded by passing --enable-version3 to configure.
+
+incompatible libraries
+----------------------
+
+The Fraunhofer AAC library, FAAC and OpenSSL are under licenses incompatible
+with all (L)GPL versions. Thus, unfortunately, since both licenses cannot be
+satisfied simultaneously, binaries resulting from the combination of Libav
+with these libraries are nonfree and unredistributable. If you wish to enable
+any of these libraries nonetheless, pass --enable-nonfree to configure.

From 3b9e832e17c3956082e589433fb0e08092cb291e Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Wed, 8 Aug 2012 00:35:43 +0200
Subject: [PATCH 02/22] x86: Drop silly "_yasm" suffixes from filenames

---
 libavcodec/x86/Makefile                                | 6 +++---
 libavcodec/x86/{dsputil_yasm.asm => dsputil.asm}       | 0
 libavcodec/x86/{dsputilenc_yasm.asm => dsputilenc.asm} | 0
 libavcodec/x86/{vc1dsp_yasm.asm => vc1dsp.asm}         | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename libavcodec/x86/{dsputil_yasm.asm => dsputil.asm} (100%)
 rename libavcodec/x86/{dsputilenc_yasm.asm => dsputilenc.asm} (100%)
 rename libavcodec/x86/{vc1dsp_yasm.asm => vc1dsp.asm} (100%)

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index eb82ef572e..57e73d8b2f 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -36,7 +36,7 @@ MMX-OBJS-$(CONFIG_VP8_DECODER)         += x86/vp8dsp-init.o
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32_sse.o
-YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc_yasm.o
+YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft_mmx.o                 \
                                           $(YASM-OBJS-FFT-yes)
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
@@ -56,11 +56,11 @@ YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o                 \
                                           x86/rv40dsp.o
-YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp_yasm.o
+YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp56dsp.o
 YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
 
-YASM-OBJS                              += x86/dsputil_yasm.o            \
+YASM-OBJS                              += x86/dsputil.o                 \
                                           x86/deinterlace.o             \
                                           x86/fmtconvert.o              \
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil.asm
similarity index 100%
rename from libavcodec/x86/dsputil_yasm.asm
rename to libavcodec/x86/dsputil.asm
diff --git a/libavcodec/x86/dsputilenc_yasm.asm b/libavcodec/x86/dsputilenc.asm
similarity index 100%
rename from libavcodec/x86/dsputilenc_yasm.asm
rename to libavcodec/x86/dsputilenc.asm
diff --git a/libavcodec/x86/vc1dsp_yasm.asm b/libavcodec/x86/vc1dsp.asm
similarity index 100%
rename from libavcodec/x86/vc1dsp_yasm.asm
rename to libavcodec/x86/vc1dsp.asm

From f4bb38cc26a4c91707194ea0b2290a028c546f7d Mon Sep 17 00:00:00 2001
From: Derek Buitenhuis <derek.buitenhuis@gmail.com>
Date: Fri, 10 Aug 2012 16:05:47 +0000
Subject: [PATCH 03/22] cllc: Rename some funcs to represent what they actually
 do

This is in preparation for adding support for other colorspaces
and coding types.

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
---
 libavcodec/cllc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c
index d23da181ba..703f546ea9 100644
--- a/libavcodec/cllc.c
+++ b/libavcodec/cllc.c
@@ -74,8 +74,8 @@ static int read_code_table(CLLCContext *ctx, GetBitContext *gb, VLC *vlc)
                               codes, 2, 2, symbols, 1, 1, 0);
 }
 
-static int read_line(CLLCContext *ctx, GetBitContext *gb, int *top_left,
-                     VLC *vlc, uint8_t *outbuf)
+static int read_rgb24_component_line(CLLCContext *ctx, GetBitContext *gb,
+                                     int *top_left, VLC *vlc, uint8_t *outbuf)
 {
     uint8_t *dst;
     int pred, code;
@@ -104,7 +104,7 @@ static int read_line(CLLCContext *ctx, GetBitContext *gb, int *top_left,
     return 0;
 }
 
-static int decode_bgr24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic)
+static int decode_rgb24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic)
 {
     AVCodecContext *avctx = ctx->avctx;
     uint8_t *dst;
@@ -137,7 +137,7 @@ static int decode_bgr24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic)
     /* Read in and restore every line */
     for (i = 0; i < avctx->height; i++) {
         for (j = 0; j < 3; j++)
-            read_line(ctx, gb, &pred[j], &vlc[j], &dst[j]);
+            read_rgb24_component_line(ctx, gb, &pred[j], &vlc[j], &dst[j]);
 
         dst += pic->linesize[0];
     }
@@ -219,7 +219,7 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
             return ret;
         }
 
-        ret = decode_bgr24_frame(ctx, &gb, pic);
+        ret = decode_rgb24_frame(ctx, &gb, pic);
         if (ret < 0)
             return ret;
 

From 7fda47d53b5489d6140b9f490bfc9c531521d46b Mon Sep 17 00:00:00 2001
From: Derek Buitenhuis <derek.buitenhuis@gmail.com>
Date: Fri, 10 Aug 2012 17:19:03 +0000
Subject: [PATCH 04/22] cllc: Add support for QRGB

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
---
 libavcodec/cllc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c
index 703f546ea9..68668ef39d 100644
--- a/libavcodec/cllc.c
+++ b/libavcodec/cllc.c
@@ -210,6 +210,7 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
 
     switch (coding_type) {
     case 1:
+    case 2:
         avctx->pix_fmt             = PIX_FMT_RGB24;
         avctx->bits_per_raw_sample = 8;
 

From 17c11cef9f99d31e31dd9176ee2f26bdf6e5d351 Mon Sep 17 00:00:00 2001
From: Derek Buitenhuis <derek.buitenhuis@gmail.com>
Date: Fri, 10 Aug 2012 16:05:49 +0000
Subject: [PATCH 05/22] cllc: Implement ARGB support

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
---
 libavcodec/cllc.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c
index 68668ef39d..a1514179b7 100644
--- a/libavcodec/cllc.c
+++ b/libavcodec/cllc.c
@@ -74,6 +74,80 @@ static int read_code_table(CLLCContext *ctx, GetBitContext *gb, VLC *vlc)
                               codes, 2, 2, symbols, 1, 1, 0);
 }
 
+/*
+ * Unlike the RGB24 read/restore, which reads in a component at a time,
+ * ARGB read/restore reads in ARGB quads.
+ */
+static int read_argb_line(CLLCContext *ctx, GetBitContext *gb, int *top_left,
+                          VLC *vlc, uint8_t *outbuf)
+{
+    uint8_t *dst;
+    int pred[4];
+    int code;
+    int i;
+
+    OPEN_READER(bits, gb);
+
+    dst     = outbuf;
+    pred[0] = top_left[0];
+    pred[1] = top_left[1];
+    pred[2] = top_left[2];
+    pred[3] = top_left[3];
+
+    for (i = 0; i < ctx->avctx->width; i++) {
+        /* Always get the alpha component */
+        UPDATE_CACHE(bits, gb);
+        GET_VLC(code, bits, gb, vlc[0].table, 7, 2);
+
+        pred[0] += code;
+        dst[0]   = pred[0];
+
+        /* Skip the components if they are entirely transparent */
+        if (dst[0]) {
+            /* Red */
+            UPDATE_CACHE(bits, gb);
+            GET_VLC(code, bits, gb, vlc[1].table, 7, 2);
+
+            pred[1] += code;
+            dst[1]   = pred[1];
+
+            /* Green */
+            UPDATE_CACHE(bits, gb);
+            GET_VLC(code, bits, gb, vlc[2].table, 7, 2);
+
+            pred[2] += code;
+            dst[2]   = pred[2];
+
+            /* Blue */
+            UPDATE_CACHE(bits, gb);
+            GET_VLC(code, bits, gb, vlc[3].table, 7, 2);
+
+            pred[3] += code;
+            dst[3]   = pred[3];
+        } else {
+            dst[1] = 0;
+            dst[2] = 0;
+            dst[3] = 0;
+        }
+
+        dst += 4;
+    }
+
+    CLOSE_READER(bits, gb);
+
+    dst         -= 4 * ctx->avctx->width;
+    top_left[0]  = dst[0];
+
+    /* Only stash components if they are not transparent */
+    if (top_left[0]) {
+        top_left[1] = dst[1];
+        top_left[2] = dst[2];
+        top_left[3] = dst[3];
+    }
+
+    return 0;
+}
+
 static int read_rgb24_component_line(CLLCContext *ctx, GetBitContext *gb,
                                      int *top_left, VLC *vlc, uint8_t *outbuf)
 {
@@ -104,6 +178,50 @@ static int read_rgb24_component_line(CLLCContext *ctx, GetBitContext *gb,
     return 0;
 }
 
+static int decode_argb_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic)
+{
+    AVCodecContext *avctx = ctx->avctx;
+    uint8_t *dst;
+    int pred[4];
+    int ret;
+    int i, j;
+    VLC vlc[4];
+
+    pred[0] = 0;
+    pred[1] = 0x80;
+    pred[2] = 0x80;
+    pred[3] = 0x80;
+
+    dst = pic->data[0];
+
+    skip_bits(gb, 16);
+
+    /* Read in code table for each plane */
+    for (i = 0; i < 4; i++) {
+        ret = read_code_table(ctx, gb, &vlc[i]);
+        if (ret < 0) {
+            for (j = 0; j <= i; j++)
+                ff_free_vlc(&vlc[j]);
+
+            av_log(ctx->avctx, AV_LOG_ERROR,
+                   "Could not read code table %d.\n", i);
+            return ret;
+        }
+    }
+
+    /* Read in and restore every line */
+    for (i = 0; i < avctx->height; i++) {
+        read_argb_line(ctx, gb, pred, vlc, dst);
+
+        dst += pic->linesize[0];
+    }
+
+    for (i = 0; i < 4; i++)
+        ff_free_vlc(&vlc[i]);
+
+    return 0;
+}
+
 static int decode_rgb24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic)
 {
     AVCodecContext *avctx = ctx->avctx;
@@ -224,6 +342,21 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         if (ret < 0)
             return ret;
 
+        break;
+    case 3:
+        avctx->pix_fmt             = PIX_FMT_ARGB;
+        avctx->bits_per_raw_sample = 8;
+
+        ret = avctx->get_buffer(avctx, pic);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+            return ret;
+        }
+
+        ret = decode_argb_frame(ctx, &gb, pic);
+        if (ret < 0)
+            return ret;
+
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unknown coding type: %d.\n", coding_type);

From 6c4975eaafd7f8f91e81ad8d6be744a434241fd3 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 01:15:19 +0100
Subject: [PATCH 06/22] libavutil: add saturating addition functions

Fixed-point audio codecs often use saturating arithmetic, and
special instructions for these operations are common.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavutil/arm/intmath.h | 15 +++++++++++++++
 libavutil/common.h      | 30 ++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/libavutil/arm/intmath.h b/libavutil/arm/intmath.h
index ce73404b37..d5a343c95f 100644
--- a/libavutil/arm/intmath.h
+++ b/libavutil/arm/intmath.h
@@ -83,6 +83,21 @@ static av_always_inline av_const unsigned av_clip_uintp2_arm(int a, int p)
     return x;
 }
 
+#define av_sat_add32 av_sat_add32_arm
+static av_always_inline int av_sat_add32_arm(int a, int b)
+{
+    int r;
+    __asm__ ("qadd %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+    return r;
+}
+
+#define av_sat_dadd32 av_sat_dadd32_arm
+static av_always_inline int av_sat_dadd32_arm(int a, int b)
+{
+    int r;
+    __asm__ ("qdadd %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+    return r;
+}
 
 #else /* HAVE_ARMV6 */
 
diff --git a/libavutil/common.h b/libavutil/common.h
index c99d858472..433e8e490a 100644
--- a/libavutil/common.h
+++ b/libavutil/common.h
@@ -181,6 +181,30 @@ static av_always_inline av_const unsigned av_clip_uintp2_c(int a, int p)
     else                   return  a;
 }
 
+/**
+ * Add two signed 32-bit values with saturation.
+ *
+ * @param  a one value
+ * @param  b another value
+ * @return sum with signed saturation
+ */
+static av_always_inline int av_sat_add32_c(int a, int b)
+{
+    return av_clipl_int32((int64_t)a + b);
+}
+
+/**
+ * Add a doubled value to another value with saturation at both stages.
+ *
+ * @param  a first value
+ * @param  b value doubled and added to a
+ * @return sum with signed saturation
+ */
+static av_always_inline int av_sat_dadd32_c(int a, int b)
+{
+    return av_sat_add32(a, av_sat_add32(b, b));
+}
+
 /**
  * Clip a float value into the amin-amax range.
  * @param a value to clip
@@ -387,6 +411,12 @@ static av_always_inline av_const int av_popcount64_c(uint64_t x)
 #ifndef av_clip_uintp2
 #   define av_clip_uintp2   av_clip_uintp2_c
 #endif
+#ifndef av_sat_add32
+#   define av_sat_add32     av_sat_add32_c
+#endif
+#ifndef av_sat_dadd32
+#   define av_sat_dadd32    av_sat_dadd32_c
+#endif
 #ifndef av_clipf
 #   define av_clipf         av_clipf_c
 #endif

From fddc5b9bea39968ed1f45c667869428865de7626 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 04:18:53 +0100
Subject: [PATCH 07/22] celp: optimise ff_celp_lp_synthesis_filter()

Adding instead of subtracting the products in the loop allows the
compiler to generate more efficient multiply-accumulate instructions
when 16-bit multiply-subtract is not available. ARM has only
multiply-accumulate for 16-bit operands.  In general, if only one
variant exists, it is usually accumulate rather than subtract.

In the same spirit, using the dedicated saturation function enables
use of any special optimised versions of this.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/celp_filters.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/libavcodec/celp_filters.c b/libavcodec/celp_filters.c
index 4e5bcda79a..d764d19219 100644
--- a/libavcodec/celp_filters.c
+++ b/libavcodec/celp_filters.c
@@ -63,17 +63,16 @@ int ff_celp_lp_synthesis_filter(int16_t *out, const int16_t *filter_coeffs,
     int i,n;
 
     for (n = 0; n < buffer_length; n++) {
-        int sum = rounder;
+        int sum = -rounder, sum1;
         for (i = 1; i <= filter_length; i++)
-            sum -= filter_coeffs[i-1] * out[n-i];
+            sum += filter_coeffs[i-1] * out[n-i];
 
-        sum = ((sum >> 12) + in[n]) >> shift;
+        sum1 = ((-sum >> 12) + in[n]) >> shift;
+        sum  = av_clip_int16(sum1);
+
+        if (stop_on_overflow && sum != sum1)
+            return 1;
 
-        if (sum + 0x8000 > 0xFFFFU) {
-            if (stop_on_overflow)
-                return 1;
-            sum = (sum >> 31) ^ 32767;
-        }
         out[n] = sum;
     }
 

From 8b0de73464fcb110dce2f5601e4e27b2cbd33d20 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 01:54:15 +0100
Subject: [PATCH 08/22] g723.1: deobfuscate "(x << 4) - x" to "15 * x"

The compiler performs this optimisation.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 7d8a48e18a..d4158ffef7 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -914,7 +914,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
     }
 
     for (i = 0; i < SUBFRAME_LEN; i++) {
-        p->pf_gain = ((p->pf_gain << 4) - p->pf_gain + gain + (1 << 3)) >> 4;
+        p->pf_gain = (15 * p->pf_gain + gain + (1 << 3)) >> 4;
         buf[i]     = av_clip_int16((buf[i] * (p->pf_gain + (p->pf_gain >> 4)) +
                                    (1 << 10)) >> 11);
     }

From 5a43eba956d095157359e1abf639984c8ca508e4 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Fri, 10 Aug 2012 15:41:47 +0100
Subject: [PATCH 09/22] g723.1: remove unnecessary argument 'shift' from
 dot_product()

The 'shift' argument is always 1 so there is no need to pass it
explicitly in every call.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index d4158ffef7..45808001b9 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -569,13 +569,12 @@ static void get_residual(int16_t *residual, int16_t *prev_excitation, int lag)
         residual[i] = prev_excitation[offset + (i - 2) % lag];
 }
 
-static int dot_product(const int16_t *a, const int16_t *b, int length,
-                       int shift)
+static int dot_product(const int16_t *a, const int16_t *b, int length)
 {
     int i, sum = 0;
 
     for (i = 0; i < length; i++) {
-        int64_t prod = av_clipl_int32(MUL64(a[i], b[i]) << shift);
+        int64_t prod = av_clipl_int32(MUL64(a[i], b[i]) << 1);
         sum = av_clipl_int32(sum + prod);
     }
     return sum;
@@ -606,7 +605,7 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
     /* Calculate adaptive vector */
     cb_ptr += subfrm.ad_cb_gain * 20;
     for (i = 0; i < SUBFRAME_LEN; i++) {
-        sum = dot_product(residual + i, cb_ptr, PITCH_ORDER, 1);
+        sum = dot_product(residual + i, cb_ptr, PITCH_ORDER);
         vector[i] = av_clipl_int32((sum << 1) + (1 << 15)) >> 16;
     }
 }
@@ -635,7 +634,7 @@ static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max,
         limit = pitch_lag + 3;
 
     for (i = pitch_lag - 3; i <= limit; i++) {
-        ccr = dot_product(buf, buf + dir * i, length, 1);
+        ccr = dot_product(buf, buf + dir * i, length);
 
         if (ccr > *ccr_max) {
             *ccr_max = ccr;
@@ -734,17 +733,15 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,
         return;
 
     /* Compute target energy */
-    energy[0] = dot_product(buf, buf, SUBFRAME_LEN, 1);
+    energy[0] = dot_product(buf, buf, SUBFRAME_LEN);
 
     /* Compute forward residual energy */
     if (fwd_lag)
-        energy[2] = dot_product(buf + fwd_lag, buf + fwd_lag,
-                                SUBFRAME_LEN, 1);
+        energy[2] = dot_product(buf + fwd_lag, buf + fwd_lag, SUBFRAME_LEN);
 
     /* Compute backward residual energy */
     if (back_lag)
-        energy[4] = dot_product(buf - back_lag, buf - back_lag,
-                                SUBFRAME_LEN, 1);
+        energy[4] = dot_product(buf - back_lag, buf - back_lag, SUBFRAME_LEN);
 
     /* Normalize and shorten */
     temp1 = 0;
@@ -805,15 +802,14 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
     ccr   = av_clipl_int32((int64_t)ccr + (1 << 15)) >> 16;
 
     /* Compute target energy */
-    tgt_eng  = dot_product(buf, buf, SUBFRAME_LEN * 2, 1);
+    tgt_eng  = dot_product(buf, buf, SUBFRAME_LEN * 2);
     *exc_eng = av_clipl_int32((int64_t)tgt_eng + (1 << 15)) >> 16;
 
     if (ccr <= 0)
         return 0;
 
     /* Compute best energy */
-    best_eng = dot_product(buf - index, buf - index,
-                           SUBFRAME_LEN * 2, 1);
+    best_eng = dot_product(buf - index, buf - index, SUBFRAME_LEN * 2);
     best_eng = av_clipl_int32((int64_t)best_eng + (1 << 15)) >> 16;
 
     temp = best_eng * *exc_eng >> 3;
@@ -966,8 +962,8 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
 
         /* Compute auto correlation coefficients */
         auto_corr[0] = dot_product(temp_vector, temp_vector + 1,
-                                   SUBFRAME_LEN - 1, 1);
-        auto_corr[1] = dot_product(temp_vector, temp_vector, SUBFRAME_LEN, 1);
+                                   SUBFRAME_LEN - 1);
+        auto_corr[1] = dot_product(temp_vector, temp_vector, SUBFRAME_LEN);
 
         /* Compute reflection coefficient */
         temp = auto_corr[1] >> 16;

From 1eb1f6f281eb6036d363e0317c1500be4a2708f2 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Fri, 10 Aug 2012 16:42:54 +0100
Subject: [PATCH 10/22] g723.1: remove useless uses of MUL64()

The operands in both cases are 16-bit so cannot overflow a 32-bit
destination.  In gain_scale() the inputs are reduced to 14-bit,
so even the shift cannot overflow.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 45808001b9..f91f629311 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -574,7 +574,7 @@ static int dot_product(const int16_t *a, const int16_t *b, int length)
     int i, sum = 0;
 
     for (i = 0; i < length; i++) {
-        int64_t prod = av_clipl_int32(MUL64(a[i], b[i]) << 1);
+        int64_t prod = av_clipl_int32((int64_t)(a[i] * b[i]) << 1);
         sum = av_clipl_int32(sum + prod);
     }
     return sum;
@@ -889,9 +889,9 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
     num   = energy;
     denom = 0;
     for (i = 0; i < SUBFRAME_LEN; i++) {
-        int64_t temp = buf[i] >> 2;
-        temp  = av_clipl_int32(MUL64(temp, temp) << 1);
-        denom = av_clipl_int32(denom + temp);
+        int temp = buf[i] >> 2;
+        temp *= temp;
+        denom = av_clipl_int32((int64_t)denom + (temp << 1));
     }
 
     if (num && denom) {

From 4aca716a531b0bc1f05c96209cf30577d6e48baa Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Fri, 10 Aug 2012 18:15:41 +0100
Subject: [PATCH 11/22] g723.1: optimise scale_vector()

Firstly, nothing in this function can overflow 32 bits so the use
of a 64-bit type is completely unnecessary.  Secondly, the scale
is either a power of two or 0x7fff.  Doing separate loops for these
cases avoids using multiplications.  Finally, since only the number
of bits, not the actual value, of the maximum value is needed, the
bitwise or of all the values serves the purpose while being faster.

It is worth noting that even if overflow could happen, it was not
handled correctly anyway.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index f91f629311..5be4fe046e 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -281,19 +281,21 @@ static int normalize_bits(int num, int width)
 static int scale_vector(int16_t *vector, int length)
 {
     int bits, max = 0;
-    int64_t scale;
     int i;
 
 
     for (i = 0; i < length; i++)
-        max = FFMAX(max, FFABS(vector[i]));
+        max |= FFABS(vector[i]);
 
     max   = FFMIN(max, 0x7FFF);
     bits  = normalize_bits(max, 15);
-    scale = (bits == 15) ? 0x7FFF : (1 << bits);
 
-    for (i = 0; i < length; i++)
-        vector[i] = av_clipl_int32(vector[i] * scale << 1) >> 4;
+    if (bits == 15)
+        for (i = 0; i < length; i++)
+            vector[i] = vector[i] * 0x7fff >> 3;
+    else
+        for (i = 0; i < length; i++)
+            vector[i] = vector[i] << bits >> 3;
 
     return bits - 3;
 }

From 47c73a73b0967d54dfcbc6250d861406da7b84b6 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 01:52:10 +0100
Subject: [PATCH 12/22] g723.1: use saturating addition functions

Use saturating addition functions instead of 64-bit intermediates
and separate clipping.  This is much faster when dedicated
instructions are available.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 5be4fe046e..8a394f9340 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -394,11 +394,11 @@ static void lsp2lpc(int16_t *lpc)
     for (j = 0; j < LPC_ORDER; j++) {
         int index     = lpc[j] >> 7;
         int offset    = lpc[j] & 0x7f;
-        int64_t temp1 = cos_tab[index] << 16;
+        int temp1     = cos_tab[index] << 16;
         int temp2     = (cos_tab[index + 1] - cos_tab[index]) *
                           ((offset << 8) + 0x80) << 1;
 
-        lpc[j] = -(av_clipl_int32(((temp1 + temp2) << 1) + (1 << 15)) >> 16);
+        lpc[j] = -(av_sat_dadd32(1 << 15, temp1 + temp2) >> 16);
     }
 
     /*
@@ -576,8 +576,8 @@ static int dot_product(const int16_t *a, const int16_t *b, int length)
     int i, sum = 0;
 
     for (i = 0; i < length; i++) {
-        int64_t prod = av_clipl_int32((int64_t)(a[i] * b[i]) << 1);
-        sum = av_clipl_int32(sum + prod);
+        int prod = a[i] * b[i];
+        sum = av_sat_dadd32(sum, prod);
     }
     return sum;
 }
@@ -594,7 +594,7 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
     int lag = pitch_lag + subfrm.ad_cb_lag - 1;
 
     int i;
-    int64_t sum;
+    int sum;
 
     get_residual(residual, prev_excitation, lag);
 
@@ -608,7 +608,7 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
     cb_ptr += subfrm.ad_cb_gain * 20;
     for (i = 0; i < SUBFRAME_LEN; i++) {
         sum = dot_product(residual + i, cb_ptr, PITCH_ORDER);
-        vector[i] = av_clipl_int32((sum << 1) + (1 << 15)) >> 16;
+        vector[i] = av_sat_dadd32(1 << 15, sum) >> 16;
     }
 }
 
@@ -660,7 +660,7 @@ static void comp_ppf_gains(int lag, PPFParam *ppf, enum Rate cur_rate,
                            int tgt_eng, int ccr, int res_eng)
 {
     int pf_residual;     /* square of postfiltered residual */
-    int64_t temp1, temp2;
+    int temp1, temp2;
 
     ppf->index = lag;
 
@@ -677,7 +677,7 @@ static void comp_ppf_gains(int lag, PPFParam *ppf, enum Rate cur_rate,
         /* pf_res^2 = tgt_eng + 2*ccr*gain + res_eng*gain^2 */
         temp1       = (tgt_eng << 15) + (ccr * ppf->opt_gain << 1);
         temp2       = (ppf->opt_gain * ppf->opt_gain >> 15) * res_eng;
-        pf_residual = av_clipl_int32(temp1 + temp2 + (1 << 15)) >> 16;
+        pf_residual = av_sat_add32(temp1, temp2 + (1 << 15)) >> 16;
 
         if (tgt_eng >= pf_residual << 1) {
             temp1 = 0x7fff;
@@ -801,18 +801,18 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
     /* Compute maximum backward cross-correlation */
     ccr   = 0;
     index = autocorr_max(p, offset, &ccr, pitch_lag, SUBFRAME_LEN * 2, -1);
-    ccr   = av_clipl_int32((int64_t)ccr + (1 << 15)) >> 16;
+    ccr   = av_sat_add32(ccr, 1 << 15) >> 16;
 
     /* Compute target energy */
     tgt_eng  = dot_product(buf, buf, SUBFRAME_LEN * 2);
-    *exc_eng = av_clipl_int32((int64_t)tgt_eng + (1 << 15)) >> 16;
+    *exc_eng = av_sat_add32(tgt_eng, 1 << 15) >> 16;
 
     if (ccr <= 0)
         return 0;
 
     /* Compute best energy */
     best_eng = dot_product(buf - index, buf - index, SUBFRAME_LEN * 2);
-    best_eng = av_clipl_int32((int64_t)best_eng + (1 << 15)) >> 16;
+    best_eng = av_sat_add32(best_eng, 1 << 15) >> 16;
 
     temp = best_eng * *exc_eng >> 3;
 
@@ -893,7 +893,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
     for (i = 0; i < SUBFRAME_LEN; i++) {
         int temp = buf[i] >> 2;
         temp *= temp;
-        denom = av_clipl_int32((int64_t)denom + (temp << 1));
+        denom = av_sat_dadd32(denom, temp);
     }
 
     if (num && denom) {
@@ -977,9 +977,8 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
 
         /* Compensation filter */
         for (j = 0; j < SUBFRAME_LEN; j++) {
-            buf_ptr[j] = av_clipl_int32((int64_t)signal_ptr[j] +
-                                        ((signal_ptr[j - 1] >> 16) *
-                                         temp << 1)) >> 16;
+            buf_ptr[j] = av_sat_dadd32(signal_ptr[j],
+                                       (signal_ptr[j - 1] >> 16) * temp) >> 16;
         }
 
         /* Compute normalized signal energy */

From 371610510361f89948b87f57799ee56deab02503 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 02:43:14 +0100
Subject: [PATCH 13/22] g723.1: do not needlessly use int64_t

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 8a394f9340..8d5ac71ccd 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -710,7 +710,7 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,
 
     int16_t scale;
     int i;
-    int64_t temp1, temp2;
+    int temp1, temp2;
 
     /*
      * 0 - target energy

From 783da0d6961c6de4ecc3f56fb291738f460782da Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 12:16:53 +0100
Subject: [PATCH 14/22] g723.1: make autocorr_max() work on an arbitrary buffer

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 8d5ac71ccd..e1d8591920 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -615,18 +615,17 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
 /**
  * Estimate maximum auto-correlation around pitch lag.
  *
- * @param p         the context
+ * @param buf       buffer with offset applied
  * @param offset    offset of the excitation vector
  * @param ccr_max   pointer to the maximum auto-correlation
  * @param pitch_lag decoded pitch lag
  * @param length    length of autocorrelation
  * @param dir       forward lag(1) / backward lag(-1)
  */
-static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max,
+static int autocorr_max(const int16_t *buf, int offset, int *ccr_max,
                         int pitch_lag, int length, int dir)
 {
     int limit, ccr, lag = 0;
-    int16_t *buf = p->excitation + offset;
     int i;
 
     pitch_lag = FFMIN(PITCH_MAX - 3, pitch_lag);
@@ -721,9 +720,9 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,
      */
     int energy[5] = {0, 0, 0, 0, 0};
     int16_t *buf  = p->excitation + offset;
-    int fwd_lag   = autocorr_max(p, offset, &energy[1], pitch_lag,
+    int fwd_lag   = autocorr_max(buf, offset, &energy[1], pitch_lag,
                                  SUBFRAME_LEN, 1);
-    int back_lag  = autocorr_max(p, offset, &energy[3], pitch_lag,
+    int back_lag  = autocorr_max(buf, offset, &energy[3], pitch_lag,
                                  SUBFRAME_LEN, -1);
 
     ppf->index    = 0;
@@ -800,7 +799,7 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
 
     /* Compute maximum backward cross-correlation */
     ccr   = 0;
-    index = autocorr_max(p, offset, &ccr, pitch_lag, SUBFRAME_LEN * 2, -1);
+    index = autocorr_max(buf, offset, &ccr, pitch_lag, SUBFRAME_LEN * 2, -1);
     ccr   = av_sat_add32(ccr, 1 << 15) >> 16;
 
     /* Compute target energy */

From b2af2c4bee7eb9f256a9f4143ca90b020d2c9569 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 12:21:41 +0100
Subject: [PATCH 15/22] g723.1: make scale_vector() output to a separate buffer

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index e1d8591920..6f4898cfb9 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -278,7 +278,7 @@ static int normalize_bits(int num, int width)
 /**
  * Scale vector contents based on the largest of their absolutes.
  */
-static int scale_vector(int16_t *vector, int length)
+static int scale_vector(int16_t *dst, const int16_t *vector, int length)
 {
     int bits, max = 0;
     int i;
@@ -292,10 +292,10 @@ static int scale_vector(int16_t *vector, int length)
 
     if (bits == 15)
         for (i = 0; i < length; i++)
-            vector[i] = vector[i] * 0x7fff >> 3;
+            dst[i] = vector[i] * 0x7fff >> 3;
     else
         for (i = 0; i < length; i++)
-            vector[i] = vector[i] << bits >> 3;
+            dst[i] = vector[i] << bits >> 3;
 
     return bits - 3;
 }
@@ -791,11 +791,11 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
                              int *exc_eng, int *scale)
 {
     int offset = PITCH_MAX + 2 * SUBFRAME_LEN;
-    int16_t *buf = p->excitation + offset;
+    const int16_t *buf = p->excitation + offset;
 
     int index, ccr, tgt_eng, best_eng, temp;
 
-    *scale = scale_vector(p->excitation, FRAME_LEN + PITCH_MAX);
+    *scale = scale_vector(p->excitation, p->excitation, FRAME_LEN + PITCH_MAX);
 
     /* Compute maximum backward cross-correlation */
     ccr   = 0;
@@ -958,8 +958,7 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
         int scale, energy;
 
         /* Normalize */
-        memcpy(temp_vector, buf_ptr, SUBFRAME_LEN * sizeof(*temp_vector));
-        scale = scale_vector(temp_vector, SUBFRAME_LEN);
+        scale = scale_vector(temp_vector, buf_ptr, SUBFRAME_LEN);
 
         /* Compute auto correlation coefficients */
         auto_corr[0] = dot_product(temp_vector, temp_vector + 1,

From 1953264331ec653bff2e3d85391b91c0ea5299ae Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 03:39:30 +0100
Subject: [PATCH 16/22] g723.1: drop unnecessary variable buf_ptr in
 formant_postfilter()

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 6f4898cfb9..a2f7dee4f4 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -926,7 +926,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
  */
 static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
 {
-    int16_t filter_coef[2][LPC_ORDER], *buf_ptr;
+    int16_t filter_coef[2][LPC_ORDER];
     int filter_signal[LPC_ORDER + FRAME_LEN], *signal_ptr;
     int i, j, k;
 
@@ -949,7 +949,7 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
     memcpy(p->iir_mem, filter_signal + FRAME_LEN,
            LPC_ORDER * sizeof(*p->iir_mem));
 
-    buf_ptr    = buf + LPC_ORDER;
+    buf += LPC_ORDER;
     signal_ptr = filter_signal + LPC_ORDER;
     for (i = 0; i < SUBFRAMES; i++) {
         int16_t temp_vector[SUBFRAME_LEN];
@@ -958,7 +958,7 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
         int scale, energy;
 
         /* Normalize */
-        scale = scale_vector(temp_vector, buf_ptr, SUBFRAME_LEN);
+        scale = scale_vector(temp_vector, buf, SUBFRAME_LEN);
 
         /* Compute auto correlation coefficients */
         auto_corr[0] = dot_product(temp_vector, temp_vector + 1,
@@ -975,8 +975,8 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
 
         /* Compensation filter */
         for (j = 0; j < SUBFRAME_LEN; j++) {
-            buf_ptr[j] = av_sat_dadd32(signal_ptr[j],
-                                       (signal_ptr[j - 1] >> 16) * temp) >> 16;
+            buf[j] = av_sat_dadd32(signal_ptr[j],
+                                   (signal_ptr[j - 1] >> 16) * temp) >> 16;
         }
 
         /* Compute normalized signal energy */
@@ -986,9 +986,9 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
         } else
             energy = auto_corr[1] >> temp;
 
-        gain_scale(p, buf_ptr, energy);
+        gain_scale(p, buf, energy);
 
-        buf_ptr    += SUBFRAME_LEN;
+        buf        += SUBFRAME_LEN;
         signal_ptr += SUBFRAME_LEN;
     }
 }

From f645710cf31f6268fbf279f4515e6012dcd11ac2 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 22:26:38 +0100
Subject: [PATCH 17/22] g723.1: make postfilter write directly to output buffer

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index a2f7dee4f4..c980ec2f50 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -922,9 +922,11 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy)
  *
  * @param p   the context
  * @param lpc quantized lpc coefficients
- * @param buf output buffer
+ * @param buf input buffer
+ * @param dst output buffer
  */
-static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
+static void formant_postfilter(G723_1_Context *p, int16_t *lpc,
+                               int16_t *buf, int16_t *dst)
 {
     int16_t filter_coef[2][LPC_ORDER];
     int filter_signal[LPC_ORDER + FRAME_LEN], *signal_ptr;
@@ -952,18 +954,16 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
     buf += LPC_ORDER;
     signal_ptr = filter_signal + LPC_ORDER;
     for (i = 0; i < SUBFRAMES; i++) {
-        int16_t temp_vector[SUBFRAME_LEN];
         int temp;
         int auto_corr[2];
         int scale, energy;
 
         /* Normalize */
-        scale = scale_vector(temp_vector, buf, SUBFRAME_LEN);
+        scale = scale_vector(dst, buf, SUBFRAME_LEN);
 
         /* Compute auto correlation coefficients */
-        auto_corr[0] = dot_product(temp_vector, temp_vector + 1,
-                                   SUBFRAME_LEN - 1);
-        auto_corr[1] = dot_product(temp_vector, temp_vector, SUBFRAME_LEN);
+        auto_corr[0] = dot_product(dst, dst + 1, SUBFRAME_LEN - 1);
+        auto_corr[1] = dot_product(dst, dst,     SUBFRAME_LEN);
 
         /* Compute reflection coefficient */
         temp = auto_corr[1] >> 16;
@@ -975,7 +975,7 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
 
         /* Compensation filter */
         for (j = 0; j < SUBFRAME_LEN; j++) {
-            buf[j] = av_sat_dadd32(signal_ptr[j],
+            dst[j] = av_sat_dadd32(signal_ptr[j],
                                    (signal_ptr[j - 1] >> 16) * temp) >> 16;
         }
 
@@ -986,10 +986,11 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf)
         } else
             energy = auto_corr[1] >> temp;
 
-        gain_scale(p, buf, energy);
+        gain_scale(p, dst, energy);
 
         buf        += SUBFRAME_LEN;
         signal_ptr += SUBFRAME_LEN;
+        dst        += SUBFRAME_LEN;
     }
 }
 
@@ -1136,8 +1137,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     memcpy(p->synth_mem, p->audio + FRAME_LEN, LPC_ORDER * sizeof(*p->audio));
 
     if (p->postfilter) {
-        formant_postfilter(p, lpc, p->audio);
-        memcpy(p->frame.data[0], p->audio + LPC_ORDER, FRAME_LEN * 2);
+        formant_postfilter(p, lpc, p->audio, out);
     } else { // if output is not postfiltered it should be scaled by 2
         for (i = 0; i < FRAME_LEN; i++)
             out[i] = av_clip_int16(p->audio[LPC_ORDER + i] << 1);

From 4b728b4712403058ac4dc45daa8b5c03a688fadf Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 05:23:59 +0100
Subject: [PATCH 18/22] g723.1: avoid unnecessary memcpy() in residual_interp()

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index c980ec2f50..050278d16c 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -838,10 +838,9 @@ static void residual_interp(int16_t *buf, int16_t *out, int lag,
         int16_t *vector_ptr = buf + PITCH_MAX;
         /* Attenuate */
         for (i = 0; i < lag; i++)
-            vector_ptr[i - lag] = vector_ptr[i - lag] * 3 >> 2;
-        av_memcpy_backptr((uint8_t*)vector_ptr, lag * sizeof(*vector_ptr),
-                          FRAME_LEN * sizeof(*vector_ptr));
-        memcpy(out, vector_ptr, FRAME_LEN * sizeof(*vector_ptr));
+            out[i] = vector_ptr[i - lag] * 3 >> 2;
+        av_memcpy_backptr((uint8_t*)(out + lag), lag * sizeof(*out),
+                          (FRAME_LEN - lag) * sizeof(*out));
     } else {  /* Unvoiced */
         for (i = 0; i < FRAME_LEN; i++) {
             *rseed = *rseed * 521 + 259;
@@ -1100,23 +1099,31 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
                                                  ppf[j].opt_gain,
                                                  1 << 14, 15, SUBFRAME_LEN);
 
+            /* Save the excitation for the next frame */
+            memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
+                   PITCH_MAX * sizeof(*p->excitation));
         } else {
             p->interp_gain = (p->interp_gain * 3 + 2) >> 2;
             if (p->erased_frames == 3) {
                 /* Mute output */
                 memset(p->excitation, 0,
                        (FRAME_LEN + PITCH_MAX) * sizeof(*p->excitation));
+                memset(p->prev_excitation, 0,
+                       PITCH_MAX * sizeof(*p->excitation));
                 memset(p->frame.data[0], 0,
                        (FRAME_LEN + LPC_ORDER) * sizeof(int16_t));
             } else {
+                int16_t *buf = p->audio + LPC_ORDER;
+
                 /* Regenerate frame */
-                residual_interp(p->excitation, p->audio + LPC_ORDER, p->interp_index,
+                residual_interp(p->excitation, buf, p->interp_index,
                                 p->interp_gain, &p->random_seed);
+
+                /* Save the excitation for the next frame */
+                memcpy(p->prev_excitation, buf + (FRAME_LEN - PITCH_MAX),
+                       PITCH_MAX * sizeof(*p->excitation));
             }
         }
-        /* Save the excitation for the next frame */
-        memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
-               PITCH_MAX * sizeof(*p->excitation));
     } else {
         memset(out, 0, FRAME_LEN * 2);
         av_log(avctx, AV_LOG_WARNING,

From 35b533e4dede0b1abe3ddbe927c893819006ee75 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 19:59:08 +0100
Subject: [PATCH 19/22] g723.1: avoid saving/restoring excitation

Writing the scaled excitation to a scratch buffer (borrowing the
'audio' array) instead of modifying it in place avoids the need
to save and restore the unscaled values.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 050278d16c..c0cea192c2 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -99,7 +99,7 @@ typedef struct g723_1_context {
     int pf_gain;
     int postfilter;
 
-    int16_t audio[FRAME_LEN + LPC_ORDER];
+    int16_t audio[FRAME_LEN + LPC_ORDER + PITCH_MAX];
 } G723_1_Context;
 
 static av_cold int g723_1_decode_init(AVCodecContext *avctx)
@@ -719,7 +719,7 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag,
      * 4 - backward residual energy
      */
     int energy[5] = {0, 0, 0, 0, 0};
-    int16_t *buf  = p->excitation + offset;
+    int16_t *buf  = p->audio + LPC_ORDER + offset;
     int fwd_lag   = autocorr_max(buf, offset, &energy[1], pitch_lag,
                                  SUBFRAME_LEN, 1);
     int back_lag  = autocorr_max(buf, offset, &energy[3], pitch_lag,
@@ -791,11 +791,12 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
                              int *exc_eng, int *scale)
 {
     int offset = PITCH_MAX + 2 * SUBFRAME_LEN;
-    const int16_t *buf = p->excitation + offset;
+    int16_t *buf = p->audio + LPC_ORDER;
 
     int index, ccr, tgt_eng, best_eng, temp;
 
-    *scale = scale_vector(p->excitation, p->excitation, FRAME_LEN + PITCH_MAX);
+    *scale = scale_vector(buf, p->excitation, FRAME_LEN + PITCH_MAX);
+    buf   += offset;
 
     /* Compute maximum backward cross-correlation */
     ccr   = 0;
@@ -1008,6 +1009,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     int16_t *vector_ptr;
     int16_t *out;
     int bad_frame = 0, i, j, ret;
+    int16_t *audio = p->audio;
 
     if (buf_size < frame_size[dec_mode]) {
         if (buf_size)
@@ -1071,26 +1073,16 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
 
             vector_ptr = p->excitation + PITCH_MAX;
 
-            /* Save the excitation */
-            memcpy(p->audio + LPC_ORDER, vector_ptr, FRAME_LEN * sizeof(*p->audio));
-
             p->interp_index = comp_interp_index(p, p->pitch_lag[1],
                                                 &p->sid_gain, &p->cur_gain);
 
+            /* Perform pitch postfiltering */
             if (p->postfilter) {
                 i = PITCH_MAX;
                 for (j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
                     comp_ppf_coeff(p, i, p->pitch_lag[j >> 1],
                                    ppf + j, p->cur_rate);
-            }
 
-            /* Restore the original excitation */
-            memcpy(p->excitation, p->prev_excitation,
-                   PITCH_MAX * sizeof(*p->excitation));
-            memcpy(vector_ptr, p->audio + LPC_ORDER, FRAME_LEN * sizeof(*vector_ptr));
-
-            /* Peform pitch postfiltering */
-            if (p->postfilter)
                 for (i = 0, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
                     ff_acelp_weighted_vector_sum(p->audio + LPC_ORDER + i,
                                                  vector_ptr + i,
@@ -1098,6 +1090,9 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
                                                  ppf[j].sc_gain,
                                                  ppf[j].opt_gain,
                                                  1 << 14, 15, SUBFRAME_LEN);
+            } else {
+                audio = vector_ptr - LPC_ORDER;
+            }
 
             /* Save the excitation for the next frame */
             memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
@@ -1139,7 +1134,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     memcpy(p->audio, p->synth_mem, LPC_ORDER * sizeof(*p->audio));
     for (i = LPC_ORDER, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
         ff_celp_lp_synthesis_filter(p->audio + i, &lpc[j * LPC_ORDER],
-                                    p->audio + i, SUBFRAME_LEN, LPC_ORDER,
+                                    audio + i, SUBFRAME_LEN, LPC_ORDER,
                                     0, 1, 1 << 12);
     memcpy(p->synth_mem, p->audio + FRAME_LEN, LPC_ORDER * sizeof(*p->audio));
 

From cbcf1b411fac6aabf731e14e4f48bca3d956f868 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 20:19:39 +0100
Subject: [PATCH 20/22] g723.1: declare a variable in the block it is used

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index c0cea192c2..657c144895 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -1006,7 +1006,6 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     int16_t cur_lsp[LPC_ORDER];
     int16_t lpc[SUBFRAMES * LPC_ORDER];
     int16_t acb_vector[SUBFRAME_LEN];
-    int16_t *vector_ptr;
     int16_t *out;
     int bad_frame = 0, i, j, ret;
     int16_t *audio = p->audio;
@@ -1051,8 +1050,9 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
         /* Generate the excitation for the frame */
         memcpy(p->excitation, p->prev_excitation,
                PITCH_MAX * sizeof(*p->excitation));
-        vector_ptr = p->excitation + PITCH_MAX;
         if (!p->erased_frames) {
+            int16_t *vector_ptr = p->excitation + PITCH_MAX;
+
             /* Update interpolation gain memory */
             p->interp_gain = fixed_cb_gain[(p->subframe[2].amp_index +
                                             p->subframe[3].amp_index) >> 1];

From 138914dcd83132f6edc6f1799c5a17e0b6b559bb Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 21:00:21 +0100
Subject: [PATCH 21/22] g723.1: do not bounce intermediate values via memory

Although a reasonable compiler will probably optimise out the
actual store and load, this operation still implies a truncation
to 16 bits, which the compiler is unlikely to recognise as
unnecessary here.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 657c144895..4c1c4dad66 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -1064,9 +1064,8 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
                                    p->cur_rate);
                 /* Get the total excitation */
                 for (j = 0; j < SUBFRAME_LEN; j++) {
-                    vector_ptr[j] = av_clip_int16(vector_ptr[j] << 1);
-                    vector_ptr[j] = av_clip_int16(vector_ptr[j] +
-                                                  acb_vector[j]);
+                    int v = av_clip_int16(vector_ptr[j] << 1);
+                    vector_ptr[j] = av_clip_int16(v + acb_vector[j]);
                 }
                 vector_ptr += SUBFRAME_LEN;
             }

From 69665bd6f40f02ecf822f80c05dd2765da2dfa7b Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Aug 2012 21:04:14 +0100
Subject: [PATCH 22/22] g723.1: do not pass large structs by value

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/g723_1.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 4c1c4dad66..37b7ff5f09 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -498,7 +498,7 @@ static void gen_dirac_train(int16_t *buf, int pitch_lag)
  * @param pitch_lag closed loop pitch lag
  * @param index     current subframe index
  */
-static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm,
+static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe *subfrm,
                                enum Rate cur_rate, int pitch_lag, int index)
 {
     int temp, i, j;
@@ -506,34 +506,34 @@ static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm,
     memset(vector, 0, SUBFRAME_LEN * sizeof(*vector));
 
     if (cur_rate == RATE_6300) {
-        if (subfrm.pulse_pos >= max_pos[index])
+        if (subfrm->pulse_pos >= max_pos[index])
             return;
 
         /* Decode amplitudes and positions */
         j = PULSE_MAX - pulses[index];
-        temp = subfrm.pulse_pos;
+        temp = subfrm->pulse_pos;
         for (i = 0; i < SUBFRAME_LEN / GRID_SIZE; i++) {
             temp -= combinatorial_table[j][i];
             if (temp >= 0)
                 continue;
             temp += combinatorial_table[j++][i];
-            if (subfrm.pulse_sign & (1 << (PULSE_MAX - j))) {
-                vector[subfrm.grid_index + GRID_SIZE * i] =
-                                        -fixed_cb_gain[subfrm.amp_index];
+            if (subfrm->pulse_sign & (1 << (PULSE_MAX - j))) {
+                vector[subfrm->grid_index + GRID_SIZE * i] =
+                                        -fixed_cb_gain[subfrm->amp_index];
             } else {
-                vector[subfrm.grid_index + GRID_SIZE * i] =
-                                         fixed_cb_gain[subfrm.amp_index];
+                vector[subfrm->grid_index + GRID_SIZE * i] =
+                                         fixed_cb_gain[subfrm->amp_index];
             }
             if (j == PULSE_MAX)
                 break;
         }
-        if (subfrm.dirac_train == 1)
+        if (subfrm->dirac_train == 1)
             gen_dirac_train(vector, pitch_lag);
     } else { /* 5300 bps */
-        int cb_gain  = fixed_cb_gain[subfrm.amp_index];
-        int cb_shift = subfrm.grid_index;
-        int cb_sign  = subfrm.pulse_sign;
-        int cb_pos   = subfrm.pulse_pos;
+        int cb_gain  = fixed_cb_gain[subfrm->amp_index];
+        int cb_shift = subfrm->grid_index;
+        int cb_sign  = subfrm->pulse_sign;
+        int cb_pos   = subfrm->pulse_pos;
         int offset, beta, lag;
 
         for (i = 0; i < 8; i += 2) {
@@ -544,9 +544,9 @@ static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm,
         }
 
         /* Enhance harmonic components */
-        lag  = pitch_contrib[subfrm.ad_cb_gain << 1] + pitch_lag +
-               subfrm.ad_cb_lag - 1;
-        beta = pitch_contrib[(subfrm.ad_cb_gain << 1) + 1];
+        lag  = pitch_contrib[subfrm->ad_cb_gain << 1] + pitch_lag +
+               subfrm->ad_cb_lag - 1;
+        beta = pitch_contrib[(subfrm->ad_cb_gain << 1) + 1];
 
         if (lag < SUBFRAME_LEN - 2) {
             for (i = lag; i < SUBFRAME_LEN; i++)
@@ -586,12 +586,12 @@ static int dot_product(const int16_t *a, const int16_t *b, int length)
  * Generate adaptive codebook excitation.
  */
 static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
-                               int pitch_lag, G723_1_Subframe subfrm,
+                               int pitch_lag, G723_1_Subframe *subfrm,
                                enum Rate cur_rate)
 {
     int16_t residual[SUBFRAME_LEN + PITCH_ORDER - 1];
     const int16_t *cb_ptr;
-    int lag = pitch_lag + subfrm.ad_cb_lag - 1;
+    int lag = pitch_lag + subfrm->ad_cb_lag - 1;
 
     int i;
     int sum;
@@ -605,7 +605,7 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
         cb_ptr = adaptive_cb_gain170;
 
     /* Calculate adaptive vector */
-    cb_ptr += subfrm.ad_cb_gain * 20;
+    cb_ptr += subfrm->ad_cb_gain * 20;
     for (i = 0; i < SUBFRAME_LEN; i++) {
         sum = dot_product(residual + i, cb_ptr, PITCH_ORDER);
         vector[i] = av_sat_dadd32(1 << 15, sum) >> 16;
@@ -1057,10 +1057,10 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
             p->interp_gain = fixed_cb_gain[(p->subframe[2].amp_index +
                                             p->subframe[3].amp_index) >> 1];
             for (i = 0; i < SUBFRAMES; i++) {
-                gen_fcb_excitation(vector_ptr, p->subframe[i], p->cur_rate,
+                gen_fcb_excitation(vector_ptr, &p->subframe[i], p->cur_rate,
                                    p->pitch_lag[i >> 1], i);
                 gen_acb_excitation(acb_vector, &p->excitation[SUBFRAME_LEN * i],
-                                   p->pitch_lag[i >> 1], p->subframe[i],
+                                   p->pitch_lag[i >> 1], &p->subframe[i],
                                    p->cur_rate);
                 /* Get the total excitation */
                 for (j = 0; j < SUBFRAME_LEN; j++) {