From 543a46b4b2ff0f02fa85e45d7b7bcda19d3be9b4 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 24 Sep 2022 00:46:44 +0200
Subject: [PATCH] opus: convert encoder and decoder to lavu/tx

This commit changes both the encoder and decoder to use the new lavu/tx code,
which has faster C transforms and more assembly optimizations.
---
 libavcodec/opus_celt.c   | 20 ++++++++++++--------
 libavcodec/opus_celt.h   |  5 +++--
 libavcodec/opusenc.c     | 15 +++++++++------
 libavcodec/opusenc_psy.c | 13 ++++++++-----
 libavcodec/opusenc_psy.h |  4 +++-
 5 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/libavcodec/opus_celt.c b/libavcodec/opus_celt.c
index 9dbeff1927..f1fb88a56d 100644
--- a/libavcodec/opus_celt.c
+++ b/libavcodec/opus_celt.c
@@ -323,7 +323,8 @@ int ff_celt_decode_frame(CeltFrame *f, OpusRangeCoder *rc,
 {
     int i, j, downmix = 0;
     int consumed;           // bits of entropy consumed thus far for this frame
-    MDCT15Context *imdct;
+    AVTXContext *imdct;
+    av_tx_fn imdct_fn;
 
     if (channels != 1 && channels != 2) {
         av_log(f->avctx, AV_LOG_ERROR, "Invalid number of coded channels: %d\n",
@@ -385,7 +386,8 @@ int ff_celt_decode_frame(CeltFrame *f, OpusRangeCoder *rc,
     f->blocks    = f->transient ? 1 << f->size : 1;
     f->blocksize = frame_size / f->blocks;
 
-    imdct = f->imdct[f->transient ? 0 : f->size];
+    imdct = f->tx[f->transient ? 0 : f->size];
+    imdct_fn = f->tx_fn[f->transient ? 0 : f->size];
 
     if (channels == 1) {
         for (i = 0; i < CELT_MAX_BANDS; i++)
@@ -440,8 +442,8 @@ int ff_celt_decode_frame(CeltFrame *f, OpusRangeCoder *rc,
         for (j = 0; j < f->blocks; j++) {
             float *dst  = block->buf + 1024 + j * f->blocksize;
 
-            imdct->imdct_half(imdct, dst + CELT_OVERLAP / 2, f->block[i].coeffs + j,
-                              f->blocks);
+            imdct_fn(imdct, dst + CELT_OVERLAP / 2, f->block[i].coeffs + j,
+                     sizeof(float)*f->blocks);
             f->dsp->vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
                                        ff_celt_window, CELT_OVERLAP / 2);
         }
@@ -526,8 +528,8 @@ void ff_celt_free(CeltFrame **f)
     if (!frm)
         return;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++)
-        ff_mdct15_uninit(&frm->imdct[i]);
+    for (i = 0; i < FF_ARRAY_ELEMS(frm->tx); i++)
+        av_tx_uninit(&frm->tx[i]);
 
     ff_celt_pvq_uninit(&frm->pvq);
 
@@ -555,9 +557,11 @@ int ff_celt_init(AVCodecContext *avctx, CeltFrame **f, int output_channels,
     frm->output_channels = output_channels;
     frm->apply_phase_inv = apply_phase_inv;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(frm->imdct); i++)
-        if ((ret = ff_mdct15_init(&frm->imdct[i], 1, i + 3, -1.0f/32768)) < 0)
+    for (i = 0; i < FF_ARRAY_ELEMS(frm->tx); i++) {
+        const float scale = -1.0f/32768;
+        if ((ret = av_tx_init(&frm->tx[i], &frm->tx_fn[i], AV_TX_FLOAT_MDCT, 1, 15 << (i + 3), &scale, 0)) < 0)
             goto fail;
+    }
 
     if ((ret = ff_celt_pvq_init(&frm->pvq, 0)) < 0)
         goto fail;
diff --git a/libavcodec/opus_celt.h b/libavcodec/opus_celt.h
index 661ca251de..291a544298 100644
--- a/libavcodec/opus_celt.h
+++ b/libavcodec/opus_celt.h
@@ -30,10 +30,10 @@
 #include "opus_pvq.h"
 #include "opusdsp.h"
 
-#include "mdct15.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/libm.h"
 #include "libavutil/mem_internal.h"
+#include "libavutil/tx.h"
 
 #define CELT_VECTORS                 11
 #define CELT_ALLOC_STEPS             6
@@ -93,7 +93,8 @@ typedef struct CeltBlock {
 struct CeltFrame {
     // constant values that do not change during context lifetime
     AVCodecContext      *avctx;
-    MDCT15Context       *imdct[4];
+    AVTXContext        *tx[4];
+    av_tx_fn            tx_fn[4];
     AVFloatDSPContext   *dsp;
     CeltBlock           block[2];
     CeltPVQ             *pvq;
diff --git a/libavcodec/opusenc.c b/libavcodec/opusenc.c
index a7a9d3a5f5..8cdd27d930 100644
--- a/libavcodec/opusenc.c
+++ b/libavcodec/opusenc.c
@@ -40,7 +40,8 @@ typedef struct OpusEncContext {
     AVCodecContext *avctx;
     AudioFrameQueue afq;
     AVFloatDSPContext *dsp;
-    MDCT15Context *mdct[CELT_BLOCK_NB];
+    AVTXContext *tx[CELT_BLOCK_NB];
+    av_tx_fn tx_fn[CELT_BLOCK_NB];
     CeltPVQ *pvq;
     struct FFBufQueue bufqueue;
 
@@ -204,7 +205,7 @@ static void celt_frame_mdct(OpusEncContext *s, CeltFrame *f)
                 s->dsp->vector_fmul_reverse(&win[CELT_OVERLAP], src2,
                                             ff_celt_window - 8, 128);
                 src1 = src2;
-                s->mdct[0]->mdct(s->mdct[0], b->coeffs + t, win, f->blocks);
+                s->tx_fn[0](s->tx[0], b->coeffs + t, win, sizeof(float)*f->blocks);
             }
         }
     } else {
@@ -226,7 +227,7 @@ static void celt_frame_mdct(OpusEncContext *s, CeltFrame *f)
                                         ff_celt_window - 8, 128);
             memcpy(win + lap_dst + blk_len, temp, CELT_OVERLAP*sizeof(float));
 
-            s->mdct[f->size]->mdct(s->mdct[f->size], b->coeffs, win, 1);
+            s->tx_fn[f->size](s->tx[f->size], b->coeffs, win, sizeof(float));
         }
     }
 
@@ -612,7 +613,7 @@ static av_cold int opus_encode_end(AVCodecContext *avctx)
     OpusEncContext *s = avctx->priv_data;
 
     for (int i = 0; i < CELT_BLOCK_NB; i++)
-        ff_mdct15_uninit(&s->mdct[i]);
+        av_tx_uninit(&s->tx[i]);
 
     ff_celt_pvq_uninit(&s->pvq);
     av_freep(&s->dsp);
@@ -668,9 +669,11 @@ static av_cold int opus_encode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     /* I have no idea why a base scaling factor of 68 works, could be the twiddles */
-    for (int i = 0; i < CELT_BLOCK_NB; i++)
-        if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i))))
+    for (int i = 0; i < CELT_BLOCK_NB; i++) {
+        const float scale = 68 << (CELT_BLOCK_NB - 1 - i);
+        if ((ret = av_tx_init(&s->tx[i], &s->tx_fn[i], AV_TX_FLOAT_MDCT, 0, 15 << (i + 3), &scale, 0)))
             return AVERROR(ENOMEM);
+    }
 
     /* Zero out previous energy (matters for inter first frame) */
     for (int ch = 0; ch < s->channels; ch++)
diff --git a/libavcodec/opusenc_psy.c b/libavcodec/opusenc_psy.c
index 1c8f69269c..3bff57d347 100644
--- a/libavcodec/opusenc_psy.c
+++ b/libavcodec/opusenc_psy.c
@@ -22,7 +22,6 @@
 #include "opusenc_psy.h"
 #include "opus_pvq.h"
 #include "opustab.h"
-#include "mdct15.h"
 #include "libavutil/qsort.h"
 
 static float pvq_band_cost(CeltPVQ *pvq, CeltFrame *f, OpusRangeCoder *rc, int band,
@@ -99,7 +98,8 @@ static void step_collect_psy_metrics(OpusPsyContext *s, int index)
         s->dsp->vector_fmul(s->scratch, s->scratch, s->window[s->bsize_analysis],
                             (OPUS_BLOCK_SIZE(s->bsize_analysis) << 1));
 
-        s->mdct[s->bsize_analysis]->mdct(s->mdct[s->bsize_analysis], st->coeffs[ch], s->scratch, 1);
+        s->mdct_fn[s->bsize_analysis](s->mdct[s->bsize_analysis], st->coeffs[ch],
+                                      s->scratch, sizeof(float));
 
         for (i = 0; i < CELT_MAX_BANDS; i++)
             st->bands[ch][i] = &st->coeffs[ch][ff_celt_freq_bands[i] << s->bsize_analysis];
@@ -558,13 +558,16 @@ av_cold int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx,
     for (i = 0; i < CELT_BLOCK_NB; i++) {
         float tmp;
         const int len = OPUS_BLOCK_SIZE(i);
+        const float scale = 68 << (CELT_BLOCK_NB - 1 - i);
         s->window[i] = av_malloc(2*len*sizeof(float));
         if (!s->window[i]) {
             ret = AVERROR(ENOMEM);
             goto fail;
         }
         generate_window_func(s->window[i], 2*len, WFUNC_SINE, &tmp);
-        if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i))))
+        ret = av_tx_init(&s->mdct[i], &s->mdct_fn[i], AV_TX_FLOAT_MDCT,
+                         0, 15 << (i + 3), &scale, 0);
+        if (ret < 0)
             goto fail;
     }
 
@@ -575,7 +578,7 @@ fail:
     av_freep(&s->dsp);
 
     for (i = 0; i < CELT_BLOCK_NB; i++) {
-        ff_mdct15_uninit(&s->mdct[i]);
+        av_tx_uninit(&s->mdct[i]);
         av_freep(&s->window[i]);
     }
 
@@ -598,7 +601,7 @@ av_cold int ff_opus_psy_end(OpusPsyContext *s)
     av_freep(&s->dsp);
 
     for (i = 0; i < CELT_BLOCK_NB; i++) {
-        ff_mdct15_uninit(&s->mdct[i]);
+        av_tx_uninit(&s->mdct[i]);
         av_freep(&s->window[i]);
     }
 
diff --git a/libavcodec/opusenc_psy.h b/libavcodec/opusenc_psy.h
index d4fb096a3d..0a7cdb6f2c 100644
--- a/libavcodec/opusenc_psy.h
+++ b/libavcodec/opusenc_psy.h
@@ -22,6 +22,7 @@
 #ifndef AVCODEC_OPUSENC_PSY_H
 #define AVCODEC_OPUSENC_PSY_H
 
+#include "libavutil/tx.h"
 #include "libavutil/mem_internal.h"
 
 #include "opusenc.h"
@@ -70,7 +71,8 @@ typedef struct OpusPsyContext {
     int max_steps;
 
     float *window[CELT_BLOCK_NB];
-    MDCT15Context *mdct[CELT_BLOCK_NB];
+    AVTXContext *mdct[CELT_BLOCK_NB];
+    av_tx_fn mdct_fn[CELT_BLOCK_NB];
     int bsize_analysis;
 
     DECLARE_ALIGNED(32, float, scratch)[2048];