/*
 * Copyright (C) 2019 Philip Langdale <philipl@overt.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

__device__ static const int coef_lf[2] = { 4309, 213 };
__device__ static const int coef_hf[3] = { 5570, 3801, 1016 };
__device__ static const int coef_sp[2] = { 5077, 981 };

template<typename T>
__inline__ __device__ T max3(T a, T b, T c)
{
    T x = max(a, b);
    return max(x, c);
}

template<typename T>
__inline__ __device__ T min3(T a, T b, T c)
{
    T x = min(a, b);
    return min(x, c);
}

template<typename T>
__inline__ __device__ T clip(T a, T min, T max)
{
    if (a < min) {
        return min;
    } else if (a > max) {
        return max;
    } else {
        return a;
    }
}

template<typename T>
__inline__ __device__ T filter_intra(T cur_prefs3, T cur_prefs,
                                     T cur_mrefs, T cur_mrefs3,
                                     int clip_max)
{
    int final = (coef_sp[0] * (cur_mrefs + cur_prefs) -
                 coef_sp[1] * (cur_mrefs3 + cur_prefs3)) >> 13;
    return clip(final, 0, clip_max);
}

template<typename T>
__inline__ __device__ T filter(T cur_prefs3, T cur_prefs, T cur_mrefs, T cur_mrefs3,
                               T prev2_prefs4, T prev2_prefs2, T prev2_0, T prev2_mrefs2, T prev2_mrefs4,
                               T prev_prefs, T prev_mrefs, T next_prefs, T next_mrefs,
                               T next2_prefs4, T next2_prefs2, T next2_0, T next2_mrefs2, T next2_mrefs4,
                               int clip_max)
{
    T final;

    int c = cur_mrefs;
    int d = (prev2_0 + next2_0) >> 1;
    int e = cur_prefs;

    int temporal_diff0 = abs(prev2_0 - next2_0);
    int temporal_diff1 = (abs(prev_mrefs - c) + abs(prev_prefs - e)) >> 1;
    int temporal_diff2 = (abs(next_mrefs - c) + abs(next_prefs - e)) >> 1;
    int diff = max3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);

    if (!diff) {
        final = d;
    } else {
        int b = ((prev2_mrefs2 + next2_mrefs2) >> 1) - c;
        int f = ((prev2_prefs2 + next2_prefs2) >> 1) - e;
        int dc = d - c;
        int de = d - e;
        int mmax = max3(de, dc, min(b, f));
        int mmin = min3(de, dc, max(b, f));
        diff = max3(diff, mmin, -mmax);

        int interpol;
        if (abs(c - e) > temporal_diff0) {
            interpol = (((coef_hf[0] * (prev2_0 + next2_0)
                - coef_hf[1] * (prev2_mrefs2 + next2_mrefs2 + prev2_prefs2 + next2_prefs2)
                + coef_hf[2] * (prev2_mrefs4 + next2_mrefs4 + prev2_prefs4 + next2_mrefs4)) >> 2)
                + coef_lf[0] * (c + e) - coef_lf[1] * (cur_mrefs3 + cur_prefs3)) >> 13;
        } else {
            interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur_mrefs3 + cur_prefs3)) >> 13;
        }

        if (interpol > d + diff) {
            interpol = d + diff;
        } else if (interpol < d - diff) {
            interpol = d - diff;
        }
        final = clip(interpol, 0, clip_max);
    }

    return final;
}

template<typename T>
__inline__ __device__ void bwdif_single(T *dst,
                                        cudaTextureObject_t prev,
                                        cudaTextureObject_t cur,
                                        cudaTextureObject_t next,
                                        int dst_width, int dst_height, int dst_pitch,
                                        int src_width, int src_height,
                                        int parity, int tff,
                                        int is_field_end, int clip_max)
{
    // Identify location
    int xo = blockIdx.x * blockDim.x + threadIdx.x;
    int yo = blockIdx.y * blockDim.y + threadIdx.y;

    if (xo >= dst_width || yo >= dst_height) {
        return;
    }

    // Don't modify the primary field
    if (yo % 2 == parity) {
      dst[yo*dst_pitch+xo] = tex2D<T>(cur, xo, yo);
      return;
    }

    T cur_prefs3 = tex2D<T>(cur, xo, yo + 3);
    T cur_prefs = tex2D<T>(cur, xo, yo + 1);
    T cur_mrefs = tex2D<T>(cur, xo, yo - 1);
    T cur_mrefs3 = tex2D<T>(cur, xo, yo - 3);

    if (is_field_end) {
        dst[yo*dst_pitch+xo] =
            filter_intra(cur_prefs3, cur_prefs, cur_mrefs, cur_mrefs3, clip_max);
        return;
    }

    // Calculate temporal prediction
    int is_second_field = !(parity ^ tff);

    cudaTextureObject_t prev2 = prev;
    cudaTextureObject_t prev1 = is_second_field ? cur : prev;
    cudaTextureObject_t next1 = is_second_field ? next : cur;
    cudaTextureObject_t next2 = next;

    T prev2_prefs4 = tex2D<T>(prev2, xo,  yo + 4);
    T prev2_prefs2 = tex2D<T>(prev2, xo,  yo + 2);
    T prev2_0 = tex2D<T>(prev2, xo,  yo + 0);
    T prev2_mrefs2 = tex2D<T>(prev2, xo,  yo - 2);
    T prev2_mrefs4 = tex2D<T>(prev2, xo,  yo - 4);
    T prev_prefs = tex2D<T>(prev1, xo,  yo + 1);
    T prev_mrefs = tex2D<T>(prev1, xo,  yo - 1);
    T next_prefs = tex2D<T>(next1, xo,  yo + 1);
    T next_mrefs = tex2D<T>(next1, xo,  yo - 1);
    T next2_prefs4 = tex2D<T>(next2, xo,  yo + 4);
    T next2_prefs2 = tex2D<T>(next2, xo,  yo + 2);
    T next2_0 = tex2D<T>(next2, xo,  yo + 0);
    T next2_mrefs2 = tex2D<T>(next2, xo,  yo - 2);
    T next2_mrefs4 = tex2D<T>(next2, xo,  yo - 4);

    dst[yo*dst_pitch+xo] = filter(cur_prefs3, cur_prefs, cur_mrefs, cur_mrefs3,
                                  prev2_prefs4, prev2_prefs2, prev2_0, prev2_mrefs2, prev2_mrefs4,
                                  prev_prefs, prev_mrefs, next_prefs, next_mrefs,
                                  next2_prefs4, next2_prefs2, next2_0, next2_mrefs2, next2_mrefs4,
                                  clip_max);
}

template <typename T>
__inline__ __device__ void bwdif_double(T *dst,
                                        cudaTextureObject_t prev,
                                        cudaTextureObject_t cur,
                                        cudaTextureObject_t next,
                                        int dst_width, int dst_height, int dst_pitch,
                                        int src_width, int src_height,
                                        int parity, int tff,
                                        int is_field_end, int clip_max)
{
    int xo = blockIdx.x * blockDim.x + threadIdx.x;
    int yo = blockIdx.y * blockDim.y + threadIdx.y;

    if (xo >= dst_width || yo >= dst_height) {
        return;
    }

    if (yo % 2 == parity) {
      // Don't modify the primary field
      dst[yo*dst_pitch+xo] = tex2D<T>(cur, xo, yo);
      return;
    }

    T cur_prefs3 = tex2D<T>(cur, xo, yo + 3);
    T cur_prefs = tex2D<T>(cur, xo, yo + 1);
    T cur_mrefs = tex2D<T>(cur, xo, yo - 1);
    T cur_mrefs3 = tex2D<T>(cur, xo, yo - 3);

    if (is_field_end) {
        T final;
        final.x = filter_intra(cur_prefs3.x, cur_prefs.x, cur_mrefs.x, cur_mrefs3.x,
                               clip_max);
        final.y = filter_intra(cur_prefs3.y, cur_prefs.y, cur_mrefs.y, cur_mrefs3.y,
                               clip_max);
        dst[yo*dst_pitch+xo] = final;
        return;
    }

    int is_second_field = !(parity ^ tff);

    cudaTextureObject_t prev2 = prev;
    cudaTextureObject_t prev1 = is_second_field ? cur : prev;
    cudaTextureObject_t next1 = is_second_field ? next : cur;
    cudaTextureObject_t next2 = next;

    T prev2_prefs4 = tex2D<T>(prev2, xo,  yo + 4);
    T prev2_prefs2 = tex2D<T>(prev2, xo,  yo + 2);
    T prev2_0 = tex2D<T>(prev2, xo,  yo + 0);
    T prev2_mrefs2 = tex2D<T>(prev2, xo,  yo - 2);
    T prev2_mrefs4 = tex2D<T>(prev2, xo,  yo - 4);
    T prev_prefs = tex2D<T>(prev1, xo,  yo + 1);
    T prev_mrefs = tex2D<T>(prev1, xo,  yo - 1);
    T next_prefs = tex2D<T>(next1, xo,  yo + 1);
    T next_mrefs = tex2D<T>(next1, xo,  yo - 1);
    T next2_prefs4 = tex2D<T>(next2, xo,  yo + 4);
    T next2_prefs2 = tex2D<T>(next2, xo,  yo + 2);
    T next2_0 = tex2D<T>(next2, xo,  yo + 0);
    T next2_mrefs2 = tex2D<T>(next2, xo,  yo - 2);
    T next2_mrefs4 = tex2D<T>(next2, xo,  yo - 4);

    T final;
    final.x = filter(cur_prefs3.x, cur_prefs.x, cur_mrefs.x, cur_mrefs3.x,
                     prev2_prefs4.x, prev2_prefs2.x, prev2_0.x, prev2_mrefs2.x, prev2_mrefs4.x,
                     prev_prefs.x, prev_mrefs.x, next_prefs.x, next_mrefs.x,
                     next2_prefs4.x, next2_prefs2.x, next2_0.x, next2_mrefs2.x, next2_mrefs4.x,
                     clip_max);
    final.y = filter(cur_prefs3.y, cur_prefs.y, cur_mrefs.y, cur_mrefs3.y,
                     prev2_prefs4.y, prev2_prefs2.y, prev2_0.y, prev2_mrefs2.y, prev2_mrefs4.y,
                     prev_prefs.y, prev_mrefs.y, next_prefs.y, next_mrefs.y,
                     next2_prefs4.y, next2_prefs2.y, next2_0.y, next2_mrefs2.y, next2_mrefs4.y,
                     clip_max);

    dst[yo*dst_pitch+xo] = final;
}

extern "C" {

__global__ void bwdif_uchar(unsigned char *dst,
                            cudaTextureObject_t prev,
                            cudaTextureObject_t cur,
                            cudaTextureObject_t next,
                            int dst_width, int dst_height, int dst_pitch,
                            int src_width, int src_height,
                            int parity, int tff, int is_field_end, int clip_max)
{
    bwdif_single(dst, prev, cur, next,
                 dst_width, dst_height, dst_pitch,
                 src_width, src_height,
                 parity, tff, is_field_end, clip_max);
}

__global__ void bwdif_ushort(unsigned short *dst,
                            cudaTextureObject_t prev,
                            cudaTextureObject_t cur,
                            cudaTextureObject_t next,
                            int dst_width, int dst_height, int dst_pitch,
                            int src_width, int src_height,
                            int parity, int tff, int is_field_end, int clip_max)
{
    bwdif_single(dst, prev, cur, next,
                 dst_width, dst_height, dst_pitch,
                 src_width, src_height,
                 parity, tff, is_field_end, clip_max);
}

__global__ void bwdif_uchar2(uchar2 *dst,
                            cudaTextureObject_t prev,
                            cudaTextureObject_t cur,
                            cudaTextureObject_t next,
                            int dst_width, int dst_height, int dst_pitch,
                            int src_width, int src_height,
                            int parity, int tff, int is_field_end, int clip_max)
{
    bwdif_double(dst, prev, cur, next,
                 dst_width, dst_height, dst_pitch,
                 src_width, src_height,
                 parity, tff, is_field_end, clip_max);
}

__global__ void bwdif_ushort2(ushort2 *dst,
                            cudaTextureObject_t prev,
                            cudaTextureObject_t cur,
                            cudaTextureObject_t next,
                            int dst_width, int dst_height, int dst_pitch,
                            int src_width, int src_height,
                            int parity, int tff, int is_field_end, int clip_max)
{
    bwdif_double(dst, prev, cur, next,
                 dst_width, dst_height, dst_pitch,
                 src_width, src_height,
                 parity, tff, is_field_end, clip_max);
}

} /* extern "C" */