ffmpeg/libavcodec/alpha/motion_est_mvi_asm.S

/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "regdef.h"

/* Some nicer register names.  */
/* ta..tc continue the t0..t9 temporary series; td reuses the assembler
   temporary AT, which is why ".set noat" is given below.  */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/* Danger: these overlap with the argument list and the return value */
/* In the code visible in this file only te and tf are used as
   temporaries (aligned path).  tg and th are not referenced: a3 and v0
   are used directly as the row counter and the result accumulator, so
   tg/th must not be used as scratch registers there.  */
#define te a5
#define tf a4
#define tg a3
#define th v0

        /* noat:      AT is used explicitly (as td and for the gprof hook).
           noreorder: the instruction sequences below are hand-scheduled.
           pca56:     selects a CPU that has the MVI extension (perr).  */
        .set noat
        .set noreorder
        .arch pca56
        .text

/*****************************************************************************
 * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
 *
 * Accumulates the perr (MVI "pixel error": sum of absolute differences
 * of the eight byte lanes) of pix1 against pix2 over 16 rows of 16
 * bytes each and returns the total.
 *
 * In:    a0 = pix1       always read with ldq, so every row is expected
 *                        to be 8-byte aligned
 *        a1 = pix2       any alignment; (pix2 & 7) is tested once at
 *                        entry to choose the aligned or unaligned loop
 *        a2 = line_size  byte distance between consecutive rows
 * Out:   v0 = accumulated error
 * Overwrites: t0-t9, ta-td (t10, t11, t12, AT), a0, a1, a3;
 *             the aligned loop additionally overwrites te/tf (a5, a4).
 *
 * NOTE(review): the alignment of pix2 is not re-tested per row, and the
 * unaligned loop takes the extql/extqh shift count from a1 after a1 has
 * already been advanced by two rows.  Both are only consistent if
 * line_size is a multiple of 8 -- confirm against the callers.
 *
 * The row count (16) is hard-coded; the unaligned loop handles 2 rows
 * per iteration and the aligned loop 4.
 *
 * This code is written with a pca56 in mind. For ev6, one should
 * really take the increased latency of 3 cycles for MVI instructions
 * into account.
 *
 * It is important to keep the loading and first use of a register as
 * far apart as possible, because if a register is accessed before it
 * has been fetched from memory, the CPU will stall.
 */
        .align 4
        .globl pix_abs16x16_mvi_asm
        .ent pix_abs16x16_mvi_asm
pix_abs16x16_mvi_asm:
        .frame sp, 0, ra, 0
        .prologue 0

        /* Profiling hook: call _mcount with the return address in AT.  */
#ifdef HAVE_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        and     a1, 7, t0       # t0 = byte offset of pix2 within its quadword
        clr     v0              # error accumulator = 0
        lda     a3, 16          # a3 = rows left to process
        beq     t0, $aligned    # offset 0: use the plain ldq loop
        .align 4
$unaligned:
        /* Two rows per iteration.  Each 16-byte pix2 row straddles three
           aligned quadwords (left_u, mid, right_u) that are fetched with
           ldq_u and then merged into two quadwords with extql/extqh.
           This loop is only entered when (pix2 & 7) != 0.  */
        /* Registers:
           line 0:
           t0:  left_u -> left lo -> left
           t1:  mid
           t2:  right_u -> right hi -> right
           t3:  ref left
           t4:  ref right
           line 1:
           t5:  left_u -> left lo -> left
           t6:  mid
           t7:  right_u -> right hi -> right
           t8:  ref left
           t9:  ref right
           temp:
           ta:  left hi
           tb:  right lo
           tc:  error left
           td:  error right  */

        /* load line 0 */
        ldq_u   t0, 0(a1)       # left_u
        ldq_u   t1, 8(a1)       # mid
        ldq_u   t2, 16(a1)      # right_u
        ldq     t3, 0(a0)       # ref left
        ldq     t4, 8(a0)       # ref right
        addq    a0, a2, a0      # pix1 += line_size
        addq    a1, a2, a1      # pix2 += line_size
        /* load line 1 */
        ldq_u   t5, 0(a1)       # left_u
        ldq_u   t6, 8(a1)       # mid
        ldq_u   t7, 16(a1)      # right_u
        ldq     t8, 0(a0)       # ref left
        ldq     t9, 8(a0)       # ref right
        addq    a0, a2, a0      # pix1 += line_size
        addq    a1, a2, a1      # pix2 += line_size
        /* calc line 0 */
        /* extql/extqh only use the low three bits of a1.  a1 already
           points two rows further on at this point (see the NOTE in the
           function header).  */
        extql   t0, a1, t0      # left lo
        extqh   t1, a1, ta      # left hi
        extql   t1, a1, tb      # right lo
        or      t0, ta, t0      # left
        extqh   t2, a1, t2      # right hi
        perr    t3, t0, tc      # error left
        or      t2, tb, t2      # right
        perr    t4, t2, td      # error right
        addq    v0, tc, v0      # add error left
        addq    v0, td, v0      # add error right
        /* calc line 1 */
        extql   t5, a1, t5      # left lo
        extqh   t6, a1, ta      # left hi
        extql   t6, a1, tb      # right lo
        or      t5, ta, t5      # left
        extqh   t7, a1, t7      # right hi
        perr    t8, t5, tc      # error left
        or      t7, tb, t7      # right
        perr    t9, t7, td      # error right
        addq    v0, tc, v0      # add error left
        addq    v0, td, v0      # add error right
        /* loop */
        subq    a3,  2, a3      # h -= 2
        bne     a3, $unaligned
        ret

        .align 4
$aligned:
        /* Four rows per iteration.  All eight pix2 quadwords and all
           eight pix1 quadwords are loaded first (using t0-t9, ta-td and
           te/tf), then reduced with perr.  The perr results are written
           to t0/t1, which are free again once line 0 has been consumed.  */
        /* load line 0 */
        ldq     t0, 0(a1)       # left
        ldq     t1, 8(a1)       # right
        addq    a1, a2, a1      # pix2 += line_size
        ldq     t2, 0(a0)       # ref left
        ldq     t3, 8(a0)       # ref right
        addq    a0, a2, a0      # pix1 += line_size
        /* load line 1 */
        ldq     t4, 0(a1)       # left
        ldq     t5, 8(a1)       # right
        addq    a1, a2, a1      # pix2 += line_size
        ldq     t6, 0(a0)       # ref left
        ldq     t7, 8(a0)       # ref right
        addq    a0, a2, a0      # pix1 += line_size
        /* load line 2 */
        ldq     t8, 0(a1)       # left
        ldq     t9, 8(a1)       # right
        addq    a1, a2, a1      # pix2 += line_size
        ldq     ta, 0(a0)       # ref left
        ldq     tb, 8(a0)       # ref right
        addq    a0, a2, a0      # pix1 += line_size
        /* load line 3 */
        ldq     tc, 0(a1)       # left
        ldq     td, 8(a1)       # right
        addq    a1, a2, a1      # pix2 += line_size
        ldq     te, 0(a0)       # ref left
        ldq     tf, 8(a0)       # ref right
        /* calc line 0 */
        perr    t0, t2, t0      # error left
        addq    a0, a2, a0      # pix1 += line_size (line 3 step, issued here)
        perr    t1, t3, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 1 */
        perr    t4, t6, t0      # error left
        addq    v0, t1, v0      # add error right (line 0)
        perr    t5, t7, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 2 */
        perr    t8, ta, t0      # error left
        addq    v0, t1, v0      # add error right (line 1)
        perr    t9, tb, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 3 */
        perr    tc, te, t0      # error left
        addq    v0, t1, v0      # add error right (line 2)
        perr    td, tf, t1      # error right
        addq    v0, t0, v0      # add error left
        addq    v0, t1, v0      # add error right (line 3)
        /* loop */
        subq    a3,  4, a3      # h -= 4
        bne     a3, $aligned
        ret
        .end pix_abs16x16_mvi_asm
Add Alpha assembly for pix_abs16x16. Optimized for pca56, no large win on ev6. Originally committed as revision 979 to svn://svn.ffmpeg.org/ffmpeg/trunk 2002-09-29 15:14:28 +00:00			`/*`
			`* Alpha optimized DSP utils`
			`* Copyright (c) 2002 Falk Hueffner <falk@debian.org>`
			`*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program; if not, write to the Free Software`
			`* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.`
			`*/`

			`#include "regdef.h"`

			`/* Some nicer register names. */`
			`#define ta t10`
			`#define tb t11`
			`#define tc t12`
			`#define td AT`
			`/* Danger: these overlap with the argument list and the return value */`
			`#define te a5`
			`#define tf a4`
			`#define tg a3`
			`#define th v0`

			`.set noat`
			`.set noreorder`
			`.arch pca56`
			`.text`

			`/*****************************************************************************`
			`* int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)`
			`*`
			`* This code is written with a pca56 in mind. For ev6, one should`
			`* really take the increased latency of 3 cycles for MVI instructions`
			`* into account.`
			`*`
			`* It is important to keep the loading and first use of a register as`
			`* far apart as possible, because if a register is accessed before it`
			`* has been fetched from memory, the CPU will stall.`
			`*/`
			`.align 4`
			`.globl pix_abs16x16_mvi_asm`
			`.ent pix_abs16x16_mvi_asm`
			`pix_abs16x16_mvi_asm:`
			`.frame sp, 0, ra, 0`
			`.prologue 0`

			`#ifdef HAVE_GPROF`
			`lda AT, _mcount`
			`jsr AT, (AT), _mcount`
			`#endif`

			`and a1, 7, t0`
			`clr v0`
			`lda a3, 16`
			`beq t0, $aligned`
			`.align 4`
			`$unaligned:`
			`/* Registers:`
			`line 0:`
			`t0: left_u -> left lo -> left`
			`t1: mid`
			`t2: right_u -> right hi -> right`
			`t3: ref left`
			`t4: ref right`
			`line 1:`
			`t5: left_u -> left lo -> left`
			`t6: mid`
			`t7: right_u -> right hi -> right`
			`t8: ref left`
			`t9: ref right`
			`temp:`
			`ta: left hi`
			`tb: right lo`
			`tc: error left`
			`td: error right */`

			`/* load line 0 */`
			`ldq_u t0, 0(a1) # left_u`
			`ldq_u t1, 8(a1) # mid`
			`ldq_u t2, 16(a1) # right_u`
			`ldq t3, 0(a0) # ref left`
			`ldq t4, 8(a0) # ref right`
			`addq a0, a2, a0 # pix1`
			`addq a1, a2, a1 # pix2`
			`/* load line 1 */`
			`ldq_u t5, 0(a1) # left_u`
			`ldq_u t6, 8(a1) # mid`
			`ldq_u t7, 16(a1) # right_u`
			`ldq t8, 0(a0) # ref left`
			`ldq t9, 8(a0) # ref right`
			`addq a0, a2, a0 # pix1`
			`addq a1, a2, a1 # pix2`
			`/* calc line 0 */`
			`extql t0, a1, t0 # left lo`
			`extqh t1, a1, ta # left hi`
			`extql t1, a1, tb # right lo`
			`or t0, ta, t0 # left`
			`extqh t2, a1, t2 # right hi`
			`perr t3, t0, tc # error left`
			`or t2, tb, t2 # right`
			`perr t4, t2, td # error right`
			`addq v0, tc, v0 # add error left`
			`addq v0, td, v0 # add error right`
			`/* calc line 1 */`
			`extql t5, a1, t5 # left lo`
			`extqh t6, a1, ta # left hi`
			`extql t6, a1, tb # right lo`
			`or t5, ta, t5 # left`
			`extqh t7, a1, t7 # right hi`
			`perr t8, t5, tc # error left`
			`or t7, tb, t7 # right`
			`perr t9, t7, td # error right`
			`addq v0, tc, v0 # add error left`
			`addq v0, td, v0 # add error right`
			`/* loop */`
			`subq a3, 2, a3 # h -= 2`
			`bne a3, $unaligned`
			`ret`

			`.align 4`
			`$aligned:`
			`/* load line 0 */`
			`ldq t0, 0(a1) # left`
			`ldq t1, 8(a1) # right`
			`addq a1, a2, a1 # pix2`
			`ldq t2, 0(a0) # ref left`
			`ldq t3, 8(a0) # ref right`
			`addq a0, a2, a0 # pix1`
			`/* load line 1 */`
			`ldq t4, 0(a1) # left`
			`ldq t5, 8(a1) # right`
			`addq a1, a2, a1 # pix2`
			`ldq t6, 0(a0) # ref left`
			`ldq t7, 8(a0) # ref right`
			`addq a0, a2, a0 # pix1`
			`/* load line 2 */`
			`ldq t8, 0(a1) # left`
			`ldq t9, 8(a1) # right`
			`addq a1, a2, a1 # pix2`
			`ldq ta, 0(a0) # ref left`
			`ldq tb, 8(a0) # ref right`
			`addq a0, a2, a0 # pix1`
			`/* load line 3 */`
			`ldq tc, 0(a1) # left`
			`ldq td, 8(a1) # right`
			`addq a1, a2, a1 # pix2`
			`ldq te, 0(a0) # ref left`
			`ldq tf, 8(a0) # ref right`
			`/* calc line 0 */`
			`perr t0, t2, t0 # error left`
			`addq a0, a2, a0 # pix1`
			`perr t1, t3, t1 # error right`
			`addq v0, t0, v0 # add error left`
			`/* calc line 1 */`
			`perr t4, t6, t0 # error left`
			`addq v0, t1, v0 # add error right`
			`perr t5, t7, t1 # error right`
			`addq v0, t0, v0 # add error left`
			`/* calc line 2 */`
			`perr t8, ta, t0 # error left`
			`addq v0, t1, v0 # add error right`
			`perr t9, tb, t1 # error right`
			`addq v0, t0, v0 # add error left`
			`/* calc line 3 */`
			`perr tc, te, t0 # error left`
			`addq v0, t1, v0 # add error right`
			`perr td, tf, t1 # error right`
			`addq v0, t0, v0 # add error left`
			`addq v0, t1, v0 # add error right`
			`/* loop */`
			`subq a3, 4, a3 # h -= 4`
			`bne a3, $aligned`
			`ret`
			`.end pix_abs16x16_mvi_asm`