From b95d19b02022ea9590bee571aa245ab93f37d152 Mon Sep 17 00:00:00 2001
From: Reinhard Tartler <siretart@tauware.de>
Date: Mon, 13 Jun 2011 08:56:54 +0200
Subject: [PATCH 01/24] Add some (important) changelog entries

---
 Changelog | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/Changelog b/Changelog
index da948c6d73..38f07b1440 100644
--- a/Changelog
+++ b/Changelog
@@ -5,6 +5,10 @@ releases are sorted from youngest to oldest.
 version <next>:
 
 - E-AC-3 audio encoder
+- ac3enc: add channel coupling support
+- floating-point sample format support to the ac3, eac3, dca, aac, and vorbis decoders.
+- H264/MPEG frame-level multi-threading
+- All av_metadata_* functions renamed to av_dict_* and moved to libavutil
 
 
 version 0.7_beta2:

From 5c46ad1da049f16e670d2549161c244c6ddd68ec Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Sat, 11 Jun 2011 22:53:32 +0100
Subject: [PATCH 02/24] ARM: optimised mpadsp_apply_window_fixed

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/arm/Makefile                   |   3 +
 libavcodec/arm/mpegaudiodsp_fixed_armv6.S | 143 ++++++++++++++++++++++
 libavcodec/arm/mpegaudiodsp_init_arm.c    |  33 +++++
 libavcodec/mpegaudiodsp.c                 |   1 +
 libavcodec/mpegaudiodsp.h                 |   1 +
 5 files changed, 181 insertions(+)
 create mode 100644 libavcodec/arm/mpegaudiodsp_fixed_armv6.S
 create mode 100644 libavcodec/arm/mpegaudiodsp_init_arm.c

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index a5abfdd128..3374f0e2bd 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -5,6 +5,9 @@ OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o         \
 
 ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
 
+OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
+ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
+
 OBJS-$(CONFIG_VP5_DECODER)             += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER)             += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP8_DECODER)             += arm/vp8dsp_init_arm.o
diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
new file mode 100644
index 0000000000..9ec731480b
--- /dev/null
+++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+.macro  skip            args:vararg
+.endm
+
+.macro  sum8            lo,  hi,  w, p, t1, t2, t3, t4, rsb=skip, offs=0
+        ldr             \t1, [\w, #4*\offs]
+        ldr             \t2, [\p, #4]!
+        \rsb            \t1, \t1, #0
+  .irpc i, 135
+        ldr             \t3, [\w, #4*64*\i+4*\offs]
+        ldr             \t4, [\p, #4*64*\i]
+        smlal           \lo, \hi, \t1, \t2
+        \rsb            \t3, \t3, #0
+        ldr             \t1, [\w, #4*64*(\i+1)+4*\offs]
+        ldr             \t2, [\p, #4*64*(\i+1)]
+        smlal           \lo, \hi, \t3, \t4
+        \rsb            \t1, \t1, #0
+  .endr
+        ldr             \t3, [\w, #4*64*7+4*\offs]
+        ldr             \t4, [\p, #4*64*7]
+        smlal           \lo, \hi, \t1, \t2
+        \rsb            \t3, \t3, #0
+        smlal           \lo, \hi, \t3, \t4
+.endm
+
+.macro  round           rd,  lo,  hi
+        lsr             \rd, \lo, #24
+        bic             \lo, \lo, #0xff000000
+        orr             \rd, \rd, \hi, lsl #8
+        mov             \hi, #0
+        ssat            \rd, #16, \rd
+.endm
+
+function ff_mpadsp_apply_window_fixed_armv6, export=1
+        push            {r2,r4-r11,lr}
+
+        add             r4,  r0,  #4*512        @ synth_buf + 512
+    .rept 4
+        ldm             r0!, {r5-r12}
+        stm             r4!, {r5-r12}
+    .endr
+
+        ldr             r4,  [sp, #40]          @ incr
+        sub             r0,  r0,  #4*17         @ synth_buf + 16
+        ldr             r8,  [r2]               @ sum:low
+        add             r2,  r0,  #4*32         @ synth_buf + 48
+        rsb             r5,  r4,  r4,  lsl #5   @ 31 * incr
+        lsl             r4,  r4,  #1
+        asr             r9,  r8,  #31           @ sum:high
+        add             r5,  r3,  r5,  lsl #1   @ samples2
+        add             r6,  r1,  #4*32         @ w2
+        str             r4,  [sp, #40]
+
+        sum8            r8,  r9,  r1,  r0,  r10, r11, r12, lr
+        sum8            r8,  r9,  r1,  r2,  r10, r11, r12, lr, rsb, 32
+        round           r10, r8,  r9
+        strh            r10, [r3], r4
+
+        mov             lr,  #15
+1:
+        ldr             r12, [r0, #4]!
+        ldr             r11, [r6, #-4]!
+        ldr             r10, [r1, #4]!
+  .irpc i, 0246
+    .if \i
+        ldr             r11, [r6, #4*64*\i]
+        ldr             r10, [r1, #4*64*\i]
+    .endif
+        rsb             r11, r11, #0
+        smlal           r8,  r9,  r10, r12
+        ldr             r10, [r0, #4*64*(\i+1)]
+    .ifeq \i
+        smull           r4, r7, r11, r12
+    .else
+        smlal           r4, r7, r11, r12
+    .endif
+        ldr             r11, [r6, #4*64*(\i+1)]
+        ldr             r12, [r1, #4*64*(\i+1)]
+        rsb             r11, r11, #0
+        smlal           r8,  r9,  r12, r10
+    .iflt \i-6
+        ldr             r12, [r0, #4*64*(\i+2)]
+    .else
+        ldr             r12, [r2, #-4]!
+    .endif
+        smlal           r4,  r7,  r11, r10
+  .endr
+  .irpc i, 0246
+        ldr             r10, [r1, #4*64*\i+4*32]
+        rsb             r12, r12, #0
+        ldr             r11, [r6, #4*64*\i+4*32]
+        smlal           r8,  r9,  r10, r12
+        ldr             r10, [r2, #4*64*(\i+1)]
+        smlal           r4,  r7,  r11, r12
+        ldr             r12, [r1, #4*64*(\i+1)+4*32]
+        rsb             r10, r10, #0
+        ldr             r11, [r6, #4*64*(\i+1)+4*32]
+        smlal           r8,  r9,  r12, r10
+    .iflt \i-6
+        ldr             r12, [r2, #4*64*(\i+2)]
+    .else
+        ldr             r12, [sp, #40]
+    .endif
+        smlal           r4,  r7,  r11, r10
+  .endr
+        round           r10, r8,  r9
+        adds            r8,  r8,  r4
+        adc             r9,  r9,  r7
+        strh            r10, [r3], r12
+        round           r11, r8,  r9
+        subs            lr,  lr,  #1
+        strh            r11, [r5], -r12
+        bgt             1b
+
+        sum8            r8,  r9,  r1,  r0,  r10, r11, r12, lr, rsb, 33
+        pop             {r4}
+        round           r10, r8,  r9
+        str             r8,  [r4]
+        strh            r10, [r3]
+
+        pop             {r4-r11,pc}
+endfunc
diff --git a/libavcodec/arm/mpegaudiodsp_init_arm.c b/libavcodec/arm/mpegaudiodsp_init_arm.c
new file mode 100644
index 0000000000..94a55787ad
--- /dev/null
+++ b/libavcodec/arm/mpegaudiodsp_init_arm.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include "libavcodec/mpegaudiodsp.h"
+#include "config.h"
+
+void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
+                                        int *dither, int16_t *out, int incr);
+
+void ff_mpadsp_init_arm(MPADSPContext *s)
+{
+    if (HAVE_ARMV6) {
+        s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6;
+    }
+}
diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c
index 57fe962b91..438b097d06 100644
--- a/libavcodec/mpegaudiodsp.c
+++ b/libavcodec/mpegaudiodsp.c
@@ -35,6 +35,7 @@ void ff_mpadsp_init(MPADSPContext *s)
     s->dct32_float = dct.dct32;
     s->dct32_fixed = ff_dct32_fixed;
 
+    if (ARCH_ARM)     ff_mpadsp_init_arm(s);
     if (HAVE_MMX)     ff_mpadsp_init_mmx(s);
     if (HAVE_ALTIVEC) ff_mpadsp_init_altivec(s);
 }
diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h
index a47019cc4b..8a18db8325 100644
--- a/libavcodec/mpegaudiodsp.h
+++ b/libavcodec/mpegaudiodsp.h
@@ -47,6 +47,7 @@ void ff_mpa_synth_filter_float(MPADSPContext *s,
                                float *samples, int incr,
                                float *sb_samples);
 
+void ff_mpadsp_init_arm(MPADSPContext *s);
 void ff_mpadsp_init_mmx(MPADSPContext *s);
 void ff_mpadsp_init_altivec(MPADSPContext *s);
 

From 77cdfde73e91cdbcc82cdec6b8fec6f646b02782 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Mon, 13 Jun 2011 09:24:27 +0100
Subject: [PATCH 03/24] ARM: jrevdct_arm: misc cleanup

- use 'const' macro to define coeff table
- add missing endfunc
- remove superflous directives

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/arm/jrevdct_arm.S | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/libavcodec/arm/jrevdct_arm.S b/libavcodec/arm/jrevdct_arm.S
index 4fcf35101d..f7fd5c7bde 100644
--- a/libavcodec/arm/jrevdct_arm.S
+++ b/libavcodec/arm/jrevdct_arm.S
@@ -54,18 +54,15 @@
 #define FIX_M_1_961570560_ID   40
 #define FIX_M_2_562915447_ID   44
 #define FIX_0xFFFF_ID          48
-        .text
-        .align
 
 function ff_j_rev_dct_arm, export=1
         stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs
-
         sub sp, sp, #4                  @ reserve some space on the stack
         str r0, [ sp ]                  @ save the DCT pointer to the stack
 
         mov lr, r0                      @ lr = pointer to the current row
         mov r12, #8                     @ r12 = row-counter
-        adr r11, const_array            @ r11 = base pointer to the constants array
+        movrel r11, const_array         @ r11 = base pointer to the constants array
 row_loop:
         ldrsh r0, [lr, # 0]             @ r0 = 'd0'
         ldrsh r2, [lr, # 2]             @ r2 = 'd2'
@@ -370,9 +367,9 @@ the_end:
         @ The end....
         add sp, sp, #4
         ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return
+endfunc
 
-const_array:
-        .align
+const const_array
         .word FIX_0_298631336
         .word FIX_0_541196100
         .word FIX_0_765366865
@@ -386,3 +383,4 @@ const_array:
         .word FIX_M_1_961570560
         .word FIX_M_2_562915447
         .word FIX_0xFFFF
+endconst

From 13743c7ab00be270a3a38c8ee6442336774e335c Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Mon, 13 Jun 2011 09:30:42 +0100
Subject: [PATCH 04/24] ARM: jrevdct_arm: use push/pop mnemonics

Use push/pop instead of stmdb/ldmia for stack operations.  This
is the preferred syntax.

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/arm/jrevdct_arm.S | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/arm/jrevdct_arm.S b/libavcodec/arm/jrevdct_arm.S
index f7fd5c7bde..08d42d7758 100644
--- a/libavcodec/arm/jrevdct_arm.S
+++ b/libavcodec/arm/jrevdct_arm.S
@@ -56,7 +56,7 @@
 #define FIX_0xFFFF_ID          48
 
 function ff_j_rev_dct_arm, export=1
-        stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs
+        push {r4 - r12, lr}
         sub sp, sp, #4                  @ reserve some space on the stack
         str r0, [ sp ]                  @ save the DCT pointer to the stack
 
@@ -99,7 +99,7 @@ row_loop:
         add r4, r6, r3, lsl #13             @ r4 = tmp11
         rsb r3, r6, r3, lsl #13             @ r3 = tmp12
 
-        stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
+        push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11
 
         ldrsh r3, [lr, #10]             @ r3 = 'd3'
         ldrsh r5, [lr, #12]             @ r5 = 'd5'
@@ -133,8 +133,8 @@ row_loop:
         add r3, r3, r4                  @ r3 = tmp2
         add r1, r1, r6                  @ r1 = tmp3
 
-        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
-                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
+        pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
+                             @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
 
         @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
         add r8, r0, r1
@@ -242,7 +242,7 @@ column_loop:
         orrs r10, r9, r10
         beq empty_odd_column
 
-        stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11
+        push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11
 
         add r0, r3, r5                  @ r0 = 'z2'
         add r2, r1, r7                  @ r2 = 'z1'
@@ -272,8 +272,8 @@ column_loop:
         add r3, r3, r4                  @ r3 = tmp2
         add r1, r1, r6                  @ r1 = tmp3
 
-        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
-                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
+        pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
+                             @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
 
         @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
         add r8, r0, r1
@@ -366,7 +366,7 @@ empty_odd_column:
 the_end:
         @ The end....
         add sp, sp, #4
-        ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return
+        pop {r4 - r12, pc}
 endfunc
 
 const const_array

From 9776e25db9a43e77e9b091c012bf16267d9559d7 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Mon, 13 Jun 2011 09:33:20 +0100
Subject: [PATCH 05/24] ARM: jrevdct_arm: simplify stack usage

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/arm/jrevdct_arm.S | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/libavcodec/arm/jrevdct_arm.S b/libavcodec/arm/jrevdct_arm.S
index 08d42d7758..93cbbbe8eb 100644
--- a/libavcodec/arm/jrevdct_arm.S
+++ b/libavcodec/arm/jrevdct_arm.S
@@ -56,9 +56,7 @@
 #define FIX_0xFFFF_ID          48
 
 function ff_j_rev_dct_arm, export=1
-        push {r4 - r12, lr}
-        sub sp, sp, #4                  @ reserve some space on the stack
-        str r0, [ sp ]                  @ save the DCT pointer to the stack
+        push {r0, r4 - r11, lr}
 
         mov lr, r0                      @ lr = pointer to the current row
         mov r12, #8                     @ r12 = row-counter
@@ -208,7 +206,7 @@ end_of_row_loop:
 
 start_column_loop:
         @ Start of column loop
-        ldr lr, [ sp ]
+        pop {lr}
         mov r12, #8
 column_loop:
         ldrsh r0, [lr, #( 0*8)]             @ r0 = 'd0'
@@ -365,8 +363,7 @@ empty_odd_column:
 
 the_end:
         @ The end....
-        add sp, sp, #4
-        pop {r4 - r12, pc}
+        pop {r4 - r11, pc}
 endfunc
 
 const const_array

From 294e5475c2cf4bb7a0db34931515fddc5732c266 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Fri, 1 Apr 2011 12:46:36 +0200
Subject: [PATCH 06/24] ffv1: fix undefined behavior with insane widths.

The new tables is large enough to prevent this together with our image size checks.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/bitstream.c | 6 ++++--
 libavcodec/ffv1.c      | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/libavcodec/bitstream.c b/libavcodec/bitstream.c
index b593db55ce..17b1c69c84 100644
--- a/libavcodec/bitstream.c
+++ b/libavcodec/bitstream.c
@@ -32,11 +32,13 @@
 #include "get_bits.h"
 #include "put_bits.h"
 
-const uint8_t ff_log2_run[32]={
+const uint8_t ff_log2_run[41]={
  0, 0, 0, 0, 1, 1, 1, 1,
  2, 2, 2, 2, 3, 3, 3, 3,
  4, 4, 5, 5, 6, 6, 7, 7,
- 8, 9,10,11,12,13,14,15
+ 8, 9,10,11,12,13,14,15,
+16,17,18,19,20,21,22,23,
+24,
 };
 
 void align_put_bits(PutBitContext *s)
diff --git a/libavcodec/ffv1.c b/libavcodec/ffv1.c
index 53edbb3459..50f1062ad4 100644
--- a/libavcodec/ffv1.c
+++ b/libavcodec/ffv1.c
@@ -40,7 +40,7 @@
 #define MAX_QUANT_TABLES 8
 #define MAX_CONTEXT_INPUTS 5
 
-extern const uint8_t ff_log2_run[32];
+extern const uint8_t ff_log2_run[41];
 
 static const int8_t quant3[256]={
  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

From a31d4b3a9929e31eb2df949cab09422267f5a5e3 Mon Sep 17 00:00:00 2001
From: Peter Ross <pross@xvid.org>
Date: Fri, 1 Apr 2011 23:11:24 +1100
Subject: [PATCH 07/24] img2: add .dpx to the list of supported file
 extensions.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/img2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavformat/img2.c b/libavformat/img2.c
index 4e82aa301b..ec37a38845 100644
--- a/libavformat/img2.c
+++ b/libavformat/img2.c
@@ -496,7 +496,7 @@ AVInputFormat ff_image2pipe_demuxer = {
 AVOutputFormat ff_image2_muxer = {
     .name           = "image2",
     .long_name      = NULL_IF_CONFIG_SMALL("image2 sequence"),
-    .extensions     = "bmp,jpeg,jpg,ljpg,pam,pbm,pcx,pgm,pgmyuv,png,"
+    .extensions     = "bmp,dpx,jpeg,jpg,ljpg,pam,pbm,pcx,pgm,pgmyuv,png,"
                       "ppm,sgi,tga,tif,tiff,jp2",
     .priv_data_size = sizeof(VideoData),
     .video_codec    = CODEC_ID_MJPEG,

From 86961eeabf06ba4ef920789cb7596e8590e82713 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= <Reimar.Doeffinger@gmx.de>
Date: Fri, 1 Apr 2011 19:42:31 +0200
Subject: [PATCH 08/24] ac3dec: fix doxy-style for comment ("///>" should be
 "///<" instead).

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/ac3dec.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index f0ab75ae98..590bee6b6d 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -194,7 +194,7 @@ typedef struct {
 ///@}
 
 ///@defgroup arrays aligned arrays
-    DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///> fixed-point transform coefficients
+    DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///< fixed-point transform coefficients
     DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
     DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
     DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients

From 33aec3f402c07e417ac1492717ab746927253727 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Sat, 2 Apr 2011 17:57:53 +0200
Subject: [PATCH 09/24] h264: change a few comments into error messages

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/h264.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 0aa923fdbb..e1fcb62a7b 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -2339,8 +2339,10 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
         MPV_common_end(s);
     }
     if (!s->context_initialized) {
-        if(h != h0)
-            return -1;  // we cant (re-)initialize context during parallel decoding
+        if (h != h0) {
+            av_log(h->s.avctx, AV_LOG_ERROR, "Cannot (re-)initialize context during parallel decoding.\n");
+            return -1;
+        }
 
         avcodec_set_dimensions(s->avctx, s->width, s->height);
         s->avctx->sample_aspect_ratio= h->sps.sar;
@@ -2384,8 +2386,10 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
 
         s->avctx->hwaccel = ff_find_hwaccel(s->avctx->codec->id, s->avctx->pix_fmt);
 
-        if (MPV_common_init(s) < 0)
+        if (MPV_common_init(s) < 0) {
+            av_log(h->s.avctx, AV_LOG_ERROR, "MPV_common_init() failed.\n");
             return -1;
+        }
         s->first_field = 0;
         h->prev_interlaced_frame = 1;
 
@@ -2393,8 +2397,10 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
         ff_h264_alloc_tables(h);
 
         if (!HAVE_THREADS || !(s->avctx->active_thread_type&FF_THREAD_SLICE)) {
-            if (context_init(h) < 0)
+            if (context_init(h) < 0) {
+                av_log(h->s.avctx, AV_LOG_ERROR, "context_init() failed.\n");
                 return -1;
+            }
         } else {
             for(i = 1; i < s->avctx->thread_count; i++) {
                 H264Context *c;
@@ -2410,8 +2416,10 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
             }
 
             for(i = 0; i < s->avctx->thread_count; i++)
-                if(context_init(h->thread_context[i]) < 0)
+                if (context_init(h->thread_context[i]) < 0) {
+                    av_log(h->s.avctx, AV_LOG_ERROR, "context_init() failed.\n");
                     return -1;
+                }
         }
     }
 
@@ -2706,8 +2714,10 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
                 h0->single_decode_warning = 1;
             }
-            if(h != h0)
-                return 1; // deblocking switched inside frame
+            if (h != h0) {
+                av_log(h->s.avctx, AV_LOG_ERROR, "Deblocking switched inside frame.\n");
+                return 1;
+            }
         }
     }
     h->qp_thresh= 15 + 52 - FFMIN(h->slice_alpha_c0_offset, h->slice_beta_offset) - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);

From 3de33b00de219a749f3e5c9d1f1f68d7cfa58502 Mon Sep 17 00:00:00 2001
From: Wim Lewis <wiml@hhhh.org>
Date: Sat, 2 Apr 2011 15:30:24 -0700
Subject: [PATCH 10/24] avcodec.h: add or elaborate on some documentation
 comments.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/avcodec.h | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index ce3a4a6b94..a70d8adb90 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -3253,7 +3253,9 @@ void av_resample_compensate(struct AVResampleContext *c, int sample_delta, int c
 void av_resample_close(struct AVResampleContext *c);
 
 /**
- * Allocate memory for a picture.  Call avpicture_free to free it.
+ * Allocate memory for a picture.  Call avpicture_free() to free it.
+ *
+ * \see avpicture_fill()
  *
  * @param picture the picture to be filled in
  * @param pix_fmt the format of the picture
@@ -3265,6 +3267,8 @@ int avpicture_alloc(AVPicture *picture, enum PixelFormat pix_fmt, int width, int
 
 /**
  * Free a picture previously allocated by avpicture_alloc().
+ * The data buffer used by the AVPicture is freed, but the AVPicture structure
+ * itself is not.
  *
  * @param picture the AVPicture to be freed
  */
@@ -3280,6 +3284,9 @@ void avpicture_free(AVPicture *picture);
  * will be stored in the lines_sizes array.
  * Call with ptr == NULL to get the required size for the ptr buffer.
  *
+ * To allocate the buffer and fill in the AVPicture fields in one call,
+ * use avpicture_alloc().
+ *
  * @param picture AVPicture whose fields are to be filled in
  * @param ptr Buffer which will contain or contains the actual image data
  * @param pix_fmt The format in which the picture data is stored.
@@ -3289,6 +3296,22 @@ void avpicture_free(AVPicture *picture);
  */
 int avpicture_fill(AVPicture *picture, uint8_t *ptr,
                    enum PixelFormat pix_fmt, int width, int height);
+
+/**
+ * Copy pixel data from an AVPicture into a buffer.
+ * The data is stored compactly, without any gaps for alignment or padding
+ * which may be applied by avpicture_fill().
+ *
+ * \see avpicture_get_size()
+ *
+ * @param[in] src AVPicture containing image data
+ * @param[in] pix_fmt The format in which the picture data is stored.
+ * @param[in] width the width of the image in pixels.
+ * @param[in] height the height of the image in pixels.
+ * @param[out] dest A buffer into which picture data will be copied.
+ * @param[in] dest_size The size of 'dest'.
+ * @return The number of bytes written to dest, or a negative value (error code) on error.
+ */
 int avpicture_layout(const AVPicture* src, enum PixelFormat pix_fmt, int width, int height,
                      unsigned char *dest, int dest_size);
 
@@ -3296,8 +3319,8 @@ int avpicture_layout(const AVPicture* src, enum PixelFormat pix_fmt, int width,
  * Calculate the size in bytes that a picture of the given width and height
  * would occupy if stored in the given picture format.
  * Note that this returns the size of a compact representation as generated
- * by avpicture_layout, which can be smaller than the size required for e.g.
- * avpicture_fill.
+ * by avpicture_layout(), which can be smaller than the size required for e.g.
+ * avpicture_fill().
  *
  * @param pix_fmt the given picture format
  * @param width the width of the image
@@ -3426,16 +3449,19 @@ const char *avcodec_license(void);
 
 /**
  * Initialize libavcodec.
+ * If called more than once, does nothing.
  *
  * @warning This function must be called before any other libavcodec
  * function.
+ *
+ * @warning This function is not thread-safe.
  */
 void avcodec_init(void);
 
 /**
  * Register the codec codec and initialize libavcodec.
  *
- * @see avcodec_init()
+ * @see avcodec_init(), avcodec_register_all()
  */
 void avcodec_register(AVCodec *codec);
 
@@ -3615,7 +3641,7 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
  * @param avctx The context which will be set up to use the given codec.
  * @param codec The codec to use within the context.
  * @return zero on success, a negative value on error
- * @see avcodec_alloc_context, avcodec_find_decoder, avcodec_find_encoder
+ * @see avcodec_alloc_context, avcodec_find_decoder, avcodec_find_encoder, avcodec_close
  */
 int avcodec_open(AVCodecContext *avctx, AVCodec *codec);
 

From 108f318d908c552d88bc7570515e4ddb9ea45e3d Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Tue, 1 Mar 2011 15:59:00 +0100
Subject: [PATCH 11/24] h264: don't be so picky on decoding pps in extradata.

Fixes issue2517

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/h264.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index e1fcb62a7b..276d6e6d6c 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -995,7 +995,7 @@ int ff_h264_decode_extradata(H264Context *h)
         cnt = *(p++); // Number of pps
         for (i = 0; i < cnt; i++) {
             nalsize = AV_RB16(p) + 2;
-            if(decode_nal_units(h, p, nalsize)  != nalsize) {
+            if (decode_nal_units(h, p, nalsize) < 0) {
                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
                 return -1;
             }

From a8fd2f4e0238d6ddde0db28c5bb4b39d2f98d4ed Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Tue, 8 Mar 2011 22:39:14 +0100
Subject: [PATCH 12/24] lavf: initialise reference_dts in
 av_estimate_timings_from_pts.

Fixes issue2437.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/utils.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavformat/utils.c b/libavformat/utils.c
index dc3b9d8fb8..d0fd0d46ff 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -1933,6 +1933,7 @@ static void av_estimate_timings_from_pts(AVFormatContext *ic, int64_t old_offset
         st= ic->streams[i];
         st->cur_dts= st->first_dts;
         st->last_IP_pts = AV_NOPTS_VALUE;
+        st->reference_dts = AV_NOPTS_VALUE;
     }
 }
 

From 8d0786ec6d066f892f29da6593e99e73a7dfd014 Mon Sep 17 00:00:00 2001
From: Carl Eugen Hoyos <cehoyos@ag.or.at>
Date: Sun, 3 Apr 2011 22:45:16 +0200
Subject: [PATCH 13/24] wav: remove an invalid free().

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/wav.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libavformat/wav.c b/libavformat/wav.c
index 21374e8b93..92c9bfcbc4 100644
--- a/libavformat/wav.c
+++ b/libavformat/wav.c
@@ -52,7 +52,6 @@ static int wav_write_header(AVFormatContext *s)
     if (ff_put_wav_header(pb, s->streams[0]->codec) < 0) {
         av_log(s, AV_LOG_ERROR, "%s codec not supported in WAVE format\n",
                s->streams[0]->codec->codec ? s->streams[0]->codec->codec->name : "NONE");
-        av_free(wav);
         return -1;
     }
     ff_end_tag(pb, fmt);

From a26ce1e2df102ad085cf1a7891722ef64b80ea24 Mon Sep 17 00:00:00 2001
From: Philip Langdale <philipl@overt.org>
Date: Mon, 28 Mar 2011 21:42:02 -0700
Subject: [PATCH 14/24] h264_parser: Fix behaviour when
 PARSER_FLAG_COMPLETE_FRAMES is set.

Currently, the parser is buggy and only processes the stream extradata
when the flag is set. This fixes it to actually inspect the frames.

Whitespce will be fixed in a separate change.

Signed-off-by: Philip Langdale <philipl@overt.org>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/h264_parser.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index 621ff02925..a3149534a8 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -270,6 +270,7 @@ static int h264_parse(AVCodecParserContext *s,
             assert(pc->last_index + next >= 0 );
             ff_h264_find_frame_end(h, &pc->buffer[pc->last_index + next], -next); //update state
         }
+    }
 
         parse_nal_units(s, avctx, buf, buf_size);
 
@@ -285,7 +286,6 @@ static int h264_parse(AVCodecParserContext *s,
         if (s->flags & PARSER_FLAG_ONCE) {
             s->flags &= PARSER_FLAG_COMPLETE_FRAMES;
         }
-    }
 
     *poutbuf = buf;
     *poutbuf_size = buf_size;

From 25f05ddb1af4bd398fa92cd135e48fafe23bd92a Mon Sep 17 00:00:00 2001
From: Philip Langdale <philipl@overt.org>
Date: Mon, 28 Mar 2011 21:43:23 -0700
Subject: [PATCH 15/24] h264_parser: Fix whitespace after previous change.

Signed-off-by: Philip Langdale <philipl@overt.org>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/h264_parser.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index a3149534a8..c39baeb739 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -272,20 +272,21 @@ static int h264_parse(AVCodecParserContext *s,
         }
     }
 
-        parse_nal_units(s, avctx, buf, buf_size);
+    parse_nal_units(s, avctx, buf, buf_size);
 
-        if (h->sei_cpb_removal_delay >= 0) {
-            s->dts_sync_point    = h->sei_buffering_period_present;
-            s->dts_ref_dts_delta = h->sei_cpb_removal_delay;
-            s->pts_dts_delta     = h->sei_dpb_output_delay;
-        } else {
-            s->dts_sync_point    = INT_MIN;
-            s->dts_ref_dts_delta = INT_MIN;
-            s->pts_dts_delta     = INT_MIN;
-        }
-        if (s->flags & PARSER_FLAG_ONCE) {
-            s->flags &= PARSER_FLAG_COMPLETE_FRAMES;
-        }
+    if (h->sei_cpb_removal_delay >= 0) {
+        s->dts_sync_point    = h->sei_buffering_period_present;
+        s->dts_ref_dts_delta = h->sei_cpb_removal_delay;
+        s->pts_dts_delta     = h->sei_dpb_output_delay;
+    } else {
+        s->dts_sync_point    = INT_MIN;
+        s->dts_ref_dts_delta = INT_MIN;
+        s->pts_dts_delta     = INT_MIN;
+    }
+
+    if (s->flags & PARSER_FLAG_ONCE) {
+        s->flags &= PARSER_FLAG_COMPLETE_FRAMES;
+    }
 
     *poutbuf = buf;
     *poutbuf_size = buf_size;

From c9c493872c385cff304438ee404e38e55f04af28 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Fri, 3 Jun 2011 01:12:28 -0700
Subject: [PATCH 16/24] 4:4:4 H.264 decoding support

Note: this is 4:4:4 from the 2007 spec revision, not the previous (now deprecated) 4:4:4 mode in H.264.
---
 libavcodec/dsputil.h               |   2 +-
 libavcodec/dsputil_template.c      |   6 +-
 libavcodec/h264.c                  | 760 +++++++++++++++++---------
 libavcodec/h264.h                  | 181 ++++---
 libavcodec/h264_cabac.c            | 819 +++++++++++++++++++++++++----
 libavcodec/h264_cavlc.c            | 198 ++++---
 libavcodec/h264_loopfilter.c       |  70 ++-
 libavcodec/h264_ps.c               |  14 +-
 libavcodec/h264dsp.h               |   8 +-
 libavcodec/h264idct_template.c     |  44 +-
 libavcodec/mpegvideo.c             |  25 +-
 libavcodec/snow.c                  |   6 +-
 libavcodec/x86/dsputil_mmx.c       |   4 +-
 libavcodec/x86/h264_i386.h         |  15 +-
 libavcodec/x86/h264_idct.asm       |  44 +-
 libavcodec/x86/h264_idct_10bit.asm |  35 +-
 16 files changed, 1595 insertions(+), 636 deletions(-)

diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index cfc574aebb..7a28b06fd5 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -507,7 +507,7 @@ typedef struct DSPContext {
 #define BASIS_SHIFT 16
 #define RECON_SHIFT 6
 
-    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int sides);
+    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides);
 #define EDGE_WIDTH 16
 #define EDGE_TOP    1
 #define EDGE_BOTTOM 2
diff --git a/libavcodec/dsputil_template.c b/libavcodec/dsputil_template.c
index 8ca6d3e414..b85931856a 100644
--- a/libavcodec/dsputil_template.c
+++ b/libavcodec/dsputil_template.c
@@ -79,7 +79,7 @@ static inline void FUNC(copy_block16)(uint8_t *dst, const uint8_t *src, int dstS
 
 /* draw the edges of width 'w' of an image of size width, height */
 //FIXME check that this is ok for mpeg4 interlaced
-static void FUNCC(draw_edges)(uint8_t *_buf, int _wrap, int width, int height, int w, int sides)
+static void FUNCC(draw_edges)(uint8_t *_buf, int _wrap, int width, int height, int w, int h, int sides)
 {
     pixel *buf = (pixel*)_buf;
     int wrap = _wrap / sizeof(pixel);
@@ -106,10 +106,10 @@ static void FUNCC(draw_edges)(uint8_t *_buf, int _wrap, int width, int height, i
     buf -= w;
     last_line = buf + (height - 1) * wrap;
     if (sides & EDGE_TOP)
-        for(i = 0; i < w; i++)
+        for(i = 0; i < h; i++)
             memcpy(buf - (i + 1) * wrap, buf, (width + w + w) * sizeof(pixel)); // top
     if (sides & EDGE_BOTTOM)
-        for (i = 0; i < w; i++)
+        for (i = 0; i < h; i++)
             memcpy(last_line + (i + 1) * wrap, last_line, (width + w + w) * sizeof(pixel)); // bottom
 }
 
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 276d6e6d6c..86ea218807 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -451,12 +451,13 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int src_x_offset, int src_y_offset,
                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
-                           int pixel_shift){
+                           int pixel_shift, int chroma444){
     MpegEncContext * const s = &h->s;
     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
     const int luma_xy= (mx&3) + ((my&3)<<2);
-    uint8_t * src_y = pic->data[0] + ((mx>>2) << pixel_shift) + (my>>2)*h->mb_linesize;
+    int offset = ((mx>>2) << pixel_shift) + (my>>2)*h->mb_linesize;
+    uint8_t * src_y = pic->data[0] + offset;
     uint8_t * src_cb, * src_cr;
     int extra_width= h->emu_edge_width;
     int extra_height= h->emu_edge_height;
@@ -483,6 +484,31 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
     }
 
+    if(chroma444){
+        src_cb = pic->data[1] + offset;
+        if(emu){
+            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
+                                    16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
+            src_cb= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
+        }
+        qpix_op[luma_xy](dest_cb, src_cb, h->mb_linesize); //FIXME try variable height perhaps?
+        if(!square){
+            qpix_op[luma_xy](dest_cb + delta, src_cb + delta, h->mb_linesize);
+        }
+
+        src_cr = pic->data[2] + offset;
+        if(emu){
+            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
+                                    16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
+            src_cr= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
+        }
+        qpix_op[luma_xy](dest_cr, src_cr, h->mb_linesize); //FIXME try variable height perhaps?
+        if(!square){
+            qpix_op[luma_xy](dest_cr + delta, src_cr + delta, h->mb_linesize);
+        }
+        return;
+    }
+
     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
 
     if(MB_FIELD){
@@ -511,14 +537,19 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-                           int list0, int list1, int pixel_shift){
+                           int list0, int list1, int pixel_shift, int chroma444){
     MpegEncContext * const s = &h->s;
     qpel_mc_func *qpix_op=  qpix_put;
     h264_chroma_mc_func chroma_op= chroma_put;
 
-    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->  mb_linesize;
-    dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
-    dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
+    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
+    if(chroma444){
+        dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
+        dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
+    }else{
+        dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
+        dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
+    }
     x_offset += 8*s->mb_x;
     y_offset += 8*(s->mb_y >> MB_FIELD);
 
@@ -526,7 +557,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                           qpix_op, chroma_op, pixel_shift);
+                           qpix_op, chroma_op, pixel_shift, chroma444);
 
         qpix_op=  qpix_avg;
         chroma_op= chroma_avg;
@@ -536,7 +567,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                           qpix_op, chroma_op, pixel_shift);
+                           qpix_op, chroma_op, pixel_shift, chroma444);
     }
 }
 
@@ -546,12 +577,19 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
-                           int list0, int list1, int pixel_shift){
+                           int list0, int list1, int pixel_shift, int chroma444){
     MpegEncContext * const s = &h->s;
 
-    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->  mb_linesize;
-    dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
-    dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
+    dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
+    if(chroma444){
+        chroma_weight_avg = luma_weight_avg;
+        chroma_weight_op = luma_weight_op;
+        dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
+        dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
+    }else{
+        dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
+        dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
+    }
     x_offset += 8*s->mb_x;
     y_offset += 8*(s->mb_y >> MB_FIELD);
 
@@ -559,17 +597,17 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         /* don't optimize for luma-only case, since B-frames usually
          * use implicit weights => chroma too. */
         uint8_t *tmp_cb = s->obmc_scratchpad;
-        uint8_t *tmp_cr = s->obmc_scratchpad + (8 << pixel_shift);
-        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
+        uint8_t *tmp_cr = s->obmc_scratchpad + (16 << pixel_shift);
+        uint8_t *tmp_y  = s->obmc_scratchpad + 16*h->mb_uvlinesize;
         int refn0 = h->ref_cache[0][ scan8[n] ];
         int refn1 = h->ref_cache[1][ scan8[n] ];
 
         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                     dest_y, dest_cb, dest_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift);
+                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                     tmp_y, tmp_cb, tmp_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift);
+                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
 
         if(h->use_weight == 2){
             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
@@ -594,7 +632,7 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         Picture *ref= &h->ref_list[list][refn];
         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                    qpix_put, chroma_put, pixel_shift);
+                    qpix_put, chroma_put, pixel_shift, chroma444);
 
         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
@@ -613,21 +651,21 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                           int list0, int list1, int pixel_shift){
+                           int list0, int list1, int pixel_shift, int chroma444){
     if((h->use_weight==2 && list0 && list1
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
        || h->use_weight==1)
         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                          x_offset, y_offset, qpix_put, chroma_put,
                          weight_op[0], weight_op[3], weight_avg[0],
-                         weight_avg[3], list0, list1, pixel_shift);
+                         weight_avg[3], list0, list1, pixel_shift, chroma444);
     else
         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
-                    chroma_avg, list0, list1, pixel_shift);
+                    chroma_avg, list0, list1, pixel_shift, chroma444);
 }
 
-static inline void prefetch_motion(H264Context *h, int list, int pixel_shift){
+static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){
     /* fetch pixels for estimated mv 4 macroblocks ahead
      * optimized for 64byte cache lines */
     MpegEncContext * const s = &h->s;
@@ -638,8 +676,13 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift){
         uint8_t **src= h->ref_list[list][refn].data;
         int off= (mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift);
         s->dsp.prefetch(src[0]+off, s->linesize, 4);
-        off= ((mx>>1) << pixel_shift) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + (64 << pixel_shift);
-        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
+        if(chroma444){
+            s->dsp.prefetch(src[1]+off, s->linesize, 4);
+            s->dsp.prefetch(src[2]+off, s->linesize, 4);
+        }else{
+            off= ((mx>>1) << pixel_shift) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + (64 << pixel_shift);
+            s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
+        }
     }
 }
 
@@ -647,7 +690,7 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                       h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                      int pixel_shift){
+                      int pixel_shift, int chroma444){
     MpegEncContext * const s = &h->s;
     const int mb_xy= h->mb_xy;
     const int mb_type= s->current_picture.mb_type[mb_xy];
@@ -656,36 +699,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
 
     if(HAVE_PTHREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
         await_references(h);
-    prefetch_motion(h, 0, pixel_shift);
+    prefetch_motion(h, 0, pixel_shift, chroma444);
 
     if(IS_16X16(mb_type)){
         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                 weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift);
+                pixel_shift, chroma444);
     }else if(IS_16X8(mb_type)){
         mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                 &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift);
+                pixel_shift, chroma444);
         mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                 &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift);
+                pixel_shift, chroma444);
     }else if(IS_8X16(mb_type)){
         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                 &weight_op[2], &weight_avg[2],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift);
+                pixel_shift, chroma444);
         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                 &weight_op[2], &weight_avg[2],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift);
+                pixel_shift, chroma444);
     }else{
         int i;
 
@@ -702,29 +745,29 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                     &weight_op[3], &weight_avg[3],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift);
+                    pixel_shift, chroma444);
             }else if(IS_SUB_8X4(sub_mb_type)){
                 mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                     &weight_op[4], &weight_avg[4],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift);
+                    pixel_shift, chroma444);
                 mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                     &weight_op[4], &weight_avg[4],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift);
+                    pixel_shift, chroma444);
             }else if(IS_SUB_4X8(sub_mb_type)){
                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     &weight_op[5], &weight_avg[5],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift);
+                    pixel_shift, chroma444);
                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     &weight_op[5], &weight_avg[5],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift);
+                    pixel_shift, chroma444);
             }else{
                 int j;
                 assert(IS_SUB_4X4(sub_mb_type));
@@ -735,13 +778,13 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                         &weight_op[6], &weight_avg[6],
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift);
+                        pixel_shift, chroma444);
                 }
             }
         }
     }
 
-    prefetch_motion(h, 1, pixel_shift);
+    prefetch_motion(h, 1, pixel_shift, chroma444);
 }
 
 #define hl_motion_fn(sh, bits) \
@@ -753,10 +796,11 @@ static av_always_inline void hl_motion_ ## bits(H264Context *h, \
                                        qpel_mc_func (*qpix_avg)[16], \
                                        h264_chroma_mc_func (*chroma_avg), \
                                        h264_weight_func *weight_op, \
-                                       h264_biweight_func *weight_avg) \
+                                       h264_biweight_func *weight_avg, \
+                                       int chroma444) \
 { \
     hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, \
-              qpix_avg, chroma_avg, weight_op, weight_avg, sh); \
+              qpix_avg, chroma_avg, weight_op, weight_avg, sh, chroma444); \
 }
 hl_motion_fn(0, 8);
 hl_motion_fn(1, 16);
@@ -796,16 +840,19 @@ static void free_tables(H264Context *h, int free_rbsp){
 }
 
 static void init_dequant8_coeff_table(H264Context *h){
-    int i,q,x;
+    int i,j,q,x;
     const int max_qp = 51 + 6*(h->sps.bit_depth_luma-8);
-    h->dequant8_coeff[0] = h->dequant8_buffer[0];
-    h->dequant8_coeff[1] = h->dequant8_buffer[1];
 
-    for(i=0; i<2; i++ ){
-        if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
-            h->dequant8_coeff[1] = h->dequant8_buffer[0];
-            break;
+    for(i=0; i<6; i++ ){
+        h->dequant8_coeff[i] = h->dequant8_buffer[i];
+        for(j=0; j<i; j++){
+            if(!memcmp(h->pps.scaling_matrix8[j], h->pps.scaling_matrix8[i], 64*sizeof(uint8_t))){
+                h->dequant8_coeff[i] = h->dequant8_buffer[j];
+                break;
+            }
         }
+        if(j<i)
+            continue;
 
         for(q=0; q<max_qp+1; q++){
             int shift = div6[q];
@@ -853,7 +900,7 @@ static void init_dequant_tables(H264Context *h){
             for(x=0; x<16; x++)
                 h->dequant4_coeff[i][0][x] = 1<<6;
         if(h->pps.transform_8x8_mode)
-            for(i=0; i<2; i++)
+            for(i=0; i<6; i++)
                 for(x=0; x<64; x++)
                     h->dequant8_coeff[i][0][x] = 1<<6;
     }
@@ -868,7 +915,7 @@ int ff_h264_alloc_tables(H264Context *h){
 
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, row_mb_num * 8  * sizeof(uint8_t), fail)
 
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 32 * sizeof(uint8_t), fail)
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 48 * sizeof(uint8_t), fail)
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
 
@@ -930,8 +977,8 @@ static void clone_tables(H264Context *dst, H264Context *src, int i){
  * Allocate buffers which are not shared amongst multiple threads.
  */
 static int context_init(H264Context *h){
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t)*2, fail)
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t)*2, fail)
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * 16*3 * sizeof(uint8_t)*2, fail)
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * 16*3 * sizeof(uint8_t)*2, fail)
 
     h->ref_cache[0][scan8[5 ]+1] = h->ref_cache[0][scan8[7 ]+1] = h->ref_cache[0][scan8[13]+1] =
     h->ref_cache[1][scan8[5 ]+1] = h->ref_cache[1][scan8[7 ]+1] = h->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
@@ -1130,9 +1177,10 @@ static int decode_update_thread_context(AVCodecContext *dst, const AVCodecContex
 
         // frame_start may not be called for the next thread (if it's decoding a bottom field)
         // so this has to be allocated here
-        h->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
+        h->s.obmc_scratchpad = av_malloc(16*6*s->linesize);
 
         s->dsp.clear_blocks(h->mb);
+        s->dsp.clear_blocks(h->mb+(24*16<<h->pixel_shift));
     }
 
     //extradata/NAL handling
@@ -1151,7 +1199,7 @@ static int decode_update_thread_context(AVCodecContext *dst, const AVCodecContex
     for(i=0; i<6; i++)
         h->dequant4_coeff[i] = h->dequant4_buffer[0] + (h1->dequant4_coeff[i] - h1->dequant4_buffer[0]);
 
-    for(i=0; i<2; i++)
+    for(i=0; i<6; i++)
         h->dequant8_coeff[i] = h->dequant8_buffer[0] + (h1->dequant8_coeff[i] - h1->dequant8_buffer[0]);
 
     h->dequant_coeff_pps = h1->dequant_coeff_pps;
@@ -1206,20 +1254,20 @@ int ff_h264_frame_start(H264Context *h){
 
     for(i=0; i<16; i++){
         h->block_offset[i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
-        h->block_offset[24+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[48+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
     }
-    for(i=0; i<4; i++){
+    for(i=0; i<16; i++){
         h->block_offset[16+i]=
-        h->block_offset[20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
-        h->block_offset[24+16+i]=
-        h->block_offset[24+20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[32+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[48+16+i]=
+        h->block_offset[48+32+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
     }
 
     /* can't be in alloc_tables because linesize isn't known there.
      * FIXME: redo bipred weight to not require extra buffer? */
     for(i = 0; i < thread_count; i++)
         if(h->thread_context[i] && !h->thread_context[i]->s.obmc_scratchpad)
-            h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
+            h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*6*s->linesize);
 
     /* some macroblocks can be accessed before they're available in case of lost slices, mbaff or threading*/
     memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
@@ -1404,7 +1452,7 @@ static void decode_postinit(H264Context *h, int setup_finished){
         ff_thread_finish_setup(s->avctx);
 }
 
-static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
+static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int chroma444, int simple){
     MpegEncContext * const s = &h->s;
     uint8_t *top_border;
     int top_idx = 1;
@@ -1422,12 +1470,24 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
                 if (pixel_shift)
                     AV_COPY128(top_border+16, src_y+15*linesize+16);
                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-                    if (pixel_shift) {
-                        AV_COPY128(top_border+32, src_cb+7*uvlinesize);
-                        AV_COPY128(top_border+48, src_cr+7*uvlinesize);
+                    if(chroma444){
+                        if (pixel_shift){
+                            AV_COPY128(top_border+32, src_cb + 15*uvlinesize);
+                            AV_COPY128(top_border+48, src_cb + 15*uvlinesize+16);
+                            AV_COPY128(top_border+64, src_cr + 15*uvlinesize);
+                            AV_COPY128(top_border+80, src_cr + 15*uvlinesize+16);
+                        } else {
+                            AV_COPY128(top_border+16, src_cb + 15*uvlinesize);
+                            AV_COPY128(top_border+32, src_cr + 15*uvlinesize);
+                        }
                     } else {
-                    AV_COPY64(top_border+16, src_cb+7*uvlinesize);
-                    AV_COPY64(top_border+24, src_cr+7*uvlinesize);
+                        if (pixel_shift) {
+                            AV_COPY128(top_border+32, src_cb+7*uvlinesize);
+                            AV_COPY128(top_border+48, src_cr+7*uvlinesize);
+                        } else {
+                            AV_COPY64(top_border+16, src_cb+7*uvlinesize);
+                            AV_COPY64(top_border+24, src_cr+7*uvlinesize);
+                        }
                     }
                 }
             }
@@ -1445,12 +1505,24 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
         AV_COPY128(top_border+16, src_y+16*linesize+16);
 
     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-        if (pixel_shift) {
-            AV_COPY128(top_border+32, src_cb+8*uvlinesize);
-            AV_COPY128(top_border+48, src_cr+8*uvlinesize);
+        if(chroma444){
+            if (pixel_shift){
+                AV_COPY128(top_border+32, src_cb + 16*linesize);
+                AV_COPY128(top_border+48, src_cb + 16*linesize+16);
+                AV_COPY128(top_border+64, src_cr + 16*linesize);
+                AV_COPY128(top_border+80, src_cr + 16*linesize+16);
+            } else {
+                AV_COPY128(top_border+16, src_cb + 16*linesize);
+                AV_COPY128(top_border+32, src_cr + 16*linesize);
+            }
         } else {
-        AV_COPY64(top_border+16, src_cb+8*uvlinesize);
-        AV_COPY64(top_border+24, src_cr+8*uvlinesize);
+            if (pixel_shift) {
+                AV_COPY128(top_border+32, src_cb+8*uvlinesize);
+                AV_COPY128(top_border+48, src_cr+8*uvlinesize);
+            } else {
+                AV_COPY64(top_border+16, src_cb+8*uvlinesize);
+                AV_COPY64(top_border+24, src_cr+8*uvlinesize);
+            }
         }
     }
 }
@@ -1458,7 +1530,8 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y,
                                   uint8_t *src_cb, uint8_t *src_cr,
                                   int linesize, int uvlinesize,
-                                  int xchg, int simple, int pixel_shift){
+                                  int xchg, int chroma444,
+                                  int simple, int pixel_shift){
     MpegEncContext * const s = &h->s;
     int deblock_topleft;
     int deblock_top;
@@ -1513,13 +1586,28 @@ else      AV_COPY64(b,a);
         }
     }
     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-        if(deblock_top){
+        if(chroma444){
             if(deblock_topleft){
-                XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
-                XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
+                XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
+                XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
+            }
+            XCHG(top_border + (16 << pixel_shift), src_cb + (1 << pixel_shift), xchg);
+            XCHG(top_border + (24 << pixel_shift), src_cb + (9 << pixel_shift), 1);
+            XCHG(top_border + (32 << pixel_shift), src_cr + (1 << pixel_shift), xchg);
+            XCHG(top_border + (40 << pixel_shift), src_cr + (9 << pixel_shift), 1);
+            if(s->mb_x+1 < s->mb_width){
+                XCHG(h->top_borders[top_idx][s->mb_x+1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
+                XCHG(h->top_borders[top_idx][s->mb_x+1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
+            }
+        } else {
+            if(deblock_top){
+                if(deblock_topleft){
+                    XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
+                    XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
+                }
+                XCHG(top_border + (16 << pixel_shift), src_cb+1+pixel_shift, 1);
+                XCHG(top_border + (24 << pixel_shift), src_cr+1+pixel_shift, 1);
             }
-            XCHG(top_border + (16 << pixel_shift), src_cb+1+pixel_shift, 1);
-            XCHG(top_border + (24 << pixel_shift), src_cr+1+pixel_shift, 1);
         }
     }
 }
@@ -1538,6 +1626,159 @@ static av_always_inline void dctcoef_set(DCTELEM *mb, int high_bit_depth, int in
         AV_WN16A(mb + index, value);
 }
 
+static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, int mb_type, int is_h264, int simple, int transform_bypass,
+                                                       int pixel_shift, int *block_offset, int linesize, uint8_t *dest_y, int p)
+{
+    MpegEncContext * const s = &h->s;
+    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+    int i;
+    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
+    block_offset += 16*p;
+    if(IS_INTRA4x4(mb_type)){
+        if(simple || !s->encoding){
+            if(IS_8x8DCT(mb_type)){
+                if(transform_bypass){
+                    idct_dc_add =
+                    idct_add    = s->dsp.add_pixels8;
+                }else{
+                    idct_dc_add = h->h264dsp.h264_idct8_dc_add;
+                    idct_add    = h->h264dsp.h264_idct8_add;
+                }
+                for(i=0; i<16; i+=4){
+                    uint8_t * const ptr= dest_y + block_offset[i];
+                    const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+                    if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
+                        h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
+                    }else{
+                        const int nnz = h->non_zero_count_cache[ scan8[i+p*16] ];
+                        h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
+                                                    (h->topright_samples_available<<i)&0x4000, linesize);
+                        if(nnz){
+                            if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16+p*256))
+                                idct_dc_add(ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
+                            else
+                                idct_add   (ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
+                        }
+                    }
+                }
+            }else{
+                if(transform_bypass){
+                    idct_dc_add =
+                    idct_add    = s->dsp.add_pixels4;
+                }else{
+                    idct_dc_add = h->h264dsp.h264_idct_dc_add;
+                    idct_add    = h->h264dsp.h264_idct_add;
+                }
+                for(i=0; i<16; i++){
+                    uint8_t * const ptr= dest_y + block_offset[i];
+                    const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+
+                    if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
+                        h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
+                    }else{
+                        uint8_t *topright;
+                        int nnz, tr;
+                        uint64_t tr_high;
+                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
+                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
+                            assert(mb_y || linesize <= block_offset[i]);
+                            if(!topright_avail){
+                                if (pixel_shift) {
+                                    tr_high= ((uint16_t*)ptr)[3 - linesize/2]*0x0001000100010001ULL;
+                                    topright= (uint8_t*) &tr_high;
+                                } else {
+                                    tr= ptr[3 - linesize]*0x01010101;
+                                    topright= (uint8_t*) &tr;
+                                }
+                            }else
+                                topright= ptr + (4 << pixel_shift) - linesize;
+                        }else
+                            topright= NULL;
+
+                        h->hpc.pred4x4[ dir ](ptr, topright, linesize);
+                        nnz = h->non_zero_count_cache[ scan8[i+p*16] ];
+                        if(nnz){
+                            if(is_h264){
+                                if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16+p*256))
+                                    idct_dc_add(ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
+                                else
+                                    idct_add   (ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
+                            }else
+                                ff_svq3_add_idct_c(ptr, h->mb + i*16+p*256, linesize, qscale, 0);
+                        }
+                    }
+                }
+            }
+        }
+    }else{
+        h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
+        if(is_h264){
+            if(h->non_zero_count_cache[ scan8[LUMA_DC_BLOCK_INDEX+p] ]){
+                if(!transform_bypass)
+                    h->h264dsp.h264_luma_dc_dequant_idct(h->mb+(p*256 << pixel_shift), h->mb_luma_dc[p], h->dequant4_coeff[p][qscale][0]);
+                else{
+                    static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
+                                                            8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
+                    for(i = 0; i < 16; i++)
+                        dctcoef_set(h->mb+p*256, pixel_shift, dc_mapping[i], dctcoef_get(h->mb_luma_dc[p], pixel_shift, i));
+                }
+            }
+        }else
+            ff_svq3_luma_dc_dequant_idct_c(h->mb+p*256, h->mb_luma_dc[p], qscale);
+    }
+}
+
+static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, int is_h264, int simple, int transform_bypass,
+                                                    int pixel_shift, int *block_offset, int linesize, uint8_t *dest_y, int p)
+{
+    MpegEncContext * const s = &h->s;
+    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    int i;
+    block_offset += 16*p;
+    if(!IS_INTRA4x4(mb_type)){
+        if(is_h264){
+            if(IS_INTRA16x16(mb_type)){
+                if(transform_bypass){
+                    if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
+                        h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize);
+                    }else{
+                        for(i=0; i<16; i++){
+                            if(h->non_zero_count_cache[ scan8[i+p*16] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                                s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16+p*256 << pixel_shift), linesize);
+                        }
+                    }
+                }else{
+                    h->h264dsp.h264_idct_add16intra(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
+                }
+            }else if(h->cbp&15){
+                if(transform_bypass){
+                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
+                    idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
+                    for(i=0; i<16; i+=di){
+                        if(h->non_zero_count_cache[ scan8[i+p*16] ]){
+                            idct_add(dest_y + block_offset[i], h->mb + (i*16+p*256 << pixel_shift), linesize);
+                        }
+                    }
+                }else{
+                    if(IS_8x8DCT(mb_type)){
+                        h->h264dsp.h264_idct8_add4(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
+                    }else{
+                        h->h264dsp.h264_idct_add16(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
+                    }
+                }
+            }
+        }else{
+            for(i=0; i<16; i++){
+                if(h->non_zero_count_cache[ scan8[i+p*16] ] || h->mb[i*16+p*256]){ //FIXME benchmark weird rule, & below
+                    uint8_t * const ptr= dest_y + block_offset[i];
+                    ff_svq3_add_idct_c(ptr, h->mb + i*16 + p*256, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
+                }
+            }
+        }
+    }
+}
+
 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){
     MpegEncContext * const s = &h->s;
     const int mb_x= s->mb_x;
@@ -1546,13 +1787,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     const int mb_type= s->current_picture.mb_type[mb_xy];
     uint8_t  *dest_y, *dest_cb, *dest_cr;
     int linesize, uvlinesize /*dct_offset*/;
-    int i;
+    int i, j;
     int *block_offset = &h->block_offset[0];
     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
     /* is_h264 should always be true if SVQ3 is disabled. */
     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
 
     dest_y  = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
     dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
@@ -1566,7 +1806,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     if (!simple && MB_FIELD) {
         linesize   = h->mb_linesize   = s->linesize * 2;
         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
-        block_offset = &h->block_offset[24];
+        block_offset = &h->block_offset[48];
         if(mb_y&1){ //FIXME move out of this function?
             dest_y -= s->linesize*15;
             dest_cb-= s->uvlinesize*7;
@@ -1629,194 +1869,67 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     } else {
         if(IS_INTRA(mb_type)){
             if(h->deblocking_filter)
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple, pixel_shift);
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, 0, simple, pixel_shift);
 
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
             }
 
-            if(IS_INTRA4x4(mb_type)){
-                if(simple || !s->encoding){
-                    if(IS_8x8DCT(mb_type)){
-                        if(transform_bypass){
-                            idct_dc_add =
-                            idct_add    = s->dsp.add_pixels8;
-                        }else{
-                            idct_dc_add = h->h264dsp.h264_idct8_dc_add;
-                            idct_add    = h->h264dsp.h264_idct8_add;
-                        }
-                        for(i=0; i<16; i+=4){
-                            uint8_t * const ptr= dest_y + block_offset[i];
-                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
-                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
-                                h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize);
-                            }else{
-                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
-                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
-                                                            (h->topright_samples_available<<i)&0x4000, linesize);
-                                if(nnz){
-                                    if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16))
-                                        idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize);
-                                    else
-                                        idct_add   (ptr, h->mb + (i*16 << pixel_shift), linesize);
-                                }
-                            }
-                        }
-                    }else{
-                        if(transform_bypass){
-                            idct_dc_add =
-                            idct_add    = s->dsp.add_pixels4;
-                        }else{
-                            idct_dc_add = h->h264dsp.h264_idct_dc_add;
-                            idct_add    = h->h264dsp.h264_idct_add;
-                        }
-                        for(i=0; i<16; i++){
-                            uint8_t * const ptr= dest_y + block_offset[i];
-                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+            hl_decode_mb_predict_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
 
-                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
-                                h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize);
-                            }else{
-                                uint8_t *topright;
-                                int nnz, tr;
-                                uint64_t tr_high;
-                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
-                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
-                                    assert(mb_y || linesize <= block_offset[i]);
-                                    if(!topright_avail){
-                                        if (pixel_shift) {
-                                            tr_high= ((uint16_t*)ptr)[3 - linesize/2]*0x0001000100010001ULL;
-                                            topright= (uint8_t*) &tr_high;
-                                        } else {
-                                        tr= ptr[3 - linesize]*0x01010101;
-                                        topright= (uint8_t*) &tr;
-                                        }
-                                    }else
-                                        topright= ptr + (4 << pixel_shift) - linesize;
-                                }else
-                                    topright= NULL;
-
-                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
-                                nnz = h->non_zero_count_cache[ scan8[i] ];
-                                if(nnz){
-                                    if(is_h264){
-                                        if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16))
-                                            idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize);
-                                        else
-                                            idct_add   (ptr, h->mb + (i*16 << pixel_shift), linesize);
-                                    }else
-                                        ff_svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
-                                }
-                            }
-                        }
-                    }
-                }
-            }else{
-                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
-                if(is_h264){
-                    if(h->non_zero_count_cache[ scan8[LUMA_DC_BLOCK_INDEX] ]){
-                        if(!transform_bypass)
-                            h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
-                        else{
-                            static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
-                                                                    8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
-                            for(i = 0; i < 16; i++)
-                                dctcoef_set(h->mb, pixel_shift, dc_mapping[i], dctcoef_get(h->mb_luma_dc, pixel_shift, i));
-                        }
-                    }
-                }else
-                    ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
-            }
             if(h->deblocking_filter)
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple, pixel_shift);
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift);
         }else if(is_h264){
             if (pixel_shift) {
                 hl_motion_16(h, dest_y, dest_cb, dest_cr,
                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                              h->h264dsp.weight_h264_pixels_tab,
-                             h->h264dsp.biweight_h264_pixels_tab);
+                             h->h264dsp.biweight_h264_pixels_tab, 0);
             } else
                 hl_motion_8(h, dest_y, dest_cb, dest_cr,
                             s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                             s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                             h->h264dsp.weight_h264_pixels_tab,
-                            h->h264dsp.biweight_h264_pixels_tab);
+                            h->h264dsp.biweight_h264_pixels_tab, 0);
         }
 
-
-        if(!IS_INTRA4x4(mb_type)){
-            if(is_h264){
-                if(IS_INTRA16x16(mb_type)){
-                    if(transform_bypass){
-                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
-                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
-                        }else{
-                            for(i=0; i<16; i++){
-                                if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
-                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize);
-                            }
-                        }
-                    }else{
-                         h->h264dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
-                    }
-                }else if(h->cbp&15){
-                    if(transform_bypass){
-                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
-                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
-                        for(i=0; i<16; i+=di){
-                            if(h->non_zero_count_cache[ scan8[i] ]){
-                                idct_add(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize);
-                            }
-                        }
-                    }else{
-                        if(IS_8x8DCT(mb_type)){
-                            h->h264dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
-                        }else{
-                            h->h264dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
-                        }
-                    }
-                }
-            }else{
-                for(i=0; i<16; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
-                        uint8_t * const ptr= dest_y + block_offset[i];
-                        ff_svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
-                    }
-                }
-            }
-        }
+        hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
 
         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
             uint8_t *dest[2] = {dest_cb, dest_cr};
             if(transform_bypass){
                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16 << pixel_shift), uvlinesize);
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + (20*16 << pixel_shift), uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16*1 << pixel_shift), uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 32, h->mb + (16*16*2 << pixel_shift), uvlinesize);
                 }else{
                     idct_add = s->dsp.add_pixels4;
-                    for(i=16; i<16+8; i++){
-                        if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
-                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
+                    for(j=1; j<3; j++){
+                        for(i=j*16; i<j*16+4; i++){
+                            if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                                idct_add   (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
+                        }
                     }
                 }
             }else{
                 if(is_h264){
                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16 << pixel_shift)       , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*1 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + ((16*16+4*16) << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*2 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                     h->h264dsp.h264_idct_add8(dest, block_offset,
                                               h->mb, uvlinesize,
                                               h->non_zero_count_cache);
                 }else{
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16     , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
-                    for(i=16; i<16+8; i++){
-                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
-                            ff_svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16*1, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16*2, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
+                    for(j=1; j<3; j++){
+                        for(i=j*16; i<j*16+4; i++){
+                            if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
+                                uint8_t * const ptr= dest[j-1] + block_offset[i];
+                                ff_svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
+                            }
                         }
                     }
                 }
@@ -1824,7 +1937,112 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
         }
     }
     if(h->cbp || IS_INTRA(mb_type))
+    {
         s->dsp.clear_blocks(h->mb);
+        s->dsp.clear_blocks(h->mb+(24*16<<pixel_shift));
+    }
+}
+
+static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simple, int pixel_shift){
+    MpegEncContext * const s = &h->s;
+    const int mb_x= s->mb_x;
+    const int mb_y= s->mb_y;
+    const int mb_xy= h->mb_xy;
+    const int mb_type= s->current_picture.mb_type[mb_xy];
+    uint8_t  *dest[3];
+    int linesize;
+    int i, j, p;
+    int *block_offset = &h->block_offset[0];
+    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
+
+    for (p = 0; p < 3; p++)
+    {
+        dest[p] = s->current_picture.data[p] + ((mb_x << pixel_shift) + mb_y * s->linesize) * 16;
+        s->dsp.prefetch(dest[p] + (s->mb_x&3)*4*s->linesize + (64 << pixel_shift), s->linesize, 4);
+    }
+
+    h->list_counts[mb_xy]= h->list_count;
+
+    if (!simple && MB_FIELD) {
+        linesize   = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2;
+        block_offset = &h->block_offset[48];
+        if(mb_y&1) //FIXME move out of this function?
+            for (p = 0; p < 3; p++)
+                dest[p] -= s->linesize*15;
+        if(FRAME_MBAFF) {
+            int list;
+            for(list=0; list<h->list_count; list++){
+                if(!USES_LIST(mb_type, list))
+                    continue;
+                if(IS_16X16(mb_type)){
+                    int8_t *ref = &h->ref_cache[list][scan8[0]];
+                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
+                }else{
+                    for(i=0; i<16; i+=4){
+                        int ref = h->ref_cache[list][scan8[i]];
+                        if(ref >= 0)
+                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
+                    }
+                }
+            }
+        }
+    } else {
+        linesize   = h->mb_linesize = h->mb_uvlinesize = s->linesize;
+    }
+
+    if (!simple && IS_INTRA_PCM(mb_type)) {
+        if (pixel_shift) {
+            const int bit_depth = h->sps.bit_depth_luma;
+            GetBitContext gb;
+            init_get_bits(&gb, (uint8_t*)h->mb, 768*bit_depth);
+
+            for (p = 0; p < 3; p++) {
+                for (i = 0; i < 16; i++) {
+                    uint16_t *tmp = (uint16_t*)(dest[p] + i*linesize);
+                    for (j = 0; j < 16; j++)
+                        tmp[j] = get_bits(&gb, bit_depth);
+                }
+            }
+        } else {
+            for (p = 0; p < 3; p++) {
+                for (i = 0; i < 16; i++) {
+                    memcpy(dest[p] + i*linesize, h->mb + p*128 + i*8, 16);
+                }
+            }
+        }
+    } else {
+        if(IS_INTRA(mb_type)){
+            if(h->deblocking_filter)
+                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, linesize, 1, 1, simple, pixel_shift);
+
+            for (p = 0; p < 3; p++)
+                hl_decode_mb_predict_luma(h, mb_type, 1, simple, transform_bypass, pixel_shift, block_offset, linesize, dest[p], p);
+
+            if(h->deblocking_filter)
+                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, linesize, 0, 1, simple, pixel_shift);
+        }else{
+            if (pixel_shift) {
+                hl_motion_16(h, dest[0], dest[1], dest[2],
+                             s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                             s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                             h->h264dsp.weight_h264_pixels_tab,
+                             h->h264dsp.biweight_h264_pixels_tab, 1);
+            } else
+                hl_motion_8(h, dest[0], dest[1], dest[2],
+                            s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                            s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                            h->h264dsp.weight_h264_pixels_tab,
+                            h->h264dsp.biweight_h264_pixels_tab, 1);
+        }
+
+        for (p = 0; p < 3; p++)
+            hl_decode_mb_idct_luma(h, mb_type, 1, simple, transform_bypass, pixel_shift, block_offset, linesize, dest[p], p);
+    }
+    if(h->cbp || IS_INTRA(mb_type))
+    {
+        s->dsp.clear_blocks(h->mb);
+        s->dsp.clear_blocks(h->mb+(24*16<<pixel_shift));
+    }
 }
 
 /**
@@ -1844,13 +2062,26 @@ static void av_noinline hl_decode_mb_complex(H264Context *h){
     hl_decode_mb_internal(h, 0, h->pixel_shift);
 }
 
+static void av_noinline hl_decode_mb_444_complex(H264Context *h){
+    hl_decode_mb_444_internal(h, 0, h->pixel_shift);
+}
+
+static void av_noinline hl_decode_mb_444_simple(H264Context *h){
+    hl_decode_mb_444_internal(h, 1, 0);
+}
+
 void ff_h264_hl_decode_mb(H264Context *h){
     MpegEncContext * const s = &h->s;
     const int mb_xy= h->mb_xy;
     const int mb_type= s->current_picture.mb_type[mb_xy];
     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
 
-    if (is_complex) {
+    if (CHROMA444) {
+        if(is_complex || h->pixel_shift)
+            hl_decode_mb_444_complex(h);
+        else
+            hl_decode_mb_444_simple(h);
+    } else if (is_complex) {
         hl_decode_mb_complex(h);
     } else if (h->pixel_shift) {
         hl_decode_mb_simple_16(h);
@@ -1866,7 +2097,7 @@ static int pred_weight_table(H264Context *h){
     h->use_weight= 0;
     h->use_weight_chroma= 0;
     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
-    if(CHROMA)
+    if(h->sps.chroma_format_idc)
         h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
     luma_def = 1<<h->luma_log2_weight_denom;
     chroma_def = 1<<h->chroma_log2_weight_denom;
@@ -1891,7 +2122,7 @@ static int pred_weight_table(H264Context *h){
                 h->luma_weight[i][list][1]= 0;
             }
 
-            if(CHROMA){
+            if(h->sps.chroma_format_idc){
                 chroma_weight_flag= get_bits1(&s->gb);
                 if(chroma_weight_flag){
                     int j;
@@ -2321,11 +2552,11 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
 
     h->b_stride=  s->mb_width*4;
 
-    s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
+    s->width = 16*s->mb_width - (2>>CHROMA444)*FFMIN(h->sps.crop_right, (8<<CHROMA444)-1);
     if(h->sps.frame_mbs_only_flag)
-        s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
+        s->height= 16*s->mb_height - (2>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1);
     else
-        s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 7);
+        s->height= 16*s->mb_height - (4>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1);
 
     if (s->context_initialized
         && (   s->width != s->avctx->width || s->height != s->avctx->height
@@ -2370,18 +2601,22 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
 
         switch (h->sps.bit_depth_luma) {
             case 9 :
-                s->avctx->pix_fmt = PIX_FMT_YUV420P9;
+                s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P9 : PIX_FMT_YUV420P9;
                 break;
             case 10 :
-                s->avctx->pix_fmt = PIX_FMT_YUV420P10;
+                s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P10 : PIX_FMT_YUV420P10;
                 break;
             default:
-        s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
-                                                 s->avctx->codec->pix_fmts ?
-                                                 s->avctx->codec->pix_fmts :
-                                                 s->avctx->color_range == AVCOL_RANGE_JPEG ?
-                                                 hwaccel_pixfmt_list_h264_jpeg_420 :
-                                                 ff_hwaccel_pixfmt_list_420);
+                if (CHROMA444){
+                    s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ444P : PIX_FMT_YUV444P;
+                }else{
+                    s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
+                                                             s->avctx->codec->pix_fmts ?
+                                                             s->avctx->codec->pix_fmts :
+                                                             s->avctx->color_range == AVCOL_RANGE_JPEG ?
+                                                             hwaccel_pixfmt_list_h264_jpeg_420 :
+                                                             ff_hwaccel_pixfmt_list_420);
+                }
         }
 
         s->avctx->hwaccel = ff_find_hwaccel(s->avctx->codec->id, s->avctx->pix_fmt);
@@ -2873,11 +3108,10 @@ static int fill_filter_caches(H264Context *h, int mb_type){
     if(IS_INTRA(mb_type))
         return 0;
 
-    AV_COPY64(&h->non_zero_count_cache[0+8*1], &h->non_zero_count[mb_xy][ 0]);
-    AV_COPY64(&h->non_zero_count_cache[0+8*2], &h->non_zero_count[mb_xy][ 8]);
-    AV_COPY32(&h->non_zero_count_cache[0+8*5], &h->non_zero_count[mb_xy][16]);
-    AV_COPY32(&h->non_zero_count_cache[4+8*3], &h->non_zero_count[mb_xy][20]);
-    AV_COPY64(&h->non_zero_count_cache[0+8*4], &h->non_zero_count[mb_xy][24]);
+    AV_COPY32(&h->non_zero_count_cache[4+8* 1], &h->non_zero_count[mb_xy][ 0]);
+    AV_COPY32(&h->non_zero_count_cache[4+8* 2], &h->non_zero_count[mb_xy][ 4]);
+    AV_COPY32(&h->non_zero_count_cache[4+8* 3], &h->non_zero_count[mb_xy][ 8]);
+    AV_COPY32(&h->non_zero_count_cache[4+8* 4], &h->non_zero_count[mb_xy][12]);
 
     h->cbp= h->cbp_table[mb_xy];
 
@@ -2929,45 +3163,45 @@ static int fill_filter_caches(H264Context *h, int mb_type){
 */
 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
     if(top_type){
-        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]);
+        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][3*4]);
     }
 
     if(left_type[0]){
-        h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][7+0*8];
-        h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][7+1*8];
-        h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[0]][7+2*8];
-        h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[0]][7+3*8];
+        h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][3+0*4];
+        h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][3+1*4];
+        h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[0]][3+2*4];
+        h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[0]][3+3*4];
     }
 
     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
     if(!CABAC && h->pps.transform_8x8_mode){
         if(IS_8x8DCT(top_type)){
             h->non_zero_count_cache[4+8*0]=
-            h->non_zero_count_cache[5+8*0]= h->cbp_table[top_xy] & 4;
+            h->non_zero_count_cache[5+8*0]= (h->cbp_table[top_xy] & 0x4000) >> 12;
             h->non_zero_count_cache[6+8*0]=
-            h->non_zero_count_cache[7+8*0]= h->cbp_table[top_xy] & 8;
+            h->non_zero_count_cache[7+8*0]= (h->cbp_table[top_xy] & 0x8000) >> 12;
         }
         if(IS_8x8DCT(left_type[0])){
             h->non_zero_count_cache[3+8*1]=
-            h->non_zero_count_cache[3+8*2]= h->cbp_table[left_xy[0]]&2; //FIXME check MBAFF
+            h->non_zero_count_cache[3+8*2]= (h->cbp_table[left_xy[0]]&0x2000) >> 12; //FIXME check MBAFF
         }
         if(IS_8x8DCT(left_type[1])){
             h->non_zero_count_cache[3+8*3]=
-            h->non_zero_count_cache[3+8*4]= h->cbp_table[left_xy[1]]&8; //FIXME check MBAFF
+            h->non_zero_count_cache[3+8*4]= (h->cbp_table[left_xy[1]]&0x8000) >> 12; //FIXME check MBAFF
         }
 
         if(IS_8x8DCT(mb_type)){
             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
-            h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
+            h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= (h->cbp & 0x1000) >> 12;
 
             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
-            h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
+            h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= (h->cbp & 0x2000) >> 12;
 
             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
-            h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
+            h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= (h->cbp & 0x4000) >> 12;
 
             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
-            h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
+            h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= (h->cbp & 0x8000) >> 12;
         }
     }
 
@@ -3041,8 +3275,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                 s->mb_x= mb_x;
                 s->mb_y= mb_y;
                 dest_y  = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
-                dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
-                dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
+                dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444);
+                dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444);
                     //FIXME simplify above
 
                 if (MB_FIELD) {
@@ -3057,7 +3291,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                     linesize   = h->mb_linesize   = s->linesize;
                     uvlinesize = h->mb_uvlinesize = s->uvlinesize;
                 }
-                backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
+                backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, CHROMA444, 0);
                 if(fill_filter_caches(h, mb_type))
                     continue;
                 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index 8c4f1ab21a..3abf895010 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -39,9 +39,6 @@
 #define interlaced_dct interlaced_dct_is_a_bad_name
 #define mb_intra mb_intra_is_not_initialized_see_mb_type
 
-#define LUMA_DC_BLOCK_INDEX   24
-#define CHROMA_DC_BLOCK_INDEX 25
-
 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
 #define COEFF_TOKEN_VLC_BITS           8
 #define TOTAL_ZEROS_VLC_BITS           9
@@ -60,8 +57,6 @@
  * of progressive decoding by about 2%. */
 #define ALLOW_INTERLACE
 
-#define ALLOW_NOCHROMA
-
 #define FMO 0
 
 /**
@@ -85,16 +80,12 @@
 #endif
 #define FIELD_OR_MBAFF_PICTURE (FRAME_MBAFF || FIELD_PICTURE)
 
-#ifdef ALLOW_NOCHROMA
-#define CHROMA h->sps.chroma_format_idc
-#else
-#define CHROMA 1
-#endif
-
 #ifndef CABAC
 #define CABAC h->pps.cabac
 #endif
 
+#define CHROMA444 (h->sps.chroma_format_idc == 3)
+
 #define EXTENDED_SAR          255
 
 #define MB_TYPE_REF0       MB_TYPE_ACPRED //dirty but it fits in 16 bit
@@ -198,7 +189,7 @@ typedef struct SPS{
     int num_reorder_frames;
     int scaling_matrix_present;
     uint8_t scaling_matrix4[6][16];
-    uint8_t scaling_matrix8[2][64];
+    uint8_t scaling_matrix8[6][64];
     int nal_hrd_parameters_present_flag;
     int vcl_hrd_parameters_present_flag;
     int pic_struct_present_flag;
@@ -233,7 +224,7 @@ typedef struct PPS{
     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
     int transform_8x8_mode;     ///< transform_8x8_mode_flag
     uint8_t scaling_matrix4[6][16];
-    uint8_t scaling_matrix8[2][64];
+    uint8_t scaling_matrix8[6][64];
     uint8_t chroma_qp_table[2][64];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
     int chroma_qp_diff;
 }PPS;
@@ -298,21 +289,15 @@ typedef struct H264Context{
     unsigned int top_samples_available;
     unsigned int topright_samples_available;
     unsigned int left_samples_available;
-    uint8_t (*top_borders[2])[(16+2*8)*2];
+    uint8_t (*top_borders[2])[(16*3)*2];
 
     /**
      * non zero coeff count cache.
      * is 64 if not available.
      */
-    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
+    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[15*8];
 
-    /*
-    .UU.YYYY
-    .UU.YYYY
-    .vv.YYYY
-    .VV.YYYY
-    */
-    uint8_t (*non_zero_count)[32];
+    uint8_t (*non_zero_count)[48];
 
     /**
      * Motion vector cache.
@@ -336,7 +321,7 @@ typedef struct H264Context{
      * block_offset[ 0..23] for frame macroblocks
      * block_offset[24..47] for field macroblocks
      */
-    int block_offset[2*(16+8)];
+    int block_offset[2*(16*3)];
 
     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
     uint32_t *mb2br_xy;
@@ -356,9 +341,9 @@ typedef struct H264Context{
     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 
     uint32_t dequant4_buffer[6][QP_MAX_NUM+1][16]; //FIXME should these be moved down?
-    uint32_t dequant8_buffer[2][QP_MAX_NUM+1][64];
+    uint32_t dequant8_buffer[6][QP_MAX_NUM+1][64];
     uint32_t (*dequant4_coeff[6])[16];
-    uint32_t (*dequant8_coeff[2])[64];
+    uint32_t (*dequant8_coeff[6])[64];
 
     int slice_num;
     uint16_t *slice_table;     ///< slice_table_base + 2*mb_stride + 1
@@ -408,15 +393,15 @@ typedef struct H264Context{
     GetBitContext *intra_gb_ptr;
     GetBitContext *inter_gb_ptr;
 
-    DECLARE_ALIGNED(16, DCTELEM, mb)[16*24*2]; ///< as a dct coeffecient is int32_t in high depth, we need to reserve twice the space.
-    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16*2];
+    DECLARE_ALIGNED(16, DCTELEM, mb)[16*48*2]; ///< as a dct coeffecient is int32_t in high depth, we need to reserve twice the space.
+    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[3][16*2];
     DCTELEM mb_padding[256*2];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
 
     /**
      * Cabac
      */
     CABACContext cabac;
-    uint8_t      cabac_state[460];
+    uint8_t      cabac_state[1024];
 
     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
     uint16_t     *cbp_table;
@@ -721,27 +706,43 @@ o-o o-o
 */
 
 /* Scan8 organization:
- *   0 1 2 3 4 5 6 7
- * 0   u u y y y y y
- * 1 u U U y Y Y Y Y
- * 2 u U U y Y Y Y Y
- * 3   v v y Y Y Y Y
- * 4 v V V y Y Y Y Y
- * 5 v V V   DYDUDV
+ *    0 1 2 3 4 5 6 7
+ * 0  DY    y y y y y
+ * 1        y Y Y Y Y
+ * 2        y Y Y Y Y
+ * 3        y Y Y Y Y
+ * 4        y Y Y Y Y
+ * 5  DU    u u u u u
+ * 6        u U U U U
+ * 7        u U U U U
+ * 8        u U U U U
+ * 9        u U U U U
+ * 10 DV    v v v v v
+ * 11       v V V V V
+ * 12       v V V V V
+ * 13       v V V V V
+ * 14       v V V V V
  * DY/DU/DV are for luma/chroma DC.
  */
 
+#define LUMA_DC_BLOCK_INDEX   48
+#define CHROMA_DC_BLOCK_INDEX 49
+
 //This table must be here because scan8[constant] must be known at compiletime
-static const uint8_t scan8[16 + 2*4 + 3]={
- 4+1*8, 5+1*8, 4+2*8, 5+2*8,
- 6+1*8, 7+1*8, 6+2*8, 7+2*8,
- 4+3*8, 5+3*8, 4+4*8, 5+4*8,
- 6+3*8, 7+3*8, 6+4*8, 7+4*8,
- 1+1*8, 2+1*8,
- 1+2*8, 2+2*8,
- 1+4*8, 2+4*8,
- 1+5*8, 2+5*8,
- 4+5*8, 5+5*8, 6+5*8
+static const uint8_t scan8[16*3 + 3]={
+ 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
+ 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
+ 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
+ 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
+ 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
+ 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
+ 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
+ 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
+ 4+11*8, 5+11*8, 4+12*8, 5+12*8,
+ 6+11*8, 7+11*8, 6+12*8, 7+12*8,
+ 4+13*8, 5+13*8, 4+14*8, 5+14*8,
+ 6+13*8, 7+13*8, 6+14*8, 7+14*8,
+ 0+ 0*8, 0+ 5*8, 0+10*8
 };
 
 static av_always_inline uint32_t pack16to32(int a, int b){
@@ -773,11 +774,11 @@ static void fill_decode_neighbors(H264Context *h, int mb_type){
     MpegEncContext * const s = &h->s;
     const int mb_xy= h->mb_xy;
     int topleft_xy, top_xy, topright_xy, left_xy[2];
-    static const uint8_t left_block_options[4][16]={
-        {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
-        {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
-        {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
-        {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
+    static const uint8_t left_block_options[4][32]={
+        {0,1,2,3,7,10,8,11,3+0*4, 3+1*4, 3+2*4, 3+3*4, 1+4*4, 1+8*4, 1+5*4, 1+9*4},
+        {2,2,3,3,8,11,8,11,3+2*4, 3+2*4, 3+3*4, 3+3*4, 1+5*4, 1+9*4, 1+5*4, 1+9*4},
+        {0,0,1,1,7,10,7,10,3+0*4, 3+0*4, 3+1*4, 3+1*4, 1+4*4, 1+8*4, 1+4*4, 1+8*4},
+        {0,2,0,2,7,10,7,10,3+0*4, 3+2*4, 3+0*4, 3+2*4, 1+4*4, 1+8*4, 1+4*4, 1+8*4}
     };
 
     h->topleft_partition= -1;
@@ -947,32 +948,41 @@ static void fill_decode_caches(H264Context *h, int mb_type){
 */
 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
     if(top_type){
-        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]);
-            h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8];
-            h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8];
-
-            h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8];
-            h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8];
-    }else {
-            h->non_zero_count_cache[1+8*0]=
-            h->non_zero_count_cache[2+8*0]=
-
-            h->non_zero_count_cache[1+8*3]=
-            h->non_zero_count_cache[2+8*3]=
-            AV_WN32A(&h->non_zero_count_cache[4+8*0], CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040);
+        AV_COPY32(&h->non_zero_count_cache[4+8* 0], &h->non_zero_count[top_xy][4*3]);
+        if(CHROMA444){
+            AV_COPY32(&h->non_zero_count_cache[4+8* 5], &h->non_zero_count[top_xy][4* 7]);
+            AV_COPY32(&h->non_zero_count_cache[4+8*10], &h->non_zero_count[top_xy][4*11]);
+        }else{
+            AV_COPY32(&h->non_zero_count_cache[4+8* 5], &h->non_zero_count[top_xy][4* 5]);
+            AV_COPY32(&h->non_zero_count_cache[4+8*10], &h->non_zero_count[top_xy][4* 9]);
+        }
+    }else{
+        uint32_t top_empty = CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040;
+        AV_WN32A(&h->non_zero_count_cache[4+8* 0], top_empty);
+        AV_WN32A(&h->non_zero_count_cache[4+8* 5], top_empty);
+        AV_WN32A(&h->non_zero_count_cache[4+8*10], top_empty);
     }
 
     for (i=0; i<2; i++) {
         if(left_type[i]){
-            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]];
-            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]];
-                h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]];
-                h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]];
+            h->non_zero_count_cache[3+8* 1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]];
+            h->non_zero_count_cache[3+8* 2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]];
+            if(CHROMA444){
+                h->non_zero_count_cache[3+8* 6 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]+4*4];
+                h->non_zero_count_cache[3+8* 7 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]+4*4];
+                h->non_zero_count_cache[3+8*11 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]+8*4];
+                h->non_zero_count_cache[3+8*12 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]+8*4];
+            }else{
+                h->non_zero_count_cache[3+8* 6 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]];
+                h->non_zero_count_cache[3+8*11 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]];
+            }
         }else{
-                h->non_zero_count_cache[3+8*1 + 2*8*i]=
-                h->non_zero_count_cache[3+8*2 + 2*8*i]=
-                h->non_zero_count_cache[0+8*1 +   8*i]=
-                h->non_zero_count_cache[0+8*4 +   8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64;
+            h->non_zero_count_cache[3+8* 1 + 2*8*i]=
+            h->non_zero_count_cache[3+8* 2 + 2*8*i]=
+            h->non_zero_count_cache[3+8* 6 + 2*8*i]=
+            h->non_zero_count_cache[3+8* 7 + 2*8*i]=
+            h->non_zero_count_cache[3+8*11 + 2*8*i]=
+            h->non_zero_count_cache[3+8*12 + 2*8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64;
         }
     }
 
@@ -981,15 +991,15 @@ static void fill_decode_caches(H264Context *h, int mb_type){
         if(top_type) {
             h->top_cbp = h->cbp_table[top_xy];
         } else {
-            h->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
+            h->top_cbp = IS_INTRA(mb_type) ? 0x7CF : 0x00F;
         }
         // left_cbp
         if (left_type[0]) {
-            h->left_cbp = (h->cbp_table[left_xy[0]] & 0x1f0)
+            h->left_cbp =   (h->cbp_table[left_xy[0]] & 0x7F0)
                         |  ((h->cbp_table[left_xy[0]]>>(left_block[0]&(~1)))&2)
                         | (((h->cbp_table[left_xy[1]]>>(left_block[2]&(~1)))&2) << 2);
         } else {
-            h->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
+            h->left_cbp = IS_INTRA(mb_type) ? 0x7CF : 0x00F;
         }
     }
     }
@@ -1190,11 +1200,21 @@ static inline int pred_intra_mode(H264Context *h, int n){
 static inline void write_back_non_zero_count(H264Context *h){
     const int mb_xy= h->mb_xy;
 
-    AV_COPY64(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[0+8*1]);
-    AV_COPY64(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[0+8*2]);
-    AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[0+8*5]);
-    AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8*3]);
-    AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]);
+    AV_COPY32(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[4+8* 1]);
+    AV_COPY32(&h->non_zero_count[mb_xy][ 4], &h->non_zero_count_cache[4+8* 2]);
+    AV_COPY32(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[4+8* 3]);
+    AV_COPY32(&h->non_zero_count[mb_xy][12], &h->non_zero_count_cache[4+8* 4]);
+    AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[4+8* 6]);
+    AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8* 7]);
+    AV_COPY32(&h->non_zero_count[mb_xy][32], &h->non_zero_count_cache[4+8*11]);
+    AV_COPY32(&h->non_zero_count[mb_xy][36], &h->non_zero_count_cache[4+8*12]);
+
+    if(CHROMA444){
+        AV_COPY32(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[4+8* 8]);
+        AV_COPY32(&h->non_zero_count[mb_xy][28], &h->non_zero_count_cache[4+8* 9]);
+        AV_COPY32(&h->non_zero_count[mb_xy][40], &h->non_zero_count_cache[4+8*13]);
+        AV_COPY32(&h->non_zero_count[mb_xy][44], &h->non_zero_count_cache[4+8*14]);
+    }
 }
 
 static inline void write_back_motion(H264Context *h, int mb_type){
@@ -1267,8 +1287,7 @@ static void av_unused decode_mb_skip(H264Context *h){
     const int mb_xy= h->mb_xy;
     int mb_type=0;
 
-    memset(h->non_zero_count[mb_xy], 0, 32);
-    memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
+    memset(h->non_zero_count[mb_xy], 0, 48);
 
     if(MB_FIELD)
         mb_type|= MB_TYPE_INTERLACED;
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index 69af1e2ded..f30f4e1c9c 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -45,7 +45,7 @@
 
 /* Cabac pre state table */
 
-static const int8_t cabac_context_init_I[460][2] =
+static const int8_t cabac_context_init_I[1024][2] =
 {
     /* 0 - 10 */
     { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
@@ -211,10 +211,153 @@ static const int8_t cabac_context_init_I[460][2] =
     { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
     {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
     {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
+
+    /* 460 -> 1024 */
+    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
+    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
+    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
+    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
+    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
+    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
+    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
+    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
+    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
+    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
+    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
+    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
+    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
+    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
+    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
+    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
+    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
+    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
+    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
+    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
+    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
+    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
+    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
+    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
+    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
+    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
+    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
+    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
+    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
+    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
+    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
+    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
+    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
+    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
+    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
+    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
+    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
+    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
+    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
+    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
+    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
+    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
+    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
+    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
+    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
+    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
+    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
+    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
+    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
+    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
+    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
+    { -23,  68 }, { -24,  50 }, { -11,  74 }, { -14, 106 },
+    { -13,  97 }, { -15,  90 }, { -12,  90 }, { -18,  88 },
+    { -10,  73 }, {  -9,  79 }, { -14,  86 }, { -10,  73 },
+    { -10,  70 }, { -10,  69 }, {  -5,  66 }, {  -9,  64 },
+    {  -5,  58 }, {   2,  59 }, {  23, -13 }, {  26, -13 },
+    {  40, -15 }, {  49, -14 }, {  44,   3 }, {  45,   6 },
+    {  44,  34 }, {  33,  54 }, {  19,  82 }, {  21, -10 },
+    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
+    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
+    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
+    {   0,  68 }, {  -9,  92 }, { -17, 120 }, { -20, 112 },
+    { -18, 114 }, { -11,  85 }, { -15,  92 }, { -14,  89 },
+    { -26,  71 }, { -15,  81 }, { -14,  80 }, {   0,  68 },
+    { -14,  70 }, { -24,  56 }, { -23,  68 }, { -24,  50 },
+    { -11,  74 }, { -14, 106 }, { -13,  97 }, { -15,  90 },
+    { -12,  90 }, { -18,  88 }, { -10,  73 }, {  -9,  79 },
+    { -14,  86 }, { -10,  73 }, { -10,  70 }, { -10,  69 },
+    {  -5,  66 }, {  -9,  64 }, {  -5,  58 }, {   2,  59 },
+    {  23, -13 }, {  26, -13 }, {  40, -15 }, {  49, -14 },
+    {  44,   3 }, {  45,   6 }, {  44,  34 }, {  33,  54 },
+    {  19,  82 }, {  21, -10 }, {  24, -11 }, {  28,  -8 },
+    {  28,  -1 }, {  29,   3 }, {  29,   9 }, {  35,  20 },
+    {  29,  36 }, {  14,  67 }, {  -3,  75 }, {  -1,  23 },
+    {   1,  34 }, {   1,  43 }, {   0,  54 }, {  -2,  55 },
+    {   0,  61 }, {   1,  64 }, {   0,  68 }, {  -9,  92 },
+    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
+    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
+    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
+    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
+    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
+    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
+    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
+    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
+    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
+    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
+    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
+    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
+    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
+    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
+    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
+    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
+    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
+    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
+    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
+    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
+    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
+    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
+    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
+    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
+    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
+    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
+    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
+    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
+    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
+    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
+    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
+    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
+    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
+    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
+    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
+    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
+    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
+    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
+    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
+    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
+    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
+    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
+    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
+    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
+    {  -3,  71 }, {  -6,  42 }, {  -5,  50 }, {  -3,  54 },
+    {  -2,  62 }, {   0,  58 }, {   1,  63 }, {  -2,  72 },
+    {  -1,  74 }, {  -9,  91 }, {  -5,  67 }, {  -5,  27 },
+    {  -3,  39 }, {  -2,  44 }, {   0,  46 }, { -16,  64 },
+    {  -8,  68 }, { -10,  78 }, {  -6,  77 }, { -10,  86 },
+    { -12,  92 }, { -15,  55 }, { -10,  60 }, {  -6,  62 },
+    {  -4,  65 }, { -12,  73 }, {  -8,  76 }, {  -7,  80 },
+    {  -9,  88 }, { -17, 110 }, {  -3,  71 }, {  -6,  42 },
+    {  -5,  50 }, {  -3,  54 }, {  -2,  62 }, {   0,  58 },
+    {   1,  63 }, {  -2,  72 }, {  -1,  74 }, {  -9,  91 },
+    {  -5,  67 }, {  -5,  27 }, {  -3,  39 }, {  -2,  44 },
+    {   0,  46 }, { -16,  64 }, {  -8,  68 }, { -10,  78 },
+    {  -6,  77 }, { -10,  86 }, { -12,  92 }, { -15,  55 },
+    { -10,  60 }, {  -6,  62 }, {  -4,  65 }, { -12,  73 },
+    {  -8,  76 }, {  -7,  80 }, {  -9,  88 }, { -17, 110 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 }
 };
 
-static const int8_t cabac_context_init_PB[3][460][2] =
+static const int8_t cabac_context_init_PB[3][1024][2] =
 {
     /* i_cabac_init_idc == 0 */
     {
@@ -370,6 +513,149 @@ static const int8_t cabac_context_init_PB[3][460][2] =
         { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
         {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
         {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
+
+        /* 460 - 1024 */
+        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
+        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
+        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
+        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
+        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
+        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
+        { -16,  66 }, { -22,  65 }, { -20,  63 }, {  -5,  85 },
+        {  -6,  81 }, { -10,  77 }, {  -7,  81 }, { -17,  80 },
+        { -18,  73 }, {  -4,  74 }, { -10,  83 }, {  -9,  71 },
+        {  -9,  67 }, {  -1,  61 }, {  -8,  66 }, { -14,  66 },
+        {   0,  59 }, {   2,  59 }, {   9,  -2 }, {  26,  -9 },
+        {  33,  -9 }, {  39,  -7 }, {  41,  -2 }, {  45,   3 },
+        {  49,   9 }, {  45,  27 }, {  36,  59 }, {  21, -13 },
+        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
+        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
+        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
+        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
+        {  -8,  66 }, {  -8,  76 }, {  -4,  79 }, {  -7,  71 },
+        {  -5,  69 }, {  -9,  70 }, {  -8,  66 }, { -10,  68 },
+        { -19,  73 }, { -12,  69 }, { -16,  70 }, { -15,  67 },
+        { -20,  62 }, { -19,  70 }, { -16,  66 }, { -22,  65 },
+        { -20,  63 }, {  -5,  85 }, {  -6,  81 }, { -10,  77 },
+        {  -7,  81 }, { -17,  80 }, { -18,  73 }, {  -4,  74 },
+        { -10,  83 }, {  -9,  71 }, {  -9,  67 }, {  -1,  61 },
+        {  -8,  66 }, { -14,  66 }, {   0,  59 }, {   2,  59 },
+        {   9,  -2 }, {  26,  -9 }, {  33,  -9 }, {  39,  -7 },
+        {  41,  -2 }, {  45,   3 }, {  49,   9 }, {  45,  27 },
+        {  36,  59 }, {  21, -13 }, {  33, -14 }, {  39,  -7 },
+        {  46,  -2 }, {  51,   2 }, {  60,   6 }, {  61,  17 },
+        {  55,  34 }, {  42,  62 }, {  -6,  66 }, {  -7,  35 },
+        {  -7,  42 }, {  -8,  45 }, {  -5,  48 }, { -12,  56 },
+        {  -6,  60 }, {  -5,  62 }, {  -8,  66 }, {  -8,  76 },
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
+        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
+        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
+        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
+        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
+        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
+        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
+        {  -3,  74 }, { -10,  90 }, {  -6,  76 }, {  -2,  44 },
+        {   0,  45 }, {   0,  52 }, {  -3,  64 }, {  -2,  59 },
+        {  -4,  70 }, {  -4,  75 }, {  -8,  82 }, { -17, 102 },
+        {  -9,  77 }, {   3,  24 }, {   0,  42 }, {   0,  48 },
+        {   0,  55 }, {  -6,  59 }, {  -7,  71 }, { -12,  83 },
+        { -11,  87 }, { -30, 119 }, {   1,  58 }, {  -3,  29 },
+        {  -1,  36 }, {   1,  38 }, {   2,  43 }, {  -6,  55 },
+        {   0,  58 }, {   0,  64 }, {  -3,  74 }, { -10,  90 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 }
     },
 
     /* i_cabac_init_idc == 1 */
@@ -526,6 +812,149 @@ static const int8_t cabac_context_init_PB[3][460][2] =
         {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
         {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
         {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+
+        /* 460 - 1024 */
+        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
+        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
+        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
+        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  -3,  81 },
+        {  -3,  76 }, {  -7,  72 }, {  -6,  78 }, { -12,  72 },
+        { -14,  68 }, {  -3,  70 }, {  -6,  76 }, {  -5,  66 },
+        {  -5,  62 }, {   0,  57 }, {  -4,  61 }, {  -9,  60 },
+        {   1,  54 }, {   2,  58 }, {  17, -10 }, {  32, -13 },
+        {  42,  -9 }, {  49,  -5 }, {  53,   0 }, {  64,   3 },
+        {  68,  10 }, {  66,  27 }, {  47,  57 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
+        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
+        {  -4,  67 }, {  -7,  82 }, {  -5,  85 }, {  -6,  81 },
+        { -10,  77 }, {  -7,  81 }, { -17,  80 }, { -18,  73 },
+        {  -4,  74 }, { -10,  83 }, {  -9,  71 }, {  -9,  67 },
+        {  -1,  61 }, {  -8,  66 }, { -14,  66 }, {   0,  59 },
+        {   2,  59 }, {  -3,  81 }, {  -3,  76 }, {  -7,  72 },
+        {  -6,  78 }, { -12,  72 }, { -14,  68 }, {  -3,  70 },
+        {  -6,  76 }, {  -5,  66 }, {  -5,  62 }, {   0,  57 },
+        {  -4,  61 }, {  -9,  60 }, {   1,  54 }, {   2,  58 },
+        {  17, -10 }, {  32, -13 }, {  42,  -9 }, {  49,  -5 },
+        {  53,   0 }, {  64,   3 }, {  68,  10 }, {  66,  27 },
+        {  47,  57 }, {  17, -10 }, {  32, -13 }, {  42,  -9 },
+        {  49,  -5 }, {  53,   0 }, {  64,   3 }, {  68,  10 },
+        {  66,  27 }, {  47,  57 }, {  -5,  71 }, {   0,  24 },
+        {  -1,  36 }, {  -2,  42 }, {  -2,  52 }, {  -9,  57 },
+        {  -6,  63 }, {  -4,  65 }, {  -4,  67 }, {  -7,  82 },
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
+        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
+        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
+        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
+        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
+        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
+        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
+        {  -5,  74 }, {  -9,  86 }, { -23, 112 }, { -15,  71 },
+        {  -7,  61 }, {   0,  53 }, {  -5,  66 }, { -11,  77 },
+        {  -9,  80 }, {  -9,  84 }, { -10,  87 }, { -34, 127 },
+        { -21, 101 }, {  -3,  39 }, {  -5,  53 }, {  -7,  61 },
+        { -11,  75 }, { -15,  77 }, { -17,  91 }, { -25, 107 },
+        { -25, 111 }, { -28, 122 }, { -11,  76 }, { -10,  44 },
+        { -10,  52 }, { -10,  57 }, {  -9,  58 }, { -16,  72 },
+        {  -7,  69 }, {  -4,  69 }, {  -5,  74 }, {  -9,  86 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 }
     },
 
     /* i_cabac_init_idc == 2 */
@@ -682,6 +1111,149 @@ static const int8_t cabac_context_init_PB[3][460][2] =
         { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
         {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
         {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+
+        /* 460 - 1024 */
+        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
+        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
+        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
+        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {  -3,  78 },
+        {  -8,  74 }, {  -9,  72 }, { -10,  72 }, { -18,  75 },
+        { -12,  71 }, { -11,  63 }, {  -5,  70 }, { -17,  75 },
+        { -14,  72 }, { -16,  67 }, {  -8,  53 }, { -14,  59 },
+        {  -9,  52 }, { -11,  68 }, {   9,  -2 }, {  30, -10 },
+        {  31,  -4 }, {  33,  -1 }, {  33,   7 }, {  31,  12 },
+        {  37,  23 }, {  31,  38 }, {  20,  64 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
+        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
+        {  -6,  68 }, { -10,  79 }, {  -3,  78 }, {  -8,  74 },
+        {  -9,  72 }, { -10,  72 }, { -18,  75 }, { -12,  71 },
+        { -11,  63 }, {  -5,  70 }, { -17,  75 }, { -14,  72 },
+        { -16,  67 }, {  -8,  53 }, { -14,  59 }, {  -9,  52 },
+        { -11,  68 }, {  -3,  78 }, {  -8,  74 }, {  -9,  72 },
+        { -10,  72 }, { -18,  75 }, { -12,  71 }, { -11,  63 },
+        {  -5,  70 }, { -17,  75 }, { -14,  72 }, { -16,  67 },
+        {  -8,  53 }, { -14,  59 }, {  -9,  52 }, { -11,  68 },
+        {   9,  -2 }, {  30, -10 }, {  31,  -4 }, {  33,  -1 },
+        {  33,   7 }, {  31,  12 }, {  37,  23 }, {  31,  38 },
+        {  20,  64 }, {   9,  -2 }, {  30, -10 }, {  31,  -4 },
+        {  33,  -1 }, {  33,   7 }, {  31,  12 }, {  37,  23 },
+        {  31,  38 }, {  20,  64 }, {  -9,  71 }, {  -7,  37 },
+        {  -8,  44 }, { -11,  49 }, { -10,  56 }, { -12,  59 },
+        {  -8,  63 }, {  -9,  67 }, {  -6,  68 }, { -10,  79 },
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
+        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
+        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
+        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
+        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
+        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
+        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
+        { -12,  92 }, { -18, 108 }, { -24, 115 }, { -22,  82 },
+        {  -9,  62 }, {   0,  53 }, {   0,  59 }, { -14,  85 },
+        { -13,  89 }, { -13,  94 }, { -11,  92 }, { -29, 127 },
+        { -21, 100 }, { -14,  57 }, { -12,  67 }, { -11,  71 },
+        { -10,  77 }, { -21,  85 }, { -16,  88 }, { -23, 104 },
+        { -15,  98 }, { -37, 127 }, { -10,  82 }, {  -8,  48 },
+        {  -8,  61 }, {  -8,  66 }, {  -7,  70 }, { -14,  75 },
+        { -10,  79 }, {  -9,  83 }, { -12,  92 }, { -18, 108 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 }
     }
 };
 
@@ -695,7 +1267,7 @@ void ff_h264_init_cabac_states(H264Context *h) {
     else                                 tab = cabac_context_init_PB[h->cabac_init_idc];
 
     /* calculate pre-state */
-    for( i= 0; i < 460; i++ ) {
+    for( i= 0; i < 1024; i++ ) {
         int pre = 2*(((tab[i][0] * slice_qp) >>4 ) + tab[i][1]) - 127;
 
         pre^= pre>>31;
@@ -957,21 +1529,22 @@ static int decode_cabac_mb_mvd( H264Context *h, int ctxbase, int amvd, int *mvda
     my += decode_cabac_mb_mvd( h, 47, amvd1, &mpy );\
 }
 
-static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
+static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int max_coeff, int is_dc ) {
     int nza, nzb;
     int ctx = 0;
+    static const uint16_t base_ctx[14] = {85,89,93,97,101,1012,460,464,468,1016,472,476,480,1020};
 
     if( is_dc ) {
-        if( cat == 0 ) {
-            nza = h->left_cbp&0x100;
-            nzb = h-> top_cbp&0x100;
-        } else {
+        if( cat == 3 ) {
             idx -= CHROMA_DC_BLOCK_INDEX;
             nza = (h->left_cbp>>(6+idx))&0x01;
             nzb = (h-> top_cbp>>(6+idx))&0x01;
+        } else {
+            idx -= LUMA_DC_BLOCK_INDEX;
+            nza = h->left_cbp&(0x100<<idx);
+            nzb = h-> top_cbp&(0x100<<idx);
         }
     } else {
-        assert(cat == 1 || cat == 2 || cat == 4);
         nza = h->non_zero_count_cache[scan8[idx] - 1];
         nzb = h->non_zero_count_cache[scan8[idx] - 8];
     }
@@ -982,7 +1555,7 @@ static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx,
     if( nzb > 0 )
         ctx += 2;
 
-    return ctx + 4 * cat;
+    return base_ctx[cat] + ctx;
 }
 
 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
@@ -993,16 +1566,16 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
 };
 
 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
-    static const int significant_coeff_flag_offset[2][6] = {
-      { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
-      { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
+    static const int significant_coeff_flag_offset[2][14] = {
+      { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
+      { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
     };
-    static const int last_coeff_flag_offset[2][6] = {
-      { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
-      { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
+    static const int last_coeff_flag_offset[2][14] = {
+      { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 },
+      { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 }
     };
-    static const int coeff_abs_level_m1_offset[6] = {
-        227+0, 227+10, 227+20, 227+30, 227+39, 426
+    static const int coeff_abs_level_m1_offset[14] = {
+        227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766
     };
     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
@@ -1057,7 +1630,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
     abs_level_m1_ctx_base = h->cabac_state
         + coeff_abs_level_m1_offset[cat];
 
-    if( !is_dc && cat == 5 ) {
+    if( !is_dc && max_coeff == 64 ) {
 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
         for(last= 0; last < coefs; last++) { \
             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
@@ -1075,9 +1648,11 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
         }
         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
-        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
+        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index,
+                                                 last_coeff_ctx_base-significant_coeff_ctx_base, sig_off);
     } else {
-        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
+        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index,
+                                             last_coeff_ctx_base-significant_coeff_ctx_base);
 #else
         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
     } else {
@@ -1087,16 +1662,16 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
     assert(coeff_count > 0);
 
     if( is_dc ) {
-        if( cat == 0 )
-            h->cbp_table[h->mb_xy] |= 0x100;
-        else
+        if( cat == 3 )
             h->cbp_table[h->mb_xy] |= 0x40 << (n - CHROMA_DC_BLOCK_INDEX);
+        else
+            h->cbp_table[h->mb_xy] |= 0x100 << (n - LUMA_DC_BLOCK_INDEX);
         h->non_zero_count_cache[scan8[n]] = coeff_count;
     } else {
-        if( cat == 5 )
+        if( max_coeff == 64 )
             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
         else {
-            assert( cat == 1 || cat == 2 || cat == 4 );
+            assert( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
             h->non_zero_count_cache[scan8[n]] = coeff_count;
         }
     }
@@ -1179,7 +1754,7 @@ static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block
 
 static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
     /* read coded block flag */
-    if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, 1 ) ] ) == 0 ) {
+    if( get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 1 ) ] ) == 0 ) {
         h->non_zero_count_cache[scan8[n]] = 0;
         return;
     }
@@ -1188,13 +1763,68 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *
 
 static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
     /* read coded block flag */
-    if( cat != 5 && get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, 0 ) ] ) == 0 ) {
-        h->non_zero_count_cache[scan8[n]] = 0;
+    if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) {
+        if( max_coeff == 64 ) {
+            fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 0, 1);
+        } else {
+            h->non_zero_count_cache[scan8[n]] = 0;
+        }
         return;
     }
     decode_cabac_residual_nondc_internal( h, block, cat, n, scantable, qmul, max_coeff );
 }
 
+static av_always_inline void decode_cabac_luma_residual( H264Context *h, const uint8_t *scan, const uint8_t *scan8x8, int pixel_shift, int mb_type, int cbp, int p )
+{
+    static const uint8_t ctx_cat[4][3] = {{0,6,10},{1,7,11},{2,8,12},{5,9,13}};
+    const uint32_t *qmul;
+    int i8x8, i4x4;
+    MpegEncContext * const s = &h->s;
+    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
+    if( IS_INTRA16x16( mb_type ) ) {
+        //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
+        AV_ZERO128(h->mb_luma_dc[p]+0);
+        AV_ZERO128(h->mb_luma_dc[p]+8);
+        AV_ZERO128(h->mb_luma_dc[p]+16);
+        AV_ZERO128(h->mb_luma_dc[p]+24);
+        decode_cabac_residual_dc(h, h->mb_luma_dc[p], ctx_cat[0][p], LUMA_DC_BLOCK_INDEX+p, scan, 16);
+
+        if( cbp&15 ) {
+            qmul = h->dequant4_coeff[p][qscale];
+            for( i4x4 = 0; i4x4 < 16; i4x4++ ) {
+                const int index = 16*p + i4x4;
+                //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", index );
+                decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[1][p], index, scan + 1, qmul, 15);
+            }
+        } else {
+            fill_rectangle(&h->non_zero_count_cache[scan8[16*p]], 4, 4, 8, 0, 1);
+        }
+    } else {
+        int cqm = (IS_INTRA( mb_type ) ? 0:3) + p;
+        for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
+            if( cbp & (1<<i8x8) ) {
+                if( IS_8x8DCT(mb_type) ) {
+                    const int index = 16*p + 4*i8x8;
+                    decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[3][p], index,
+                                                scan8x8, h->dequant8_coeff[cqm][qscale], 64);
+                } else {
+                    qmul = h->dequant4_coeff[cqm][qscale];
+                    for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
+                        const int index = 16*p + 4*i8x8 + i4x4;
+                        //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
+//START_TIMER
+                        decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[2][p], index, scan, qmul, 16);
+//STOP_TIMER("decode_residual")
+                    }
+                }
+            } else {
+                uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8+16*p] ];
+                nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+            }
+        }
+    }
+}
+
 /**
  * decodes a macroblock
  * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
@@ -1204,6 +1834,7 @@ int ff_h264_decode_mb_cabac(H264Context *h) {
     int mb_xy;
     int mb_type, partition_count, cbp = 0;
     int dct8x8_allowed= h->pps.transform_8x8_mode;
+    int decode_chroma = h->sps.chroma_format_idc == 1 || h->sps.chroma_format_idc == 2;
     const int pixel_shift = h->pixel_shift;
 
     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
@@ -1313,7 +1944,8 @@ decode_intra_mb:
     h->slice_table[ mb_xy ]= h->slice_num;
 
     if(IS_INTRA_PCM(mb_type)) {
-        const int mb_size = (384*h->sps.bit_depth_luma) >> 3;
+        static const uint16_t mb_sizes[4] = {256,384,512,768};
+        const int mb_size = mb_sizes[h->sps.chroma_format_idc]*h->sps.bit_depth_luma >> 3;
         const uint8_t *ptr;
 
         // We assume these blocks are very rare so we do not optimize it.
@@ -1326,20 +1958,17 @@ decode_intra_mb:
         }
 
         // The pixels are stored in the same order as levels in h->mb array.
-        memcpy(h->mb, ptr, 2*mb_size/3); ptr+=2*mb_size/3;
-        if(CHROMA){
-            memcpy(h->mb+mb_size/3, ptr, mb_size/3); ptr+=mb_size/3;
-        }
+        memcpy(h->mb, ptr, mb_size); ptr+=mb_size;
 
         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
 
         // All blocks are present
-        h->cbp_table[mb_xy] = 0x1ef;
+        h->cbp_table[mb_xy] = 0xf7ef;
         h->chroma_pred_mode_table[mb_xy] = 0;
         // In deblocking, the quantizer is 0
         s->current_picture.qscale_table[mb_xy]= 0;
         // All coeffs are present
-        memset(h->non_zero_count[mb_xy], 16, 32);
+        memset(h->non_zero_count[mb_xy], 16, 48);
         s->current_picture.mb_type[mb_xy]= mb_type;
         h->last_qscale_diff = 0;
         return 0;
@@ -1376,7 +2005,7 @@ decode_intra_mb:
             h->intra16x16_pred_mode= ff_h264_check_intra_pred_mode( h, h->intra16x16_pred_mode );
             if( h->intra16x16_pred_mode < 0 ) return -1;
         }
-        if(CHROMA){
+        if(decode_chroma){
             h->chroma_pred_mode_table[mb_xy] =
             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
 
@@ -1605,7 +2234,7 @@ decode_intra_mb:
 
     if( !IS_INTRA16x16( mb_type ) ) {
         cbp  = decode_cabac_mb_cbp_luma( h );
-        if(CHROMA)
+        if(decode_chroma)
             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
     }
 
@@ -1614,6 +2243,28 @@ decode_intra_mb:
     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
         mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
     }
+
+    /* It would be better to do this in fill_decode_caches, but we don't know
+     * the transform mode of the current macroblock there. */
+    if (CHROMA444 && IS_8x8DCT(mb_type)){
+        int i;
+        for (i = 0; i < 2; i++){
+            if (h->left_type[i] && !IS_8x8DCT(h->left_type[i])){
+                h->non_zero_count_cache[3+8* 1 + 2*8*i]=
+                h->non_zero_count_cache[3+8* 2 + 2*8*i]=
+                h->non_zero_count_cache[3+8* 6 + 2*8*i]=
+                h->non_zero_count_cache[3+8* 7 + 2*8*i]=
+                h->non_zero_count_cache[3+8*11 + 2*8*i]=
+                h->non_zero_count_cache[3+8*12 + 2*8*i]= IS_INTRA(mb_type) ? 64 : 0;
+            }
+        }
+        if (h->top_type && !IS_8x8DCT(h->top_type)){
+            uint32_t top_empty = CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040;
+            AV_WN32A(&h->non_zero_count_cache[4+8* 0], top_empty);
+            AV_WN32A(&h->non_zero_count_cache[4+8* 5], top_empty);
+            AV_WN32A(&h->non_zero_count_cache[4+8*10], top_empty);
+        }
+    }
     s->current_picture.mb_type[mb_xy]= mb_type;
 
     if( cbp || IS_INTRA16x16( mb_type ) ) {
@@ -1658,76 +2309,38 @@ decode_intra_mb:
         }else
             h->last_qscale_diff=0;
 
-        if( IS_INTRA16x16( mb_type ) ) {
-            int i;
-            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
-            AV_ZERO128(h->mb_luma_dc+0);
-            AV_ZERO128(h->mb_luma_dc+8);
-            AV_ZERO128(h->mb_luma_dc+16);
-            AV_ZERO128(h->mb_luma_dc+24);
-            decode_cabac_residual_dc( h, h->mb_luma_dc, 0, LUMA_DC_BLOCK_INDEX, scan, 16);
+        decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 0);
+        if(CHROMA444){
+            decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1);
+            decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2);
+        } else {
+            if( cbp&0x30 ){
+                int c;
+                for( c = 0; c < 2; c++ ) {
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
+                    decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
+                }
+            }
 
-            if( cbp&15 ) {
-                qmul = h->dequant4_coeff[0][s->qscale];
-                for( i = 0; i < 16; i++ ) {
-                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
-                    decode_cabac_residual_nondc(h, h->mb + (16*i << pixel_shift), 1, i, scan + 1, qmul, 15);
+            if( cbp&0x20 ) {
+                int c, i;
+                for( c = 0; c < 2; c++ ) {
+                    qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
+                    for( i = 0; i < 4; i++ ) {
+                        const int index = 16 + 16 * c + i;
+                        //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
+                        decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
+                    }
                 }
             } else {
-                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
             }
-        } else {
-            int i8x8, i4x4;
-            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
-                if( cbp & (1<<i8x8) ) {
-                    if( IS_8x8DCT(mb_type) ) {
-                        decode_cabac_residual_nondc(h, h->mb + (64*i8x8 << pixel_shift), 5, 4*i8x8,
-                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
-                    } else {
-                        qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
-                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
-                            const int index = 4*i8x8 + i4x4;
-                            //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
-//START_TIMER
-                            decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 2, index, scan, qmul, 16);
-//STOP_TIMER("decode_residual")
-                        }
-                    }
-                } else {
-                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
-                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
-                }
-            }
-        }
-
-        if( cbp&0x30 ){
-            int c;
-            for( c = 0; c < 2; c++ ) {
-                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
-                decode_cabac_residual_dc(h, h->mb + ((256 + 16*4*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
-            }
-        }
-
-        if( cbp&0x20 ) {
-            int c, i;
-            for( c = 0; c < 2; c++ ) {
-                qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
-                for( i = 0; i < 4; i++ ) {
-                    const int index = 16 + 4 * c + i;
-                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
-                    decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
-                }
-            }
-        } else {
-            uint8_t * const nnz= &h->non_zero_count_cache[0];
-            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
         }
     } else {
-        uint8_t * const nnz= &h->non_zero_count_cache[0];
-        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
-        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
+        fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
+        fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
         h->last_qscale_diff = 0;
     }
 
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index 2e5ea54679..497166b423 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -371,12 +371,12 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
 
     //FIXME put trailing_onex into the context
 
-    if(n >= CHROMA_DC_BLOCK_INDEX){
+    if(max_coeff <= 8){
         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
         total_coeff= coeff_token>>2;
     }else{
-        if(n == LUMA_DC_BLOCK_INDEX){
-            total_coeff= pred_non_zero_count(h, 0);
+        if(n >= LUMA_DC_BLOCK_INDEX){
+            total_coeff= pred_non_zero_count(h, (n - LUMA_DC_BLOCK_INDEX)*16);
             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
             total_coeff= coeff_token>>2;
         }else{
@@ -482,7 +482,8 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
     if(total_coeff == max_coeff)
         zeros_left=0;
     else{
-        if(n >= CHROMA_DC_BLOCK_INDEX)
+        /* FIXME: we don't actually support 4:2:2 yet. */
+        if(max_coeff <= 8)
             zeros_left= get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[ total_coeff ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
         else
             zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
@@ -536,12 +537,80 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
     return 0;
 }
 
+static av_always_inline int decode_luma_residual(H264Context *h, GetBitContext *gb, const uint8_t *scan, const uint8_t *scan8x8, int pixel_shift, int mb_type, int cbp, int p){
+    int i4x4, i8x8;
+    MpegEncContext * const s = &h->s;
+    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
+    if(IS_INTRA16x16(mb_type)){
+        AV_ZERO128(h->mb_luma_dc[p]+0);
+        AV_ZERO128(h->mb_luma_dc[p]+8);
+        AV_ZERO128(h->mb_luma_dc[p]+16);
+        AV_ZERO128(h->mb_luma_dc[p]+24);
+        if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc[p], LUMA_DC_BLOCK_INDEX+p, scan, NULL, 16) < 0){
+            return -1; //FIXME continue if partitioned and other return -1 too
+        }
+
+        assert((cbp&15) == 0 || (cbp&15) == 15);
+
+        if(cbp&15){
+            for(i8x8=0; i8x8<4; i8x8++){
+                for(i4x4=0; i4x4<4; i4x4++){
+                    const int index= i4x4 + 4*i8x8 + p*16;
+                    if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index << pixel_shift),
+                        index, scan + 1, h->dequant4_coeff[p][qscale], 15) < 0 ){
+                        return -1;
+                    }
+                }
+            }
+            return 0xf;
+        }else{
+            fill_rectangle(&h->non_zero_count_cache[scan8[p*16]], 4, 4, 8, 0, 1);
+            return 0;
+        }
+    }else{
+        int cqm = (IS_INTRA( mb_type ) ? 0:3)+p;
+        /* For CAVLC 4:4:4, we need to keep track of the luma 8x8 CBP for deblocking nnz purposes. */
+        int new_cbp = 0;
+        for(i8x8=0; i8x8<4; i8x8++){
+            if(cbp & (1<<i8x8)){
+                if(IS_8x8DCT(mb_type)){
+                    DCTELEM *buf = &h->mb[64*i8x8+256*p << pixel_shift];
+                    uint8_t *nnz;
+                    for(i4x4=0; i4x4<4; i4x4++){
+                        const int index= i4x4 + 4*i8x8 + p*16;
+                        if( decode_residual(h, gb, buf, index, scan8x8+16*i4x4,
+                                            h->dequant8_coeff[cqm][qscale], 16) < 0 )
+                            return -1;
+                    }
+                    nnz= &h->non_zero_count_cache[ scan8[4*i8x8+p*16] ];
+                    nnz[0] += nnz[1] + nnz[8] + nnz[9];
+                    new_cbp |= !!nnz[0] << i8x8;
+                }else{
+                    for(i4x4=0; i4x4<4; i4x4++){
+                        const int index= i4x4 + 4*i8x8 + p*16;
+                        if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index,
+                                            scan, h->dequant4_coeff[cqm][qscale], 16) < 0 ){
+                            return -1;
+                        }
+                        new_cbp |= h->non_zero_count_cache[ scan8[index] ] << i8x8;
+                    }
+                }
+            }else{
+                uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8+p*16] ];
+                nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+            }
+        }
+        return new_cbp;
+    }
+}
+
 int ff_h264_decode_mb_cavlc(H264Context *h){
     MpegEncContext * const s = &h->s;
     int mb_xy;
     int partition_count;
     unsigned int mb_type, cbp;
     int dct8x8_allowed= h->pps.transform_8x8_mode;
+    int decode_chroma = h->sps.chroma_format_idc == 1 || h->sps.chroma_format_idc == 2;
     const int pixel_shift = h->pixel_shift;
 
     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
@@ -608,19 +677,21 @@ decode_intra_mb:
 
     if(IS_INTRA_PCM(mb_type)){
         unsigned int x;
+        static const uint16_t mb_sizes[4] = {256,384,512,768};
+        const int mb_size = mb_sizes[h->sps.chroma_format_idc]*h->sps.bit_depth_luma >> 3;
 
         // We assume these blocks are very rare so we do not optimize it.
         align_get_bits(&s->gb);
 
         // The pixels are stored in the same order as levels in h->mb array.
-        for(x=0; x < (CHROMA ? 384 : 256)*h->sps.bit_depth_luma/8; x++){
+        for(x=0; x < mb_size; x++){
             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
         }
 
         // In deblocking, the quantizer is 0
         s->current_picture.qscale_table[mb_xy]= 0;
         // All coeffs are present
-        memset(h->non_zero_count[mb_xy], 16, 32);
+        memset(h->non_zero_count[mb_xy], 16, 48);
 
         s->current_picture.mb_type[mb_xy]= mb_type;
         return 0;
@@ -668,7 +739,7 @@ decode_intra_mb:
             if(h->intra16x16_pred_mode < 0)
                 return -1;
         }
-        if(CHROMA){
+        if(decode_chroma){
             pred_mode= ff_h264_check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
             if(pred_mode < 0)
                 return -1;
@@ -896,15 +967,19 @@ decode_intra_mb:
 
     if(!IS_INTRA16x16(mb_type)){
         cbp= get_ue_golomb(&s->gb);
-        if(cbp > 47){
-            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
-            return -1;
-        }
 
-        if(CHROMA){
+        if(decode_chroma){
+            if(cbp > 47){
+                av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
+                return -1;
+            }
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
             else                     cbp= golomb_to_inter_cbp   [cbp];
         }else{
+            if(cbp > 15){
+                av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
+                return -1;
+            }
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
             else                     cbp= golomb_to_inter_cbp_gray[cbp];
         }
@@ -918,8 +993,9 @@ decode_intra_mb:
     s->current_picture.mb_type[mb_xy]= mb_type;
 
     if(cbp || IS_INTRA16x16(mb_type)){
-        int i8x8, i4x4, chroma_idx;
+        int i4x4, chroma_idx;
         int dquant;
+        int ret;
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
         const uint8_t *scan, *scan8x8;
         const int max_qp = 51 + 6*(h->sps.bit_depth_luma-8);
@@ -947,85 +1023,45 @@ decode_intra_mb:
 
         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
-        if(IS_INTRA16x16(mb_type)){
-            AV_ZERO128(h->mb_luma_dc+0);
-            AV_ZERO128(h->mb_luma_dc+8);
-            AV_ZERO128(h->mb_luma_dc+16);
-            AV_ZERO128(h->mb_luma_dc+24);
-            if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
-                return -1; //FIXME continue if partitioned and other return -1 too
+
+        if( (ret = decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 0)) < 0 ){
+            return -1;
+        }
+        h->cbp_table[mb_xy] |= ret << 12;
+        if(CHROMA444){
+            if( decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 1) < 0 ){
+                return -1;
+            }
+            if( decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 2) < 0 ){
+                return -1;
+            }
+        } else {
+            if(cbp&0x30){
+                for(chroma_idx=0; chroma_idx<2; chroma_idx++)
+                    if( decode_residual(h, gb, h->mb + ((256 + 16*16*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
+                        return -1;
+                    }
             }
 
-            assert((cbp&15) == 0 || (cbp&15) == 15);
-
-            if(cbp&15){
-                for(i8x8=0; i8x8<4; i8x8++){
+            if(cbp&0x20){
+                for(chroma_idx=0; chroma_idx<2; chroma_idx++){
+                    const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                     for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= i4x4 + 4*i8x8;
-                        if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index << pixel_shift), index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
+                        const int index= 16 + 16*chroma_idx + i4x4;
+                        if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
                             return -1;
                         }
                     }
                 }
             }else{
-                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
             }
-        }else{
-            for(i8x8=0; i8x8<4; i8x8++){
-                if(cbp & (1<<i8x8)){
-                    if(IS_8x8DCT(mb_type)){
-                        DCTELEM *buf = &h->mb[64*i8x8 << pixel_shift];
-                        uint8_t *nnz;
-                        for(i4x4=0; i4x4<4; i4x4++){
-                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
-                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
-                                return -1;
-                        }
-                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
-                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
-                    }else{
-                        for(i4x4=0; i4x4<4; i4x4++){
-                            const int index= i4x4 + 4*i8x8;
-
-                            if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
-                                return -1;
-                            }
-                        }
-                    }
-                }else{
-                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
-                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
-                }
-            }
-        }
-
-        if(cbp&0x30){
-            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                if( decode_residual(h, gb, h->mb + ((256 + 16*4*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
-                    return -1;
-                }
-        }
-
-        if(cbp&0x20){
-            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
-                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
-                for(i4x4=0; i4x4<4; i4x4++){
-                    const int index= 16 + 4*chroma_idx + i4x4;
-                    if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
-                        return -1;
-                    }
-                }
-            }
-        }else{
-            uint8_t * const nnz= &h->non_zero_count_cache[0];
-            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
         }
     }else{
-        uint8_t * const nnz= &h->non_zero_count_cache[0];
-        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
-        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
-        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
+        fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
+        fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
     }
     s->current_picture.qscale_table[mb_xy]= s->qscale;
     write_back_non_zero_count(h);
diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c
index 72b1905936..d4ecefcf08 100644
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@@ -220,7 +220,7 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
 
     mb_xy = h->mb_xy;
 
-    if(!h->top_type || !h->h264dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff) {
+    if(!h->top_type || !h->h264dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff || CHROMA444) {
         ff_h264_filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
         return;
     }
@@ -353,9 +353,10 @@ static int check_mv(H264Context *h, long b_idx, long bn_idx, int mvy_limit){
     return v;
 }
 
-static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
+static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int chroma444, int dir) {
     MpegEncContext * const s = &h->s;
     int edge;
+    int chroma_qp_avg[2];
     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
     const int mbm_type = dir == 0 ? h->left_type[0] : h->top_type;
 
@@ -394,7 +395,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                         bS[2]= 1+((h->cbp_table[mbn_xy] & 8)||h->non_zero_count_cache[scan8[0]+2]);
                         bS[3]= 1+((h->cbp_table[mbn_xy] & 8)||h->non_zero_count_cache[scan8[0]+3]);
                     }else{
-                    const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy] + 4+3*8;
+                    const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy] + 3*4;
                     int i;
                     for( i = 0; i < 4; i++ ) {
                         bS[i] = 1 + !!(h->non_zero_count_cache[scan8[0]+i] | mbn_nnz[i]);
@@ -407,10 +408,15 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
                 filter_mb_edgeh( &img_y[j*linesize], tmp_linesize, bS, qp, h );
-                filter_mb_edgech( &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
-                                ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1, h);
-                filter_mb_edgech( &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
-                                ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1, h);
+                chroma_qp_avg[0] = (h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+                chroma_qp_avg[1] = (h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+                if (chroma444) {
+                    filter_mb_edgeh (&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
+                    filter_mb_edgeh (&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
+                } else {
+                    filter_mb_edgech(&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
+                    filter_mb_edgech(&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
+                }
             }
         }else{
             DECLARE_ALIGNED(8, int16_t, bS)[4];
@@ -465,23 +471,29 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
                 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
+                chroma_qp_avg[0] = (h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
+                chroma_qp_avg[1] = (h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
                 if( dir == 0 ) {
                     filter_mb_edgev( &img_y[0], linesize, bS, qp, h );
                     {
-                        int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
-                        filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, h);
-                        if(h->pps.chroma_qp_diff)
-                            qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
-                        filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, h);
+                        if (chroma444) {
+                            filter_mb_edgev ( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
+                            filter_mb_edgev ( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
+                        } else {
+                            filter_mb_edgecv( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
+                            filter_mb_edgecv( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
+                        }
                     }
                 } else {
                     filter_mb_edgeh( &img_y[0], linesize, bS, qp, h );
                     {
-                        int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
-                        filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, h);
-                        if(h->pps.chroma_qp_diff)
-                            qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
-                        filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, h);
+                        if (chroma444) {
+                            filter_mb_edgeh ( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
+                            filter_mb_edgeh ( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
+                        } else {
+                            filter_mb_edgech( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
+                            filter_mb_edgech( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
+                        }
                     }
                 }
             }
@@ -545,13 +557,19 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
         //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
         if( dir == 0 ) {
             filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, h );
-            if( (edge&1) == 0 ) {
+            if (chroma444) {
+                filter_mb_edgev ( &img_cb[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
+                filter_mb_edgev ( &img_cr[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
+            } else if( (edge&1) == 0 ) {
                 filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
                 filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
             }
         } else {
             filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h );
-            if( (edge&1) == 0 ) {
+            if (chroma444) {
+                filter_mb_edgeh ( &img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
+                filter_mb_edgeh ( &img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
+            } else if( (edge&1) == 0 ) {
                 filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
                 filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
             }
@@ -589,11 +607,11 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
         } else {
             static const uint8_t offset[2][2][8]={
                 {
-                    {7+8*0, 7+8*0, 7+8*0, 7+8*0, 7+8*1, 7+8*1, 7+8*1, 7+8*1},
-                    {7+8*2, 7+8*2, 7+8*2, 7+8*2, 7+8*3, 7+8*3, 7+8*3, 7+8*3},
+                    {3+4*0, 3+4*0, 3+4*0, 3+4*0, 3+4*1, 3+4*1, 3+4*1, 3+4*1},
+                    {3+4*2, 3+4*2, 3+4*2, 3+4*2, 3+4*3, 3+4*3, 3+4*3, 3+4*3},
                 },{
-                    {7+8*0, 7+8*1, 7+8*2, 7+8*3, 7+8*0, 7+8*1, 7+8*2, 7+8*3},
-                    {7+8*0, 7+8*1, 7+8*2, 7+8*3, 7+8*0, 7+8*1, 7+8*2, 7+8*3},
+                    {3+4*0, 3+4*1, 3+4*2, 3+4*3, 3+4*0, 3+4*1, 3+4*2, 3+4*3},
+                    {3+4*0, 3+4*1, 3+4*2, 3+4*3, 3+4*0, 3+4*1, 3+4*2, 3+4*3},
                 }
             };
             const uint8_t *off= offset[MB_FIELD][mb_y&1];
@@ -650,9 +668,9 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
 
 #if CONFIG_SMALL
     for( dir = 0; dir < 2; dir++ )
-        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
+        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, CHROMA444, dir);
 #else
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, CHROMA444, 0);
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, CHROMA444, 1);
 #endif
 }
diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index a98f14aaf6..9c41e4ca73 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -269,7 +269,7 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
-        fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
+        fallback_sps ? sps->scaling_matrix8[3] : default_scaling8[1]
     };
     if(get_bits1(&s->gb)){
         sps->scaling_matrix_present |= is_sps;
@@ -281,7 +281,15 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
         if(is_sps || pps->transform_8x8_mode){
             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
-            decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
+            if(h->sps.chroma_format_idc == 3){
+                decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[0],scaling_matrix8[0]);  // Intra, Cr
+                decode_scaling_list(h,scaling_matrix8[2],64,default_scaling8[0],scaling_matrix8[1]);  // Intra, Cb
+            }
+            decode_scaling_list(h,scaling_matrix8[3],64,default_scaling8[1],fallback[3]);  // Inter, Y
+            if(h->sps.chroma_format_idc == 3){
+                decode_scaling_list(h,scaling_matrix8[4],64,default_scaling8[1],scaling_matrix8[3]);  // Inter, Cr
+                decode_scaling_list(h,scaling_matrix8[5],64,default_scaling8[1],scaling_matrix8[4]);  // Inter, Cb
+            }
         }
     }
 }
@@ -395,7 +403,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
         if(sps->crop_left || sps->crop_top){
             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
         }
-        if(sps->crop_right >= 8 || sps->crop_bottom >= 8){
+        if(sps->crop_right >= (8<<CHROMA444) || sps->crop_bottom >= (8<<CHROMA444)){
             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
         }
     }else{
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 864c118bb5..6972725781 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -66,10 +66,10 @@ typedef struct H264DSPContext{
     void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
     void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
 
-    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
-    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
+    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
+    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
+    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
     void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
     void (*h264_chroma_dc_dequant_idct)(DCTELEM *block, int qmul);
 }H264DSPContext;
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index 39c9a1c9eb..e7f9af7fb0 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -30,15 +30,19 @@
 #ifndef AVCODEC_H264IDCT_INTERNAL_H
 #define AVCODEC_H264IDCT_INTERNAL_H
 //FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
-static const uint8_t scan8[16 + 2*4]={
- 4+1*8, 5+1*8, 4+2*8, 5+2*8,
- 6+1*8, 7+1*8, 6+2*8, 7+2*8,
- 4+3*8, 5+3*8, 4+4*8, 5+4*8,
- 6+3*8, 7+3*8, 6+4*8, 7+4*8,
- 1+1*8, 2+1*8,
- 1+2*8, 2+2*8,
- 1+4*8, 2+4*8,
- 1+5*8, 2+5*8,
+static const uint8_t scan8[16*3]={
+ 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
+ 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
+ 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
+ 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
+ 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
+ 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
+ 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
+ 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
+ 4+11*8, 5+11*8, 4+12*8, 5+12*8,
+ 6+11*8, 7+11*8, 6+12*8, 7+12*8,
+ 4+13*8, 5+13*8, 4+14*8, 5+14*8,
+ 6+13*8, 7+13*8, 6+14*8, 7+14*8
 };
 #endif
 
@@ -190,7 +194,7 @@ void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, DCTELEM *block, int stride){
     }
 }
 
-void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
     int i;
     for(i=0; i<16; i++){
         int nnz = nnzc[ scan8[i] ];
@@ -201,7 +205,7 @@ void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *b
     }
 }
 
-void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
     int i;
     for(i=0; i<16; i++){
         if(nnzc[ scan8[i] ])             FUNCC(idct_internal      )(dst + block_offset[i], block + i*16*sizeof(pixel), stride, 4, 6, 1);
@@ -209,7 +213,7 @@ void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTEL
     }
 }
 
-void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
     int i;
     for(i=0; i<16; i+=4){
         int nnz = nnzc[ scan8[i] ];
@@ -220,13 +224,15 @@ void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *b
     }
 }
 
-void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
-    int i;
-    for(i=16; i<16+8; i++){
-        if(nnzc[ scan8[i] ])
-            FUNCC(ff_h264_idct_add   )(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride);
-        else if(((dctcoef*)block)[i*16])
-            FUNCC(ff_h264_idct_dc_add)(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride);
+void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
+    int i, j;
+    for(j=1; j<3; j++){
+        for(i=j*16; i<j*16+4; i++){
+            if(nnzc[ scan8[i] ])
+                FUNCC(ff_h264_idct_add   )(dest[j-1] + block_offset[i], block + i*16*sizeof(pixel), stride);
+            else if(((dctcoef*)block)[i*16])
+                FUNCC(ff_h264_idct_dc_add)(dest[j-1] + block_offset[i], block + i*16*sizeof(pixel), stride);
+        }
     }
 }
 /**
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index 6a45da8761..4978d28b49 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -1185,15 +1185,17 @@ void MPV_frame_end(MpegEncContext *s)
        && s->current_picture.reference
        && !s->intra_only
        && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
+            int hshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_w;
+            int vshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_h;
             s->dsp.draw_edges(s->current_picture.data[0], s->linesize  ,
-                              s->h_edge_pos   , s->v_edge_pos   ,
-                              EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
+                              s->h_edge_pos             , s->v_edge_pos,
+                              EDGE_WIDTH        , EDGE_WIDTH        , EDGE_TOP | EDGE_BOTTOM);
             s->dsp.draw_edges(s->current_picture.data[1], s->uvlinesize,
-                              s->h_edge_pos>>1, s->v_edge_pos>>1,
-                              EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
+                              s->h_edge_pos>>hshift, s->v_edge_pos>>vshift,
+                              EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, EDGE_TOP | EDGE_BOTTOM);
             s->dsp.draw_edges(s->current_picture.data[2], s->uvlinesize,
-                              s->h_edge_pos>>1, s->v_edge_pos>>1,
-                              EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
+                              s->h_edge_pos>>hshift, s->v_edge_pos>>vshift,
+                              EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, EDGE_TOP | EDGE_BOTTOM);
     }
 
     emms_c();
@@ -2284,14 +2286,19 @@ void ff_draw_horiz_band(MpegEncContext *s, int y, int h){
        && !s->intra_only
        && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
         int sides = 0, edge_h;
+        int hshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_w;
+        int vshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_h;
         if (y==0) sides |= EDGE_TOP;
         if (y + h >= s->v_edge_pos) sides |= EDGE_BOTTOM;
 
         edge_h= FFMIN(h, s->v_edge_pos - y);
 
-        s->dsp.draw_edges(s->current_picture_ptr->data[0] +  y    *s->linesize  , s->linesize  , s->h_edge_pos   , edge_h   , EDGE_WIDTH  , sides);
-        s->dsp.draw_edges(s->current_picture_ptr->data[1] + (y>>1)*s->uvlinesize, s->uvlinesize, s->h_edge_pos>>1, edge_h>>1, EDGE_WIDTH/2, sides);
-        s->dsp.draw_edges(s->current_picture_ptr->data[2] + (y>>1)*s->uvlinesize, s->uvlinesize, s->h_edge_pos>>1, edge_h>>1, EDGE_WIDTH/2, sides);
+        s->dsp.draw_edges(s->current_picture_ptr->data[0] +  y         *s->linesize  , s->linesize,
+                          s->h_edge_pos        , edge_h        , EDGE_WIDTH        , EDGE_WIDTH        , sides);
+        s->dsp.draw_edges(s->current_picture_ptr->data[1] + (y>>vshift)*s->uvlinesize, s->uvlinesize,
+                          s->h_edge_pos>>hshift, edge_h>>hshift, EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, sides);
+        s->dsp.draw_edges(s->current_picture_ptr->data[2] + (y>>vshift)*s->uvlinesize, s->uvlinesize,
+                          s->h_edge_pos>>hshift, edge_h>>hshift, EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, sides);
     }
 
     h= FFMIN(h, s->avctx->height - y);
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
index 6db0b290ba..28f04f119b 100644
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@@ -1978,13 +1978,13 @@ static int frame_start(SnowContext *s){
     if(s->current_picture.data[0]){
         s->dsp.draw_edges(s->current_picture.data[0],
                           s->current_picture.linesize[0], w   , h   ,
-                          EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
+                          EDGE_WIDTH  , EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
         s->dsp.draw_edges(s->current_picture.data[1],
                           s->current_picture.linesize[1], w>>1, h>>1,
-                          EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
+                          EDGE_WIDTH/2, EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
         s->dsp.draw_edges(s->current_picture.data[2],
                           s->current_picture.linesize[2], w>>1, h>>1,
-                          EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
+                          EDGE_WIDTH/2, EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
     }
 
     release_buffer(s->avctx);
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 1cc6991666..214c6a3945 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -784,7 +784,7 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
 
 /* draw the edges of width 'w' of an image of size width, height
    this mmx version can only handle w==8 || w==16 */
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
 {
     uint8_t *ptr, *last_line;
     int i;
@@ -839,7 +839,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w,
 
     /* top and bottom (and hopefully also the corners) */
     if (sides&EDGE_TOP) {
-        for(i = 0; i < w; i += 4) {
+        for(i = 0; i < h; i += 4) {
             ptr= buf - (i + 1) * wrap - w;
             __asm__ volatile(
                     "1:                             \n\t"
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index c850dc2ef3..b5f77c90d5 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -36,7 +36,7 @@
 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
 static int decode_significance_x86(CABACContext *c, int max_coeff,
                                    uint8_t *significant_coeff_ctx_base,
-                                   int *index){
+                                   int *index, int last_off){
     void *end= significant_coeff_ctx_base + max_coeff - 1;
     int minusstart= -(int)significant_coeff_ctx_base;
     int minusindex= 4-(int)index;
@@ -52,10 +52,12 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
 
         "test $1, %%edx                         \n\t"
         " jz 3f                                 \n\t"
+        "add  %7, %1                            \n\t"
 
-        BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx",
+        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx",
                              "%%bx", "%%esi", "%%eax", "%%al")
 
+        "sub  %7, %1                            \n\t"
         "mov  %2, %%"REG_a"                     \n\t"
         "movl %4, %%ecx                         \n\t"
         "add  %1, %%"REG_c"                     \n\t"
@@ -82,7 +84,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
         "movl %%esi, "RANGE    "(%3)            \n\t"
         "movl %%ebx, "LOW      "(%3)            \n\t"
         :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)
-        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)
+        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off)
         : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
     );
     return coeff_count;
@@ -90,7 +92,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
 
 static int decode_significance_8x8_x86(CABACContext *c,
                                        uint8_t *significant_coeff_ctx_base,
-                                       int *index, const uint8_t *sig_off){
+                                       int *index, int last_off, const uint8_t *sig_off){
     int minusindex= 4-(int)index;
     int coeff_count;
     x86_reg last=0;
@@ -114,8 +116,9 @@ static int decode_significance_8x8_x86(CABACContext *c,
 
         "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
         "add %5, %%"REG_D"                      \n\t"
+        "add %7, %%"REG_D"                      \n\t"
 
-        BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx",
+        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx",
                              "%%bx", "%%esi", "%%eax", "%%al")
 
         "mov %2, %%"REG_a"                      \n\t"
@@ -142,7 +145,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "movl %%esi, "RANGE    "(%3)            \n\t"
         "movl %%ebx, "LOW      "(%3)            \n\t"
         :"=&a"(coeff_count),"+m"(last), "+m"(index)
-        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)
+        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_off)
         : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"
     );
     return coeff_count;
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index f90f41c4bc..4788da98e0 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -32,14 +32,18 @@
 SECTION_RODATA
 
 ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
-scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
-           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
-           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
-           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
-           db 1+1*8, 2+1*8
-           db 1+2*8, 2+2*8
-           db 1+4*8, 2+4*8
-           db 1+5*8, 2+5*8
+scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
+           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
+           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
+           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
+           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
+           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
+           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
+           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
+           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
+           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
+           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
+           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
 %ifdef PIC
 %define scan8 r11
 %else
@@ -617,6 +621,8 @@ cglobal h264_idct_add8_8_mmx, 5, 7, 0
     mov         r10, r0
 %endif
     call         h264_idct_add8_mmx_plane
+    mov          r5, 32
+    add          r2, 384
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
@@ -678,6 +684,8 @@ cglobal h264_idct_add8_8_mmx2, 5, 7, 0
     lea         r11, [scan8_mem]
 %endif
     call h264_idct_add8_mmx2_plane
+    mov          r5, 32
+    add          r2, 384
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
@@ -810,12 +818,12 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
     test        r0, r0
     jz .try%1dc
 %ifdef ARCH_X86_64
-    mov        r0d, dword [r1+%1*8+64]
+    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
     add         r0, [r10]
 %else
     mov         r0, r0m
     mov         r0, [r0]
-    add         r0, dword [r1+%1*8+64]
+    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
     call        x264_add8x4_idct_sse2
     jmp .cycle%1end
@@ -824,16 +832,18 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
     or         r0w, word [r2+32]
     jz .cycle%1end
 %ifdef ARCH_X86_64
-    mov        r0d, dword [r1+%1*8+64]
+    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
     add         r0, [r10]
 %else
     mov         r0, r0m
     mov         r0, [r0]
-    add         r0, dword [r1+%1*8+64]
+    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
     call        h264_idct_dc_add8_mmx2
 .cycle%1end
-%if %1 < 3
+%if %1 == 1
+    add         r2, 384+64
+%elif %1 < 3
     add         r2, 64
 %endif
 %endmacro
@@ -845,15 +855,15 @@ cglobal h264_idct_add8_8_sse2, 5, 7, 8
 %ifdef ARCH_X86_64
     mov         r10, r0
 %endif
-    add8_sse2_cycle 0, 0x09
-    add8_sse2_cycle 1, 0x11
+    add8_sse2_cycle 0, 0x34
+    add8_sse2_cycle 1, 0x3c
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
     add        r0mp, gprsize
 %endif
-    add8_sse2_cycle 2, 0x21
-    add8_sse2_cycle 3, 0x29
+    add8_sse2_cycle 2, 0x5c
+    add8_sse2_cycle 3, 0x64
     RET
 
 ;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 3f7cf4cefc..54636a95d0 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -29,14 +29,18 @@ SECTION_RODATA
 
 pw_pixel_max: times 8 dw ((1 << 10)-1)
 pd_32:        times 4 dd 32
-scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
-           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
-           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
-           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
-           db 1+1*8, 2+1*8
-           db 1+2*8, 2+2*8
-           db 1+4*8, 2+4*8
-           db 1+5*8, 2+5*8
+scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
+           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
+           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
+           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
+           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
+           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
+           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
+           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
+           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
+           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
+           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
+           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
 
 %ifdef PIC
 %define scan8 r11
@@ -306,7 +310,7 @@ INIT_AVX
 IDCT_ADD16INTRA_10 avx
 %endif
 
-%assign last_block 24
+%assign last_block 36
 ;-----------------------------------------------------------------------------
 ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
 ;-----------------------------------------------------------------------------
@@ -317,21 +321,22 @@ cglobal h264_idct_add8_10_%1,5,7
 %endif
     add      r2, 1024
     mov      r0, [r0]
-    ADD16_OP_INTRA %1, 16, 1+1*8
-    ADD16_OP_INTRA %1, 18, 1+2*8
+    ADD16_OP_INTRA %1, 16, 4+ 6*8
+    ADD16_OP_INTRA %1, 18, 4+ 7*8
+    add      r2, 1024-128*2
 %ifdef ARCH_X86_64
     mov      r0, [r10+gprsize]
 %else
     mov      r0, r0m
     mov      r0, [r0+gprsize]
 %endif
-    ADD16_OP_INTRA %1, 20, 1+4*8
-    ADD16_OP_INTRA %1, 22, 1+5*8
+    ADD16_OP_INTRA %1, 32, 4+11*8
+    ADD16_OP_INTRA %1, 34, 4+12*8
     REP_RET
     AC %1, 16
     AC %1, 18
-    AC %1, 20
-    AC %1, 22
+    AC %1, 32
+    AC %1, 34
 
 %endmacro ; IDCT_ADD8
 

From c177cfb4fb2430f4d43d27ba3c3476176f17d006 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Thu, 9 Jun 2011 16:17:41 -0700
Subject: [PATCH 17/24] H.264: fix CODEC_FLAG_GRAY

It was broken in 4:4:4, and still did chroma deblocking for no reason in 4:2:0.
---
 libavcodec/h264.c            | 51 ++++++++++---------
 libavcodec/h264_loopfilter.c | 98 ++++++++++++++++++++----------------
 2 files changed, 84 insertions(+), 65 deletions(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 86ea218807..78ca4141a4 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -484,6 +484,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
     }
 
+    if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
+
     if(chroma444){
         src_cb = pic->data[1] + offset;
         if(emu){
@@ -509,8 +511,6 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
         return;
     }
 
-    if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
-
     if(MB_FIELD){
         // chroma offset when predicting from a field of opposite parity
         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
@@ -1847,24 +1847,28 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                 for (j = 0; j < 16; j++)
                     tmp_y[j] = get_bits(&gb, bit_depth);
             }
-            for (i = 0; i < 8; i++) {
-                uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
-                for (j = 0; j < 8; j++)
-                    tmp_cb[j] = get_bits(&gb, bit_depth);
-            }
-            for (i = 0; i < 8; i++) {
-                uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
-                for (j = 0; j < 8; j++)
-                    tmp_cr[j] = get_bits(&gb, bit_depth);
+            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
+                for (i = 0; i < 8; i++) {
+                    uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
+                    for (j = 0; j < 8; j++)
+                        tmp_cb[j] = get_bits(&gb, bit_depth);
+                }
+                for (i = 0; i < 8; i++) {
+                    uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
+                    for (j = 0; j < 8; j++)
+                        tmp_cr[j] = get_bits(&gb, bit_depth);
+                }
             }
         } else {
-        for (i=0; i<16; i++) {
-            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
-        }
-        for (i=0; i<8; i++) {
-            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
-            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
-        }
+            for (i=0; i<16; i++) {
+                memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
+            }
+            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
+                for (i=0; i<8; i++) {
+                    memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
+                    memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
+                }
+            }
         }
     } else {
         if(IS_INTRA(mb_type)){
@@ -1954,8 +1958,9 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
     int i, j, p;
     int *block_offset = &h->block_offset[0];
     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
+    const int plane_count = (simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) ? 3 : 1;
 
-    for (p = 0; p < 3; p++)
+    for (p = 0; p < plane_count; p++)
     {
         dest[p] = s->current_picture.data[p] + ((mb_x << pixel_shift) + mb_y * s->linesize) * 16;
         s->dsp.prefetch(dest[p] + (s->mb_x&3)*4*s->linesize + (64 << pixel_shift), s->linesize, 4);
@@ -1996,7 +2001,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
             GetBitContext gb;
             init_get_bits(&gb, (uint8_t*)h->mb, 768*bit_depth);
 
-            for (p = 0; p < 3; p++) {
+            for (p = 0; p < plane_count; p++) {
                 for (i = 0; i < 16; i++) {
                     uint16_t *tmp = (uint16_t*)(dest[p] + i*linesize);
                     for (j = 0; j < 16; j++)
@@ -2004,7 +2009,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
                 }
             }
         } else {
-            for (p = 0; p < 3; p++) {
+            for (p = 0; p < plane_count; p++) {
                 for (i = 0; i < 16; i++) {
                     memcpy(dest[p] + i*linesize, h->mb + p*128 + i*8, 16);
                 }
@@ -2015,7 +2020,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
             if(h->deblocking_filter)
                 xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, linesize, 1, 1, simple, pixel_shift);
 
-            for (p = 0; p < 3; p++)
+            for (p = 0; p < plane_count; p++)
                 hl_decode_mb_predict_luma(h, mb_type, 1, simple, transform_bypass, pixel_shift, block_offset, linesize, dest[p], p);
 
             if(h->deblocking_filter)
@@ -2035,7 +2040,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
                             h->h264dsp.biweight_h264_pixels_tab, 1);
         }
 
-        for (p = 0; p < 3; p++)
+        for (p = 0; p < plane_count; p++)
             hl_decode_mb_idct_luma(h, mb_type, 1, simple, transform_bypass, pixel_shift, block_offset, linesize, dest[p], p);
     }
     if(h->cbp || IS_INTRA(mb_type))
diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c
index d4ecefcf08..1ae534ec96 100644
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@@ -217,6 +217,7 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
     int mb_xy;
     int mb_type, left_type;
     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
+    int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
 
     mb_xy = h->mb_xy;
 
@@ -262,16 +263,18 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
             filter_mb_edgeh( &img_y[4*2*linesize], linesize, bS3, qp, h);
             filter_mb_edgeh( &img_y[4*3*linesize], linesize, bS3, qp, h);
         }
-        if(left_type){
-            filter_mb_edgecv( &img_cb[2*0], uvlinesize, bS4, qpc0, h);
-            filter_mb_edgecv( &img_cr[2*0], uvlinesize, bS4, qpc0, h);
+        if(chroma){
+            if(left_type){
+                filter_mb_edgecv( &img_cb[2*0], uvlinesize, bS4, qpc0, h);
+                filter_mb_edgecv( &img_cr[2*0], uvlinesize, bS4, qpc0, h);
+            }
+            filter_mb_edgecv( &img_cb[2*2], uvlinesize, bS3, qpc, h);
+            filter_mb_edgecv( &img_cr[2*2], uvlinesize, bS3, qpc, h);
+            filter_mb_edgech( &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
+            filter_mb_edgech( &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
+            filter_mb_edgech( &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
+            filter_mb_edgech( &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
         }
-        filter_mb_edgecv( &img_cb[2*2], uvlinesize, bS3, qpc, h);
-        filter_mb_edgecv( &img_cr[2*2], uvlinesize, bS3, qpc, h);
-        filter_mb_edgech( &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
-        filter_mb_edgech( &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
-        filter_mb_edgech( &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
-        filter_mb_edgech( &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
         return;
     } else {
         LOCAL_ALIGNED_8(int16_t, bS, [2], [4][4]);
@@ -298,7 +301,7 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
 #define FILTER(hv,dir,edge)\
         if(AV_RN64A(bS[dir][edge])) {                                   \
             filter_mb_edge##hv( &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir, h );\
-            if(!(edge&1)) {\
+            if(chroma && !(edge&1)) {\
                 filter_mb_edgec##hv( &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir, h );\
                 filter_mb_edgec##hv( &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir, h );\
             }\
@@ -353,7 +356,7 @@ static int check_mv(H264Context *h, long b_idx, long bn_idx, int mvy_limit){
     return v;
 }
 
-static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int chroma444, int dir) {
+static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int chroma, int chroma444, int dir) {
     MpegEncContext * const s = &h->s;
     int edge;
     int chroma_qp_avg[2];
@@ -410,12 +413,14 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                 filter_mb_edgeh( &img_y[j*linesize], tmp_linesize, bS, qp, h );
                 chroma_qp_avg[0] = (h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
                 chroma_qp_avg[1] = (h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                if (chroma444) {
-                    filter_mb_edgeh (&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
-                    filter_mb_edgeh (&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
-                } else {
-                    filter_mb_edgech(&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
-                    filter_mb_edgech(&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
+                if (chroma) {
+                    if (chroma444) {
+                        filter_mb_edgeh (&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
+                        filter_mb_edgeh (&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
+                    } else {
+                        filter_mb_edgech(&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
+                        filter_mb_edgech(&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
+                    }
                 }
             }
         }else{
@@ -475,7 +480,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                 chroma_qp_avg[1] = (h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
                 if( dir == 0 ) {
                     filter_mb_edgev( &img_y[0], linesize, bS, qp, h );
-                    {
+                    if (chroma) {
                         if (chroma444) {
                             filter_mb_edgev ( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
                             filter_mb_edgev ( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
@@ -486,7 +491,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                     }
                 } else {
                     filter_mb_edgeh( &img_y[0], linesize, bS, qp, h );
-                    {
+                    if (chroma) {
                         if (chroma444) {
                             filter_mb_edgeh ( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
                             filter_mb_edgeh ( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
@@ -557,21 +562,25 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
         //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
         if( dir == 0 ) {
             filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, h );
-            if (chroma444) {
-                filter_mb_edgev ( &img_cb[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
-                filter_mb_edgev ( &img_cr[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
-            } else if( (edge&1) == 0 ) {
-                filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
-                filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
+            if (chroma) {
+                if (chroma444) {
+                    filter_mb_edgev ( &img_cb[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
+                    filter_mb_edgev ( &img_cr[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
+                } else if( (edge&1) == 0 ) {
+                    filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
+                    filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
+                }
             }
         } else {
             filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h );
-            if (chroma444) {
-                filter_mb_edgeh ( &img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
-                filter_mb_edgeh ( &img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
-            } else if( (edge&1) == 0 ) {
-                filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
-                filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
+            if (chroma) {
+                if (chroma444) {
+                    filter_mb_edgeh ( &img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
+                    filter_mb_edgeh ( &img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
+                } else if( (edge&1) == 0 ) {
+                    filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
+                    filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
+                }
             }
         }
     }
@@ -584,6 +593,7 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
     int first_vertical_edge_done = 0;
     av_unused int dir;
+    int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
 
     if (FRAME_MBAFF
             // and current and left pair do not have the same interlaced type
@@ -652,25 +662,29 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
         if(MB_FIELD){
             filter_mb_mbaff_edgev ( h, img_y                ,   linesize, bS  , 1, qp [0] );
             filter_mb_mbaff_edgev ( h, img_y  + 8*  linesize,   linesize, bS+4, 1, qp [1] );
-            filter_mb_mbaff_edgecv( h, img_cb,                uvlinesize, bS  , 1, bqp[0] );
-            filter_mb_mbaff_edgecv( h, img_cb + 4*uvlinesize, uvlinesize, bS+4, 1, bqp[1] );
-            filter_mb_mbaff_edgecv( h, img_cr,                uvlinesize, bS  , 1, rqp[0] );
-            filter_mb_mbaff_edgecv( h, img_cr + 4*uvlinesize, uvlinesize, bS+4, 1, rqp[1] );
+            if (chroma){
+                filter_mb_mbaff_edgecv( h, img_cb,                uvlinesize, bS  , 1, bqp[0] );
+                filter_mb_mbaff_edgecv( h, img_cb + 4*uvlinesize, uvlinesize, bS+4, 1, bqp[1] );
+                filter_mb_mbaff_edgecv( h, img_cr,                uvlinesize, bS  , 1, rqp[0] );
+                filter_mb_mbaff_edgecv( h, img_cr + 4*uvlinesize, uvlinesize, bS+4, 1, rqp[1] );
+            }
         }else{
             filter_mb_mbaff_edgev ( h, img_y              , 2*  linesize, bS  , 2, qp [0] );
             filter_mb_mbaff_edgev ( h, img_y  +   linesize, 2*  linesize, bS+1, 2, qp [1] );
-            filter_mb_mbaff_edgecv( h, img_cb,              2*uvlinesize, bS  , 2, bqp[0] );
-            filter_mb_mbaff_edgecv( h, img_cb + uvlinesize, 2*uvlinesize, bS+1, 2, bqp[1] );
-            filter_mb_mbaff_edgecv( h, img_cr,              2*uvlinesize, bS  , 2, rqp[0] );
-            filter_mb_mbaff_edgecv( h, img_cr + uvlinesize, 2*uvlinesize, bS+1, 2, rqp[1] );
+            if (chroma){
+                filter_mb_mbaff_edgecv( h, img_cb,              2*uvlinesize, bS  , 2, bqp[0] );
+                filter_mb_mbaff_edgecv( h, img_cb + uvlinesize, 2*uvlinesize, bS+1, 2, bqp[1] );
+                filter_mb_mbaff_edgecv( h, img_cr,              2*uvlinesize, bS  , 2, rqp[0] );
+                filter_mb_mbaff_edgecv( h, img_cr + uvlinesize, 2*uvlinesize, bS+1, 2, rqp[1] );
+            }
         }
     }
 
 #if CONFIG_SMALL
     for( dir = 0; dir < 2; dir++ )
-        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, CHROMA444, dir);
+        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, chroma, CHROMA444, dir);
 #else
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, CHROMA444, 0);
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, CHROMA444, 1);
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, chroma, CHROMA444, 0);
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, chroma, CHROMA444, 1);
 #endif
 }

From 295f0a2503550088a5ffddc5754b9fba2fa6ee60 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Mon, 13 Jun 2011 10:21:46 -0700
Subject: [PATCH 18/24] Fix SVQ3 after adding 4:4:4 H.264 support

---
 libavcodec/svq3.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index 7cde5e5552..23ab209312 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -633,8 +633,9 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
         memset(h->intra4x4_pred_mode+h->mb2br_xy[mb_xy], DC_PRED, 8);
     }
     if (!IS_SKIP(mb_type) || s->pict_type == AV_PICTURE_TYPE_B) {
-        memset(h->non_zero_count_cache + 8, 0, 4*9*sizeof(uint8_t));
-        s->dsp.clear_blocks(h->mb);
+        memset(h->non_zero_count_cache + 8, 0, 14*8*sizeof(uint8_t));
+        s->dsp.clear_blocks(h->mb+  0);
+        s->dsp.clear_blocks(h->mb+384);
     }
 
     if (!IS_INTRA16x16(mb_type) && (!IS_SKIP(mb_type) || s->pict_type == AV_PICTURE_TYPE_B)) {
@@ -654,8 +655,8 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
         }
     }
     if (IS_INTRA16x16(mb_type)) {
-        AV_ZERO128(h->mb_luma_dc+0);
-        AV_ZERO128(h->mb_luma_dc+8);
+        AV_ZERO128(h->mb_luma_dc[0]+0);
+        AV_ZERO128(h->mb_luma_dc[0]+8);
         if (svq3_decode_block(&s->gb, h->mb_luma_dc, 0, 1)){
             av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding intra luma dc\n");
             return -1;
@@ -681,20 +682,23 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
         }
 
         if ((cbp & 0x30)) {
-            for (i = 0; i < 2; ++i) {
-              if (svq3_decode_block(&s->gb, &h->mb[16*(16 + 4*i)], 0, 3)){
+            for (i = 1; i < 3; ++i) {
+              if (svq3_decode_block(&s->gb, &h->mb[16*16*i], 0, 3)){
                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n");
                 return -1;
               }
             }
 
             if ((cbp & 0x20)) {
-                for (i = 0; i < 8; i++) {
-                    h->non_zero_count_cache[ scan8[16+i] ] = 1;
+                for (i = 1; i < 3; i++) {
+                    for (j = 0; j < 4; j++) {
+                        k = 16*i + j;
+                        h->non_zero_count_cache[ scan8[k] ] = 1;
 
-                    if (svq3_decode_block(&s->gb, &h->mb[16*(16 + i)], 1, 1)){
-                        av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
-                        return -1;
+                        if (svq3_decode_block(&s->gb, &h->mb[16*k], 1, 1)){
+                            av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
+                            return -1;
+                        }
                     }
                 }
             }

From 504811baeacf8bac400962e84fca678b79068ceb Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Mon, 13 Jun 2011 13:38:46 -0700
Subject: [PATCH 19/24] Roll back 4:4:4 H.264 for now Needs some ARM/PPC asm
 modifications.

---
 libavcodec/dsputil.h               |   2 +-
 libavcodec/dsputil_template.c      |   6 +-
 libavcodec/h264.c                  | 797 ++++++++++------------------
 libavcodec/h264.h                  | 181 +++----
 libavcodec/h264_cabac.c            | 819 ++++-------------------------
 libavcodec/h264_cavlc.c            | 198 +++----
 libavcodec/h264_loopfilter.c       | 132 ++---
 libavcodec/h264_ps.c               |  14 +-
 libavcodec/h264dsp.h               |   8 +-
 libavcodec/h264idct_template.c     |  44 +-
 libavcodec/mpegvideo.c             |  25 +-
 libavcodec/snow.c                  |   6 +-
 libavcodec/svq3.c                  |  26 +-
 libavcodec/x86/dsputil_mmx.c       |   4 +-
 libavcodec/x86/h264_i386.h         |  15 +-
 libavcodec/x86/h264_idct.asm       |  44 +-
 libavcodec/x86/h264_idct_10bit.asm |  35 +-
 17 files changed, 687 insertions(+), 1669 deletions(-)

diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 7a28b06fd5..cfc574aebb 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -507,7 +507,7 @@ typedef struct DSPContext {
 #define BASIS_SHIFT 16
 #define RECON_SHIFT 6
 
-    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides);
+    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int sides);
 #define EDGE_WIDTH 16
 #define EDGE_TOP    1
 #define EDGE_BOTTOM 2
diff --git a/libavcodec/dsputil_template.c b/libavcodec/dsputil_template.c
index b85931856a..8ca6d3e414 100644
--- a/libavcodec/dsputil_template.c
+++ b/libavcodec/dsputil_template.c
@@ -79,7 +79,7 @@ static inline void FUNC(copy_block16)(uint8_t *dst, const uint8_t *src, int dstS
 
 /* draw the edges of width 'w' of an image of size width, height */
 //FIXME check that this is ok for mpeg4 interlaced
-static void FUNCC(draw_edges)(uint8_t *_buf, int _wrap, int width, int height, int w, int h, int sides)
+static void FUNCC(draw_edges)(uint8_t *_buf, int _wrap, int width, int height, int w, int sides)
 {
     pixel *buf = (pixel*)_buf;
     int wrap = _wrap / sizeof(pixel);
@@ -106,10 +106,10 @@ static void FUNCC(draw_edges)(uint8_t *_buf, int _wrap, int width, int height, i
     buf -= w;
     last_line = buf + (height - 1) * wrap;
     if (sides & EDGE_TOP)
-        for(i = 0; i < h; i++)
+        for(i = 0; i < w; i++)
             memcpy(buf - (i + 1) * wrap, buf, (width + w + w) * sizeof(pixel)); // top
     if (sides & EDGE_BOTTOM)
-        for (i = 0; i < h; i++)
+        for (i = 0; i < w; i++)
             memcpy(last_line + (i + 1) * wrap, last_line, (width + w + w) * sizeof(pixel)); // bottom
 }
 
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 78ca4141a4..276d6e6d6c 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -451,13 +451,12 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int src_x_offset, int src_y_offset,
                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
-                           int pixel_shift, int chroma444){
+                           int pixel_shift){
     MpegEncContext * const s = &h->s;
     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
     const int luma_xy= (mx&3) + ((my&3)<<2);
-    int offset = ((mx>>2) << pixel_shift) + (my>>2)*h->mb_linesize;
-    uint8_t * src_y = pic->data[0] + offset;
+    uint8_t * src_y = pic->data[0] + ((mx>>2) << pixel_shift) + (my>>2)*h->mb_linesize;
     uint8_t * src_cb, * src_cr;
     int extra_width= h->emu_edge_width;
     int extra_height= h->emu_edge_height;
@@ -486,31 +485,6 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
 
     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
 
-    if(chroma444){
-        src_cb = pic->data[1] + offset;
-        if(emu){
-            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
-                                    16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
-            src_cb= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
-        }
-        qpix_op[luma_xy](dest_cb, src_cb, h->mb_linesize); //FIXME try variable height perhaps?
-        if(!square){
-            qpix_op[luma_xy](dest_cb + delta, src_cb + delta, h->mb_linesize);
-        }
-
-        src_cr = pic->data[2] + offset;
-        if(emu){
-            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
-                                    16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
-            src_cr= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
-        }
-        qpix_op[luma_xy](dest_cr, src_cr, h->mb_linesize); //FIXME try variable height perhaps?
-        if(!square){
-            qpix_op[luma_xy](dest_cr + delta, src_cr + delta, h->mb_linesize);
-        }
-        return;
-    }
-
     if(MB_FIELD){
         // chroma offset when predicting from a field of opposite parity
         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
@@ -537,19 +511,14 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+                           int list0, int list1, int pixel_shift){
     MpegEncContext * const s = &h->s;
     qpel_mc_func *qpix_op=  qpix_put;
     h264_chroma_mc_func chroma_op= chroma_put;
 
-    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    if(chroma444){
-        dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-        dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    }else{
-        dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
-        dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
-    }
+    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->  mb_linesize;
+    dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
+    dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
     x_offset += 8*s->mb_x;
     y_offset += 8*(s->mb_y >> MB_FIELD);
 
@@ -557,7 +526,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                           qpix_op, chroma_op, pixel_shift, chroma444);
+                           qpix_op, chroma_op, pixel_shift);
 
         qpix_op=  qpix_avg;
         chroma_op= chroma_avg;
@@ -567,7 +536,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                           qpix_op, chroma_op, pixel_shift, chroma444);
+                           qpix_op, chroma_op, pixel_shift);
     }
 }
 
@@ -577,19 +546,12 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+                           int list0, int list1, int pixel_shift){
     MpegEncContext * const s = &h->s;
 
-    dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    if(chroma444){
-        chroma_weight_avg = luma_weight_avg;
-        chroma_weight_op = luma_weight_op;
-        dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-        dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    }else{
-        dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
-        dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
-    }
+    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->  mb_linesize;
+    dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
+    dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
     x_offset += 8*s->mb_x;
     y_offset += 8*(s->mb_y >> MB_FIELD);
 
@@ -597,17 +559,17 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         /* don't optimize for luma-only case, since B-frames usually
          * use implicit weights => chroma too. */
         uint8_t *tmp_cb = s->obmc_scratchpad;
-        uint8_t *tmp_cr = s->obmc_scratchpad + (16 << pixel_shift);
-        uint8_t *tmp_y  = s->obmc_scratchpad + 16*h->mb_uvlinesize;
+        uint8_t *tmp_cr = s->obmc_scratchpad + (8 << pixel_shift);
+        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
         int refn0 = h->ref_cache[0][ scan8[n] ];
         int refn1 = h->ref_cache[1][ scan8[n] ];
 
         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                     dest_y, dest_cb, dest_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
+                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift);
         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                     tmp_y, tmp_cb, tmp_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
+                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift);
 
         if(h->use_weight == 2){
             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
@@ -632,7 +594,7 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         Picture *ref= &h->ref_list[list][refn];
         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                    qpix_put, chroma_put, pixel_shift, chroma444);
+                    qpix_put, chroma_put, pixel_shift);
 
         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
@@ -651,21 +613,21 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+                           int list0, int list1, int pixel_shift){
     if((h->use_weight==2 && list0 && list1
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
        || h->use_weight==1)
         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                          x_offset, y_offset, qpix_put, chroma_put,
                          weight_op[0], weight_op[3], weight_avg[0],
-                         weight_avg[3], list0, list1, pixel_shift, chroma444);
+                         weight_avg[3], list0, list1, pixel_shift);
     else
         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
-                    chroma_avg, list0, list1, pixel_shift, chroma444);
+                    chroma_avg, list0, list1, pixel_shift);
 }
 
-static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){
+static inline void prefetch_motion(H264Context *h, int list, int pixel_shift){
     /* fetch pixels for estimated mv 4 macroblocks ahead
      * optimized for 64byte cache lines */
     MpegEncContext * const s = &h->s;
@@ -676,13 +638,8 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, in
         uint8_t **src= h->ref_list[list][refn].data;
         int off= (mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift);
         s->dsp.prefetch(src[0]+off, s->linesize, 4);
-        if(chroma444){
-            s->dsp.prefetch(src[1]+off, s->linesize, 4);
-            s->dsp.prefetch(src[2]+off, s->linesize, 4);
-        }else{
-            off= ((mx>>1) << pixel_shift) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + (64 << pixel_shift);
-            s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
-        }
+        off= ((mx>>1) << pixel_shift) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + (64 << pixel_shift);
+        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
     }
 }
 
@@ -690,7 +647,7 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                       h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                      int pixel_shift, int chroma444){
+                      int pixel_shift){
     MpegEncContext * const s = &h->s;
     const int mb_xy= h->mb_xy;
     const int mb_type= s->current_picture.mb_type[mb_xy];
@@ -699,36 +656,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
 
     if(HAVE_PTHREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
         await_references(h);
-    prefetch_motion(h, 0, pixel_shift, chroma444);
+    prefetch_motion(h, 0, pixel_shift);
 
     if(IS_16X16(mb_type)){
         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                 weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
+                pixel_shift);
     }else if(IS_16X8(mb_type)){
         mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                 &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
+                pixel_shift);
         mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                 &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma444);
+                pixel_shift);
     }else if(IS_8X16(mb_type)){
         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                 &weight_op[2], &weight_avg[2],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
+                pixel_shift);
         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                 &weight_op[2], &weight_avg[2],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma444);
+                pixel_shift);
     }else{
         int i;
 
@@ -745,29 +702,29 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                     &weight_op[3], &weight_avg[3],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift);
             }else if(IS_SUB_8X4(sub_mb_type)){
                 mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                     &weight_op[4], &weight_avg[4],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift);
                 mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                     &weight_op[4], &weight_avg[4],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift);
             }else if(IS_SUB_4X8(sub_mb_type)){
                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     &weight_op[5], &weight_avg[5],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift);
                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     &weight_op[5], &weight_avg[5],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift);
             }else{
                 int j;
                 assert(IS_SUB_4X4(sub_mb_type));
@@ -778,13 +735,13 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                         &weight_op[6], &weight_avg[6],
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma444);
+                        pixel_shift);
                 }
             }
         }
     }
 
-    prefetch_motion(h, 1, pixel_shift, chroma444);
+    prefetch_motion(h, 1, pixel_shift);
 }
 
 #define hl_motion_fn(sh, bits) \
@@ -796,11 +753,10 @@ static av_always_inline void hl_motion_ ## bits(H264Context *h, \
                                        qpel_mc_func (*qpix_avg)[16], \
                                        h264_chroma_mc_func (*chroma_avg), \
                                        h264_weight_func *weight_op, \
-                                       h264_biweight_func *weight_avg, \
-                                       int chroma444) \
+                                       h264_biweight_func *weight_avg) \
 { \
     hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, \
-              qpix_avg, chroma_avg, weight_op, weight_avg, sh, chroma444); \
+              qpix_avg, chroma_avg, weight_op, weight_avg, sh); \
 }
 hl_motion_fn(0, 8);
 hl_motion_fn(1, 16);
@@ -840,19 +796,16 @@ static void free_tables(H264Context *h, int free_rbsp){
 }
 
 static void init_dequant8_coeff_table(H264Context *h){
-    int i,j,q,x;
+    int i,q,x;
     const int max_qp = 51 + 6*(h->sps.bit_depth_luma-8);
+    h->dequant8_coeff[0] = h->dequant8_buffer[0];
+    h->dequant8_coeff[1] = h->dequant8_buffer[1];
 
-    for(i=0; i<6; i++ ){
-        h->dequant8_coeff[i] = h->dequant8_buffer[i];
-        for(j=0; j<i; j++){
-            if(!memcmp(h->pps.scaling_matrix8[j], h->pps.scaling_matrix8[i], 64*sizeof(uint8_t))){
-                h->dequant8_coeff[i] = h->dequant8_buffer[j];
-                break;
-            }
+    for(i=0; i<2; i++ ){
+        if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
+            h->dequant8_coeff[1] = h->dequant8_buffer[0];
+            break;
         }
-        if(j<i)
-            continue;
 
         for(q=0; q<max_qp+1; q++){
             int shift = div6[q];
@@ -900,7 +853,7 @@ static void init_dequant_tables(H264Context *h){
             for(x=0; x<16; x++)
                 h->dequant4_coeff[i][0][x] = 1<<6;
         if(h->pps.transform_8x8_mode)
-            for(i=0; i<6; i++)
+            for(i=0; i<2; i++)
                 for(x=0; x<64; x++)
                     h->dequant8_coeff[i][0][x] = 1<<6;
     }
@@ -915,7 +868,7 @@ int ff_h264_alloc_tables(H264Context *h){
 
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, row_mb_num * 8  * sizeof(uint8_t), fail)
 
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 48 * sizeof(uint8_t), fail)
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 32 * sizeof(uint8_t), fail)
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
 
@@ -977,8 +930,8 @@ static void clone_tables(H264Context *dst, H264Context *src, int i){
  * Allocate buffers which are not shared amongst multiple threads.
  */
 static int context_init(H264Context *h){
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * 16*3 * sizeof(uint8_t)*2, fail)
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * 16*3 * sizeof(uint8_t)*2, fail)
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t)*2, fail)
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t)*2, fail)
 
     h->ref_cache[0][scan8[5 ]+1] = h->ref_cache[0][scan8[7 ]+1] = h->ref_cache[0][scan8[13]+1] =
     h->ref_cache[1][scan8[5 ]+1] = h->ref_cache[1][scan8[7 ]+1] = h->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
@@ -1177,10 +1130,9 @@ static int decode_update_thread_context(AVCodecContext *dst, const AVCodecContex
 
         // frame_start may not be called for the next thread (if it's decoding a bottom field)
         // so this has to be allocated here
-        h->s.obmc_scratchpad = av_malloc(16*6*s->linesize);
+        h->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
 
         s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb+(24*16<<h->pixel_shift));
     }
 
     //extradata/NAL handling
@@ -1199,7 +1151,7 @@ static int decode_update_thread_context(AVCodecContext *dst, const AVCodecContex
     for(i=0; i<6; i++)
         h->dequant4_coeff[i] = h->dequant4_buffer[0] + (h1->dequant4_coeff[i] - h1->dequant4_buffer[0]);
 
-    for(i=0; i<6; i++)
+    for(i=0; i<2; i++)
         h->dequant8_coeff[i] = h->dequant8_buffer[0] + (h1->dequant8_coeff[i] - h1->dequant8_buffer[0]);
 
     h->dequant_coeff_pps = h1->dequant_coeff_pps;
@@ -1254,20 +1206,20 @@ int ff_h264_frame_start(H264Context *h){
 
     for(i=0; i<16; i++){
         h->block_offset[i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
-        h->block_offset[48+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[24+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
     }
-    for(i=0; i<16; i++){
+    for(i=0; i<4; i++){
         h->block_offset[16+i]=
-        h->block_offset[32+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
-        h->block_offset[48+16+i]=
-        h->block_offset[48+32+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[24+16+i]=
+        h->block_offset[24+20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
     }
 
     /* can't be in alloc_tables because linesize isn't known there.
      * FIXME: redo bipred weight to not require extra buffer? */
     for(i = 0; i < thread_count; i++)
         if(h->thread_context[i] && !h->thread_context[i]->s.obmc_scratchpad)
-            h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*6*s->linesize);
+            h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
 
     /* some macroblocks can be accessed before they're available in case of lost slices, mbaff or threading*/
     memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
@@ -1452,7 +1404,7 @@ static void decode_postinit(H264Context *h, int setup_finished){
         ff_thread_finish_setup(s->avctx);
 }
 
-static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int chroma444, int simple){
+static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
     MpegEncContext * const s = &h->s;
     uint8_t *top_border;
     int top_idx = 1;
@@ -1470,24 +1422,12 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
                 if (pixel_shift)
                     AV_COPY128(top_border+16, src_y+15*linesize+16);
                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-                    if(chroma444){
-                        if (pixel_shift){
-                            AV_COPY128(top_border+32, src_cb + 15*uvlinesize);
-                            AV_COPY128(top_border+48, src_cb + 15*uvlinesize+16);
-                            AV_COPY128(top_border+64, src_cr + 15*uvlinesize);
-                            AV_COPY128(top_border+80, src_cr + 15*uvlinesize+16);
-                        } else {
-                            AV_COPY128(top_border+16, src_cb + 15*uvlinesize);
-                            AV_COPY128(top_border+32, src_cr + 15*uvlinesize);
-                        }
+                    if (pixel_shift) {
+                        AV_COPY128(top_border+32, src_cb+7*uvlinesize);
+                        AV_COPY128(top_border+48, src_cr+7*uvlinesize);
                     } else {
-                        if (pixel_shift) {
-                            AV_COPY128(top_border+32, src_cb+7*uvlinesize);
-                            AV_COPY128(top_border+48, src_cr+7*uvlinesize);
-                        } else {
-                            AV_COPY64(top_border+16, src_cb+7*uvlinesize);
-                            AV_COPY64(top_border+24, src_cr+7*uvlinesize);
-                        }
+                    AV_COPY64(top_border+16, src_cb+7*uvlinesize);
+                    AV_COPY64(top_border+24, src_cr+7*uvlinesize);
                     }
                 }
             }
@@ -1505,24 +1445,12 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
         AV_COPY128(top_border+16, src_y+16*linesize+16);
 
     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-        if(chroma444){
-            if (pixel_shift){
-                AV_COPY128(top_border+32, src_cb + 16*linesize);
-                AV_COPY128(top_border+48, src_cb + 16*linesize+16);
-                AV_COPY128(top_border+64, src_cr + 16*linesize);
-                AV_COPY128(top_border+80, src_cr + 16*linesize+16);
-            } else {
-                AV_COPY128(top_border+16, src_cb + 16*linesize);
-                AV_COPY128(top_border+32, src_cr + 16*linesize);
-            }
+        if (pixel_shift) {
+            AV_COPY128(top_border+32, src_cb+8*uvlinesize);
+            AV_COPY128(top_border+48, src_cr+8*uvlinesize);
         } else {
-            if (pixel_shift) {
-                AV_COPY128(top_border+32, src_cb+8*uvlinesize);
-                AV_COPY128(top_border+48, src_cr+8*uvlinesize);
-            } else {
-                AV_COPY64(top_border+16, src_cb+8*uvlinesize);
-                AV_COPY64(top_border+24, src_cr+8*uvlinesize);
-            }
+        AV_COPY64(top_border+16, src_cb+8*uvlinesize);
+        AV_COPY64(top_border+24, src_cr+8*uvlinesize);
         }
     }
 }
@@ -1530,8 +1458,7 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y,
                                   uint8_t *src_cb, uint8_t *src_cr,
                                   int linesize, int uvlinesize,
-                                  int xchg, int chroma444,
-                                  int simple, int pixel_shift){
+                                  int xchg, int simple, int pixel_shift){
     MpegEncContext * const s = &h->s;
     int deblock_topleft;
     int deblock_top;
@@ -1586,28 +1513,13 @@ else      AV_COPY64(b,a);
         }
     }
     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-        if(chroma444){
+        if(deblock_top){
             if(deblock_topleft){
-                XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
-                XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
-            }
-            XCHG(top_border + (16 << pixel_shift), src_cb + (1 << pixel_shift), xchg);
-            XCHG(top_border + (24 << pixel_shift), src_cb + (9 << pixel_shift), 1);
-            XCHG(top_border + (32 << pixel_shift), src_cr + (1 << pixel_shift), xchg);
-            XCHG(top_border + (40 << pixel_shift), src_cr + (9 << pixel_shift), 1);
-            if(s->mb_x+1 < s->mb_width){
-                XCHG(h->top_borders[top_idx][s->mb_x+1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
-                XCHG(h->top_borders[top_idx][s->mb_x+1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
-            }
-        } else {
-            if(deblock_top){
-                if(deblock_topleft){
-                    XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
-                    XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
-                }
-                XCHG(top_border + (16 << pixel_shift), src_cb+1+pixel_shift, 1);
-                XCHG(top_border + (24 << pixel_shift), src_cr+1+pixel_shift, 1);
+                XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
+                XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
             }
+            XCHG(top_border + (16 << pixel_shift), src_cb+1+pixel_shift, 1);
+            XCHG(top_border + (24 << pixel_shift), src_cr+1+pixel_shift, 1);
         }
     }
 }
@@ -1626,159 +1538,6 @@ static av_always_inline void dctcoef_set(DCTELEM *mb, int high_bit_depth, int in
         AV_WN16A(mb + index, value);
 }
 
-static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, int mb_type, int is_h264, int simple, int transform_bypass,
-                                                       int pixel_shift, int *block_offset, int linesize, uint8_t *dest_y, int p)
-{
-    MpegEncContext * const s = &h->s;
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
-    int i;
-    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
-    block_offset += 16*p;
-    if(IS_INTRA4x4(mb_type)){
-        if(simple || !s->encoding){
-            if(IS_8x8DCT(mb_type)){
-                if(transform_bypass){
-                    idct_dc_add =
-                    idct_add    = s->dsp.add_pixels8;
-                }else{
-                    idct_dc_add = h->h264dsp.h264_idct8_dc_add;
-                    idct_add    = h->h264dsp.h264_idct8_add;
-                }
-                for(i=0; i<16; i+=4){
-                    uint8_t * const ptr= dest_y + block_offset[i];
-                    const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
-                    if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
-                        h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
-                    }else{
-                        const int nnz = h->non_zero_count_cache[ scan8[i+p*16] ];
-                        h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
-                                                    (h->topright_samples_available<<i)&0x4000, linesize);
-                        if(nnz){
-                            if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16+p*256))
-                                idct_dc_add(ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
-                            else
-                                idct_add   (ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
-                        }
-                    }
-                }
-            }else{
-                if(transform_bypass){
-                    idct_dc_add =
-                    idct_add    = s->dsp.add_pixels4;
-                }else{
-                    idct_dc_add = h->h264dsp.h264_idct_dc_add;
-                    idct_add    = h->h264dsp.h264_idct_add;
-                }
-                for(i=0; i<16; i++){
-                    uint8_t * const ptr= dest_y + block_offset[i];
-                    const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
-
-                    if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
-                        h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
-                    }else{
-                        uint8_t *topright;
-                        int nnz, tr;
-                        uint64_t tr_high;
-                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
-                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
-                            assert(mb_y || linesize <= block_offset[i]);
-                            if(!topright_avail){
-                                if (pixel_shift) {
-                                    tr_high= ((uint16_t*)ptr)[3 - linesize/2]*0x0001000100010001ULL;
-                                    topright= (uint8_t*) &tr_high;
-                                } else {
-                                    tr= ptr[3 - linesize]*0x01010101;
-                                    topright= (uint8_t*) &tr;
-                                }
-                            }else
-                                topright= ptr + (4 << pixel_shift) - linesize;
-                        }else
-                            topright= NULL;
-
-                        h->hpc.pred4x4[ dir ](ptr, topright, linesize);
-                        nnz = h->non_zero_count_cache[ scan8[i+p*16] ];
-                        if(nnz){
-                            if(is_h264){
-                                if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16+p*256))
-                                    idct_dc_add(ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
-                                else
-                                    idct_add   (ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
-                            }else
-                                ff_svq3_add_idct_c(ptr, h->mb + i*16+p*256, linesize, qscale, 0);
-                        }
-                    }
-                }
-            }
-        }
-    }else{
-        h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
-        if(is_h264){
-            if(h->non_zero_count_cache[ scan8[LUMA_DC_BLOCK_INDEX+p] ]){
-                if(!transform_bypass)
-                    h->h264dsp.h264_luma_dc_dequant_idct(h->mb+(p*256 << pixel_shift), h->mb_luma_dc[p], h->dequant4_coeff[p][qscale][0]);
-                else{
-                    static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
-                                                            8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
-                    for(i = 0; i < 16; i++)
-                        dctcoef_set(h->mb+p*256, pixel_shift, dc_mapping[i], dctcoef_get(h->mb_luma_dc[p], pixel_shift, i));
-                }
-            }
-        }else
-            ff_svq3_luma_dc_dequant_idct_c(h->mb+p*256, h->mb_luma_dc[p], qscale);
-    }
-}
-
-static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, int is_h264, int simple, int transform_bypass,
-                                                    int pixel_shift, int *block_offset, int linesize, uint8_t *dest_y, int p)
-{
-    MpegEncContext * const s = &h->s;
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    int i;
-    block_offset += 16*p;
-    if(!IS_INTRA4x4(mb_type)){
-        if(is_h264){
-            if(IS_INTRA16x16(mb_type)){
-                if(transform_bypass){
-                    if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
-                        h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize);
-                    }else{
-                        for(i=0; i<16; i++){
-                            if(h->non_zero_count_cache[ scan8[i+p*16] ] || dctcoef_get(h->mb, pixel_shift, i*16))
-                                s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16+p*256 << pixel_shift), linesize);
-                        }
-                    }
-                }else{
-                    h->h264dsp.h264_idct_add16intra(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
-                }
-            }else if(h->cbp&15){
-                if(transform_bypass){
-                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
-                    idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
-                    for(i=0; i<16; i+=di){
-                        if(h->non_zero_count_cache[ scan8[i+p*16] ]){
-                            idct_add(dest_y + block_offset[i], h->mb + (i*16+p*256 << pixel_shift), linesize);
-                        }
-                    }
-                }else{
-                    if(IS_8x8DCT(mb_type)){
-                        h->h264dsp.h264_idct8_add4(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
-                    }else{
-                        h->h264dsp.h264_idct_add16(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
-                    }
-                }
-            }
-        }else{
-            for(i=0; i<16; i++){
-                if(h->non_zero_count_cache[ scan8[i+p*16] ] || h->mb[i*16+p*256]){ //FIXME benchmark weird rule, & below
-                    uint8_t * const ptr= dest_y + block_offset[i];
-                    ff_svq3_add_idct_c(ptr, h->mb + i*16 + p*256, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
-                }
-            }
-        }
-    }
-}
-
 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){
     MpegEncContext * const s = &h->s;
     const int mb_x= s->mb_x;
@@ -1787,12 +1546,13 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     const int mb_type= s->current_picture.mb_type[mb_xy];
     uint8_t  *dest_y, *dest_cb, *dest_cr;
     int linesize, uvlinesize /*dct_offset*/;
-    int i, j;
+    int i;
     int *block_offset = &h->block_offset[0];
     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
     /* is_h264 should always be true if SVQ3 is disabled. */
     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
 
     dest_y  = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
     dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
@@ -1806,7 +1566,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     if (!simple && MB_FIELD) {
         linesize   = h->mb_linesize   = s->linesize * 2;
         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
-        block_offset = &h->block_offset[48];
+        block_offset = &h->block_offset[24];
         if(mb_y&1){ //FIXME move out of this function?
             dest_y -= s->linesize*15;
             dest_cb-= s->uvlinesize*7;
@@ -1847,93 +1607,216 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                 for (j = 0; j < 16; j++)
                     tmp_y[j] = get_bits(&gb, bit_depth);
             }
-            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-                for (i = 0; i < 8; i++) {
-                    uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
-                    for (j = 0; j < 8; j++)
-                        tmp_cb[j] = get_bits(&gb, bit_depth);
-                }
-                for (i = 0; i < 8; i++) {
-                    uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
-                    for (j = 0; j < 8; j++)
-                        tmp_cr[j] = get_bits(&gb, bit_depth);
-                }
+            for (i = 0; i < 8; i++) {
+                uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
+                for (j = 0; j < 8; j++)
+                    tmp_cb[j] = get_bits(&gb, bit_depth);
+            }
+            for (i = 0; i < 8; i++) {
+                uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
+                for (j = 0; j < 8; j++)
+                    tmp_cr[j] = get_bits(&gb, bit_depth);
             }
         } else {
-            for (i=0; i<16; i++) {
-                memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
-            }
-            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-                for (i=0; i<8; i++) {
-                    memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
-                    memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
-                }
-            }
+        for (i=0; i<16; i++) {
+            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
+        }
+        for (i=0; i<8; i++) {
+            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
+            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
+        }
         }
     } else {
         if(IS_INTRA(mb_type)){
             if(h->deblocking_filter)
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, 0, simple, pixel_shift);
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple, pixel_shift);
 
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
             }
 
-            hl_decode_mb_predict_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
+            if(IS_INTRA4x4(mb_type)){
+                if(simple || !s->encoding){
+                    if(IS_8x8DCT(mb_type)){
+                        if(transform_bypass){
+                            idct_dc_add =
+                            idct_add    = s->dsp.add_pixels8;
+                        }else{
+                            idct_dc_add = h->h264dsp.h264_idct8_dc_add;
+                            idct_add    = h->h264dsp.h264_idct8_add;
+                        }
+                        for(i=0; i<16; i+=4){
+                            uint8_t * const ptr= dest_y + block_offset[i];
+                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
+                                h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize);
+                            }else{
+                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
+                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
+                                                            (h->topright_samples_available<<i)&0x4000, linesize);
+                                if(nnz){
+                                    if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16))
+                                        idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize);
+                                    else
+                                        idct_add   (ptr, h->mb + (i*16 << pixel_shift), linesize);
+                                }
+                            }
+                        }
+                    }else{
+                        if(transform_bypass){
+                            idct_dc_add =
+                            idct_add    = s->dsp.add_pixels4;
+                        }else{
+                            idct_dc_add = h->h264dsp.h264_idct_dc_add;
+                            idct_add    = h->h264dsp.h264_idct_add;
+                        }
+                        for(i=0; i<16; i++){
+                            uint8_t * const ptr= dest_y + block_offset[i];
+                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
 
+                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
+                                h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize);
+                            }else{
+                                uint8_t *topright;
+                                int nnz, tr;
+                                uint64_t tr_high;
+                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
+                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
+                                    assert(mb_y || linesize <= block_offset[i]);
+                                    if(!topright_avail){
+                                        if (pixel_shift) {
+                                            tr_high= ((uint16_t*)ptr)[3 - linesize/2]*0x0001000100010001ULL;
+                                            topright= (uint8_t*) &tr_high;
+                                        } else {
+                                        tr= ptr[3 - linesize]*0x01010101;
+                                        topright= (uint8_t*) &tr;
+                                        }
+                                    }else
+                                        topright= ptr + (4 << pixel_shift) - linesize;
+                                }else
+                                    topright= NULL;
+
+                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
+                                nnz = h->non_zero_count_cache[ scan8[i] ];
+                                if(nnz){
+                                    if(is_h264){
+                                        if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16))
+                                            idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize);
+                                        else
+                                            idct_add   (ptr, h->mb + (i*16 << pixel_shift), linesize);
+                                    }else
+                                        ff_svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
+                                }
+                            }
+                        }
+                    }
+                }
+            }else{
+                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
+                if(is_h264){
+                    if(h->non_zero_count_cache[ scan8[LUMA_DC_BLOCK_INDEX] ]){
+                        if(!transform_bypass)
+                            h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
+                        else{
+                            static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
+                                                                    8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
+                            for(i = 0; i < 16; i++)
+                                dctcoef_set(h->mb, pixel_shift, dc_mapping[i], dctcoef_get(h->mb_luma_dc, pixel_shift, i));
+                        }
+                    }
+                }else
+                    ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
+            }
             if(h->deblocking_filter)
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift);
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple, pixel_shift);
         }else if(is_h264){
             if (pixel_shift) {
                 hl_motion_16(h, dest_y, dest_cb, dest_cr,
                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                              h->h264dsp.weight_h264_pixels_tab,
-                             h->h264dsp.biweight_h264_pixels_tab, 0);
+                             h->h264dsp.biweight_h264_pixels_tab);
             } else
                 hl_motion_8(h, dest_y, dest_cb, dest_cr,
                             s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                             s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                             h->h264dsp.weight_h264_pixels_tab,
-                            h->h264dsp.biweight_h264_pixels_tab, 0);
+                            h->h264dsp.biweight_h264_pixels_tab);
         }
 
-        hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
+
+        if(!IS_INTRA4x4(mb_type)){
+            if(is_h264){
+                if(IS_INTRA16x16(mb_type)){
+                    if(transform_bypass){
+                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
+                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
+                        }else{
+                            for(i=0; i<16; i++){
+                                if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize);
+                            }
+                        }
+                    }else{
+                         h->h264dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
+                    }
+                }else if(h->cbp&15){
+                    if(transform_bypass){
+                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
+                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
+                        for(i=0; i<16; i+=di){
+                            if(h->non_zero_count_cache[ scan8[i] ]){
+                                idct_add(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize);
+                            }
+                        }
+                    }else{
+                        if(IS_8x8DCT(mb_type)){
+                            h->h264dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
+                        }else{
+                            h->h264dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
+                        }
+                    }
+                }
+            }else{
+                for(i=0; i<16; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
+                        uint8_t * const ptr= dest_y + block_offset[i];
+                        ff_svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
+                    }
+                }
+            }
+        }
 
         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
             uint8_t *dest[2] = {dest_cb, dest_cr};
             if(transform_bypass){
                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16*1 << pixel_shift), uvlinesize);
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 32, h->mb + (16*16*2 << pixel_shift), uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16 << pixel_shift), uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + (20*16 << pixel_shift), uvlinesize);
                 }else{
                     idct_add = s->dsp.add_pixels4;
-                    for(j=1; j<3; j++){
-                        for(i=j*16; i<j*16+4; i++){
-                            if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
-                                idct_add   (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
-                        }
+                    for(i=16; i<16+8; i++){
+                        if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
                     }
                 }
             }else{
                 if(is_h264){
                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*1 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16 << pixel_shift)       , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*2 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + ((16*16+4*16) << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                     h->h264dsp.h264_idct_add8(dest, block_offset,
                                               h->mb, uvlinesize,
                                               h->non_zero_count_cache);
                 }else{
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16*1, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16*2, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
-                    for(j=1; j<3; j++){
-                        for(i=j*16; i<j*16+4; i++){
-                            if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                                uint8_t * const ptr= dest[j-1] + block_offset[i];
-                                ff_svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
-                            }
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16     , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
+                    for(i=16; i<16+8; i++){
+                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
+                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
+                            ff_svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
                         }
                     }
                 }
@@ -1941,113 +1824,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
         }
     }
     if(h->cbp || IS_INTRA(mb_type))
-    {
         s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb+(24*16<<pixel_shift));
-    }
-}
-
-static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simple, int pixel_shift){
-    MpegEncContext * const s = &h->s;
-    const int mb_x= s->mb_x;
-    const int mb_y= s->mb_y;
-    const int mb_xy= h->mb_xy;
-    const int mb_type= s->current_picture.mb_type[mb_xy];
-    uint8_t  *dest[3];
-    int linesize;
-    int i, j, p;
-    int *block_offset = &h->block_offset[0];
-    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
-    const int plane_count = (simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) ? 3 : 1;
-
-    for (p = 0; p < plane_count; p++)
-    {
-        dest[p] = s->current_picture.data[p] + ((mb_x << pixel_shift) + mb_y * s->linesize) * 16;
-        s->dsp.prefetch(dest[p] + (s->mb_x&3)*4*s->linesize + (64 << pixel_shift), s->linesize, 4);
-    }
-
-    h->list_counts[mb_xy]= h->list_count;
-
-    if (!simple && MB_FIELD) {
-        linesize   = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2;
-        block_offset = &h->block_offset[48];
-        if(mb_y&1) //FIXME move out of this function?
-            for (p = 0; p < 3; p++)
-                dest[p] -= s->linesize*15;
-        if(FRAME_MBAFF) {
-            int list;
-            for(list=0; list<h->list_count; list++){
-                if(!USES_LIST(mb_type, list))
-                    continue;
-                if(IS_16X16(mb_type)){
-                    int8_t *ref = &h->ref_cache[list][scan8[0]];
-                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
-                }else{
-                    for(i=0; i<16; i+=4){
-                        int ref = h->ref_cache[list][scan8[i]];
-                        if(ref >= 0)
-                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
-                    }
-                }
-            }
-        }
-    } else {
-        linesize   = h->mb_linesize = h->mb_uvlinesize = s->linesize;
-    }
-
-    if (!simple && IS_INTRA_PCM(mb_type)) {
-        if (pixel_shift) {
-            const int bit_depth = h->sps.bit_depth_luma;
-            GetBitContext gb;
-            init_get_bits(&gb, (uint8_t*)h->mb, 768*bit_depth);
-
-            for (p = 0; p < plane_count; p++) {
-                for (i = 0; i < 16; i++) {
-                    uint16_t *tmp = (uint16_t*)(dest[p] + i*linesize);
-                    for (j = 0; j < 16; j++)
-                        tmp[j] = get_bits(&gb, bit_depth);
-                }
-            }
-        } else {
-            for (p = 0; p < plane_count; p++) {
-                for (i = 0; i < 16; i++) {
-                    memcpy(dest[p] + i*linesize, h->mb + p*128 + i*8, 16);
-                }
-            }
-        }
-    } else {
-        if(IS_INTRA(mb_type)){
-            if(h->deblocking_filter)
-                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, linesize, 1, 1, simple, pixel_shift);
-
-            for (p = 0; p < plane_count; p++)
-                hl_decode_mb_predict_luma(h, mb_type, 1, simple, transform_bypass, pixel_shift, block_offset, linesize, dest[p], p);
-
-            if(h->deblocking_filter)
-                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, linesize, 0, 1, simple, pixel_shift);
-        }else{
-            if (pixel_shift) {
-                hl_motion_16(h, dest[0], dest[1], dest[2],
-                             s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                             s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                             h->h264dsp.weight_h264_pixels_tab,
-                             h->h264dsp.biweight_h264_pixels_tab, 1);
-            } else
-                hl_motion_8(h, dest[0], dest[1], dest[2],
-                            s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                            s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                            h->h264dsp.weight_h264_pixels_tab,
-                            h->h264dsp.biweight_h264_pixels_tab, 1);
-        }
-
-        for (p = 0; p < plane_count; p++)
-            hl_decode_mb_idct_luma(h, mb_type, 1, simple, transform_bypass, pixel_shift, block_offset, linesize, dest[p], p);
-    }
-    if(h->cbp || IS_INTRA(mb_type))
-    {
-        s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb+(24*16<<pixel_shift));
-    }
 }
 
 /**
@@ -2067,26 +1844,13 @@ static void av_noinline hl_decode_mb_complex(H264Context *h){
     hl_decode_mb_internal(h, 0, h->pixel_shift);
 }
 
-static void av_noinline hl_decode_mb_444_complex(H264Context *h){
-    hl_decode_mb_444_internal(h, 0, h->pixel_shift);
-}
-
-static void av_noinline hl_decode_mb_444_simple(H264Context *h){
-    hl_decode_mb_444_internal(h, 1, 0);
-}
-
 void ff_h264_hl_decode_mb(H264Context *h){
     MpegEncContext * const s = &h->s;
     const int mb_xy= h->mb_xy;
     const int mb_type= s->current_picture.mb_type[mb_xy];
     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
 
-    if (CHROMA444) {
-        if(is_complex || h->pixel_shift)
-            hl_decode_mb_444_complex(h);
-        else
-            hl_decode_mb_444_simple(h);
-    } else if (is_complex) {
+    if (is_complex) {
         hl_decode_mb_complex(h);
     } else if (h->pixel_shift) {
         hl_decode_mb_simple_16(h);
@@ -2102,7 +1866,7 @@ static int pred_weight_table(H264Context *h){
     h->use_weight= 0;
     h->use_weight_chroma= 0;
     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
-    if(h->sps.chroma_format_idc)
+    if(CHROMA)
         h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
     luma_def = 1<<h->luma_log2_weight_denom;
     chroma_def = 1<<h->chroma_log2_weight_denom;
@@ -2127,7 +1891,7 @@ static int pred_weight_table(H264Context *h){
                 h->luma_weight[i][list][1]= 0;
             }
 
-            if(h->sps.chroma_format_idc){
+            if(CHROMA){
                 chroma_weight_flag= get_bits1(&s->gb);
                 if(chroma_weight_flag){
                     int j;
@@ -2557,11 +2321,11 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
 
     h->b_stride=  s->mb_width*4;
 
-    s->width = 16*s->mb_width - (2>>CHROMA444)*FFMIN(h->sps.crop_right, (8<<CHROMA444)-1);
+    s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
     if(h->sps.frame_mbs_only_flag)
-        s->height= 16*s->mb_height - (2>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1);
+        s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
     else
-        s->height= 16*s->mb_height - (4>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1);
+        s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 7);
 
     if (s->context_initialized
         && (   s->width != s->avctx->width || s->height != s->avctx->height
@@ -2606,22 +2370,18 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
 
         switch (h->sps.bit_depth_luma) {
             case 9 :
-                s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P9 : PIX_FMT_YUV420P9;
+                s->avctx->pix_fmt = PIX_FMT_YUV420P9;
                 break;
             case 10 :
-                s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P10 : PIX_FMT_YUV420P10;
+                s->avctx->pix_fmt = PIX_FMT_YUV420P10;
                 break;
             default:
-                if (CHROMA444){
-                    s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ444P : PIX_FMT_YUV444P;
-                }else{
-                    s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
-                                                             s->avctx->codec->pix_fmts ?
-                                                             s->avctx->codec->pix_fmts :
-                                                             s->avctx->color_range == AVCOL_RANGE_JPEG ?
-                                                             hwaccel_pixfmt_list_h264_jpeg_420 :
-                                                             ff_hwaccel_pixfmt_list_420);
-                }
+        s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
+                                                 s->avctx->codec->pix_fmts ?
+                                                 s->avctx->codec->pix_fmts :
+                                                 s->avctx->color_range == AVCOL_RANGE_JPEG ?
+                                                 hwaccel_pixfmt_list_h264_jpeg_420 :
+                                                 ff_hwaccel_pixfmt_list_420);
         }
 
         s->avctx->hwaccel = ff_find_hwaccel(s->avctx->codec->id, s->avctx->pix_fmt);
@@ -3113,10 +2873,11 @@ static int fill_filter_caches(H264Context *h, int mb_type){
     if(IS_INTRA(mb_type))
         return 0;
 
-    AV_COPY32(&h->non_zero_count_cache[4+8* 1], &h->non_zero_count[mb_xy][ 0]);
-    AV_COPY32(&h->non_zero_count_cache[4+8* 2], &h->non_zero_count[mb_xy][ 4]);
-    AV_COPY32(&h->non_zero_count_cache[4+8* 3], &h->non_zero_count[mb_xy][ 8]);
-    AV_COPY32(&h->non_zero_count_cache[4+8* 4], &h->non_zero_count[mb_xy][12]);
+    AV_COPY64(&h->non_zero_count_cache[0+8*1], &h->non_zero_count[mb_xy][ 0]);
+    AV_COPY64(&h->non_zero_count_cache[0+8*2], &h->non_zero_count[mb_xy][ 8]);
+    AV_COPY32(&h->non_zero_count_cache[0+8*5], &h->non_zero_count[mb_xy][16]);
+    AV_COPY32(&h->non_zero_count_cache[4+8*3], &h->non_zero_count[mb_xy][20]);
+    AV_COPY64(&h->non_zero_count_cache[0+8*4], &h->non_zero_count[mb_xy][24]);
 
     h->cbp= h->cbp_table[mb_xy];
 
@@ -3168,45 +2929,45 @@ static int fill_filter_caches(H264Context *h, int mb_type){
 */
 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
     if(top_type){
-        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][3*4]);
+        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]);
     }
 
     if(left_type[0]){
-        h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][3+0*4];
-        h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][3+1*4];
-        h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[0]][3+2*4];
-        h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[0]][3+3*4];
+        h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][7+0*8];
+        h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][7+1*8];
+        h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[0]][7+2*8];
+        h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[0]][7+3*8];
     }
 
     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
     if(!CABAC && h->pps.transform_8x8_mode){
         if(IS_8x8DCT(top_type)){
             h->non_zero_count_cache[4+8*0]=
-            h->non_zero_count_cache[5+8*0]= (h->cbp_table[top_xy] & 0x4000) >> 12;
+            h->non_zero_count_cache[5+8*0]= h->cbp_table[top_xy] & 4;
             h->non_zero_count_cache[6+8*0]=
-            h->non_zero_count_cache[7+8*0]= (h->cbp_table[top_xy] & 0x8000) >> 12;
+            h->non_zero_count_cache[7+8*0]= h->cbp_table[top_xy] & 8;
         }
         if(IS_8x8DCT(left_type[0])){
             h->non_zero_count_cache[3+8*1]=
-            h->non_zero_count_cache[3+8*2]= (h->cbp_table[left_xy[0]]&0x2000) >> 12; //FIXME check MBAFF
+            h->non_zero_count_cache[3+8*2]= h->cbp_table[left_xy[0]]&2; //FIXME check MBAFF
         }
         if(IS_8x8DCT(left_type[1])){
             h->non_zero_count_cache[3+8*3]=
-            h->non_zero_count_cache[3+8*4]= (h->cbp_table[left_xy[1]]&0x8000) >> 12; //FIXME check MBAFF
+            h->non_zero_count_cache[3+8*4]= h->cbp_table[left_xy[1]]&8; //FIXME check MBAFF
         }
 
         if(IS_8x8DCT(mb_type)){
             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
-            h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= (h->cbp & 0x1000) >> 12;
+            h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
 
             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
-            h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= (h->cbp & 0x2000) >> 12;
+            h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
 
             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
-            h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= (h->cbp & 0x4000) >> 12;
+            h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
 
             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
-            h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= (h->cbp & 0x8000) >> 12;
+            h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
         }
     }
 
@@ -3280,8 +3041,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                 s->mb_x= mb_x;
                 s->mb_y= mb_y;
                 dest_y  = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
-                dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444);
-                dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444);
+                dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
+                dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
                     //FIXME simplify above
 
                 if (MB_FIELD) {
@@ -3296,7 +3057,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                     linesize   = h->mb_linesize   = s->linesize;
                     uvlinesize = h->mb_uvlinesize = s->uvlinesize;
                 }
-                backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, CHROMA444, 0);
+                backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
                 if(fill_filter_caches(h, mb_type))
                     continue;
                 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index 3abf895010..8c4f1ab21a 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -39,6 +39,9 @@
 #define interlaced_dct interlaced_dct_is_a_bad_name
 #define mb_intra mb_intra_is_not_initialized_see_mb_type
 
+#define LUMA_DC_BLOCK_INDEX   24
+#define CHROMA_DC_BLOCK_INDEX 25
+
 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
 #define COEFF_TOKEN_VLC_BITS           8
 #define TOTAL_ZEROS_VLC_BITS           9
@@ -57,6 +60,8 @@
  * of progressive decoding by about 2%. */
 #define ALLOW_INTERLACE
 
+#define ALLOW_NOCHROMA
+
 #define FMO 0
 
 /**
@@ -80,12 +85,16 @@
 #endif
 #define FIELD_OR_MBAFF_PICTURE (FRAME_MBAFF || FIELD_PICTURE)
 
+#ifdef ALLOW_NOCHROMA
+#define CHROMA h->sps.chroma_format_idc
+#else
+#define CHROMA 1
+#endif
+
 #ifndef CABAC
 #define CABAC h->pps.cabac
 #endif
 
-#define CHROMA444 (h->sps.chroma_format_idc == 3)
-
 #define EXTENDED_SAR          255
 
 #define MB_TYPE_REF0       MB_TYPE_ACPRED //dirty but it fits in 16 bit
@@ -189,7 +198,7 @@ typedef struct SPS{
     int num_reorder_frames;
     int scaling_matrix_present;
     uint8_t scaling_matrix4[6][16];
-    uint8_t scaling_matrix8[6][64];
+    uint8_t scaling_matrix8[2][64];
     int nal_hrd_parameters_present_flag;
     int vcl_hrd_parameters_present_flag;
     int pic_struct_present_flag;
@@ -224,7 +233,7 @@ typedef struct PPS{
     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
     int transform_8x8_mode;     ///< transform_8x8_mode_flag
     uint8_t scaling_matrix4[6][16];
-    uint8_t scaling_matrix8[6][64];
+    uint8_t scaling_matrix8[2][64];
     uint8_t chroma_qp_table[2][64];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
     int chroma_qp_diff;
 }PPS;
@@ -289,15 +298,21 @@ typedef struct H264Context{
     unsigned int top_samples_available;
     unsigned int topright_samples_available;
     unsigned int left_samples_available;
-    uint8_t (*top_borders[2])[(16*3)*2];
+    uint8_t (*top_borders[2])[(16+2*8)*2];
 
     /**
      * non zero coeff count cache.
      * is 64 if not available.
      */
-    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[15*8];
+    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
 
-    uint8_t (*non_zero_count)[48];
+    /*
+    .UU.YYYY
+    .UU.YYYY
+    .vv.YYYY
+    .VV.YYYY
+    */
+    uint8_t (*non_zero_count)[32];
 
     /**
      * Motion vector cache.
@@ -321,7 +336,7 @@ typedef struct H264Context{
      * block_offset[ 0..23] for frame macroblocks
      * block_offset[24..47] for field macroblocks
      */
-    int block_offset[2*(16*3)];
+    int block_offset[2*(16+8)];
 
     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
     uint32_t *mb2br_xy;
@@ -341,9 +356,9 @@ typedef struct H264Context{
     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 
     uint32_t dequant4_buffer[6][QP_MAX_NUM+1][16]; //FIXME should these be moved down?
-    uint32_t dequant8_buffer[6][QP_MAX_NUM+1][64];
+    uint32_t dequant8_buffer[2][QP_MAX_NUM+1][64];
     uint32_t (*dequant4_coeff[6])[16];
-    uint32_t (*dequant8_coeff[6])[64];
+    uint32_t (*dequant8_coeff[2])[64];
 
     int slice_num;
     uint16_t *slice_table;     ///< slice_table_base + 2*mb_stride + 1
@@ -393,15 +408,15 @@ typedef struct H264Context{
     GetBitContext *intra_gb_ptr;
     GetBitContext *inter_gb_ptr;
 
-    DECLARE_ALIGNED(16, DCTELEM, mb)[16*48*2]; ///< as a dct coeffecient is int32_t in high depth, we need to reserve twice the space.
-    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[3][16*2];
+    DECLARE_ALIGNED(16, DCTELEM, mb)[16*24*2]; ///< as a dct coeffecient is int32_t in high depth, we need to reserve twice the space.
+    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16*2];
     DCTELEM mb_padding[256*2];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
 
     /**
      * Cabac
      */
     CABACContext cabac;
-    uint8_t      cabac_state[1024];
+    uint8_t      cabac_state[460];
 
     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
     uint16_t     *cbp_table;
@@ -706,43 +721,27 @@ o-o o-o
 */
 
 /* Scan8 organization:
- *    0 1 2 3 4 5 6 7
- * 0  DY    y y y y y
- * 1        y Y Y Y Y
- * 2        y Y Y Y Y
- * 3        y Y Y Y Y
- * 4        y Y Y Y Y
- * 5  DU    u u u u u
- * 6        u U U U U
- * 7        u U U U U
- * 8        u U U U U
- * 9        u U U U U
- * 10 DV    v v v v v
- * 11       v V V V V
- * 12       v V V V V
- * 13       v V V V V
- * 14       v V V V V
+ *   0 1 2 3 4 5 6 7
+ * 0   u u y y y y y
+ * 1 u U U y Y Y Y Y
+ * 2 u U U y Y Y Y Y
+ * 3   v v y Y Y Y Y
+ * 4 v V V y Y Y Y Y
+ * 5 v V V   DYDUDV
  * DY/DU/DV are for luma/chroma DC.
  */
 
-#define LUMA_DC_BLOCK_INDEX   48
-#define CHROMA_DC_BLOCK_INDEX 49
-
 //This table must be here because scan8[constant] must be known at compiletime
-static const uint8_t scan8[16*3 + 3]={
- 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
- 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
- 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
- 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
- 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
- 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
- 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
- 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
- 4+11*8, 5+11*8, 4+12*8, 5+12*8,
- 6+11*8, 7+11*8, 6+12*8, 7+12*8,
- 4+13*8, 5+13*8, 4+14*8, 5+14*8,
- 6+13*8, 7+13*8, 6+14*8, 7+14*8,
- 0+ 0*8, 0+ 5*8, 0+10*8
+static const uint8_t scan8[16 + 2*4 + 3]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+ 4+5*8, 5+5*8, 6+5*8
 };
 
 static av_always_inline uint32_t pack16to32(int a, int b){
@@ -774,11 +773,11 @@ static void fill_decode_neighbors(H264Context *h, int mb_type){
     MpegEncContext * const s = &h->s;
     const int mb_xy= h->mb_xy;
     int topleft_xy, top_xy, topright_xy, left_xy[2];
-    static const uint8_t left_block_options[4][32]={
-        {0,1,2,3,7,10,8,11,3+0*4, 3+1*4, 3+2*4, 3+3*4, 1+4*4, 1+8*4, 1+5*4, 1+9*4},
-        {2,2,3,3,8,11,8,11,3+2*4, 3+2*4, 3+3*4, 3+3*4, 1+5*4, 1+9*4, 1+5*4, 1+9*4},
-        {0,0,1,1,7,10,7,10,3+0*4, 3+0*4, 3+1*4, 3+1*4, 1+4*4, 1+8*4, 1+4*4, 1+8*4},
-        {0,2,0,2,7,10,7,10,3+0*4, 3+2*4, 3+0*4, 3+2*4, 1+4*4, 1+8*4, 1+4*4, 1+8*4}
+    static const uint8_t left_block_options[4][16]={
+        {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
+        {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
+        {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
+        {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
     };
 
     h->topleft_partition= -1;
@@ -948,41 +947,32 @@ static void fill_decode_caches(H264Context *h, int mb_type){
 */
 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
     if(top_type){
-        AV_COPY32(&h->non_zero_count_cache[4+8* 0], &h->non_zero_count[top_xy][4*3]);
-        if(CHROMA444){
-            AV_COPY32(&h->non_zero_count_cache[4+8* 5], &h->non_zero_count[top_xy][4* 7]);
-            AV_COPY32(&h->non_zero_count_cache[4+8*10], &h->non_zero_count[top_xy][4*11]);
-        }else{
-            AV_COPY32(&h->non_zero_count_cache[4+8* 5], &h->non_zero_count[top_xy][4* 5]);
-            AV_COPY32(&h->non_zero_count_cache[4+8*10], &h->non_zero_count[top_xy][4* 9]);
-        }
-    }else{
-        uint32_t top_empty = CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040;
-        AV_WN32A(&h->non_zero_count_cache[4+8* 0], top_empty);
-        AV_WN32A(&h->non_zero_count_cache[4+8* 5], top_empty);
-        AV_WN32A(&h->non_zero_count_cache[4+8*10], top_empty);
+        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]);
+            h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8];
+            h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8];
+
+            h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8];
+            h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8];
+    }else {
+            h->non_zero_count_cache[1+8*0]=
+            h->non_zero_count_cache[2+8*0]=
+
+            h->non_zero_count_cache[1+8*3]=
+            h->non_zero_count_cache[2+8*3]=
+            AV_WN32A(&h->non_zero_count_cache[4+8*0], CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040);
     }
 
     for (i=0; i<2; i++) {
         if(left_type[i]){
-            h->non_zero_count_cache[3+8* 1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]];
-            h->non_zero_count_cache[3+8* 2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]];
-            if(CHROMA444){
-                h->non_zero_count_cache[3+8* 6 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]+4*4];
-                h->non_zero_count_cache[3+8* 7 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]+4*4];
-                h->non_zero_count_cache[3+8*11 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]+8*4];
-                h->non_zero_count_cache[3+8*12 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]+8*4];
-            }else{
-                h->non_zero_count_cache[3+8* 6 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]];
-                h->non_zero_count_cache[3+8*11 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]];
-            }
+            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]];
+            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]];
+                h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]];
+                h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]];
         }else{
-            h->non_zero_count_cache[3+8* 1 + 2*8*i]=
-            h->non_zero_count_cache[3+8* 2 + 2*8*i]=
-            h->non_zero_count_cache[3+8* 6 + 2*8*i]=
-            h->non_zero_count_cache[3+8* 7 + 2*8*i]=
-            h->non_zero_count_cache[3+8*11 + 2*8*i]=
-            h->non_zero_count_cache[3+8*12 + 2*8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64;
+                h->non_zero_count_cache[3+8*1 + 2*8*i]=
+                h->non_zero_count_cache[3+8*2 + 2*8*i]=
+                h->non_zero_count_cache[0+8*1 +   8*i]=
+                h->non_zero_count_cache[0+8*4 +   8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64;
         }
     }
 
@@ -991,15 +981,15 @@ static void fill_decode_caches(H264Context *h, int mb_type){
         if(top_type) {
             h->top_cbp = h->cbp_table[top_xy];
         } else {
-            h->top_cbp = IS_INTRA(mb_type) ? 0x7CF : 0x00F;
+            h->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
         }
         // left_cbp
         if (left_type[0]) {
-            h->left_cbp =   (h->cbp_table[left_xy[0]] & 0x7F0)
+            h->left_cbp = (h->cbp_table[left_xy[0]] & 0x1f0)
                         |  ((h->cbp_table[left_xy[0]]>>(left_block[0]&(~1)))&2)
                         | (((h->cbp_table[left_xy[1]]>>(left_block[2]&(~1)))&2) << 2);
         } else {
-            h->left_cbp = IS_INTRA(mb_type) ? 0x7CF : 0x00F;
+            h->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
         }
     }
     }
@@ -1200,21 +1190,11 @@ static inline int pred_intra_mode(H264Context *h, int n){
 static inline void write_back_non_zero_count(H264Context *h){
     const int mb_xy= h->mb_xy;
 
-    AV_COPY32(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[4+8* 1]);
-    AV_COPY32(&h->non_zero_count[mb_xy][ 4], &h->non_zero_count_cache[4+8* 2]);
-    AV_COPY32(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[4+8* 3]);
-    AV_COPY32(&h->non_zero_count[mb_xy][12], &h->non_zero_count_cache[4+8* 4]);
-    AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[4+8* 6]);
-    AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8* 7]);
-    AV_COPY32(&h->non_zero_count[mb_xy][32], &h->non_zero_count_cache[4+8*11]);
-    AV_COPY32(&h->non_zero_count[mb_xy][36], &h->non_zero_count_cache[4+8*12]);
-
-    if(CHROMA444){
-        AV_COPY32(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[4+8* 8]);
-        AV_COPY32(&h->non_zero_count[mb_xy][28], &h->non_zero_count_cache[4+8* 9]);
-        AV_COPY32(&h->non_zero_count[mb_xy][40], &h->non_zero_count_cache[4+8*13]);
-        AV_COPY32(&h->non_zero_count[mb_xy][44], &h->non_zero_count_cache[4+8*14]);
-    }
+    AV_COPY64(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[0+8*1]);
+    AV_COPY64(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[0+8*2]);
+    AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[0+8*5]);
+    AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8*3]);
+    AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]);
 }
 
 static inline void write_back_motion(H264Context *h, int mb_type){
@@ -1287,7 +1267,8 @@ static void av_unused decode_mb_skip(H264Context *h){
     const int mb_xy= h->mb_xy;
     int mb_type=0;
 
-    memset(h->non_zero_count[mb_xy], 0, 48);
+    memset(h->non_zero_count[mb_xy], 0, 32);
+    memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
 
     if(MB_FIELD)
         mb_type|= MB_TYPE_INTERLACED;
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index f30f4e1c9c..69af1e2ded 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -45,7 +45,7 @@
 
 /* Cabac pre state table */
 
-static const int8_t cabac_context_init_I[1024][2] =
+static const int8_t cabac_context_init_I[460][2] =
 {
     /* 0 - 10 */
     { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
@@ -211,153 +211,10 @@ static const int8_t cabac_context_init_I[1024][2] =
     { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
     {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
     {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
-
-    /* 460 -> 1024 */
-    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
-    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
-    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
-    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
-    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
-    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
-    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
-    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
-    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
-    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
-    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
-    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
-    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
-    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
-    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
-    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
-    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
-    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
-    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
-    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
-    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
-    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
-    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
-    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
-    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
-    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
-    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
-    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
-    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
-    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
-    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
-    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
-    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
-    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
-    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
-    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
-    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
-    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
-    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
-    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
-    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
-    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
-    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
-    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
-    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
-    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
-    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
-    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
-    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
-    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
-    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
-    { -23,  68 }, { -24,  50 }, { -11,  74 }, { -14, 106 },
-    { -13,  97 }, { -15,  90 }, { -12,  90 }, { -18,  88 },
-    { -10,  73 }, {  -9,  79 }, { -14,  86 }, { -10,  73 },
-    { -10,  70 }, { -10,  69 }, {  -5,  66 }, {  -9,  64 },
-    {  -5,  58 }, {   2,  59 }, {  23, -13 }, {  26, -13 },
-    {  40, -15 }, {  49, -14 }, {  44,   3 }, {  45,   6 },
-    {  44,  34 }, {  33,  54 }, {  19,  82 }, {  21, -10 },
-    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
-    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
-    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
-    {   0,  68 }, {  -9,  92 }, { -17, 120 }, { -20, 112 },
-    { -18, 114 }, { -11,  85 }, { -15,  92 }, { -14,  89 },
-    { -26,  71 }, { -15,  81 }, { -14,  80 }, {   0,  68 },
-    { -14,  70 }, { -24,  56 }, { -23,  68 }, { -24,  50 },
-    { -11,  74 }, { -14, 106 }, { -13,  97 }, { -15,  90 },
-    { -12,  90 }, { -18,  88 }, { -10,  73 }, {  -9,  79 },
-    { -14,  86 }, { -10,  73 }, { -10,  70 }, { -10,  69 },
-    {  -5,  66 }, {  -9,  64 }, {  -5,  58 }, {   2,  59 },
-    {  23, -13 }, {  26, -13 }, {  40, -15 }, {  49, -14 },
-    {  44,   3 }, {  45,   6 }, {  44,  34 }, {  33,  54 },
-    {  19,  82 }, {  21, -10 }, {  24, -11 }, {  28,  -8 },
-    {  28,  -1 }, {  29,   3 }, {  29,   9 }, {  35,  20 },
-    {  29,  36 }, {  14,  67 }, {  -3,  75 }, {  -1,  23 },
-    {   1,  34 }, {   1,  43 }, {   0,  54 }, {  -2,  55 },
-    {   0,  61 }, {   1,  64 }, {   0,  68 }, {  -9,  92 },
-    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
-    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
-    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
-    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
-    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
-    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
-    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
-    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
-    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
-    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
-    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
-    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
-    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
-    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
-    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
-    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
-    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
-    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
-    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
-    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
-    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
-    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
-    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
-    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
-    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
-    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
-    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
-    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
-    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
-    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
-    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
-    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
-    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
-    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
-    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
-    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
-    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
-    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
-    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
-    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
-    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
-    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
-    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
-    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
-    {  -3,  71 }, {  -6,  42 }, {  -5,  50 }, {  -3,  54 },
-    {  -2,  62 }, {   0,  58 }, {   1,  63 }, {  -2,  72 },
-    {  -1,  74 }, {  -9,  91 }, {  -5,  67 }, {  -5,  27 },
-    {  -3,  39 }, {  -2,  44 }, {   0,  46 }, { -16,  64 },
-    {  -8,  68 }, { -10,  78 }, {  -6,  77 }, { -10,  86 },
-    { -12,  92 }, { -15,  55 }, { -10,  60 }, {  -6,  62 },
-    {  -4,  65 }, { -12,  73 }, {  -8,  76 }, {  -7,  80 },
-    {  -9,  88 }, { -17, 110 }, {  -3,  71 }, {  -6,  42 },
-    {  -5,  50 }, {  -3,  54 }, {  -2,  62 }, {   0,  58 },
-    {   1,  63 }, {  -2,  72 }, {  -1,  74 }, {  -9,  91 },
-    {  -5,  67 }, {  -5,  27 }, {  -3,  39 }, {  -2,  44 },
-    {   0,  46 }, { -16,  64 }, {  -8,  68 }, { -10,  78 },
-    {  -6,  77 }, { -10,  86 }, { -12,  92 }, { -15,  55 },
-    { -10,  60 }, {  -6,  62 }, {  -4,  65 }, { -12,  73 },
-    {  -8,  76 }, {  -7,  80 }, {  -9,  88 }, { -17, 110 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
-    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 }
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
 };
 
-static const int8_t cabac_context_init_PB[3][1024][2] =
+static const int8_t cabac_context_init_PB[3][460][2] =
 {
     /* i_cabac_init_idc == 0 */
     {
@@ -513,149 +370,6 @@ static const int8_t cabac_context_init_PB[3][1024][2] =
         { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
         {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
         {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
-
-        /* 460 - 1024 */
-        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
-        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
-        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
-        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
-        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
-        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
-        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
-        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
-        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
-        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
-        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
-        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
-        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
-        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
-        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
-        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
-        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
-        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
-        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
-        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
-        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
-        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
-        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
-        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
-        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
-        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
-        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
-        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
-        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
-        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
-        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
-        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
-        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
-        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
-        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
-        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
-        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
-        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
-        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
-        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
-        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
-        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
-        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
-        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
-        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
-        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
-        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
-        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
-        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
-        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
-        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
-        { -16,  66 }, { -22,  65 }, { -20,  63 }, {  -5,  85 },
-        {  -6,  81 }, { -10,  77 }, {  -7,  81 }, { -17,  80 },
-        { -18,  73 }, {  -4,  74 }, { -10,  83 }, {  -9,  71 },
-        {  -9,  67 }, {  -1,  61 }, {  -8,  66 }, { -14,  66 },
-        {   0,  59 }, {   2,  59 }, {   9,  -2 }, {  26,  -9 },
-        {  33,  -9 }, {  39,  -7 }, {  41,  -2 }, {  45,   3 },
-        {  49,   9 }, {  45,  27 }, {  36,  59 }, {  21, -13 },
-        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
-        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
-        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
-        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
-        {  -8,  66 }, {  -8,  76 }, {  -4,  79 }, {  -7,  71 },
-        {  -5,  69 }, {  -9,  70 }, {  -8,  66 }, { -10,  68 },
-        { -19,  73 }, { -12,  69 }, { -16,  70 }, { -15,  67 },
-        { -20,  62 }, { -19,  70 }, { -16,  66 }, { -22,  65 },
-        { -20,  63 }, {  -5,  85 }, {  -6,  81 }, { -10,  77 },
-        {  -7,  81 }, { -17,  80 }, { -18,  73 }, {  -4,  74 },
-        { -10,  83 }, {  -9,  71 }, {  -9,  67 }, {  -1,  61 },
-        {  -8,  66 }, { -14,  66 }, {   0,  59 }, {   2,  59 },
-        {   9,  -2 }, {  26,  -9 }, {  33,  -9 }, {  39,  -7 },
-        {  41,  -2 }, {  45,   3 }, {  49,   9 }, {  45,  27 },
-        {  36,  59 }, {  21, -13 }, {  33, -14 }, {  39,  -7 },
-        {  46,  -2 }, {  51,   2 }, {  60,   6 }, {  61,  17 },
-        {  55,  34 }, {  42,  62 }, {  -6,  66 }, {  -7,  35 },
-        {  -7,  42 }, {  -8,  45 }, {  -5,  48 }, { -12,  56 },
-        {  -6,  60 }, {  -5,  62 }, {  -8,  66 }, {  -8,  76 },
-        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
-        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
-        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
-        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
-        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
-        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
-        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
-        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
-        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
-        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
-        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
-        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
-        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
-        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
-        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
-        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
-        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
-        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
-        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
-        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
-        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
-        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
-        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
-        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
-        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
-        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
-        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
-        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
-        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
-        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
-        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
-        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
-        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
-        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
-        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
-        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
-        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
-        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
-        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
-        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
-        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
-        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
-        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
-        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
-        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
-        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
-        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
-        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
-        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
-        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
-        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
-        {  -3,  74 }, { -10,  90 }, {  -6,  76 }, {  -2,  44 },
-        {   0,  45 }, {   0,  52 }, {  -3,  64 }, {  -2,  59 },
-        {  -4,  70 }, {  -4,  75 }, {  -8,  82 }, { -17, 102 },
-        {  -9,  77 }, {   3,  24 }, {   0,  42 }, {   0,  48 },
-        {   0,  55 }, {  -6,  59 }, {  -7,  71 }, { -12,  83 },
-        { -11,  87 }, { -30, 119 }, {   1,  58 }, {  -3,  29 },
-        {  -1,  36 }, {   1,  38 }, {   2,  43 }, {  -6,  55 },
-        {   0,  58 }, {   0,  64 }, {  -3,  74 }, { -10,  90 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
-        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 }
     },
 
     /* i_cabac_init_idc == 1 */
@@ -812,149 +526,6 @@ static const int8_t cabac_context_init_PB[3][1024][2] =
         {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
         {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
         {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-
-        /* 460 - 1024 */
-        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
-        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
-        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
-        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
-        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
-        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
-        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
-        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
-        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
-        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
-        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
-        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
-        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
-        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
-        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
-        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
-        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
-        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
-        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
-        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
-        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
-        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
-        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
-        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
-        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
-        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
-        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
-        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
-        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
-        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
-        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
-        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
-        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
-        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
-        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
-        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
-        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
-        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
-        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
-        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
-        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
-        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
-        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
-        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
-        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
-        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
-        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
-        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
-        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
-        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
-        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
-        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  -3,  81 },
-        {  -3,  76 }, {  -7,  72 }, {  -6,  78 }, { -12,  72 },
-        { -14,  68 }, {  -3,  70 }, {  -6,  76 }, {  -5,  66 },
-        {  -5,  62 }, {   0,  57 }, {  -4,  61 }, {  -9,  60 },
-        {   1,  54 }, {   2,  58 }, {  17, -10 }, {  32, -13 },
-        {  42,  -9 }, {  49,  -5 }, {  53,   0 }, {  64,   3 },
-        {  68,  10 }, {  66,  27 }, {  47,  57 }, {  17, -10 },
-        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
-        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
-        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
-        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
-        {  -4,  67 }, {  -7,  82 }, {  -5,  85 }, {  -6,  81 },
-        { -10,  77 }, {  -7,  81 }, { -17,  80 }, { -18,  73 },
-        {  -4,  74 }, { -10,  83 }, {  -9,  71 }, {  -9,  67 },
-        {  -1,  61 }, {  -8,  66 }, { -14,  66 }, {   0,  59 },
-        {   2,  59 }, {  -3,  81 }, {  -3,  76 }, {  -7,  72 },
-        {  -6,  78 }, { -12,  72 }, { -14,  68 }, {  -3,  70 },
-        {  -6,  76 }, {  -5,  66 }, {  -5,  62 }, {   0,  57 },
-        {  -4,  61 }, {  -9,  60 }, {   1,  54 }, {   2,  58 },
-        {  17, -10 }, {  32, -13 }, {  42,  -9 }, {  49,  -5 },
-        {  53,   0 }, {  64,   3 }, {  68,  10 }, {  66,  27 },
-        {  47,  57 }, {  17, -10 }, {  32, -13 }, {  42,  -9 },
-        {  49,  -5 }, {  53,   0 }, {  64,   3 }, {  68,  10 },
-        {  66,  27 }, {  47,  57 }, {  -5,  71 }, {   0,  24 },
-        {  -1,  36 }, {  -2,  42 }, {  -2,  52 }, {  -9,  57 },
-        {  -6,  63 }, {  -4,  65 }, {  -4,  67 }, {  -7,  82 },
-        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
-        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
-        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
-        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
-        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
-        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
-        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
-        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
-        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
-        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
-        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
-        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
-        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
-        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
-        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
-        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
-        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
-        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
-        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
-        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
-        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
-        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
-        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
-        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
-        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
-        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
-        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
-        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
-        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
-        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
-        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
-        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
-        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
-        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
-        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
-        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
-        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
-        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
-        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
-        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
-        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
-        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
-        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
-        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
-        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
-        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
-        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
-        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
-        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
-        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
-        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
-        {  -5,  74 }, {  -9,  86 }, { -23, 112 }, { -15,  71 },
-        {  -7,  61 }, {   0,  53 }, {  -5,  66 }, { -11,  77 },
-        {  -9,  80 }, {  -9,  84 }, { -10,  87 }, { -34, 127 },
-        { -21, 101 }, {  -3,  39 }, {  -5,  53 }, {  -7,  61 },
-        { -11,  75 }, { -15,  77 }, { -17,  91 }, { -25, 107 },
-        { -25, 111 }, { -28, 122 }, { -11,  76 }, { -10,  44 },
-        { -10,  52 }, { -10,  57 }, {  -9,  58 }, { -16,  72 },
-        {  -7,  69 }, {  -4,  69 }, {  -5,  74 }, {  -9,  86 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
-        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 }
     },
 
     /* i_cabac_init_idc == 2 */
@@ -1111,149 +682,6 @@ static const int8_t cabac_context_init_PB[3][1024][2] =
         { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
         {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
         {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-
-        /* 460 - 1024 */
-        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
-        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
-        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
-        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
-        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
-        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
-        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
-        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
-        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
-        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
-        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
-        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
-        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
-        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
-        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
-        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
-        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
-        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
-        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
-        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
-        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
-        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
-        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
-        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
-        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
-        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
-        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
-        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
-        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
-        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
-        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
-        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
-        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
-        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
-        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
-        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
-        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
-        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
-        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
-        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
-        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
-        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
-        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
-        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
-        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
-        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
-        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
-        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
-        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
-        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
-        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
-        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {  -3,  78 },
-        {  -8,  74 }, {  -9,  72 }, { -10,  72 }, { -18,  75 },
-        { -12,  71 }, { -11,  63 }, {  -5,  70 }, { -17,  75 },
-        { -14,  72 }, { -16,  67 }, {  -8,  53 }, { -14,  59 },
-        {  -9,  52 }, { -11,  68 }, {   9,  -2 }, {  30, -10 },
-        {  31,  -4 }, {  33,  -1 }, {  33,   7 }, {  31,  12 },
-        {  37,  23 }, {  31,  38 }, {  20,  64 }, {   9,  -2 },
-        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
-        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
-        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
-        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
-        {  -6,  68 }, { -10,  79 }, {  -3,  78 }, {  -8,  74 },
-        {  -9,  72 }, { -10,  72 }, { -18,  75 }, { -12,  71 },
-        { -11,  63 }, {  -5,  70 }, { -17,  75 }, { -14,  72 },
-        { -16,  67 }, {  -8,  53 }, { -14,  59 }, {  -9,  52 },
-        { -11,  68 }, {  -3,  78 }, {  -8,  74 }, {  -9,  72 },
-        { -10,  72 }, { -18,  75 }, { -12,  71 }, { -11,  63 },
-        {  -5,  70 }, { -17,  75 }, { -14,  72 }, { -16,  67 },
-        {  -8,  53 }, { -14,  59 }, {  -9,  52 }, { -11,  68 },
-        {   9,  -2 }, {  30, -10 }, {  31,  -4 }, {  33,  -1 },
-        {  33,   7 }, {  31,  12 }, {  37,  23 }, {  31,  38 },
-        {  20,  64 }, {   9,  -2 }, {  30, -10 }, {  31,  -4 },
-        {  33,  -1 }, {  33,   7 }, {  31,  12 }, {  37,  23 },
-        {  31,  38 }, {  20,  64 }, {  -9,  71 }, {  -7,  37 },
-        {  -8,  44 }, { -11,  49 }, { -10,  56 }, { -12,  59 },
-        {  -8,  63 }, {  -9,  67 }, {  -6,  68 }, { -10,  79 },
-        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
-        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
-        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
-        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
-        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
-        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
-        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
-        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
-        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
-        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
-        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
-        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
-        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
-        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
-        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
-        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
-        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
-        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
-        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
-        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
-        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
-        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
-        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
-        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
-        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
-        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
-        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
-        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
-        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
-        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
-        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
-        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
-        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
-        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
-        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
-        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
-        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
-        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
-        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
-        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
-        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
-        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
-        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
-        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
-        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
-        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
-        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
-        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
-        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
-        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
-        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
-        { -12,  92 }, { -18, 108 }, { -24, 115 }, { -22,  82 },
-        {  -9,  62 }, {   0,  53 }, {   0,  59 }, { -14,  85 },
-        { -13,  89 }, { -13,  94 }, { -11,  92 }, { -29, 127 },
-        { -21, 100 }, { -14,  57 }, { -12,  67 }, { -11,  71 },
-        { -10,  77 }, { -21,  85 }, { -16,  88 }, { -23, 104 },
-        { -15,  98 }, { -37, 127 }, { -10,  82 }, {  -8,  48 },
-        {  -8,  61 }, {  -8,  66 }, {  -7,  70 }, { -14,  75 },
-        { -10,  79 }, {  -9,  83 }, { -12,  92 }, { -18, 108 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
-        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 }
     }
 };
 
@@ -1267,7 +695,7 @@ void ff_h264_init_cabac_states(H264Context *h) {
     else                                 tab = cabac_context_init_PB[h->cabac_init_idc];
 
     /* calculate pre-state */
-    for( i= 0; i < 1024; i++ ) {
+    for( i= 0; i < 460; i++ ) {
         int pre = 2*(((tab[i][0] * slice_qp) >>4 ) + tab[i][1]) - 127;
 
         pre^= pre>>31;
@@ -1529,22 +957,21 @@ static int decode_cabac_mb_mvd( H264Context *h, int ctxbase, int amvd, int *mvda
     my += decode_cabac_mb_mvd( h, 47, amvd1, &mpy );\
 }
 
-static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int max_coeff, int is_dc ) {
+static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
     int nza, nzb;
     int ctx = 0;
-    static const uint16_t base_ctx[14] = {85,89,93,97,101,1012,460,464,468,1016,472,476,480,1020};
 
     if( is_dc ) {
-        if( cat == 3 ) {
+        if( cat == 0 ) {
+            nza = h->left_cbp&0x100;
+            nzb = h-> top_cbp&0x100;
+        } else {
             idx -= CHROMA_DC_BLOCK_INDEX;
             nza = (h->left_cbp>>(6+idx))&0x01;
             nzb = (h-> top_cbp>>(6+idx))&0x01;
-        } else {
-            idx -= LUMA_DC_BLOCK_INDEX;
-            nza = h->left_cbp&(0x100<<idx);
-            nzb = h-> top_cbp&(0x100<<idx);
         }
     } else {
+        assert(cat == 1 || cat == 2 || cat == 4);
         nza = h->non_zero_count_cache[scan8[idx] - 1];
         nzb = h->non_zero_count_cache[scan8[idx] - 8];
     }
@@ -1555,7 +982,7 @@ static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx,
     if( nzb > 0 )
         ctx += 2;
 
-    return base_ctx[cat] + ctx;
+    return ctx + 4 * cat;
 }
 
 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
@@ -1566,16 +993,16 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
 };
 
 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
-    static const int significant_coeff_flag_offset[2][14] = {
-      { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
-      { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
+    static const int significant_coeff_flag_offset[2][6] = {
+      { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
+      { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
     };
-    static const int last_coeff_flag_offset[2][14] = {
-      { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 },
-      { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 }
+    static const int last_coeff_flag_offset[2][6] = {
+      { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
+      { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
     };
-    static const int coeff_abs_level_m1_offset[14] = {
-        227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766
+    static const int coeff_abs_level_m1_offset[6] = {
+        227+0, 227+10, 227+20, 227+30, 227+39, 426
     };
     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
@@ -1630,7 +1057,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
     abs_level_m1_ctx_base = h->cabac_state
         + coeff_abs_level_m1_offset[cat];
 
-    if( !is_dc && max_coeff == 64 ) {
+    if( !is_dc && cat == 5 ) {
 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
         for(last= 0; last < coefs; last++) { \
             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
@@ -1648,11 +1075,9 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
         }
         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
-        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index,
-                                                 last_coeff_ctx_base-significant_coeff_ctx_base, sig_off);
+        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
     } else {
-        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index,
-                                             last_coeff_ctx_base-significant_coeff_ctx_base);
+        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
 #else
         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
     } else {
@@ -1662,16 +1087,16 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
     assert(coeff_count > 0);
 
     if( is_dc ) {
-        if( cat == 3 )
-            h->cbp_table[h->mb_xy] |= 0x40 << (n - CHROMA_DC_BLOCK_INDEX);
+        if( cat == 0 )
+            h->cbp_table[h->mb_xy] |= 0x100;
         else
-            h->cbp_table[h->mb_xy] |= 0x100 << (n - LUMA_DC_BLOCK_INDEX);
+            h->cbp_table[h->mb_xy] |= 0x40 << (n - CHROMA_DC_BLOCK_INDEX);
         h->non_zero_count_cache[scan8[n]] = coeff_count;
     } else {
-        if( max_coeff == 64 )
+        if( cat == 5 )
             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
         else {
-            assert( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
+            assert( cat == 1 || cat == 2 || cat == 4 );
             h->non_zero_count_cache[scan8[n]] = coeff_count;
         }
     }
@@ -1754,7 +1179,7 @@ static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block
 
 static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
     /* read coded block flag */
-    if( get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 1 ) ] ) == 0 ) {
+    if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, 1 ) ] ) == 0 ) {
         h->non_zero_count_cache[scan8[n]] = 0;
         return;
     }
@@ -1763,68 +1188,13 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *
 
 static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
     /* read coded block flag */
-    if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) {
-        if( max_coeff == 64 ) {
-            fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 0, 1);
-        } else {
-            h->non_zero_count_cache[scan8[n]] = 0;
-        }
+    if( cat != 5 && get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, 0 ) ] ) == 0 ) {
+        h->non_zero_count_cache[scan8[n]] = 0;
         return;
     }
     decode_cabac_residual_nondc_internal( h, block, cat, n, scantable, qmul, max_coeff );
 }
 
-static av_always_inline void decode_cabac_luma_residual( H264Context *h, const uint8_t *scan, const uint8_t *scan8x8, int pixel_shift, int mb_type, int cbp, int p )
-{
-    static const uint8_t ctx_cat[4][3] = {{0,6,10},{1,7,11},{2,8,12},{5,9,13}};
-    const uint32_t *qmul;
-    int i8x8, i4x4;
-    MpegEncContext * const s = &h->s;
-    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
-    if( IS_INTRA16x16( mb_type ) ) {
-        //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
-        AV_ZERO128(h->mb_luma_dc[p]+0);
-        AV_ZERO128(h->mb_luma_dc[p]+8);
-        AV_ZERO128(h->mb_luma_dc[p]+16);
-        AV_ZERO128(h->mb_luma_dc[p]+24);
-        decode_cabac_residual_dc(h, h->mb_luma_dc[p], ctx_cat[0][p], LUMA_DC_BLOCK_INDEX+p, scan, 16);
-
-        if( cbp&15 ) {
-            qmul = h->dequant4_coeff[p][qscale];
-            for( i4x4 = 0; i4x4 < 16; i4x4++ ) {
-                const int index = 16*p + i4x4;
-                //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", index );
-                decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[1][p], index, scan + 1, qmul, 15);
-            }
-        } else {
-            fill_rectangle(&h->non_zero_count_cache[scan8[16*p]], 4, 4, 8, 0, 1);
-        }
-    } else {
-        int cqm = (IS_INTRA( mb_type ) ? 0:3) + p;
-        for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
-            if( cbp & (1<<i8x8) ) {
-                if( IS_8x8DCT(mb_type) ) {
-                    const int index = 16*p + 4*i8x8;
-                    decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[3][p], index,
-                                                scan8x8, h->dequant8_coeff[cqm][qscale], 64);
-                } else {
-                    qmul = h->dequant4_coeff[cqm][qscale];
-                    for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
-                        const int index = 16*p + 4*i8x8 + i4x4;
-                        //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
-//START_TIMER
-                        decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[2][p], index, scan, qmul, 16);
-//STOP_TIMER("decode_residual")
-                    }
-                }
-            } else {
-                uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8+16*p] ];
-                nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
-            }
-        }
-    }
-}
-
 /**
  * decodes a macroblock
  * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
@@ -1834,7 +1204,6 @@ int ff_h264_decode_mb_cabac(H264Context *h) {
     int mb_xy;
     int mb_type, partition_count, cbp = 0;
     int dct8x8_allowed= h->pps.transform_8x8_mode;
-    int decode_chroma = h->sps.chroma_format_idc == 1 || h->sps.chroma_format_idc == 2;
     const int pixel_shift = h->pixel_shift;
 
     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
@@ -1944,8 +1313,7 @@ decode_intra_mb:
     h->slice_table[ mb_xy ]= h->slice_num;
 
     if(IS_INTRA_PCM(mb_type)) {
-        static const uint16_t mb_sizes[4] = {256,384,512,768};
-        const int mb_size = mb_sizes[h->sps.chroma_format_idc]*h->sps.bit_depth_luma >> 3;
+        const int mb_size = (384*h->sps.bit_depth_luma) >> 3;
         const uint8_t *ptr;
 
         // We assume these blocks are very rare so we do not optimize it.
@@ -1958,17 +1326,20 @@ decode_intra_mb:
         }
 
         // The pixels are stored in the same order as levels in h->mb array.
-        memcpy(h->mb, ptr, mb_size); ptr+=mb_size;
+        memcpy(h->mb, ptr, 2*mb_size/3); ptr+=2*mb_size/3;
+        if(CHROMA){
+            memcpy(h->mb+mb_size/3, ptr, mb_size/3); ptr+=mb_size/3;
+        }
 
         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
 
         // All blocks are present
-        h->cbp_table[mb_xy] = 0xf7ef;
+        h->cbp_table[mb_xy] = 0x1ef;
         h->chroma_pred_mode_table[mb_xy] = 0;
         // In deblocking, the quantizer is 0
         s->current_picture.qscale_table[mb_xy]= 0;
         // All coeffs are present
-        memset(h->non_zero_count[mb_xy], 16, 48);
+        memset(h->non_zero_count[mb_xy], 16, 32);
         s->current_picture.mb_type[mb_xy]= mb_type;
         h->last_qscale_diff = 0;
         return 0;
@@ -2005,7 +1376,7 @@ decode_intra_mb:
             h->intra16x16_pred_mode= ff_h264_check_intra_pred_mode( h, h->intra16x16_pred_mode );
             if( h->intra16x16_pred_mode < 0 ) return -1;
         }
-        if(decode_chroma){
+        if(CHROMA){
             h->chroma_pred_mode_table[mb_xy] =
             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
 
@@ -2234,7 +1605,7 @@ decode_intra_mb:
 
     if( !IS_INTRA16x16( mb_type ) ) {
         cbp  = decode_cabac_mb_cbp_luma( h );
-        if(decode_chroma)
+        if(CHROMA)
             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
     }
 
@@ -2243,28 +1614,6 @@ decode_intra_mb:
     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
         mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
     }
-
-    /* It would be better to do this in fill_decode_caches, but we don't know
-     * the transform mode of the current macroblock there. */
-    if (CHROMA444 && IS_8x8DCT(mb_type)){
-        int i;
-        for (i = 0; i < 2; i++){
-            if (h->left_type[i] && !IS_8x8DCT(h->left_type[i])){
-                h->non_zero_count_cache[3+8* 1 + 2*8*i]=
-                h->non_zero_count_cache[3+8* 2 + 2*8*i]=
-                h->non_zero_count_cache[3+8* 6 + 2*8*i]=
-                h->non_zero_count_cache[3+8* 7 + 2*8*i]=
-                h->non_zero_count_cache[3+8*11 + 2*8*i]=
-                h->non_zero_count_cache[3+8*12 + 2*8*i]= IS_INTRA(mb_type) ? 64 : 0;
-            }
-        }
-        if (h->top_type && !IS_8x8DCT(h->top_type)){
-            uint32_t top_empty = CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040;
-            AV_WN32A(&h->non_zero_count_cache[4+8* 0], top_empty);
-            AV_WN32A(&h->non_zero_count_cache[4+8* 5], top_empty);
-            AV_WN32A(&h->non_zero_count_cache[4+8*10], top_empty);
-        }
-    }
     s->current_picture.mb_type[mb_xy]= mb_type;
 
     if( cbp || IS_INTRA16x16( mb_type ) ) {
@@ -2309,38 +1658,76 @@ decode_intra_mb:
         }else
             h->last_qscale_diff=0;
 
-        decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 0);
-        if(CHROMA444){
-            decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1);
-            decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2);
-        } else {
-            if( cbp&0x30 ){
-                int c;
-                for( c = 0; c < 2; c++ ) {
-                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
-                    decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
-                }
-            }
+        if( IS_INTRA16x16( mb_type ) ) {
+            int i;
+            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
+            AV_ZERO128(h->mb_luma_dc+0);
+            AV_ZERO128(h->mb_luma_dc+8);
+            AV_ZERO128(h->mb_luma_dc+16);
+            AV_ZERO128(h->mb_luma_dc+24);
+            decode_cabac_residual_dc( h, h->mb_luma_dc, 0, LUMA_DC_BLOCK_INDEX, scan, 16);
 
-            if( cbp&0x20 ) {
-                int c, i;
-                for( c = 0; c < 2; c++ ) {
-                    qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
-                    for( i = 0; i < 4; i++ ) {
-                        const int index = 16 + 16 * c + i;
-                        //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
-                        decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
-                    }
+            if( cbp&15 ) {
+                qmul = h->dequant4_coeff[0][s->qscale];
+                for( i = 0; i < 16; i++ ) {
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
+                    decode_cabac_residual_nondc(h, h->mb + (16*i << pixel_shift), 1, i, scan + 1, qmul, 15);
                 }
             } else {
-                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
-                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+            }
+        } else {
+            int i8x8, i4x4;
+            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
+                if( cbp & (1<<i8x8) ) {
+                    if( IS_8x8DCT(mb_type) ) {
+                        decode_cabac_residual_nondc(h, h->mb + (64*i8x8 << pixel_shift), 5, 4*i8x8,
+                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
+                    } else {
+                        qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
+                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
+                            const int index = 4*i8x8 + i4x4;
+                            //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
+//START_TIMER
+                            decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 2, index, scan, qmul, 16);
+//STOP_TIMER("decode_residual")
+                        }
+                    }
+                } else {
+                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
+                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+                }
             }
         }
+
+        if( cbp&0x30 ){
+            int c;
+            for( c = 0; c < 2; c++ ) {
+                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
+                decode_cabac_residual_dc(h, h->mb + ((256 + 16*4*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
+            }
+        }
+
+        if( cbp&0x20 ) {
+            int c, i;
+            for( c = 0; c < 2; c++ ) {
+                qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
+                for( i = 0; i < 4; i++ ) {
+                    const int index = 16 + 4 * c + i;
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
+                    decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
+                }
+            }
+        } else {
+            uint8_t * const nnz= &h->non_zero_count_cache[0];
+            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        }
     } else {
-        fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
-        fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
-        fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
+        uint8_t * const nnz= &h->non_zero_count_cache[0];
+        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
+        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
         h->last_qscale_diff = 0;
     }
 
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index 497166b423..2e5ea54679 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -371,12 +371,12 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
 
     //FIXME put trailing_onex into the context
 
-    if(max_coeff <= 8){
+    if(n >= CHROMA_DC_BLOCK_INDEX){
         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
         total_coeff= coeff_token>>2;
     }else{
-        if(n >= LUMA_DC_BLOCK_INDEX){
-            total_coeff= pred_non_zero_count(h, (n - LUMA_DC_BLOCK_INDEX)*16);
+        if(n == LUMA_DC_BLOCK_INDEX){
+            total_coeff= pred_non_zero_count(h, 0);
             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
             total_coeff= coeff_token>>2;
         }else{
@@ -482,8 +482,7 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
     if(total_coeff == max_coeff)
         zeros_left=0;
     else{
-        /* FIXME: we don't actually support 4:2:2 yet. */
-        if(max_coeff <= 8)
+        if(n >= CHROMA_DC_BLOCK_INDEX)
             zeros_left= get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[ total_coeff ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
         else
             zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
@@ -537,80 +536,12 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
     return 0;
 }
 
-static av_always_inline int decode_luma_residual(H264Context *h, GetBitContext *gb, const uint8_t *scan, const uint8_t *scan8x8, int pixel_shift, int mb_type, int cbp, int p){
-    int i4x4, i8x8;
-    MpegEncContext * const s = &h->s;
-    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
-    if(IS_INTRA16x16(mb_type)){
-        AV_ZERO128(h->mb_luma_dc[p]+0);
-        AV_ZERO128(h->mb_luma_dc[p]+8);
-        AV_ZERO128(h->mb_luma_dc[p]+16);
-        AV_ZERO128(h->mb_luma_dc[p]+24);
-        if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc[p], LUMA_DC_BLOCK_INDEX+p, scan, NULL, 16) < 0){
-            return -1; //FIXME continue if partitioned and other return -1 too
-        }
-
-        assert((cbp&15) == 0 || (cbp&15) == 15);
-
-        if(cbp&15){
-            for(i8x8=0; i8x8<4; i8x8++){
-                for(i4x4=0; i4x4<4; i4x4++){
-                    const int index= i4x4 + 4*i8x8 + p*16;
-                    if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index << pixel_shift),
-                        index, scan + 1, h->dequant4_coeff[p][qscale], 15) < 0 ){
-                        return -1;
-                    }
-                }
-            }
-            return 0xf;
-        }else{
-            fill_rectangle(&h->non_zero_count_cache[scan8[p*16]], 4, 4, 8, 0, 1);
-            return 0;
-        }
-    }else{
-        int cqm = (IS_INTRA( mb_type ) ? 0:3)+p;
-        /* For CAVLC 4:4:4, we need to keep track of the luma 8x8 CBP for deblocking nnz purposes. */
-        int new_cbp = 0;
-        for(i8x8=0; i8x8<4; i8x8++){
-            if(cbp & (1<<i8x8)){
-                if(IS_8x8DCT(mb_type)){
-                    DCTELEM *buf = &h->mb[64*i8x8+256*p << pixel_shift];
-                    uint8_t *nnz;
-                    for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= i4x4 + 4*i8x8 + p*16;
-                        if( decode_residual(h, gb, buf, index, scan8x8+16*i4x4,
-                                            h->dequant8_coeff[cqm][qscale], 16) < 0 )
-                            return -1;
-                    }
-                    nnz= &h->non_zero_count_cache[ scan8[4*i8x8+p*16] ];
-                    nnz[0] += nnz[1] + nnz[8] + nnz[9];
-                    new_cbp |= !!nnz[0] << i8x8;
-                }else{
-                    for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= i4x4 + 4*i8x8 + p*16;
-                        if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index,
-                                            scan, h->dequant4_coeff[cqm][qscale], 16) < 0 ){
-                            return -1;
-                        }
-                        new_cbp |= h->non_zero_count_cache[ scan8[index] ] << i8x8;
-                    }
-                }
-            }else{
-                uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8+p*16] ];
-                nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
-            }
-        }
-        return new_cbp;
-    }
-}
-
 int ff_h264_decode_mb_cavlc(H264Context *h){
     MpegEncContext * const s = &h->s;
     int mb_xy;
     int partition_count;
     unsigned int mb_type, cbp;
     int dct8x8_allowed= h->pps.transform_8x8_mode;
-    int decode_chroma = h->sps.chroma_format_idc == 1 || h->sps.chroma_format_idc == 2;
     const int pixel_shift = h->pixel_shift;
 
     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
@@ -677,21 +608,19 @@ decode_intra_mb:
 
     if(IS_INTRA_PCM(mb_type)){
         unsigned int x;
-        static const uint16_t mb_sizes[4] = {256,384,512,768};
-        const int mb_size = mb_sizes[h->sps.chroma_format_idc]*h->sps.bit_depth_luma >> 3;
 
         // We assume these blocks are very rare so we do not optimize it.
         align_get_bits(&s->gb);
 
         // The pixels are stored in the same order as levels in h->mb array.
-        for(x=0; x < mb_size; x++){
+        for(x=0; x < (CHROMA ? 384 : 256)*h->sps.bit_depth_luma/8; x++){
             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
         }
 
         // In deblocking, the quantizer is 0
         s->current_picture.qscale_table[mb_xy]= 0;
         // All coeffs are present
-        memset(h->non_zero_count[mb_xy], 16, 48);
+        memset(h->non_zero_count[mb_xy], 16, 32);
 
         s->current_picture.mb_type[mb_xy]= mb_type;
         return 0;
@@ -739,7 +668,7 @@ decode_intra_mb:
             if(h->intra16x16_pred_mode < 0)
                 return -1;
         }
-        if(decode_chroma){
+        if(CHROMA){
             pred_mode= ff_h264_check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
             if(pred_mode < 0)
                 return -1;
@@ -967,19 +896,15 @@ decode_intra_mb:
 
     if(!IS_INTRA16x16(mb_type)){
         cbp= get_ue_golomb(&s->gb);
+        if(cbp > 47){
+            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
+            return -1;
+        }
 
-        if(decode_chroma){
-            if(cbp > 47){
-                av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
-                return -1;
-            }
+        if(CHROMA){
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
             else                     cbp= golomb_to_inter_cbp   [cbp];
         }else{
-            if(cbp > 15){
-                av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
-                return -1;
-            }
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
             else                     cbp= golomb_to_inter_cbp_gray[cbp];
         }
@@ -993,9 +918,8 @@ decode_intra_mb:
     s->current_picture.mb_type[mb_xy]= mb_type;
 
     if(cbp || IS_INTRA16x16(mb_type)){
-        int i4x4, chroma_idx;
+        int i8x8, i4x4, chroma_idx;
         int dquant;
-        int ret;
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
         const uint8_t *scan, *scan8x8;
         const int max_qp = 51 + 6*(h->sps.bit_depth_luma-8);
@@ -1023,45 +947,85 @@ decode_intra_mb:
 
         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
-
-        if( (ret = decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 0)) < 0 ){
-            return -1;
-        }
-        h->cbp_table[mb_xy] |= ret << 12;
-        if(CHROMA444){
-            if( decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 1) < 0 ){
-                return -1;
-            }
-            if( decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 2) < 0 ){
-                return -1;
-            }
-        } else {
-            if(cbp&0x30){
-                for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                    if( decode_residual(h, gb, h->mb + ((256 + 16*16*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
-                        return -1;
-                    }
+        if(IS_INTRA16x16(mb_type)){
+            AV_ZERO128(h->mb_luma_dc+0);
+            AV_ZERO128(h->mb_luma_dc+8);
+            AV_ZERO128(h->mb_luma_dc+16);
+            AV_ZERO128(h->mb_luma_dc+24);
+            if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
+                return -1; //FIXME continue if partitioned and other return -1 too
             }
 
-            if(cbp&0x20){
-                for(chroma_idx=0; chroma_idx<2; chroma_idx++){
-                    const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
+            assert((cbp&15) == 0 || (cbp&15) == 15);
+
+            if(cbp&15){
+                for(i8x8=0; i8x8<4; i8x8++){
                     for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= 16 + 16*chroma_idx + i4x4;
-                        if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
+                        const int index= i4x4 + 4*i8x8;
+                        if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index << pixel_shift), index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
                             return -1;
                         }
                     }
                 }
             }else{
-                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
-                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+            }
+        }else{
+            for(i8x8=0; i8x8<4; i8x8++){
+                if(cbp & (1<<i8x8)){
+                    if(IS_8x8DCT(mb_type)){
+                        DCTELEM *buf = &h->mb[64*i8x8 << pixel_shift];
+                        uint8_t *nnz;
+                        for(i4x4=0; i4x4<4; i4x4++){
+                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
+                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
+                                return -1;
+                        }
+                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
+                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
+                    }else{
+                        for(i4x4=0; i4x4<4; i4x4++){
+                            const int index= i4x4 + 4*i8x8;
+
+                            if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
+                                return -1;
+                            }
+                        }
+                    }
+                }else{
+                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
+                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+                }
             }
         }
+
+        if(cbp&0x30){
+            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
+                if( decode_residual(h, gb, h->mb + ((256 + 16*4*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
+                    return -1;
+                }
+        }
+
+        if(cbp&0x20){
+            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
+                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
+                for(i4x4=0; i4x4<4; i4x4++){
+                    const int index= 16 + 4*chroma_idx + i4x4;
+                    if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
+                        return -1;
+                    }
+                }
+            }
+        }else{
+            uint8_t * const nnz= &h->non_zero_count_cache[0];
+            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        }
     }else{
-        fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
-        fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
-        fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
+        uint8_t * const nnz= &h->non_zero_count_cache[0];
+        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
+        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
     }
     s->current_picture.qscale_table[mb_xy]= s->qscale;
     write_back_non_zero_count(h);
diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c
index 1ae534ec96..72b1905936 100644
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@@ -217,11 +217,10 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
     int mb_xy;
     int mb_type, left_type;
     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
-    int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
 
     mb_xy = h->mb_xy;
 
-    if(!h->top_type || !h->h264dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff || CHROMA444) {
+    if(!h->top_type || !h->h264dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff) {
         ff_h264_filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
         return;
     }
@@ -263,18 +262,16 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
             filter_mb_edgeh( &img_y[4*2*linesize], linesize, bS3, qp, h);
             filter_mb_edgeh( &img_y[4*3*linesize], linesize, bS3, qp, h);
         }
-        if(chroma){
-            if(left_type){
-                filter_mb_edgecv( &img_cb[2*0], uvlinesize, bS4, qpc0, h);
-                filter_mb_edgecv( &img_cr[2*0], uvlinesize, bS4, qpc0, h);
-            }
-            filter_mb_edgecv( &img_cb[2*2], uvlinesize, bS3, qpc, h);
-            filter_mb_edgecv( &img_cr[2*2], uvlinesize, bS3, qpc, h);
-            filter_mb_edgech( &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
-            filter_mb_edgech( &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
-            filter_mb_edgech( &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
-            filter_mb_edgech( &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
+        if(left_type){
+            filter_mb_edgecv( &img_cb[2*0], uvlinesize, bS4, qpc0, h);
+            filter_mb_edgecv( &img_cr[2*0], uvlinesize, bS4, qpc0, h);
         }
+        filter_mb_edgecv( &img_cb[2*2], uvlinesize, bS3, qpc, h);
+        filter_mb_edgecv( &img_cr[2*2], uvlinesize, bS3, qpc, h);
+        filter_mb_edgech( &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
+        filter_mb_edgech( &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
+        filter_mb_edgech( &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
+        filter_mb_edgech( &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
         return;
     } else {
         LOCAL_ALIGNED_8(int16_t, bS, [2], [4][4]);
@@ -301,7 +298,7 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
 #define FILTER(hv,dir,edge)\
         if(AV_RN64A(bS[dir][edge])) {                                   \
             filter_mb_edge##hv( &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir, h );\
-            if(chroma && !(edge&1)) {\
+            if(!(edge&1)) {\
                 filter_mb_edgec##hv( &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir, h );\
                 filter_mb_edgec##hv( &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir, h );\
             }\
@@ -356,10 +353,9 @@ static int check_mv(H264Context *h, long b_idx, long bn_idx, int mvy_limit){
     return v;
 }
 
-static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int chroma, int chroma444, int dir) {
+static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
     MpegEncContext * const s = &h->s;
     int edge;
-    int chroma_qp_avg[2];
     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
     const int mbm_type = dir == 0 ? h->left_type[0] : h->top_type;
 
@@ -398,7 +394,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                         bS[2]= 1+((h->cbp_table[mbn_xy] & 8)||h->non_zero_count_cache[scan8[0]+2]);
                         bS[3]= 1+((h->cbp_table[mbn_xy] & 8)||h->non_zero_count_cache[scan8[0]+3]);
                     }else{
-                    const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy] + 3*4;
+                    const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy] + 4+3*8;
                     int i;
                     for( i = 0; i < 4; i++ ) {
                         bS[i] = 1 + !!(h->non_zero_count_cache[scan8[0]+i] | mbn_nnz[i]);
@@ -411,17 +407,10 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
                 filter_mb_edgeh( &img_y[j*linesize], tmp_linesize, bS, qp, h );
-                chroma_qp_avg[0] = (h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                chroma_qp_avg[1] = (h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                if (chroma) {
-                    if (chroma444) {
-                        filter_mb_edgeh (&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
-                        filter_mb_edgeh (&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
-                    } else {
-                        filter_mb_edgech(&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
-                        filter_mb_edgech(&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
-                    }
-                }
+                filter_mb_edgech( &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
+                                ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1, h);
+                filter_mb_edgech( &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
+                                ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1, h);
             }
         }else{
             DECLARE_ALIGNED(8, int16_t, bS)[4];
@@ -476,29 +465,23 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
                 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
-                chroma_qp_avg[0] = (h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
-                chroma_qp_avg[1] = (h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
                 if( dir == 0 ) {
                     filter_mb_edgev( &img_y[0], linesize, bS, qp, h );
-                    if (chroma) {
-                        if (chroma444) {
-                            filter_mb_edgev ( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
-                            filter_mb_edgev ( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
-                        } else {
-                            filter_mb_edgecv( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
-                            filter_mb_edgecv( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
-                        }
+                    {
+                        int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
+                        filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, h);
+                        if(h->pps.chroma_qp_diff)
+                            qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
+                        filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, h);
                     }
                 } else {
                     filter_mb_edgeh( &img_y[0], linesize, bS, qp, h );
-                    if (chroma) {
-                        if (chroma444) {
-                            filter_mb_edgeh ( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
-                            filter_mb_edgeh ( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
-                        } else {
-                            filter_mb_edgech( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
-                            filter_mb_edgech( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
-                        }
+                    {
+                        int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
+                        filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, h);
+                        if(h->pps.chroma_qp_diff)
+                            qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
+                        filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, h);
                     }
                 }
             }
@@ -562,25 +545,15 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
         //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
         if( dir == 0 ) {
             filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, h );
-            if (chroma) {
-                if (chroma444) {
-                    filter_mb_edgev ( &img_cb[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
-                    filter_mb_edgev ( &img_cr[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
-                } else if( (edge&1) == 0 ) {
-                    filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
-                    filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
-                }
+            if( (edge&1) == 0 ) {
+                filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
+                filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
             }
         } else {
             filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h );
-            if (chroma) {
-                if (chroma444) {
-                    filter_mb_edgeh ( &img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
-                    filter_mb_edgeh ( &img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
-                } else if( (edge&1) == 0 ) {
-                    filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
-                    filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
-                }
+            if( (edge&1) == 0 ) {
+                filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
+                filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
             }
         }
     }
@@ -593,7 +566,6 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
     int first_vertical_edge_done = 0;
     av_unused int dir;
-    int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
 
     if (FRAME_MBAFF
             // and current and left pair do not have the same interlaced type
@@ -617,11 +589,11 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
         } else {
             static const uint8_t offset[2][2][8]={
                 {
-                    {3+4*0, 3+4*0, 3+4*0, 3+4*0, 3+4*1, 3+4*1, 3+4*1, 3+4*1},
-                    {3+4*2, 3+4*2, 3+4*2, 3+4*2, 3+4*3, 3+4*3, 3+4*3, 3+4*3},
+                    {7+8*0, 7+8*0, 7+8*0, 7+8*0, 7+8*1, 7+8*1, 7+8*1, 7+8*1},
+                    {7+8*2, 7+8*2, 7+8*2, 7+8*2, 7+8*3, 7+8*3, 7+8*3, 7+8*3},
                 },{
-                    {3+4*0, 3+4*1, 3+4*2, 3+4*3, 3+4*0, 3+4*1, 3+4*2, 3+4*3},
-                    {3+4*0, 3+4*1, 3+4*2, 3+4*3, 3+4*0, 3+4*1, 3+4*2, 3+4*3},
+                    {7+8*0, 7+8*1, 7+8*2, 7+8*3, 7+8*0, 7+8*1, 7+8*2, 7+8*3},
+                    {7+8*0, 7+8*1, 7+8*2, 7+8*3, 7+8*0, 7+8*1, 7+8*2, 7+8*3},
                 }
             };
             const uint8_t *off= offset[MB_FIELD][mb_y&1];
@@ -662,29 +634,25 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
         if(MB_FIELD){
             filter_mb_mbaff_edgev ( h, img_y                ,   linesize, bS  , 1, qp [0] );
             filter_mb_mbaff_edgev ( h, img_y  + 8*  linesize,   linesize, bS+4, 1, qp [1] );
-            if (chroma){
-                filter_mb_mbaff_edgecv( h, img_cb,                uvlinesize, bS  , 1, bqp[0] );
-                filter_mb_mbaff_edgecv( h, img_cb + 4*uvlinesize, uvlinesize, bS+4, 1, bqp[1] );
-                filter_mb_mbaff_edgecv( h, img_cr,                uvlinesize, bS  , 1, rqp[0] );
-                filter_mb_mbaff_edgecv( h, img_cr + 4*uvlinesize, uvlinesize, bS+4, 1, rqp[1] );
-            }
+            filter_mb_mbaff_edgecv( h, img_cb,                uvlinesize, bS  , 1, bqp[0] );
+            filter_mb_mbaff_edgecv( h, img_cb + 4*uvlinesize, uvlinesize, bS+4, 1, bqp[1] );
+            filter_mb_mbaff_edgecv( h, img_cr,                uvlinesize, bS  , 1, rqp[0] );
+            filter_mb_mbaff_edgecv( h, img_cr + 4*uvlinesize, uvlinesize, bS+4, 1, rqp[1] );
         }else{
             filter_mb_mbaff_edgev ( h, img_y              , 2*  linesize, bS  , 2, qp [0] );
             filter_mb_mbaff_edgev ( h, img_y  +   linesize, 2*  linesize, bS+1, 2, qp [1] );
-            if (chroma){
-                filter_mb_mbaff_edgecv( h, img_cb,              2*uvlinesize, bS  , 2, bqp[0] );
-                filter_mb_mbaff_edgecv( h, img_cb + uvlinesize, 2*uvlinesize, bS+1, 2, bqp[1] );
-                filter_mb_mbaff_edgecv( h, img_cr,              2*uvlinesize, bS  , 2, rqp[0] );
-                filter_mb_mbaff_edgecv( h, img_cr + uvlinesize, 2*uvlinesize, bS+1, 2, rqp[1] );
-            }
+            filter_mb_mbaff_edgecv( h, img_cb,              2*uvlinesize, bS  , 2, bqp[0] );
+            filter_mb_mbaff_edgecv( h, img_cb + uvlinesize, 2*uvlinesize, bS+1, 2, bqp[1] );
+            filter_mb_mbaff_edgecv( h, img_cr,              2*uvlinesize, bS  , 2, rqp[0] );
+            filter_mb_mbaff_edgecv( h, img_cr + uvlinesize, 2*uvlinesize, bS+1, 2, rqp[1] );
         }
     }
 
 #if CONFIG_SMALL
     for( dir = 0; dir < 2; dir++ )
-        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, chroma, CHROMA444, dir);
+        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
 #else
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, chroma, CHROMA444, 0);
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, chroma, CHROMA444, 1);
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
 #endif
 }
diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index 9c41e4ca73..a98f14aaf6 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -269,7 +269,7 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
-        fallback_sps ? sps->scaling_matrix8[3] : default_scaling8[1]
+        fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
     };
     if(get_bits1(&s->gb)){
         sps->scaling_matrix_present |= is_sps;
@@ -281,15 +281,7 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
         if(is_sps || pps->transform_8x8_mode){
             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
-            if(h->sps.chroma_format_idc == 3){
-                decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[0],scaling_matrix8[0]);  // Intra, Cr
-                decode_scaling_list(h,scaling_matrix8[2],64,default_scaling8[0],scaling_matrix8[1]);  // Intra, Cb
-            }
-            decode_scaling_list(h,scaling_matrix8[3],64,default_scaling8[1],fallback[3]);  // Inter, Y
-            if(h->sps.chroma_format_idc == 3){
-                decode_scaling_list(h,scaling_matrix8[4],64,default_scaling8[1],scaling_matrix8[3]);  // Inter, Cr
-                decode_scaling_list(h,scaling_matrix8[5],64,default_scaling8[1],scaling_matrix8[4]);  // Inter, Cb
-            }
+            decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
         }
     }
 }
@@ -403,7 +395,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
         if(sps->crop_left || sps->crop_top){
             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
         }
-        if(sps->crop_right >= (8<<CHROMA444) || sps->crop_bottom >= (8<<CHROMA444)){
+        if(sps->crop_right >= 8 || sps->crop_bottom >= 8){
             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
         }
     }else{
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 6972725781..864c118bb5 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -66,10 +66,10 @@ typedef struct H264DSPContext{
     void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
     void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
 
-    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
-    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
-    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
-    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
+    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
     void (*h264_chroma_dc_dequant_idct)(DCTELEM *block, int qmul);
 }H264DSPContext;
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index e7f9af7fb0..39c9a1c9eb 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -30,19 +30,15 @@
 #ifndef AVCODEC_H264IDCT_INTERNAL_H
 #define AVCODEC_H264IDCT_INTERNAL_H
 //FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
-static const uint8_t scan8[16*3]={
- 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
- 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
- 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
- 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
- 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
- 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
- 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
- 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
- 4+11*8, 5+11*8, 4+12*8, 5+12*8,
- 6+11*8, 7+11*8, 6+12*8, 7+12*8,
- 4+13*8, 5+13*8, 4+14*8, 5+14*8,
- 6+13*8, 7+13*8, 6+14*8, 7+14*8
+static const uint8_t scan8[16 + 2*4]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
 };
 #endif
 
@@ -194,7 +190,7 @@ void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, DCTELEM *block, int stride){
     }
 }
 
-void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
+void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
     int i;
     for(i=0; i<16; i++){
         int nnz = nnzc[ scan8[i] ];
@@ -205,7 +201,7 @@ void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *b
     }
 }
 
-void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
+void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
     int i;
     for(i=0; i<16; i++){
         if(nnzc[ scan8[i] ])             FUNCC(idct_internal      )(dst + block_offset[i], block + i*16*sizeof(pixel), stride, 4, 6, 1);
@@ -213,7 +209,7 @@ void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTEL
     }
 }
 
-void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
+void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
     int i;
     for(i=0; i<16; i+=4){
         int nnz = nnzc[ scan8[i] ];
@@ -224,15 +220,13 @@ void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *b
     }
 }
 
-void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
-    int i, j;
-    for(j=1; j<3; j++){
-        for(i=j*16; i<j*16+4; i++){
-            if(nnzc[ scan8[i] ])
-                FUNCC(ff_h264_idct_add   )(dest[j-1] + block_offset[i], block + i*16*sizeof(pixel), stride);
-            else if(((dctcoef*)block)[i*16])
-                FUNCC(ff_h264_idct_dc_add)(dest[j-1] + block_offset[i], block + i*16*sizeof(pixel), stride);
-        }
+void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=16; i<16+8; i++){
+        if(nnzc[ scan8[i] ])
+            FUNCC(ff_h264_idct_add   )(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride);
+        else if(((dctcoef*)block)[i*16])
+            FUNCC(ff_h264_idct_dc_add)(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride);
     }
 }
 /**
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index 4978d28b49..6a45da8761 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -1185,17 +1185,15 @@ void MPV_frame_end(MpegEncContext *s)
        && s->current_picture.reference
        && !s->intra_only
        && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
-            int hshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_w;
-            int vshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_h;
             s->dsp.draw_edges(s->current_picture.data[0], s->linesize  ,
-                              s->h_edge_pos             , s->v_edge_pos,
-                              EDGE_WIDTH        , EDGE_WIDTH        , EDGE_TOP | EDGE_BOTTOM);
+                              s->h_edge_pos   , s->v_edge_pos   ,
+                              EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
             s->dsp.draw_edges(s->current_picture.data[1], s->uvlinesize,
-                              s->h_edge_pos>>hshift, s->v_edge_pos>>vshift,
-                              EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, EDGE_TOP | EDGE_BOTTOM);
+                              s->h_edge_pos>>1, s->v_edge_pos>>1,
+                              EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
             s->dsp.draw_edges(s->current_picture.data[2], s->uvlinesize,
-                              s->h_edge_pos>>hshift, s->v_edge_pos>>vshift,
-                              EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, EDGE_TOP | EDGE_BOTTOM);
+                              s->h_edge_pos>>1, s->v_edge_pos>>1,
+                              EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
     }
 
     emms_c();
@@ -2286,19 +2284,14 @@ void ff_draw_horiz_band(MpegEncContext *s, int y, int h){
        && !s->intra_only
        && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
         int sides = 0, edge_h;
-        int hshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_w;
-        int vshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_h;
         if (y==0) sides |= EDGE_TOP;
         if (y + h >= s->v_edge_pos) sides |= EDGE_BOTTOM;
 
         edge_h= FFMIN(h, s->v_edge_pos - y);
 
-        s->dsp.draw_edges(s->current_picture_ptr->data[0] +  y         *s->linesize  , s->linesize,
-                          s->h_edge_pos        , edge_h        , EDGE_WIDTH        , EDGE_WIDTH        , sides);
-        s->dsp.draw_edges(s->current_picture_ptr->data[1] + (y>>vshift)*s->uvlinesize, s->uvlinesize,
-                          s->h_edge_pos>>hshift, edge_h>>hshift, EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, sides);
-        s->dsp.draw_edges(s->current_picture_ptr->data[2] + (y>>vshift)*s->uvlinesize, s->uvlinesize,
-                          s->h_edge_pos>>hshift, edge_h>>hshift, EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, sides);
+        s->dsp.draw_edges(s->current_picture_ptr->data[0] +  y    *s->linesize  , s->linesize  , s->h_edge_pos   , edge_h   , EDGE_WIDTH  , sides);
+        s->dsp.draw_edges(s->current_picture_ptr->data[1] + (y>>1)*s->uvlinesize, s->uvlinesize, s->h_edge_pos>>1, edge_h>>1, EDGE_WIDTH/2, sides);
+        s->dsp.draw_edges(s->current_picture_ptr->data[2] + (y>>1)*s->uvlinesize, s->uvlinesize, s->h_edge_pos>>1, edge_h>>1, EDGE_WIDTH/2, sides);
     }
 
     h= FFMIN(h, s->avctx->height - y);
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
index 28f04f119b..6db0b290ba 100644
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@@ -1978,13 +1978,13 @@ static int frame_start(SnowContext *s){
     if(s->current_picture.data[0]){
         s->dsp.draw_edges(s->current_picture.data[0],
                           s->current_picture.linesize[0], w   , h   ,
-                          EDGE_WIDTH  , EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
+                          EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
         s->dsp.draw_edges(s->current_picture.data[1],
                           s->current_picture.linesize[1], w>>1, h>>1,
-                          EDGE_WIDTH/2, EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
+                          EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
         s->dsp.draw_edges(s->current_picture.data[2],
                           s->current_picture.linesize[2], w>>1, h>>1,
-                          EDGE_WIDTH/2, EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
+                          EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
     }
 
     release_buffer(s->avctx);
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index 23ab209312..7cde5e5552 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -633,9 +633,8 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
         memset(h->intra4x4_pred_mode+h->mb2br_xy[mb_xy], DC_PRED, 8);
     }
     if (!IS_SKIP(mb_type) || s->pict_type == AV_PICTURE_TYPE_B) {
-        memset(h->non_zero_count_cache + 8, 0, 14*8*sizeof(uint8_t));
-        s->dsp.clear_blocks(h->mb+  0);
-        s->dsp.clear_blocks(h->mb+384);
+        memset(h->non_zero_count_cache + 8, 0, 4*9*sizeof(uint8_t));
+        s->dsp.clear_blocks(h->mb);
     }
 
     if (!IS_INTRA16x16(mb_type) && (!IS_SKIP(mb_type) || s->pict_type == AV_PICTURE_TYPE_B)) {
@@ -655,8 +654,8 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
         }
     }
     if (IS_INTRA16x16(mb_type)) {
-        AV_ZERO128(h->mb_luma_dc[0]+0);
-        AV_ZERO128(h->mb_luma_dc[0]+8);
+        AV_ZERO128(h->mb_luma_dc+0);
+        AV_ZERO128(h->mb_luma_dc+8);
         if (svq3_decode_block(&s->gb, h->mb_luma_dc, 0, 1)){
             av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding intra luma dc\n");
             return -1;
@@ -682,23 +681,20 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
         }
 
         if ((cbp & 0x30)) {
-            for (i = 1; i < 3; ++i) {
-              if (svq3_decode_block(&s->gb, &h->mb[16*16*i], 0, 3)){
+            for (i = 0; i < 2; ++i) {
+              if (svq3_decode_block(&s->gb, &h->mb[16*(16 + 4*i)], 0, 3)){
                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n");
                 return -1;
               }
             }
 
             if ((cbp & 0x20)) {
-                for (i = 1; i < 3; i++) {
-                    for (j = 0; j < 4; j++) {
-                        k = 16*i + j;
-                        h->non_zero_count_cache[ scan8[k] ] = 1;
+                for (i = 0; i < 8; i++) {
+                    h->non_zero_count_cache[ scan8[16+i] ] = 1;
 
-                        if (svq3_decode_block(&s->gb, &h->mb[16*k], 1, 1)){
-                            av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
-                            return -1;
-                        }
+                    if (svq3_decode_block(&s->gb, &h->mb[16*(16 + i)], 1, 1)){
+                        av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
+                        return -1;
                     }
                 }
             }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 214c6a3945..1cc6991666 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -784,7 +784,7 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
 
 /* draw the edges of width 'w' of an image of size width, height
    this mmx version can only handle w==8 || w==16 */
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
 {
     uint8_t *ptr, *last_line;
     int i;
@@ -839,7 +839,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w,
 
     /* top and bottom (and hopefully also the corners) */
     if (sides&EDGE_TOP) {
-        for(i = 0; i < h; i += 4) {
+        for(i = 0; i < w; i += 4) {
             ptr= buf - (i + 1) * wrap - w;
             __asm__ volatile(
                     "1:                             \n\t"
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index b5f77c90d5..c850dc2ef3 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -36,7 +36,7 @@
 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
 static int decode_significance_x86(CABACContext *c, int max_coeff,
                                    uint8_t *significant_coeff_ctx_base,
-                                   int *index, int last_off){
+                                   int *index){
     void *end= significant_coeff_ctx_base + max_coeff - 1;
     int minusstart= -(int)significant_coeff_ctx_base;
     int minusindex= 4-(int)index;
@@ -52,12 +52,10 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
 
         "test $1, %%edx                         \n\t"
         " jz 3f                                 \n\t"
-        "add  %7, %1                            \n\t"
 
-        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx",
+        BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx",
                              "%%bx", "%%esi", "%%eax", "%%al")
 
-        "sub  %7, %1                            \n\t"
         "mov  %2, %%"REG_a"                     \n\t"
         "movl %4, %%ecx                         \n\t"
         "add  %1, %%"REG_c"                     \n\t"
@@ -84,7 +82,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
         "movl %%esi, "RANGE    "(%3)            \n\t"
         "movl %%ebx, "LOW      "(%3)            \n\t"
         :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)
-        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off)
+        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)
         : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
     );
     return coeff_count;
@@ -92,7 +90,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
 
 static int decode_significance_8x8_x86(CABACContext *c,
                                        uint8_t *significant_coeff_ctx_base,
-                                       int *index, int last_off, const uint8_t *sig_off){
+                                       int *index, const uint8_t *sig_off){
     int minusindex= 4-(int)index;
     int coeff_count;
     x86_reg last=0;
@@ -116,9 +114,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
 
         "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
         "add %5, %%"REG_D"                      \n\t"
-        "add %7, %%"REG_D"                      \n\t"
 
-        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx",
+        BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx",
                              "%%bx", "%%esi", "%%eax", "%%al")
 
         "mov %2, %%"REG_a"                      \n\t"
@@ -145,7 +142,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "movl %%esi, "RANGE    "(%3)            \n\t"
         "movl %%ebx, "LOW      "(%3)            \n\t"
         :"=&a"(coeff_count),"+m"(last), "+m"(index)
-        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_off)
+        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)
         : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"
     );
     return coeff_count;
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 4788da98e0..f90f41c4bc 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -32,18 +32,14 @@
 SECTION_RODATA
 
 ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
-scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
-           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
-           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
-           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
-           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
-           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
-           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
-           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
-           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
-           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
-           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
-           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
+scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
+           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
+           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
+           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
+           db 1+1*8, 2+1*8
+           db 1+2*8, 2+2*8
+           db 1+4*8, 2+4*8
+           db 1+5*8, 2+5*8
 %ifdef PIC
 %define scan8 r11
 %else
@@ -621,8 +617,6 @@ cglobal h264_idct_add8_8_mmx, 5, 7, 0
     mov         r10, r0
 %endif
     call         h264_idct_add8_mmx_plane
-    mov          r5, 32
-    add          r2, 384
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
@@ -684,8 +678,6 @@ cglobal h264_idct_add8_8_mmx2, 5, 7, 0
     lea         r11, [scan8_mem]
 %endif
     call h264_idct_add8_mmx2_plane
-    mov          r5, 32
-    add          r2, 384
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
@@ -818,12 +810,12 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
     test        r0, r0
     jz .try%1dc
 %ifdef ARCH_X86_64
-    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+    mov        r0d, dword [r1+%1*8+64]
     add         r0, [r10]
 %else
     mov         r0, r0m
     mov         r0, [r0]
-    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+    add         r0, dword [r1+%1*8+64]
 %endif
     call        x264_add8x4_idct_sse2
     jmp .cycle%1end
@@ -832,18 +824,16 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
     or         r0w, word [r2+32]
     jz .cycle%1end
 %ifdef ARCH_X86_64
-    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+    mov        r0d, dword [r1+%1*8+64]
     add         r0, [r10]
 %else
     mov         r0, r0m
     mov         r0, [r0]
-    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+    add         r0, dword [r1+%1*8+64]
 %endif
     call        h264_idct_dc_add8_mmx2
 .cycle%1end
-%if %1 == 1
-    add         r2, 384+64
-%elif %1 < 3
+%if %1 < 3
     add         r2, 64
 %endif
 %endmacro
@@ -855,15 +845,15 @@ cglobal h264_idct_add8_8_sse2, 5, 7, 8
 %ifdef ARCH_X86_64
     mov         r10, r0
 %endif
-    add8_sse2_cycle 0, 0x34
-    add8_sse2_cycle 1, 0x3c
+    add8_sse2_cycle 0, 0x09
+    add8_sse2_cycle 1, 0x11
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
     add        r0mp, gprsize
 %endif
-    add8_sse2_cycle 2, 0x5c
-    add8_sse2_cycle 3, 0x64
+    add8_sse2_cycle 2, 0x21
+    add8_sse2_cycle 3, 0x29
     RET
 
 ;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 54636a95d0..3f7cf4cefc 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -29,18 +29,14 @@ SECTION_RODATA
 
 pw_pixel_max: times 8 dw ((1 << 10)-1)
 pd_32:        times 4 dd 32
-scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
-           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
-           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
-           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
-           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
-           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
-           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
-           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
-           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
-           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
-           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
-           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
+scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
+           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
+           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
+           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
+           db 1+1*8, 2+1*8
+           db 1+2*8, 2+2*8
+           db 1+4*8, 2+4*8
+           db 1+5*8, 2+5*8
 
 %ifdef PIC
 %define scan8 r11
@@ -310,7 +306,7 @@ INIT_AVX
 IDCT_ADD16INTRA_10 avx
 %endif
 
-%assign last_block 36
+%assign last_block 24
 ;-----------------------------------------------------------------------------
 ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
 ;-----------------------------------------------------------------------------
@@ -321,22 +317,21 @@ cglobal h264_idct_add8_10_%1,5,7
 %endif
     add      r2, 1024
     mov      r0, [r0]
-    ADD16_OP_INTRA %1, 16, 4+ 6*8
-    ADD16_OP_INTRA %1, 18, 4+ 7*8
-    add      r2, 1024-128*2
+    ADD16_OP_INTRA %1, 16, 1+1*8
+    ADD16_OP_INTRA %1, 18, 1+2*8
 %ifdef ARCH_X86_64
     mov      r0, [r10+gprsize]
 %else
     mov      r0, r0m
     mov      r0, [r0+gprsize]
 %endif
-    ADD16_OP_INTRA %1, 32, 4+11*8
-    ADD16_OP_INTRA %1, 34, 4+12*8
+    ADD16_OP_INTRA %1, 20, 1+4*8
+    ADD16_OP_INTRA %1, 22, 1+5*8
     REP_RET
     AC %1, 16
     AC %1, 18
-    AC %1, 32
-    AC %1, 34
+    AC %1, 20
+    AC %1, 22
 
 %endmacro ; IDCT_ADD8
 

From 36151b3e3112cd7d8ae0e02e850dee16bd966696 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 10 Jun 2011 11:45:03 -0400
Subject: [PATCH 20/24] ac3enc: use function pointer to choose between AC-3 and
 E-AC-3 header output functions.

---
 libavcodec/ac3enc.c | 11 +++++++----
 libavcodec/ac3enc.h |  2 ++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index e8ccde514a..9403bf6443 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -1987,10 +1987,7 @@ static void output_frame(AC3EncodeContext *s, unsigned char *frame)
 
     init_put_bits(&s->pb, frame, AC3_MAX_CODED_FRAME_SIZE);
 
-    if (CONFIG_EAC3_ENCODER && s->eac3)
-        ff_eac3_output_frame_header(s);
-    else
-        ac3_output_frame_header(s);
+    s->output_frame_header(s);
 
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
         output_audio_block(s, blk);
@@ -2732,6 +2729,12 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
         s->crc_inv[1] = pow_poly((CRC16_POLY >> 1), (8 * frame_size_58) - 16, CRC16_POLY);
     }
 
+    /* set function pointers */
+    if (CONFIG_EAC3_ENCODER && s->eac3)
+        s->output_frame_header = ff_eac3_output_frame_header;
+    else
+        s->output_frame_header = ac3_output_frame_header;
+
     set_bandwidth(s);
 
     exponent_init(s);
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index 0541683537..34ca2e449f 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -209,6 +209,8 @@ typedef struct AC3EncodeContext {
     int ref_bap_set;                                         ///< indicates if ref_bap pointers have been set
 
     DECLARE_ALIGNED(32, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
+
+    void (*output_frame_header)(struct AC3EncodeContext *s);
 } AC3EncodeContext;
 
 #endif /* AVCODEC_AC3ENC_H */

From e754dfc0bba4f81fe797f240fca94fea5dfd925e Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 10 Jun 2011 12:42:36 -0400
Subject: [PATCH 21/24] ac3enc: dynamically allocate AC3EncodeContext fields
 windowed_samples and mdct

This will allow the same struct to be used for both the fixed and float ac3
encoders.
---
 libavcodec/ac3enc.c | 15 ++++++++++-----
 libavcodec/ac3enc.h |  5 ++---
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 9403bf6443..e71afe62ee 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -273,12 +273,12 @@ static void apply_mdct(AC3EncodeContext *s)
             AC3Block *block = &s->blocks[blk];
             const SampleType *input_samples = &s->planar_samples[ch][blk * AC3_BLOCK_SIZE];
 
-            apply_window(&s->dsp, s->windowed_samples, input_samples, s->mdct.window, AC3_WINDOW_SIZE);
+            apply_window(&s->dsp, s->windowed_samples, input_samples, s->mdct->window, AC3_WINDOW_SIZE);
 
             block->coeff_shift[ch+1] = normalize_samples(s);
 
-            s->mdct.fft.mdct_calcw(&s->mdct.fft, block->mdct_coef[ch+1],
-                                   s->windowed_samples);
+            s->mdct->fft.mdct_calcw(&s->mdct->fft, block->mdct_coef[ch+1],
+                                    s->windowed_samples);
         }
     }
 }
@@ -2318,6 +2318,7 @@ static av_cold int ac3_encode_close(AVCodecContext *avctx)
     int blk, ch;
     AC3EncodeContext *s = avctx->priv_data;
 
+    av_freep(&s->windowed_samples);
     for (ch = 0; ch < s->channels; ch++)
         av_freep(&s->planar_samples[ch]);
     av_freep(&s->planar_samples);
@@ -2343,7 +2344,8 @@ static av_cold int ac3_encode_close(AVCodecContext *avctx)
         av_freep(&block->qmant);
     }
 
-    mdct_end(&s->mdct);
+    mdct_end(s->mdct);
+    av_freep(&s->mdct);
 
     av_freep(&avctx->coded_frame);
     return 0;
@@ -2598,6 +2600,8 @@ static av_cold int allocate_buffers(AVCodecContext *avctx)
     AC3EncodeContext *s = avctx->priv_data;
     int channels = s->channels + 1; /* includes coupling channel */
 
+    FF_ALLOC_OR_GOTO(avctx, s->windowed_samples, AC3_WINDOW_SIZE *
+                     sizeof(*s->windowed_samples), alloc_fail);
     FF_ALLOC_OR_GOTO(avctx, s->planar_samples, s->channels * sizeof(*s->planar_samples),
                      alloc_fail);
     for (ch = 0; ch < s->channels; ch++) {
@@ -2741,7 +2745,8 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
 
     bit_alloc_init(s);
 
-    ret = mdct_init(avctx, &s->mdct, 9);
+    FF_ALLOCZ_OR_GOTO(avctx, s->mdct, sizeof(AC3MDCTContext), init_fail);
+    ret = mdct_init(avctx, s->mdct, 9);
     if (ret)
         goto init_fail;
 
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index 34ca2e449f..ccdb963a7c 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -128,7 +128,7 @@ typedef struct AC3EncodeContext {
     PutBitContext pb;                       ///< bitstream writer context
     DSPContext dsp;
     AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
-    AC3MDCTContext mdct;                    ///< MDCT context
+    AC3MDCTContext *mdct;                   ///< MDCT context
 
     AC3Block blocks[AC3_MAX_BLOCKS];        ///< per-block info
 
@@ -189,6 +189,7 @@ typedef struct AC3EncodeContext {
     int frame_bits;                         ///< all frame bits except exponents and mantissas
     int exponent_bits;                      ///< number of bits used for exponents
 
+    SampleType *windowed_samples;
     SampleType **planar_samples;
     uint8_t *bap_buffer;
     uint8_t *bap1_buffer;
@@ -208,8 +209,6 @@ typedef struct AC3EncodeContext {
     uint8_t *ref_bap     [AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< bit allocation pointers (bap)
     int ref_bap_set;                                         ///< indicates if ref_bap pointers have been set
 
-    DECLARE_ALIGNED(32, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
-
     void (*output_frame_header)(struct AC3EncodeContext *s);
 } AC3EncodeContext;
 

From e0cc66df61664bb6f9271d9aae3c778e1f906b4c Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 10 Jun 2011 14:57:19 -0400
Subject: [PATCH 22/24] ac3enc: split templated float vs. fixed functions into
 a separate file.

Function pointers are used for templated functions instead of needlessly
duplicating many functions.
---
 libavcodec/Makefile               |   9 +-
 libavcodec/ac3enc.c               | 438 +++---------------------------
 libavcodec/ac3enc.h               |  79 ++++++
 libavcodec/ac3enc_fixed.c         |  40 ++-
 libavcodec/ac3enc_float.c         |  52 ++--
 libavcodec/ac3enc_opts_template.c |   3 +
 libavcodec/ac3enc_template.c      | 377 +++++++++++++++++++++++++
 libavcodec/eac3enc.c              |  24 ++
 8 files changed, 577 insertions(+), 445 deletions(-)
 create mode 100644 libavcodec/ac3enc_template.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 581d6bf399..0cfa08c1c3 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -60,8 +60,9 @@ OBJS-$(CONFIG_AAC_ENCODER)             += aacenc.o aaccoder.o    \
                                           mpeg4audio.o kbdwin.o
 OBJS-$(CONFIG_AASC_DECODER)            += aasc.o msrledec.o
 OBJS-$(CONFIG_AC3_DECODER)             += ac3dec.o ac3dec_data.o ac3.o kbdwin.o
-OBJS-$(CONFIG_AC3_ENCODER)             += ac3enc_float.o ac3tab.o ac3.o kbdwin.o
-OBJS-$(CONFIG_AC3_FIXED_ENCODER)       += ac3enc_fixed.o ac3tab.o ac3.o
+OBJS-$(CONFIG_AC3_ENCODER)             += ac3enc_float.o ac3enc.o ac3tab.o \
+                                          ac3.o kbdwin.o
+OBJS-$(CONFIG_AC3_FIXED_ENCODER)       += ac3enc_fixed.o ac3enc.o ac3tab.o ac3.o
 OBJS-$(CONFIG_ALAC_DECODER)            += alac.o
 OBJS-$(CONFIG_ALAC_ENCODER)            += alacenc.o
 OBJS-$(CONFIG_ALS_DECODER)             += alsdec.o bgmc.o mpeg4audio.o
@@ -124,8 +125,8 @@ OBJS-$(CONFIG_DVVIDEO_DECODER)         += dv.o dvdata.o
 OBJS-$(CONFIG_DVVIDEO_ENCODER)         += dv.o dvdata.o
 OBJS-$(CONFIG_DXA_DECODER)             += dxa.o
 OBJS-$(CONFIG_EAC3_DECODER)            += eac3dec.o eac3dec_data.o
-OBJS-$(CONFIG_EAC3_ENCODER)            += eac3enc.o ac3enc_float.o ac3tab.o \
-                                          ac3.o kbdwin.o
+OBJS-$(CONFIG_EAC3_ENCODER)            += eac3enc.o ac3enc.o ac3enc_float.o \
+                                          ac3tab.o ac3.o kbdwin.o
 OBJS-$(CONFIG_EACMV_DECODER)           += eacmv.o
 OBJS-$(CONFIG_EAMAD_DECODER)           += eamad.o eaidct.o mpeg12.o \
                                           mpeg12data.o mpegvideo.o  \
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index e71afe62ee..1147ed142e 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -67,46 +67,6 @@ static const float extmixlev_options[EXTMIXLEV_NUM_OPTIONS] = {
 };
 
 
-#define OFFSET(param) offsetof(AC3EncodeContext, options.param)
-#define AC3ENC_PARAM (AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
-
-#define AC3ENC_TYPE_AC3_FIXED   0
-#define AC3ENC_TYPE_AC3         1
-#define AC3ENC_TYPE_EAC3        2
-
-#if CONFIG_AC3ENC_FLOAT
-#define AC3ENC_TYPE AC3ENC_TYPE_AC3
-#include "ac3enc_opts_template.c"
-static AVClass ac3enc_class = { "AC-3 Encoder", av_default_item_name,
-                                ac3_options, LIBAVUTIL_VERSION_INT };
-#undef AC3ENC_TYPE
-#define AC3ENC_TYPE AC3ENC_TYPE_EAC3
-#include "ac3enc_opts_template.c"
-static AVClass eac3enc_class = { "E-AC-3 Encoder", av_default_item_name,
-                                 eac3_options, LIBAVUTIL_VERSION_INT };
-#else
-#define AC3ENC_TYPE AC3ENC_TYPE_AC3_FIXED
-#include "ac3enc_opts_template.c"
-static AVClass ac3enc_class = { "Fixed-Point AC-3 Encoder", av_default_item_name,
-                                ac3fixed_options, LIBAVUTIL_VERSION_INT };
-#endif
-
-
-/* prototypes for functions in ac3enc_fixed.c and ac3enc_float.c */
-
-static av_cold void mdct_end(AC3MDCTContext *mdct);
-
-static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
-                             int nbits);
-
-static void apply_window(DSPContext *dsp, SampleType *output, const SampleType *input,
-                         const SampleType *window, unsigned int len);
-
-static int normalize_samples(AC3EncodeContext *s);
-
-static void scale_coefficients(AC3EncodeContext *s);
-
-
 /**
  * LUT for number of exponent groups.
  * exponent_group_tab[coupling][exponent strategy-1][number of coefficients]
@@ -117,7 +77,7 @@ static uint8_t exponent_group_tab[2][3][256];
 /**
  * List of supported channel layouts.
  */
-static const int64_t ac3_channel_layouts[] = {
+const int64_t ff_ac3_channel_layouts[19] = {
      AV_CH_LAYOUT_MONO,
      AV_CH_LAYOUT_STEREO,
      AV_CH_LAYOUT_2_1,
@@ -230,60 +190,6 @@ static void adjust_frame_size(AC3EncodeContext *s)
 }
 
 
-/**
- * Deinterleave input samples.
- * Channels are reordered from Libav's default order to AC-3 order.
- */
-static void deinterleave_input_samples(AC3EncodeContext *s,
-                                       const SampleType *samples)
-{
-    int ch, i;
-
-    /* deinterleave and remap input samples */
-    for (ch = 0; ch < s->channels; ch++) {
-        const SampleType *sptr;
-        int sinc;
-
-        /* copy last 256 samples of previous frame to the start of the current frame */
-        memcpy(&s->planar_samples[ch][0], &s->planar_samples[ch][AC3_FRAME_SIZE],
-               AC3_BLOCK_SIZE * sizeof(s->planar_samples[0][0]));
-
-        /* deinterleave */
-        sinc = s->channels;
-        sptr = samples + s->channel_map[ch];
-        for (i = AC3_BLOCK_SIZE; i < AC3_FRAME_SIZE+AC3_BLOCK_SIZE; i++) {
-            s->planar_samples[ch][i] = *sptr;
-            sptr += sinc;
-        }
-    }
-}
-
-
-/**
- * Apply the MDCT to input samples to generate frequency coefficients.
- * This applies the KBD window and normalizes the input to reduce precision
- * loss due to fixed-point calculations.
- */
-static void apply_mdct(AC3EncodeContext *s)
-{
-    int blk, ch;
-
-    for (ch = 0; ch < s->channels; ch++) {
-        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
-            AC3Block *block = &s->blocks[blk];
-            const SampleType *input_samples = &s->planar_samples[ch][blk * AC3_BLOCK_SIZE];
-
-            apply_window(&s->dsp, s->windowed_samples, input_samples, s->mdct->window, AC3_WINDOW_SIZE);
-
-            block->coeff_shift[ch+1] = normalize_samples(s);
-
-            s->mdct->fft.mdct_calcw(&s->mdct->fft, block->mdct_coef[ch+1],
-                                    s->windowed_samples);
-        }
-    }
-}
-
-
 static void compute_coupling_strategy(AC3EncodeContext *s)
 {
     int blk, ch;
@@ -345,296 +251,6 @@ static void compute_coupling_strategy(AC3EncodeContext *s)
 }
 
 
-/**
- * Calculate a single coupling coordinate.
- */
-static inline float calc_cpl_coord(float energy_ch, float energy_cpl)
-{
-    float coord = 0.125;
-    if (energy_cpl > 0)
-        coord *= sqrtf(energy_ch / energy_cpl);
-    return coord;
-}
-
-
-/**
- * Calculate coupling channel and coupling coordinates.
- * TODO: Currently this is only used for the floating-point encoder. I was
- *       able to make it work for the fixed-point encoder, but quality was
- *       generally lower in most cases than not using coupling. If a more
- *       adaptive coupling strategy were to be implemented it might be useful
- *       at that time to use coupling for the fixed-point encoder as well.
- */
-static void apply_channel_coupling(AC3EncodeContext *s)
-{
-#if CONFIG_AC3ENC_FLOAT
-    LOCAL_ALIGNED_16(float,   cpl_coords,       [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
-    LOCAL_ALIGNED_16(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
-    int blk, ch, bnd, i, j;
-    CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}};
-    int num_cpl_coefs = s->num_cpl_subbands * 12;
-
-    memset(cpl_coords,       0, AC3_MAX_BLOCKS * sizeof(*cpl_coords));
-    memset(fixed_cpl_coords, 0, AC3_MAX_BLOCKS * sizeof(*fixed_cpl_coords));
-
-    /* calculate coupling channel from fbw channels */
-    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
-        AC3Block *block = &s->blocks[blk];
-        CoefType *cpl_coef = &block->mdct_coef[CPL_CH][s->start_freq[CPL_CH]];
-        if (!block->cpl_in_use)
-            continue;
-        memset(cpl_coef-1, 0, (num_cpl_coefs+4) * sizeof(*cpl_coef));
-        for (ch = 1; ch <= s->fbw_channels; ch++) {
-            CoefType *ch_coef = &block->mdct_coef[ch][s->start_freq[CPL_CH]];
-            if (!block->channel_in_cpl[ch])
-                continue;
-            for (i = 0; i < num_cpl_coefs; i++)
-                cpl_coef[i] += ch_coef[i];
-        }
-        /* note: coupling start bin % 4 will always be 1 and num_cpl_coefs
-                 will always be a multiple of 12, so we need to subtract 1 from
-                 the start and add 4 to the length when using optimized
-                 functions which require 16-byte alignment. */
-
-        /* coefficients must be clipped to +/- 1.0 in order to be encoded */
-        s->dsp.vector_clipf(cpl_coef-1, cpl_coef-1, -1.0f, 1.0f, num_cpl_coefs+4);
-
-        /* scale coupling coefficients from float to 24-bit fixed-point */
-        s->ac3dsp.float_to_fixed24(&block->fixed_coef[CPL_CH][s->start_freq[CPL_CH]-1],
-                                   cpl_coef-1, num_cpl_coefs+4);
-    }
-
-    /* calculate energy in each band in coupling channel and each fbw channel */
-    /* TODO: possibly use SIMD to speed up energy calculation */
-    bnd = 0;
-    i = s->start_freq[CPL_CH];
-    while (i < s->cpl_end_freq) {
-        int band_size = s->cpl_band_sizes[bnd];
-        for (ch = CPL_CH; ch <= s->fbw_channels; ch++) {
-            for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
-                AC3Block *block = &s->blocks[blk];
-                if (!block->cpl_in_use || (ch > CPL_CH && !block->channel_in_cpl[ch]))
-                    continue;
-                for (j = 0; j < band_size; j++) {
-                    CoefType v = block->mdct_coef[ch][i+j];
-                    MAC_COEF(energy[blk][ch][bnd], v, v);
-                }
-            }
-        }
-        i += band_size;
-        bnd++;
-    }
-
-    /* determine which blocks to send new coupling coordinates for */
-    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
-        AC3Block *block  = &s->blocks[blk];
-        AC3Block *block0 = blk ? &s->blocks[blk-1] : NULL;
-        int new_coords = 0;
-        CoefSumType coord_diff[AC3_MAX_CHANNELS] = {0,};
-
-        if (block->cpl_in_use) {
-            /* calculate coupling coordinates for all blocks and calculate the
-               average difference between coordinates in successive blocks */
-            for (ch = 1; ch <= s->fbw_channels; ch++) {
-                if (!block->channel_in_cpl[ch])
-                    continue;
-
-                for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
-                    cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy[blk][ch][bnd],
-                                                              energy[blk][CPL_CH][bnd]);
-                    if (blk > 0 && block0->cpl_in_use &&
-                        block0->channel_in_cpl[ch]) {
-                        coord_diff[ch] += fabs(cpl_coords[blk-1][ch][bnd] -
-                                               cpl_coords[blk  ][ch][bnd]);
-                    }
-                }
-                coord_diff[ch] /= s->num_cpl_bands;
-            }
-
-            /* send new coordinates if this is the first block, if previous
-             * block did not use coupling but this block does, the channels
-             * using coupling has changed from the previous block, or the
-             * coordinate difference from the last block for any channel is
-             * greater than a threshold value. */
-            if (blk == 0) {
-                new_coords = 1;
-            } else if (!block0->cpl_in_use) {
-                new_coords = 1;
-            } else {
-                for (ch = 1; ch <= s->fbw_channels; ch++) {
-                    if (block->channel_in_cpl[ch] && !block0->channel_in_cpl[ch]) {
-                        new_coords = 1;
-                        break;
-                    }
-                }
-                if (!new_coords) {
-                    for (ch = 1; ch <= s->fbw_channels; ch++) {
-                        if (block->channel_in_cpl[ch] && coord_diff[ch] > 0.04) {
-                            new_coords = 1;
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-        block->new_cpl_coords = new_coords;
-    }
-
-    /* calculate final coupling coordinates, taking into account reusing of
-       coordinates in successive blocks */
-    for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
-        blk = 0;
-        while (blk < AC3_MAX_BLOCKS) {
-            int blk1;
-            CoefSumType energy_cpl;
-            AC3Block *block  = &s->blocks[blk];
-
-            if (!block->cpl_in_use) {
-                blk++;
-                continue;
-            }
-
-            energy_cpl = energy[blk][CPL_CH][bnd];
-            blk1 = blk+1;
-            while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
-                if (s->blocks[blk1].cpl_in_use)
-                    energy_cpl += energy[blk1][CPL_CH][bnd];
-                blk1++;
-            }
-
-            for (ch = 1; ch <= s->fbw_channels; ch++) {
-                CoefType energy_ch;
-                if (!block->channel_in_cpl[ch])
-                    continue;
-                energy_ch = energy[blk][ch][bnd];
-                blk1 = blk+1;
-                while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
-                    if (s->blocks[blk1].cpl_in_use)
-                        energy_ch += energy[blk1][ch][bnd];
-                    blk1++;
-                }
-                cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy_ch, energy_cpl);
-            }
-            blk = blk1;
-        }
-    }
-
-    /* calculate exponents/mantissas for coupling coordinates */
-    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
-        AC3Block *block = &s->blocks[blk];
-        if (!block->cpl_in_use || !block->new_cpl_coords)
-            continue;
-
-        s->ac3dsp.float_to_fixed24(fixed_cpl_coords[blk][1],
-                                   cpl_coords[blk][1],
-                                   s->fbw_channels * 16);
-        s->ac3dsp.extract_exponents(block->cpl_coord_exp[1],
-                                    fixed_cpl_coords[blk][1],
-                                    s->fbw_channels * 16);
-
-        for (ch = 1; ch <= s->fbw_channels; ch++) {
-            int bnd, min_exp, max_exp, master_exp;
-
-            /* determine master exponent */
-            min_exp = max_exp = block->cpl_coord_exp[ch][0];
-            for (bnd = 1; bnd < s->num_cpl_bands; bnd++) {
-                int exp = block->cpl_coord_exp[ch][bnd];
-                min_exp = FFMIN(exp, min_exp);
-                max_exp = FFMAX(exp, max_exp);
-            }
-            master_exp = ((max_exp - 15) + 2) / 3;
-            master_exp = FFMAX(master_exp, 0);
-            while (min_exp < master_exp * 3)
-                master_exp--;
-            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
-                block->cpl_coord_exp[ch][bnd] = av_clip(block->cpl_coord_exp[ch][bnd] -
-                                                        master_exp * 3, 0, 15);
-            }
-            block->cpl_master_exp[ch] = master_exp;
-
-            /* quantize mantissas */
-            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
-                int cpl_exp  = block->cpl_coord_exp[ch][bnd];
-                int cpl_mant = (fixed_cpl_coords[blk][ch][bnd] << (5 + cpl_exp + master_exp * 3)) >> 24;
-                if (cpl_exp == 15)
-                    cpl_mant >>= 1;
-                else
-                    cpl_mant -= 16;
-
-                block->cpl_coord_mant[ch][bnd] = cpl_mant;
-            }
-        }
-    }
-
-    if (CONFIG_EAC3_ENCODER && s->eac3)
-        ff_eac3_set_cpl_states(s);
-#endif /* CONFIG_AC3ENC_FLOAT */
-}
-
-
-/**
- * Determine rematrixing flags for each block and band.
- */
-static void compute_rematrixing_strategy(AC3EncodeContext *s)
-{
-    int nb_coefs;
-    int blk, bnd, i;
-    AC3Block *block, *block0;
-
-    if (s->channel_mode != AC3_CHMODE_STEREO)
-        return;
-
-    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
-        block = &s->blocks[blk];
-        block->new_rematrixing_strategy = !blk;
-
-        if (!s->rematrixing_enabled) {
-            block0 = block;
-            continue;
-        }
-
-        block->num_rematrixing_bands = 4;
-        if (block->cpl_in_use) {
-            block->num_rematrixing_bands -= (s->start_freq[CPL_CH] <= 61);
-            block->num_rematrixing_bands -= (s->start_freq[CPL_CH] == 37);
-            if (blk && block->num_rematrixing_bands != block0->num_rematrixing_bands)
-                block->new_rematrixing_strategy = 1;
-        }
-        nb_coefs = FFMIN(block->end_freq[1], block->end_freq[2]);
-
-        for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
-            /* calculate calculate sum of squared coeffs for one band in one block */
-            int start = ff_ac3_rematrix_band_tab[bnd];
-            int end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
-            CoefSumType sum[4] = {0,};
-            for (i = start; i < end; i++) {
-                CoefType lt = block->mdct_coef[1][i];
-                CoefType rt = block->mdct_coef[2][i];
-                CoefType md = lt + rt;
-                CoefType sd = lt - rt;
-                MAC_COEF(sum[0], lt, lt);
-                MAC_COEF(sum[1], rt, rt);
-                MAC_COEF(sum[2], md, md);
-                MAC_COEF(sum[3], sd, sd);
-            }
-
-            /* compare sums to determine if rematrixing will be used for this band */
-            if (FFMIN(sum[2], sum[3]) < FFMIN(sum[0], sum[1]))
-                block->rematrixing_flags[bnd] = 1;
-            else
-                block->rematrixing_flags[bnd] = 0;
-
-            /* determine if new rematrixing flags will be sent */
-            if (blk &&
-                block->rematrixing_flags[bnd] != block0->rematrixing_flags[bnd]) {
-                block->new_rematrixing_strategy = 1;
-            }
-        }
-        block0 = block;
-    }
-}
-
-
 /**
  * Apply stereo rematrixing to coefficients based on rematrixing flags.
  */
@@ -1467,7 +1083,7 @@ static int compute_bit_allocation(AC3EncodeContext *s)
         if (s->cpl_on) {
             s->cpl_on = 0;
             compute_coupling_strategy(s);
-            compute_rematrixing_strategy(s);
+            s->compute_rematrixing_strategy(s);
             apply_rematrixing(s);
             process_exponents(s);
             ret = compute_bit_allocation(s);
@@ -2262,8 +1878,8 @@ static int validate_metadata(AVCodecContext *avctx)
 /**
  * Encode a single AC-3 frame.
  */
-static int ac3_encode_frame(AVCodecContext *avctx, unsigned char *frame,
-                            int buf_size, void *data)
+int ff_ac3_encode_frame(AVCodecContext *avctx, unsigned char *frame,
+                        int buf_size, void *data)
 {
     AC3EncodeContext *s = avctx->priv_data;
     const SampleType *samples = data;
@@ -2278,19 +1894,19 @@ static int ac3_encode_frame(AVCodecContext *avctx, unsigned char *frame,
     if (s->bit_alloc.sr_code == 1 || s->eac3)
         adjust_frame_size(s);
 
-    deinterleave_input_samples(s, samples);
+    s->deinterleave_input_samples(s, samples);
 
-    apply_mdct(s);
+    s->apply_mdct(s);
 
-    scale_coefficients(s);
+    s->scale_coefficients(s);
 
     s->cpl_on = s->cpl_enabled;
     compute_coupling_strategy(s);
 
     if (s->cpl_on)
-        apply_channel_coupling(s);
+        s->apply_channel_coupling(s);
 
-    compute_rematrixing_strategy(s);
+    s->compute_rematrixing_strategy(s);
 
     apply_rematrixing(s);
 
@@ -2313,7 +1929,7 @@ static int ac3_encode_frame(AVCodecContext *avctx, unsigned char *frame,
 /**
  * Finalize encoding and free any memory allocated by the encoder.
  */
-static av_cold int ac3_encode_close(AVCodecContext *avctx)
+av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
 {
     int blk, ch;
     AC3EncodeContext *s = avctx->priv_data;
@@ -2344,7 +1960,7 @@ static av_cold int ac3_encode_close(AVCodecContext *avctx)
         av_freep(&block->qmant);
     }
 
-    mdct_end(s->mdct);
+    s->mdct_end(s->mdct);
     av_freep(&s->mdct);
 
     av_freep(&avctx->coded_frame);
@@ -2515,8 +2131,7 @@ static av_cold int validate_options(AVCodecContext *avctx, AC3EncodeContext *s)
                              (s->channel_mode == AC3_CHMODE_STEREO);
 
     s->cpl_enabled = s->options.channel_coupling &&
-                     s->channel_mode >= AC3_CHMODE_STEREO &&
-                     CONFIG_AC3ENC_FLOAT;
+                     s->channel_mode >= AC3_CHMODE_STEREO && !s->fixed_point;
 
     return 0;
 }
@@ -2674,7 +2289,7 @@ static av_cold int allocate_buffers(AVCodecContext *avctx)
         }
     }
 
-    if (CONFIG_AC3ENC_FLOAT) {
+    if (!s->fixed_point) {
         FF_ALLOCZ_OR_GOTO(avctx, s->fixed_coef_buffer, AC3_MAX_BLOCKS * channels *
                           AC3_MAX_COEFS * sizeof(*s->fixed_coef_buffer), alloc_fail);
         for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
@@ -2703,7 +2318,7 @@ alloc_fail:
 /**
  * Initialize the encoder.
  */
-static av_cold int ac3_encode_init(AVCodecContext *avctx)
+av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
 {
     AC3EncodeContext *s = avctx->priv_data;
     int ret, frame_size_58;
@@ -2734,6 +2349,27 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
     }
 
     /* set function pointers */
+    if (CONFIG_AC3_FIXED_ENCODER && s->fixed_point) {
+        s->mdct_end                     = ff_ac3_fixed_mdct_end;
+        s->mdct_init                    = ff_ac3_fixed_mdct_init;
+        s->apply_window                 = ff_ac3_fixed_apply_window;
+        s->normalize_samples            = ff_ac3_fixed_normalize_samples;
+        s->scale_coefficients           = ff_ac3_fixed_scale_coefficients;
+        s->deinterleave_input_samples   = ff_ac3_fixed_deinterleave_input_samples;
+        s->apply_mdct                   = ff_ac3_fixed_apply_mdct;
+        s->apply_channel_coupling       = ff_ac3_fixed_apply_channel_coupling;
+        s->compute_rematrixing_strategy = ff_ac3_fixed_compute_rematrixing_strategy;
+    } else if (CONFIG_AC3_ENCODER || CONFIG_EAC3_ENCODER) {
+        s->mdct_end                     = ff_ac3_float_mdct_end;
+        s->mdct_init                    = ff_ac3_float_mdct_init;
+        s->apply_window                 = ff_ac3_float_apply_window;
+        s->normalize_samples            = ff_ac3_float_normalize_samples;
+        s->scale_coefficients           = ff_ac3_float_scale_coefficients;
+        s->deinterleave_input_samples   = ff_ac3_float_deinterleave_input_samples;
+        s->apply_mdct                   = ff_ac3_float_apply_mdct;
+        s->apply_channel_coupling       = ff_ac3_float_apply_channel_coupling;
+        s->compute_rematrixing_strategy = ff_ac3_float_compute_rematrixing_strategy;
+    }
     if (CONFIG_EAC3_ENCODER && s->eac3)
         s->output_frame_header = ff_eac3_output_frame_header;
     else
@@ -2746,7 +2382,7 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
     bit_alloc_init(s);
 
     FF_ALLOCZ_OR_GOTO(avctx, s->mdct, sizeof(AC3MDCTContext), init_fail);
-    ret = mdct_init(avctx, s->mdct, 9);
+    ret = s->mdct_init(avctx, s->mdct, 9);
     if (ret)
         goto init_fail;
 
@@ -2763,6 +2399,6 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
 
     return 0;
 init_fail:
-    ac3_encode_close(avctx);
+    ff_ac3_encode_close(avctx);
     return ret;
 }
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index ccdb963a7c..e9d7e0a83a 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -40,18 +40,28 @@
 #define CONFIG_AC3ENC_FLOAT 0
 #endif
 
+#define OFFSET(param) offsetof(AC3EncodeContext, options.param)
+#define AC3ENC_PARAM (AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
+
+#define AC3ENC_TYPE_AC3_FIXED   0
+#define AC3ENC_TYPE_AC3         1
+#define AC3ENC_TYPE_EAC3        2
+
 #if CONFIG_AC3ENC_FLOAT
+#define AC3_NAME(x) ff_ac3_float_ ## x
 #define MAC_COEF(d,a,b) ((d)+=(a)*(b))
 typedef float SampleType;
 typedef float CoefType;
 typedef float CoefSumType;
 #else
+#define AC3_NAME(x) ff_ac3_fixed_ ## x
 #define MAC_COEF(d,a,b) MAC64(d,a,b)
 typedef int16_t SampleType;
 typedef int32_t CoefType;
 typedef int64_t CoefSumType;
 #endif
 
+
 typedef struct AC3MDCTContext {
     const SampleType *window;           ///< MDCT window function
     FFTContext fft;                     ///< FFT context for MDCT calculation
@@ -132,6 +142,7 @@ typedef struct AC3EncodeContext {
 
     AC3Block blocks[AC3_MAX_BLOCKS];        ///< per-block info
 
+    int fixed_point;                        ///< indicates if fixed-point encoder is being used
     int eac3;                               ///< indicates if this is E-AC-3 vs. AC-3
     int bitstream_id;                       ///< bitstream id                           (bsid)
     int bitstream_mode;                     ///< bitstream mode                         (bsmod)
@@ -209,7 +220,75 @@ typedef struct AC3EncodeContext {
     uint8_t *ref_bap     [AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< bit allocation pointers (bap)
     int ref_bap_set;                                         ///< indicates if ref_bap pointers have been set
 
+    /* fixed vs. float function pointers */
+    void (*mdct_end)(AC3MDCTContext *mdct);
+    int  (*mdct_init)(AVCodecContext *avctx, AC3MDCTContext *mdct, int nbits);
+    void (*apply_window)(DSPContext *dsp, SampleType *output,
+                         const SampleType *input, const SampleType *window,
+                         unsigned int len);
+    int  (*normalize_samples)(struct AC3EncodeContext *s);
+    void (*scale_coefficients)(struct AC3EncodeContext *s);
+
+    /* fixed vs. float templated function pointers */
+    void (*deinterleave_input_samples)(struct AC3EncodeContext *s,
+                                       const SampleType *samples);
+    void (*apply_mdct)(struct AC3EncodeContext *s);
+    void (*apply_channel_coupling)(struct AC3EncodeContext *s);
+    void (*compute_rematrixing_strategy)(struct AC3EncodeContext *s);
+
+    /* AC-3 vs. E-AC-3 function pointers */
     void (*output_frame_header)(struct AC3EncodeContext *s);
 } AC3EncodeContext;
 
+
+extern const int64_t ff_ac3_channel_layouts[19];
+
+int ff_ac3_encode_init(AVCodecContext *avctx);
+
+int ff_ac3_encode_frame(AVCodecContext *avctx, unsigned char *frame,
+                        int buf_size, void *data);
+
+int ff_ac3_encode_close(AVCodecContext *avctx);
+
+
+/* prototypes for functions in ac3enc_fixed.c and ac3enc_float.c */
+
+void ff_ac3_fixed_mdct_end(AC3MDCTContext *mdct);
+void ff_ac3_float_mdct_end(AC3MDCTContext *mdct);
+
+int ff_ac3_fixed_mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
+                           int nbits);
+int ff_ac3_float_mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
+                           int nbits);
+
+void ff_ac3_fixed_apply_window(DSPContext *dsp, SampleType *output,
+                               const SampleType *input,
+                               const SampleType *window, unsigned int len);
+void ff_ac3_float_apply_window(DSPContext *dsp, SampleType *output,
+                               const SampleType *input,
+                               const SampleType *window, unsigned int len);
+
+int ff_ac3_fixed_normalize_samples(AC3EncodeContext *s);
+int ff_ac3_float_normalize_samples(AC3EncodeContext *s);
+
+void ff_ac3_fixed_scale_coefficients(AC3EncodeContext *s);
+void ff_ac3_float_scale_coefficients(AC3EncodeContext *s);
+
+
+/* prototypes for functions in ac3enc_template.c */
+
+void ff_ac3_fixed_deinterleave_input_samples(AC3EncodeContext *s,
+                                             const SampleType *samples);
+void ff_ac3_float_deinterleave_input_samples(AC3EncodeContext *s,
+                                             const SampleType *samples);
+
+void ff_ac3_fixed_apply_mdct(AC3EncodeContext *s);
+void ff_ac3_float_apply_mdct(AC3EncodeContext *s);
+
+void ff_ac3_fixed_apply_channel_coupling(AC3EncodeContext *s);
+void ff_ac3_float_apply_channel_coupling(AC3EncodeContext *s);
+
+void ff_ac3_fixed_compute_rematrixing_strategy(AC3EncodeContext *s);
+void ff_ac3_float_compute_rematrixing_strategy(AC3EncodeContext *s);
+
 #endif /* AVCODEC_AC3ENC_H */
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index 035ebb3de9..0620a6ac1a 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -28,13 +28,20 @@
 
 #define CONFIG_FFT_FLOAT 0
 #undef CONFIG_AC3ENC_FLOAT
-#include "ac3enc.c"
+#include "ac3enc.h"
+
+#define AC3ENC_TYPE AC3ENC_TYPE_AC3_FIXED
+#include "ac3enc_opts_template.c"
+static AVClass ac3enc_class = { "Fixed-Point AC-3 Encoder", av_default_item_name,
+                                ac3fixed_options, LIBAVUTIL_VERSION_INT };
+
+#include "ac3enc_template.c"
 
 
 /**
  * Finalize MDCT and free allocated memory.
  */
-static av_cold void mdct_end(AC3MDCTContext *mdct)
+av_cold void AC3_NAME(mdct_end)(AC3MDCTContext *mdct)
 {
     ff_mdct_end(&mdct->fft);
 }
@@ -44,8 +51,8 @@ static av_cold void mdct_end(AC3MDCTContext *mdct)
  * Initialize MDCT tables.
  * @param nbits log2(MDCT size)
  */
-static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
-                             int nbits)
+av_cold int AC3_NAME(mdct_init)(AVCodecContext *avctx, AC3MDCTContext *mdct,
+                                int nbits)
 {
     int ret = ff_mdct_init(&mdct->fft, nbits, 0, -1.0);
     mdct->window = ff_ac3_window;
@@ -56,8 +63,9 @@ static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
 /**
  * Apply KBD window to input samples prior to MDCT.
  */
-static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
-                         const int16_t *window, unsigned int len)
+void AC3_NAME(apply_window)(DSPContext *dsp, int16_t *output,
+                            const int16_t *input, const int16_t *window,
+                            unsigned int len)
 {
     dsp->apply_window_int16(output, input, window, len);
 }
@@ -82,7 +90,7 @@ static int log2_tab(AC3EncodeContext *s, int16_t *src, int len)
  *
  * @return exponent shift
  */
-static int normalize_samples(AC3EncodeContext *s)
+int AC3_NAME(normalize_samples)(AC3EncodeContext *s)
 {
     int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);
     if (v > 0)
@@ -95,7 +103,7 @@ static int normalize_samples(AC3EncodeContext *s)
 /**
  * Scale MDCT coefficients to 25-bit signed fixed-point.
  */
-static void scale_coefficients(AC3EncodeContext *s)
+void AC3_NAME(scale_coefficients)(AC3EncodeContext *s)
 {
     int blk, ch;
 
@@ -109,17 +117,25 @@ static void scale_coefficients(AC3EncodeContext *s)
 }
 
 
+static av_cold int ac3_fixed_encode_init(AVCodecContext *avctx)
+{
+    AC3EncodeContext *s = avctx->priv_data;
+    s->fixed_point = 1;
+    return ff_ac3_encode_init(avctx);
+}
+
+
 AVCodec ff_ac3_fixed_encoder = {
     "ac3_fixed",
     AVMEDIA_TYPE_AUDIO,
     CODEC_ID_AC3,
     sizeof(AC3EncodeContext),
-    ac3_encode_init,
-    ac3_encode_frame,
-    ac3_encode_close,
+    ac3_fixed_encode_init,
+    ff_ac3_encode_frame,
+    ff_ac3_encode_close,
     NULL,
     .sample_fmts = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_S16,AV_SAMPLE_FMT_NONE},
     .long_name = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
     .priv_class = &ac3enc_class,
-    .channel_layouts = ac3_channel_layouts,
+    .channel_layouts = ff_ac3_channel_layouts,
 };
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 012c31de5d..9c7e88ed1c 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -27,14 +27,25 @@
  */
 
 #define CONFIG_AC3ENC_FLOAT 1
-#include "ac3enc.c"
+#include "ac3enc.h"
+#include "eac3enc.h"
 #include "kbdwin.h"
 
 
+#if CONFIG_AC3_ENCODER
+#define AC3ENC_TYPE AC3ENC_TYPE_AC3
+#include "ac3enc_opts_template.c"
+static AVClass ac3enc_class = { "AC-3 Encoder", av_default_item_name,
+                                ac3_options, LIBAVUTIL_VERSION_INT };
+#endif
+
+#include "ac3enc_template.c"
+
+
 /**
  * Finalize MDCT and free allocated memory.
  */
-static av_cold void mdct_end(AC3MDCTContext *mdct)
+av_cold void ff_ac3_float_mdct_end(AC3MDCTContext *mdct)
 {
     ff_mdct_end(&mdct->fft);
     av_freep(&mdct->window);
@@ -45,8 +56,8 @@ static av_cold void mdct_end(AC3MDCTContext *mdct)
  * Initialize MDCT tables.
  * @param nbits log2(MDCT size)
  */
-static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
-                             int nbits)
+av_cold int ff_ac3_float_mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
+                                   int nbits)
 {
     float *window;
     int i, n, n2;
@@ -71,8 +82,9 @@ static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
 /**
  * Apply KBD window to input samples prior to MDCT.
  */
-static void apply_window(DSPContext *dsp, float *output, const float *input,
-                         const float *window, unsigned int len)
+void ff_ac3_float_apply_window(DSPContext *dsp, float *output,
+                               const float *input, const float *window,
+                               unsigned int len)
 {
     dsp->vector_fmul(output, input, window, len);
 }
@@ -81,7 +93,7 @@ static void apply_window(DSPContext *dsp, float *output, const float *input,
 /**
  * Normalize the input samples to use the maximum available precision.
  */
-static int normalize_samples(AC3EncodeContext *s)
+int ff_ac3_float_normalize_samples(AC3EncodeContext *s)
 {
     /* Normalization is not needed for floating-point samples, so just return 0 */
     return 0;
@@ -91,7 +103,7 @@ static int normalize_samples(AC3EncodeContext *s)
 /**
  * Scale MDCT coefficients from float to 24-bit fixed-point.
  */
-static void scale_coefficients(AC3EncodeContext *s)
+void ff_ac3_float_scale_coefficients(AC3EncodeContext *s)
 {
     int chan_size = AC3_MAX_COEFS * AC3_MAX_BLOCKS;
     s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer + chan_size,
@@ -106,29 +118,13 @@ AVCodec ff_ac3_encoder = {
     AVMEDIA_TYPE_AUDIO,
     CODEC_ID_AC3,
     sizeof(AC3EncodeContext),
-    ac3_encode_init,
-    ac3_encode_frame,
-    ac3_encode_close,
+    ff_ac3_encode_init,
+    ff_ac3_encode_frame,
+    ff_ac3_encode_close,
     NULL,
     .sample_fmts = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_FLT,AV_SAMPLE_FMT_NONE},
     .long_name = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
     .priv_class = &ac3enc_class,
-    .channel_layouts = ac3_channel_layouts,
-};
-#endif
-
-#if CONFIG_EAC3_ENCODER
-AVCodec ff_eac3_encoder = {
-    .name            = "eac3",
-    .type            = AVMEDIA_TYPE_AUDIO,
-    .id              = CODEC_ID_EAC3,
-    .priv_data_size  = sizeof(AC3EncodeContext),
-    .init            = ac3_encode_init,
-    .encode          = ac3_encode_frame,
-    .close           = ac3_encode_close,
-    .sample_fmts     = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_FLT,AV_SAMPLE_FMT_NONE},
-    .long_name       = NULL_IF_CONFIG_SMALL("ATSC A/52 E-AC-3"),
-    .priv_class      = &eac3enc_class,
-    .channel_layouts = ac3_channel_layouts,
+    .channel_layouts = ff_ac3_channel_layouts,
 };
 #endif
diff --git a/libavcodec/ac3enc_opts_template.c b/libavcodec/ac3enc_opts_template.c
index e16e0d0878..39138a1083 100644
--- a/libavcodec/ac3enc_opts_template.c
+++ b/libavcodec/ac3enc_opts_template.c
@@ -19,6 +19,9 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/opt.h"
+#include "ac3.h"
+
 #if AC3ENC_TYPE == AC3ENC_TYPE_AC3_FIXED
 static const AVOption ac3fixed_options[] = {
 #elif AC3ENC_TYPE == AC3ENC_TYPE_AC3
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
new file mode 100644
index 0000000000..d88fa225a1
--- /dev/null
+++ b/libavcodec/ac3enc_template.c
@@ -0,0 +1,377 @@
+/*
+ * AC-3 encoder float/fixed template
+ * Copyright (c) 2000 Fabrice Bellard
+ * Copyright (c) 2006-2011 Justin Ruggles <justin.ruggles@gmail.com>
+ * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AC-3 encoder float/fixed template
+ */
+
+#include <stdint.h>
+
+#include "ac3enc.h"
+
+
+/**
+ * Deinterleave input samples.
+ * Channels are reordered from Libav's default order to AC-3 order.
+ */
+void AC3_NAME(deinterleave_input_samples)(AC3EncodeContext *s,
+                                          const SampleType *samples)
+{
+    int ch, i;
+
+    /* deinterleave and remap input samples */
+    for (ch = 0; ch < s->channels; ch++) {
+        const SampleType *sptr;
+        int sinc;
+
+        /* copy last 256 samples of previous frame to the start of the current frame */
+        memcpy(&s->planar_samples[ch][0], &s->planar_samples[ch][AC3_FRAME_SIZE],
+               AC3_BLOCK_SIZE * sizeof(s->planar_samples[0][0]));
+
+        /* deinterleave */
+        sinc = s->channels;
+        sptr = samples + s->channel_map[ch];
+        for (i = AC3_BLOCK_SIZE; i < AC3_FRAME_SIZE+AC3_BLOCK_SIZE; i++) {
+            s->planar_samples[ch][i] = *sptr;
+            sptr += sinc;
+        }
+    }
+}
+
+
+/**
+ * Apply the MDCT to input samples to generate frequency coefficients.
+ * This applies the KBD window and normalizes the input to reduce precision
+ * loss due to fixed-point calculations.
+ */
+void AC3_NAME(apply_mdct)(AC3EncodeContext *s)
+{
+    int blk, ch;
+
+    for (ch = 0; ch < s->channels; ch++) {
+        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+            AC3Block *block = &s->blocks[blk];
+            const SampleType *input_samples = &s->planar_samples[ch][blk * AC3_BLOCK_SIZE];
+
+            s->apply_window(&s->dsp, s->windowed_samples, input_samples,
+                            s->mdct->window, AC3_WINDOW_SIZE);
+
+            if (s->fixed_point)
+                block->coeff_shift[ch+1] = s->normalize_samples(s);
+
+            s->mdct->fft.mdct_calcw(&s->mdct->fft, block->mdct_coef[ch+1],
+                                    s->windowed_samples);
+        }
+    }
+}
+
+
+/**
+ * Calculate a single coupling coordinate.
+ */
+static inline float calc_cpl_coord(float energy_ch, float energy_cpl)
+{
+    float coord = 0.125;
+    if (energy_cpl > 0)
+        coord *= sqrtf(energy_ch / energy_cpl);
+    return coord;
+}
+
+
+/**
+ * Calculate coupling channel and coupling coordinates.
+ * TODO: Currently this is only used for the floating-point encoder. I was
+ *       able to make it work for the fixed-point encoder, but quality was
+ *       generally lower in most cases than not using coupling. If a more
+ *       adaptive coupling strategy were to be implemented it might be useful
+ *       at that time to use coupling for the fixed-point encoder as well.
+ */
+void AC3_NAME(apply_channel_coupling)(AC3EncodeContext *s)
+{
+#if CONFIG_AC3ENC_FLOAT
+    LOCAL_ALIGNED_16(float,   cpl_coords,       [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
+    LOCAL_ALIGNED_16(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
+    int blk, ch, bnd, i, j;
+    CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}};
+    int num_cpl_coefs = s->num_cpl_subbands * 12;
+
+    memset(cpl_coords,       0, AC3_MAX_BLOCKS * sizeof(*cpl_coords));
+    memset(fixed_cpl_coords, 0, AC3_MAX_BLOCKS * sizeof(*fixed_cpl_coords));
+
+    /* calculate coupling channel from fbw channels */
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        CoefType *cpl_coef = &block->mdct_coef[CPL_CH][s->start_freq[CPL_CH]];
+        if (!block->cpl_in_use)
+            continue;
+        memset(cpl_coef-1, 0, (num_cpl_coefs+4) * sizeof(*cpl_coef));
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            CoefType *ch_coef = &block->mdct_coef[ch][s->start_freq[CPL_CH]];
+            if (!block->channel_in_cpl[ch])
+                continue;
+            for (i = 0; i < num_cpl_coefs; i++)
+                cpl_coef[i] += ch_coef[i];
+        }
+        /* note: coupling start bin % 4 will always be 1 and num_cpl_coefs
+                 will always be a multiple of 12, so we need to subtract 1 from
+                 the start and add 4 to the length when using optimized
+                 functions which require 16-byte alignment. */
+
+        /* coefficients must be clipped to +/- 1.0 in order to be encoded */
+        s->dsp.vector_clipf(cpl_coef-1, cpl_coef-1, -1.0f, 1.0f, num_cpl_coefs+4);
+
+        /* scale coupling coefficients from float to 24-bit fixed-point */
+        s->ac3dsp.float_to_fixed24(&block->fixed_coef[CPL_CH][s->start_freq[CPL_CH]-1],
+                                   cpl_coef-1, num_cpl_coefs+4);
+    }
+
+    /* calculate energy in each band in coupling channel and each fbw channel */
+    /* TODO: possibly use SIMD to speed up energy calculation */
+    bnd = 0;
+    i = s->start_freq[CPL_CH];
+    while (i < s->cpl_end_freq) {
+        int band_size = s->cpl_band_sizes[bnd];
+        for (ch = CPL_CH; ch <= s->fbw_channels; ch++) {
+            for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+                AC3Block *block = &s->blocks[blk];
+                if (!block->cpl_in_use || (ch > CPL_CH && !block->channel_in_cpl[ch]))
+                    continue;
+                for (j = 0; j < band_size; j++) {
+                    CoefType v = block->mdct_coef[ch][i+j];
+                    MAC_COEF(energy[blk][ch][bnd], v, v);
+                }
+            }
+        }
+        i += band_size;
+        bnd++;
+    }
+
+    /* determine which blocks to send new coupling coordinates for */
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block  = &s->blocks[blk];
+        AC3Block *block0 = blk ? &s->blocks[blk-1] : NULL;
+        int new_coords = 0;
+        CoefSumType coord_diff[AC3_MAX_CHANNELS] = {0,};
+
+        if (block->cpl_in_use) {
+            /* calculate coupling coordinates for all blocks and calculate the
+               average difference between coordinates in successive blocks */
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
+                if (!block->channel_in_cpl[ch])
+                    continue;
+
+                for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                    cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy[blk][ch][bnd],
+                                                              energy[blk][CPL_CH][bnd]);
+                    if (blk > 0 && block0->cpl_in_use &&
+                        block0->channel_in_cpl[ch]) {
+                        coord_diff[ch] += fabs(cpl_coords[blk-1][ch][bnd] -
+                                               cpl_coords[blk  ][ch][bnd]);
+                    }
+                }
+                coord_diff[ch] /= s->num_cpl_bands;
+            }
+
+            /* send new coordinates if this is the first block, if previous
+             * block did not use coupling but this block does, the channels
+             * using coupling has changed from the previous block, or the
+             * coordinate difference from the last block for any channel is
+             * greater than a threshold value. */
+            if (blk == 0) {
+                new_coords = 1;
+            } else if (!block0->cpl_in_use) {
+                new_coords = 1;
+            } else {
+                for (ch = 1; ch <= s->fbw_channels; ch++) {
+                    if (block->channel_in_cpl[ch] && !block0->channel_in_cpl[ch]) {
+                        new_coords = 1;
+                        break;
+                    }
+                }
+                if (!new_coords) {
+                    for (ch = 1; ch <= s->fbw_channels; ch++) {
+                        if (block->channel_in_cpl[ch] && coord_diff[ch] > 0.04) {
+                            new_coords = 1;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+        block->new_cpl_coords = new_coords;
+    }
+
+    /* calculate final coupling coordinates, taking into account reusing of
+       coordinates in successive blocks */
+    for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+        blk = 0;
+        while (blk < AC3_MAX_BLOCKS) {
+            int blk1;
+            CoefSumType energy_cpl;
+            AC3Block *block  = &s->blocks[blk];
+
+            if (!block->cpl_in_use) {
+                blk++;
+                continue;
+            }
+
+            energy_cpl = energy[blk][CPL_CH][bnd];
+            blk1 = blk+1;
+            while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
+                if (s->blocks[blk1].cpl_in_use)
+                    energy_cpl += energy[blk1][CPL_CH][bnd];
+                blk1++;
+            }
+
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
+                CoefType energy_ch;
+                if (!block->channel_in_cpl[ch])
+                    continue;
+                energy_ch = energy[blk][ch][bnd];
+                blk1 = blk+1;
+                while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
+                    if (s->blocks[blk1].cpl_in_use)
+                        energy_ch += energy[blk1][ch][bnd];
+                    blk1++;
+                }
+                cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy_ch, energy_cpl);
+            }
+            blk = blk1;
+        }
+    }
+
+    /* calculate exponents/mantissas for coupling coordinates */
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        if (!block->cpl_in_use || !block->new_cpl_coords)
+            continue;
+
+        s->ac3dsp.float_to_fixed24(fixed_cpl_coords[blk][1],
+                                   cpl_coords[blk][1],
+                                   s->fbw_channels * 16);
+        s->ac3dsp.extract_exponents(block->cpl_coord_exp[1],
+                                    fixed_cpl_coords[blk][1],
+                                    s->fbw_channels * 16);
+
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            int bnd, min_exp, max_exp, master_exp;
+
+            /* determine master exponent */
+            min_exp = max_exp = block->cpl_coord_exp[ch][0];
+            for (bnd = 1; bnd < s->num_cpl_bands; bnd++) {
+                int exp = block->cpl_coord_exp[ch][bnd];
+                min_exp = FFMIN(exp, min_exp);
+                max_exp = FFMAX(exp, max_exp);
+            }
+            master_exp = ((max_exp - 15) + 2) / 3;
+            master_exp = FFMAX(master_exp, 0);
+            while (min_exp < master_exp * 3)
+                master_exp--;
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                block->cpl_coord_exp[ch][bnd] = av_clip(block->cpl_coord_exp[ch][bnd] -
+                                                        master_exp * 3, 0, 15);
+            }
+            block->cpl_master_exp[ch] = master_exp;
+
+            /* quantize mantissas */
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                int cpl_exp  = block->cpl_coord_exp[ch][bnd];
+                int cpl_mant = (fixed_cpl_coords[blk][ch][bnd] << (5 + cpl_exp + master_exp * 3)) >> 24;
+                if (cpl_exp == 15)
+                    cpl_mant >>= 1;
+                else
+                    cpl_mant -= 16;
+
+                block->cpl_coord_mant[ch][bnd] = cpl_mant;
+            }
+        }
+    }
+
+    if (CONFIG_EAC3_ENCODER && s->eac3)
+        ff_eac3_set_cpl_states(s);
+#endif /* CONFIG_AC3ENC_FLOAT */
+}
+
+
+/**
+ * Determine rematrixing flags for each block and band.
+ */
+void AC3_NAME(compute_rematrixing_strategy)(AC3EncodeContext *s)
+{
+    int nb_coefs;
+    int blk, bnd, i;
+    AC3Block *block, *av_uninit(block0);
+
+    if (s->channel_mode != AC3_CHMODE_STEREO)
+        return;
+
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        block = &s->blocks[blk];
+        block->new_rematrixing_strategy = !blk;
+
+        if (!s->rematrixing_enabled) {
+            block0 = block;
+            continue;
+        }
+
+        block->num_rematrixing_bands = 4;
+        if (block->cpl_in_use) {
+            block->num_rematrixing_bands -= (s->start_freq[CPL_CH] <= 61);
+            block->num_rematrixing_bands -= (s->start_freq[CPL_CH] == 37);
+            if (blk && block->num_rematrixing_bands != block0->num_rematrixing_bands)
+                block->new_rematrixing_strategy = 1;
+        }
+        nb_coefs = FFMIN(block->end_freq[1], block->end_freq[2]);
+
+        for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
+            /* calculate calculate sum of squared coeffs for one band in one block */
+            int start = ff_ac3_rematrix_band_tab[bnd];
+            int end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
+            CoefSumType sum[4] = {0,};
+            for (i = start; i < end; i++) {
+                CoefType lt = block->mdct_coef[1][i];
+                CoefType rt = block->mdct_coef[2][i];
+                CoefType md = lt + rt;
+                CoefType sd = lt - rt;
+                MAC_COEF(sum[0], lt, lt);
+                MAC_COEF(sum[1], rt, rt);
+                MAC_COEF(sum[2], md, md);
+                MAC_COEF(sum[3], sd, sd);
+            }
+
+            /* compare sums to determine if rematrixing will be used for this band */
+            if (FFMIN(sum[2], sum[3]) < FFMIN(sum[0], sum[1]))
+                block->rematrixing_flags[bnd] = 1;
+            else
+                block->rematrixing_flags[bnd] = 0;
+
+            /* determine if new rematrixing flags will be sent */
+            if (blk &&
+                block->rematrixing_flags[bnd] != block0->rematrixing_flags[bnd]) {
+                block->new_rematrixing_strategy = 1;
+            }
+        }
+        block0 = block;
+    }
+}
diff --git a/libavcodec/eac3enc.c b/libavcodec/eac3enc.c
index 20f4b879c6..d37acaf20b 100644
--- a/libavcodec/eac3enc.c
+++ b/libavcodec/eac3enc.c
@@ -28,6 +28,13 @@
 #include "ac3enc.h"
 #include "eac3enc.h"
 
+
+#define AC3ENC_TYPE AC3ENC_TYPE_EAC3
+#include "ac3enc_opts_template.c"
+static AVClass eac3enc_class = { "E-AC-3 Encoder", av_default_item_name,
+                                 eac3_options, LIBAVUTIL_VERSION_INT };
+
+
 void ff_eac3_set_cpl_states(AC3EncodeContext *s)
 {
     int ch, blk;
@@ -129,3 +136,20 @@ void ff_eac3_output_frame_header(AC3EncodeContext *s)
     /* block start info */
     put_bits(&s->pb, 1, 0);
 }
+
+
+#if CONFIG_EAC3_ENCODER
+AVCodec ff_eac3_encoder = {
+    .name            = "eac3",
+    .type            = AVMEDIA_TYPE_AUDIO,
+    .id              = CODEC_ID_EAC3,
+    .priv_data_size  = sizeof(AC3EncodeContext),
+    .init            = ff_ac3_encode_init,
+    .encode          = ff_ac3_encode_frame,
+    .close           = ff_ac3_encode_close,
+    .sample_fmts     = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_FLT,AV_SAMPLE_FMT_NONE},
+    .long_name       = NULL_IF_CONFIG_SMALL("ATSC A/52 E-AC-3"),
+    .priv_class      = &eac3enc_class,
+    .channel_layouts = ff_ac3_channel_layouts,
+};
+#endif

From 38c304addd978410956c8ff02ea83a6ffb9a606a Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 10 Jun 2011 15:17:55 -0400
Subject: [PATCH 23/24] ac3enc: remove empty ac3_float function that is never
 called

---
 libavcodec/ac3enc.c       |  1 -
 libavcodec/ac3enc.h       |  1 -
 libavcodec/ac3enc_float.c | 10 ----------
 3 files changed, 12 deletions(-)

diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 1147ed142e..78e81654e3 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -2363,7 +2363,6 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
         s->mdct_end                     = ff_ac3_float_mdct_end;
         s->mdct_init                    = ff_ac3_float_mdct_init;
         s->apply_window                 = ff_ac3_float_apply_window;
-        s->normalize_samples            = ff_ac3_float_normalize_samples;
         s->scale_coefficients           = ff_ac3_float_scale_coefficients;
         s->deinterleave_input_samples   = ff_ac3_float_deinterleave_input_samples;
         s->apply_mdct                   = ff_ac3_float_apply_mdct;
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index e9d7e0a83a..1d17484321 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -269,7 +269,6 @@ void ff_ac3_float_apply_window(DSPContext *dsp, SampleType *output,
                                const SampleType *window, unsigned int len);
 
 int ff_ac3_fixed_normalize_samples(AC3EncodeContext *s);
-int ff_ac3_float_normalize_samples(AC3EncodeContext *s);
 
 void ff_ac3_fixed_scale_coefficients(AC3EncodeContext *s);
 void ff_ac3_float_scale_coefficients(AC3EncodeContext *s);
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 9c7e88ed1c..43fbb954d6 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -90,16 +90,6 @@ void ff_ac3_float_apply_window(DSPContext *dsp, float *output,
 }
 
 
-/**
- * Normalize the input samples to use the maximum available precision.
- */
-int ff_ac3_float_normalize_samples(AC3EncodeContext *s)
-{
-    /* Normalization is not needed for floating-point samples, so just return 0 */
-    return 0;
-}
-
-
 /**
  * Scale MDCT coefficients from float to 24-bit fixed-point.
  */

From 35bdaf3d427b6856df01d41ee826bd515440ec46 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Fri, 10 Jun 2011 20:27:50 +0200
Subject: [PATCH 24/24] utils: Drop pointless '#if 1' preprocessor directive.

---
 libavformat/utils.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libavformat/utils.c b/libavformat/utils.c
index d0fd0d46ff..60f4d03e4b 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -1577,14 +1577,12 @@ int64_t av_gen_search(AVFormatContext *s, int stream_index, int64_t target_ts, i
 
     pos = (flags & AVSEEK_FLAG_BACKWARD) ? pos_min : pos_max;
     ts  = (flags & AVSEEK_FLAG_BACKWARD) ?  ts_min :  ts_max;
-#if 1
     pos_min = pos;
     ts_min = read_timestamp(s, stream_index, &pos_min, INT64_MAX);
     pos_min++;
     ts_max = read_timestamp(s, stream_index, &pos_min, INT64_MAX);
     av_dlog(s, "pos=0x%"PRIx64" %"PRId64"<=%"PRId64"<=%"PRId64"\n",
             pos, ts_min, target_ts, ts_max);
-#endif
     *ts_ret= ts;
     return pos;
 }