Increase alignment of av_malloc() as needed by AVX ASM.

Signed-off-by: Reinhard Tartler <siretart@tauware.de>
2011-04-23 19:24:31 +02:00 · 2011-04-23 19:24:31 +02:00 · 13dfce3d44
parent 33cbfa6fa3
commit 13dfce3d44
1 changed files with 7 additions and 9 deletions
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@@ -69,21 +69,21 @@ void *av_malloc(size_t size)
 #endif
    /* let's disallow possible ambiguous cases */
-    if(size > (INT_MAX-16) )
+    if(size > (INT_MAX-32) )
        return NULL;
 #if CONFIG_MEMALIGN_HACK
-    ptr = malloc(size+16);
+    ptr = malloc(size+32);
    if(!ptr)
        return ptr;
-    diff= ((-(long)ptr - 1)&15) + 1;
+    diff= ((-(long)ptr - 1)&31) + 1;
    ptr = (char*)ptr + diff;
    ((char*)ptr)[-1]= diff;
 #elif HAVE_POSIX_MEMALIGN
-    if (posix_memalign(&ptr,16,size))
+    if (posix_memalign(&ptr,32,size))
        ptr = NULL;
 #elif HAVE_MEMALIGN
-    ptr = memalign(16,size);
+    ptr = memalign(32,size);
    /* Why 64?
       Indeed, we should align it:
         on 4 for 386
@@ -93,10 +93,8 @@ void *av_malloc(size_t size)
       Because L1 and L2 caches are aligned on those values.
       But I don't want to code such logic here!
     */
-     /* Why 16?
+     /* Why 32?
-        Because some CPUs need alignment, for example SSE2 on P4, & most RISC CPUs
+        For AVX ASM. SSE / NEON needs only 16.
-        it will just trigger an exception and the unaligned load will be done in the
-        exception handler or it will just segfault (SSE2 on P4).
        Why not larger? Because I did not see a difference in benchmarks ...
     */
     /* benchmarks with P3