diff --git a/asmalign.h b/asmalign.h
deleted file mode 100644
index 45a59cdaa7..0000000000
--- a/asmalign.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifdef SYS_DARWIN
-#define ASMALIGN8  ".align 3\n\t"
-#define ASMALIGN16 ".align 4\n\t"
-#else
-#define ASMALIGN8  ".balign 8\n\t"
-#define ASMALIGN16 ".balign 16\n\t"
-#endif
diff --git a/liba52/downmix.c b/liba52/downmix.c
index fc28071618..91c21a2ef7 100644
--- a/liba52/downmix.c
+++ b/liba52/downmix.c
@@ -28,7 +28,6 @@
  */
 
 #include "config.h"
-#include "asmalign.h"
 
 #include <string.h>
 #include <inttypes.h>
@@ -694,7 +693,7 @@ static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias)
 	"movlps %2, %%xmm7		\n\t"
 	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 	"mov $-1024, %%"REG_S"		\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:				\n\t"
 	"movaps (%0, %%"REG_S"), %%xmm0	\n\t" 
 	"movaps 16(%0, %%"REG_S"), %%xmm1\n\t" 
@@ -717,7 +716,7 @@ static void mix3to1_SSE (sample_t * samples, sample_t bias)
 	"movlps %1, %%xmm7		\n\t"
 	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 	"mov $-1024, %%"REG_S"		\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:				\n\t"
 	"movaps (%0, %%"REG_S"), %%xmm0	\n\t" 
 	"movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" 
@@ -738,7 +737,7 @@ static void mix4to1_SSE (sample_t * samples, sample_t bias)
 	"movlps %1, %%xmm7		\n\t"
 	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 	"mov $-1024, %%"REG_S"		\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:				\n\t"
 	"movaps (%0, %%"REG_S"), %%xmm0	\n\t" 
 	"movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" 
@@ -760,7 +759,7 @@ static void mix5to1_SSE (sample_t * samples, sample_t bias)
 	"movlps %1, %%xmm7		\n\t"
 	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 	"mov $-1024, %%"REG_S"		\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:				\n\t"
 	"movaps (%0, %%"REG_S"), %%xmm0	\n\t" 
 	"movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" 
@@ -783,7 +782,7 @@ static void mix3to2_SSE (sample_t * samples, sample_t bias)
 	"movlps %1, %%xmm7		\n\t"
 	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 	"mov $-1024, %%"REG_S"		\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:				\n\t"
 	"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" 
 	"addps %%xmm7, %%xmm0		\n\t" //common
@@ -806,7 +805,7 @@ static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias)
 		"movlps %2, %%xmm7		\n\t"
 		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 		"mov $-1024, %%"REG_S"		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" 
 		"addps %%xmm7, %%xmm0		\n\t" //common
@@ -829,7 +828,7 @@ static void mix21toS_SSE (sample_t * samples, sample_t bias)
 		"movlps %1, %%xmm7		\n\t"
 		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 		"mov $-1024, %%"REG_S"		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"  // surround
 		"movaps (%0, %%"REG_S"), %%xmm1	\n\t" 
@@ -853,7 +852,7 @@ static void mix31to2_SSE (sample_t * samples, sample_t bias)
 		"movlps %1, %%xmm7		\n\t"
 		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 		"mov $-1024, %%"REG_S"		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"  
 		"addps 3072(%0, %%"REG_S"), %%xmm0\n\t"  
@@ -877,7 +876,7 @@ static void mix31toS_SSE (sample_t * samples, sample_t bias)
 		"movlps %1, %%xmm7		\n\t"
 		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 		"mov $-1024, %%"REG_S"		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"  
 		"movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround
@@ -903,7 +902,7 @@ static void mix22toS_SSE (sample_t * samples, sample_t bias)
 		"movlps %1, %%xmm7		\n\t"
 		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 		"mov $-1024, %%"REG_S"		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"  
 		"addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround
@@ -928,7 +927,7 @@ static void mix32to2_SSE (sample_t * samples, sample_t bias)
 	"movlps %1, %%xmm7		\n\t"
 	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 	"mov $-1024, %%"REG_S"		\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:				\n\t"
 	"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" 
 	"addps %%xmm7, %%xmm0		\n\t" // common
@@ -952,7 +951,7 @@ static void mix32toS_SSE (sample_t * samples, sample_t bias)
 	"movlps %1, %%xmm7		\n\t"
 	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 	"mov $-1024, %%"REG_S"		\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:				\n\t"
 	"movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" 
 	"movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" 
@@ -979,7 +978,7 @@ static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias)
 		"movlps %2, %%xmm7		\n\t"
 		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
 		"mov $-1024, %%"REG_S"		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movaps (%0, %%"REG_S"), %%xmm0	\n\t"  
 		"movaps 16(%0, %%"REG_S"), %%xmm1\n\t"  
@@ -1001,7 +1000,7 @@ static void zero_MMX(sample_t * samples)
 	asm volatile(
 		"mov $-1024, %%"REG_S"		\n\t"
 		"pxor %%mm0, %%mm0		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movq %%mm0, (%0, %%"REG_S")	\n\t"
 		"movq %%mm0, 8(%0, %%"REG_S")	\n\t"
@@ -1261,7 +1260,7 @@ static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias)
 	"movd  %2, %%mm7	\n\t"
 	"punpckldq %2, %%mm7	\n\t"
 	"mov $-1024, %%"REG_S"	\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:			\n\t"
 	"movq  (%0, %%"REG_S"), %%mm0	\n\t" 
 	"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
@@ -1292,7 +1291,7 @@ static void mix3to1_3dnow (sample_t * samples, sample_t bias)
 	"movd  %1, %%mm7	\n\t"
 	"punpckldq %1, %%mm7	\n\t"
 	"mov $-1024, %%"REG_S"	\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:			\n\t"
 	"movq  (%0, %%"REG_S"), %%mm0	\n\t" 
 	"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
@@ -1319,7 +1318,7 @@ static void mix4to1_3dnow (sample_t * samples, sample_t bias)
 	"movd  %1, %%mm7	\n\t"
 	"punpckldq %1, %%mm7	\n\t"
 	"mov $-1024, %%"REG_S"	\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:			\n\t"
 	"movq  (%0, %%"REG_S"), %%mm0	\n\t" 
 	"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
@@ -1348,7 +1347,7 @@ static void mix5to1_3dnow (sample_t * samples, sample_t bias)
 	"movd  %1, %%mm7	\n\t"
 	"punpckldq %1, %%mm7	\n\t"
 	"mov $-1024, %%"REG_S"	\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:			\n\t"
 	"movq  (%0, %%"REG_S"), %%mm0	\n\t" 
 	"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
@@ -1379,7 +1378,7 @@ static void mix3to2_3dnow (sample_t * samples, sample_t bias)
 	"movd  %1, %%mm7	\n\t"
 	"punpckldq %1, %%mm7	\n\t"
 	"mov $-1024, %%"REG_S"	\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:			\n\t"
 	"movq   1024(%0, %%"REG_S"), %%mm0\n\t" 
 	"movq   1032(%0, %%"REG_S"), %%mm1\n\t"
@@ -1410,7 +1409,7 @@ static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias)
 		"movd  %2, %%mm7	\n\t"
 		"punpckldq %2, %%mm7	\n\t"
 		"mov $-1024, %%"REG_S"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:			\n\t"
 		"movq  1024(%1, %%"REG_S"), %%mm0\n\t" 
 		"movq  1032(%1, %%"REG_S"), %%mm1\n\t"
@@ -1441,7 +1440,7 @@ static void mix21toS_3dnow (sample_t * samples, sample_t bias)
 		"movd  %1, %%mm7	\n\t"
 		"punpckldq %1, %%mm7	\n\t"
 		"mov $-1024, %%"REG_S"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:			\n\t"
 		"movq  2048(%0, %%"REG_S"), %%mm0\n\t"  // surround
 		"movq  2056(%0, %%"REG_S"), %%mm1\n\t"  // surround
@@ -1474,7 +1473,7 @@ static void mix31to2_3dnow (sample_t * samples, sample_t bias)
 		"movd  %1, %%mm7	\n\t"
 		"punpckldq %1, %%mm7	\n\t"
 		"mov $-1024, %%"REG_S"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:			\n\t"
 		"movq  1024(%0, %%"REG_S"), %%mm0\n\t"  
 		"movq  1032(%0, %%"REG_S"), %%mm1\n\t"
@@ -1507,7 +1506,7 @@ static void mix31toS_3dnow (sample_t * samples, sample_t bias)
 		"movd  %1, %%mm7	\n\t"
 		"punpckldq %1, %%mm7	\n\t"
 		"mov $-1024, %%"REG_S"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:			\n\t"
 		"movq   1024(%0, %%"REG_S"), %%mm0\n\t"  
 		"movq   1032(%0, %%"REG_S"), %%mm1\n\t"
@@ -1544,7 +1543,7 @@ static void mix22toS_3dnow (sample_t * samples, sample_t bias)
 		"movd  %1, %%mm7	\n\t"
 		"punpckldq %1, %%mm7	\n\t"
 		"mov $-1024, %%"REG_S"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:			\n\t"
 		"movq  2048(%0, %%"REG_S"), %%mm0\n\t"  
 		"movq  2056(%0, %%"REG_S"), %%mm1\n\t"
@@ -1579,7 +1578,7 @@ static void mix32to2_3dnow (sample_t * samples, sample_t bias)
 	"movd  %1, %%mm7	\n\t"
 	"punpckldq %1, %%mm7	\n\t"
 	"mov $-1024, %%"REG_S"	\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:			\n\t"
 	"movq   1024(%0, %%"REG_S"), %%mm0\n\t" 
 	"movq   1032(%0, %%"REG_S"), %%mm1\n\t"
@@ -1611,7 +1610,7 @@ static void mix32toS_3dnow (sample_t * samples, sample_t bias)
 {
 	asm volatile(
 	"mov $-1024, %%"REG_S"		\n\t"
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:			\n\t"
 	"movd  %1, %%mm7		\n\t"
 	"punpckldq %1, %%mm7		\n\t"
@@ -1652,7 +1651,7 @@ static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias)
 		"movd  %2, %%mm7	\n\t"
 		"punpckldq %2, %%mm7	\n\t"
 		"mov $-1024, %%"REG_S"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:			\n\t"
 		"movq  (%0, %%"REG_S"), %%mm0	\n\t"  
 		"movq  8(%0, %%"REG_S"), %%mm1	\n\t"
diff --git a/liba52/imdct.c b/liba52/imdct.c
index 48dc536c96..aa4a82547d 100644
--- a/liba52/imdct.c
+++ b/liba52/imdct.c
@@ -34,7 +34,6 @@
  */
 
 #include "config.h"
-#include "asmalign.h"
 
 #include <math.h>
 #include <stdio.h>
@@ -769,7 +768,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 		"lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
 		"mov $1008, %%"REG_D"			\n\t"
 		"push %%"REG_BP"			\n\t" //use ebp without telling gcc
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:					\n\t"
 		"movlps (%0, %%"REG_S"), %%xmm0	\n\t" // XXXI
 		"movhps 8(%0, %%"REG_D"), %%xmm0	\n\t" // RXXI
@@ -828,7 +827,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 		"xorps %%xmm1, %%xmm1	\n\t"
 		"xorps %%xmm2, %%xmm2	\n\t"
 		"mov %0, %%"REG_S"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:			\n\t"
 		"movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
 		"movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
@@ -849,7 +848,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 	asm volatile(
 		"movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
 		"mov %0, %%"REG_S"		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movaps 16(%%"REG_S"), %%xmm2	\n\t" //r2,i2,r3,i3
 		"shufps $0xB4, %%xmm2, %%xmm2	\n\t" //r2,i2,i3,r3
@@ -880,7 +879,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 		"xorps %%xmm5, %%xmm5		\n\t"
 		"xorps %%xmm2, %%xmm2		\n\t"
 		"mov %0, %%"REG_S"		\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movaps 32(%%"REG_S"), %%xmm2	\n\t" //r4,i4,r5,i5
 		"movaps 48(%%"REG_S"), %%xmm3	\n\t" //r6,i6,r7,i7
@@ -921,7 +920,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 	buf_offset = buf+128;
 	asm volatile(
 		"mov %0, %%"REG_S"			\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:					\n\t"
 		"xor %%"REG_D", %%"REG_D"		\n\t" // k
 		"lea (%%"REG_S", %3), %%"REG_d"		\n\t"
@@ -953,7 +952,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
     /* Post IFFT complex multiply  plus IFFT complex conjugate*/
 	asm volatile(
 		"mov $-1024, %%"REG_S"			\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:					\n\t"
 		"movaps (%0, %%"REG_S"), %%xmm0		\n\t"
 		"movaps (%0, %%"REG_S"), %%xmm1		\n\t"
@@ -979,7 +978,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
 		"movss %3, %%xmm2			\n\t"  // bias
 		"shufps $0x00, %%xmm2, %%xmm2		\n\t"  // bias, bias, ...
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:					\n\t"
 		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? A ?
 		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? C ?
@@ -1006,7 +1005,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
 		"movss %3, %%xmm2			\n\t"  // bias
 		"shufps $0x00, %%xmm2, %%xmm2		\n\t"  // bias, bias, ...
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:					\n\t"
 		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? ? A
 		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? ? C
@@ -1033,7 +1032,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 	asm volatile(
 		"xor %%"REG_D", %%"REG_D"		\n\t"  // 0
 		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:					\n\t"
 		"movlps (%0, %%"REG_S"), %%xmm0		\n\t" // ? ? ? A
 		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? ? C
@@ -1055,7 +1054,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 	asm volatile(
 		"mov $1024, %%"REG_D"			\n\t"  // 1024
 		"xor %%"REG_S", %%"REG_S"		\n\t"  // 0
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:					\n\t"
 		"movlps (%0, %%"REG_S"), %%xmm0	\n\t" // ? ? A ?
 		"movlps 8(%0, %%"REG_S"), %%xmm1	\n\t" // ? ? C ?
diff --git a/libmpcodecs/vf_decimate.c b/libmpcodecs/vf_decimate.c
index 03c05a4af5..614af13e46 100644
--- a/libmpcodecs/vf_decimate.c
+++ b/libmpcodecs/vf_decimate.c
@@ -5,7 +5,6 @@
 #include "config.h"
 #include "mp_msg.h"
 #include "cpudetect.h"
-#include "asmalign.h"
 
 #include "img_format.h"
 #include "mp_image.h"
@@ -29,7 +28,7 @@ static int diff_MMX(unsigned char *old, unsigned char *new, int os, int ns)
 		"pxor %%mm4, %%mm4 \n\t"
 		"pxor %%mm7, %%mm7 \n\t"
 		
-		ASMALIGN16
+		ASMALIGN(4)
 		"1: \n\t"
 		
 		"movq (%%"REG_S"), %%mm0 \n\t"
diff --git a/libmpcodecs/vf_divtc.c b/libmpcodecs/vf_divtc.c
index 2efd7a33be..2b0b4e6377 100644
--- a/libmpcodecs/vf_divtc.c
+++ b/libmpcodecs/vf_divtc.c
@@ -8,7 +8,6 @@
 #include "mp_msg.h"
 #include "cpudetect.h"
 #include "bswap.h"
-#include "asmalign.h"
 
 #include "img_format.h"
 #include "mp_image.h"
@@ -42,7 +41,7 @@ static int diff_MMX(unsigned char *old, unsigned char *new, int os, int ns)
 	"pxor %%mm4, %%mm4 \n\t"
 	"pxor %%mm7, %%mm7 \n\t"
 
-	ASMALIGN16
+	ASMALIGN(4)
 	"1: \n\t"
 
 	"movq (%%"REG_S"), %%mm0 \n\t"
diff --git a/libmpcodecs/vf_eq.c b/libmpcodecs/vf_eq.c
index ae05aede7c..571244f6f4 100644
--- a/libmpcodecs/vf_eq.c
+++ b/libmpcodecs/vf_eq.c
@@ -6,7 +6,6 @@
 #include "config.h"
 #include "mp_msg.h"
 #include "cpudetect.h"
-#include "asmalign.h"
 
 #include "img_format.h"
 #include "mp_image.h"
@@ -51,7 +50,7 @@ static void process_MMX(unsigned char *dest, int dstride, unsigned char *src, in
 			"movq (%6), %%mm4 \n\t"
 			"pxor %%mm0, %%mm0 \n\t"
 			"movl %4, %%eax\n\t"
-                       ASMALIGN16
+			ASMALIGN(4)
 			"1: \n\t"
 			"movq (%0), %%mm1 \n\t"
 			"movq (%0), %%mm2 \n\t"
diff --git a/libmpcodecs/vf_eq2.c b/libmpcodecs/vf_eq2.c
index faac982f75..e4be137582 100644
--- a/libmpcodecs/vf_eq2.c
+++ b/libmpcodecs/vf_eq2.c
@@ -18,7 +18,6 @@
 #include "config.h"
 #include "mp_msg.h"
 #include "cpudetect.h"
-#include "asmalign.h"
 
 #include "img_format.h"
 #include "mp_image.h"
@@ -136,7 +135,7 @@ void affine_1d_MMX (eq2_param_t *par, unsigned char *dst, unsigned char *src,
       "movq (%6), %%mm4 \n\t"
       "pxor %%mm0, %%mm0 \n\t"
       "movl %4, %%eax\n\t"
-      ASMALIGN16
+      ASMALIGN(4)
       "1: \n\t"
       "movq (%0), %%mm1 \n\t"
       "movq (%0), %%mm2 \n\t"
diff --git a/libmpcodecs/vf_fspp.c b/libmpcodecs/vf_fspp.c
index fc475630a2..89b63ad701 100644
--- a/libmpcodecs/vf_fspp.c
+++ b/libmpcodecs/vf_fspp.c
@@ -37,7 +37,6 @@
 #include <math.h>
 
 #include "config.h"
-#include "asmalign.h"
 
 #include "mp_msg.h"
 #include "cpudetect.h"
@@ -884,7 +883,7 @@ static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int
 static void column_fidct_mmx(int16_t* thr_adr,  DCTELEM *data,  DCTELEM *output,  int cnt)
 {
     asm volatile(
-	ASMALIGN16
+	ASMALIGN(4)
 	"1:                   \n\t"
 	"movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
 	//
diff --git a/libmpcodecs/vf_halfpack.c b/libmpcodecs/vf_halfpack.c
index 67da7959fd..12acb89839 100644
--- a/libmpcodecs/vf_halfpack.c
+++ b/libmpcodecs/vf_halfpack.c
@@ -6,7 +6,6 @@
 #include "config.h"
 #include "mp_msg.h"
 #include "cpudetect.h"
-#include "asmalign.h"
 
 #include "img_format.h"
 #include "mp_image.h"
@@ -41,7 +40,7 @@ static void halfpack_MMX(unsigned char *dst, unsigned char *src[3],
 	for (h/=2; h; h--) {
 		asm (
 			"pxor %%mm0, %%mm0 \n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1: \n\t"
 			"movq (%0), %%mm1 \n\t"
 			"movq (%0), %%mm2 \n\t"
diff --git a/libmpcodecs/vf_ilpack.c b/libmpcodecs/vf_ilpack.c
index 3123f9226a..f039a17548 100644
--- a/libmpcodecs/vf_ilpack.c
+++ b/libmpcodecs/vf_ilpack.c
@@ -6,7 +6,6 @@
 #include "config.h"
 #include "mp_msg.h"
 #include "cpudetect.h"
-#include "asmalign.h"
 
 #include "img_format.h"
 #include "mp_image.h"
@@ -66,7 +65,7 @@ static void pack_nn_MMX(unsigned char *dst, unsigned char *y,
 {
 	int j;
 	asm volatile (""
-		ASMALIGN16                
+		ASMALIGN(4)
 		"1: \n\t"
 		"movq (%0), %%mm1 \n\t"
 		"movq (%0), %%mm2 \n\t"
@@ -105,7 +104,7 @@ static void pack_li_0_MMX(unsigned char *dst, unsigned char *y,
 #endif
 		"pxor %%mm0, %%mm0 \n\t"
 		
-		ASMALIGN16 
+		ASMALIGN(4)
 		".Lli0: \n\t"
 		"movq (%%"REG_S"), %%mm1 \n\t"
 		"movq (%%"REG_S"), %%mm2 \n\t"
@@ -213,7 +212,7 @@ static void pack_li_1_MMX(unsigned char *dst, unsigned char *y,
 #endif
 		"pxor %%mm0, %%mm0 \n\t"
 		
-		ASMALIGN16  
+		ASMALIGN(4)
 		".Lli1: \n\t"
 		"movq (%%"REG_S"), %%mm1 \n\t"
 		"movq (%%"REG_S"), %%mm2 \n\t"
diff --git a/libmpcodecs/vf_ivtc.c b/libmpcodecs/vf_ivtc.c
index 50cabe0ee1..9c30a02477 100644
--- a/libmpcodecs/vf_ivtc.c
+++ b/libmpcodecs/vf_ivtc.c
@@ -5,7 +5,6 @@
 #include "config.h"
 #include "mp_msg.h"
 #include "cpudetect.h"
-#include "asmalign.h"
 
 #include "img_format.h"
 #include "mp_image.h"
@@ -68,7 +67,7 @@ static void block_diffs_MMX(struct metrics *m, unsigned char *old, unsigned char
 		"pxor %%mm5, %%mm5 \n\t" // 4 odd difference sums
 		"pxor %%mm7, %%mm7 \n\t" // all zeros
 		
-		ASMALIGN16  
+		ASMALIGN(4)
 		"1: \n\t"
 		
 		// Even difference
@@ -128,7 +127,7 @@ static void block_diffs_MMX(struct metrics *m, unsigned char *old, unsigned char
 		"pxor %%mm5, %%mm5 \n\t" // Temporal noise
 		"pxor %%mm6, %%mm6 \n\t" // Current spacial noise
 		
-		ASMALIGN16  
+		ASMALIGN(4)
 		"2: \n\t"
 		
 		"movq (%%"REG_S"), %%mm0 \n\t"
@@ -182,7 +181,7 @@ static void block_diffs_MMX(struct metrics *m, unsigned char *old, unsigned char
 		"pxor %%mm5, %%mm5 \n\t"
 		"pxor %%mm6, %%mm6 \n\t"
 		
-		ASMALIGN16
+		ASMALIGN(4)
 		"3: \n\t"
 		
 		"movq (%%"REG_S"), %%mm0 \n\t"
diff --git a/libmpcodecs/vf_noise.c b/libmpcodecs/vf_noise.c
index 33c344ec06..d430ef3281 100644
--- a/libmpcodecs/vf_noise.c
+++ b/libmpcodecs/vf_noise.c
@@ -25,7 +25,6 @@
 #include "config.h"
 #include "mp_msg.h"
 #include "cpudetect.h"
-#include "asmalign.h"
 
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
@@ -154,7 +153,7 @@ static inline void lineNoise_MMX(uint8_t *dst, uint8_t *src, int8_t *noise, int
 		"pcmpeqb %%mm7, %%mm7		\n\t"
 		"psllw $15, %%mm7		\n\t"
 		"packsswb %%mm7, %%mm7		\n\t"
-		ASMALIGN16   
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movq (%0, %%"REG_a"), %%mm0	\n\t"
 		"movq (%1, %%"REG_a"), %%mm1	\n\t"
@@ -183,7 +182,7 @@ static inline void lineNoise_MMX2(uint8_t *dst, uint8_t *src, int8_t *noise, int
 		"pcmpeqb %%mm7, %%mm7		\n\t"
 		"psllw $15, %%mm7		\n\t"
 		"packsswb %%mm7, %%mm7		\n\t"
-		ASMALIGN16  
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movq (%0, %%"REG_a"), %%mm0	\n\t"
 		"movq (%1, %%"REG_a"), %%mm1	\n\t"
@@ -221,7 +220,7 @@ static inline void lineNoiseAvg_MMX(uint8_t *dst, uint8_t *src, int len, int8_t
 
 	asm volatile(
 		"mov %5, %%"REG_a"		\n\t"
-		ASMALIGN16   
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movq (%1, %%"REG_a"), %%mm1	\n\t"
 		"movq (%0, %%"REG_a"), %%mm0	\n\t"
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index 807da6166f..bd8b9e065a 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -12,8 +12,6 @@
 #include <stddef.h>
 #include <inttypes.h> /* for __WORDSIZE */
 
-#include "asmalign.h"
-
 #ifndef __WORDSIZE
 // #warning You have misconfigured system and probably will lose performance!
 #define __WORDSIZE MP_WORDSIZE
@@ -343,7 +341,7 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_
 		"movq %3, %%mm5			\n\t"
 		"movq %4, %%mm6			\n\t"
 		"movq %5, %%mm7			\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		PREFETCH" 32(%1)		\n\t"
 		"movd	(%1), %%mm0		\n\t"
@@ -500,7 +498,7 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_
 		"movq %3, %%mm5			\n\t"
 		"movq %4, %%mm6			\n\t"
 		"movq %5, %%mm7			\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		PREFETCH" 32(%1)		\n\t"
 		"movd	(%1), %%mm0		\n\t"
@@ -1355,7 +1353,7 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long s
 /* TODO: unroll this loop */
 	asm volatile (
 		"xor %%"REG_a", %%"REG_a"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		PREFETCH" 32(%0, %%"REG_a")	\n\t"
 		"movq (%0, %%"REG_a"), %%mm0	\n\t"
@@ -1405,7 +1403,7 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long s
 		"movq "MANGLE(mask24r)", %%mm5	\n\t"
 		"movq "MANGLE(mask24g)", %%mm6	\n\t"
 		"movq "MANGLE(mask24b)", %%mm7	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		PREFETCH" 32(%1, %%"REG_a")	\n\t"
 		"movq   (%1, %%"REG_a"), %%mm0	\n\t" // BGR BGR BG
@@ -1475,7 +1473,7 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
 		asm volatile(
 			"xor %%"REG_a", %%"REG_a"	\n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			PREFETCH" 32(%1, %%"REG_a", 2)	\n\t"
 			PREFETCH" 32(%2, %%"REG_a")	\n\t"
@@ -1628,7 +1626,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
 		asm volatile(
 			"xor %%"REG_a", %%"REG_a"	\n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			PREFETCH" 32(%1, %%"REG_a", 2)	\n\t"
 			PREFETCH" 32(%2, %%"REG_a")	\n\t"
@@ -1752,7 +1750,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 			"xor %%"REG_a", %%"REG_a"	\n\t"
 			"pcmpeqw %%mm7, %%mm7		\n\t"
 			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			PREFETCH" 64(%0, %%"REG_a", 4)	\n\t"
 			"movq (%0, %%"REG_a", 4), %%mm0	\n\t" // YUYV YUYV(0)
@@ -1805,7 +1803,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 
 		asm volatile(
 			"xor %%"REG_a", %%"REG_a"	\n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			PREFETCH" 64(%0, %%"REG_a", 4)	\n\t"
 			"movq (%0, %%"REG_a", 4), %%mm0	\n\t" // YUYV YUYV(0)
@@ -1990,7 +1988,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 			"xorl %%eax, %%eax		\n\t"
 			"pcmpeqw %%mm7, %%mm7		\n\t"
 			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			PREFETCH" 64(%0, %%eax, 4)	\n\t"
 			"movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
@@ -2043,7 +2041,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
 
 		asm volatile(
 			"xorl %%eax, %%eax		\n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			PREFETCH" 64(%0, %%eax, 4)	\n\t"
 			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
@@ -2121,7 +2119,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
 				"movq "MANGLE(w1111)", %%mm5		\n\t"
 				"pxor %%mm7, %%mm7		\n\t"
 				"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
-				ASMALIGN16
+				ASMALIGN(4)
 				"1:				\n\t"
 				PREFETCH" 64(%0, %%"REG_b")	\n\t"
 				"movd (%0, %%"REG_b"), %%mm0	\n\t"
@@ -2195,7 +2193,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
 			"pxor %%mm7, %%mm7		\n\t"
 			"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
 			"add %%"REG_b", %%"REG_b"	\n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			PREFETCH" 64(%0, %%"REG_b")	\n\t"
 			PREFETCH" 64(%1, %%"REG_b")	\n\t"
diff --git a/libswscale/swscale_template.c b/libswscale/swscale_template.c
index 8588820a64..3bf20f9a23 100644
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
@@ -16,8 +16,6 @@
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
 
-#include "asmalign.h"
-
 #undef REAL_MOVNTQ
 #undef MOVNTQ
 #undef PAVGB
@@ -74,7 +72,7 @@
 			"movq %%mm3, %%mm4		\n\t"\
 			"lea " offset "(%0), %%"REG_d"	\n\t"\
 			"mov (%%"REG_d"), %%"REG_S"	\n\t"\
-			ASMALIGN16 /* FIXME Unroll? */\
+			ASMALIGN(4) /* FIXME Unroll? */\
 			"1:				\n\t"\
 			"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
 			"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
@@ -112,7 +110,7 @@
                         "pxor %%mm6, %%mm6              \n\t"\
                         "pxor %%mm7, %%mm7              \n\t"\
 			"mov (%%"REG_d"), %%"REG_S"	\n\t"\
-			ASMALIGN16 \
+			ASMALIGN(4) \
 			"1:				\n\t"\
 			"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
 			"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
@@ -167,7 +165,7 @@
 
 #define YSCALEYUV2YV121 \
 			"mov %2, %%"REG_a"		\n\t"\
-			ASMALIGN16 /* FIXME Unroll? */\
+			ASMALIGN(4) /* FIXME Unroll? */\
 			"1:				\n\t"\
 			"movq (%0, %%"REG_a", 2), %%mm0	\n\t"\
 			"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
@@ -188,14 +186,14 @@
 #define YSCALEYUV2PACKEDX \
 	asm volatile(\
 		"xor %%"REG_a", %%"REG_a"	\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"nop				\n\t"\
 		"1:				\n\t"\
 		"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
 		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
 		"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
 		"movq %%mm3, %%mm4		\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"2:				\n\t"\
 		"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
 		"movq (%%"REG_S", %%"REG_a"), %%mm2	\n\t" /* UsrcData */\
@@ -213,7 +211,7 @@
 		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
 		"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
 		"movq %%mm1, %%mm7		\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"2:				\n\t"\
 		"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
 		"movq (%%"REG_S", %%"REG_a", 2), %%mm2	\n\t" /* Y1srcData */\
@@ -237,7 +235,7 @@
 #define YSCALEYUV2PACKEDX_ACCURATE \
 	asm volatile(\
 		"xor %%"REG_a", %%"REG_a"	\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"nop				\n\t"\
 		"1:				\n\t"\
 		"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
@@ -246,7 +244,7 @@
                 "pxor %%mm5, %%mm5              \n\t"\
                 "pxor %%mm6, %%mm6              \n\t"\
                 "pxor %%mm7, %%mm7              \n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"2:				\n\t"\
 		"movq (%%"REG_S", %%"REG_a"), %%mm0	\n\t" /* UsrcData */\
 		"movq 4096(%%"REG_S", %%"REG_a"), %%mm2	\n\t" /* VsrcData */\
@@ -290,7 +288,7 @@
                 "pxor %%mm5, %%mm5              \n\t"\
                 "pxor %%mm7, %%mm7              \n\t"\
                 "pxor %%mm6, %%mm6              \n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"2:				\n\t"\
 		"movq (%%"REG_S", %%"REG_a", 2), %%mm0	\n\t" /* Y1srcData */\
 		"movq 8(%%"REG_S", %%"REG_a", 2), %%mm2	\n\t" /* Y2srcData */\
@@ -374,7 +372,7 @@
 		"punpcklwd %%mm5, %%mm5		\n\t"\
 		"punpcklwd %%mm5, %%mm5		\n\t"\
 		"xor %%"REG_a", %%"REG_a"		\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"1:				\n\t"\
 		"movq (%0, %%"REG_a", 2), %%mm0	\n\t" /*buf0[eax]*/\
 		"movq (%1, %%"REG_a", 2), %%mm1	\n\t" /*buf1[eax]*/\
@@ -427,7 +425,7 @@
 		"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
 		"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
 		"xor "#index", "#index"		\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"1:				\n\t"\
 		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
 		"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
@@ -459,7 +457,7 @@
                 
 #define REAL_YSCALEYUV2RGB(index, c) \
 		"xor "#index", "#index"	\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"1:				\n\t"\
 		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
 		"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
@@ -525,7 +523,7 @@
                 
 #define REAL_YSCALEYUV2PACKED1(index, c) \
 		"xor "#index", "#index"		\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"1:				\n\t"\
 		"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
 		"movq 4096(%2, "#index"), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
@@ -540,7 +538,7 @@
                 
 #define REAL_YSCALEYUV2RGB1(index, c) \
 		"xor "#index", "#index"	\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"1:				\n\t"\
 		"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
 		"movq 4096(%2, "#index"), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
@@ -589,7 +587,7 @@
 
 #define REAL_YSCALEYUV2PACKED1b(index, c) \
 		"xor "#index", "#index"		\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"1:				\n\t"\
 		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
 		"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
@@ -608,7 +606,7 @@
 // do vertical chrominance interpolation
 #define REAL_YSCALEYUV2RGB1b(index, c) \
 		"xor "#index", "#index"		\n\t"\
-		ASMALIGN16\
+		ASMALIGN(4)\
 		"1:				\n\t"\
 		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
 		"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
@@ -1868,7 +1866,7 @@ static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
 		"movq "MANGLE(w1111)", %%mm5		\n\t"
 		"pxor %%mm7, %%mm7		\n\t"
 		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		PREFETCH" 64(%0, %%"REG_b")	\n\t"
 		"movd (%0, %%"REG_b"), %%mm0	\n\t"
@@ -1954,7 +1952,7 @@ static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1
 		"pxor %%mm7, %%mm7		\n\t"
 		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"	\n\t"
 		"add %%"REG_b", %%"REG_b"	\n\t"
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		PREFETCH" 64(%0, %%"REG_b")	\n\t"
 		PREFETCH" 64(%1, %%"REG_b")	\n\t"
@@ -2261,7 +2259,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
 			"movq "MANGLE(w02)", %%mm6	\n\t"
 			"push %%"REG_BP"		\n\t" // we use 7 regs here ...
 			"mov %%"REG_a", %%"REG_BP"	\n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			"movzwl (%2, %%"REG_BP"), %%eax	\n\t"
 			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
@@ -2299,7 +2297,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
 			"movq "MANGLE(w02)", %%mm6	\n\t"
 			"push %%"REG_BP"		\n\t" // we use 7 regs here ...
 			"mov %%"REG_a", %%"REG_BP"	\n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			"movzwl (%2, %%"REG_BP"), %%eax	\n\t"
 			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
@@ -2348,7 +2346,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
 		asm volatile(
 			"pxor %%mm7, %%mm7		\n\t"
 			"movq "MANGLE(w02)", %%mm6	\n\t"
-			ASMALIGN16
+			ASMALIGN(4)
 			"1:				\n\t"
 			"mov %2, %%"REG_c"		\n\t"
 			"movzwl (%%"REG_c", %0), %%eax	\n\t"
@@ -2532,7 +2530,7 @@ FUNNY_Y_CODE
 		"xor %%"REG_a", %%"REG_a"	\n\t" // i
 		"xor %%"REG_b", %%"REG_b"	\n\t" // xx
 		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"movzbl  (%0, %%"REG_b"), %%edi	\n\t" //src[xx]
 		"movzbl 1(%0, %%"REG_b"), %%esi	\n\t" //src[xx+1]
@@ -2729,7 +2727,7 @@ FUNNY_UV_CODE
 		"xor %%"REG_a", %%"REG_a"	\n\t" // i
 		"xor %%"REG_b", %%"REG_b"		\n\t" // xx
 		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
-		ASMALIGN16
+		ASMALIGN(4)
 		"1:				\n\t"
 		"mov %0, %%"REG_S"		\n\t"
 		"movzbl  (%%"REG_S", %%"REG_b"), %%edi	\n\t" //src[xx]