From 3c55ce039d5b90afad35e19fc8ca6d147dd3f976 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Rullg=C3=A5rd?=
Date: Sat, 18 Apr 2009 00:00:28 +0000
Subject: [PATCH] ARM asm for AV_RN*()

ARMv6 and later support unaligned loads and stores for single
word/halfword but not double/multiple.  GCC is ignorant of this and
will always use bytewise accesses for unaligned data.

Casting to an int32_t pointer is dangerous since a load/store double
or multiple instruction might be used (this happens with some code in
FFmpeg).  Implementing the AV_[RW]* macros with inline asm using only
supported instructions gives fast and safe unaligned accesses.

ARM RVCT does the right thing with generic code.

This gives an overall speedup of up to 10%.

Originally committed as revision 18601 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavutil/arm/intreadwrite.h | 78 ++++++++++++++++++++++++++++++++++++
 libavutil/intreadwrite.h     |  3 ++
 2 files changed, 81 insertions(+)
 create mode 100644 libavutil/arm/intreadwrite.h

diff --git a/libavutil/arm/intreadwrite.h b/libavutil/arm/intreadwrite.h
new file mode 100644
index 0000000000..de2e5532e6
--- /dev/null
+++ b/libavutil/arm/intreadwrite.h
@@ -0,0 +1,78 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_ARM_INTREADWRITE_H
+#define AVUTIL_ARM_INTREADWRITE_H
+
+#include <stdint.h>
+#include "config.h"
+
+#if HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM
+
+#define AV_RN16 AV_RN16
+static inline uint16_t AV_RN16(const void *p)
+{
+    uint16_t v;
+    __asm__ ("ldrh %0, %1" : "=r"(v) : "m"(*(const uint16_t *)p));
+    return v;
+}
+
+#define AV_WN16 AV_WN16
+static inline void AV_WN16(void *p, uint16_t v)
+{
+    __asm__ ("strh %1, %0" : "=m"(*(uint16_t *)p) : "r"(v));
+}
+
+#define AV_RN32 AV_RN32
+static inline uint32_t AV_RN32(const void *p)
+{
+    uint32_t v;
+    __asm__ ("ldr %0, %1" : "=r"(v) : "m"(*(const uint32_t *)p));
+    return v;
+}
+
+#define AV_WN32 AV_WN32
+static inline void AV_WN32(void *p, uint32_t v)
+{
+    __asm__ ("str %1, %0" : "=m"(*(uint32_t *)p) : "r"(v));
+}
+
+#define AV_RN64 AV_RN64
+static inline uint64_t AV_RN64(const void *p)
+{
+    union { uint64_t v; uint32_t hl[2]; } v;
+    __asm__ ("ldr %0, %2 \n\t"
+             "ldr %1, %3 \n\t"
+             : "=r"(v.hl[0]), "=r"(v.hl[1])
+             : "m"(*(const uint32_t*)p), "m"(*((const uint32_t*)p+1)));
+    return v.v;
+}
+
+#define AV_WN64 AV_WN64
+static inline void AV_WN64(void *p, uint64_t v)
+{
+    union { uint64_t v; uint32_t hl[2]; } vv = { v };
+    __asm__ ("str %2, %0 \n\t"
+             "str %3, %1 \n\t"
+             : "=m"(*(uint32_t*)p), "=m"(*((uint32_t*)p+1))
+             : "r"(vv.hl[0]), "r"(vv.hl[1]));
+}
+
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVUTIL_ARM_INTREADWRITE_H */
diff --git a/libavutil/intreadwrite.h b/libavutil/intreadwrite.h
index b1c5c2acbd..42fb890a0e 100644
--- a/libavutil/intreadwrite.h
+++ b/libavutil/intreadwrite.h
@@ -29,6 +29,9 @@
  * defined, even if these are implemented as inline functions.
  */
 
+#if ARCH_ARM
+#   include "arm/intreadwrite.h"
+#endif
 
 /*
  * Define AV_[RW]N helper macros to simplify definitions not provided
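
A minimal caller-side sketch of how the AV_RN*/AV_WN* macros above are meant
to be used (not part of the patch; the function names and byte-stream layout
are invented for illustration, and the include path assumes an FFmpeg source
tree):

#include <stdint.h>
#include "libavutil/intreadwrite.h"

/* Read a 32-bit field that may start at any byte offset.  With this patch,
 * AV_RN32() compiles to a single ldr on ARMv6 and later, which tolerates
 * unaligned addresses; a plain *(const uint32_t *)(buf + offset) cast could
 * let GCC emit ldrd/ldm, which require word alignment. */
uint32_t read_field32(const uint8_t *buf, int offset)
{
    return AV_RN32(buf + offset);
}

/* Store a 16-bit value at an arbitrary offset; strh handles the unaligned
 * case on ARMv6 and later. */
void write_field16(uint8_t *buf, int offset, uint16_t v)
{
    AV_WN16(buf + offset, v);
}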