overhaul sh atomics for new atomics framework, add j-core cas.l backend

sh needs runtime-selected atomic backends since there are a number of supported models that use non-forwards-compatible (non-smp-compatible) atomic mechanisms. previously, the code paths for this were highly inefficient since they involved C function calls with multiple branches in the callee and heavy spills in the caller. the new code calls the runtime-selected asm fragment from inline asm with extremely minimal clobbers, rather than using a function call. for the sh4a case where the atomic mechanism is known and there is no forward-compatibility issue, the movli.l and movco.l instructions are provided as a_ll and a_sc, allowing the new shared atomic.h to generate efficient inline versions of all the basic atomic operations without needing a cas loop.
2016-01-21 19:28:15 +00:00 · 2016-01-21 19:28:15 +00:00 · 61b1e75f7d
parent 1315596b51
commit 61b1e75f7d
6 changed files with 111 additions and 263 deletions
--- a/arch/sh/atomic_arch.h
+++ b/arch/sh/atomic_arch.h
@ -1,96 +1,46 @@
-#define LLSC_CLOBBERS "r0", "t", "memory"
+#if defined(__SH4A__)
 #define LLSC_START(mem) "synco\n"  \
 	"0:	movli.l @" mem ", r0\n"
 #define LLSC_END(mem)              \
 	"1:	movco.l r0, @" mem "\n"    \
 	"	bf 0b\n"                   \
 	"	synco\n"
-static inline int __sh_cas_llsc(volatile int *p, int t, int s)
+#define a_ll a_ll
 static inline int a_ll(volatile int *p)
 {
-	int old;
+	int v;
-	__asm__ __volatile__(
+	__asm__ __volatile__ ("movli.l @%1, %0" : "=z"(v) : "r"(p), "m"(*p));
-		LLSC_START("%1")
+	return v;
 		"	mov r0, %0\n"
 		"	cmp/eq %0, %2\n"
 		"	bf 1f\n"
 		"	mov %3, r0\n"
 		LLSC_END("%1")
 		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
 	return old;
 }
-static inline int __sh_swap_llsc(volatile int *x, int v)
+#define a_sc a_sc
 static inline int a_sc(volatile int *p, int v)
 {
-	int old;
+	int r;
 	__asm__ __volatile__ (
-		LLSC_START("%1")
+		"movco.l %2, @%3 ; movt %0"
-		"	mov r0, %0\n"
+		: "=r"(r), "=m"(*p) : "z"(v), "r"(p) : "memory", "cc");
-		"	mov %2, r0\n"
+	return r;
 		LLSC_END("%1")
 		: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
 	return old;
 }
-static inline int __sh_fetch_add_llsc(volatile int *x, int v)
+#define a_barrier a_barrier
 static inline void a_barrier()
 {
-	int old;
+	__asm__ __volatile__ ("synco" : : "memory");
 	__asm__ __volatile__(
 		LLSC_START("%1")
 		"	mov r0, %0\n"
 		"	add %2, r0\n"
 		LLSC_END("%1")
 		: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
 	return old;
 }
-static inline void __sh_store_llsc(volatile int *p, int x)
+#define a_pre_llsc a_barrier
-{
+#define a_post_llsc a_barrier
 	__asm__ __volatile__(
 		"	synco\n"
 		"	mov.l %1, @%0\n"
 		"	synco\n"
 		: : "r"(p), "r"(x) : "memory");
 }
 static inline void __sh_and_llsc(volatile int *x, int v)
 {
 	__asm__ __volatile__(
 		LLSC_START("%0")
 		"	and %1, r0\n"
 		LLSC_END("%0")
 		: : "r"(x), "r"(v) : LLSC_CLOBBERS);
 }
 static inline void __sh_or_llsc(volatile int *x, int v)
 {
 	__asm__ __volatile__(
 		LLSC_START("%0")
 		"	or %1, r0\n"
 		LLSC_END("%0")
 		: : "r"(x), "r"(v) : LLSC_CLOBBERS);
 }
 #ifdef __SH4A__
 #define a_cas(p,t,s)     __sh_cas_llsc(p,t,s)
 #define a_swap(x,v)      __sh_swap_llsc(x,v)
 #define a_fetch_add(x,v) __sh_fetch_add_llsc(x, v)
 #define a_store(x,v)     __sh_store_llsc(x, v)
 #define a_and(x,v)       __sh_and_llsc(x, v)
 #define a_or(x,v)        __sh_or_llsc(x, v)
 #else
-int  __sh_cas(volatile int *, int, int);
+#define a_cas a_cas
-int  __sh_swap(volatile int *, int);
+__attribute__((__visibility__("hidden"))) extern const void *__sh_cas_ptr;
-int  __sh_fetch_add(volatile int *, int);
+static inline int a_cas(volatile int *p, int t, int s)
-void __sh_store(volatile int *, int);
+{
-void __sh_and(volatile int *, int);
+	register int r1 __asm__("r1");
-void __sh_or(volatile int *, int);
+	register int r2 __asm__("r2") = t;
 	register int r3 __asm__("r3") = s;
 	__asm__ __volatile__ (
 		"jsr @%4 ; nop"
 		: "=r"(r1), "+r"(r3) : "z"(p), "r"(r2), "r"(__sh_cas_ptr)
 		: "memory", "pr", "cc");
 	return r3;
 }
 #define a_cas(p,t,s)     __sh_cas(p,t,s)
 #define a_swap(x,v)      __sh_swap(x,v)
 #define a_fetch_add(x,v) __sh_fetch_add(x, v)
 #define a_store(x,v)     __sh_store(x, v)
 #define a_and(x,v)       __sh_and(x, v)
 #define a_or(x,v)        __sh_or(x, v)
 #endif
--- a/arch/sh/src/atomic.c
+++ b/arch/sh/src/atomic.c
@ -1,158 +0,0 @@
 #ifndef __SH4A__
 #include "sh_atomic.h"
 #include "atomic.h"
 #include "libc.h"
 /* Read the status register (SR), then write it back with bits 0xf0 set.
  * Returns the SR value as it was before the modification, so the caller
  * can hand it to unmask() afterwards.
  * NOTE(review): bits 0xf0 are presumably the interrupt mask level field of
  * SR -- confirm against the SH manual. */
 static inline unsigned mask()
 {
 	unsigned sr;
 	/* r0 serves as the scratch register (hence the "r0" clobber); the
 	 * unmodified SR copy is moved into the output operand before the
 	 * or/ldc pair changes SR. */
 	__asm__ __volatile__ ( "\n"
 	"	stc sr,r0 \n"
 	"	mov r0,%0 \n"
 	"	or #0xf0,r0 \n"
 	"	ldc r0,sr \n"
 	: "=&r"(sr) : : "memory", "r0" );
 	return sr;
 }
 /* Load `sr` into the status register. In this file it is always called
  * with the value previously returned by mask(), undoing that call. */
 static inline void unmask(unsigned sr)
 {
 	__asm__ __volatile__ ( "ldc %0,sr" : : "r"(sr) : "memory" );
 }
 /* gusa is a hack in the kernel which lets you create a sequence of instructions
 * which will be restarted if the process is preempted in the middle of the
 * sequence. It will do for implementing atomics on non-smp systems. ABI is:
 * r0  = address of first instruction after the atomic sequence
 * r1  = original stack pointer
 * r15 = -1 * length of atomic sequence in bytes
 */
 /* Clobber list shared by every gusa sequence: r0 and r1 are overwritten by
 * GUSA_START, and "memory" keeps the compiler from caching *mem across it. */
 #define GUSA_CLOBBERS   "r0", "r1", "memory"
 /* Opens a gusa sequence. Emits, in order: alignment, r0 = address of label 1
 * (the end of the sequence), the optional `nop` string, r1 = saved stack
 * pointer, r15 = (0f-1f) i.e. the negated sequence length, and finally at
 * label 0 a load of the word at `mem` into `old`.
 * mem/old are asm operand strings such as "%1"; nop is "" or an instruction. */
 #define GUSA_START(mem,old,nop)    \
 	"	.align 2\n"                \
 	"	mova 1f, r0\n"             \
 	nop                            \
 	"	mov r15, r1\n"             \
 	"	mov #(0f-1f), r15\n"       \
 	"0:	mov.l @" mem ", " old "\n"
 /* the target of mova must be 4 byte aligned, so we may need a nop */
 /* _ODD adds no padding, _EVEN inserts one nop after the mova; callers pick
 * the variant according to how many instructions their sequence contains. */
 #define GUSA_START_ODD(mem,old)  GUSA_START(mem,old,"")
 #define GUSA_START_EVEN(mem,old) GUSA_START(mem,old,"\tnop\n")
 /* Closes a gusa sequence: stores `new` to the word at `mem` as the last
 * instruction of the sequence, then at label 1 restores the stack pointer
 * from r1. */
 #define GUSA_END(mem,new)          \
 	"	mov.l " new ", @" mem "\n" \
 	"1:	mov r1, r15\n"
 /* Compare-and-swap on *p, dispatching on __sh_atomic_model at run time.
  *   p: word to operate on
  *   t: value *p is expected to hold
  *   s: value to store if *p == t
  * Returns the value read from *p; the store took place iff that equals t. */
 int __sh_cas(volatile int *p, int t, int s)
 {
 	/* LL/SC model: delegate to the inline LL/SC helper. */
 	if (__sh_atomic_model == SH_A_LLSC) return __sh_cas_llsc(p, t, s);
 	/* IMASK model: do the read/compare/write in plain C, bracketed by
 	 * mask()/unmask() which modify and then restore SR. */
 	if (__sh_atomic_model == SH_A_IMASK) {
 		unsigned sr = mask();
 		int old = *p;
 		if (old==t) *p = s;
 		unmask(sr);
 		return old;
 	}
 	/* Any other model: gusa sequence. GUSA_START loads *p into old; if it
 	 * differs from t, "bf 1f" jumps to the end label and skips the store
 	 * emitted by GUSA_END. "t" is listed as clobbered alongside the gusa
 	 * registers because of the cmp/eq. */
 	int old;
 	__asm__ __volatile__(
 		GUSA_START_EVEN("%1", "%0")
 		"	cmp/eq %0, %2\n"
 		"	bf 1f\n"
 		GUSA_END("%1", "%3")
 		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
 	return old;
 }
 /* Exchange: store v into *x and return the value *x held before, using the
  * backend selected by __sh_atomic_model.
  *   x: word to operate on
  *   v: new value
  * Returns the previous contents of *x. */
 int __sh_swap(volatile int *x, int v)
 {
 	/* LL/SC model: delegate to the inline LL/SC helper. */
 	if (__sh_atomic_model == SH_A_LLSC) return __sh_swap_llsc(x, v);
 	/* IMASK model: plain C read + write between mask() and unmask(). */
 	if (__sh_atomic_model == SH_A_IMASK) {
 		unsigned sr = mask();
 		int old = *x;
 		*x = v;
 		unmask(sr);
 		return old;
 	}
 	/* Any other model: gusa sequence consisting only of the load emitted
 	 * by GUSA_START and the store emitted by GUSA_END. */
 	int old;
 	__asm__ __volatile__(
 		GUSA_START_EVEN("%1", "%0")
 		GUSA_END("%1", "%2")
 		: "=&r"(old) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 	return old;
 }
 /* Fetch-and-add: add v to *x and return the value *x held before the
  * addition, using the backend selected by __sh_atomic_model.
  *   x: word to operate on
  *   v: addend
  * Returns the previous contents of *x. */
 int __sh_fetch_add(volatile int *x, int v)
 {
 	/* LL/SC model: delegate to the inline LL/SC helper. */
 	if (__sh_atomic_model == SH_A_LLSC) return __sh_fetch_add_llsc(x, v);
 	/* IMASK model: plain C read-modify-write between mask() and unmask(). */
 	if (__sh_atomic_model == SH_A_IMASK) {
 		unsigned sr = mask();
 		int old = *x;
 		*x = old + v;
 		unmask(sr);
 		return old;
 	}
 	/* Any other model: gusa sequence. `old` receives the loaded value and
 	 * is what gets returned; `dummy` is a scratch register holding old+v,
 	 * which GUSA_END stores back. */
 	int old, dummy;
 	__asm__ __volatile__(
 		GUSA_START_EVEN("%2", "%0")
 		"	mov %0, %1\n"
 		"	add %3, %1\n"
 		GUSA_END("%2", "%1")
 		: "=&r"(old), "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 	return old;
 }
 /* Store x into *p.
  *   p: destination word
  *   x: value to store
  * Under the LL/SC model the store is delegated to __sh_store_llsc; for
  * every other model it is a single mov.l with a "memory" clobber and no
  * further instructions around it. */
 void __sh_store(volatile int *p, int x)
 {
 	if (__sh_atomic_model == SH_A_LLSC) return __sh_store_llsc(p, x);
 	__asm__ __volatile__(
 		"	mov.l %1, @%0\n"
 		: : "r"(p), "r"(x) : "memory");
 }
 /* Bitwise AND v into *x (i.e. *x &= v) using the backend selected by
  * __sh_atomic_model. Nothing is returned.
  *   x: word to operate on
  *   v: mask to AND in */
 void __sh_and(volatile int *x, int v)
 {
 	/* LL/SC model: delegate to the inline LL/SC helper. */
 	if (__sh_atomic_model == SH_A_LLSC) return __sh_and_llsc(x, v);
 	/* IMASK model: plain C read-modify-write between mask() and unmask(). */
 	if (__sh_atomic_model == SH_A_IMASK) {
 		unsigned sr = mask();
 		int old = *x;
 		*x = old & v;
 		unmask(sr);
 		return;
 	}
 	/* Any other model: gusa sequence. `dummy` is only a scratch register:
 	 * loaded by GUSA_START, ANDed with v, stored back by GUSA_END. The
 	 * _ODD start variant (no nop) is used for this sequence. */
 	int dummy;
 	__asm__ __volatile__(
 		GUSA_START_ODD("%1", "%0")
 		"	and %2, %0\n"
 		GUSA_END("%1", "%0")
 		: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 }
 /* Bitwise OR v into *x (i.e. *x |= v) using the backend selected by
  * __sh_atomic_model. Nothing is returned.
  *   x: word to operate on
  *   v: bits to OR in */
 void __sh_or(volatile int *x, int v)
 {
 	/* LL/SC model: delegate to the inline LL/SC helper. */
 	if (__sh_atomic_model == SH_A_LLSC) return __sh_or_llsc(x, v);
 	/* IMASK model: plain C read-modify-write between mask() and unmask(). */
 	if (__sh_atomic_model == SH_A_IMASK) {
 		unsigned sr = mask();
 		int old = *x;
 		*x = old | v;
 		unmask(sr);
 		return;
 	}
 	/* Any other model: gusa sequence. `dummy` is only a scratch register:
 	 * loaded by GUSA_START, ORed with v, stored back by GUSA_END. The
 	 * _ODD start variant (no nop) is used for this sequence. */
 	int dummy;
 	__asm__ __volatile__(
 		GUSA_START_ODD("%1", "%0")
 		"	or %2, %0\n"
 		GUSA_END("%1", "%0")
 		: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 }
 #endif
--- a/arch/sh/src/sh_atomic.h
+++ b/arch/sh/src/sh_atomic.h
@ -1,15 +0,0 @@
 #ifndef _SH_ATOMIC_H
 #define _SH_ATOMIC_H
 /* Identifiers for the atomic backends; these are the values compared
  * against __sh_atomic_model. SH_A_GUSA is 0. */
 #define SH_A_GUSA 0
 #define SH_A_LLSC 1
 #define SH_A_CAS 2
 /* SH_A_IMASK is a real model number only when neither __SH3__ nor __SH4__
  * is defined. Otherwise it becomes a long long -1, which can never compare
  * equal to the unsigned model variable, so tests against it are always
  * false. */
 #if !defined(__SH3__) && !defined(__SH4__)
 #define SH_A_IMASK 3
 #else
 #define SH_A_IMASK -1LL /* unmatchable by unsigned int */
 #endif
 /* The backend currently in use; declared here with hidden visibility. */
 extern __attribute__((__visibility__("hidden"))) unsigned __sh_atomic_model;
 #endif
--- a/src/thread/sh/__set_thread_area.c
+++ b/src/thread/sh/__set_thread_area.c
@ -1,34 +1,40 @@
 #include "pthread_impl.h"
 #include "libc.h"
 #include "sh_atomic.h"
 #include <elf.h>
 /* Also perform sh-specific init */
 #define CPU_HAS_LLSC 0x0040
 #define CPU_HAS_CAS_L 0x0400
-__attribute__((__visibility__("hidden"))) unsigned __sh_atomic_model, __sh_nommu;
+__attribute__((__visibility__("hidden")))
 extern const char __sh_cas_gusa[], __sh_cas_llsc[], __sh_cas_imask[], __sh_cas_cas_l[];
 __attribute__((__visibility__("hidden")))
 const void *__sh_cas_ptr;
 __attribute__((__visibility__("hidden")))
 unsigned __sh_nommu;
 int __set_thread_area(void *p)
 {
 	size_t *aux;
 	__asm__ __volatile__ ( "ldc %0, gbr" : : "r"(p) : "memory" );
 #ifndef __SH4A__
-	if (__hwcap & CPU_HAS_LLSC) {
+	__sh_cas_ptr = __sh_cas_gusa;
 		__sh_atomic_model = SH_A_LLSC;
 		return 0;
 	}
 #if !defined(__SH3__) && !defined(__SH4__)
 	for (aux=libc.auxv; *aux; aux+=2) {
 		if (*aux != AT_PLATFORM) continue;
 		const char *s = (void *)aux[1];
 		if (s[0]!='s' || s[1]!='h' || s[2]!='2' || s[3]-'0'<10u) break;
-		__sh_atomic_model = SH_A_IMASK;
+		__sh_cas_ptr = __sh_cas_imask;
 		__sh_nommu = 1;
 		return 0;
 	}
 #endif
-	/* __sh_atomic_model = SH_A_GUSA; */ /* 0, default */
+	if (__hwcap & CPU_HAS_CAS_L)
 		__sh_cas_ptr = __sh_cas_cas_l;
 	else if (__hwcap & CPU_HAS_LLSC)
 		__sh_cas_ptr = __sh_cas_llsc;
 #endif
 	return 0;
 }
--- a/src/thread/sh/__set_thread_area.s
+++ b/src/thread/sh/__set_thread_area.s
--- a/src/thread/sh/atomics.s
+++ b/src/thread/sh/atomics.s
@ -0,0 +1,65 @@
 /* Contract for all versions is same as cas.l r2,r3,@r0
 * pr and r1 are also clobbered (by jsr & r1 as temp).
 * r0,r2,r4-r15 must be preserved.
 * r3 contains result (==r2 iff cas succeeded). */
 	.align 2
 /* gusa backend. In: r0 = address, r2 = expected value, r3 = new value.
  * Out: r3 = value read from memory. r4 and r5 are used as temporaries and
  * are saved/restored on the stack; r0 is restored before returning. */
 .global __sh_cas_gusa
 .hidden __sh_cas_gusa
 __sh_cas_gusa:
 	/* save r5 and r4, then keep the address in r4 because r0 is needed
 	 * for the sequence setup below */
 	mov.l r5,@-r15
 	mov.l r4,@-r15
 	mov r0,r4
 	/* sequence setup: r0 = address of label 1, r1 = saved stack pointer,
 	 * r15 = (0f-1f), the negated length of the sequence */
 	mova 1f,r0
 	mov r15,r1
 	mov #(0f-1f),r15
 	/* the sequence itself: load, compare with r2, skip the store on
 	 * mismatch */
 0:	mov.l @r4,r5
 	cmp/eq r5,r2
 	bf 1f
 	mov.l r3,@r4
 	/* end of sequence: restore the stack pointer, then move the loaded
 	 * value into r3 and the address back into r0 */
 1:	mov r1,r15
 	mov r5,r3
 	mov r4,r0
 	/* pop r4; r5 is popped by the instruction following rts */
 	mov.l @r15+,r4
 	rts
 	 mov.l @r15+,r5
 /* LL/SC backend (movli.l / movco.l). In: r0 = address, r2 = expected value,
  * r3 = new value. Out: r3 = result. r1 holds the address while r0 is used
  * as the data register of movli.l/movco.l; r0 is restored on return. */
 .global __sh_cas_llsc
 .hidden __sh_cas_llsc
 __sh_cas_llsc:
 	mov r0,r1
 	synco
 	/* load the current value into r0 and compare with the expected one;
 	 * on mismatch leave with the observed value still in r0 */
 0:	movli.l @r1,r0
 	cmp/eq r0,r2
 	bf 1f
 	/* match: attempt to store the new value; retry from label 0 if the
 	 * conditional store did not succeed */
 	mov r3,r0
 	movco.l r0,@r1
 	bf 0b
 	/* success: the value that was replaced equalled r2, so report r2 */
 	mov r2,r0
 1:	synco
 	/* result goes to r3; the instruction after rts puts the address back
 	 * into r0 */
 	mov r0,r3
 	rts
 	 mov r1,r0
 /* imask backend. In: r0 = address, r2 = expected value, r3 = new value.
  * Out: r3 = value read from memory. The compare-and-store runs with SR
  * bits 0xf0 set; the previous SR is kept on the stack and reloaded after.
  * NOTE(review): 0xf0 is presumably the interrupt mask level field of SR --
  * confirm against the SH manual. */
 .global __sh_cas_imask
 .hidden __sh_cas_imask
 __sh_cas_imask:
 	/* r1 = address; push the current SR, then set bits 0xf0 in SR */
 	mov r0,r1
 	stc sr,r0
 	mov.l r0,@-r15
 	or #0xf0,r0
 	ldc r0,sr
 	/* load, compare with r2, skip the store on mismatch */
 	mov.l @r1,r0
 	cmp/eq r0,r2
 	bf 1f
 	mov.l r3,@r1
 	/* pop the saved SR value straight back into SR */
 1:	ldc.l @r15+,sr
 	/* result goes to r3; the instruction after rts puts the address back
 	 * into r0 */
 	mov r0,r3
 	rts
 	 mov r1,r0
 /* cas.l backend: the whole routine is a return plus one cas.l r2,r3,@r0,
  * which is exactly the contract the other backends emulate.
  * NOTE(review): the instruction is emitted as a raw .word, presumably
  * because not every assembler knows the cas.l mnemonic -- confirm. */
 .global __sh_cas_cas_l
 .hidden __sh_cas_cas_l
 __sh_cas_cas_l:
 	rts
 	 .word 0x2323 /* cas.l r2,r3,@r0 */