[lttng-dev] [PATCH 5/7] Replace the arch-specific memory barriers with __atomic builtins
Ondřej Surý
ondrej at sury.org
Tue Mar 21 09:31:00 EDT 2023
Instead of custom code, use the __atomic_thread_fence() builtin to
implement the cmm_mb(), cmm_rmb(), cmm_wmb(), cmm_smp_mb(),
cmm_smp_rmb(), and cmm_smp_wmb() family of functions on all
architectures, and cmm_read_barrier_depends() on alpha (otherwise it
remains a no-op).
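For illustration only, here is a minimal standalone sketch (not part of
the patch) of the mapping now used by the generic fallbacks in
urcu/arch/generic.h; the data/ready variables and the single-threaded
main() are made up for the example:

	/*
	 * Sketch: the arch-specific inline assembly is replaced with
	 * compiler-provided fences via __atomic_thread_fence().
	 */
	#include <stdio.h>

	#define cmm_mb()  __atomic_thread_fence(__ATOMIC_SEQ_CST) /* full barrier  */
	#define cmm_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE) /* read barrier  */
	#define cmm_wmb() __atomic_thread_fence(__ATOMIC_RELEASE) /* write barrier */

	static int data;
	static int ready;

	int main(void)
	{
		/* Publisher side: order the data store before the flag store. */
		data = 42;
		cmm_wmb();
		ready = 1;

		/* Consumer side (single-threaded here, for illustration):
		 * order reads after the flag has been observed. */
		if (ready) {
			cmm_rmb();
			printf("data = %d\n", data);
		}

		cmm_mb(); /* full fence, the equivalent of the old arch "mb" */
		return 0;
	}
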
Signed-off-by: Ondřej Surý <ondrej at sury.org>
---
include/urcu/arch/alpha.h | 6 +++---
include/urcu/arch/arm.h | 14 -------------
include/urcu/arch/generic.h | 6 +++---
include/urcu/arch/mips.h | 6 ------
include/urcu/arch/nios2.h | 2 --
include/urcu/arch/ppc.h | 25 ----------------------
include/urcu/arch/s390.h | 2 --
include/urcu/arch/sparc64.h | 13 ------------
include/urcu/arch/x86.h | 42 +++----------------------------------
9 files changed, 9 insertions(+), 107 deletions(-)
diff --git a/include/urcu/arch/alpha.h b/include/urcu/arch/alpha.h
index dc33e28..61687c7 100644
--- a/include/urcu/arch/alpha.h
+++ b/include/urcu/arch/alpha.h
@@ -29,9 +29,9 @@
extern "C" {
#endif
-#define cmm_mb() __asm__ __volatile__ ("mb":::"memory")
-#define cmm_wmb() __asm__ __volatile__ ("wmb":::"memory")
-#define cmm_read_barrier_depends() __asm__ __volatile__ ("mb":::"memory")
+#ifndef cmm_read_barrier_depends
+#define cmm_read_barrier_depends() __atomic_thread_fence(__ATOMIC_CONSUME)
+#endif
/*
* On Linux, define the membarrier system call number if not yet available in
diff --git a/include/urcu/arch/arm.h b/include/urcu/arch/arm.h
index 54ca4fa..b3671dc 100644
--- a/include/urcu/arch/arm.h
+++ b/include/urcu/arch/arm.h
@@ -39,20 +39,6 @@ extern "C" {
/* For backwards compat. */
#define CONFIG_RCU_ARM_HAVE_DMB 1
-/*
- * Issues full system DMB operation.
- */
-#define cmm_mb() __asm__ __volatile__ ("dmb sy":::"memory")
-#define cmm_rmb() __asm__ __volatile__ ("dmb sy":::"memory")
-#define cmm_wmb() __asm__ __volatile__ ("dmb sy":::"memory")
-
-/*
- * Issues DMB operation only to the inner shareable domain.
- */
-#define cmm_smp_mb() __asm__ __volatile__ ("dmb ish":::"memory")
-#define cmm_smp_rmb() __asm__ __volatile__ ("dmb ish":::"memory")
-#define cmm_smp_wmb() __asm__ __volatile__ ("dmb ish":::"memory")
-
#endif /* URCU_ARCH_ARMV7 */
#include <stdlib.h>
diff --git a/include/urcu/arch/generic.h b/include/urcu/arch/generic.h
index be6e41e..2715162 100644
--- a/include/urcu/arch/generic.h
+++ b/include/urcu/arch/generic.h
@@ -44,15 +44,15 @@ extern "C" {
*/
#ifndef cmm_mb
-#define cmm_mb() __sync_synchronize()
+#define cmm_mb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#endif
#ifndef cmm_rmb
-#define cmm_rmb() cmm_mb()
+#define cmm_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
#endif
#ifndef cmm_wmb
-#define cmm_wmb() cmm_mb()
+#define cmm_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)
#endif
#define cmm_mc() cmm_barrier()
diff --git a/include/urcu/arch/mips.h b/include/urcu/arch/mips.h
index ea5b7e9..ffe65c0 100644
--- a/include/urcu/arch/mips.h
+++ b/include/urcu/arch/mips.h
@@ -30,12 +30,6 @@
extern "C" {
#endif
-#define cmm_mb() __asm__ __volatile__ ( \
- " .set mips2 \n" \
- " sync \n" \
- " .set mips0 \n" \
- :::"memory")
-
#ifdef __cplusplus
}
#endif
diff --git a/include/urcu/arch/nios2.h b/include/urcu/arch/nios2.h
index b4f3e50..cd6bdb8 100644
--- a/include/urcu/arch/nios2.h
+++ b/include/urcu/arch/nios2.h
@@ -29,8 +29,6 @@
extern "C" {
#endif
-#define cmm_mb() cmm_barrier()
-
#ifdef __cplusplus
}
#endif
diff --git a/include/urcu/arch/ppc.h b/include/urcu/arch/ppc.h
index 791529e..618f79c 100644
--- a/include/urcu/arch/ppc.h
+++ b/include/urcu/arch/ppc.h
@@ -34,31 +34,6 @@ extern "C" {
/* Include size of POWER5+ L3 cache lines: 256 bytes */
#define CAA_CACHE_LINE_SIZE 256
-#ifdef __NO_LWSYNC__
-#define LWSYNC_OPCODE "sync\n"
-#else
-#define LWSYNC_OPCODE "lwsync\n"
-#endif
-
-/*
- * Use sync for all cmm_mb/rmb/wmb barriers because lwsync does not
- * preserve ordering of cacheable vs. non-cacheable accesses, so it
- * should not be used to order with respect to MMIO operations. An
- * eieio+lwsync pair is also not enough for cmm_rmb, because it will
- * order cacheable and non-cacheable memory operations separately---i.e.
- * not the latter against the former.
- */
-#define cmm_mb() __asm__ __volatile__ ("sync":::"memory")
-
-/*
- * lwsync orders loads in cacheable memory with respect to other loads,
- * and stores in cacheable memory with respect to other stores.
- * Therefore, use it for barriers ordering accesses to cacheable memory
- * only.
- */
-#define cmm_smp_rmb() __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
-#define cmm_smp_wmb() __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
-
#define mftbl() \
__extension__ \
({ \
diff --git a/include/urcu/arch/s390.h b/include/urcu/arch/s390.h
index 67461b4..c22fdf9 100644
--- a/include/urcu/arch/s390.h
+++ b/include/urcu/arch/s390.h
@@ -39,8 +39,6 @@ extern "C" {
#define CAA_CACHE_LINE_SIZE 128
-#define cmm_mb() __asm__ __volatile__("bcr 15,0" : : : "memory")
-
#define HAS_CAA_GET_CYCLES
typedef uint64_t caa_cycles_t;
diff --git a/include/urcu/arch/sparc64.h b/include/urcu/arch/sparc64.h
index 1ff40f5..b4e25ca 100644
--- a/include/urcu/arch/sparc64.h
+++ b/include/urcu/arch/sparc64.h
@@ -40,19 +40,6 @@ extern "C" {
#define CAA_CACHE_LINE_SIZE 256
-/*
- * Inspired from the Linux kernel. Workaround Spitfire bug #51.
- */
-#define membar_safe(type) \
-__asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \
- "membar " type "\n" \
- "1:\n" \
- : : : "memory")
-
-#define cmm_mb() membar_safe("#LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
-#define cmm_rmb() membar_safe("#LoadLoad")
-#define cmm_wmb() membar_safe("#StoreStore")
-
#ifdef __cplusplus
}
#endif
diff --git a/include/urcu/arch/x86.h b/include/urcu/arch/x86.h
index 744f9f9..af4487d 100644
--- a/include/urcu/arch/x86.h
+++ b/include/urcu/arch/x86.h
@@ -46,44 +46,8 @@ extern "C" {
/* For backwards compat */
#define CONFIG_RCU_HAVE_FENCE 1
-#define cmm_mb() __asm__ __volatile__ ("mfence":::"memory")
-
-/*
- * Define cmm_rmb/cmm_wmb to "strict" barriers that may be needed when
- * using SSE or working with I/O areas. cmm_smp_rmb/cmm_smp_wmb are
- * only compiler barriers, which is enough for general use.
- */
-#define cmm_rmb() __asm__ __volatile__ ("lfence":::"memory")
-#define cmm_wmb() __asm__ __volatile__ ("sfence"::: "memory")
-#define cmm_smp_rmb() cmm_barrier()
-#define cmm_smp_wmb() cmm_barrier()
-
-#else
-
-/*
- * We leave smp_rmb/smp_wmb as full barriers for processors that do not have
- * fence instructions.
- *
- * An empty cmm_smp_rmb() may not be enough on old PentiumPro multiprocessor
- * systems, due to an erratum. The Linux kernel says that "Even distro
- * kernels should think twice before enabling this", but for now let's
- * be conservative and leave the full barrier on 32-bit processors. Also,
- * IDT WinChip supports weak store ordering, and the kernel may enable it
- * under our feet; cmm_smp_wmb() ceases to be a nop for these processors.
- */
-#if (CAA_BITS_PER_LONG == 32)
-#define cmm_mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
-#define cmm_rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
-#define cmm_wmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
-#else
-#define cmm_mb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
-#define cmm_rmb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
-#define cmm_wmb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
-#endif
#endif
-#define caa_cpu_relax() __asm__ __volatile__ ("rep; nop" : : : "memory")
-
#define HAS_CAA_GET_CYCLES
#define rdtscll(val) \
@@ -98,10 +62,10 @@ typedef uint64_t caa_cycles_t;
static inline caa_cycles_t caa_get_cycles(void)
{
- caa_cycles_t ret = 0;
+ caa_cycles_t ret = 0;
- rdtscll(ret);
- return ret;
+ rdtscll(ret);
+ return ret;
}
/*
--
2.39.2