[lttng-dev] [PATCH 5/7] Replace the arch-specific memory barriers with __atomic builtins
Mathieu Desnoyers
mathieu.desnoyers at efficios.com
Tue Mar 21 16:19:11 EDT 2023
On 2023-03-21 09:31, Ondřej Surý via lttng-dev wrote:
> Instead of custom code, use the __atomic_thread_fence() builtin to
> implement the cmm_mb(), cmm_rmb(), cmm_wmb(), cmm_smp_mb(),
> cmm_smp_rmb(), and cmm_smp_wmb() family of functions on all
> architectures, and cmm_read_barrier_depends() on alpha (otherwise
> it's still a no-op).
>
> Signed-off-by: Ondřej Surý <ondrej at sury.org>
> ---
> include/urcu/arch/alpha.h | 6 +++---
> include/urcu/arch/arm.h | 14 -------------
> include/urcu/arch/generic.h | 6 +++---
> include/urcu/arch/mips.h | 6 ------
> include/urcu/arch/nios2.h | 2 --
> include/urcu/arch/ppc.h | 25 ----------------------
> include/urcu/arch/s390.h | 2 --
> include/urcu/arch/sparc64.h | 13 ------------
> include/urcu/arch/x86.h | 42 +++----------------------------------
> 9 files changed, 9 insertions(+), 107 deletions(-)
>
> diff --git a/include/urcu/arch/alpha.h b/include/urcu/arch/alpha.h
> index dc33e28..61687c7 100644
> --- a/include/urcu/arch/alpha.h
> +++ b/include/urcu/arch/alpha.h
> @@ -29,9 +29,9 @@
> extern "C" {
> #endif
>
> -#define cmm_mb() __asm__ __volatile__ ("mb":::"memory")
> -#define cmm_wmb() __asm__ __volatile__ ("wmb":::"memory")
> -#define cmm_read_barrier_depends() __asm__ __volatile__ ("mb":::"memory")
> +#ifndef cmm_read_barrier_depends
> +#define cmm_read_barrier_depends() __atomic_thread_fence(__ATOMIC_CONSUME)
> +#endif
I don't expect an #ifndef in arch-specific code; I would expect the
#ifndef to be in the generic code.
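E.g. (untested sketch): the alpha header would define
cmm_read_barrier_depends() unconditionally, and generic.h would keep
the no-op fallback guarded by the #ifndef:

/* include/urcu/arch/alpha.h */
#define cmm_read_barrier_depends()	__atomic_thread_fence(__ATOMIC_CONSUME)

/* include/urcu/arch/generic.h */
#ifndef cmm_read_barrier_depends
#define cmm_read_barrier_depends()	/* no-op on other architectures */
#endif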
[...]
> diff --git a/include/urcu/arch/generic.h b/include/urcu/arch/generic.h
> index be6e41e..2715162 100644
> --- a/include/urcu/arch/generic.h
> +++ b/include/urcu/arch/generic.h
> @@ -44,15 +44,15 @@ extern "C" {
> */
>
> #ifndef cmm_mb
> -#define cmm_mb() __sync_synchronize()
> +#define cmm_mb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
> #endif
>
> #ifndef cmm_rmb
> -#define cmm_rmb() cmm_mb()
> +#define cmm_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
> #endif
>
> #ifndef cmm_wmb
> -#define cmm_wmb() cmm_mb()
> +#define cmm_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)
I don't think rmb/wmb map to ACQUIRE/RELEASE semantics; this is
incorrect AFAIU. ACQUIRE/RELEASE are semi-permeable barriers that
prevent code motion in one direction or the other, whereas rmb/wmb
only affect code motion of either loads or stores (but in both
directions). In the generic case, rmb/wmb could map to
__atomic_thread_fence(__ATOMIC_SEQ_CST).
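In other words, a sketch (untested) of what the generic fallbacks
could look like:

#ifndef cmm_mb
#define cmm_mb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#endif

#ifndef cmm_rmb
#define cmm_rmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#endif

#ifndef cmm_wmb
#define cmm_wmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#endif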
> #endif
>
> #define cmm_mc() cmm_barrier()
[...]
> diff --git a/include/urcu/arch/ppc.h b/include/urcu/arch/ppc.h
> index 791529e..618f79c 100644
> --- a/include/urcu/arch/ppc.h
> +++ b/include/urcu/arch/ppc.h
> @@ -34,31 +34,6 @@ extern "C" {
> /* Include size of POWER5+ L3 cache lines: 256 bytes */
> #define CAA_CACHE_LINE_SIZE 256
>
> -#ifdef __NO_LWSYNC__
> -#define LWSYNC_OPCODE "sync\n"
> -#else
> -#define LWSYNC_OPCODE "lwsync\n"
> -#endif
> -
> -/*
> - * Use sync for all cmm_mb/rmb/wmb barriers because lwsync does not
> - * preserve ordering of cacheable vs. non-cacheable accesses, so it
> - * should not be used to order with respect to MMIO operations. An
> - * eieio+lwsync pair is also not enough for cmm_rmb, because it will
> - * order cacheable and non-cacheable memory operations separately---i.e.
> - * not the latter against the former.
> - */
> -#define cmm_mb() __asm__ __volatile__ ("sync":::"memory")
I agree that we will want to use the generic implementation for smp_mb.
> -
> -/*
> - * lwsync orders loads in cacheable memory with respect to other loads,
> - * and stores in cacheable memory with respect to other stores.
> - * Therefore, use it for barriers ordering accesses to cacheable memory
> - * only.
> - */
> -#define cmm_smp_rmb() __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
> -#define cmm_smp_wmb() __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
I suspect that using the generic implementation will be slower than
lwsync, so I am tempted to keep a custom implementation for rmb/wmb on
ppc. We could have a TSAN-specific build mode which overrides those to
use smp_mb instead.
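Roughly (untested sketch, keeping the LWSYNC_OPCODE definition; the
CONFIG_URCU_TSAN name below is just a placeholder for whatever we end
up calling that build mode):

#ifdef CONFIG_URCU_TSAN	/* placeholder for a TSAN-specific build mode */
#define cmm_smp_rmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#define cmm_smp_wmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#else
#define cmm_smp_rmb()	__asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
#define cmm_smp_wmb()	__asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
#endif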
> -
> #define mftbl() \
> __extension__ \
> ({ \
[...]
> diff --git a/include/urcu/arch/sparc64.h b/include/urcu/arch/sparc64.h
> index 1ff40f5..b4e25ca 100644
> --- a/include/urcu/arch/sparc64.h
> +++ b/include/urcu/arch/sparc64.h
> @@ -40,19 +40,6 @@ extern "C" {
>
> #define CAA_CACHE_LINE_SIZE 256
>
> -/*
> - * Inspired from the Linux kernel. Workaround Spitfire bug #51.
> - */
> -#define membar_safe(type) \
> -__asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \
> - "membar " type "\n" \
> - "1:\n" \
> - : : : "memory")
> -
> -#define cmm_mb() membar_safe("#LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
> -#define cmm_rmb() membar_safe("#LoadLoad")
> -#define cmm_wmb() membar_safe("#StoreStore")
Same comment as for ppc.
> -
> #ifdef __cplusplus
> }
> #endif
> diff --git a/include/urcu/arch/x86.h b/include/urcu/arch/x86.h
> index 744f9f9..af4487d 100644
> --- a/include/urcu/arch/x86.h
> +++ b/include/urcu/arch/x86.h
> @@ -46,44 +46,8 @@ extern "C" {
> /* For backwards compat */
> #define CONFIG_RCU_HAVE_FENCE 1
>
> -#define cmm_mb() __asm__ __volatile__ ("mfence":::"memory")
> -
> -/*
> - * Define cmm_rmb/cmm_wmb to "strict" barriers that may be needed when
> - * using SSE or working with I/O areas. cmm_smp_rmb/cmm_smp_wmb are
> - * only compiler barriers, which is enough for general use.
> - */
> -#define cmm_rmb() __asm__ __volatile__ ("lfence":::"memory")
> -#define cmm_wmb() __asm__ __volatile__ ("sfence"::: "memory")
> -#define cmm_smp_rmb() cmm_barrier()
> -#define cmm_smp_wmb() cmm_barrier()
Relying on the generic barrier for rmb and wmb would slow things down
on x86; we may want to do as I suggest for ppc.
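I.e. something like this for x86 (untested sketch, same placeholder
build-mode define as in my ppc comment above):

#ifdef CONFIG_URCU_TSAN	/* placeholder for a TSAN-specific build mode */
#define cmm_smp_rmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#define cmm_smp_wmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#else
#define cmm_smp_rmb()	cmm_barrier()
#define cmm_smp_wmb()	cmm_barrier()
#endif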
> -
> -#else
> -
> -/*
> - * We leave smp_rmb/smp_wmb as full barriers for processors that do not have
> - * fence instructions.
> - *
> - * An empty cmm_smp_rmb() may not be enough on old PentiumPro multiprocessor
> - * systems, due to an erratum. The Linux kernel says that "Even distro
> - * kernels should think twice before enabling this", but for now let's
> - * be conservative and leave the full barrier on 32-bit processors. Also,
> - * IDT WinChip supports weak store ordering, and the kernel may enable it
> - * under our feet; cmm_smp_wmb() ceases to be a nop for these processors.
> - */
> -#if (CAA_BITS_PER_LONG == 32)
> -#define cmm_mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
> -#define cmm_rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
> -#define cmm_wmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
> -#else
> -#define cmm_mb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
> -#define cmm_rmb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
> -#define cmm_wmb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
> -#endif
Removing this drops support for older i686 and for URCU_ARCH_K1OM
(Xeon Phi). Do we intend to remove that support?
Thanks,
Mathieu
> #endif
>
> -#define caa_cpu_relax() __asm__ __volatile__ ("rep; nop" : : : "memory")
> -
> #define HAS_CAA_GET_CYCLES
>
> #define rdtscll(val) \
> @@ -98,10 +62,10 @@ typedef uint64_t caa_cycles_t;
>
> static inline caa_cycles_t caa_get_cycles(void)
> {
> - caa_cycles_t ret = 0;
> + caa_cycles_t ret = 0;
>
> - rdtscll(ret);
> - return ret;
> + rdtscll(ret);
> + return ret;
> }
This whitespace-to-tab cleanup should be moved to its own patch.
Thanks,
Mathieu
>
> /*
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com