[lttng-dev] [PATCH 5/7] Replace the arch-specific memory barriers with __atomic builtins
Mathieu Desnoyers
mathieu.desnoyers at efficios.com
Tue Mar 21 16:19:11 EDT 2023
On 2023-03-21 09:31, Ondřej Surý via lttng-dev wrote:
> Instead of custom code, use the __atomic_thread_fence() builtin to
> implement the cmm_mb(), cmm_rmb(), cmm_wmb(), cmm_smp_mb(),
> cmm_smp_rmb(), and cmm_smp_wmb() family of functions on all
> architectures, and cmm_read_barrier_depends() on alpha (otherwise
> it's still a no-op).
>
> Signed-off-by: Ondřej Surý <ondrej at sury.org>
> ---
> include/urcu/arch/alpha.h | 6 +++---
> include/urcu/arch/arm.h | 14 -------------
> include/urcu/arch/generic.h | 6 +++---
> include/urcu/arch/mips.h | 6 ------
> include/urcu/arch/nios2.h | 2 --
> include/urcu/arch/ppc.h | 25 ----------------------
> include/urcu/arch/s390.h | 2 --
> include/urcu/arch/sparc64.h | 13 ------------
> include/urcu/arch/x86.h | 42 +++----------------------------------
> 9 files changed, 9 insertions(+), 107 deletions(-)
>
> diff --git a/include/urcu/arch/alpha.h b/include/urcu/arch/alpha.h
> index dc33e28..61687c7 100644
> --- a/include/urcu/arch/alpha.h
> +++ b/include/urcu/arch/alpha.h
> @@ -29,9 +29,9 @@
> extern "C" {
> #endif
>
> -#define cmm_mb() __asm__ __volatile__ ("mb":::"memory")
> -#define cmm_wmb() __asm__ __volatile__ ("wmb":::"memory")
> -#define cmm_read_barrier_depends() __asm__ __volatile__ ("mb":::"memory")
> +#ifndef cmm_read_barrier_depends
> +#define cmm_read_barrier_depends() __atomic_thread_fence(__ATOMIC_CONSUME)
> +#endif
I don't expect an #ifndef in arch-specific code; I would expect the
#ifndef to be in the generic code.
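E.g. (untested sketch): the alpha header would define
cmm_read_barrier_depends() unconditionally, and generic.h would keep
the no-op fallback guarded by the #ifndef:

/* include/urcu/arch/alpha.h */
#define cmm_read_barrier_depends()	__atomic_thread_fence(__ATOMIC_CONSUME)

/* include/urcu/arch/generic.h */
#ifndef cmm_read_barrier_depends
#define cmm_read_barrier_depends()	/* no-op on other architectures */
#endif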
[...]
> diff --git a/include/urcu/arch/generic.h b/include/urcu/arch/generic.h
> index be6e41e..2715162 100644
> --- a/include/urcu/arch/generic.h
> +++ b/include/urcu/arch/generic.h
> @@ -44,15 +44,15 @@ extern "C" {
> */
>
> #ifndef cmm_mb
> -#define cmm_mb() __sync_synchronize()
> +#define cmm_mb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
> #endif
>
> #ifndef cmm_rmb
> -#define cmm_rmb() cmm_mb()
> +#define cmm_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
> #endif
>
> #ifndef cmm_wmb
> -#define cmm_wmb() cmm_mb()
> +#define cmm_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)
I don't think rmb/wmb map to ACQUIRE/RELEASE semantics; this is
incorrect AFAIU. ACQUIRE/RELEASE are semi-permeable barriers that
prevent code motion in one direction or the other, whereas rmb/wmb
only affect code motion of either loads or stores (but in both
directions). In the generic case, rmb/wmb could map to
__atomic_thread_fence(__ATOMIC_SEQ_CST).
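In other words, a sketch (untested) of what the generic fallbacks
could look like:

#ifndef cmm_mb
#define cmm_mb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#endif

#ifndef cmm_rmb
#define cmm_rmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#endif

#ifndef cmm_wmb
#define cmm_wmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#endif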
> #endif
>
> #define cmm_mc() cmm_barrier()
[...]
> diff --git a/include/urcu/arch/ppc.h b/include/urcu/arch/ppc.h
> index 791529e..618f79c 100644
> --- a/include/urcu/arch/ppc.h
> +++ b/include/urcu/arch/ppc.h
> @@ -34,31 +34,6 @@ extern "C" {
> /* Include size of POWER5+ L3 cache lines: 256 bytes */
> #define CAA_CACHE_LINE_SIZE 256
>
> -#ifdef __NO_LWSYNC__
> -#define LWSYNC_OPCODE "sync\n"
> -#else
> -#define LWSYNC_OPCODE "lwsync\n"
> -#endif
> -
> -/*
> - * Use sync for all cmm_mb/rmb/wmb barriers because lwsync does not
> - * preserve ordering of cacheable vs. non-cacheable accesses, so it
> - * should not be used to order with respect to MMIO operations. An
> - * eieio+lwsync pair is also not enough for cmm_rmb, because it will
> - * order cacheable and non-cacheable memory operations separately---i.e.
> - * not the latter against the former.
> - */
> -#define cmm_mb() __asm__ __volatile__ ("sync":::"memory")
I agree that we will want to use the generic implementation for smp_mb.
> -
> -/*
> - * lwsync orders loads in cacheable memory with respect to other loads,
> - * and stores in cacheable memory with respect to other stores.
> - * Therefore, use it for barriers ordering accesses to cacheable memory
> - * only.
> - */
> -#define cmm_smp_rmb() __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
> -#define cmm_smp_wmb() __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
I suspect that using the generic implementation will be slower than
lwsync, so I am tempted to keep a custom implementation for rmb/wmb on
ppc. We could have a TSAN-specific build mode which overrides those to
use smp_mb instead.
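Roughly (untested sketch, keeping the LWSYNC_OPCODE definition; the
CONFIG_URCU_TSAN name below is just a placeholder for whatever we end
up calling that build mode):

#ifdef CONFIG_URCU_TSAN	/* placeholder for a TSAN-specific build mode */
#define cmm_smp_rmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#define cmm_smp_wmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#else
#define cmm_smp_rmb()	__asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
#define cmm_smp_wmb()	__asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
#endif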
> -
> #define mftbl() \
> __extension__ \
> ({ \
[...]
> diff --git a/include/urcu/arch/sparc64.h b/include/urcu/arch/sparc64.h
> index 1ff40f5..b4e25ca 100644
> --- a/include/urcu/arch/sparc64.h
> +++ b/include/urcu/arch/sparc64.h
> @@ -40,19 +40,6 @@ extern "C" {
>
> #define CAA_CACHE_LINE_SIZE 256
>
> -/*
> - * Inspired from the Linux kernel. Workaround Spitfire bug #51.
> - */
> -#define membar_safe(type) \
> -__asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \
> - "membar " type "\n" \
> - "1:\n" \
> - : : : "memory")
> -
> -#define cmm_mb() membar_safe("#LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
> -#define cmm_rmb() membar_safe("#LoadLoad")
> -#define cmm_wmb() membar_safe("#StoreStore")
Same comment as for ppc.
> -
> #ifdef __cplusplus
> }
> #endif
> diff --git a/include/urcu/arch/x86.h b/include/urcu/arch/x86.h
> index 744f9f9..af4487d 100644
> --- a/include/urcu/arch/x86.h
> +++ b/include/urcu/arch/x86.h
> @@ -46,44 +46,8 @@ extern "C" {
> /* For backwards compat */
> #define CONFIG_RCU_HAVE_FENCE 1
>
> -#define cmm_mb() __asm__ __volatile__ ("mfence":::"memory")
> -
> -/*
> - * Define cmm_rmb/cmm_wmb to "strict" barriers that may be needed when
> - * using SSE or working with I/O areas. cmm_smp_rmb/cmm_smp_wmb are
> - * only compiler barriers, which is enough for general use.
> - */
> -#define cmm_rmb() __asm__ __volatile__ ("lfence":::"memory")
> -#define cmm_wmb() __asm__ __volatile__ ("sfence"::: "memory")
> -#define cmm_smp_rmb() cmm_barrier()
> -#define cmm_smp_wmb() cmm_barrier()
Relying on the generic barrier for rmb and wmb would slow things down
on x86; we may want to do as I suggest for ppc.
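I.e. something like this for x86 (untested sketch, same placeholder
build-mode define as in my ppc comment above):

#ifdef CONFIG_URCU_TSAN	/* placeholder for a TSAN-specific build mode */
#define cmm_smp_rmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#define cmm_smp_wmb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#else
#define cmm_smp_rmb()	cmm_barrier()
#define cmm_smp_wmb()	cmm_barrier()
#endif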
> -
> -#else
> -
> -/*
> - * We leave smp_rmb/smp_wmb as full barriers for processors that do not have
> - * fence instructions.
> - *
> - * An empty cmm_smp_rmb() may not be enough on old PentiumPro multiprocessor
> - * systems, due to an erratum. The Linux kernel says that "Even distro
> - * kernels should think twice before enabling this", but for now let's
> - * be conservative and leave the full barrier on 32-bit processors. Also,
> - * IDT WinChip supports weak store ordering, and the kernel may enable it
> - * under our feet; cmm_smp_wmb() ceases to be a nop for these processors.
> - */
> -#if (CAA_BITS_PER_LONG == 32)
> -#define cmm_mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
> -#define cmm_rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
> -#define cmm_wmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
> -#else
> -#define cmm_mb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
> -#define cmm_rmb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
> -#define cmm_wmb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
> -#endif
Removing this drops support for older i686 and for URCU_ARCH_K1OM
(Xeon Phi). Do we intend to remove that support?
Thanks,
Mathieu
> #endif
>
> -#define caa_cpu_relax() __asm__ __volatile__ ("rep; nop" : : : "memory")
> -
> #define HAS_CAA_GET_CYCLES
>
> #define rdtscll(val) \
> @@ -98,10 +62,10 @@ typedef uint64_t caa_cycles_t;
>
> static inline caa_cycles_t caa_get_cycles(void)
> {
> - caa_cycles_t ret = 0;
> + caa_cycles_t ret = 0;
>
> - rdtscll(ret);
> - return ret;
> + rdtscll(ret);
> + return ret;
> }
This whitespace-to-tab cleanup should be moved to its own patch.
Thanks,
Mathieu
>
> /*
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com