[lttng-dev] [PATCH 4/7] Replace the internal pointer manipulation with __atomic builtins

Mathieu Desnoyers mathieu.desnoyers at efficios.com
Mon Mar 20 14:25:20 EDT 2023


On 2023-03-17 17:37, Ondřej Surý via lttng-dev wrote:
> Instead of custom code, use the __atomic builtins to implement the
> rcu_dereference(), rcu_cmpxchg_pointer(), rcu_xchg_pointer() and
> rcu_assign_pointer().

This also changes the cmm_mb() family of functions, but not everywhere. 
This should be documented.

I'm also unsure why the architecture code wraps its definitions in
#ifndef cmm_mb; I would expect it to be the other way around: the arch
headers define the barriers unconditionally, and the generic
implementation is the conditional fallback.
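
To illustrate the layering I would expect (rough sketch only; whether
the fallback lives in the generic arch header or in urcu/arch.h is a
detail):

  /* include/urcu/arch/alpha.h: the arch header defines its barriers unconditionally. */
  #define cmm_mb()	__asm__ __volatile__ ("mb":::"memory")

  /* Generic code: fall back to the __atomic fence only when no arch override exists. */
  #ifndef cmm_mb
  #define cmm_mb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
  #endif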

> 
> Signed-off-by: Ondřej Surý <ondrej at sury.org>
> ---
>   include/urcu/arch.h           | 20 +++++++++
>   include/urcu/arch/alpha.h     |  6 +++
>   include/urcu/arch/arm.h       | 12 ++++++
>   include/urcu/arch/mips.h      |  2 +
>   include/urcu/arch/nios2.h     |  2 +
>   include/urcu/arch/ppc.h       |  6 +++
>   include/urcu/arch/s390.h      |  2 +
>   include/urcu/arch/sparc64.h   |  6 +++
>   include/urcu/arch/x86.h       | 20 +++++++++
>   include/urcu/static/pointer.h | 77 +++++++----------------------------
>   10 files changed, 90 insertions(+), 63 deletions(-)
> 
> diff --git a/include/urcu/arch.h b/include/urcu/arch.h
> index d3914da..aec6fa1 100644
> --- a/include/urcu/arch.h
> +++ b/include/urcu/arch.h
> @@ -21,6 +21,26 @@
>   #ifndef _URCU_ARCH_H
>   #define _URCU_ARCH_H
>   
> +#if !defined(__has_feature)
> +#define __has_feature(x) 0
> +#endif /* if !defined(__has_feature) */
> +
> +/* GCC defines __SANITIZE_ADDRESS__, so reuse the macro for clang */
> +#if __has_feature(address_sanitizer)
> +#define __SANITIZE_ADDRESS__ 1
> +#endif /* if __has_feature(address_sanitizer) */
> +
> +#ifdef __SANITIZE_THREAD__
> +/* FIXME: Somebody who understands the barriers should look into this */
> +#define cmm_mb() __atomic_thread_fence(__ATOMIC_ACQ_REL)

This really needs to be __ATOMIC_SEQ_CST.

> +#define cmm_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
> +#define cmm_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)

I am really unsure that the rmb/wmb semantics map to acquire/release
fences. Paul, can you confirm?

> +#define cmm_smp_mb() __atomic_thread_fence(__ATOMIC_ACQ_REL)

SEQ_CST.

> +#define cmm_smp_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
> +#define cmm_smp_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)

Unsure (see above).
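
For the full barriers, the mapping I have in mind is simply (sketch):

  #define cmm_mb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
  #define cmm_smp_mb()	__atomic_thread_fence(__ATOMIC_SEQ_CST)

Whether acquire/release fences are acceptable for rmb/wmb is the open
question above.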

> +#define cmm_read_barrier_depends() __atomic_thread_fence(__ATOMIC_ACQ_REL)

This would map to __ATOMIC_CONSUME, but AFAIK current toolchains
implement that semantic as __ATOMIC_ACQUIRE, which is stronger than
what is really needed here. So we can expect a slowdown on some
architectures if we go that way.

Should we favor code simplicity and long-term maintainability at the
expense of performance in the short term? Or should we keep the
arch-specific implementations until the toolchains end up implementing
a proper consume semantic?
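
For reference, the semantically matching order would be (sketch):

  /* Address-dependency ordering maps to consume... */
  #define cmm_read_barrier_depends()	__atomic_thread_fence(__ATOMIC_CONSUME)
  /* ...which GCC and Clang currently implement as an acquire fence. */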

> +#endif
> +
>   /*
>    * Architecture detection using compiler defines.
>    *
> diff --git a/include/urcu/arch/alpha.h b/include/urcu/arch/alpha.h
> index dc33e28..84526ef 100644
> --- a/include/urcu/arch/alpha.h
> +++ b/include/urcu/arch/alpha.h
> @@ -29,9 +29,15 @@
>   extern "C" {
>   #endif
>   
> +#ifndef cmm_mb
>   #define cmm_mb()			__asm__ __volatile__ ("mb":::"memory")
> +#endif
> +#ifndef cmm_wmb
>   #define cmm_wmb()			__asm__ __volatile__ ("wmb":::"memory")
> +#endif
> +#ifndef cmm_read_barrier_depends
>   #define cmm_read_barrier_depends()	__asm__ __volatile__ ("mb":::"memory")
> +#endif
>   
[...]
> diff --git a/include/urcu/static/pointer.h b/include/urcu/static/pointer.h
> index 9e46a57..3f116f3 100644
> --- a/include/urcu/static/pointer.h
> +++ b/include/urcu/static/pointer.h
> @@ -38,6 +38,8 @@
>   extern "C" {
>   #endif
>   
> +#define _rcu_get_pointer(addr) __atomic_load_n(addr, __ATOMIC_CONSUME)
> +
>   /**
>    * _rcu_dereference - reads (copy) a RCU-protected pointer to a local variable
>    * into a RCU read-side critical section. The pointer can later be safely
> @@ -49,14 +51,6 @@ extern "C" {
>    * Inserts memory barriers on architectures that require them (currently only
>    * Alpha) and documents which pointers are protected by RCU.
>    *
> - * With C standards prior to C11/C++11, the compiler memory barrier in
> - * CMM_LOAD_SHARED() ensures that value-speculative optimizations (e.g.
> - * VSS: Value Speculation Scheduling) does not perform the data read
> - * before the pointer read by speculating the value of the pointer.
> - * Correct ordering is ensured because the pointer is read as a volatile
> - * access. This acts as a global side-effect operation, which forbids
> - * reordering of dependent memory operations.

We should document in the patch commit message that we end up relying
on CONSUME for rcu_dereference.

> - *
>    * With C standards C11/C++11, concerns about dependency-breaking
>    * optimizations are taken care of by the "memory_order_consume" atomic
>    * load.
> @@ -65,10 +59,6 @@ extern "C" {
>    * explicit because the pointer used as input argument is a pointer,
>    * not an _Atomic type as required by C11/C++11.
>    *
> - * By defining URCU_DEREFERENCE_USE_VOLATILE, the user requires use of
> - * volatile access to implement rcu_dereference rather than
> - * memory_order_consume load from the C11/C++11 standards.
> - *
>    * This may improve performance on weakly-ordered architectures where
>    * the compiler implements memory_order_consume as a
>    * memory_order_acquire, which is stricter than required by the
> @@ -83,35 +73,7 @@ extern "C" {
>    * meets the 10-line criterion in LGPL, allowing this function to be
>    * expanded directly in non-LGPL code.
>    */
> -
> -#if !defined (URCU_DEREFERENCE_USE_VOLATILE) &&		\
> -	((defined (__cplusplus) && __cplusplus >= 201103L) ||	\
> -	(defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
> -# define __URCU_DEREFERENCE_USE_ATOMIC_CONSUME
> -#endif
> -
> -/*
> - * If p is const (the pointer itself, not what it points to), using
> - * __typeof__(p) would declare a const variable, leading to
> - * -Wincompatible-pointer-types errors.  Using the statement expression
> - * makes it an rvalue and gets rid of the const-ness.
> - */
> -#ifdef __URCU_DEREFERENCE_USE_ATOMIC_CONSUME
> -# define _rcu_dereference(p) __extension__ ({						\
> -				__typeof__(__extension__ ({				\
> -					__typeof__(p) __attribute__((unused)) _________p0 = { 0 }; \
> -					_________p0;					\
> -				})) _________p1;					\
> -				__atomic_load(&(p), &_________p1, __ATOMIC_CONSUME);	\
> -				(_________p1);						\
> -			})
> -#else
> -# define _rcu_dereference(p) __extension__ ({						\
> -				__typeof__(p) _________p1 = CMM_LOAD_SHARED(p);		\
> -				cmm_smp_read_barrier_depends();				\
> -				(_________p1);						\
> -			})
> -#endif
> +#define _rcu_dereference(p) _rcu_get_pointer(&(p))
>   
>   /**
>    * _rcu_cmpxchg_pointer - same as rcu_assign_pointer, but tests if the pointer
> @@ -126,12 +88,12 @@ extern "C" {
>    * meets the 10-line criterion in LGPL, allowing this function to be
>    * expanded directly in non-LGPL code.
>    */
> -#define _rcu_cmpxchg_pointer(p, old, _new)				\
> -	__extension__							\
> -	({								\
> -		__typeof__(*p) _________pold = (old);			\
> -		__typeof__(*p) _________pnew = (_new);			\
> -		uatomic_cmpxchg(p, _________pold, _________pnew);	\
> +#define _rcu_cmpxchg_pointer(p, old, _new)						\
> +	({										\
> +		__typeof__(*(p)) __old = old;						\
> +		__atomic_compare_exchange_n(p, &__old, _new, 0,				\
> +					    __ATOMIC_ACQ_REL, __ATOMIC_CONSUME);	\

__ATOMIC_SEQ_CST on both success and failure (see the sketch after the
quoted hunk below).

> +		__old;									\
>   	})
>   
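
Concretely, keeping the rest of the proposed macro as-is, I would
expect (sketch):

  #define _rcu_cmpxchg_pointer(p, old, _new)					\
  	({									\
  		__typeof__(*(p)) __old = old;					\
  		__atomic_compare_exchange_n(p, &__old, _new, 0,			\
  					    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);\
  		__old;								\
  	})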
>   /**
> @@ -145,22 +107,11 @@ extern "C" {
>    * meets the 10-line criterion in LGPL, allowing this function to be
>    * expanded directly in non-LGPL code.
>    */
> -#define _rcu_xchg_pointer(p, v)				\
> -	__extension__					\
> -	({						\
> -		__typeof__(*p) _________pv = (v);	\
> -		uatomic_xchg(p, _________pv);		\
> -	})
> -
> +#define _rcu_xchg_pointer(p, v) \
> +	__atomic_exchange_n(p, v, __ATOMIC_ACQ_REL)

__ATOMIC_SEQ_CST.
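
That is (sketch):

  #define _rcu_xchg_pointer(p, v) \
  	__atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)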

>   
> -#define _rcu_set_pointer(p, v)				\
> -	do {						\
> -		__typeof__(*p) _________pv = (v);	\
> -		if (!__builtin_constant_p(v) || 	\
> -		    ((v) != NULL))			\
> -			cmm_wmb();				\
> -		uatomic_set(p, _________pv);		\
> -	} while (0)
> +#define _rcu_set_pointer(p, v) \
> +	__atomic_store_n(p, v, __ATOMIC_RELEASE)

OK.

Thanks,

Mathieu

>   
>   /**
>    * _rcu_assign_pointer - assign (publicize) a pointer to a new data structure
> @@ -178,7 +129,7 @@ extern "C" {
>    * meets the 10-line criterion in LGPL, allowing this function to be
>    * expanded directly in non-LGPL code.
>    */
> -#define _rcu_assign_pointer(p, v)	_rcu_set_pointer(&(p), v)
> +#define _rcu_assign_pointer(p, v) rcu_set_pointer(&(p), v)
>   
>   #ifdef __cplusplus
>   }

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com


