[lttng-dev] [PATCH] urcu: avoid false sharing for rcu_gp_ctr

Wed May 8 05:01:01 EDT 2013

Hi, Mathieu,

There is a big compatibility problem in URCU which should be fixed in the next round.

LB: liburcu built on the system which has sys_membarrier().
LU: liburcu built on the system which does NOT have sys_membarrier().

LBM: liburcu-mb ....
LUM: liburcu-mb ...

AB: application(-lliburcu) built on the system which has sys_membarrier().
AU: application(-lliburcu) built on the system which does NOT have sys_membarrier().

ABM application(-lliburcu-mb) ...
AUM application(-lliburcu-mb) ...

AB/AU + LB/LU: 4 combinations
ABM/AUM + LBM/LUM: 4 combinations

I remember that some of the 8 combinations can't work because the symbols mismatch.
Is it only LU+AB and LB+AU?

could you check it?

How to fix it: in LU and AU, keep all the symbol names/ABI the same as in LB and AB, so that
only the behavior falls back to URCU_MB.

Thanks,
Lai

On 05/06/2013 08:44 PM, Mathieu Desnoyers wrote:
> Since we are bumping the urcu soname version number to 2.0.0 for the
> upcoming urcu 0.8 anyway, it's time to merge this patch.
> 
> Thanks!
> 
> Mathieu
> 
> * Lai Jiangshan (laijs at cn.fujitsu.com) wrote:
>> @rcu_gp_ctr and @registry share the same cache line, which causes
>> false sharing and slows down both the read side and the update side.
>>
>> Fix: Use different cache line for them.
>>
>> Although rcu_gp_futex is updated less often than rcu_gp_ctr, they are
>> always accessed at almost the same time, so we also move rcu_gp_futex
>> to the cache line of rcu_gp_ctr to reduce the cache-line usage and cache misses
>> on the read side.
>>
>>
>> test: (4X6=24 CPUs)
>>
>> Before patch:
>>
>> [root at localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
>> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   2100285330 nr_writes      3390219 nr_ops   2103675549
>> [root at localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
>> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   1619868562 nr_writes      3529478 nr_ops   1623398040
>> [root at localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
>> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   1949067038 nr_writes      3469334 nr_ops   1952536372
>>
>>
>> after patch:
>>
>> [root at localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
>> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   3380191848 nr_writes      4903248 nr_ops   3385095096
>> [root at localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20
>> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   3397637486 nr_writes      4129809 nr_ops   3401767295
>>
>> Signed-off-by: Lai Jiangshan <laijs at cn.fujitsu.com>
>> ---
>> diff --git a/urcu.c b/urcu.c
>> index 15def09..436d71c 100644
>> --- a/urcu.c
>> +++ b/urcu.c
>> @@ -83,16 +83,7 @@ void __attribute__((destructor)) rcu_exit(void);
>>  #endif
>>  
>>  static pthread_mutex_t rcu_gp_lock = PTHREAD_MUTEX_INITIALIZER;
>> -
>> -int32_t rcu_gp_futex;
>> -
>> -/*
>> - * Global grace period counter.
>> - * Contains the current RCU_GP_CTR_PHASE.
>> - * Also has a RCU_GP_COUNT of 1, to accelerate the reader fast path.
>> - * Written to only by writer with mutex taken. Read by both writer and readers.
>> - */
>> -unsigned long rcu_gp_ctr = RCU_GP_COUNT;
>> +struct urcu_gp rcu_gp = { .ctr = RCU_GP_COUNT };
>>  
>>  /*
>>   * Written to only by each individual reader. Read by both the reader and the
>> @@ -217,8 +208,8 @@ static void wait_gp(void)
>>  {
>>  	/* Read reader_gp before read futex */
>>  	smp_mb_master(RCU_MB_GROUP);
>> -	if (uatomic_read(&rcu_gp_futex) == -1)
>> -		futex_async(&rcu_gp_futex, FUTEX_WAIT, -1,
>> +	if (uatomic_read(&rcu_gp.futex) == -1)
>> +		futex_async(&rcu_gp.futex, FUTEX_WAIT, -1,
>>  		      NULL, NULL, 0);
>>  }
>>  
>> @@ -232,12 +223,12 @@ static void wait_for_readers(struct cds_list_head *input_readers,
>>  	/*
>>  	 * Wait for each thread URCU_TLS(rcu_reader).ctr to either
>>  	 * indicate quiescence (not nested), or observe the current
>> -	 * rcu_gp_ctr value.
>> +	 * rcu_gp.ctr value.
>>  	 */
>>  	for (;;) {
>>  		wait_loops++;
>>  		if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
>> -			uatomic_dec(&rcu_gp_futex);
>> +			uatomic_dec(&rcu_gp.futex);
>>  			/* Write futex before read reader_gp */
>>  			smp_mb_master(RCU_MB_GROUP);
>>  		}
>> @@ -270,7 +261,7 @@ static void wait_for_readers(struct cds_list_head *input_readers,
>>  			if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
>>  				/* Read reader_gp before write futex */
>>  				smp_mb_master(RCU_MB_GROUP);
>> -				uatomic_set(&rcu_gp_futex, 0);
>> +				uatomic_set(&rcu_gp.futex, 0);
>>  			}
>>  			break;
>>  		} else {
>> @@ -289,7 +280,7 @@ static void wait_for_readers(struct cds_list_head *input_readers,
>>  			if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
>>  				/* Read reader_gp before write futex */
>>  				smp_mb_master(RCU_MB_GROUP);
>> -				uatomic_set(&rcu_gp_futex, 0);
>> +				uatomic_set(&rcu_gp.futex, 0);
>>  			}
>>  			break;
>>  		} else {
>> @@ -357,10 +348,10 @@ void synchronize_rcu(void)
>>  
>>  	/*
>>  	 * Must finish waiting for quiescent state for original parity before
>> -	 * committing next rcu_gp_ctr update to memory. Failure to do so could
>> +	 * committing next rcu_gp.ctr update to memory. Failure to do so could
>>  	 * result in the writer waiting forever while new readers are always
>>  	 * accessing data (no progress).  Enforce compiler-order of load
>> -	 * URCU_TLS(rcu_reader).ctr before store to rcu_gp_ctr.
>> +	 * URCU_TLS(rcu_reader).ctr before store to rcu_gp.ctr.
>>  	 */
>>  	cmm_barrier();
>>  
>> @@ -372,13 +363,13 @@ void synchronize_rcu(void)
>>  	cmm_smp_mb();
>>  
>>  	/* Switch parity: 0 -> 1, 1 -> 0 */
>> -	CMM_STORE_SHARED(rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR_PHASE);
>> +	CMM_STORE_SHARED(rcu_gp.ctr, rcu_gp.ctr ^ RCU_GP_CTR_PHASE);
>>  
>>  	/*
>> -	 * Must commit rcu_gp_ctr update to memory before waiting for quiescent
>> +	 * Must commit rcu_gp.ctr update to memory before waiting for quiescent
>>  	 * state. Failure to do so could result in the writer waiting forever
>>  	 * while new readers are always accessing data (no progress). Enforce
>> -	 * compiler-order of store to rcu_gp_ctr before load rcu_reader ctr.
>> +	 * compiler-order of store to rcu_gp.ctr before load rcu_reader ctr.
>>  	 */
>>  	cmm_barrier();
>>  
>> diff --git a/urcu/static/urcu.h b/urcu/static/urcu.h
>> index 973826a..0dd733e 100644
>> --- a/urcu/static/urcu.h
>> +++ b/urcu/static/urcu.h
>> @@ -213,12 +213,20 @@ static inline void smp_mb_slave(int group)
>>  #define RCU_GP_CTR_PHASE	(1UL << (sizeof(unsigned long) << 2))
>>  #define RCU_GP_CTR_NEST_MASK	(RCU_GP_CTR_PHASE - 1)
>>  
>> -/*
>> - * Global quiescent period counter with low-order bits unused.
>> - * Using a int rather than a char to eliminate false register dependencies
>> - * causing stalls on some architectures.
>> - */
>> -extern unsigned long rcu_gp_ctr;
>> +struct urcu_gp {
>> +	/*
>> +	 * Global grace period counter.
>> +	 * Contains the current RCU_GP_CTR_PHASE.
>> +	 * Also has a RCU_GP_COUNT of 1, to accelerate the reader fast path.
>> +	 * Written to only by writer with mutex taken.
>> +	 * Read by both writer and readers.
>> +	 */
>> +	unsigned long ctr;
>> +
>> +	int32_t futex;
>> +} __attribute__((aligned(CAA_CACHE_LINE_SIZE)));
>> +
>> +extern struct urcu_gp rcu_gp;
>>  
>>  struct rcu_reader {
>>  	/* Data used by both reader and synchronize_rcu() */
>> @@ -231,16 +239,14 @@ struct rcu_reader {
>>  
>>  extern DECLARE_URCU_TLS(struct rcu_reader, rcu_reader);
>>  
>> -extern int32_t rcu_gp_futex;
>> -
>>  /*
>>   * Wake-up waiting synchronize_rcu(). Called from many concurrent threads.
>>   */
>>  static inline void wake_up_gp(void)
>>  {
>> -	if (caa_unlikely(uatomic_read(&rcu_gp_futex) == -1)) {
>> -		uatomic_set(&rcu_gp_futex, 0);
>> -		futex_async(&rcu_gp_futex, FUTEX_WAKE, 1,
>> +	if (caa_unlikely(uatomic_read(&rcu_gp.futex) == -1)) {
>> +		uatomic_set(&rcu_gp.futex, 0);
>> +		futex_async(&rcu_gp.futex, FUTEX_WAKE, 1,
>>  		      NULL, NULL, 0);
>>  	}
>>  }
>> @@ -256,13 +262,13 @@ static inline enum rcu_state rcu_reader_state(unsigned long *ctr)
>>  	v = CMM_LOAD_SHARED(*ctr);
>>  	if (!(v & RCU_GP_CTR_NEST_MASK))
>>  		return RCU_READER_INACTIVE;
>> -	if (!((v ^ rcu_gp_ctr) & RCU_GP_CTR_PHASE))
>> +	if (!((v ^ rcu_gp.ctr) & RCU_GP_CTR_PHASE))
>>  		return RCU_READER_ACTIVE_CURRENT;
>>  	return RCU_READER_ACTIVE_OLD;
>>  }
>>  
>>  /*
>> - * Helper for _rcu_read_lock().  The format of rcu_gp_ctr (as well as
>> + * Helper for _rcu_read_lock().  The format of rcu_gp.ctr (as well as
>>   * the per-thread rcu_reader.ctr) has the upper bits containing a count of
>>   * _rcu_read_lock() nesting, and a lower-order bit that contains either zero
>>   * or RCU_GP_CTR_PHASE.  The smp_mb_slave() ensures that the accesses in
>> @@ -271,7 +277,7 @@ static inline enum rcu_state rcu_reader_state(unsigned long *ctr)
>>  static inline void _rcu_read_lock_update(unsigned long tmp)
>>  {
>>  	if (caa_likely(!(tmp & RCU_GP_CTR_NEST_MASK))) {
>> -		_CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, _CMM_LOAD_SHARED(rcu_gp_ctr));
>> +		_CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, _CMM_LOAD_SHARED(rcu_gp.ctr));
>>  		smp_mb_slave(RCU_MB_GROUP);
>>  	} else
>>  		_CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, tmp + RCU_GP_COUNT);
>