We can define rcu_gp_ctr and registry with the aligned attribute, but that is not a reliable way.<div><br></div><div>We need only this:</div><div>unsigned long rcu_gp_ctr __attribute((aligned and padded(don't put other var next to it except the futex)))<span></span><br>

<br>On Saturday, December 8, 2012, Mathieu Desnoyers  wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">* Lai Jiangshan (<a href="javascript:;" onclick="_e(event, 'cvml', 'laijs@cn.fujitsu.com')">laijs@cn.fujitsu.com</a>) wrote:<br>


> @rcu_gp_ctr and @registry share the same cache line, which causes<br>

> false sharing and slows down both the read side and the update side.<br>

><br>

> Fix: Use different cache line for them.<br>

><br>

> Although rcu_gp_futex is updated less often than rcu_gp_ctr,<br>

> they are always accessed at almost the same time, so we also move rcu_gp_futex<br>

> to the cache line of rcu_gp_ctr to reduce cache-line usage and cache misses<br>

> on the read side.<br>

<br>

Hi Lai,<br>

<br>

I agree on the goal: placing registry and rcu_gp_ctr on different<br>

cache-lines. And yes, it makes sense to put rcu_gp_ctr and rcu_gp_futex<br>

on the same cache-line. I agree that the next patch is fine too (keeping<br>

qsbr and other urcu similar). This is indeed what I try to ensure<br>

myself.<br>

<br>

I'm just concerned that this patch seems to break ABI compatibility for<br>

liburcu: the read-side, within applications, would have to be<br>

recompiled. So I guess we should decide if we do this change in a way<br>

that does not break the ABI (e.g. not introducing a structure), or if we<br>

choose to bump the library version number.<br>

<br>

Thoughts ?<br>

<br>

Thanks,<br>

<br>

Mathieu<br>

<br>

><br>

><br>

> test: (4X6=24 CPUs)<br>

><br>

> Before patch:<br>

><br>

> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20<br>

> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   2100285330 nr_writes      3390219 nr_ops   2103675549<br>

> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20<br>

> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   1619868562 nr_writes      3529478 nr_ops   1623398040<br>

> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20<br>

> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   1949067038 nr_writes      3469334 nr_ops   1952536372<br>

><br>

><br>

> after patch:<br>

><br>

> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20<br>

> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   3380191848 nr_writes      4903248 nr_ops   3385095096<br>

> [root@localhost userspace-rcu]# ./tests/test_urcu_mb 20 1 20<br>

> SUMMARY ./tests/test_urcu_mb      testdur   20 nr_readers  20 rdur      0 wdur      0 nr_writers   1 wdelay      0 nr_reads   3397637486 nr_writes      4129809 nr_ops   3401767295<br>

><br>

> Signed-off-by: Lai Jiangshan <<a href="javascript:;" onclick="_e(event, 'cvml', 'laijs@cn.fujitsu.com')">laijs@cn.fujitsu.com</a>><br>

> ---<br>

> diff --git a/urcu.c b/urcu.c<br>

> index 15def09..436d71c 100644<br>

> --- a/urcu.c<br>

> +++ b/urcu.c<br>

> @@ -83,16 +83,7 @@ void __attribute__((destructor)) rcu_exit(void);<br>

>  #endif<br>

><br>

>  static pthread_mutex_t rcu_gp_lock = PTHREAD_MUTEX_INITIALIZER;<br>

> -<br>

> -int32_t rcu_gp_futex;<br>

> -<br>

> -/*<br>

> - * Global grace period counter.<br>

> - * Contains the current RCU_GP_CTR_PHASE.<br>

> - * Also has a RCU_GP_COUNT of 1, to accelerate the reader fast path.<br>

> - * Written to only by writer with mutex taken. Read by both writer and readers.<br>

> - */<br>

> -unsigned long rcu_gp_ctr = RCU_GP_COUNT;<br>

> +struct urcu_gp rcu_gp = { .ctr = RCU_GP_COUNT };<br>

><br>

>  /*<br>

>   * Written to only by each individual reader. Read by both the reader and the<br>

> @@ -217,8 +208,8 @@ static void wait_gp(void)<br>

>  {<br>

>       /* Read reader_gp before read futex */<br>

>       smp_mb_master(RCU_MB_GROUP);<br>

> -     if (uatomic_read(&rcu_gp_futex) == -1)<br>

> -             futex_async(&rcu_gp_futex, FUTEX_WAIT, -1,<br>

> +     if (uatomic_read(&rcu_gp.futex) == -1)<br>

> +             futex_async(&rcu_gp.futex, FUTEX_WAIT, -1,<br>

>                     NULL, NULL, 0);<br>

>  }<br>

><br>

> @@ -232,12 +223,12 @@ static void wait_for_readers(struct cds_list_head *input_readers,<br>

>       /*<br>

>        * Wait for each thread URCU_TLS(rcu_reader).ctr to either<br>

>        * indicate quiescence (not nested), or observe the current<br>

> -      * rcu_gp_ctr value.<br>

> +      * rcu_gp.ctr value.<br>

>        */<br>

>       for (;;) {<br>

>               wait_loops++;<br>

>               if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {<br>

> -                     uatomic_dec(&rcu_gp_futex);<br>

> +                     uatomic_dec(&rcu_gp.futex);<br>

>                       /* Write futex before read reader_gp */<br>

>                       smp_mb_master(RCU_MB_GROUP);<br>

>               }<br>

> @@ -270,7 +261,7 @@ static void wait_for_readers(struct cds_list_head *input_readers,<br>

>                       if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {<br>

>                               /* Read reader_gp before write futex */<br>

>                               smp_mb_master(RCU_MB_GROUP);<br>

> -                             uatomic_set(&rcu_gp_futex, 0);<br>

> +                             uatomic_set(&rcu_gp.futex, 0);<br>

>                       }<br>

>                       break;<br>

>               } else {<br>

> @@ -289,7 +280,7 @@ static void wait_for_readers(struct cds_list_head *input_readers,<br>

>                       if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {<br>

>                               /* Read reader_gp before write futex */<br>

>                               smp_mb_master(RCU_MB_GROUP);<br>

> -                             uatomic_set(&rcu_gp_futex, 0);<br>

> +                             uatomic_set(&rcu_gp.futex, 0);<br>

>                       }<br>

>                       break;<br>

>               } else {<br>

> @@ -357,10 +348,10 @@ void synchronize_rcu(void)<br>

><br>

>       /*<br>

>        * Must finish waiting for quiescent state for original parity before<br>

> -      * committing next rcu_gp_ctr update to memory. Failure to do so could<br>

> +      * committing next rcu_gp.ctr update to memory. Failure to do so could<br>

>        * result in the writer waiting forever while new readers are always<br>

>        * accessing data (no progress).  Enforce compiler-order of load<br>

> -      * URCU_TLS(rcu_reader).ctr before store to rcu_gp_ctr.<br>

> +      * URCU_TLS(rcu_reader).ctr before store to rcu_gp.ctr.<br>

>        */<br>

>       cmm_barrier();<br>

><br>

> @@ -372,13 +363,13 @@ void synchronize_rcu(void)<br>

>       cmm_smp_mb();<br>

><br>

>       /* Switch parity: 0 -> 1, 1 -> 0 */<br>

> -     CMM_STORE_SHARED(rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR_PHASE);<br>

> +     CMM_STORE_SHARED(rcu_gp.ctr, rcu_gp.ctr ^ RCU_GP_CTR_PHASE);<br>

><br>

>       /*<br>

> -      * Must commit rcu_gp_ctr update to memory before waiting for quiescent<br>

> +      * Must commit rcu_gp.ctr update to memory before waiting for quiescent<br>

>        * state. Failure to do so could result in the writer waiting forever<br>

>        * while new readers are always accessing data (no progress). Enforce<br>

> -      * compiler-order of store to rcu_gp_ctr before load rcu_reader ctr.<br>

> +      * compiler-order of store to rcu_gp.ctr before load rcu_reader ctr.<br>

>        */<br>

>       cmm_barrier();<br>

><br>

> diff --git a/urcu/static/urcu.h b/urcu/static/urcu.h<br>

> index 973826a..0dd733e 100644<br>

> --- a/urcu/static/urcu.h<br>

> +++ b/urcu/static/urcu.h<br>

> @@ -213,12 +213,20 @@ static inline void smp_mb_slave(int group)<br>

>  #define RCU_GP_CTR_PHASE     (1UL << (sizeof(unsigned long) << 2))<br>

>  #define RCU_GP_CTR_NEST_MASK (RCU_GP_CTR_PHASE - 1)<br>

><br>

> -/*<br>

> - * Global quiescent period counter with low-order bits unused.<br>

> - * Using a int rather than a char to eliminate false register dependencies<br>

> - * causing stalls on some architectures.<br>

> - */<br>

> -extern unsigned long rcu_gp_ctr;<br>

> +struct urcu_gp {<br>

> +     /*<br>

> +      * Global grace period counter.<br>

> +      * Contains the current RCU_GP_CTR_PHASE.<br>

> +      * Also has a RCU_GP_COUNT of 1, to accelerate the reader fast path.<br>

> +      * Written to only by writer with mutex taken.<br>

> +      * Read by both writer and readers.<br>

> +      */<br>

> +     unsigned long ctr;<br>

> +<br>

> +     int32_t futex;<br>

> +} __attribute__((aligned(CAA_CACHE_LINE_SIZE)));<br>

> +<br>

> +extern struct urcu_gp rcu_gp;<br>

><br>

>  struct rcu_reader {<br>

>       /* Data used by both reader and synchronize_rcu() */<br>

> @@ -231,16 +239,14 @@ struct rcu_reader {<br>

><br>

>  extern DECLARE_URCU_TLS(struct rcu_reader, rcu_reader);<br>

><br>

> -extern int32_t rcu_gp_futex;<br>

> -<br>

>  /*<br>

>   * Wake-up waiting synchronize_rcu(). Called from many concurrent threads.<br>

>   */<br>

>  static inline void wake_up_gp(void)<br>

>  {<br>

> -     if (caa_unlikely(uatomic_read(&rcu_gp_futex) == -1)) {<br>

> -             uatomic_set(&rcu_gp_futex, 0);<br>

> -             futex_async(&rcu_gp_futex, FUTEX_WAKE, 1,<br>

> +     if (caa_unlikely(uatomic_read(&rcu_gp.futex) == -1)) {<br>

> +             uatomic_set(&rcu_gp.futex, 0);<br>

> +             futex_async(&rcu_gp.futex, FUTEX_WAKE, 1,<br>

>                     NULL, NULL, 0);<br>

>       }<br>

>  }<br>

> @@ -256,13 +262,13 @@ static inline enum rcu_state rcu_reader_state(unsigned long *ctr)<br>

>       v = CMM_LOAD_SHARED(*ctr);<br>

>       if (!(v & RCU_GP_CTR_NEST_MASK))<br>

>               return RCU_READER_INACTIVE;<br>

> -     if (!((v ^ rcu_gp_ctr) & RCU_GP_CTR_PHASE))<br>

> +     if (!((v ^ rcu_gp.ctr) & RCU_GP_CTR_PHASE))<br>

>               return RCU_READER_ACTIVE_CURRENT;<br>

>       return RCU_READER_ACTIVE_OLD;<br>

>  }<br>

><br>

>  /*<br>

> - * Helper for _rcu_read_lock().  The format of rcu_gp_ctr (as well as<br>

> + * Helper for _rcu_read_lock().  The format of rcu_gp.ctr (as well as<br>

>   * the per-thread rcu_reader.ctr) has the upper bits containing a count of<br>

>   * _rcu_read_lock() nesting, and a lower-order bit that contains either zero<br>

>   * or RCU_GP_CTR_PHASE.  The smp_mb_slave() ensures that the accesses in<br>

> @@ -271,7 +277,7 @@ static inline enum rcu_state rcu_reader_state(unsigned long *ctr)<br>

>  static inline void _rcu_read_lock_update(unsigned long tmp)<br>

>  {<br>

>       if (caa_likely(!(tmp & RCU_GP_CTR_NEST_MASK))) {<br>

> -             _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, _CMM_LOAD_SHARED(rcu_gp_ctr));<br>

> +             _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, _CMM_LOAD_SHARED(rcu_gp.ctr));<br>

>               smp_mb_slave(RCU_MB_GROUP);<br>

>       } else<br>

>               _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, tmp + RCU_GP_COUNT);<br>

<br>

--<br>

Mathieu Desnoyers<br>

Operating System Efficiency R&D Consultant<br>

EfficiOS Inc.<br>

<a href="http://www.efficios.com" target="_blank">http://www.efficios.com</a><br>

<br>

_______________________________________________<br>

lttng-dev mailing list<br>

<a href="javascript:;" onclick="_e(event, 'cvml', 'lttng-dev@lists.lttng.org')">lttng-dev@lists.lttng.org</a><br>

<a href="http://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev" target="_blank">http://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev</a><br>

</blockquote></div>