[ltt-dev] [rp] [URCU RFC patch 2/3] call_rcu: use futex for wakeup scheme

Paul E. McKenney paulmck at linux.vnet.ibm.com
Mon Jun 6 16:06:24 EDT 2011


On Mon, Jun 06, 2011 at 01:46:00PM -0400, Mathieu Desnoyers wrote:
> If I remove the URCU_CALL_RCU_RT flag from the rbtree single-writer
> test, thus using the pthread_cond_signal mechanism, there is a huge
> slowdown: without CPU affinity for the worker threads, it crawls to 129
> updates/s (this looks like mutex contention between the thread calling
> call_rcu and the call_rcu thread). Adding CPU affinity to the per-CPU
> call_rcu threads, I get 546 updates/s, which is slightly better (better
> cache locality, and maybe the mutex contention is not as bad thanks to
> the two threads sharing the same CPU).
> 
> So I decided to try replacing pthread_cond_wait/signal with the
> futex-based implementation I use for the rest of the urcu lib: it has
> the advantage of removing the mutex from the call_rcu() execution path
> entirely, sampling the "futex" variable without any mutex whatsoever in
> the case where no wakeup is needed.
> 
> With the URCU_CALL_RCU_RT flag disabled, per-CPU-affined call_rcu
> threads, and my futex-based wakeup implementation, I get 55754
> updates/s (even better than with the URCU_CALL_RCU_RT flag!).

Assuming that compat_futex.c handles the non-Linux case, works for me!

							Thanx, Paul
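
For the non-Linux case, a mutex/condvar emulation of the two futex
operations is enough. What follows is only a sketch of what such a fallback
can look like, not the actual compat_futex.c, whose layout may differ.

/*
 * Sketch of a portable fallback for the two futex operations used in
 * this scheme, for platforms without a futex syscall (illustration
 * only).  One global mutex/condvar pair is sufficient for correctness,
 * at the cost of serializing all emulated futex calls.  The futex word
 * itself is still updated by the callers with atomic operations, as
 * the call_rcu code does with uatomic_set().
 */
#include <pthread.h>

static pthread_mutex_t compat_futex_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t compat_futex_cond = PTHREAD_COND_INITIALIZER;

/* Emulate futex(uaddr, FUTEX_WAIT, val): sleep only while *uaddr == val. */
static void compat_futex_wait(int *uaddr, int val)
{
	pthread_mutex_lock(&compat_futex_lock);
	while (__atomic_load_n(uaddr, __ATOMIC_RELAXED) == val)
		pthread_cond_wait(&compat_futex_cond, &compat_futex_lock);
	pthread_mutex_unlock(&compat_futex_lock);
}

/*
 * Emulate futex(uaddr, FUTEX_WAKE, ...): wake every emulated waiter;
 * waiters on other addresses simply re-check their value and go back
 * to sleep.
 */
static void compat_futex_wake(int *uaddr)
{
	(void)uaddr;
	pthread_mutex_lock(&compat_futex_lock);
	pthread_cond_broadcast(&compat_futex_cond);
	pthread_mutex_unlock(&compat_futex_lock);
}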

> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at efficios.com>
> ---
>  urcu-call-rcu-impl.h |   56 +++++++++++++++++++++++++--------------------------
>  1 file changed, 28 insertions(+), 28 deletions(-)
> 
> Index: userspace-rcu/urcu-call-rcu-impl.h
> ===================================================================
> --- userspace-rcu.orig/urcu-call-rcu-impl.h
> +++ userspace-rcu/urcu-call-rcu-impl.h
> @@ -39,6 +39,7 @@
>  #include "urcu-call-rcu.h"
>  #include "urcu-pointer.h"
>  #include "urcu/list.h"
> +#include "urcu/urcu-futex.h"
> 
>  /* Data structure that identifies a call_rcu thread. */
> 
> @@ -46,7 +47,7 @@ struct call_rcu_data {
>  	struct cds_wfq_queue cbs;
>  	unsigned long flags;
>  	pthread_mutex_t mtx;
> -	pthread_cond_t cond;
> +	int futex;
>  	unsigned long qlen;
>  	pthread_t tid;
>  	int cpu_affinity;
> @@ -88,6 +89,26 @@ static struct call_rcu_data *default_cal
>  static struct call_rcu_data **per_cpu_call_rcu_data;
>  static long maxcpus;
> 
> +static void call_rcu_wait(struct call_rcu_data *crdp)
> +{
> +	/* Read call_rcu list before read futex */
> +	cmm_smp_mb();
> +	if (uatomic_read(&crdp->futex) == -1)
> +		futex_async(&crdp->futex, FUTEX_WAIT, -1,
> +		      NULL, NULL, 0);
> +}
> +
> +static void call_rcu_wake_up(struct call_rcu_data *crdp)
> +{
> +	/* Write to call_rcu list before reading/writing futex */
> +	cmm_smp_mb();
> +	if (unlikely(uatomic_read(&crdp->futex) == -1)) {
> +		uatomic_set(&crdp->futex, 0);
> +		futex_async(&crdp->futex, FUTEX_WAKE, 1,
> +		      NULL, NULL, 0);
> +	}
> +}
> +
>  /* Allocate the array if it has not already been allocated. */
> 
>  static void alloc_cpu_call_rcu_data(void)
> @@ -219,19 +240,9 @@ static void *call_rcu_thread(void *arg)
>  		if (crdp->flags & URCU_CALL_RCU_RT)
>  			poll(NULL, 0, 10);
>  		else {
> -			call_rcu_lock(&crdp->mtx);
> -			_CMM_STORE_SHARED(crdp->flags,
> -				     crdp->flags & ~URCU_CALL_RCU_RUNNING);
> -			if (&crdp->cbs.head ==
> -			    _CMM_LOAD_SHARED(crdp->cbs.tail) &&
> -			    pthread_cond_wait(&crdp->cond, &crdp->mtx) != 0) {
> -				perror("pthread_cond_wait");
> -				exit(-1);
> -			}
> -			_CMM_STORE_SHARED(crdp->flags,
> -				     crdp->flags | URCU_CALL_RCU_RUNNING);
> +			if (&crdp->cbs.head == _CMM_LOAD_SHARED(crdp->cbs.tail))
> +				call_rcu_wait(crdp);
>  			poll(NULL, 0, 10);
> -			call_rcu_unlock(&crdp->mtx);
>  		}
>  	}
>  	call_rcu_lock(&crdp->mtx);
> @@ -264,11 +275,8 @@ static void call_rcu_data_init(struct ca
>  		perror("pthread_mutex_init");
>  		exit(-1);
>  	}
> -	if (pthread_cond_init(&crdp->cond, NULL) != 0) {
> -		perror("pthread_cond_init");
> -		exit(-1);
> -	}
> -	crdp->flags = flags | URCU_CALL_RCU_RUNNING;
> +	crdp->futex = 0;
> +	crdp->flags = flags;
>  	cds_list_add(&crdp->list, &call_rcu_data_list);
>  	crdp->cpu_affinity = cpu_affinity;
>  	cmm_smp_mb();  /* Structure initialized before pointer is planted. */
> @@ -492,16 +500,8 @@ int create_all_cpu_call_rcu_data(unsigne
>   */
>  static void wake_call_rcu_thread(struct call_rcu_data *crdp)
>  {
> -	if (!(_CMM_LOAD_SHARED(crdp->flags) & URCU_CALL_RCU_RT)) {
> -		call_rcu_lock(&crdp->mtx);
> -		if (!(_CMM_LOAD_SHARED(crdp->flags) & URCU_CALL_RCU_RUNNING)) {
> -			if (pthread_cond_signal(&crdp->cond) != 0) {
> -				perror("pthread_cond_signal");
> -				exit(-1);
> -			}
> -		}
> -		call_rcu_unlock(&crdp->mtx);
> -	}
> +	if (!(_CMM_LOAD_SHARED(crdp->flags) & URCU_CALL_RCU_RT))
> +		call_rcu_wake_up(crdp);
>  }
> 
>  /*
> 
> 
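
The updater-side API is unchanged by this patch: call_rcu() enqueues the
callback and then goes through wake_call_rcu_thread() above, so the
lock-free futex sample is what sits on the updater's fast path. A typical
caller looks like the sketch below (struct foo and free_foo() are
placeholders; header names follow the tree of that era and may need
adjusting for the flavor in use).

#include <stdlib.h>
#include <urcu.h>		/* RCU flavor */
#include <urcu-call-rcu.h>	/* call_rcu(), struct rcu_head */
#include <urcu/compiler.h>	/* caa_container_of() */

struct foo {
	int data;
	struct rcu_head rcu_head;
};

static void free_foo(struct rcu_head *head)
{
	struct foo *p = caa_container_of(head, struct foo, rcu_head);

	free(p);
}

static void remove_foo(struct foo *p)
{
	/* ... unlink p from the RCU-protected data structure ... */

	/* Deferred free once a grace period has elapsed. */
	call_rcu(&p->rcu_head, free_foo);
}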



