[ltt-dev] [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU
Mathieu Desnoyers
mathieu.desnoyers at polymtl.ca
Fri Oct 3 00:41:27 EDT 2008
* Paul E. McKenney (paulmck at linux.vnet.ibm.com) wrote:
> Hello!
>
> This patch adds stalled-CPU detection to Classic RCU. This capability
> is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which
> defaults disabled. This is a debugging feature to detect infinite loops
> in kernel code, not something that non-kernel-hackers would be expected
> to care about. This feature can detect looping CPUs in !PREEMPT builds
> and looping CPUs with preemption disabled in PREEMPT builds. This is
> essentially a port of this functionality from the treercu patch, replacing
> the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4).
>
> The changes from the patch in tip/core/rcu include making the config
> variable name match that in treercu, changing from seconds to jiffies to
> avoid spurious warnings, and printing a boot message when this feature
> is enabled.
>
Hi Paul,
Thanks for the previous explanations. Out of curiosity, what can this
patch do that the NMI watchdog can't do?
Mathieu
> Signed-off-by: Paul E. McKenney <paulmck at linux.vnet.ibm.com>
> ---
>
> include/linux/rcuclassic.h | 12 ++-
> kernel/rcuclassic.c | 166 +++++++++++++++++++++++----------------------
> lib/Kconfig.debug | 2
> 3 files changed, 96 insertions(+), 84 deletions(-)
>
> diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
> index 29bf528..2d72d20 100644
> --- a/include/linux/rcuclassic.h
> +++ b/include/linux/rcuclassic.h
> @@ -40,15 +40,21 @@
> #include <linux/cpumask.h>
> #include <linux/seqlock.h>
>
> +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
> +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */
> +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */
> +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
>
> /* Global control variables for rcupdate callback mechanism. */
> struct rcu_ctrlblk {
> long cur; /* Current batch number. */
> long completed; /* Number of the last completed batch */
> long pending; /* Number of the last pending batch */
> -#ifdef CONFIG_DEBUG_RCU_STALL
> - unsigned long gp_check; /* Time grace period should end, in seconds. */
> -#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
> +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
> + unsigned long gp_start; /* Time at which GP started in jiffies. */
> + unsigned long jiffies_stall;
> + /* Time at which to check for CPU stalls. */
> +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
>
> int signaled;
>
> diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
> index ed15128..eae2fb6 100644
> --- a/kernel/rcuclassic.c
> +++ b/kernel/rcuclassic.c
> @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
> }
> }
>
> +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
> +
> +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
> +{
> + rcp->gp_start = jiffies;
> + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
> +}
> +
> +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
> +{
> + int cpu;
> + long delta;
> + unsigned long flags;
> +
> + /* Only let one CPU complain about others per time interval. */
> +
> + spin_lock_irqsave(&rcp->lock, flags);
> + delta = jiffies - rcp->jiffies_stall;
> + if (delta < 2 || rcp->cur != rcp->completed) {
> + spin_unlock_irqrestore(&rcp->lock, flags);
> + return;
> + }
> + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
> + spin_unlock_irqrestore(&rcp->lock, flags);
> +
> + /* OK, time to rat on our buddy... */
> +
> + printk(KERN_ERR "RCU detected CPU stalls:");
> + for_each_possible_cpu(cpu) {
> + if (cpu_isset(cpu, rcp->cpumask))
> + printk(" %d", cpu);
> + }
> + printk(" (detected by %d, t=%ld jiffies)\n",
> + smp_processor_id(), (long)(jiffies - rcp->gp_start));
> +}
> +
> +static void print_cpu_stall(struct rcu_ctrlblk *rcp)
> +{
> + unsigned long flags;
> +
> + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
> + smp_processor_id(), jiffies,
> + jiffies - rcp->gp_start);
> + dump_stack();
> + spin_lock_irqsave(&rcp->lock, flags);
> + if ((long)(jiffies - rcp->jiffies_stall) >= 0)
> + rcp->jiffies_stall =
> + jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
> + spin_unlock_irqrestore(&rcp->lock, flags);
> + set_need_resched(); /* kick ourselves to get things going. */
> +}
> +
> +static void check_cpu_stall(struct rcu_ctrlblk *rcp)
> +{
> + long delta;
> +
> + delta = jiffies - rcp->jiffies_stall;
> + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
> +
> + /* We haven't checked in, so go dump stack. */
> + print_cpu_stall(rcp);
> +
> + } else if (rcp->cur != rcp->completed && delta >= 2) {
> +
> + /* They had two seconds to dump stack, so complain. */
> + print_other_cpu_stall(rcp);
> + }
> +}
> +
> +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
> +
> +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
> +{
> +}
> +
> +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
> +{
> +}
> +
> +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
> +
> /**
> * call_rcu - Queue an RCU callback for invocation after a grace period.
> * @head: structure to be used for queueing the RCU updates.
> @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
> * period (if necessary).
> */
>
> -#ifdef CONFIG_DEBUG_RCU_STALL
> -
> -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
> -{
> - rcp->gp_check = get_seconds() + 3;
> -}
> -
> -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
> -{
> - int cpu;
> - long delta;
> - unsigned long flags;
> -
> - /* Only let one CPU complain about others per time interval. */
> -
> - spin_lock_irqsave(&rcp->lock, flags);
> - delta = get_seconds() - rcp->gp_check;
> - if (delta < 2L || cpus_empty(rcp->cpumask)) {
> - spin_unlock(&rcp->lock);
> - return;
> - }
> - rcp->gp_check = get_seconds() + 30;
> - spin_unlock_irqrestore(&rcp->lock, flags);
> -
> - /* OK, time to rat on our buddy... */
> -
> - printk(KERN_ERR "RCU detected CPU stalls:");
> - for_each_cpu_mask(cpu, rcp->cpumask)
> - printk(" %d", cpu);
> - printk(" (detected by %d, t=%lu/%lu)\n",
> - smp_processor_id(), get_seconds(), rcp->gp_check);
> -}
> -
> -static void print_cpu_stall(struct rcu_ctrlblk *rcp)
> -{
> - unsigned long flags;
> -
> - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
> - smp_processor_id(), get_seconds(), rcp->gp_check);
> - dump_stack();
> - spin_lock_irqsave(&rcp->lock, flags);
> - if ((long)(get_seconds() - rcp->gp_check) >= 0L)
> - rcp->gp_check = get_seconds() + 30;
> - spin_unlock_irqrestore(&rcp->lock, flags);
> -}
> -
> -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
> -{
> - long delta;
> -
> - delta = get_seconds() - rcp->gp_check;
> - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
> -
> - /* We haven't checked in, so go dump stack. */
> -
> - print_cpu_stall(rcp);
> -
> - } else {
> - if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
> - /* They had two seconds to dump stack, so complain. */
> - print_other_cpu_stall(rcp);
> - }
> - }
> -}
> -
> -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
> -
> -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
> -{
> -}
> -
> -static inline void
> -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
> -{
> -}
> -
> -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
> -
> /*
> * Register a new batch of callbacks, and start it up if there is currently no
> * active batch and the batch to be registered has not already occurred.
> @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
> if (rcp->cur != rcp->pending &&
> rcp->completed == rcp->cur) {
> rcp->cur++;
> - record_gp_check_time(rcp);
> + record_gp_stall_check_time(rcp);
>
> /*
> * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
> @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
> static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
> {
> /* Check for CPU stalls, if enabled. */
> - check_cpu_stall(rcp, rdp);
> + check_cpu_stall(rcp);
>
> if (rdp->nxtlist) {
> long completed_snap = ACCESS_ONCE(rcp->completed);
> @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
> */
> void __init __rcu_init(void)
> {
> +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
> + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
> +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
> rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
> (void *)(long)smp_processor_id());
> /* Register notifier for non-boot CPUs */
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 4e921a8..e0e0582 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -616,7 +616,7 @@ config RCU_TORTURE_TEST_RUNNABLE
> Say N here if you want the RCU torture tests to start only
> after being manually enabled via /proc.
>
> -config RCU_CPU_STALL
> +config RCU_CPU_STALL_DETECTOR
> bool "Check for stalled CPUs delaying RCU grace periods"
> depends on CLASSIC_RCU
> default n
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
More information about the lttng-dev mailing list