* [PATCH] Fix a complex race in hrtimer code.
@ 2010-10-08  2:33 Salman Qazi
  2010-10-08 18:01 ` Salman Qazi
  2010-10-11 22:43 ` Andrew Morton
  0 siblings, 2 replies; 15+ messages in thread
From: Salman Qazi @ 2010-10-08  2:33 UTC (permalink / raw)
  To: akpm, linux-kernel

The race is described as follows:

CPU X                                 CPU Y
remove_hrtimer
// state & QUEUED == 0
timer->state = CALLBACK
unlock timer base
timer->f(n) //very long
                                  hrtimer_start
                                    lock timer base
                                    remove_hrtimer // no effect
                                    hrtimer_enqueue
                                    timer->state = CALLBACK |
                                                   QUEUED
                                    unlock timer base
                                  hrtimer_start
                                    lock timer base
                                    remove_hrtimer
                                        mode = INACTIVE
                                        // CALLBACK bit lost!
                                    switch_hrtimer_base
                                            CALLBACK bit not set:
                                                    timer->base
                                                    changes to a
                                                    different CPU.
lock this CPU's timer base

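The crux of the fix is a few lines of bit arithmetic on timer->state.  A
tiny user-space model of that arithmetic (illustrative only: the
HRTIMER_STATE_* values mirror include/linux/hrtimer.h, everything else
is made up for the example) shows how the unpatched remove_hrtimer()
loses the CALLBACK bit:

#include <stdio.h>

#define HRTIMER_STATE_INACTIVE  0x00
#define HRTIMER_STATE_ENQUEUED  0x01
#define HRTIMER_STATE_CALLBACK  0x02

int main(void)
{
        /* CPU X: __run_hrtimer() marks the callback as running and
         * drops the base lock before calling timer->function(). */
        unsigned long state = HRTIMER_STATE_CALLBACK;

        /* CPU Y: the first hrtimer_start() re-queues the timer; the
         * CALLBACK bit stays set alongside ENQUEUED. */
        state |= HRTIMER_STATE_ENQUEUED;

        /* CPU Y: the second hrtimer_start() calls remove_hrtimer(),
         * which (before the patch) overwrites the state with
         * INACTIVE ... */
        state = HRTIMER_STATE_INACTIVE;

        /* ... so switch_hrtimer_base() no longer sees a running
         * callback and migrates timer->base to another CPU while
         * CPU X is still inside the callback. */
        printf("callback running? %s\n",
               (state & HRTIMER_STATE_CALLBACK) ? "yes" : "no, bit lost");

        return 0;
}

With the fix, remove_hrtimer() passes (timer->state &
HRTIMER_STATE_CALLBACK) as the new state instead of INACTIVE, so
switch_hrtimer_base() still sees the callback running and leaves
timer->base alone.
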
Bug reproducible and fix testable using a kernel module that hrtimer_start()s
a timer with a very long running callback from multiple CPUs:

/* Headers needed to build this as a standalone module. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/nmi.h>
#include <linux/err.h>

MODULE_LICENSE("GPL");

static long timer_func_time = 1000;
module_param(timer_func_time, long, 0600);
static long timer_starts = 2500;
module_param(timer_starts, long, 0600);
static long timer_expire = 1000;
module_param(timer_expire, long, 0600);

DEFINE_PER_CPU(struct task_struct *, hrtimer_thread);

/* There are other issues, like deadlocks observed between multiple
 * hrtimer_start() calls (at least in 2.6.34), that this lock works
 * around.  Will look into those later.
 */
static DEFINE_SPINLOCK(blah_lock);

static ktime_t mytime;
static struct hrtimer timer;

static enum hrtimer_restart timer_restart(struct hrtimer *timer)
{
        unsigned long i;
        /* We have to make the callback longer to improve the
         * probability of having a race.
         */
        for (i = 0; i < timer_func_time / 100; i++) {
                touch_nmi_watchdog();
                touch_softlockup_watchdog();
                udelay(100);
        }

        return HRTIMER_NORESTART;
}

static int restart_hrtimer_repeatedly(void *input)
{
        int i;
        unsigned long range;
        while (!kthread_should_stop()) {
                for (i = 0; i < timer_starts; i++) {
                        /* Avoid deadlocks for now */
                        spin_lock(&blah_lock);
                        hrtimer_start(&timer, mytime, HRTIMER_MODE_REL);
                        spin_unlock(&blah_lock);
                        touch_nmi_watchdog();
                        touch_softlockup_watchdog();
                }
                cond_resched();
        }
        hrtimer_cancel(&timer);
        return 0;
}

static int hrtimer_unit_test_init(void)
{
        int cpu;
        mytime = ktime_set(0, 0);
        mytime = ktime_add_ns(mytime, timer_expire);
        hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        timer.function = timer_restart;
        for_each_online_cpu(cpu) {
                per_cpu(hrtimer_thread, cpu) = kthread_create(
                        restart_hrtimer_repeatedly, NULL, "hrtimer_test/%d",
                        cpu);
                if (IS_ERR(per_cpu(hrtimer_thread, cpu))) {
                       printk(KERN_ERR
                                "Failed to create hrtimer test thread\n");
                        BUG();
                }
                kthread_bind(per_cpu(hrtimer_thread, cpu), cpu);
                wake_up_process(per_cpu(hrtimer_thread, cpu));
        }
        return 0;
}

static void hrtimer_unit_test_exit(void)
{
        int cpu;
        for_each_online_cpu(cpu) {
                (void)kthread_stop(per_cpu(hrtimer_thread, cpu));
        }
}

module_init(hrtimer_unit_test_init);
module_exit(hrtimer_unit_test_exit);
---
 kernel/hrtimer.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1decafb..4769c51 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -944,8 +944,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
-		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
-				 reprogram);
+		__remove_hrtimer(timer, base,
+				 (timer->state & HRTIMER_STATE_CALLBACK), reprogram);
 		return 1;
 	}
 	return 0;
@@ -1231,6 +1231,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 		enqueue_hrtimer(timer, base);
 	}
+
+	BUG_ON(!(timer->state & HRTIMER_STATE_CALLBACK));
+
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 



* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-08  2:33 [PATCH] Fix a complex race in hrtimer code Salman Qazi
@ 2010-10-08 18:01 ` Salman Qazi
  2010-10-11 22:43 ` Andrew Morton
  1 sibling, 0 replies; 15+ messages in thread
From: Salman Qazi @ 2010-10-08 18:01 UTC (permalink / raw)
  To: akpm, linux-kernel

On Thu, Oct 7, 2010 at 7:33 PM, Salman Qazi <sqazi@google.com> wrote:
> The race is described as follows:
>
> CPU X                                 CPU Y
> remove_hrtimer
> // state & QUEUED == 0
> timer->state = CALLBACK
> unlock timer base
> timer->f(n) //very long
>                                  hrtimer_start
>                                    lock timer base
>                                    remove_hrtimer // no effect
>                                    hrtimer_enqueue
>                                    timer->state = CALLBACK |
>                                                   QUEUED
>                                    unlock timer base
>                                  hrtimer_start
>                                    lock timer base
>                                    remove_hrtimer
>                                        mode = INACTIVE
>                                        // CALLBACK bit lost!
>                                    switch_hrtimer_base
>                                            CALLBACK bit not set:
>                                                    timer->base
>                                                    changes to a
>                                                    different CPU.
> lock this CPU's timer base
>
> Bug reproducible and fix testable using a kernel module that hrtimer_start()s
> a timer with a very long running callback from multiple CPUs:
>
> MODULE_LICENSE("GPL");
>
> static long timer_func_time = 1000;
> module_param(timer_func_time, long, 0600);
> static long timer_starts = 2500;
> module_param(timer_starts, long, 0600);
> static long timer_expire = 1000;
> module_param(timer_expire, long, 0600);
>
> DEFINE_PER_CPU(struct task_struct *, hrtimer_thread);
>
> /* There are other issues, like deadlocks between multiple hrtimer_start observed
>  * calls, at least in 2.6.34 that this lock works around.  Will look into
>  * those later.
>  */
> static DEFINE_SPINLOCK(blah_lock);
>
> static ktime_t mytime;
> static struct hrtimer timer;
>
> static enum hrtimer_restart timer_restart(struct hrtimer *timer)
> {
>        unsigned long i;
>        /* We have to make the callback longer to improve the
>         * probability of having a race.
>         */
>        for (i = 0; i < timer_func_time / 100; i++) {
>                touch_nmi_watchdog();
>                touch_softlockup_watchdog();
>                udelay(100);
>        }
>
>        return HRTIMER_NORESTART;
> }
>
> static int restart_hrtimer_repeatedly(void *input)
> {
>        int i;
>        unsigned long range;
>        while (!kthread_should_stop()) {
>                for (i = 0; i < timer_starts; i++) {
>                        /* Avoid deadlocks for now */
>                        spin_lock(&blah_lock);
>                        hrtimer_start(&timer, mytime, HRTIMER_MODE_REL);
>                        spin_unlock(&blah_lock);
>                        touch_nmi_watchdog();
>                        touch_softlockup_watchdog();
>                }
>                cond_resched();
>        }
>        hrtimer_cancel(&timer);
>        return 0;
> }
>
> static int hrtimer_unit_test_init(void)
> {
>        int cpu;
>        mytime = ktime_set(0, 0);
>        mytime = ktime_add_ns(mytime, timer_expire);
>        hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>        timer.function = timer_restart;
>        for_each_online_cpu(cpu) {
>                per_cpu(hrtimer_thread, cpu) = kthread_create(
>                        restart_hrtimer_repeatedly, NULL, "hrtimer_test/%d",
>                        cpu);
>                if (IS_ERR(per_cpu(hrtimer_thread, cpu))) {
>                       printk(KERN_ERR
>                                "Failed to create hrtimer test thread\n");
>                        BUG();
>                }
>                kthread_bind(per_cpu(hrtimer_thread, cpu), cpu);
>                wake_up_process(per_cpu(hrtimer_thread, cpu));
>        }
>        return 0;
> }
>
> static void hrtimer_unit_test_exit(void)
> {
>        int cpu;
>        for_each_online_cpu(cpu) {
>                (void)kthread_stop(per_cpu(hrtimer_thread, cpu));
>        }
> }
>
> module_init(hrtimer_unit_test_init);
> module_exit(hrtimer_unit_test_exit);
> ---
>  kernel/hrtimer.c |    7 +++++--
>  1 files changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
> index 1decafb..4769c51 100644
> --- a/kernel/hrtimer.c
> +++ b/kernel/hrtimer.c
> @@ -944,8 +944,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
>                debug_deactivate(timer);
>                timer_stats_hrtimer_clear_start_info(timer);
>                reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
> -               __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
> -                                reprogram);
> +               __remove_hrtimer(timer, base,
> +                                (timer->state & HRTIMER_STATE_CALLBACK), reprogram);
>                return 1;
>        }
>        return 0;
> @@ -1231,6 +1231,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
>                BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
>                enqueue_hrtimer(timer, base);
>        }
> +
> +       BUG_ON(!(timer->state & HRTIMER_STATE_CALLBACK));
> +
>        timer->state &= ~HRTIMER_STATE_CALLBACK;
>  }
>
>
>

Oh, I forgot to include the crash dumps, which we can produce in less
than a minute without this patch by enabling the NMI watchdog.
Basically, we get rb_tree corruption that results in an infinite loop
in rb_insert_color.  I am not sure about the exact sequence of events
from the original loss of the CALLBACK bit to this corruption in 2.6.36
(as I first debugged it in 2.6.26, *twenty six*); however, having the
patch makes it go away, just as it does for the similar corruption in
2.6.26:

cim7:~# [  143.712006] BUG: NMI Watchdog detected LOCKUP on CPU1, ip
ffffffff8120b09a, registers:
[  143.712006] CPU 1
[  143.712006] Modules linked in: hrtimer_unit_test ipv6 sata_sil tg3
msr cpuid eeprom genrtc
[  143.712006]
[  143.712006] Pid: 4925, comm: hrtimer_test/1 Not tainted
2.6.36-smp-DEV #4 Unicorn_QCS_00  /E7320,6300ESB
[  143.712006] RIP: 0010:[<ffffffff8120b09a>]  [<ffffffff8120b09a>]
rb_insert_color+0x6a/0xe5
[  143.712006] RSP: 0018:ffff880013e83dd8  EFLAGS: 00000082
[  143.712006] RAX: 0000000000000000 RBX: ffffffffa000d680 RCX: ffffffffa000d680
[  143.712006] RDX: ffffffffa000d680 RSI: ffff88001440f298 RDI: ffffffffa000d680
[  143.712006] RBP: ffff880013e83e00 R08: 0000000000000000 R09: ffffffff818e7178
[  143.712006] R10: ffff8800081a2000 R11: ffff88000bc65d70 R12: ffffffffa000d680
[  143.712006] R13: ffffffffa000d680 R14: ffff88001440f298 R15: ffffffffa000d680
[  143.712006] FS:  0000000000000000(0000) GS:ffff88000c400000(0000)
knlGS:0000000000000000
[  143.712006] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  143.712006] CR2: 00000000f686a000 CR3: 00000000080db000 CR4: 00000000000006e0
[  143.712006] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  143.712006] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  143.712006] Process hrtimer_test/1 (pid: 4925, threadinfo
ffff880013e82000, task ffff88000bd69740)
[  143.712006] Stack:
[  143.712006]  ffffffffa000d680 0000000000000000 ffffffffa000d688
ffff88001440f288
[  143.712006] <0> ffff88001440f288 ffff880013e83e30 ffffffff81085645
ffffffffa000d680
[  143.712006] <0> 000000203856036c 0000000000000000 0000000000000001
ffff880013e83eb0
[  143.712006] Call Trace:
[  143.712006]  [<ffffffff81085645>] enqueue_hrtimer+0x9d/0xae
[  143.712006]  [<ffffffff81086092>] __hrtimer_start_range_ns+0x1a8/0x2a6
[  143.712006]  [<ffffffffa000d155>] ?
restart_hrtimer_repeatedly+0x0/0x84 [hrtimer_unit_test]
[  143.712006]  [<ffffffff810861a8>] hrtimer_start+0x18/0x1a
[  143.712006]  [<ffffffffa000d189>]
restart_hrtimer_repeatedly+0x34/0x84 [hrtimer_unit_test]
[  143.712006]  [<ffffffffa000d155>] ?
restart_hrtimer_repeatedly+0x0/0x84 [hrtimer_unit_test]
[  143.712006]  [<ffffffff81082223>] kthread+0x82/0x8a
[  143.712006]  [<ffffffff81031994>] kernel_thread_helper+0x4/0x10
[  143.712006]  [<ffffffff810821a1>] ? kthread+0x0/0x8a
[  143.712006]  [<ffffffff81031990>] ? kernel_thread_helper+0x0/0x10
[  143.712006] Code: 74 05 49 89 df eb 0e 4c 89 f6 48 89 df 49 89 dd
e8 ec fe ff ff 49 83 0f 01 4c 89 f6 4c 89 e7 49 83 24 24 fe e8 38 ff
ff ff eb 52 <48> 85 c0 74 1d 48 8b 10 f6 c2 01 75 15 48 83 ca 01 4d 89
e5 48
[  143.712006] ---[ end trace a701d1f011c8230f ]---
[  143.712006] Kernel panic - not syncing: Non maskable interrupt
[  143.712006] Pid: 4925, comm: hrtimer_test/1 Tainted: G      D
2.6.36-smp-DEV #4
[  143.712006] Call Trace:
[  143.712006]  <NMI>  [<ffffffff8144a6d8>] panic+0x91/0x197
[  143.712006]  [<ffffffff8144d96c>] die_nmi+0xa4/0xe2
[  143.712006]  [<ffffffff8144dfd0>] nmi_watchdog_tick+0xff/0x16d
[  143.712006]  [<ffffffff8144d4db>] do_nmi+0xe2/0x29c
[  143.712006]  [<ffffffff8144cf3a>] nmi+0x1a/0x20
[  143.712006]  [<ffffffff8120b09a>] ? rb_insert_color+0x6a/0xe5
[  143.712006]  <<EOE>>  [<ffffffff81085645>] enqueue_hrtimer+0x9d/0xae
[  143.712006]  [<ffffffff81086092>] __hrtimer_start_range_ns+0x1a8/0x2a6
[  143.712006]  [<ffffffffa000d155>] ?
restart_hrtimer_repeatedly+0x0/0x84 [hrtimer_unit_test]
[  143.712006]  [<ffffffff810861a8>] hrtimer_start+0x18/0x1a
[  143.712006]  [<ffffffffa000d189>]
restart_hrtimer_repeatedly+0x34/0x84 [hrtimer_unit_test]
[  143.712006]  [<ffffffffa000d155>] ?
restart_hrtimer_repeatedly+0x0/0x84 [hrtimer_unit_test]
[  143.712006]  [<ffffffff81082223>] kthread+0x82/0x8a
[  143.712006]  [<ffffffff81031994>] kernel_thread_helper+0x4/0x10
[  143.712006]  [<ffffffff810821a1>] ? kthread+0x0/0x8a
[  143.712006]  [<ffffffff81031990>] ? kernel_thread_helper+0x0/0x10
[  143.712006] Rebooting in 10 seconds..
Booting...


* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-08  2:33 [PATCH] Fix a complex race in hrtimer code Salman Qazi
  2010-10-08 18:01 ` Salman Qazi
@ 2010-10-11 22:43 ` Andrew Morton
  2010-10-11 23:18   ` Salman Qazi
  1 sibling, 1 reply; 15+ messages in thread
From: Andrew Morton @ 2010-10-11 22:43 UTC (permalink / raw)
  To: Salman Qazi; +Cc: linux-kernel, Thomas Gleixner

On Thu, 07 Oct 2010 19:33:56 -0700
Salman Qazi <sqazi@google.com> wrote:

> The race is described as follows:
> 
> CPU X                                 CPU Y
> remove_hrtimer
> // state & QUEUED == 0
> timer->state = CALLBACK
> unlock timer base
> timer->f(n) //very long
>                                   hrtimer_start
>                                     lock timer base
>                                     remove_hrtimer // no effect
>                                     hrtimer_enqueue
>                                     timer->state = CALLBACK |
>                                                    QUEUED
>                                     unlock timer base
>                                   hrtimer_start
>                                     lock timer base
>                                     remove_hrtimer
>                                         mode = INACTIVE
>                                         // CALLBACK bit lost!
>                                     switch_hrtimer_base
>                                             CALLBACK bit not set:
>                                                     timer->base
>                                                     changes to a
>                                                     different CPU.
> lock this CPU's timer base
> 
> Bug reproducible and fix testable using a kernel module that hrtimer_start()s
> a timer with a very long running callback from multiple CPUs:
> 
> MODULE_LICENSE("GPL");
> 
> static long timer_func_time = 1000;
> module_param(timer_func_time, long, 0600);
> static long timer_starts = 2500;
> module_param(timer_starts, long, 0600);
> static long timer_expire = 1000;
> module_param(timer_expire, long, 0600);
> 
> DEFINE_PER_CPU(struct task_struct *, hrtimer_thread);
> 
> /* There are other issues, like deadlocks between multiple hrtimer_start observed
>  * calls, at least in 2.6.34 that this lock works around.  Will look into
>  * those later.
>  */
> static DEFINE_SPINLOCK(blah_lock);
> 
> static ktime_t mytime;
> static struct hrtimer timer;
> 
> static enum hrtimer_restart timer_restart(struct hrtimer *timer)
> {
>         unsigned long i;
>         /* We have to make the callback longer to improve the
>          * probability of having a race.
>          */
>         for (i = 0; i < timer_func_time / 100; i++) {
>                 touch_nmi_watchdog();
>                 touch_softlockup_watchdog();
>                 udelay(100);
>         }
> 
>         return HRTIMER_NORESTART;
> }
> 
> static int restart_hrtimer_repeatedly(void *input)
> {
>         int i;
>         unsigned long range;
>         while (!kthread_should_stop()) {
>                 for (i = 0; i < timer_starts; i++) {
>                         /* Avoid deadlocks for now */
>                         spin_lock(&blah_lock);
>                         hrtimer_start(&timer, mytime, HRTIMER_MODE_REL);
>                         spin_unlock(&blah_lock);
>                         touch_nmi_watchdog();
>                         touch_softlockup_watchdog();
>                 }
>                 cond_resched();
>         }
>         hrtimer_cancel(&timer);
>         return 0;
> }
> 
> static int hrtimer_unit_test_init(void)
> {
>         int cpu;
>         mytime = ktime_set(0, 0);
>         mytime = ktime_add_ns(mytime, timer_expire);
>         hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>         timer.function = timer_restart;
>         for_each_online_cpu(cpu) {
>                 per_cpu(hrtimer_thread, cpu) = kthread_create(
>                         restart_hrtimer_repeatedly, NULL, "hrtimer_test/%d",
>                         cpu);
>                 if (IS_ERR(per_cpu(hrtimer_thread, cpu))) {
>                        printk(KERN_ERR
>                                 "Failed to create hrtimer test thread\n");
>                         BUG();
>                 }
>                 kthread_bind(per_cpu(hrtimer_thread, cpu), cpu);
>                 wake_up_process(per_cpu(hrtimer_thread, cpu));
>         }
>         return 0;
> }
> 
> static void hrtimer_unit_test_exit(void)
> {
>         int cpu;
>         for_each_online_cpu(cpu) {
>                 (void)kthread_stop(per_cpu(hrtimer_thread, cpu));
>         }
> }
> 
> module_init(hrtimer_unit_test_init);
> module_exit(hrtimer_unit_test_exit);
> ---
>  kernel/hrtimer.c |    7 +++++--
>  1 files changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
> index 1decafb..4769c51 100644
> --- a/kernel/hrtimer.c
> +++ b/kernel/hrtimer.c
> @@ -944,8 +944,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
>  		debug_deactivate(timer);
>  		timer_stats_hrtimer_clear_start_info(timer);
>  		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
> -		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
> -				 reprogram);
> +		__remove_hrtimer(timer, base,
> +				 (timer->state & HRTIMER_STATE_CALLBACK), reprogram);
>  		return 1;
>  	}
>  	return 0;
> @@ -1231,6 +1231,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
>  		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
>  		enqueue_hrtimer(timer, base);
>  	}
> +
> +	BUG_ON(!(timer->state & HRTIMER_STATE_CALLBACK));
> +
>  	timer->state &= ~HRTIMER_STATE_CALLBACK;
>  }

I worry about the BUG_ON(), especially if we're going to put this into
2.6.36 and -stable (as we probably should).  If it triggers
unexpectedly, we just made a big mess.

It's generally better to use a non-fatal assertion, often a
triggers-once one (WARN_ON_ONCE()).  That way our mistake doesn't result in
killing people's machines or flooding their logs.

The one case where BUG_ON() _is_ justified is when we expect that the
kernel is utterly hosed, perhaps to the point of corrupting user data
or application results.

Another downside of using BUG_ON() is that it makes the problem harder
to solve: often the user's box is now dead, sometimes to the point that
the trace didn't even get into the log files.  The user cannot get
control of the machine to have a look in /proc files or anything else. 
We really did shoot our foot off.


People are using BUG waaaaay too often.
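
In this case, the non-fatal variant of the new assertion looks like
this (and it is what the re-spun patch later in the thread switches
to):

	/* Fatal: an unexpected trigger kills the machine, possibly
	 * before the trace ever reaches the logs. */
	BUG_ON(!(timer->state & HRTIMER_STATE_CALLBACK));

	/* Non-fatal, reported once: logs a stack trace the first time
	 * the invariant is violated, then lets the box live on. */
	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));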


* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-11 22:43 ` Andrew Morton
@ 2010-10-11 23:18   ` Salman Qazi
  2010-10-11 23:23     ` Andrew Morton
  0 siblings, 1 reply; 15+ messages in thread
From: Salman Qazi @ 2010-10-11 23:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Thomas Gleixner

On Mon, Oct 11, 2010 at 3:43 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Thu, 07 Oct 2010 19:33:56 -0700
> Salman Qazi <sqazi@google.com> wrote:
>
>> The race is described as follows:
>>
>> CPU X                                 CPU Y
>> remove_hrtimer
>> // state & QUEUED == 0
>> timer->state = CALLBACK
>> unlock timer base
>> timer->f(n) //very long
>>                                   hrtimer_start
>>                                     lock timer base
>>                                     remove_hrtimer // no effect
>>                                     hrtimer_enqueue
>>                                     timer->state = CALLBACK |
>>                                                    QUEUED
>>                                     unlock timer base
>>                                   hrtimer_start
>>                                     lock timer base
>>                                     remove_hrtimer
>>                                         mode = INACTIVE
>>                                         // CALLBACK bit lost!
>>                                     switch_hrtimer_base
>>                                             CALLBACK bit not set:
>>                                                     timer->base
>>                                                     changes to a
>>                                                     different CPU.
>> lock this CPU's timer base
>>
>> Bug reproducible and fix testable using a kernel module that hrtimer_start()s
>> a timer with a very long running callback from multiple CPUs:
>>
>> MODULE_LICENSE("GPL");
>>
>> static long timer_func_time = 1000;
>> module_param(timer_func_time, long, 0600);
>> static long timer_starts = 2500;
>> module_param(timer_starts, long, 0600);
>> static long timer_expire = 1000;
>> module_param(timer_expire, long, 0600);
>>
>> DEFINE_PER_CPU(struct task_struct *, hrtimer_thread);
>>
>> /* There are other issues, like deadlocks between multiple hrtimer_start observed
>>  * calls, at least in 2.6.34 that this lock works around.  Will look into
>>  * those later.
>>  */
>> static DEFINE_SPINLOCK(blah_lock);
>>
>> static ktime_t mytime;
>> static struct hrtimer timer;
>>
>> static enum hrtimer_restart timer_restart(struct hrtimer *timer)
>> {
>>         unsigned long i;
>>         /* We have to make the callback longer to improve the
>>          * probability of having a race.
>>          */
>>         for (i = 0; i < timer_func_time / 100; i++) {
>>                 touch_nmi_watchdog();
>>                 touch_softlockup_watchdog();
>>                 udelay(100);
>>         }
>>
>>         return HRTIMER_NORESTART;
>> }
>>
>> static int restart_hrtimer_repeatedly(void *input)
>> {
>>         int i;
>>         unsigned long range;
>>         while (!kthread_should_stop()) {
>>                 for (i = 0; i < timer_starts; i++) {
>>                         /* Avoid deadlocks for now */
>>                         spin_lock(&blah_lock);
>>                         hrtimer_start(&timer, mytime, HRTIMER_MODE_REL);
>>                         spin_unlock(&blah_lock);
>>                         touch_nmi_watchdog();
>>                         touch_softlockup_watchdog();
>>                 }
>>                 cond_resched();
>>         }
>>         hrtimer_cancel(&timer);
>>         return 0;
>> }
>>
>> static int hrtimer_unit_test_init(void)
>> {
>>         int cpu;
>>         mytime = ktime_set(0, 0);
>>         mytime = ktime_add_ns(mytime, timer_expire);
>>         hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>>         timer.function = timer_restart;
>>         for_each_online_cpu(cpu) {
>>                 per_cpu(hrtimer_thread, cpu) = kthread_create(
>>                         restart_hrtimer_repeatedly, NULL, "hrtimer_test/%d",
>>                         cpu);
>>                 if (IS_ERR(per_cpu(hrtimer_thread, cpu))) {
>>                        printk(KERN_ERR
>>                                 "Failed to create hrtimer test thread\n");
>>                         BUG();
>>                 }
>>                 kthread_bind(per_cpu(hrtimer_thread, cpu), cpu);
>>                 wake_up_process(per_cpu(hrtimer_thread, cpu));
>>         }
>>         return 0;
>> }
>>
>> static void hrtimer_unit_test_exit(void)
>> {
>>         int cpu;
>>         for_each_online_cpu(cpu) {
>>                 (void)kthread_stop(per_cpu(hrtimer_thread, cpu));
>>         }
>> }
>>
>> module_init(hrtimer_unit_test_init);
>> module_exit(hrtimer_unit_test_exit);
>> ---
>>  kernel/hrtimer.c |    7 +++++--
>>  1 files changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
>> index 1decafb..4769c51 100644
>> --- a/kernel/hrtimer.c
>> +++ b/kernel/hrtimer.c
>> @@ -944,8 +944,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
>>               debug_deactivate(timer);
>>               timer_stats_hrtimer_clear_start_info(timer);
>>               reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
>> -             __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
>> -                              reprogram);
>> +             __remove_hrtimer(timer, base,
>> +                              (timer->state & HRTIMER_STATE_CALLBACK), reprogram);
>>               return 1;
>>       }
>>       return 0;
>> @@ -1231,6 +1231,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
>>               BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
>>               enqueue_hrtimer(timer, base);
>>       }
>> +
>> +     BUG_ON(!(timer->state & HRTIMER_STATE_CALLBACK));
>> +
>>       timer->state &= ~HRTIMER_STATE_CALLBACK;
>>  }
>
> I worry about the BUG_ON(), especially if we're going to put this into
> 2.6.36 and -stable (as we probably should).  If it triggers
> unexpectedly, we just made a big mess.
>
> It's generally better to use a non-fatal assertion, often a
> triggers-once one (WARN_ON_ONCE()).  That way our mistake doesn't result in
> killing people's machines or flooding their logs.
>
> The one case where BUG_ON() _is_ justified is when we expect that the
> kernel is utterly hosed, perhaps to the point of corrupting user data
> or application results.
>
> Another downside of using BUG_ON() is that it makes the problem harder
> to solve: often the user's box is now dead, sometimes to the point that
> the trace didn't even get into the log files.  The user cannot get
> control of the machine to have a look in /proc files or anything else.
> We really did shoot our foot off.
>
>
> People are using BUG waaaaay too often.
>

If people are attentive about WARN_ON_ONCE, then I suppose that is a
better alternative.

One of the reasons why I personally like BUG_ON is that it catches
problems early and makes people look (i.e. one can't ignore it, just
because one didn't notice, understand or care) before it compounds any
further.  It is very hard to debug a corrupted rbtree.  It is much
easier to understand and fix the fact that our assumptions regarding
the hrtimer state and locking are incorrect.

I'd rather have a BUG_ON firing 100 times, close to the root cause,
instead of one case of unexplained data corruption far from the root
cause.  Just my $0.02.


* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-11 23:18   ` Salman Qazi
@ 2010-10-11 23:23     ` Andrew Morton
  2010-10-11 23:43       ` Salman Qazi
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Morton @ 2010-10-11 23:23 UTC (permalink / raw)
  To: Salman Qazi; +Cc: linux-kernel, Thomas Gleixner

On Mon, 11 Oct 2010 16:18:52 -0700
Salman Qazi <sqazi@google.com> wrote:

> On Mon, Oct 11, 2010 at 3:43 PM, Andrew Morton
> <akpm@linux-foundation.org> wrote:
>
> > People are using BUG waaaaay too often.
> >
> 
> If people are attentive about WARN_ON_ONCE, then I suppose that is a
> better alternative.
> 
> One of the reasons why I personally like BUG_ON is that it catches
> problems early and makes people look (i.e. one can't ignore it, just
> because one didn't notice, understand or care) before it compounds any
> further.  It is very hard to debug a corrupted radix tree.  It is much
> easier to understand and fix the fact that our assumptions regarding
> the hrtimer state and locking are incorrect.
> 
> I'd rather have a BUG_ON firing 100 times, close to the root cause,
> instead of one case of unexplained data corruption far from the root
> cause.  Just my $0.02.

You're assuming that you are the one who observes the BUG_ON.  In a
large number of cases (probably the majority) it's some random person
on another continent.  Their box is dead and after a reboot there's
nothing in the logs.  Now what?



* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-11 23:23     ` Andrew Morton
@ 2010-10-11 23:43       ` Salman Qazi
  2010-10-12  0:02         ` Salman Qazi
  0 siblings, 1 reply; 15+ messages in thread
From: Salman Qazi @ 2010-10-11 23:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Thomas Gleixner

On Mon, Oct 11, 2010 at 4:23 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Mon, 11 Oct 2010 16:18:52 -0700
> Salman Qazi <sqazi@google.com> wrote:
>
>> On Mon, Oct 11, 2010 at 3:43 PM, Andrew Morton
>> <akpm@linux-foundation.org> wrote:
>>
>> > People are using BUG waaaaay too often.
>> >
>>
>> If people are attentive about WARN_ON_ONCE, then I suppose that is a
>> better alternative.
>>
>> One of the reasons why I personally like BUG_ON is that it catches
>> problems early and makes people look (i.e. one can't ignore it, just
>> because one didn't notice, understand or care) before it compounds any
>> further.  It is very hard to debug a corrupted radix tree.  It is much
>> easier to understand and fix the fact that our assumptions regarding
>> the hrtimer state and locking are incorrect.
>>
>> I'd rather have a BUG_ON firing 100 times, close to the root cause,
>> instead of one case of unexplained data corruption far from the root
>> cause.  Just my $0.02.
>
> You're assuming that you are the one who observes the BUG_ON.  In a
> large number of cases (probably the majority) it's some random person
> on another continent.  Their box is dead and after a reboot there's
> nothing in the logs.  Now what?
>
>

Fair enough.  I'll send a patch with a WARN_ON_ONCE instead.


* [PATCH] Fix a complex race in hrtimer code.
  2010-10-11 23:43       ` Salman Qazi
@ 2010-10-12  0:02         ` Salman Qazi
  2010-10-12  8:49           ` Thomas Gleixner
  0 siblings, 1 reply; 15+ messages in thread
From: Salman Qazi @ 2010-10-12  0:02 UTC (permalink / raw)
  To: akpm, tglx, linux-kernel

The race is described as follows:

CPU X                                 CPU Y
remove_hrtimer
// state & QUEUED == 0
timer->state = CALLBACK
unlock timer base
timer->f(n) //very long
                                  hrtimer_start
                                    lock timer base
                                    remove_hrtimer // no effect
                                    hrtimer_enqueue
                                    timer->state = CALLBACK |
                                                   QUEUED
                                    unlock timer base
                                  hrtimer_start
                                    lock timer base
                                    remove_hrtimer
                                        mode = INACTIVE
                                        // CALLBACK bit lost!
                                    switch_hrtimer_base
                                            CALLBACK bit not set:
                                                    timer->base
                                                    changes to a
                                                    different CPU.
lock this CPU's timer base

Bug reproducible and fix testable using a kernel module that hrtimer_start()s
a timer with a very long running callback from multiple CPUs:

MODULE_LICENSE("GPL");

static long timer_func_time = 1000;
module_param(timer_func_time, long, 0600);
static long timer_starts = 2500;
module_param(timer_starts, long, 0600);
static long timer_expire = 1000;
module_param(timer_expire, long, 0600);

DEFINE_PER_CPU(struct task_struct *, hrtimer_thread);

/* There are other issues, like deadlocks observed between multiple
 * hrtimer_start() calls (at least in 2.6.34), that this lock works
 * around.  Will look into those later.
 */
static DEFINE_SPINLOCK(blah_lock);

static ktime_t mytime;
static struct hrtimer timer;

static enum hrtimer_restart timer_restart(struct hrtimer *timer)
{
        unsigned long i;
        /* We have to make the callback longer to improve the
         * probability of having a race.
         */
        for (i = 0; i < timer_func_time / 100; i++) {
                touch_nmi_watchdog();
                touch_softlockup_watchdog();
                udelay(100);
        }

        return HRTIMER_NORESTART;
}

static int restart_hrtimer_repeatedly(void *input)
{
        int i;
        unsigned long range;
        while (!kthread_should_stop()) {
                for (i = 0; i < timer_starts; i++) {
                        /* Avoid deadlocks for now */
                        spin_lock(&blah_lock);
                        hrtimer_start(&timer, mytime, HRTIMER_MODE_REL);
                        spin_unlock(&blah_lock);
                        touch_nmi_watchdog();
                        touch_softlockup_watchdog();
                }
                cond_resched();
        }
        hrtimer_cancel(&timer);
        return 0;
}

static int hrtimer_unit_test_init(void)
{
        int cpu;
        mytime = ktime_set(0, 0);
        mytime = ktime_add_ns(mytime, timer_expire);
        hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        timer.function = timer_restart;
        for_each_online_cpu(cpu) {
                per_cpu(hrtimer_thread, cpu) = kthread_create(
                        restart_hrtimer_repeatedly, NULL, "hrtimer_test/%d",
                        cpu);
                if (IS_ERR(per_cpu(hrtimer_thread, cpu))) {
                       printk(KERN_ERR
                                "Failed to create hrtimer test thread\n");
                        BUG();
                }
                kthread_bind(per_cpu(hrtimer_thread, cpu), cpu);
                wake_up_process(per_cpu(hrtimer_thread, cpu));
        }
        return 0;
}

static void hrtimer_unit_test_exit(void)
{
        int cpu;
        for_each_online_cpu(cpu) {
                (void)kthread_stop(per_cpu(hrtimer_thread, cpu));
        }
}

module_init(hrtimer_unit_test_init);
module_exit(hrtimer_unit_test_exit);
---
 kernel/hrtimer.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1decafb..57846d5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -944,8 +944,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
-		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
-				 reprogram);
+		__remove_hrtimer(timer, base,
+				 (timer->state & HRTIMER_STATE_CALLBACK), reprogram);
 		return 1;
 	}
 	return 0;
@@ -1231,6 +1231,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 		enqueue_hrtimer(timer, base);
 	}
+
+	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 



* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-12  0:02         ` Salman Qazi
@ 2010-10-12  8:49           ` Thomas Gleixner
  2010-10-12 14:25             ` Salman Qazi
  2010-10-12 14:28             ` [PATCH] Fix a complex race in hrtimer code Salman Qazi
  0 siblings, 2 replies; 15+ messages in thread
From: Thomas Gleixner @ 2010-10-12  8:49 UTC (permalink / raw)
  To: Salman Qazi; +Cc: akpm, linux-kernel

On Mon, 11 Oct 2010, Salman Qazi wrote:

> The race is described as follows:
> 
> CPU X                                 CPU Y
> remove_hrtimer
> // state & QUEUED == 0
> timer->state = CALLBACK
> unlock timer base
> timer->f(n) //very long
>                                   hrtimer_start
>                                     lock timer base
>                                     remove_hrtimer // no effect
>                                     hrtimer_enqueue
>                                     timer->state = CALLBACK |
>                                                    QUEUED
>                                     unlock timer base
>                                   hrtimer_start
>                                     lock timer base
>                                     remove_hrtimer
>                                         mode = INACTIVE
>                                         // CALLBACK bit lost!
>                                     switch_hrtimer_base
>                                             CALLBACK bit not set:
>                                                     timer->base
>                                                     changes to a
>                                                     different CPU.
> lock this CPU's timer base

Good catch.
 
> Bug reproducible and fix testable using a kernel module that hrtimer_start()s
> a timer with a very long running callback from multiple CPUs:
> 
> MODULE_LICENSE("GPL");
> 
> static long timer_func_time = 1000;
> module_param(timer_func_time, long, 0600);
> static long timer_starts = 2500;
> module_param(timer_starts, long, 0600);
> static long timer_expire = 1000;
> module_param(timer_expire, long, 0600);
> 
> DEFINE_PER_CPU(struct task_struct *, hrtimer_thread);
> 
> /* There are other issues, like deadlocks between multiple hrtimer_start observed
>  * calls, at least in 2.6.34 that this lock works around.  Will look into
>  * those later.

Well, we don't have to work around callsites not serializing themselves
in the core code, right?

> ---
>  kernel/hrtimer.c |    7 +++++--
>  1 files changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
> index 1decafb..57846d5 100644
> --- a/kernel/hrtimer.c
> +++ b/kernel/hrtimer.c
> @@ -944,8 +944,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
>  		debug_deactivate(timer);
>  		timer_stats_hrtimer_clear_start_info(timer);
>  		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
> -		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
> -				 reprogram);
> +		__remove_hrtimer(timer, base,
> +				 (timer->state & HRTIMER_STATE_CALLBACK), reprogram);
>  		return 1;
>  	}
>  	return 0;
> @@ -1231,6 +1231,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
>  		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
>  		enqueue_hrtimer(timer, base);
>  	}
> +
> +	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
> +
>  	timer->state &= ~HRTIMER_STATE_CALLBACK;
>  }

Can I please get your Signed-off-by so this can be queued ?

Thanks,

	tglx
 


* [PATCH] Fix a complex race in hrtimer code.
  2010-10-12  8:49           ` Thomas Gleixner
@ 2010-10-12 14:25             ` Salman Qazi
  2010-10-14 11:34               ` [tip:timers/urgent] hrtimer: Preserve timer state in remove_hrtimer() tip-bot for Salman Qazi
  2010-10-12 14:28             ` [PATCH] Fix a complex race in hrtimer code Salman Qazi
  1 sibling, 1 reply; 15+ messages in thread
From: Salman Qazi @ 2010-10-12 14:25 UTC (permalink / raw)
  To: akpm, tglx, linux-kernel

The race is described as follows:

CPU X                                 CPU Y
remove_hrtimer
// state & QUEUED == 0
timer->state = CALLBACK
unlock timer base
timer->f(n) //very long
                                  hrtimer_start
                                    lock timer base
                                    remove_hrtimer // no effect
                                    hrtimer_enqueue
                                    timer->state = CALLBACK |
                                                   QUEUED
                                    unlock timer base
                                  hrtimer_start
                                    lock timer base
                                    remove_hrtimer
                                        mode = INACTIVE
                                        // CALLBACK bit lost!
                                    switch_hrtimer_base
                                            CALLBACK bit not set:
                                                    timer->base
                                                    changes to a
                                                    different CPU.
lock this CPU's timer base

Bug reproducible and fix testable using a kernel module that hrtimer_start()s
a timer with a very long running callback from multiple CPUs:

MODULE_LICENSE("GPL");

static long timer_func_time = 1000;
module_param(timer_func_time, long, 0600);
static long timer_starts = 2500;
module_param(timer_starts, long, 0600);
static long timer_expire = 1000;
module_param(timer_expire, long, 0600);

DEFINE_PER_CPU(struct task_struct *, hrtimer_thread);

/* There are other issues, like deadlocks observed between multiple
 * hrtimer_start() calls (at least in 2.6.34), that this lock works
 * around.  Will look into those later.
 */
static DEFINE_SPINLOCK(blah_lock);

static ktime_t mytime;
static struct hrtimer timer;

static enum hrtimer_restart timer_restart(struct hrtimer *timer)
{
        unsigned long i;
        /* We have to make the callback longer to improve the
         * probability of having a race.
         */
        for (i = 0; i < timer_func_time / 100; i++) {
                touch_nmi_watchdog();
                touch_softlockup_watchdog();
                udelay(100);
        }

        return HRTIMER_NORESTART;
}

static int restart_hrtimer_repeatedly(void *input)
{
        int i;
        unsigned long range;
        while (!kthread_should_stop()) {
                for (i = 0; i < timer_starts; i++) {
                        /* Avoid deadlocks for now */
                        spin_lock(&blah_lock);
                        hrtimer_start(&timer, mytime, HRTIMER_MODE_REL);
                        spin_unlock(&blah_lock);
                        touch_nmi_watchdog();
                        touch_softlockup_watchdog();
                }
                cond_resched();
        }
        hrtimer_cancel(&timer);
        return 0;
}

static int hrtimer_unit_test_init(void)
{
        int cpu;
        mytime = ktime_set(0, 0);
        mytime = ktime_add_ns(mytime, timer_expire);
        hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        timer.function = timer_restart;
        for_each_online_cpu(cpu) {
                per_cpu(hrtimer_thread, cpu) = kthread_create(
                        restart_hrtimer_repeatedly, NULL, "hrtimer_test/%d",
                        cpu);
                if (IS_ERR(per_cpu(hrtimer_thread, cpu))) {
                       printk(KERN_ERR
                                "Failed to create hrtimer test thread\n");
                        BUG();
                }
                kthread_bind(per_cpu(hrtimer_thread, cpu), cpu);
                wake_up_process(per_cpu(hrtimer_thread, cpu));
        }
        return 0;
}

static void hrtimer_unit_test_exit(void)
{
        int cpu;
        for_each_online_cpu(cpu) {
                (void)kthread_stop(per_cpu(hrtimer_thread, cpu));
        }
}

module_init(hrtimer_unit_test_init);
module_exit(hrtimer_unit_test_exit);

Signed-off-by: Salman Qazi <sqazi@google.com>
---
 kernel/hrtimer.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1decafb..57846d5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -944,8 +944,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
-		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
-				 reprogram);
+		__remove_hrtimer(timer, base,
+				 (timer->state & HRTIMER_STATE_CALLBACK), reprogram);
 		return 1;
 	}
 	return 0;
@@ -1231,6 +1231,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 		enqueue_hrtimer(timer, base);
 	}
+
+	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 



* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-12  8:49           ` Thomas Gleixner
  2010-10-12 14:25             ` Salman Qazi
@ 2010-10-12 14:28             ` Salman Qazi
  2010-10-12 16:54               ` Thomas Gleixner
  1 sibling, 1 reply; 15+ messages in thread
From: Salman Qazi @ 2010-10-12 14:28 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: akpm, linux-kernel

On Tue, Oct 12, 2010 at 1:49 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Mon, 11 Oct 2010, Salman Qazi wrote:
>
>> The race is described as follows:
>>
>> CPU X                                 CPU Y
>> remove_hrtimer
>> // state & QUEUED == 0
>> timer->state = CALLBACK
>> unlock timer base
>> timer->f(n) //very long
>>                                   hrtimer_start
>>                                     lock timer base
>>                                     remove_hrtimer // no effect
>>                                     hrtimer_enqueue
>>                                     timer->state = CALLBACK |
>>                                                    QUEUED
>>                                     unlock timer base
>>                                   hrtimer_start
>>                                     lock timer base
>>                                     remove_hrtimer
>>                                         mode = INACTIVE
>>                                         // CALLBACK bit lost!
>>                                     switch_hrtimer_base
>>                                             CALLBACK bit not set:
>>                                                     timer->base
>>                                                     changes to a
>>                                                     different CPU.
>> lock this CPU's timer base
>
> Good catch.
>
>> Bug reproducible and fix testable using a kernel module that hrtimer_start()s
>> a timer with a very long running callback from multiple CPUs:
>>
>> MODULE_LICENSE("GPL");
>>
>> static long timer_func_time = 1000;
>> module_param(timer_func_time, long, 0600);
>> static long timer_starts = 2500;
>> module_param(timer_starts, long, 0600);
>> static long timer_expire = 1000;
>> module_param(timer_expire, long, 0600);
>>
>> DEFINE_PER_CPU(struct task_struct *, hrtimer_thread);
>>
>> /* There are other issues, like deadlocks between multiple hrtimer_start observed
>>  * calls, at least in 2.6.34 that this lock works around.  Will look into
>>  * those later.
>
> Well, we don't have to work around callsites not serializing themself
> in the core code, right ?

I assumed that the semantics were that hrtimer_start()s are serialized
with respect to each other and with respect to cancels.  You seem to
disagree.

In any case, I have to rerun that test without this lock, with the
patch present.  It's possible that the deadlock was a symptom of the
same bug, one that we just didn't observe in production.

>
>> ---
>>  kernel/hrtimer.c |    7 +++++--
>>  1 files changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
>> index 1decafb..57846d5 100644
>> --- a/kernel/hrtimer.c
>> +++ b/kernel/hrtimer.c
>> @@ -944,8 +944,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
>>               debug_deactivate(timer);
>>               timer_stats_hrtimer_clear_start_info(timer);
>>               reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
>> -             __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
>> -                              reprogram);
>> +             __remove_hrtimer(timer, base,
>> +                              (timer->state & HRTIMER_STATE_CALLBACK), reprogram);
>>               return 1;
>>       }
>>       return 0;
>> @@ -1231,6 +1231,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
>>               BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
>>               enqueue_hrtimer(timer, base);
>>       }
>> +
>> +     WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
>> +
>>       timer->state &= ~HRTIMER_STATE_CALLBACK;
>>  }
>
> Can I please get your Signed-off-by so this can be queued ?
>
> Thanks,
>
>        tglx
>
>


* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-12 14:28             ` [PATCH] Fix a complex race in hrtimer code Salman Qazi
@ 2010-10-12 16:54               ` Thomas Gleixner
  2010-10-12 17:38                 ` Salman Qazi
  0 siblings, 1 reply; 15+ messages in thread
From: Thomas Gleixner @ 2010-10-12 16:54 UTC (permalink / raw)
  To: Salman Qazi; +Cc: Andrew Morton, LKML, Peter Zijlstra


On Tue, 12 Oct 2010, Salman Qazi wrote:
> On Tue, Oct 12, 2010 at 1:49 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Mon, 11 Oct 2010, Salman Qazi wrote:
> >> /* There are other issues, like deadlocks between multiple hrtimer_start observed
> >>  * calls, at least in 2.6.34 that this lock works around.  Will look into
> >>  * those later.
> >
> > Well, we don't have to work around callsites not serializing themself
> > in the core code, right ?
> 
> I assumed that the semantics were that hrtimer_starts are serialized
> with respect to each other and with respect to cancels.   You seem to
> disagree.

Yes, I disagree. The code makes sure that cancel/start does not
conflict with a running callback, but it's not responsible for random
code fiddling with the same timer, really.

The outcome of random start/cancel operations on the same timer from
two CPUs is just unpredictable, so what's the point of caring about
that in the core code?
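
A callsite that really does need to start or cancel the same timer from
several CPUs is expected to provide its own serialization, along the
lines of the blah_lock in the test module above (illustrative sketch,
not code from this thread; the names are hypothetical):

	static DEFINE_SPINLOCK(my_timer_lock);	/* hypothetical */
	static struct hrtimer my_timer;		/* hypothetical */

	static void my_rearm(ktime_t expires)
	{
		/* The caller, not the hrtimer core, serializes
		 * concurrent start/cancel of the same timer; the core
		 * only keeps them from racing with the running
		 * callback. */
		spin_lock(&my_timer_lock);
		hrtimer_start(&my_timer, expires, HRTIMER_MODE_REL);
		spin_unlock(&my_timer_lock);
	}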

> In any case, I have to rerun that test without this lock with the
> patch present.  It's possible that it was a symptom of the same bug
> that we just didn't observe in production.

Which bug did you observe in production and what's the code which is
triggering this?

Thanks,

	tglx


* Re: [PATCH] Fix a complex race in hrtimer code.
  2010-10-12 16:54               ` Thomas Gleixner
@ 2010-10-12 17:38                 ` Salman Qazi
  0 siblings, 0 replies; 15+ messages in thread
From: Salman Qazi @ 2010-10-12 17:38 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, LKML, Peter Zijlstra

On Tue, Oct 12, 2010 at 9:54 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 12 Oct 2010, Salman Qazi wrote:
>> On Tue, Oct 12, 2010 at 1:49 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>> > On Mon, 11 Oct 2010, Salman Qazi wrote:
>> >> /* There are other issues, like deadlocks between multiple hrtimer_start observed
>> >>  * calls, at least in 2.6.34 that this lock works around.  Will look into
>> >>  * those later.
>> >
>> > Well, we don't have to work around callsites not serializing themself
>> > in the core code, right ?
>>
>> I assumed that the semantics were that hrtimer_starts are serialized
>> with respect to each other and with respect to cancels.   You seem to
>> disagree.
>
> Yes, I disagree. The code makes sure that cancel/start does not
> conflict with a running callback, but it's not responsible for random
> code fiddling with the same timer, really.

Sounds reasonable.

>
> The outcome of random start/cancel operations on two cpus of the same
> timer is just unpredictible, so where is the point of caring about
> that in the core code ?
>
>> In any case, I have to rerun that test without this lock with the
>> patch present.  It's possible that it was a symptom of the same bug
>> that we just didn't observe in production.
>
> Which bug did you observe in production and what's the code which is
> triggering this?

The initial observation in production was a lockup (noticed through
the NMI watchdog) in rb_insert_color when running certain networking
workloads that use the net cgroup subsystem.  Further down the stack
trace, we noticed that the hrtimer being started when this happened was
the qdisc_watchdog timer (see net/sched/sch_api.c).  I then remembered
that I had already fixed something similar in our tree for 2.6.26 and
had basically forgotten about that patch.  So I went back, dug out this
patch, and it fixed the problem.  Looking back, the symptom was almost
identical in 2.6.26, except that it would crash in rb_erase instead of
locking up in rb_insert_color.

I wrote the above test to mimic the behaviour of the qdisc_watchdog
timer, taken to extremes: a long-running callback function that returns
HRTIMER_NORESTART.  It reproduces the symptom reliably.

>
> Thanks,
>
>        tglx


* [tip:timers/urgent] hrtimer: Preserve timer state in remove_hrtimer()
  2010-10-12 14:25             ` Salman Qazi
@ 2010-10-14 11:34               ` tip-bot for Salman Qazi
  2010-10-14 13:23                 ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: tip-bot for Salman Qazi @ 2010-10-14 11:34 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, peterz, sqazi, tglx

Commit-ID:  f13d4f979c518119bba5439dd2364d76d31dcd3f
Gitweb:     http://git.kernel.org/tip/f13d4f979c518119bba5439dd2364d76d31dcd3f
Author:     Salman Qazi <sqazi@google.com>
AuthorDate: Tue, 12 Oct 2010 07:25:19 -0700
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Thu, 14 Oct 2010 13:29:59 +0200

hrtimer: Preserve timer state in remove_hrtimer()

The race is described as follows:

CPU X                                 CPU Y
remove_hrtimer
// state & QUEUED == 0
timer->state = CALLBACK
unlock timer base
timer->f(n) //very long
                                  hrtimer_start
                                    lock timer base
                                    remove_hrtimer // no effect
                                    hrtimer_enqueue
                                    timer->state = CALLBACK |
                                                   QUEUED
                                    unlock timer base
                                  hrtimer_start
                                    lock timer base
                                    remove_hrtimer
                                        mode = INACTIVE
                                        // CALLBACK bit lost!
                                    switch_hrtimer_base
                                            CALLBACK bit not set:
                                                    timer->base
                                                    changes to a
                                                    different CPU.
lock this CPU's timer base

The bug was introduced with commit ca109491f (hrtimer: removing all ur
callback modes) in 2.6.29

[ tglx: Feed new state via local variable and add a comment. ]

Signed-off-by: Salman Qazi <sqazi@google.com>
Cc: akpm@linux-foundation.org
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20101012142351.8485.21823.stgit@dungbeetle.mtv.corp.google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org
---
 kernel/hrtimer.c |   13 +++++++++++--
 1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1decafb..72206cf 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -931,6 +931,7 @@ static inline int
 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 {
 	if (hrtimer_is_queued(timer)) {
+		unsigned long state;
 		int reprogram;
 
 		/*
@@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
-		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
-				 reprogram);
+		/*
+		 * We must preserve the CALLBACK state flag here,
+		 * otherwise we could move the timer base in
+		 * switch_hrtimer_base.
+		 */
+		state = timer->state & HRTIMER_STATE_CALLBACK;
+		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
 	return 0;
@@ -1231,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 		enqueue_hrtimer(timer, base);
 	}
+
+	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 


* Re: [tip:timers/urgent] hrtimer: Preserve timer state in remove_hrtimer()
  2010-10-14 11:34               ` [tip:timers/urgent] hrtimer: Preserve timer state in remove_hrtimer() tip-bot for Salman Qazi
@ 2010-10-14 13:23                 ` Peter Zijlstra
  2010-10-14 13:34                   ` Thomas Gleixner
  0 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2010-10-14 13:23 UTC (permalink / raw)
  To: mingo, hpa, linux-kernel, sqazi, tglx; +Cc: linux-tip-commits

On Thu, 2010-10-14 at 11:34 +0000, tip-bot for Salman Qazi wrote:
> The bug was introduced with commit ca109491f (hrtimer: removing all ur
> callback modes) in 2.6.29 

Not actually sure about that; AFAICT it's due to unlocking the base
lock over the callback, which comes from 2d44ae4d7 (hrtimer: clean up
cpu->base locking tricks), and existed before that in the special case
of the scheduler tick (for which this problem is very unlikely to have
ever triggered, because we simply don't call start on it while it's
running).

Anyway, the patch does look good, so

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>



* Re: [tip:timers/urgent] hrtimer: Preserve timer state in remove_hrtimer()
  2010-10-14 13:23                 ` Peter Zijlstra
@ 2010-10-14 13:34                   ` Thomas Gleixner
  0 siblings, 0 replies; 15+ messages in thread
From: Thomas Gleixner @ 2010-10-14 13:34 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: mingo, hpa, linux-kernel, sqazi, linux-tip-commits

On Thu, 14 Oct 2010, Peter Zijlstra wrote:

> On Thu, 2010-10-14 at 11:34 +0000, tip-bot for Salman Qazi wrote:
> > The bug was introduced with commit ca109491f (hrtimer: removing all ur
> > callback modes) in 2.6.29 
> 
> Not actually sure about that,. afaict its due to unlocking the base lock
> over the callback, which comes from 2d44ae4d7 (hrtimer: clean up
> cpu->base locking tricks), and existed before that in the special case
> of the scheduler tick (for which this problem is very unlikely to have
> ever triggered because we simply don't call start on it when its
> running).

Yeah, I got confused on that :)

 
> Anyway, the patch does look good, so


     tglx


Thread overview: 15+ messages
2010-10-08  2:33 [PATCH] Fix a complex race in hrtimer code Salman Qazi
2010-10-08 18:01 ` Salman Qazi
2010-10-11 22:43 ` Andrew Morton
2010-10-11 23:18   ` Salman Qazi
2010-10-11 23:23     ` Andrew Morton
2010-10-11 23:43       ` Salman Qazi
2010-10-12  0:02         ` Salman Qazi
2010-10-12  8:49           ` Thomas Gleixner
2010-10-12 14:25             ` Salman Qazi
2010-10-14 11:34               ` [tip:timers/urgent] hrtimer: Preserve timer state in remove_hrtimer() tip-bot for Salman Qazi
2010-10-14 13:23                 ` Peter Zijlstra
2010-10-14 13:34                   ` Thomas Gleixner
2010-10-12 14:28             ` [PATCH] Fix a complex race in hrtimer code Salman Qazi
2010-10-12 16:54               ` Thomas Gleixner
2010-10-12 17:38                 ` Salman Qazi
