* [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock
@ 2009-01-25 20:50 Frederic Weisbecker
  2009-01-26 13:32 ` Ingo Molnar
  2009-01-26 13:48 ` Peter Zijlstra
  0 siblings, 2 replies; 31+ messages in thread
From: Frederic Weisbecker @ 2009-01-25 20:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, Andrew Morton, Mandeep Singh Baines, Frederic Weisbecker

This patch adds a counter for writers that enter the rwlock slow path.
For example, it can be useful for slow background tasks that perform work
on the tasklist, such as the hung_task detector (kernel/hung_task.c).

It adds an inc/dec pair on the slow path and 4 bytes to each rwlock, so the
overhead is not zero.

Only x86 is supported for now; writers_spinning_lock() will return 0 on other
archs (which is perhaps not a good idea).

Comments?

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/Kconfig                          |    4 ++++
 arch/x86/Kconfig                      |    1 +
 arch/x86/include/asm/spinlock.h       |    5 +++++
 arch/x86/include/asm/spinlock_types.h |    1 +
 arch/x86/lib/rwlock_64.S              |   10 +++++++---
 include/linux/spinlock.h              |    7 +++++++
 6 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 550dab2..86c22e0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -106,3 +106,7 @@ config HAVE_CLK
 	  The <linux/clk.h> calls support software clock gating and
 	  thus are a key power management tool on many systems.
 
+config HAVE_NB_WRITERS_SPINNING_LOCK
+	bool
+
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 967e114..0aeae17 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -40,6 +40,7 @@ config X86
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select USER_STACKTRACE_SUPPORT
+	select HAVE_NB_WRITERS_SPINNING_LOCK
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 139b424..d9ee21b 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -283,6 +283,11 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock)
 	return 0;
 }
 
+static inline int __raw_writers_spinning_lock(raw_rwlock_t *lock)
+{
+	return lock->spinning_writers;
+}
+
 static inline void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 845f81c..163e6de 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -13,6 +13,7 @@ typedef struct raw_spinlock {
 
 typedef struct {
 	unsigned int lock;
+	unsigned int spinning_writers;
 } raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
index 05ea55f..9589b74 100644
--- a/arch/x86/lib/rwlock_64.S
+++ b/arch/x86/lib/rwlock_64.S
@@ -9,14 +9,18 @@
 ENTRY(__write_lock_failed)
 	CFI_STARTPROC
 	LOCK_PREFIX
+	incl 4(%rdi) /* add ourselves as a spinning writer */
+1:	LOCK_PREFIX
 	addl $RW_LOCK_BIAS,(%rdi)
-1:	rep
+2:	rep
 	nop
 	cmpl $RW_LOCK_BIAS,(%rdi)
-	jne 1b
+	jne 2b
 	LOCK_PREFIX
 	subl $RW_LOCK_BIAS,(%rdi)
-	jnz  __write_lock_failed
+	jnz  1b
+	LOCK_PREFIX
+	decl 4(%rdi)
 	ret
 	CFI_ENDPROC
 END(__write_lock_failed)
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index e0c0fcc..8ce22af 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -121,6 +121,13 @@ do {								\
 
 #define spin_is_locked(lock)	__raw_spin_is_locked(&(lock)->raw_lock)
 
+#ifdef CONFIG_HAVE_NB_WRITERS_SPINNING_LOCK
+#define writers_spinning_lock(rwlock)				\
+	__raw_writers_spinning_lock(&(rwlock)->raw_lock)
+#else
+#define writers_spinning_lock(rwlock)	0
+#endif
+
 #ifdef CONFIG_GENERIC_LOCKBREAK
 #define spin_is_contended(lock) ((lock)->break_lock)
 #else
-- 
1.6.1
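
[A rough C rendering of the patched __write_lock_failed above, for readers
who do not read AT&T assembly. Illustration only: atomic_inc/atomic_add/
atomic_sub_return stand in for the LOCK-prefixed instructions, and the
function is entered with RW_LOCK_BIAS already subtracted by the failed
fast path:

static void write_lock_failed_in_c(raw_rwlock_t *rw)
{
	atomic_inc(&rw->spinning_writers);	/* announce ourselves */
	do {
		/* undo the failed fast-path attempt */
		atomic_add(RW_LOCK_BIAS, &rw->lock);
		/* spin until no readers and no writer hold the lock */
		while (rw->lock != RW_LOCK_BIAS)
			cpu_relax();
		/* retry the claim; a non-zero result means we raced */
	} while (atomic_sub_return(RW_LOCK_BIAS, &rw->lock) != 0);
	atomic_dec(&rw->spinning_writers);	/* the lock is now ours */
}
]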




* Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock
  2009-01-25 20:50 [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock Frederic Weisbecker
@ 2009-01-26 13:32 ` Ingo Molnar
  2009-01-26 13:48 ` Peter Zijlstra
  1 sibling, 0 replies; 31+ messages in thread
From: Ingo Molnar @ 2009-01-26 13:32 UTC (permalink / raw)
  To: Frederic Weisbecker, Peter Zijlstra
  Cc: linux-kernel, Andrew Morton, Mandeep Singh Baines


* Frederic Weisbecker <fweisbec@gmail.com> wrote:

> This patch adds a counter for writers that enter the rwlock slow path.
> For example, it can be useful for slow background tasks that perform
> work on the tasklist, such as the hung_task detector
> (kernel/hung_task.c).
> 
> It adds an inc/dec pair on the slow path and 4 bytes to each rwlock, so
> the overhead is not zero.
> 
> Only x86 is supported for now; writers_spinning_lock() will return 0 on
> other archs (which is perhaps not a good idea).
> 
> Comments?

hm, it increases the rwlock data type:

> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index 845f81c..163e6de 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -13,6 +13,7 @@ typedef struct raw_spinlock {
>  
>  typedef struct {
>  	unsigned int lock;
> +	unsigned int spinning_writers;
>  } raw_rwlock_t;

that's generally not done lightly. Performance figures for a relevant 
workload are obligatory in this case - proving that it's worth the size 
bloat.

	Ingo


* Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock
  2009-01-25 20:50 [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock Frederic Weisbecker
  2009-01-26 13:32 ` Ingo Molnar
@ 2009-01-26 13:48 ` Peter Zijlstra
  2009-01-26 15:25   ` Frédéric Weisbecker
  1 sibling, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-01-26 13:48 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Ingo Molnar, linux-kernel, Andrew Morton, Mandeep Singh Baines

On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
> This patch adds a counter for writers that enter the rwlock slow path.
> For example, it can be useful for slow background tasks that perform work
> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
> 
> It adds an inc/dec pair on the slow path and 4 bytes to each rwlock, so the
> overhead is not zero.
> 
> Only x86 is supported for now; writers_spinning_lock() will return 0 on other
> archs (which is perhaps not a good idea).
> 
> Comments?

_why_ ?





* Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock
  2009-01-26 13:48 ` Peter Zijlstra
@ 2009-01-26 15:25   ` Frédéric Weisbecker
  2009-01-26 15:37     ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Frédéric Weisbecker @ 2009-01-26 15:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Andrew Morton, Mandeep Singh Baines

2009/1/26 Peter Zijlstra <peterz@infradead.org>:
> On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
>> This patch adds a counter for writers that enter the rwlock slow path.
>> For example, it can be useful for slow background tasks that perform work
>> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
>>
>> It adds an inc/dec pair on the slow path and 4 bytes to each rwlock, so the
>> overhead is not zero.
>>
>> Only x86 is supported for now; writers_spinning_lock() will return 0 on other
>> archs (which is perhaps not a good idea).
>>
>> Comments?
>
> _why_ ?

The hung task detector runs a periodic loop through the task list, and
currently it does not check more than an arbitrary threshold of tasks,
so as not to hold the tasklist lock for too long.

So we thought about a way to detect whether some writers are waiting
for the lock, and if so, release the lock, schedule and retry.

I'm not sure this is something easy to measure, since this is more
about small latency gains (if any).
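
[A minimal sketch of the pattern described above, using the
writers_spinning_lock() helper from this patch; the function shown is
hypothetical, not part of the series:

static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
	unsigned long now = get_timestamp();
	struct task_struct *g, *t;

retry:
	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		/* a writer is spinning: back off, let it in, start over */
		if (writers_spinning_lock(&tasklist_lock)) {
			read_unlock(&tasklist_lock);
			schedule();
			goto retry;
		}
		if (t->state == TASK_UNINTERRUPTIBLE)
			check_hung_task(t, now, timeout);
	} while_each_thread(g, t);
	read_unlock(&tasklist_lock);
}

Restarting from the head of the list keeps the sketch simple, but on a busy
system the scan could be starved; the rest of the thread converges on a
different approach.]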


* Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock
  2009-01-26 15:25   ` Frédéric Weisbecker
@ 2009-01-26 15:37     ` Peter Zijlstra
  2009-01-26 16:04       ` Frédéric Weisbecker
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-01-26 15:37 UTC (permalink / raw)
  To: Frédéric Weisbecker
  Cc: Ingo Molnar, linux-kernel, Andrew Morton, Mandeep Singh Baines

On Mon, 2009-01-26 at 16:25 +0100, Frédéric Weisbecker wrote:
> 2009/1/26 Peter Zijlstra <peterz@infradead.org>:
> > On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
> >> This patch adds a counter for writers that enter the rwlock slow path.
> >> For example, it can be useful for slow background tasks that perform work
> >> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
> >>
> >> It adds an inc/dec pair on the slow path and 4 bytes to each rwlock, so the
> >> overhead is not zero.
> >>
> >> Only x86 is supported for now; writers_spinning_lock() will return 0 on other
> >> archs (which is perhaps not a good idea).
> >>
> >> Comments?
> >
> > _why_ ?
> 
> The hung task detector runs a periodic loop through the task list, and
> currently it does not check more than an arbitrary threshold of tasks,
> so as not to hold the tasklist lock for too long.
> 
> So we thought about a way to detect whether some writers are waiting
> for the lock, and if so, release the lock, schedule and retry.

Ah, if it can do that, then it can also use RCU, no? Only users who
really have to hold off new tasks need to take the tasklist lock for
reading. The rest can use RCU.
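
[A sketch of the suggestion; this is essentially what a later patch in this
thread does. Under RCU, writers are never blocked; the scan just sees a
possibly slightly stale view of the list, which is fine for a best-effort
checker:

	rcu_read_lock();
	do_each_thread(g, t) {
		/* must not sleep while inside the RCU read section */
		if (t->state == TASK_UNINTERRUPTIBLE)
			check_hung_task(t, now, timeout);
	} while_each_thread(g, t);
	rcu_read_unlock();
]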



* Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock
  2009-01-26 15:37     ` Peter Zijlstra
@ 2009-01-26 16:04       ` Frédéric Weisbecker
  2009-01-26 17:36         ` Mandeep Baines
  0 siblings, 1 reply; 31+ messages in thread
From: Frédéric Weisbecker @ 2009-01-26 16:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Andrew Morton, Mandeep Singh Baines

2009/1/26 Peter Zijlstra <peterz@infradead.org>:
> On Mon, 2009-01-26 at 16:25 +0100, Frédéric Weisbecker wrote:
>> 2009/1/26 Peter Zijlstra <peterz@infradead.org>:
>> > On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
>> >> This patch adds a counter for writers that enter the rwlock slow path.
>> >> For example, it can be useful for slow background tasks that perform work
>> >> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
>> >>
>> >> It adds an inc/dec pair on the slow path and 4 bytes to each rwlock, so the
>> >> overhead is not zero.
>> >>
>> >> Only x86 is supported for now; writers_spinning_lock() will return 0 on other
>> >> archs (which is perhaps not a good idea).
>> >>
>> >> Comments?
>> >
>> > _why_ ?
>>
>> The hung task detector runs a periodic loop through the task list, and
>> currently it does not check more than an arbitrary threshold of tasks,
>> so as not to hold the tasklist lock for too long.
>>
>> So we thought about a way to detect whether some writers are waiting
>> for the lock, and if so, release the lock, schedule and retry.
>
> Ah, if it can do that, then it can also use RCU, no? Only users who
> really have to hold off new tasks need to take the tasklist lock for
> reading. The rest can use RCU.


Really?
That sounds like good news. Mandeep, what do you think?


* Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock
  2009-01-26 16:04       ` Frédéric Weisbecker
@ 2009-01-26 17:36         ` Mandeep Baines
  2009-01-26 17:41           ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Mandeep Baines @ 2009-01-26 17:36 UTC (permalink / raw)
  To: Frédéric Weisbecker
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Andrew Morton

Hi Frédéric,

Unfortunately, this can't be done for hung_task. It writes to the
task_struct here:

static void check_hung_task(struct task_struct *t, unsigned long now,
                            unsigned long timeout)
{
        unsigned long switch_count = t->nvcsw + t->nivcsw;

        if (t->flags & PF_FROZEN)
                return;

        if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
                t->last_switch_count = switch_count;
                t->last_switch_timestamp = now;
                return;
        }

It is able to get away with using only a read_lock because no one else
reads or writes to these fields.

Regards,
Mandeep

On Mon, Jan 26, 2009 at 8:04 AM, Frédéric Weisbecker <fweisbec@gmail.com> wrote:
> 2009/1/26 Peter Zijlstra <peterz@infradead.org>:
>> On Mon, 2009-01-26 at 16:25 +0100, Frédéric Weisbecker wrote:
>>> 2009/1/26 Peter Zijlstra <peterz@infradead.org>:
>>> > On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
>>> >> This patch adds a counter for writers that enter the rwlock slow path.
>>> >> For example, it can be useful for slow background tasks that perform work
>>> >> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
>>> >>
>>> >> It adds an inc/dec pair on the slow path and 4 bytes to each rwlock, so the
>>> >> overhead is not zero.
>>> >>
>>> >> Only x86 is supported for now; writers_spinning_lock() will return 0 on other
>>> >> archs (which is perhaps not a good idea).
>>> >>
>>> >> Comments?
>>> >
>>> > _why_ ?
>>>
>>> The hung task detector runs a periodic loop through the task list, and
>>> currently it does not check more than an arbitrary threshold of tasks,
>>> so as not to hold the tasklist lock for too long.
>>>
>>> So we thought about a way to detect whether some writers are waiting
>>> for the lock, and if so, release the lock, schedule and retry.
>>
>> Ah, if it can do that, then it can also use RCU, no? Only users who
>> really have to hold off new tasks need to take the tasklist lock for
>> reading. The rest can use RCU.
>
>
> Really?
> That sounds like good news. Mandeep, what do you think?
>


* Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock
  2009-01-26 17:36         ` Mandeep Baines
@ 2009-01-26 17:41           ` Peter Zijlstra
  2009-01-27  0:30             ` [PATCH v4] softlockup: remove hung_task_check_count Mandeep Singh Baines
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-01-26 17:41 UTC (permalink / raw)
  To: Mandeep Baines
  Cc: Frédéric Weisbecker, Ingo Molnar, linux-kernel, Andrew Morton

On Mon, 2009-01-26 at 09:36 -0800, Mandeep Baines wrote:

> Unfortunately, this can't be done for hung_task. It writes to the
> task_struct here:

Don't top post!

> static void check_hung_task(struct task_struct *t, unsigned long now,
>                             unsigned long timeout)
> {
>         unsigned long switch_count = t->nvcsw + t->nivcsw;
> 
>         if (t->flags & PF_FROZEN)
>                 return;
> 
>         if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
>                 t->last_switch_count = switch_count;
>                 t->last_switch_timestamp = now;
>                 return;
>         }
> 
> It is able to get away with using only a read_lock because no one else
> reads or writes to these fields.

How would RCU be different here?



* [PATCH v4] softlockup: remove hung_task_check_count
  2009-01-26 17:41           ` Peter Zijlstra
@ 2009-01-27  0:30             ` Mandeep Singh Baines
  2009-01-27  9:27               ` Frederic Weisbecker
  2009-01-27 13:26               ` Ingo Molnar
  0 siblings, 2 replies; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-01-27  0:30 UTC (permalink / raw)
  To: linux-kernel, Peter Zijlstra, Frédéric Weisbecker, Ingo Molnar
  Cc: rientjes, mbligh, thockin, Andrew Morton

Peter Zijlstra (peterz@infradead.org) wrote:
> On Mon, 2009-01-26 at 09:36 -0800, Mandeep Baines wrote:
> 
> > Unfortunately, this can't be done for hung_task. It writes to the
> > task_struct here:
> 
> Don't top post!
> 
> > static void check_hung_task(struct task_struct *t, unsigned long now,
> >                             unsigned long timeout)
> > {
> >         unsigned long switch_count = t->nvcsw + t->nivcsw;
> > 
> >         if (t->flags & PF_FROZEN)
> >                 return;
> > 
> >         if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
> >                 t->last_switch_count = switch_count;
> >                 t->last_switch_timestamp = now;
> >                 return;
> >         }
> > 
> > It is able to get away with using only a read_lock because no one else
> > reads or writes to these fields.
> 
> How would RCU be different here?
> 

My bad, RCU wouldn't be any different. I misunderstood how RCU works. Just
spent the morning reading the LWN 3-part series on RCU and I think I'm able to
grok it now;)

Below is a patch to hung_task which removes the hung_task_check_count and
converts the read_lock to RCU.

Thanks Frédéric and Peter!

---
To avoid holding the tasklist lock too long, hung_task_check_count was used
as an upper bound on the number of tasks that are checked by hung_task.
This patch removes the hung_task_check_count sysctl.

Instead of checking a limited number of tasks, all tasks are checked. To
avoid holding the CPU for too long, need_resched() is checked often. To
avoid blocking out writers, the read_lock has been converted to an
rcu_read_lock().

It is safe to convert to an rcu_read_lock() because the tasks and thread_group
lists are both protected by list_*_rcu() operations. The worst that can
happen is that hung_task will update the last_switch_timestamp field of a DEAD
task.

The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested
the use of RCU.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
---
 include/linux/sched.h |    1 -
 kernel/hung_task.c    |   12 +++---------
 kernel/sysctl.c       |    9 ---------
 3 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f2f94d5..278121c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -315,7 +315,6 @@ static inline void touch_all_softlockup_watchdogs(void)
 
 #ifdef CONFIG_DETECT_HUNG_TASK
 extern unsigned int  sysctl_hung_task_panic;
-extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern unsigned long sysctl_hung_task_warnings;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba8ccd4..7d67350 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,11 +17,6 @@
 #include <linux/sysctl.h>
 
 /*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
  * Zero means infinite timeout - no checking done:
  */
 unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
@@ -116,7 +111,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
  */
 static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
-	int max_count = sysctl_hung_task_check_count;
 	unsigned long now = get_timestamp();
 	struct task_struct *g, *t;
 
@@ -127,16 +121,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	if (test_taint(TAINT_DIE) || did_panic)
 		return;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	do_each_thread(g, t) {
-		if (!--max_count)
+		if (need_resched())
 			goto unlock;
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);
 	} while_each_thread(g, t);
  unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 }
 
 static void update_poll_jiffies(void)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2481ed3..16526a2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -820,15 +820,6 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "hung_task_check_count",
-		.data		= &sysctl_hung_task_check_count,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
-		.strategy	= &sysctl_intvec,
-	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_timeout_secs",
 		.data		= &sysctl_hung_task_timeout_secs,
 		.maxlen		= sizeof(unsigned long),
-- 
1.5.4.5



* Re: [PATCH v4] softlockup: remove hung_task_check_count
  2009-01-27  0:30             ` [PATCH v4] softlockup: remove hung_task_check_count Mandeep Singh Baines
@ 2009-01-27  9:27               ` Frederic Weisbecker
  2009-01-27 13:26               ` Ingo Molnar
  1 sibling, 0 replies; 31+ messages in thread
From: Frederic Weisbecker @ 2009-01-27  9:27 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, rientjes, mbligh,
	thockin, Andrew Morton

On Mon, Jan 26, 2009 at 04:30:55PM -0800, Mandeep Singh Baines wrote:
> Peter Zijlstra (peterz@infradead.org) wrote:
> > On Mon, 2009-01-26 at 09:36 -0800, Mandeep Baines wrote:
> > 
> > > Unfortunately, this can't be done for hung_task. It writes to the
> > > task_struct here:
> > 
> > Don't top post!
> > 
> > > static void check_hung_task(struct task_struct *t, unsigned long now,
> > >                             unsigned long timeout)
> > > {
> > >         unsigned long switch_count = t->nvcsw + t->nivcsw;
> > > 
> > >         if (t->flags & PF_FROZEN)
> > >                 return;
> > > 
> > >         if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
> > >                 t->last_switch_count = switch_count;
> > >                 t->last_switch_timestamp = now;
> > >                 return;
> > >         }
> > > 
> > > It is able to get away with using only a read_lock because no one else
> > > reads or writes to these fields.
> > 
> > How would RCU be different here?
> > 
> 
> My bad, RCU wouldn't be any different. I misunderstood how RCU works. Just
> spent the morning reading the LWN 3-part series on RCU and I think I'm able to
> grok it now;)
> 
> Below is a patch to hung_task which removes the hung_task_check_count and
> converts the read_lock to RCU.
> 
> Thanks Frédéric and Peter!
> 
> ---
> To avoid holding the tasklist lock too long, hung_task_check_count was used
> as an upper bound on the number of tasks that are checked by hung_task.
> This patch removes the hung_task_check_count sysctl.
> 
> Instead of checking a limited number of tasks, all tasks are checked. To
> avoid holding the CPU for too long, need_resched() is checked often. To
> avoid blocking out writers, the read_lock has been converted to an
> rcu_read_lock().
> 
> It is safe to convert to an rcu_read_lock() because the tasks and thread_group
> lists are both protected by list_*_rcu() operations. The worst that can
> happen is that hung_task will update the last_switch_timestamp field of a DEAD
> task.
> 
> The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested
> the use of RCU.
> 
> Signed-off-by: Mandeep Singh Baines <msb@google.com>
> ---
>  include/linux/sched.h |    1 -
>  kernel/hung_task.c    |   12 +++---------
>  kernel/sysctl.c       |    9 ---------
>  3 files changed, 3 insertions(+), 19 deletions(-)
> 
> <snip>
> 


That looks good :-)



* Re: [PATCH v4] softlockup: remove hung_task_check_count
  2009-01-27  0:30             ` [PATCH v4] softlockup: remove hung_task_check_count Mandeep Singh Baines
  2009-01-27  9:27               ` Frederic Weisbecker
@ 2009-01-27 13:26               ` Ingo Molnar
  2009-01-27 18:48                 ` Mandeep Singh Baines
  1 sibling, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2009-01-27 13:26 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: linux-kernel, Peter Zijlstra, Frédéric Weisbecker,
	rientjes, mbligh, thockin, Andrew Morton


* Mandeep Singh Baines <msb@google.com> wrote:

> The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested 
> the use of RCU.

ok, this looks _much_ cleaner.

One question:

> -	read_lock(&tasklist_lock);
> +	rcu_read_lock();
>  	do_each_thread(g, t) {
> -		if (!--max_count)
> +		if (need_resched())
>  			goto unlock;

Isn't it dangerous to skip a check just because we got marked for 
reschedule? Since it runs so rarely it could by accident be preempted and 
we'd not get any checking done for a long time.

	Ingo


* Re: [PATCH v4] softlockup: remove hung_task_check_count
  2009-01-27 13:26               ` Ingo Molnar
@ 2009-01-27 18:48                 ` Mandeep Singh Baines
  2009-01-28  8:25                   ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-01-27 18:48 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, Peter Zijlstra, Frédéric Weisbecker,
	rientjes, mbligh, thockin, Andrew Morton

Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Mandeep Singh Baines <msb@google.com> wrote:
> 
> > The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested 
> > the use of RCU.
> 
> ok, this looks _much_ cleaner.
> 
> One question:
> 
> > -	read_lock(&tasklist_lock);
> > +	rcu_read_lock();
> >  	do_each_thread(g, t) {
> > -		if (!--max_count)
> > +		if (need_resched())
> >  			goto unlock;
> 
> Isn't it dangerous to skip a check just because we got marked for 
> reschedule? Since it runs so rarely it could by accident be preempted and 
> we'd not get any checking done for a long time.
> 

Yeah, the checking could be deferred indefinitely. So you could have a system
where tasks are hung but it takes a really long time to detect this and
finally panic the system. Not so good for high-availability.

What if a check_count sysctl was added? But instead of being a max_check_count,
it would be a min_check_count. This would guarantee that a minimum amount of
checking is done.

Alternatively, the code for trying to continue the iteration after a
reschedule could be re-inserted. That code is a little tricky and
potentially fragile since continuation of a tasklist iteration is not
really supported by the sched.h APIs. But it does have the nice property
of getting through the entire list almost all the time while still playing
nice with the scheduler.

Regards,
Mandeep
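
[A sketch of the continuation idea mentioned above; it is roughly the shape
the later patches in this thread settle on:

	/* pin the iteration cursors so they cannot be freed... */
	get_task_struct(g);
	get_task_struct(t);
	read_unlock(&tasklist_lock);
	cond_resched();		/* let the writer (and anyone else) run */
	read_lock(&tasklist_lock);
	put_task_struct(t);
	put_task_struct(g);
	/* ...then verify g and t are still hashed before resuming the walk */
]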


* Re: [PATCH v4] softlockup: remove hung_task_check_count
  2009-01-27 18:48                 ` Mandeep Singh Baines
@ 2009-01-28  8:25                   ` Peter Zijlstra
  2009-01-29  1:42                     ` Mandeep Singh Baines
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-01-28  8:25 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: Ingo Molnar, linux-kernel, Frédéric Weisbecker,
	rientjes, mbligh, thockin, Andrew Morton

On Tue, 2009-01-27 at 10:48 -0800, Mandeep Singh Baines wrote:
> Ingo Molnar (mingo@elte.hu) wrote:
> > 
> > * Mandeep Singh Baines <msb@google.com> wrote:
> > 
> > > The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested 
> > > the use of RCU.
> > 
> > ok, this looks _much_ cleaner.
> > 
> > One question:
> > 
> > > -	read_lock(&tasklist_lock);
> > > +	rcu_read_lock();
> > >  	do_each_thread(g, t) {
> > > -		if (!--max_count)
> > > +		if (need_resched())
> > >  			goto unlock;
> > 
> > Isn't it dangerous to skip a check just because we got marked for 
> > reschedule? Since it runs so rarely it could by accident be preempted and 
> > we'd not get any checking done for a long time.
> > 
> 
> Yeah, the checking could be deferred indefinitely. So you could have a system
> where tasks are hung but it takes a really long time to detect this and
> finally panic the system. Not so good for high-availability.

Why break out at all? Are you that worried about khungtaskd introducing
latencies? Is using preemptible RCU an option for you?



* Re: [PATCH v4] softlockup: remove hung_task_check_count
  2009-01-28  8:25                   ` Peter Zijlstra
@ 2009-01-29  1:42                     ` Mandeep Singh Baines
  2009-01-30 20:41                       ` Mandeep Singh Baines
                                         ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-01-29  1:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Frédéric Weisbecker,
	rientjes, mbligh, thockin, Andrew Morton

Peter Zijlstra (peterz@infradead.org) wrote:
> On Tue, 2009-01-27 at 10:48 -0800, Mandeep Singh Baines wrote:
> > Ingo Molnar (mingo@elte.hu) wrote:
> > > 
> > > * Mandeep Singh Baines <msb@google.com> wrote:
> > > 
> > > > The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested 
> > > > the use of RCU.
> > > 
> > > ok, this looks _much_ cleaner.
> > > 
> > > One question:
> > > 
> > > > -	read_lock(&tasklist_lock);
> > > > +	rcu_read_lock();
> > > >  	do_each_thread(g, t) {
> > > > -		if (!--max_count)
> > > > +		if (need_resched())
> > > >  			goto unlock;
> > > 
> > > Isn't it dangerous to skip a check just because we got marked for 
> > > reschedule? Since it runs so rarely it could by accident be preempted and 
> > > we'd not get any checking done for a long time.
> > > 
> > 
> > Yeah, the checking could be deferred indefinitely. So you could have a system
> > where tasks are hung but it takes a really long time to detect this and
> > finally panic the system. Not so good for high-availability.
> 
> Why break out at all? Are you that worried about khungtaskd introducing
> latencies?

Yes, I was worried about disabling preemption for an unbounded amount of
time.

> Is using preemptible RCU an option for you?
> 

I had not even considered that. To be honest, I had not even heard of it
till now. So I spent another morning at LWN grokking preemptible RCU;)

I think it can work. I'm a little worried about the OOM risk. It could take
a really long time to iterate over the task list. A lot of pending kfree()s
could build up in that time.

I'm still unclear as to whether khungtaskd would get priority boosted
or not. Are only tasks that explicitly block (by sleeping on a lock)
boostable or would a task which got pre-empted inside the critical section
also be boostable? My worry here is that khungtaskd would run at high
priority for an indefinite amount of time, adding overhead. Is there an
OOMing-soon booster?

But I think this can all be mitigated by adding back the max_count sysctl.
Preemptible RCU would allow increasing the default to a much higher value,
or we may even be able to make the default "check all tasks".

Running out of time today but I'll send out a patch tomorrow.

Thanks Peter!

Regards,
Mandeep


* Re: [PATCH v4] softlockup: remove hung_task_check_count
  2009-01-29  1:42                     ` Mandeep Singh Baines
@ 2009-01-30 20:41                       ` Mandeep Singh Baines
  2009-01-30 20:46                       ` [PATCH 1/2] softlockup: convert read_lock in hung_task to rcu_read_lock Mandeep Singh Baines
  2009-01-30 20:49                       ` [PATCH 2/2] softlockup: check all tasks in hung_task Mandeep Singh Baines
  2 siblings, 0 replies; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-01-30 20:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Frédéric Weisbecker,
	rientjes, mbligh, thockin, Andrew Morton

Mandeep Singh Baines (msb@google.com) wrote:
> Peter Zijlstra (peterz@infradead.org) wrote:
> > 
> > Why break out at all? Are you that worried about khungtaskd introducing
> > latencies?
> 
> Yes, I was worried about disabling preemption for an unbounded amount of
> time.
> 
> > Is using preemptible RCU an option for you?
> > 
> 
> I had not even considered that. To be honest, I had not even heard of it
> till now. So I spent another morning at LWN grokking preemptible RCU;)
> 
> I think it can work. I'm a little worried about the OOM risk. It could take
> a really long time to iterate over the task list. A lot of pending kfree()s
> could build up in that time.
> 

I misunderstood preemptible RCU. I assumed it was a new API but it's not. So
I don't think preemptible RCU is an option, since it would force a dependency
on CONFIG_PREEMPT_RCU.

I'm going to break this patch up in two: one patch converting to RCU, and a
second patch supporting checking of all tasks. To support checking all tasks,
I reverted to a design similar to Frédéric's original proposal.

I'll send the patches out right after this email.

[PATCH 1/2] softlockup: convert read_lock in hung_task to rcu_read_lock
[PATCH 2/2] softlockup: check all tasks in hung_task

Regards,
Mandeep


* [PATCH 1/2] softlockup: convert read_lock in hung_task to rcu_read_lock
  2009-01-29  1:42                     ` Mandeep Singh Baines
  2009-01-30 20:41                       ` Mandeep Singh Baines
@ 2009-01-30 20:46                       ` Mandeep Singh Baines
  2009-01-30 20:49                       ` [PATCH 2/2] softlockup: check all tasks in hung_task Mandeep Singh Baines
  2 siblings, 0 replies; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-01-30 20:46 UTC (permalink / raw)
  To: Ingo Molnar, linux-kernel, Peter Zijlstra, Frédéric Weisbecker
  Cc: rientjes, mbligh, thockin, Andrew Morton

Peter Zijlstra suggested the use of RCU.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
---
 kernel/hung_task.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba8ccd4..a841db3 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -127,7 +127,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	if (test_taint(TAINT_DIE) || did_panic)
 		return;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
@@ -136,7 +136,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 			check_hung_task(t, now, timeout);
 	} while_each_thread(g, t);
  unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 }
 
 static void update_poll_jiffies(void)
-- 
1.5.4.5



* [PATCH 2/2] softlockup: check all tasks in hung_task
  2009-01-29  1:42                     ` Mandeep Singh Baines
  2009-01-30 20:41                       ` Mandeep Singh Baines
  2009-01-30 20:46                       ` [PATCH 1/2] softlockup: convert read_lock in hung_task to rcu_read_lock Mandeep Singh Baines
@ 2009-01-30 20:49                       ` Mandeep Singh Baines
  2009-01-31 19:22                         ` Peter Zijlstra
  2 siblings, 1 reply; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-01-30 20:49 UTC (permalink / raw)
  To: Ingo Molnar, linux-kernel, Peter Zijlstra, Frédéric Weisbecker
  Cc: rientjes, mbligh, thockin, Andrew Morton

Instead of checking only hung_task_check_count tasks, all tasks are checked.
hung_task_check_count is still used to put an upper bound on the critical
section. Every hung_task_check_count checks, the critical section is
refreshed. Keeping the critical section small minimizes the time preemption
is disabled and keeps RCU grace periods small.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Frédéric Weisbecker (fweisbec@gmail.com) wrote:
>
> Instead of having this arbitrary limit of tasks, why not just watch
> need_resched() and then schedule if it needs to.
>
> I know that sounds a bit racy, because you will have to release the
> tasklist_lock and a lot of things can happen in the task list until you
> get rescheduled. But you can do a get_task_struct() on g and t before
> your thread goes to sleep and then put them when it is awakened.
> Perhaps some tasks will disappear or be appended in the list before g
> and t, but that doesn't really matter: if they disappear, they didn't
> lock up, and if they were appended, they are not cold enough to be
> analyzed :-)
>
> This way you can drop the arbitrary limit on the number of tasks given
> by the user....
>
> Frederic.
>

Signed-off-by: Mandeep Singh Baines <msb@google.com>
---
 kernel/hung_task.c |   28 ++++++++++++++++++++++++++--
 1 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index a841db3..1c8c9f9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -109,6 +109,25 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 		panic("hung_task: blocked tasks");
 }
 
+ /*
+  * To avoid extending the RCU grace period for an unbounded amount of time,
+  * periodically exit the critical section and enter a new one.
+  *
+  * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+  * to exit the grace period. For classic RCU, a reschedule is required.
+  */
+static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	if (need_resched())
+		schedule();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
 /*
  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
  * a really long time (120 seconds). If that happens, print out
@@ -129,8 +148,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 
 	rcu_read_lock();
 	do_each_thread(g, t) {
-		if (!--max_count)
-			goto unlock;
+		if (sysctl_hung_task_check_count && !(max_count--)) {
+			max_count = sysctl_hung_task_check_count;
+			check_hung_rcu_refresh(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);
-- 
1.5.4.5



* Re: [PATCH 2/2] softlockup: check all tasks in hung_task
  2009-01-30 20:49                       ` [PATCH 2/2] softlockup: check all tasks in hung_task Mandeep Singh Baines
@ 2009-01-31 19:22                         ` Peter Zijlstra
  2009-02-03  0:05                           ` [PATCH 2/2 v2] " Mandeep Singh Baines
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2009-01-31 19:22 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: Ingo Molnar, linux-kernel, Frédéric Weisbecker,
	rientjes, mbligh, thockin, Andrew Morton

On Fri, 2009-01-30 at 12:49 -0800, Mandeep Singh Baines wrote:
> Instead of checking only hung_task_check_count tasks, all tasks are checked.
> hung_task_check_count is still used to put an upper bound on the critical
> section. Every hung_task_check_count checks, the critical section is
> refreshed. Keeping the critical section small minimizes the time preemption
> is disabled and keeps RCU grace periods small.
> 
> To prevent following a stale pointer, get_task_struct is called on g and t.
> To verify that g and t have not been unhashed while outside the critical
> section, the task states are checked.
> 
> The design was proposed by Frédéric Weisbecker.
> 
> Frédéric Weisbecker (fweisbec@gmail.com) wrote:
> >
> > Instead of having this arbitrary limit of tasks, why not just watch
> > need_resched() and then schedule if it needs to.
> >
> > I know that sounds a bit racy, because you will have to release the
> > tasklist_lock and a lot of things can happen in the task list until you
> > get rescheduled. But you can do a get_task_struct() on g and t before
> > your thread goes to sleep and then put them when it is awakened.
> > Perhaps some tasks will disappear or be appended in the list before g
> > and t, but that doesn't really matter: if they disappear, they didn't
> > lock up, and if they were appended, they are not cold enough to be
> > analyzed :-)
> >
> > This way you can drop the arbitrary limit on the number of tasks given
> > by the user....
> >
> > Frederic.
> >
> 
> Signed-off-by: Mandeep Singh Baines <msb@google.com>
> ---
>  kernel/hung_task.c |   28 ++++++++++++++++++++++++++--
>  1 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index a841db3..1c8c9f9 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -109,6 +109,25 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
>  		panic("hung_task: blocked tasks");
>  }
>  
> + /*
> +  * To avoid extending the RCU grace period for an unbounded amount of time,
> +  * periodically exit the critical section and enter a new one.
> +  *
> +  * For preemptible RCU it is sufficient to call rcu_read_unlock in order
> +  * to exit the grace period. For classic RCU, a reschedule is required.
> +  */
> +static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)
> +{
> +	get_task_struct(g);
> +	get_task_struct(t);
> +	rcu_read_unlock();
> +	if (need_resched())
> +		schedule();

won't a simple cond_resched() do?

> +	rcu_read_lock();
> +	put_task_struct(t);
> +	put_task_struct(g);
> +}
> +
>  /*
>   * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
>   * a really long time (120 seconds). If that happens, print out
> @@ -129,8 +148,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  
>  	rcu_read_lock();
>  	do_each_thread(g, t) {
> -		if (!--max_count)
> -			goto unlock;
> +		if (sysctl_hung_task_check_count && !(max_count--)) {
> +			max_count = sysctl_hung_task_check_count;
> +			check_hung_rcu_refresh(g, t);
> +			/* Exit if t or g was unhashed during refresh. */
> +			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
> +				goto unlock;
> +		}

It's all a bit ugly, but I suppose there's no way around that.

>  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
>  		if (t->state == TASK_UNINTERRUPTIBLE)
>  			check_hung_task(t, now, timeout);



* [PATCH 2/2 v2] softlockup: check all tasks in hung_task
  2009-01-31 19:22                         ` Peter Zijlstra
@ 2009-02-03  0:05                           ` Mandeep Singh Baines
  2009-02-03 12:23                             ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-02-03  0:05 UTC (permalink / raw)
  To: Ingo Molnar, Frédéric Weisbecker, linux-kernel
  Cc: rientjes, mbligh, thockin, Andrew Morton

On Sat, Jan 31, 2009 at 11:22 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, 2009-01-30 at 12:49 -0800, Mandeep Singh Baines wrote:

<snip>

>> +static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)
>> +{
>> +     get_task_struct(g);
>> +     get_task_struct(t);
>> +     rcu_read_unlock();
>> +     if (need_resched())
>> +             schedule();
>
> won't a simple cond_resched() do?
>

Yes, it will. Thanks Peter!

>> +     rcu_read_lock();
>> +     put_task_struct(t);
>> +     put_task_struct(g);
>> +}
>> +
>>  /*
>>   * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
>>   * a really long time (120 seconds). If that happens, print out
>> @@ -129,8 +148,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>>
>>       rcu_read_lock();
>>       do_each_thread(g, t) {
>> -             if (!--max_count)
>> -                     goto unlock;
>> +             if (sysctl_hung_task_check_count && !(max_count--)) {
>> +                     max_count = sysctl_hung_task_check_count;
>> +                     check_hung_rcu_refresh(g, t);
>> +                     /* Exit if t or g was unhashed during refresh. */
>> +                     if (t->state == TASK_DEAD || g->state == TASK_DEAD)
>> +                             goto unlock;
>> +             }
>
> It's all a bit ugly, but I suppose there's no way around that.

Yeah, we need to guarantee that the next pointers of t and g aren't pointing
to a freed task_struct. If t or g has not been unlinked, then it's guaranteed
that the next fields are valid.

If the state is != TASK_DEAD then it's guaranteed that the task hasn't been
unlinked. I guess that's the shakiest assumption in the chain. If the code were
ever changed to set the state to TASK_DEAD after unlinking ...

<snip>

Here's v2 of this patch. The only thing that's changed is the cond_resched().

---
Instead of checking only hung_task_check_count tasks, all tasks are checked.
hung_task_check_count is still used to put an upper bound on the critical
section. Every hung_task_check_count checks, the critical section is
refreshed. Keeping the critical section small minimizes the time preemption
is disabled and keeps RCU grace periods small.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Frédéric Weisbecker (fweisbec@gmail.com) wrote:
>
> Instead of having this arbitrary limit of tasks, why not just watch
> need_resched() and then schedule if it needs to.
>
> I know that sounds a bit racy, because you will have to release the
> tasklist_lock and a lot of things can happen in the task list until you
> get rescheduled. But you can do a get_task_struct() on g and t before
> your thread goes to sleep and then put them when it is awakened.
> Perhaps some tasks will disappear or be appended in the list before g
> and t, but that doesn't really matter: if they disappear, they didn't
> lock up, and if they were appended, they are not cold enough to be
> analyzed :-)
>
> This way you can drop the arbitrary limit on the number of tasks given
> by the user....
>
> Frederic.
>

Signed-off-by: Mandeep Singh Baines <msb@google.com>
---
 kernel/hung_task.c |   27 +++++++++++++++++++++++++--
 1 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index a841db3..464c96e 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -109,6 +109,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 		panic("hung_task: blocked tasks");
 }
 
+ /*
+  * To avoid extending the RCU grace period for an unbounded amount of time,
+  * periodically exit the critical section and enter a new one.
+  *
+  * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+  * to exit the grace period. For classic RCU, a reschedule is required.
+  */
+static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
 /*
  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
  * a really long time (120 seconds). If that happens, print out
@@ -129,8 +147,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 
 	rcu_read_lock();
 	do_each_thread(g, t) {
-		if (!--max_count)
-			goto unlock;
+		if (sysctl_hung_task_check_count && !(max_count--)) {
+			max_count = sysctl_hung_task_check_count;
+			check_hung_rcu_refresh(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);
-- 
1.5.4.5



* Re: [PATCH 2/2 v2] softlockup: check all tasks in hung_task
  2009-02-03  0:05                           ` [PATCH 2/2 v2] " Mandeep Singh Baines
@ 2009-02-03 12:23                             ` Ingo Molnar
  2009-02-03 20:56                               ` [PATCH 2/2 v3] " Mandeep Singh Baines
  0 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2009-02-03 12:23 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: Frédéric Weisbecker, linux-kernel, rientjes, mbligh,
	thockin, Andrew Morton


* Mandeep Singh Baines <msb@google.com> wrote:

> +		if (sysctl_hung_task_check_count && !(max_count--)) {
> +			max_count = sysctl_hung_task_check_count;

just a minor nit, why not:

	!--max_count

?

That way we can lose the parentheses and we'll also not overcount and won't
let max_count go down to -1.

	Ingo


* [PATCH 2/2 v3] softlockup: check all tasks in hung_task
  2009-02-03 12:23                             ` Ingo Molnar
@ 2009-02-03 20:56                               ` Mandeep Singh Baines
  2009-02-04 19:43                                 ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-02-03 20:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frédéric Weisbecker, Peter Zijlstra, linux-kernel,
	rientjes, mbligh, thockin, Andrew Morton

Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Mandeep Singh Baines <msb@google.com> wrote:
> 
> > +		if (sysctl_hung_task_check_count && !(max_count--)) {
> > +			max_count = sysctl_hung_task_check_count;
> 
> just a minor nit, why not:
> 
> 	!--max_count
> 
> ?
> 
> That way we can lose the parentheses and we'll also not overcount and won't
> let max_count go down to -1.
> 
> 	Ingo

Good question. Yeah, there's really nothing about this patch that requires
changing the !--max_count test.

Changed the condition back to a simple !--max_count test.

---
Instead of checking only hung_task_check_count tasks, all tasks are checked.
hung_task_check_count is still used to put an upper bound on the critical
section. Every hung_task_check_count checks, the critical section is
refreshed. Keeping the critical section small minimizes the time preemption
is disabled and keeps RCU grace periods small.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Frédéric Weisbecker (fweisbec@gmail.com) wrote:
>
> Instead of having this arbitrary limit of tasks, why not just watch
> need_resched() and then schedule if it needs to.
>
> I know that sounds a bit racy, because you will have to release the
> tasklist_lock and a lot of things can happen in the task list until you
> get rescheduled. But you can do a get_task_struct() on g and t before
> your thread goes to sleep and then put them when it is awakened.
> Perhaps some tasks will disappear or be appended in the list before g
> and t, but that doesn't really matter: if they disappear, they didn't
> lock up, and if they were appended, they are not cold enough to be
> analyzed :-)
>
> This way you can drop the arbitrary limit on the number of tasks given
> by the user....
>
> Frederic.
>

Signed-off-by: Mandeep Singh Baines <msb@google.com>
---
 kernel/hung_task.c |   27 +++++++++++++++++++++++++--
 1 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index a841db3..f47eea4 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -109,6 +109,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 		panic("hung_task: blocked tasks");
 }
 
+ /*
+  * To avoid extending the RCU grace period for an unbounded amount of time,
+  * periodically exit the critical section and enter a new one.
+  *
+  * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+  * to exit the grace period. For classic RCU, a reschedule is required.
+  */
+static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
 /*
  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
  * a really long time (120 seconds). If that happens, print out
@@ -129,8 +147,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 
 	rcu_read_lock();
 	do_each_thread(g, t) {
-		if (!--max_count)
-			goto unlock;
+		if (!--max_count) {
+			max_count = sysctl_hung_task_check_count;
+			check_hung_rcu_refresh(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);
-- 
1.5.4.5



* Re: [PATCH 2/2 v3] softlockup: check all tasks in hung_task
  2009-02-03 20:56                               ` [PATCH 2/2 v3] " Mandeep Singh Baines
@ 2009-02-04 19:43                                 ` Ingo Molnar
  2009-02-05  4:35                                   ` [PATCH 2/2 v4] " Mandeep Singh Baines
  0 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2009-02-04 19:43 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: Frédéric Weisbecker, Peter Zijlstra, linux-kernel,
	rientjes, mbligh, thockin, Andrew Morton


* Mandeep Singh Baines <msb@google.com> wrote:

> +static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)

please rename this to rcu_lock_break().

>  	do_each_thread(g, t) {
> -		if (!--max_count)
> -			goto unlock;
> +		if (!--max_count) {
> +			max_count = sysctl_hung_task_check_count;
> +			check_hung_rcu_refresh(g, t);
> +			/* Exit if t or g was unhashed during refresh. */
> +			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
> +				goto unlock;

Thinking about it some more, I think a slightly different approach (one that
has the same end effect) would be better:

 - Add a "static const int check_count_batching = 1024;" variable that adds 
   some natural batching - and initialize max_count to that value. There's 
   little point to make that batching configurable.

 - Leave sysctl_hung_task_check_count present but change its default to 
   something really large like MAX_PID.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 2/2 v4] softlockup: check all tasks in hung_task
  2009-02-04 19:43                                 ` Ingo Molnar
@ 2009-02-05  4:35                                   ` Mandeep Singh Baines
  2009-02-05 14:34                                     ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-02-05  4:35 UTC (permalink / raw)
  To: Ingo Molnar, Frédéric Weisbecker, Peter Zijlstra, linux-kernel
  Cc: rientjes, mbligh, thockin, Andrew Morton

Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Mandeep Singh Baines <msb@google.com> wrote:
> 
> > +static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)
> 
> please rename this to rcu_lock_break().
> 

Fixed.

> >  	do_each_thread(g, t) {
> > -		if (!--max_count)
> > -			goto unlock;
> > +		if (!--max_count) {
> > +			max_count = sysctl_hung_task_check_count;
> > +			check_hung_rcu_refresh(g, t);
> > +			/* Exit if t or g was unhashed during refresh. */
> > +			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
> > +				goto unlock;
> 
> Thinking about it some more, i think a slightly different approach (that has 
> the same end effect) would be better:
> 
>  - Add a "static const int check_count_batching = 1024;" variable that adds 
>    some natural batching - and initialize max_count to that value. There's 
>    little point to make that batching configurable.
> 

Fixed.

The batch_count controls the preemptibility of hung_task. While it might
not make sense to expose the value to user-space, we may want to use a
different value for the PREEMPT config (not sure what the specific values
should be):

#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RCU)
static const int check_count_batching = 256;
#else
static const int check_count_batching = 2048;
#endif

>  - Leave sysctl_hung_task_check_count present but change its default to 
>    something really large like MAX_PID.

Fixed.

Alternatively, the user could renice khungtaskd in order to control the share
of CPU used.

---
Changed the default value of hung_task_check_count to PID_MAX_LIMIT.
hung_task_batching was added to put an upper bound on the critical
section. Every hung_task_batching checks, the RCU lock is broken.
Keeping the critical section small minimizes the time preemption is
disabled and keeps RCU grace periods small.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Frédéric Weisbecker (fweisbec@gmail.com) wrote:
>
> Instead of having this arbitrary limit on the number of tasks, why not
> just watch need_resched() and then schedule if it needs to.
>
> I know that sounds a bit racy, because you will have to release the
> tasklist_lock and a lot of things can happen in the task list before
> you get rescheduled. But you can do a get_task_struct() on g and t
> before your thread goes to sleep, and then put them when it is woken
> up. Perhaps some tasks will disappear or be appended to the list before
> g and t, but that doesn't really matter: if they disappear, they didn't
> lock up, and if they were appended, they are not cold enough yet to be
> worth analyzing :-)
>
> This way you can drop the arbitrary limit on the number of tasks given
> by the user....
>
> Frederic.
>

Signed-off-by: Mandeep Singh Baines <msb@google.com>
---
 kernel/hung_task.c |   39 +++++++++++++++++++++++++++++++++++++--
 1 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index a841db3..34b678c 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,9 +17,18 @@
 #include <linux/sysctl.h>
 
 /*
- * Have a reasonable limit on the number of tasks checked:
+ * The number of tasks checked:
  */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs to be upper-bound.
+ */
+static const int hung_task_batching = 1024;
 
 /*
  * Zero means infinite timeout - no checking done:
@@ -109,6 +118,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 		panic("hung_task: blocked tasks");
 }
 
+ /*
+  * To avoid extending the RCU grace period for an unbounded amount of time,
+  * periodically exit the critical section and enter a new one.
+  *
+  * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+  * to exit the grace period. For classic RCU, a reschedule is required.
+  */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
 /*
  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
  * a really long time (120 seconds). If that happens, print out
@@ -116,6 +143,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
  */
 static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
+	int batch_count = hung_task_batching;
 	int max_count = sysctl_hung_task_check_count;
 	unsigned long now = get_timestamp();
 	struct task_struct *g, *t;
@@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
+		if (!--batch_count) {
+			batch_count = hung_task_batching;
+			rcu_lock_break(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);
-- 
1.5.4.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2 v4] softlockup: check all tasks in hung_task
  2009-02-05  4:35                                   ` [PATCH 2/2 v4] " Mandeep Singh Baines
@ 2009-02-05 14:34                                     ` Ingo Molnar
  2009-02-05 17:48                                       ` Andrew Morton
  2009-02-05 17:56                                       ` [PATCH] softlockup: convert read_lock in hung_task to rcu_read_lock Mandeep Singh Baines
  0 siblings, 2 replies; 31+ messages in thread
From: Ingo Molnar @ 2009-02-05 14:34 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: Frédéric Weisbecker, Peter Zijlstra, linux-kernel,
	rientjes, mbligh, thockin, Andrew Morton


* Mandeep Singh Baines <msb@google.com> wrote:

> The batch_count controls the preemptibility of hung_task. While it might 
> not make sense to expose the value to user-space, we may want to use a 
> different value for the PREEMPT config (not sure what the specific values 
> should be):
> 
> #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RCU)
> static const int check_count_batching = 256;
> #else
> static const int check_count_batching = 2048;
> #endif

That really is not worth it - and such #ifdefs always look ugly and split 
testing as well.

I've applied your patch to tip:core/softlockup, thanks Mandeep!

	Ingo

------------->
From 9952001914a7230ab88212a1e2d0e80208bdb907 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Wed, 4 Feb 2009 20:35:48 -0800
Subject: [PATCH] softlockup: check all tasks in hung_task
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit

Impact: extend the scope of hung-task checks

Changed the default value of hung_task_check_count to PID_MAX_LIMIT.
hung_task_batching was added to put an upper bound on the critical
section. Every hung_task_batching checks, the RCU lock is broken so
that it is never held for too long.

Keeping the critical section small minimizes the time preemption is
disabled and keeps RCU grace periods small.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Suggested-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c |   39 +++++++++++++++++++++++++++++++++++++--
 1 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba8ccd4..d32d293 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,9 +17,18 @@
 #include <linux/sysctl.h>
 
 /*
- * Have a reasonable limit on the number of tasks checked:
+ * The number of tasks checked:
  */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs an upper bound.
+ */
+static const int hung_task_batching = 1024;
 
 /*
  * Zero means infinite timeout - no checking done:
@@ -109,6 +118,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 		panic("hung_task: blocked tasks");
 }
 
+ /*
+  * To avoid extending the RCU grace period for an unbounded amount of time,
+  * periodically exit the critical section and enter a new one.
+  *
+  * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+  * to exit the grace period. For classic RCU, a reschedule is required.
+  */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
 /*
  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
  * a really long time (120 seconds). If that happens, print out
@@ -116,6 +143,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
  */
 static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
+	int batch_count = hung_task_batching;
 	int max_count = sysctl_hung_task_check_count;
 	unsigned long now = get_timestamp();
 	struct task_struct *g, *t;
@@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
+		if (!--batch_count) {
+			batch_count = hung_task_batching;
+			rcu_lock_break(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2 v4] softlockup: check all tasks in hung_task
  2009-02-05 14:34                                     ` Ingo Molnar
@ 2009-02-05 17:48                                       ` Andrew Morton
  2009-02-05 18:07                                         ` Ingo Molnar
  2009-02-05 18:40                                         ` Mandeep Singh Baines
  2009-02-05 17:56                                       ` [PATCH] softlockup: convert read_lock in hung_task to rcu_read_lock Mandeep Singh Baines
  1 sibling, 2 replies; 31+ messages in thread
From: Andrew Morton @ 2009-02-05 17:48 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mandeep Singh Baines, Frédéric Weisbecker,
	Peter Zijlstra, linux-kernel, rientjes, mbligh, thockin

On Thu, 5 Feb 2009 15:34:53 +0100 Ingo Molnar <mingo@elte.hu> wrote:

> 
> Subject: [PATCH] softlockup: check all tasks in hung_task
> 
> Impact: extend the scope of hung-task checks
> 

A nanonit:

> +static const int hung_task_batching = 1024;

static const definitions look pretty but they're a bit misleading.

>  static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  {
> +	int batch_count = hung_task_batching;
>  	int max_count = sysctl_hung_task_check_count;
>  	unsigned long now = get_timestamp();
>  	struct task_struct *g, *t;
> @@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  	do_each_thread(g, t) {
>  		if (!--max_count)
>  			goto unlock;
> +		if (!--batch_count) {
> +			batch_count = hung_task_batching;
> +			rcu_lock_break(g, t);
> +			/* Exit if t or g was unhashed during refresh. */
> +			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
> +				goto unlock;
> +		}
>  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
>  		if (t->state == TASK_UNINTERRUPTIBLE)
>  			check_hung_task(t, now, timeout);

The reader of this area of the code will expect that hung_task_batching
is a variable.  It _looks_ like the value of that variable can be altered
at any time by some other thread.  It _looks_ like this code will explode
if someone has accidentally set hung_task_batching to zero, etc.

But none of that is actually true, because hung_task_batching is, surprisingly,
a compile-time constant.

All this misleadingness would be fixed if it were called
HUNG_TASK_BATCHING.  But then it wouldn't be pretty.
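
Ie, the two forms side by side (same value, very different signals to
the reader - only a sketch of course):

	/* reads as a tunable that some other thread might change: */
	static const int hung_task_batching = 1024;

	/* unmistakably a compile-time constant - just not pretty: */
	#define HUNG_TASK_BATCHING	1024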


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH] softlockup: convert read_lock in hung_task to rcu_read_lock
  2009-02-05 14:34                                     ` Ingo Molnar
  2009-02-05 17:48                                       ` Andrew Morton
@ 2009-02-05 17:56                                       ` Mandeep Singh Baines
  2009-02-05 18:13                                         ` Ingo Molnar
  1 sibling, 1 reply; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-02-05 17:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frédéric Weisbecker, Peter Zijlstra, linux-kernel,
	rientjes, mbligh, thockin, Andrew Morton

Ingo Molnar (mingo@elte.hu) wrote:
> 
> I've applied your patch to tip:core/softlockup, thanks Mandeep!
> 

Oops, I put the conversion to rcu_read_lock in a separate patch. I
understand now that the convention is to use a patch series only when the
patches can be applied independently.

---
Since the tasklist is protected by rcu list operations, it is safe
to convert the read_lock()s to rcu_read_lock().

Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mandeep Singh Baines <msb@google.com>
---
 kernel/hung_task.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d32d293..34b678c 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -155,7 +155,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	if (test_taint(TAINT_DIE) || did_panic)
 		return;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
@@ -171,7 +171,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 			check_hung_task(t, now, timeout);
 	} while_each_thread(g, t);
  unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 }
 
 static void update_poll_jiffies(void)
-- 
1.5.4.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2 v4] softlockup: check all tasks in hung_task
  2009-02-05 17:48                                       ` Andrew Morton
@ 2009-02-05 18:07                                         ` Ingo Molnar
  2009-02-05 18:30                                           ` Andrew Morton
  2009-02-05 18:40                                         ` Mandeep Singh Baines
  1 sibling, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2009-02-05 18:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mandeep Singh Baines, Frédéric Weisbecker,
	Peter Zijlstra, linux-kernel, rientjes, mbligh, thockin


* Andrew Morton <akpm@linux-foundation.org> wrote:

> On Thu, 5 Feb 2009 15:34:53 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> 
> > 
> > Subject: [PATCH] softlockup: check all tasks in hung_task
> > 
> > Impact: extend the scope of hung-task checks
> > 
> 
> A nanonit:

agreed.

> > +static const int hung_task_batching = 1024;
> 
> static const definitions look pretty but they're a bit misleading.
> 
> >  static void check_hung_uninterruptible_tasks(unsigned long timeout)
> >  {
> > +	int batch_count = hung_task_batching;
> >  	int max_count = sysctl_hung_task_check_count;
> >  	unsigned long now = get_timestamp();
> >  	struct task_struct *g, *t;
> > @@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
> >  	do_each_thread(g, t) {
> >  		if (!--max_count)
> >  			goto unlock;
> > +		if (!--batch_count) {
> > +			batch_count = hung_task_batching;
> > +			rcu_lock_break(g, t);
> > +			/* Exit if t or g was unhashed during refresh. */
> > +			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
> > +				goto unlock;
> > +		}
> >  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
> >  		if (t->state == TASK_UNINTERRUPTIBLE)
> >  			check_hung_task(t, now, timeout);
> 
> The reader of this area of the code will expect that hung_task_batching
> is a variable.  It _looks_ like the value of that variable can be altered
> at any time by some other thread.  It _looks_ like this code will explode
> if someone has accidentally set hung_task_batching to zero, etc.
> 
> But none of that is actually true, because hung_task_batching is, surprisingly,
> a compile-time constant.
> 
> All this misleadingness would be fixed if it were called
> HUNG_TASK_BATCHING.  But then it wouldn't be pretty.

i keep running into this paradox myself too. Explicit const C types are the 
perfect replacements for defines, but they create confusion by making it 
look like a variable.

I tend to agree with you that avoiding the confusion is more important than 
having a type - it's not like we are about to have any type related troubles 
here. So i amended the commit in the way below - does that look good to you?

	Ingo

---------------->
From 9d03ba30018a546d20d4aa8bba58978492c82520 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Wed, 4 Feb 2009 20:35:48 -0800
Subject: [PATCH] softlockup: check all tasks in hung_task
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit

Impact: extend the scope of hung-task checks

Changed the default value of hung_task_check_count to PID_MAX_LIMIT.
HUNG_TASK_BATCHING was added to put an upper bound on the critical
section. Every HUNG_TASK_BATCHING checks, the RCU lock is broken so
that it is never held for too long.

Keeping the critical section small minimizes the time preemption is
disabled and keeps RCU grace periods small.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Suggested-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c |   39 +++++++++++++++++++++++++++++++++++++--
 1 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba8ccd4..3c6190b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,9 +17,18 @@
 #include <linux/sysctl.h>
 
 /*
- * Have a reasonable limit on the number of tasks checked:
+ * The number of tasks checked:
  */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs an upper bound.
+ */
+#define HUNG_TASK_BATCHING 1024;
 
 /*
  * Zero means infinite timeout - no checking done:
@@ -110,6 +119,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 }
 
 /*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * to exit the grace period. For classic RCU, a reschedule is required.
+ */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
+/*
  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
  * a really long time (120 seconds). If that happens, print out
  * a warning.
@@ -117,6 +144,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
 	int max_count = sysctl_hung_task_check_count;
+	int batch_count = HUNG_TASK_BATCHING;
 	unsigned long now = get_timestamp();
 	struct task_struct *g, *t;
 
@@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
+		if (!--batch_count) {
+			batch_count = HUNG_TASK_BATCHING;
+			rcu_lock_break(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH] softlockup: convert read_lock in hung_task to rcu_read_lock
  2009-02-05 17:56                                       ` [PATCH] softlockup: convert read_lock in hung_task to rcu_read_lock Mandeep Singh Baines
@ 2009-02-05 18:13                                         ` Ingo Molnar
  0 siblings, 0 replies; 31+ messages in thread
From: Ingo Molnar @ 2009-02-05 18:13 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: Frédéric Weisbecker, Peter Zijlstra, linux-kernel,
	rientjes, mbligh, thockin, Andrew Morton


* Mandeep Singh Baines <msb@google.com> wrote:

> Ingo Molnar (mingo@elte.hu) wrote:
> > 
> > I've applied your patch to tip:core/softlockup, thanks Mandeep!
> > 
> 
> Oops, I put the conversion to rcu_read_lock in a seperate patch. I
> understand now the convention is to use a patch series only when the
> patches can be applied independently.

What you did is in fact the preferred approach: keep independent stuff 
independent and make patches as gradual and split-up as possible. (as long 
as the interim state is a working kernel too)

> ---
> Since the tasklist is protected by rcu list operations, it is safe
> to convert the read_lock()s to rcu_read_lock().
> 
> Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Signed-off-by: Mandeep Singh Baines <msb@google.com>
> ---
>  kernel/hung_task.c |    4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)

Applied to tip:core/softlockup, thanks!

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2 v4] softlockup: check all tasks in hung_task
  2009-02-05 18:07                                         ` Ingo Molnar
@ 2009-02-05 18:30                                           ` Andrew Morton
  2009-02-05 18:58                                             ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-02-05 18:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mandeep Singh Baines, Frédéric Weisbecker,
	Peter Zijlstra, linux-kernel, rientjes, mbligh, thockin

On Thu, 5 Feb 2009 19:07:49 +0100 Ingo Molnar <mingo@elte.hu> wrote:

> 
> So i amended the commit in the way below - does that look good to you?
> 

yup.

> +#define HUNG_TASK_BATCHING 1024;

s/;//



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2 v4] softlockup: check all tasks in hung_task
  2009-02-05 17:48                                       ` Andrew Morton
  2009-02-05 18:07                                         ` Ingo Molnar
@ 2009-02-05 18:40                                         ` Mandeep Singh Baines
  1 sibling, 0 replies; 31+ messages in thread
From: Mandeep Singh Baines @ 2009-02-05 18:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ingo Molnar, Frédéric Weisbecker, Peter Zijlstra,
	linux-kernel, rientjes, mbligh, thockin

Andrew Morton (akpm@linux-foundation.org) wrote:
> On Thu, 5 Feb 2009 15:34:53 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> 
> > 
> > Subject: [PATCH] softlockup: check all tasks in hung_task
> > 
> > Impact: extend the scope of hung-task checks
> > 
> 
> A nanonit:
> 
> > +static const int hung_task_batching = 1024;
> 
> static const definitions look pretty but they're a bit misleading.
> 
> >  static void check_hung_uninterruptible_tasks(unsigned long timeout)
> >  {
> > +	int batch_count = hung_task_batching;
> >  	int max_count = sysctl_hung_task_check_count;
> >  	unsigned long now = get_timestamp();
> >  	struct task_struct *g, *t;
> > @@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
> >  	do_each_thread(g, t) {
> >  		if (!--max_count)
> >  			goto unlock;
> > +		if (!--batch_count) {
> > +			batch_count = hung_task_batching;
> > +			rcu_lock_break(g, t);
> > +			/* Exit if t or g was unhashed during refresh. */
> > +			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
> > +				goto unlock;
> > +		}
> >  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
> >  		if (t->state == TASK_UNINTERRUPTIBLE)
> >  			check_hung_task(t, now, timeout);
> 
> The reader of this area of the code will expect that hung_task_batching
> is a variable.  It _looks_ like the value of that variable can be altered
> at any time by some other thread.  It _looks_ like this code will explode
> if someone has accidentally set hung_task_batching to zero, etc.
> 

The code would not break if hung_task_batching were exported as a sysctl.

If hung_task_batching is set to zero at any time, the behavior will be
to process all tasks in one batch: once the counter goes negative,
!--batch_count never fires again, so rcu_lock_break() is simply never
called. This seems like a reasonable behavior for the zero case. It is
also consistent with the behavior of sysctl_hung_task_check_count.

Maybe a comment should be added at the declaration of both variables,
explaining the zero behavior?
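
Something like this, perhaps (just a sketch, wording to taste):

	/*
	 * The maximum number of tasks checked. Zero means no limit:
	 * all tasks are checked.
	 */
	unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;

	/*
	 * Limit the number of tasks checked per batch. Zero means no
	 * batching: the whole walk runs in a single RCU critical section.
	 */
	static const int hung_task_batching = 1024;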

> But none of that is actually true, because hung_task_batching is, surprisingly,
> a compile-time constant.
> 
> All this misleadingness would be fixed if it were called
> HUNG_TASK_BATCHING.  But then it wouldn't be pretty.
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2 v4] softlockup: check all tasks in hung_task
  2009-02-05 18:30                                           ` Andrew Morton
@ 2009-02-05 18:58                                             ` Ingo Molnar
  0 siblings, 0 replies; 31+ messages in thread
From: Ingo Molnar @ 2009-02-05 18:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mandeep Singh Baines, Frédéric Weisbecker,
	Peter Zijlstra, linux-kernel, rientjes, mbligh, thockin


* Andrew Morton <akpm@linux-foundation.org> wrote:

> On Thu, 5 Feb 2009 19:07:49 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> 
> > 
> > So i amended the commit in the way below - does that look good to you?
> > 
> 
> yup.

added your ack - thanks!

> > +#define HUNG_TASK_BATCHING 1024;
> 
> s/;//

fixed.

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2009-02-05 18:58 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-25 20:50 [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock Frederic Weisbecker
2009-01-26 13:32 ` Ingo Molnar
2009-01-26 13:48 ` Peter Zijlstra
2009-01-26 15:25   ` Frédéric Weisbecker
2009-01-26 15:37     ` Peter Zijlstra
2009-01-26 16:04       ` Frédéric Weisbecker
2009-01-26 17:36         ` Mandeep Baines
2009-01-26 17:41           ` Peter Zijlstra
2009-01-27  0:30             ` [PATCH v4] softlockup: remove hung_task_check_count Mandeep Singh Baines
2009-01-27  9:27               ` Frederic Weisbecker
2009-01-27 13:26               ` Ingo Molnar
2009-01-27 18:48                 ` Mandeep Singh Baines
2009-01-28  8:25                   ` Peter Zijlstra
2009-01-29  1:42                     ` Mandeep Singh Baines
2009-01-30 20:41                       ` Mandeep Singh Baines
2009-01-30 20:46                       ` [PATCH 1/2] softlockup: convert read_lock in hung_task to rcu_read_lock Mandeep Singh Baines
2009-01-30 20:49                       ` [PATCH 2/2] softlockup: check all tasks in hung_task Mandeep Singh Baines
2009-01-31 19:22                         ` Peter Zijlstra
2009-02-03  0:05                           ` [PATCH 2/2 v2] " Mandeep Singh Baines
2009-02-03 12:23                             ` Ingo Molnar
2009-02-03 20:56                               ` [PATCH 2/2 v3] " Mandeep Singh Baines
2009-02-04 19:43                                 ` Ingo Molnar
2009-02-05  4:35                                   ` [PATCH 2/2 v4] " Mandeep Singh Baines
2009-02-05 14:34                                     ` Ingo Molnar
2009-02-05 17:48                                       ` Andrew Morton
2009-02-05 18:07                                         ` Ingo Molnar
2009-02-05 18:30                                           ` Andrew Morton
2009-02-05 18:58                                             ` Ingo Molnar
2009-02-05 18:40                                         ` Mandeep Singh Baines
2009-02-05 17:56                                       ` [PATCH] softlockup: convert read_lock in hung_task to rcu_read_lock Mandeep Singh Baines
2009-02-05 18:13                                         ` Ingo Molnar
