linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC GIT PULL] scheduler fix for autogroups
@ 2012-12-01 11:16 Ingo Molnar
  2012-12-01 21:16 ` Linus Torvalds
  0 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2012-12-01 11:16 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton

Linus,

Please [RFC] pull the latest sched-urgent-for-linus git tree 
from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

   HEAD: 5258f386ea4e8454bc801fb443e8a4217da1947c sched/autogroup: Fix crash on reboot when autogroup is disabled

Larger and scarier than what I'd like to have so I marked it 
[RFC], but the autogroup fix looks important enough - it's 
enabled in a number of distros.

[ The other reason for the RFC: unfortunately the new feature 
  flag change "sched: Add WAKEUP_PREEMPTION feature flag, on by 
  default" got added weeks ago and got stuck below the autogroup 
  fix - I can rebase and untangle it if it's a problem. (It is 
  not expected to cause any change in behavior or problems.) ]

Thanks,

	Ingo

------------------>
Ingo Molnar (1):
      sched: Add WAKEUP_PREEMPTION feature flag, on by default

Mike Galbraith (1):
      sched/autogroup: Fix crash on reboot when autogroup is disabled


 fs/proc/base.c            | 78 -----------------------------------------------
 kernel/sched/auto_group.c | 68 +++++++----------------------------------
 kernel/sched/auto_group.h |  9 +-----
 kernel/sched/fair.c       |  2 +-
 kernel/sched/features.h   |  5 +++
 kernel/sysctl.c           |  6 ++--
 6 files changed, 20 insertions(+), 148 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b6c84c..bb1d962 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1271,81 +1271,6 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
-#ifdef CONFIG_SCHED_AUTOGROUP
-/*
- * Print out autogroup related information:
- */
-static int sched_autogroup_show(struct seq_file *m, void *v)
-{
-	struct inode *inode = m->private;
-	struct task_struct *p;
-
-	p = get_proc_task(inode);
-	if (!p)
-		return -ESRCH;
-	proc_sched_autogroup_show_task(p, m);
-
-	put_task_struct(p);
-
-	return 0;
-}
-
-static ssize_t
-sched_autogroup_write(struct file *file, const char __user *buf,
-	    size_t count, loff_t *offset)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct task_struct *p;
-	char buffer[PROC_NUMBUF];
-	int nice;
-	int err;
-
-	memset(buffer, 0, sizeof(buffer));
-	if (count > sizeof(buffer) - 1)
-		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
-
-	err = kstrtoint(strstrip(buffer), 0, &nice);
-	if (err < 0)
-		return err;
-
-	p = get_proc_task(inode);
-	if (!p)
-		return -ESRCH;
-
-	err = proc_sched_autogroup_set_nice(p, nice);
-	if (err)
-		count = err;
-
-	put_task_struct(p);
-
-	return count;
-}
-
-static int sched_autogroup_open(struct inode *inode, struct file *filp)
-{
-	int ret;
-
-	ret = single_open(filp, sched_autogroup_show, NULL);
-	if (!ret) {
-		struct seq_file *m = filp->private_data;
-
-		m->private = inode;
-	}
-	return ret;
-}
-
-static const struct file_operations proc_pid_sched_autogroup_operations = {
-	.open		= sched_autogroup_open,
-	.read		= seq_read,
-	.write		= sched_autogroup_write,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
-
 static ssize_t comm_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *offset)
 {
@@ -3036,9 +2961,6 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
-#ifdef CONFIG_SCHED_AUTOGROUP
-	REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
-#endif
 	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",    S_IRUGO, proc_pid_syscall),
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21..0f1bacb 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -110,6 +110,9 @@ out_fail:
 
 bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 {
+	if (!sysctl_sched_autogroup_enabled)
+		return false;
+
 	if (tg != &root_task_group)
 		return false;
 
@@ -143,15 +146,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 
 	p->signal->autogroup = autogroup_kref_get(ag);
 
-	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
-		goto out;
-
 	t = p;
 	do {
 		sched_move_task(t);
 	} while_each_thread(p, t);
 
-out:
 	unlock_task_sighand(p, &flags);
 	autogroup_kref_put(prev);
 }
@@ -159,8 +158,11 @@ out:
 /* Allocates GFP_KERNEL, cannot be called under any spinlock */
 void sched_autogroup_create_attach(struct task_struct *p)
 {
-	struct autogroup *ag = autogroup_create();
+	struct autogroup *ag;
 
+	if (!sysctl_sched_autogroup_enabled)
+		return;
+	ag = autogroup_create();
 	autogroup_move_group(p, ag);
 	/* drop extra reference added by autogroup_create() */
 	autogroup_kref_put(ag);
@@ -176,11 +178,15 @@ EXPORT_SYMBOL(sched_autogroup_detach);
 
 void sched_autogroup_fork(struct signal_struct *sig)
 {
+	if (!sysctl_sched_autogroup_enabled)
+		return;
 	sig->autogroup = autogroup_task_get(current);
 }
 
 void sched_autogroup_exit(struct signal_struct *sig)
 {
+	if (!sysctl_sched_autogroup_enabled)
+		return;
 	autogroup_kref_put(sig->autogroup);
 }
 
@@ -193,58 +199,6 @@ static int __init setup_autogroup(char *str)
 
 __setup("noautogroup", setup_autogroup);
 
-#ifdef CONFIG_PROC_FS
-
-int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
-{
-	static unsigned long next = INITIAL_JIFFIES;
-	struct autogroup *ag;
-	int err;
-
-	if (nice < -20 || nice > 19)
-		return -EINVAL;
-
-	err = security_task_setnice(current, nice);
-	if (err)
-		return err;
-
-	if (nice < 0 && !can_nice(current, nice))
-		return -EPERM;
-
-	/* this is a heavy operation taking global locks.. */
-	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
-		return -EAGAIN;
-
-	next = HZ / 10 + jiffies;
-	ag = autogroup_task_get(p);
-
-	down_write(&ag->lock);
-	err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
-	if (!err)
-		ag->nice = nice;
-	up_write(&ag->lock);
-
-	autogroup_kref_put(ag);
-
-	return err;
-}
-
-void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
-{
-	struct autogroup *ag = autogroup_task_get(p);
-
-	if (!task_group_is_autogroup(ag->tg))
-		goto out;
-
-	down_read(&ag->lock);
-	seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
-	up_read(&ag->lock);
-
-out:
-	autogroup_kref_put(ag);
-}
-#endif /* CONFIG_PROC_FS */
-
 #ifdef CONFIG_SCHED_DEBUG
 int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd0471..4552c6b 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -4,11 +4,6 @@
 #include <linux/rwsem.h>
 
 struct autogroup {
-	/*
-	 * reference doesn't mean how many thread attach to this
-	 * autogroup now. It just stands for the number of task
-	 * could use this autogroup.
-	 */
 	struct kref		kref;
 	struct task_group	*tg;
 	struct rw_semaphore	lock;
@@ -29,9 +24,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
 {
-	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
-
-	if (enabled && task_wants_autogroup(p, tg))
+	if (task_wants_autogroup(p, tg))
 		return p->signal->autogroup->tg;
 
 	return tg;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a1..f936552 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2907,7 +2907,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
 	 * is driven by the tick):
 	 */
-	if (unlikely(p->policy != SCHED_NORMAL))
+	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
 		return;
 
 	find_matching_se(&se, &pse);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefca..e68e69a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -32,6 +32,11 @@ SCHED_FEAT(LAST_BUDDY, true)
 SCHED_FEAT(CACHE_HOT_BUDDY, true)
 
 /*
+ * Allow wakeup-time preemption of the current task:
+ */
+SCHED_FEAT(WAKEUP_PREEMPTION, true)
+
+/*
  * Use arch dependent cpu power functions
  */
 SCHED_FEAT(ARCH_POWER, true)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 81c7b1a..2914d0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -363,10 +363,8 @@ static struct ctl_table kern_table[] = {
 		.procname	= "sched_autogroup_enabled",
 		.data		= &sysctl_sched_autogroup_enabled,
 		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-01 11:16 [RFC GIT PULL] scheduler fix for autogroups Ingo Molnar
@ 2012-12-01 21:16 ` Linus Torvalds
  2012-12-01 21:44   ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2012-12-01 21:16 UTC (permalink / raw)
  To: Ingo Molnar, Mike Galbraith
  Cc: Linux Kernel Mailing List, Peter Zijlstra, Thomas Gleixner,
	Andrew Morton

On Sat, Dec 1, 2012 at 3:16 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> Please [RFC] pull the latest sched-urgent-for-linus git tree
> from:

No. That patch is braindead. I wouldn't pull it even if it wasn't this late.

Why the hell leave a read-only 'sched_autogroup_enabled' proc file?
What the f*ck is the point? It looks like the flag still exists (we
test it), but now there's no point to it, since you can't change it.

What am I missing?

                Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-01 21:16 ` Linus Torvalds
@ 2012-12-01 21:44   ` Ingo Molnar
  2012-12-01 21:47     ` Ingo Molnar
                       ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Ingo Molnar @ 2012-12-01 21:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Mike Galbraith, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sat, Dec 1, 2012 at 3:16 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > Please [RFC] pull the latest sched-urgent-for-linus git tree
> > from:
> 
> No. That patch is braindead. I wouldn't pull it even if it 
> wasn't this late.
> 
> Why the hell leave a read-only 'sched_autogroup_enabled' proc 
> file?
>
> What the f*ck is the point? It looks like the flag still 
> exists (we test it), but now there's no point to it, since you 
> can't change it.
> 
> What am I missing?

You are not missing anything. That flag is my fault not Mike's: 
I booted the initial version of that patch but was unsure 
whether autogroups was enabled - it's a pretty transparent 
feature. So I figured that having that flag (but readonly) would 
give us this information definitely.

So I suggested to Mike to keep that flag so that user-space is 
informed that autogroups is enabled. It seemed like a cute 
usability twist at that time, and there's existing precedent for 
it in /proc, but now I'm not so sure anymore...

Should we use some other file for that - or no file at all and 
just emit a bootup printk for kernel hackers with a short 
attention span?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-01 21:44   ` Ingo Molnar
@ 2012-12-01 21:47     ` Ingo Molnar
  2012-12-01 22:03     ` Linus Torvalds
  2012-12-02  8:21     ` Mike Galbraith
  2 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2012-12-01 21:47 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Mike Galbraith, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton


* Ingo Molnar <mingo@kernel.org> wrote:

> You are not missing anything. That flag is my fault not 
> Mike's: I booted the initial version of that patch but was 
> unsure whether autogroups was enabled - it's a pretty 
> transparent feature. So I figured that having that flag (but 
> readonly) would give us this information definitely.

The other reason was that the original version of the patch also 
added a boot parameter - to enable/disable autogroups from the 
boot command line. With *that* configuration twist it made sense 
to present this information somewhere in /proc as well.

But then we got rid of the boot parameter to simplify the patch 
- which further reduced the sense of the 
/proc/sys/kernel/sched_autogroup_enabled flag - which now can 
only ever be 1.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-01 21:44   ` Ingo Molnar
  2012-12-01 21:47     ` Ingo Molnar
@ 2012-12-01 22:03     ` Linus Torvalds
  2012-12-02  7:34       ` Mike Galbraith
  2012-12-02  8:21     ` Mike Galbraith
  2 siblings, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2012-12-01 22:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton

On Sat, Dec 1, 2012 at 1:44 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> You are not missing anything. That flag is my fault not Mike's:
> I booted the initial version of that patch but was unsure
> whether autogroups was enabled - it's a pretty transparent
> feature. So I figured that having that flag (but readonly) would
> give us this information definitely.

So what's the advantage of it being read-only at all?

Since the flag is clearly *used*, make it read-write, and then all my
objections go away (except for a slight worry that the dropping of
/proc/<pid>/autogroup_nice or whatever it is could break some odd
system app, but I don't worry *too* much about that).

Disabling autogroup is clearly something people might want, since the
code tests for it. So removing the flag entirely seems wrong too. But
if it exists, it should be writable. No?

             Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-01 22:03     ` Linus Torvalds
@ 2012-12-02  7:34       ` Mike Galbraith
  0 siblings, 0 replies; 11+ messages in thread
From: Mike Galbraith @ 2012-12-02  7:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton

On Sat, 2012-12-01 at 14:03 -0800, Linus Torvalds wrote: 
> On Sat, Dec 1, 2012 at 1:44 PM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > You are not missing anything. That flag is my fault not Mike's:
> > I booted the initial version of that patch but was unsure
> > whether autogroups was enabled - it's a pretty transparent
> > feature. So I figured that having that flag (but readonly) would
> > give us this information definitely.
> 
> So what's the advantage of it being read-only at all?
> 
> Since the flag is clearly *used*, make it read-write, and then all my
> objections go away (except for a slight worry that the dropping of
> /proc/<pid>/autogroup_nice or whatever it is could break some odd
> system app, but I don't worry *too* much about that).
> 
> Disabling autogroup is clearly something people might want, since the
> code tests for it. So removing the flag entirely seems wrong too. But
> if it exists, it should be writable. No?

No, because turning autogroup off at runtime is what now makes boom.

With Peter's race fix in place, lazy movement (was noop on UP) is gone,
mandating that you either walk the box, moving all tasks when you flick
the switch, or you remove the ability to flick the switch other than
'off' at boot time.  You didn't like the original instant on/off switch,
and I didn't like the thought of making autogroup bigger to fix the
explosion either, so went for rip the switch and problematic /proc stuff
that really should have never existed in the first place out option.

-Mike


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-01 21:44   ` Ingo Molnar
  2012-12-01 21:47     ` Ingo Molnar
  2012-12-01 22:03     ` Linus Torvalds
@ 2012-12-02  8:21     ` Mike Galbraith
  2012-12-02 19:27       ` Ingo Molnar
  2 siblings, 1 reply; 11+ messages in thread
From: Mike Galbraith @ 2012-12-02  8:21 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton

On Sat, 2012-12-01 at 22:44 +0100, Ingo Molnar wrote:

> Should we use some other file for that - or no file at all and 
> just emit a bootup printk for kernel hackers with a short 
> attention span?

Or, whack the file and don't bother with a printk either.  If it's in
your config, and your command line doesn't contain noautogroup, it's on,
so the info is already present (until buffer gets full).  That makes for
even fewer lines dedicated to dinky sideline feature.

Or (as previously mentioned) just depreciate (or rip out) the whole
thing since systemd is propagating everywhere anyway, and offers the
same functionality.

For 3.7, a revert of 800d4d30c8f2 would prevent the explosion when folks
play with the now non-functional on/off switch (task groups are required
to _always_ exist, that commit busted the autogroup assumption), so is
perhaps a viable quickfix until autogroups fate is decided?

-Mike


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-02  8:21     ` Mike Galbraith
@ 2012-12-02 19:27       ` Ingo Molnar
  2012-12-02 19:36         ` Linus Torvalds
  0 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2012-12-02 19:27 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Linus Torvalds, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton


* Mike Galbraith <efault@gmx.de> wrote:

> On Sat, 2012-12-01 at 22:44 +0100, Ingo Molnar wrote:
> 
> > Should we use some other file for that - or no file at all and 
> > just emit a bootup printk for kernel hackers with a short 
> > attention span?
> 
> Or, whack the file and don't bother with a printk either.  If 
> it's in your config, and your command line doesn't contain 
> noautogroup, it's on, so the info is already present (until 
> buffer gets full).  That makes for even fewer lines dedicated 
> to dinky sideline feature.
> 
> Or (as previously mentioned) just depreciate (or rip out) the 
> whole thing since systemd is propagating everywhere anyway, 
> and offers the same functionality.
> 
> For 3.7, a revert of 800d4d30c8f2 would prevent the explosion 
> when folks play with the now non-functional on/off switch 
> (task groups are required to _always_ exist, that commit 
> busted the autogroup assumption), so is perhaps a viable 
> quickfix until autogroups fate is decided?

Linus, which one would be your preference? I'm fine with the 
first and third options - #2 that rips it all out looks like
a sad removal of an otherwise useful feature.

( The fourth option would be to fix the dynamic knobs - there's 
  no patch for that yet. )

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-02 19:27       ` Ingo Molnar
@ 2012-12-02 19:36         ` Linus Torvalds
  2012-12-03  5:25           ` [patch] " Mike Galbraith
  0 siblings, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2012-12-02 19:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton

On Sun, Dec 2, 2012 at 11:27 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Mike Galbraith <efault@gmx.de> wrote:
>
>> On Sat, 2012-12-01 at 22:44 +0100, Ingo Molnar wrote:
>>
>> > Should we use some other file for that - or no file at all and
>> > just emit a bootup printk for kernel hackers with a short
>> > attention span?
>>
>> Or, whack the file and don't bother with a printk either.  If
>> it's in your config, and your command line doesn't contain
>> noautogroup, it's on, so the info is already present (until
>> buffer gets full).  That makes for even fewer lines dedicated
>> to dinky sideline feature.
>>
>> Or (as previously mentioned) just depreciate (or rip out) the
>> whole thing since systemd is propagating everywhere anyway,
>> and offers the same functionality.
>>
>> For 3.7, a revert of 800d4d30c8f2 would prevent the explosion
>> when folks play with the now non-functional on/off switch
>> (task groups are required to _always_ exist, that commit
>> busted the autogroup assumption), so is perhaps a viable
>> quickfix until autogroups fate is decided?
>
> Linus, which one would be your preference? I'm fine with the
> first and third options - #2 that rips it all out looks like
> a sad removal of an otherwise useful feature.

I suspect #3 is the best option right now - just revert 800d4d30c8f2.

Willing to write a changelog with the pointer to the actual oops that
happens due to this issue?

               Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [patch] Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-02 19:36         ` Linus Torvalds
@ 2012-12-03  5:25           ` Mike Galbraith
  2012-12-03  5:36             ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Mike Galbraith @ 2012-12-03  5:25 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton, Yong Zhang

On Sun, 2012-12-02 at 11:36 -0800, Linus Torvalds wrote: 
> On Sun, Dec 2, 2012 at 11:27 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > * Mike Galbraith <efault@gmx.de> wrote:
> >
> >> On Sat, 2012-12-01 at 22:44 +0100, Ingo Molnar wrote:
> >>
> >> > Should we use some other file for that - or no file at all and
> >> > just emit a bootup printk for kernel hackers with a short
> >> > attention span?
> >>
> >> Or, whack the file and don't bother with a printk either.  If
> >> it's in your config, and your command line doesn't contain
> >> noautogroup, it's on, so the info is already present (until
> >> buffer gets full).  That makes for even fewer lines dedicated
> >> to dinky sideline feature.
> >>
> >> Or (as previously mentioned) just depreciate (or rip out) the
> >> whole thing since systemd is propagating everywhere anyway,
> >> and offers the same functionality.
> >>
> >> For 3.7, a revert of 800d4d30c8f2 would prevent the explosion
> >> when folks play with the now non-functional on/off switch
> >> (task groups are required to _always_ exist, that commit
> >> busted the autogroup assumption), so is perhaps a viable
> >> quickfix until autogroups fate is decided?
> >
> > Linus, which one would be your preference? I'm fine with the
> > first and third options - #2 that rips it all out looks like
> > a sad removal of an otherwise useful feature.
> 
> I suspect #3 is the best option right now - just revert 800d4d30c8f2.
> 
> Willing to write a changelog with the pointer to the actual oops that
> happens due to this issue?

I don't have a link, so reproduced/captured it.  With systemd-sysvinit
(bleh) installed, it's trivial to reproduce:

Add echo 0 > /proc/sys/kernel/sched_autogroup_enabled to /root/.bashrc
(or wherever), boot box, type reboot, box explodes.

revert 800d4d30 sched, autogroup: Stop going ahead if autogroup is disabled

Between 8323f26ce and 800d4d30, autogroup is a wreck.  With both
applied, all you have to do to crash a box is disable autogroup
during boot up, then reboot.. boom, NULL pointer dereference due
to 800d4d30 not allowing autogroup to move things, and 8323f26ce
making that the only way to switch runqueues.
    
[  202.187747] BUG: unable to handle kernel NULL pointer dereference at           (null)
[  202.191644] IP: [<ffffffff81063ac0>] effective_load.isra.43+0x50/0x90
[  202.191644] PGD 220a74067 PUD 220402067 PMD 0 
[  202.191644] Oops: 0000 [#1] SMP 
[  202.191644] Modules linked in: nfs nfsd fscache lockd nfs_acl auth_rpcgss sunrpc exportfs bridge stp cpufreq_conservative cpufreq_ondemand cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd fuse nls_iso8859_1 snd_hda_codec_realtek nls_cp437 snd_hda_intel vfat fat snd_hda_codec e1000e sr_mod snd_hwdep cdrom snd_pcm sg snd_timer usb_storage snd firewire_ohci usb_libusual firewire_core soundcore uas snd_page_alloc i2c_i801 coretemp edd microcode hid_generic button crc_itu_t ipv6 autofs4 ext4 mbcache jbd2 crc16 usbhid hid sd_mod uhci_hcd ahci libahci libata rtc_cmos ehci_hcd scsi_mod thermal fan usbcore processor usb_common
[  202.191644] CPU 0 
[  202.191644] Pid: 7047, comm: systemd-user-se Not tainted 3.6.8-smp #7 MEDIONPC MS-7502/MS-7502
[  202.191644] RIP: 0010:[<ffffffff81063ac0>]  [<ffffffff81063ac0>] effective_load.isra.43+0x50/0x90
[  202.191644] RSP: 0018:ffff880221ddfbd8  EFLAGS: 00010086
[  202.191644] RAX: 0000000000000400 RBX: ffff88022621d880 RCX: 0000000000000000
[  202.191644] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff880220a363a0
[  202.191644] RBP: ffff880221ddfbd8 R08: 0000000000000400 R09: 00000000000115c0
[  202.191644] R10: 0000000000000000 R11: 0000000000000400 R12: ffff8802214ed180
[  202.191644] R13: 00000000000003fd R14: 0000000000000000 R15: 0000000000000003
[  202.191644] FS:  00007f174a81c7a0(0000) GS:ffff88022fc00000(0000) knlGS:0000000000000000
[  202.191644] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  202.191644] CR2: 0000000000000000 CR3: 0000000221fad000 CR4: 00000000000007f0
[  202.191644] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  202.191644] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  202.191644] Process systemd-user-se (pid: 7047, threadinfo ffff880221dde000, task ffff88022618b3a0)
[  202.191644] Stack:
[  202.191644]  ffff880221ddfc88 ffffffff81063d55 0000000000000400 00000000000115c0
[  202.191644]  ffff88022235c218 ffffffff814ef9e8 ffffea0000000000 ffff88022621d880
[  202.191644]  ffff880227007200 ffffffff00000003 0000000000000010 0000000000018f38
[  202.191644] Call Trace:
[  202.191644]  [<ffffffff81063d55>] select_task_rq_fair+0x255/0x780
[  202.191644]  [<ffffffff810607e6>] try_to_wake_up+0x156/0x2c0
[  202.191644]  [<ffffffff8106098b>] wake_up_state+0xb/0x10
[  202.191644]  [<ffffffff81044f88>] signal_wake_up+0x28/0x40
[  202.191644]  [<ffffffff81045406>] complete_signal+0x1d6/0x250
[  202.191644]  [<ffffffff810455f0>] __send_signal+0x170/0x310
[  202.191644]  [<ffffffff810457d0>] send_signal+0x40/0x80
[  202.191644]  [<ffffffff81046257>] do_send_sig_info+0x47/0x90
[  202.191644]  [<ffffffff8104649a>] group_send_sig_info+0x4a/0x70
[  202.191644]  [<ffffffff810465ba>] kill_pid_info+0x3a/0x60
[  202.191644]  [<ffffffff81047ac7>] sys_kill+0x97/0x1a0
[  202.191644]  [<ffffffff810ebc10>] ? vfs_read+0x120/0x160
[  202.191644]  [<ffffffff810ebc95>] ? sys_read+0x45/0x90
[  202.191644]  [<ffffffff8134bde2>] system_call_fastpath+0x16/0x1b
[  202.191644] Code: 49 0f af 41 50 31 d2 49 f7 f0 48 83 f8 01 48 0f 46 c6 48 2b 07 48 8b bf 40 01 00 00 48 85 ff 74 3a 45 31 c0 48 8b 8f 50 01 00 00 <48> 8b 11 4c 8b 89 80 00 00 00 49 89 d2 48 01 d0 45 8b 59 58 4c 
[  202.191644] RIP  [<ffffffff81063ac0>] effective_load.isra.43+0x50/0x90
[  202.191644]  RSP <ffff880221ddfbd8>
[  202.191644] CR2: 0000000000000000

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Yong Zhang <yong.zhang0@gmail.com>
Cc: stable@vger.kernel.org

---
 kernel/sched/auto_group.c |    4 ----
 kernel/sched/auto_group.h |    5 -----
 2 files changed, 9 deletions(-)

--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -143,15 +143,11 @@ autogroup_move_group(struct task_struct
 
 	p->signal->autogroup = autogroup_kref_get(ag);
 
-	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
-		goto out;
-
 	t = p;
 	do {
 		sched_move_task(t);
 	} while_each_thread(p, t);
 
-out:
 	unlock_task_sighand(p, &flags);
 	autogroup_kref_put(prev);
 }
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -4,11 +4,6 @@
 #include <linux/rwsem.h>
 
 struct autogroup {
-	/*
-	 * reference doesn't mean how many thread attach to this
-	 * autogroup now. It just stands for the number of task
-	 * could use this autogroup.
-	 */
 	struct kref		kref;
 	struct task_group	*tg;
 	struct rw_semaphore	lock;



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] Re: [RFC GIT PULL] scheduler fix for autogroups
  2012-12-03  5:25           ` [patch] " Mike Galbraith
@ 2012-12-03  5:36             ` Ingo Molnar
  0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2012-12-03  5:36 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Linus Torvalds, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton, Yong Zhang


* Mike Galbraith <efault@gmx.de> wrote:

> > Willing to write a changelog with the pointer to the actual 
> > oops that happens due to this issue?
> 
> I don't have a link, so reproduced/captured it.  With 
> systemd-sysvinit (bleh) installed, it's trivial to reproduce:
> 
> Add echo 0 > /proc/sys/kernel/sched_autogroup_enabled to /root/.bashrc
> (or wherever), boot box, type reboot, box explodes.
> 
> revert 800d4d30 sched, autogroup: Stop going ahead if autogroup is disabled
> 
> Between 8323f26ce and 800d4d30, autogroup is a wreck.  With both

Slightly decoded, for our human readers:

 8323f26ce342 ("sched: Fix race in task_group()")

:-)

> applied, all you have to do to crash a box is disable autogroup
> during boot up, then reboot.. boom, NULL pointer dereference due
> to 800d4d30 not allowing autogroup to move things, and 8323f26ce
> making that the only way to switch runqueues.
>     
> [  202.187747] BUG: unable to handle kernel NULL pointer dereference at           (null)
> [  202.191644] IP: [<ffffffff81063ac0>] effective_load.isra.43+0x50/0x90
> [  202.191644] PGD 220a74067 PUD 220402067 PMD 0 
> [  202.191644] Oops: 0000 [#1] SMP 
> [  202.191644] Modules linked in: nfs nfsd fscache lockd nfs_acl auth_rpcgss sunrpc exportfs bridge stp cpufreq_conservative cpufreq_ondemand cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd fuse nls_iso8859_1 snd_hda_codec_realtek nls_cp437 snd_hda_intel vfat fat snd_hda_codec e1000e sr_mod snd_hwdep cdrom snd_pcm sg snd_timer usb_storage snd firewire_ohci usb_libusual firewire_core soundcore uas snd_page_alloc i2c_i801 coretemp edd microcode hid_generic button crc_itu_t ipv6 autofs4 ext4 mbcache jbd2 crc16 usbhid hid sd_mod uhci_hcd ahci libahci libata rtc_cmos ehci_hcd scsi_mod thermal fan usbcore processor usb_common
> [  202.191644] CPU 0 
> [  202.191644] Pid: 7047, comm: systemd-user-se Not tainted 3.6.8-smp #7 MEDIONPC MS-7502/MS-7502
> [  202.191644] RIP: 0010:[<ffffffff81063ac0>]  [<ffffffff81063ac0>] effective_load.isra.43+0x50/0x90
> [  202.191644] RSP: 0018:ffff880221ddfbd8  EFLAGS: 00010086
> [  202.191644] RAX: 0000000000000400 RBX: ffff88022621d880 RCX: 0000000000000000
> [  202.191644] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff880220a363a0
> [  202.191644] RBP: ffff880221ddfbd8 R08: 0000000000000400 R09: 00000000000115c0
> [  202.191644] R10: 0000000000000000 R11: 0000000000000400 R12: ffff8802214ed180
> [  202.191644] R13: 00000000000003fd R14: 0000000000000000 R15: 0000000000000003
> [  202.191644] FS:  00007f174a81c7a0(0000) GS:ffff88022fc00000(0000) knlGS:0000000000000000
> [  202.191644] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  202.191644] CR2: 0000000000000000 CR3: 0000000221fad000 CR4: 00000000000007f0
> [  202.191644] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  202.191644] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [  202.191644] Process systemd-user-se (pid: 7047, threadinfo ffff880221dde000, task ffff88022618b3a0)
> [  202.191644] Stack:
> [  202.191644]  ffff880221ddfc88 ffffffff81063d55 0000000000000400 00000000000115c0
> [  202.191644]  ffff88022235c218 ffffffff814ef9e8 ffffea0000000000 ffff88022621d880
> [  202.191644]  ffff880227007200 ffffffff00000003 0000000000000010 0000000000018f38
> [  202.191644] Call Trace:
> [  202.191644]  [<ffffffff81063d55>] select_task_rq_fair+0x255/0x780
> [  202.191644]  [<ffffffff810607e6>] try_to_wake_up+0x156/0x2c0
> [  202.191644]  [<ffffffff8106098b>] wake_up_state+0xb/0x10
> [  202.191644]  [<ffffffff81044f88>] signal_wake_up+0x28/0x40
> [  202.191644]  [<ffffffff81045406>] complete_signal+0x1d6/0x250
> [  202.191644]  [<ffffffff810455f0>] __send_signal+0x170/0x310
> [  202.191644]  [<ffffffff810457d0>] send_signal+0x40/0x80
> [  202.191644]  [<ffffffff81046257>] do_send_sig_info+0x47/0x90
> [  202.191644]  [<ffffffff8104649a>] group_send_sig_info+0x4a/0x70
> [  202.191644]  [<ffffffff810465ba>] kill_pid_info+0x3a/0x60
> [  202.191644]  [<ffffffff81047ac7>] sys_kill+0x97/0x1a0
> [  202.191644]  [<ffffffff810ebc10>] ? vfs_read+0x120/0x160
> [  202.191644]  [<ffffffff810ebc95>] ? sys_read+0x45/0x90
> [  202.191644]  [<ffffffff8134bde2>] system_call_fastpath+0x16/0x1b
> [  202.191644] Code: 49 0f af 41 50 31 d2 49 f7 f0 48 83 f8 01 48 0f 46 c6 48 2b 07 48 8b bf 40 01 00 00 48 85 ff 74 3a 45 31 c0 48 8b 8f 50 01 00 00 <48> 8b 11 4c 8b 89 80 00 00 00 49 89 d2 48 01 d0 45 8b 59 58 4c 
> [  202.191644] RIP  [<ffffffff81063ac0>] effective_load.isra.43+0x50/0x90
> [  202.191644]  RSP <ffff880221ddfbd8>
> [  202.191644] CR2: 0000000000000000
> 
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> Cc: Yong Zhang <yong.zhang0@gmail.com>
> Cc: stable@vger.kernel.org

Thanks Mike!

Acked-by: Ingo Molnar <mingo@kernel.org>

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2012-12-03  5:36 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-12-01 11:16 [RFC GIT PULL] scheduler fix for autogroups Ingo Molnar
2012-12-01 21:16 ` Linus Torvalds
2012-12-01 21:44   ` Ingo Molnar
2012-12-01 21:47     ` Ingo Molnar
2012-12-01 22:03     ` Linus Torvalds
2012-12-02  7:34       ` Mike Galbraith
2012-12-02  8:21     ` Mike Galbraith
2012-12-02 19:27       ` Ingo Molnar
2012-12-02 19:36         ` Linus Torvalds
2012-12-03  5:25           ` [patch] " Mike Galbraith
2012-12-03  5:36             ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).