From: Tejun Heo <tj@kernel.org>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>,
	Ingo Molnar <mingo@redhat.com>,
	Sasha Levin <sasha.levin@oracle.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	cgroups <cgroups@vger.kernel.org>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	Vladimir Davydov <vdavydov@parallels.com>,
	kernel-team <kernel-team@fb.com>,
	Dmitry Vyukov <dvyukov@google.com>,
	Peter Zijlstra <peterz@infradead.org>
Subject: [PATCH v4.4-rc7] sched: move sched lock synchronized bitfields in task_struct into ->atomic_flags
Date: Thu, 31 Dec 2015 21:56:28 -0500	[thread overview]
Message-ID: <20160101025628.GA3660@htj.duckdns.org> (raw)
In-Reply-To: <CA+55aFx0WxoUPrOPaq3HxM+YUQQ0DPV-c3f8kE1ec7agERb_Lg@mail.gmail.com>

task_struct has a cluster of unsigned bitfields.  Some are updated
under scheduler locks while others are updated only by the task
itself.  Currently, the two classes of bitfields aren't distinguished
and end up in the same word, which can lead to clobbering when there
are simultaneous read-modify-write attempts.  While difficult to prove
definitively, it's likely that the resulting inconsistency led to
low-frequency failures such as wrong memcg_may_oom state or loadavg
underflow due to a clobbered sched_contributes_to_load.
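
To illustrate the hazard, here is a minimal userspace sketch (not
kernel code; the field names are only borrowed for illustration).  A
bitfield store is a plain read-modify-write of the containing word, so
two contexts toggling sibling bits can silently lose each other's
updates:

/* build with: gcc -O2 -pthread race.c */
#include <pthread.h>
#include <stdio.h>

struct flags {
	unsigned sched_contributes_to_load:1;	/* "scheduler" side */
	unsigned memcg_may_oom:1;		/* "current" side */
};

static struct flags f;

static void *sched_side(void *arg)
{
	for (int i = 0; i < 1000000; i++)
		f.sched_contributes_to_load ^= 1; /* load word, flip bit, store word */
	return NULL;
}

static void *task_side(void *arg)
{
	for (int i = 0; i < 1000000; i++)
		f.memcg_may_oom ^= 1;		/* races with the store above */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, sched_side, NULL);
	pthread_create(&b, NULL, task_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/*
	 * Each bit was toggled an even number of times, so both "should"
	 * be 0, but lost updates from the racing RMWs can leave either
	 * bit set.
	 */
	printf("contributes_to_load=%u memcg_may_oom=%u\n",
	       f.sched_contributes_to_load, f.memcg_may_oom);
	return 0;
}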

Fix it by moving sched lock synchronized bitfields into
->atomic_flags.

v2: Move flags into ->atomic_flags instead of segregating bitfields.

Original-patch-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/g/55FEC685.5010404@oracle.com
Cc: stable@vger.kernel.org
---
Hello,

task_struct is pretty well packed and I couldn't find a good hole to
fit a separate integer into.  atomic_flags is a bit cumbersome, but it
looks like the better option.
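
In case it helps review, here is a rough userspace approximation of the
accessor pattern the patch relies on.  The kernel's atomic set_bit() /
clear_bit() / test_bit() on ->atomic_flags are stood in for by GCC
__atomic builtins, so treat this as a sketch of the idea rather than
the real implementation (the real accessors are generated by the
TASK_PFA_* macros in the diff below):

#include <stdbool.h>

struct task_sketch {
	unsigned long atomic_flags;
};

#define PFA_SCHED_RESET_ON_FORK	3

static inline bool task_sched_reset_on_fork(struct task_sketch *p)
{
	return __atomic_load_n(&p->atomic_flags, __ATOMIC_RELAXED) &
		(1UL << PFA_SCHED_RESET_ON_FORK);
}

static inline void task_set_sched_reset_on_fork(struct task_sketch *p)
{
	/* atomic RMW: can't clobber other bits sharing the word */
	__atomic_fetch_or(&p->atomic_flags, 1UL << PFA_SCHED_RESET_ON_FORK,
			  __ATOMIC_RELAXED);
}

static inline void task_clear_sched_reset_on_fork(struct task_sketch *p)
{
	__atomic_fetch_and(&p->atomic_flags, ~(1UL << PFA_SCHED_RESET_ON_FORK),
			   __ATOMIC_RELAXED);
}

The update helper in the patch simply picks between the set and clear
paths based on the boolean argument.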

Thanks.

 include/linux/perf_event.h |    6 +++---
 include/linux/sched.h      |   31 ++++++++++++++++++++++++-------
 kernel/sched/core.c        |   22 +++++++++++-----------
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f9828a4..e5a80a4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -921,7 +921,7 @@ perf_sw_migrate_enabled(void)
 static inline void perf_event_task_migrate(struct task_struct *task)
 {
 	if (perf_sw_migrate_enabled())
-		task->sched_migrated = 1;
+		task_set_sched_migrated(task);
 }
 
 static inline void perf_event_task_sched_in(struct task_struct *prev,
@@ -930,12 +930,12 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
 	if (static_key_false(&perf_sched_events.key))
 		__perf_event_task_sched_in(prev, task);
 
-	if (perf_sw_migrate_enabled() && task->sched_migrated) {
+	if (perf_sw_migrate_enabled() && task_sched_migrated(task)) {
 		struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
 
 		perf_fetch_caller_regs(regs);
 		___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
-		task->sched_migrated = 0;
+		task_clear_sched_migrated(task);
 	}
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edad7a4..b289f47 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1455,14 +1455,9 @@ struct task_struct {
 	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
 
-	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
-				 * execve */
+	/* unserialized, strictly 'current' */
+	unsigned in_execve:1; /* bit to tell LSMs we're in execve */
 	unsigned in_iowait:1;
-
-	/* Revert to default priority/policy when forking */
-	unsigned sched_reset_on_fork:1;
-	unsigned sched_contributes_to_load:1;
-	unsigned sched_migrated:1;
 #ifdef CONFIG_MEMCG
 	unsigned memcg_may_oom:1;
 #endif
@@ -2144,6 +2139,10 @@ static inline void memalloc_noio_restore(unsigned int flags)
 #define PFA_SPREAD_PAGE  1      /* Spread page cache over cpuset */
 #define PFA_SPREAD_SLAB  2      /* Spread some slab caches over cpuset */
 
+#define PFA_SCHED_RESET_ON_FORK		3 /* revert priority/policy on fork */
+#define PFA_SCHED_CONTRIBUTES_TO_LOAD	4
+#define PFA_SCHED_MIGRATED		5
+
 
 #define TASK_PFA_TEST(name, func)					\
 	static inline bool task_##func(struct task_struct *p)		\
@@ -2154,6 +2153,10 @@ static inline void memalloc_noio_restore(unsigned int flags)
 #define TASK_PFA_CLEAR(name, func)					\
 	static inline void task_clear_##func(struct task_struct *p)	\
 	{ clear_bit(PFA_##name, &p->atomic_flags); }
+#define TASK_PFA_UPDATE(name, func)					\
+	static inline void task_update_##func(struct task_struct *p, bool v) \
+	{ if (v) set_bit(PFA_##name, &p->atomic_flags);			\
+	  else clear_bit(PFA_##name, &p->atomic_flags); }
 
 TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
 TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)
@@ -2166,6 +2169,20 @@ TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
 TASK_PFA_SET(SPREAD_SLAB, spread_slab)
 TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 
+TASK_PFA_TEST(SCHED_RESET_ON_FORK, sched_reset_on_fork)
+TASK_PFA_SET(SCHED_RESET_ON_FORK, sched_reset_on_fork)
+TASK_PFA_CLEAR(SCHED_RESET_ON_FORK, sched_reset_on_fork)
+TASK_PFA_UPDATE(SCHED_RESET_ON_FORK, sched_reset_on_fork)
+
+TASK_PFA_TEST(SCHED_CONTRIBUTES_TO_LOAD, sched_contributes_to_load)
+TASK_PFA_SET(SCHED_CONTRIBUTES_TO_LOAD, sched_contributes_to_load)
+TASK_PFA_CLEAR(SCHED_CONTRIBUTES_TO_LOAD, sched_contributes_to_load)
+TASK_PFA_UPDATE(SCHED_CONTRIBUTES_TO_LOAD, sched_contributes_to_load)
+
+TASK_PFA_TEST(SCHED_MIGRATED, sched_migrated)
+TASK_PFA_SET(SCHED_MIGRATED, sched_migrated)
+TASK_PFA_CLEAR(SCHED_MIGRATED, sched_migrated)
+
 /*
  * task->jobctl flags
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 732e993..c5a6a8c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1751,7 +1751,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 	lockdep_assert_held(&rq->lock);
 
 #ifdef CONFIG_SMP
-	if (p->sched_contributes_to_load)
+	if (task_sched_contributes_to_load(p))
 		rq->nr_uninterruptible--;
 #endif
 
@@ -1982,7 +1982,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_rmb();
 
-	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	task_update_sched_contributes_to_load(p, task_contributes_to_load(p));
 	p->state = TASK_WAKING;
 
 	if (p->sched_class->task_waking)
@@ -2205,7 +2205,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
-	if (unlikely(p->sched_reset_on_fork)) {
+	if (unlikely(task_sched_reset_on_fork(p))) {
 		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
 			p->static_prio = NICE_TO_PRIO(0);
@@ -2220,7 +2220,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 		 * We don't need the reset flag anymore after the fork. It has
 		 * fulfilled its duty:
 		 */
-		p->sched_reset_on_fork = 0;
+		task_clear_sched_reset_on_fork(p);
 	}
 
 	if (dl_prio(p->prio)) {
@@ -3799,7 +3799,7 @@ static int __sched_setscheduler(struct task_struct *p,
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0) {
-		reset_on_fork = p->sched_reset_on_fork;
+		reset_on_fork = task_sched_reset_on_fork(p);
 		policy = oldpolicy = p->policy;
 	} else {
 		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
@@ -3870,7 +3870,7 @@ static int __sched_setscheduler(struct task_struct *p,
 			return -EPERM;
 
 		/* Normal users shall not reset the sched_reset_on_fork flag */
-		if (p->sched_reset_on_fork && !reset_on_fork)
+		if (task_sched_reset_on_fork(p) && !reset_on_fork)
 			return -EPERM;
 	}
 
@@ -3909,7 +3909,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		if (dl_policy(policy) && dl_param_changed(p, attr))
 			goto change;
 
-		p->sched_reset_on_fork = reset_on_fork;
+		task_update_sched_reset_on_fork(p, reset_on_fork);
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
@@ -3963,7 +3963,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		return -EBUSY;
 	}
 
-	p->sched_reset_on_fork = reset_on_fork;
+	task_update_sched_reset_on_fork(p, reset_on_fork);
 	oldprio = p->prio;
 
 	if (pi) {
@@ -4260,8 +4260,8 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 	if (p) {
 		retval = security_task_getscheduler(p);
 		if (!retval)
-			retval = p->policy
-				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+			retval = p->policy | (task_sched_reset_on_fork(p) ?
+					      SCHED_RESET_ON_FORK : 0);
 	}
 	rcu_read_unlock();
 	return retval;
@@ -4377,7 +4377,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		goto out_unlock;
 
 	attr.sched_policy = p->policy;
-	if (p->sched_reset_on_fork)
+	if (task_sched_reset_on_fork(p))
 		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 	if (task_has_dl_policy(p))
 		__getparam_dl(p, &attr);
