linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Eric W. Biederman" <ebiederm@xmission.com>
To: linux-kernel@vger.kernel.org
Cc: linux-api@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Oleg Nesterov <oleg@redhat.com>, Ingo Molnar <mingo@kernel.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Kees Cook <keescook@chromium.org>,
	Roland McGrath <roland@hack.frob.com>,
	Al Viro <viro@ZenIV.linux.org.uk>,
	David Howells <dhowells@redhat.com>,
	"Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>,
	"Eric W. Biederman" <ebiederm@xmission.com>
Subject: [PATCH 08/26] exit: Make the runqueue rcu safe
Date: Tue,  6 Jun 2017 14:03:20 -0500	[thread overview]
Message-ID: <20170606190338.28347-8-ebiederm@xmission.com> (raw)
In-Reply-To: <20170606190338.28347-1-ebiederm@xmission.com>

Add an rcu_usage count to task_struct and use it to reuse the delayed rcu
put logic from release_task in finish_task_switch.  This guarantees that
there will be an rcu interval before usage drops to zero for any task
on the run queue, making it safe to unconditionally call
get_task_struct in an rcu critical section for any task on the run
queue.

This guarantee in turn allows the fair scheduler to use ordinary rcu
primitives to access tasks on the run queue and makes the magic functions
task_rcu_dereference and try_get_task_struct completely unnecessary.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched.h      |  1 +
 include/linux/sched/task.h |  4 +--
 kernel/exit.c              | 83 ++++------------------------------------------
 kernel/fork.c              |  3 +-
 kernel/sched/core.c        |  2 +-
 kernel/sched/fair.c        |  2 +-
 6 files changed, 12 insertions(+), 83 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b69fc650201..461ecd20731c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -492,6 +492,7 @@ struct task_struct {
 	volatile long			state;
 	void				*stack;
 	atomic_t			usage;
+	atomic_t			rcu_usage;
 	/* Per task flags (PF_*), defined further below: */
 	unsigned int			flags;
 	unsigned int			ptrace;
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a978d7189cfd..dc4a4f4c566b 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -94,9 +94,7 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
-struct task_struct *task_rcu_dereference(struct task_struct **ptask);
-struct task_struct *try_get_task_struct(struct task_struct **ptask);
-
+extern void rcu_put_task_struct(struct task_struct *tsk);
 
 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
 extern int arch_task_struct_size __read_mostly;
diff --git a/kernel/exit.c b/kernel/exit.c
index c3de7ace243c..625e57f1bb5c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -179,6 +179,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	put_task_struct(tsk);
 }
 
+void rcu_put_task_struct(struct task_struct *tsk)
+{
+	if (atomic_dec_and_test(&tsk->rcu_usage))
+		call_rcu(&tsk->rcu, delayed_put_task_struct);
+}
 
 void release_task(struct task_struct *p)
 {
@@ -218,76 +223,13 @@ void release_task(struct task_struct *p)
 
 	write_unlock_irq(&tasklist_lock);
 	release_thread(p);
-	call_rcu(&p->rcu, delayed_put_task_struct);
+	rcu_put_task_struct(p);
 
 	p = leader;
 	if (unlikely(zap_leader))
 		goto repeat;
 }
 
-/*
- * Note that if this function returns a valid task_struct pointer (!NULL)
- * task->usage must remain >0 for the duration of the RCU critical section.
- */
-struct task_struct *task_rcu_dereference(struct task_struct **ptask)
-{
-	struct sighand_struct *sighand;
-	struct task_struct *task;
-
-	/*
-	 * We need to verify that release_task() was not called and thus
-	 * delayed_put_task_struct() can't run and drop the last reference
-	 * before rcu_read_unlock(). We check task->sighand != NULL,
-	 * but we can read the already freed and reused memory.
-	 */
-retry:
-	task = rcu_dereference(*ptask);
-	if (!task)
-		return NULL;
-
-	probe_kernel_address(&task->sighand, sighand);
-
-	/*
-	 * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
-	 * was already freed we can not miss the preceding update of this
-	 * pointer.
-	 */
-	smp_rmb();
-	if (unlikely(task != READ_ONCE(*ptask)))
-		goto retry;
-
-	/*
-	 * We've re-checked that "task == *ptask", now we have two different
-	 * cases:
-	 *
-	 * 1. This is actually the same task/task_struct. In this case
-	 *    sighand != NULL tells us it is still alive.
-	 *
-	 * 2. This is another task which got the same memory for task_struct.
-	 *    We can't know this of course, and we can not trust
-	 *    sighand != NULL.
-	 *
-	 *    In this case we actually return a random value, but this is
-	 *    correct.
-	 *
-	 *    If we return NULL - we can pretend that we actually noticed that
-	 *    *ptask was updated when the previous task has exited. Or pretend
-	 *    that probe_slab_address(&sighand) reads NULL.
-	 *
-	 *    If we return the new task (because sighand is not NULL for any
-	 *    reason) - this is fine too. This (new) task can't go away before
-	 *    another gp pass.
-	 *
-	 *    And note: We could even eliminate the false positive if re-read
-	 *    task->sighand once again to avoid the falsely NULL. But this case
-	 *    is very unlikely so we don't care.
-	 */
-	if (!sighand)
-		return NULL;
-
-	return task;
-}
-
 void rcuwait_wake_up(struct rcuwait *w)
 {
 	struct task_struct *task;
@@ -317,19 +259,6 @@ void rcuwait_wake_up(struct rcuwait *w)
 	rcu_read_unlock();
 }
 
-struct task_struct *try_get_task_struct(struct task_struct **ptask)
-{
-	struct task_struct *task;
-
-	rcu_read_lock();
-	task = task_rcu_dereference(ptask);
-	if (task)
-		get_task_struct(task);
-	rcu_read_unlock();
-
-	return task;
-}
-
 /*
  * Determine if a process group is "orphaned", according to the POSIX
  * definition in 2.2.2.52.  Orphaned process groups are not to be affected
diff --git a/kernel/fork.c b/kernel/fork.c
index aa1076c5e4a9..1fe837e8c38e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -567,7 +567,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	 * One for us, one for whoever does the "release_task()" (usually
 	 * parent)
 	 */
-	atomic_set(&tsk->usage, 2);
+	atomic_set(&tsk->rcu_usage, 2);
+	atomic_set(&tsk->usage, 1); /* For rcu_usage */
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	tsk->btrace_seq = 0;
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 803c3bc274c4..1fccfd397cab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2762,7 +2762,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		/* Task is done with its stack. */
 		put_task_stack(prev);
 
-		put_task_struct(prev);
+		rcu_put_task_struct(prev);
 	}
 
 	tick_nohz_task_switch();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d71109321841..5c0a1e1cc0f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1527,7 +1527,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	int dist = env->dist;
 
 	rcu_read_lock();
-	cur = task_rcu_dereference(&dst_rq->curr);
+	cur = rcu_dereference(dst_rq->curr);
 	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
 		cur = NULL;
 
-- 
2.10.1

  parent reply	other threads:[~2017-06-06 19:03 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-06-06 19:01 [PATCH 00/26] Fixing wait, exit, ptrace, exec, and CLONE_THREAD Eric W. Biederman
2017-06-06 19:03 ` [PATCH 01/26] alpha: Remove unused TASK_GROUP_LEADER Eric W. Biederman
2017-06-06 19:03   ` [PATCH 02/26] cgroup: Don't open code tasklist_empty() Eric W. Biederman
2017-06-06 19:03   ` [PATCH 03/26] signal: Do not perform permission checks when sending pdeath_signal Eric W. Biederman
     [not found]     ` <20170606190338.28347-3-ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2017-06-06 20:01       ` Linus Torvalds
     [not found]         ` <CA+55aFya7CgNozFrhQ9qk40UhZAD8SMva1+Y1vQ0YUEbpUpQUA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-07 11:23           ` Eric W. Biederman
2017-06-06 21:42       ` Richard Weinberger
2017-06-06 19:03   ` [PATCH 04/26] signal: Make group_send_sig_info static Eric W. Biederman
2017-06-06 19:03   ` [PATCH 06/26] rlimit: Remove unnecessary grab of tasklist_lock Eric W. Biederman
2017-06-07 12:36     ` Oleg Nesterov
     [not found]       ` <20170607123657.GA22199-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2017-06-07 14:08         ` Eric W. Biederman
2017-06-06 19:03   ` [PATCH 07/26] pidns: Improve the error handling in alloc_pid Eric W. Biederman
2017-06-06 19:03   ` Eric W. Biederman [this message]
     [not found]     ` <20170606190338.28347-8-ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2017-06-07 13:16       ` [PATCH 08/26] exit: Make the runqueue rcu safe Oleg Nesterov
2017-06-06 19:03   ` [PATCH 09/26] signal: Don't allow sending SIGKILL or SIGSTOP to init Eric W. Biederman
2017-06-06 19:03   ` [PATCH 10/26] ptrace: Simplify ptrace_detach & exit_ptrace Eric W. Biederman
2017-06-06 19:03   ` [PATCH 11/26] wait: Properly implement __WCLONE handling in the presence of exec and ptrace Eric W. Biederman
2017-06-06 19:03   ` [PATCH 12/26] wait: Directly test for the two cases where wait_task_zombie is called Eric W. Biederman
2017-06-06 19:03   ` [PATCH 17/26] exit: Rework the exit states for ptracees Eric W. Biederman
2017-06-06 19:03   ` [PATCH 21/26] wait: Optmize waitpid Eric W. Biederman
     [not found]   ` <20170606190338.28347-1-ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2017-06-06 19:03     ` [PATCH 05/26] exit: Remove the pointless clearing of SIGPENDING in __exit_signal Eric W. Biederman
2017-06-06 19:03     ` [PATCH 13/26] wait: Remove unused delay_group_leader Eric W. Biederman
2017-06-06 19:03     ` [PATCH 14/26] wait: Move changing of ptrace from wait_consider_task into wait_task_stopped Eric W. Biederman
2017-06-06 19:03     ` [PATCH 15/26] wait: Don't delay !ptrace_reparented leaders Eric W. Biederman
2017-06-06 19:03     ` [PATCH 16/26] exit: Fix reporting a ptraced !reparented leader has exited Eric W. Biederman
2017-06-06 19:03     ` [PATCH 18/26] wait: Fix WSTOPPED on a ptraced child Eric W. Biederman
2017-06-06 19:03     ` [PATCH 19/26] wait: Simpler code for clearing notask_error in wait_consider_task Eric W. Biederman
2017-06-06 19:03     ` [PATCH 20/26] wait: Don't pass the list to wait_consider_task Eric W. Biederman
2017-06-06 19:03     ` [PATCH 22/26] exit: Fix auto-wait of ptraced children Eric W. Biederman
2017-06-06 19:03     ` [PATCH 23/26] signal: Fix SIGCONT before group stop completes Eric W. Biederman
2017-06-06 19:03     ` [PATCH 24/26] signal: In ptrace_stop improve identical signal detection Eric W. Biederman
2017-06-06 19:03   ` [PATCH 25/26] signal: In ptrace_stop use CLD_TRAPPED in all ptrace signals Eric W. Biederman
2017-06-06 19:03   ` [PATCH 26/26] pidns: Ensure zap_pid_ns_processes always terminates Eric W. Biederman
     [not found] ` <877f0pym71.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2017-06-06 19:40   ` [PATCH 00/26] Fixing wait, exit, ptrace, exec, and CLONE_THREAD Aleksa Sarai
     [not found]     ` <dd16b1bb-e99e-69f2-72f4-1be4cb24d18d-l3A5Bk7waGM@public.gmane.org>
2017-06-07 11:36       ` Eric W. Biederman
     [not found]         ` <87ink8vxkf.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2017-06-07 12:21           ` Aleksa Sarai
2017-06-06 20:07   ` Linus Torvalds
     [not found]     ` <CA+55aFze5rR+rGcG6kt=8PtfgAcs02jqQ7Gm-1=1MzkbA7_nqA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-07 15:59       ` Eric W. Biederman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170606190338.28347-8-ebiederm@xmission.com \
    --to=ebiederm@xmission.com \
    --cc=dhowells@redhat.com \
    --cc=keescook@chromium.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=mtk.manpages@gmail.com \
    --cc=oleg@redhat.com \
    --cc=roland@hack.frob.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@ZenIV.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).