[PATCH] exit: Use read lock for do_notify_parent() instead of write lock

* [PATCH] exit: Use read lock for do_notify_parent() instead of write lock
@ 2015-04-09 17:59 Kirill Tkhai
  2015-04-09 18:28 ` Peter Zijlstra
  2015-04-10 17:50 ` Oleg Nesterov
  0 siblings, 2 replies; 5+ messages in thread
From: Kirill Tkhai @ 2015-04-09 17:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
	Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
	Kirill Tkhai

The idea is that write lock should be used only for modification of something.
Notification of parent does not change graph of tasks, it just says parent that
child's became dead. So, in ideally it shouldn't be executed under write lock.

Other side is that we take several spin locks inside do_notify_parent(). This
increases the time we're holding tasklist_lock, and all the time the lock is
unavailable for anyone else. It's not good, because tasklist_lock is one of
the most often used locks in the system.

I suggest to execute do_notify_parent() under read_lock(). It allows more tasks
to use it in parallel. Read lock gives enough guarantees for us: child's parent
won't change during the notification.

The only code affected by this change is do_wait() and its child-relative
functions. We execute them with read_lock() held, and this used to guarantee
parallel exit_notify() is impossible. Now they can race, so it's necessary
to synchronize them. We use new EXIT_NOTIFY exit_state for that. It says wait
functions that task is notifying its parent, so they should wait till it set
EXIT_DEAD or EXIT_ZOMBIE exit_status. This doesn't worsen performance. Yes,
we're spinning between wait_consider_task() and do_wait, but there were the
same spin on read_lock() in do_wait(). Really, the new spin is very unlikely
case.

This patch only desribes the idea, it changes exit_notify() only. We can use
the same technics in other places where do_notify_parent() is used.

We need "[PATCH] de_thread: Move notify_count write under lock" from this link:
http://permalink.gmane.org/gmane.linux.kernel/1896220. It's already in akpm
branch of linux-next tree. That patch and current changes of de_thread()
guarantee leader sees right notify_count.

Also, in the future we may think about new rwlock primitive, which atomically
drops write lock and acquires read lock. Something like this for example:

include/asm-generic/qrwlock.h:
static inline void queue_reduce_locked_write_to_read(struct qrwlock *lock)
{
	smp_mb__before_atomic();
	atomic_add(_QR_BIAS - _QW_LOCKED, &lock->cnts);
}

Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
 fs/exec.c             |    4 ++--
 include/linux/sched.h |    6 ++++--
 kernel/exit.c         |   26 ++++++++++++++++++++++----
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 314e8d8..7cb8313 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1039,10 +1039,10 @@ static int de_thread(struct task_struct *tsk)
 
 killed:
 	/* protects against exit_notify() and __exit_signal() */
-	read_lock(&tasklist_lock);
+	write_lock_irq(&tasklist_lock);
 	sig->group_exit_task = NULL;
 	sig->notify_count = 0;
-	read_unlock(&tasklist_lock);
+	write_unlock_irq(&tasklist_lock);
 	return -EAGAIN;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0eabab9..0fc3154 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -214,9 +214,11 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
 #define TASK_PARKED		512
-#define TASK_STATE_MAX		1024
+/* in tsk->exit_state again */
+#define EXIT_NOTIFY		1024
+#define TASK_STATE_MAX		2048
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPn"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
diff --git a/kernel/exit.c b/kernel/exit.c
index feff10b..f6465ae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,6 +59,9 @@
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+/* Bigger than any errno to differ from real errors */
+#define REPEAT_DOWAIT (MAX_ERRNO + 1)
+
 static void exit_mm(struct task_struct *tsk);
 
 static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -594,7 +597,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 
 	write_lock_irq(&tasklist_lock);
 	forget_original_parent(tsk, &dead);
+	tsk->exit_state = EXIT_NOTIFY;
+	write_unlock_irq(&tasklist_lock);
 
+	read_lock(&tasklist_lock);
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
 
@@ -612,13 +618,14 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	}
 
 	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
-	if (tsk->exit_state == EXIT_DEAD)
+	smp_wmb(); /* Pairs with read_lock() in do_wait() */
+	if (autoreap)
 		list_add(&tsk->ptrace_entry, &dead);
 
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
 		wake_up_process(tsk->signal->group_exit_task);
-	write_unlock_irq(&tasklist_lock);
+	read_unlock(&tasklist_lock);
 
 	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 		list_del_init(&p->ptrace_entry);
@@ -1317,6 +1324,13 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
+	if (unlikely(exit_state == EXIT_NOTIFY)) {
+		if (wo->wo_flags & WNOHANG)
+			return 0;
+		read_unlock(&tasklist_lock);
+		return -REPEAT_DOWAIT;
+	}
+
 	if (unlikely(exit_state == EXIT_TRACE)) {
 		/*
 		 * ptrace == 0 means we are the natural parent. In this case
@@ -1488,11 +1502,15 @@ static long do_wait(struct wait_opts *wo)
 	tsk = current;
 	do {
 		retval = do_wait_thread(wo, tsk);
-		if (retval)
+		if (unlikely(retval == -REPEAT_DOWAIT))
+			goto repeat;
+		else if (retval)
 			goto end;
 
 		retval = ptrace_do_wait(wo, tsk);
-		if (retval)
+		if (unlikely(retval == -REPEAT_DOWAIT))
+			goto repeat;
+		else if (retval)
 			goto end;
 
 		if (wo->wo_flags & __WNOTHREAD)




^ permalink raw reply related	[flat|nested] 5+ messages in thread