All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Fixes for RCU handling of task_struct
@ 2005-10-31  2:05 Paul E. McKenney
  2005-10-31 14:04 ` Ingo Molnar
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2005-10-31  2:05 UTC (permalink / raw)
  To: mingo; +Cc: linux-kernel, oleg, dipankar, suzannew

Hello!

My earlier code that applies RCU to the task list (in PREEMPT_RT)
was missing some rcu_dereference() and rcu_assign_pointer() calls.
This patch fixes these problems.

Signed-off-by: <paulmck@us.ibm.com>

---

 include/linux/list.h |   20 ++++++++++++++++++++
 kernel/pid.c         |   20 ++++++++++----------
 2 files changed, 30 insertions(+), 10 deletions(-)

diff -urpNa -X dontdiff linux-2.6.14-rc5-rt2/include/linux/list.h linux-2.6.14-rc5-rt2-RCUusefix/include/linux/list.h
--- linux-2.6.14-rc5-rt2/include/linux/list.h	2005-10-22 14:41:46.000000000 -0700
+++ linux-2.6.14-rc5-rt2-RCUusefix/include/linux/list.h	2005-10-27 11:02:53.000000000 -0700
@@ -208,6 +208,7 @@ static inline void list_replace_rcu(stru
 	smp_wmb();
 	new->next->prev = new;
 	new->prev->next = new;
+	old->prev = LIST_POISON2;
 }
 
 /**
@@ -578,6 +579,25 @@ static inline void hlist_del_init(struct
 	}
 }
 
+/*
+ * hlist_replace_rcu - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * The old entry will be replaced with the new entry atomically.
+ */
+static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new){
+	struct hlist_node *next = old->next;
+
+	new->next = next;
+	new->pprev = old->pprev;
+	smp_wmb();
+	if (next)
+		new->next->pprev = &new->next;
+	*new->pprev = new;
+	old->pprev = LIST_POISON2;
+}
+
 static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
 {
 	struct hlist_node *first = h->first;
diff -urpNa -X dontdiff linux-2.6.14-rc5-rt2/kernel/pid.c linux-2.6.14-rc5-rt2-RCUusefix/kernel/pid.c
--- linux-2.6.14-rc5-rt2/kernel/pid.c	2005-10-22 14:45:56.000000000 -0700
+++ linux-2.6.14-rc5-rt2-RCUusefix/kernel/pid.c	2005-10-27 11:12:15.000000000 -0700
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_
 	struct hlist_node *elem;
 	struct pid *pid;
 
-	hlist_for_each_entry(pid, elem,
+	hlist_for_each_entry_rcu(pid, elem,
 			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
 		if (pid->nr == nr)
 			return pid;
@@ -151,12 +151,12 @@ int fastcall attach_pid(task_t *task, en
 	task_pid = &task->pids[type];
 	pid = find_pid(type, nr);
 	if (pid == NULL) {
-		hlist_add_head(&task_pid->pid_chain,
-				&pid_hash[type][pid_hashfn(nr)]);
 		INIT_LIST_HEAD(&task_pid->pid_list);
+		hlist_add_head_rcu(&task_pid->pid_chain,
+				   &pid_hash[type][pid_hashfn(nr)]);
 	} else {
 		INIT_HLIST_NODE(&task_pid->pid_chain);
-		list_add_tail(&task_pid->pid_list, &pid->pid_list);
+		list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
 	}
 	task_pid->nr = nr;
 
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t 
 
 	pid = &task->pids[type];
 	if (!hlist_unhashed(&pid->pid_chain)) {
-		hlist_del(&pid->pid_chain);
 
-		if (list_empty(&pid->pid_list))
+		if (list_empty(&pid->pid_list)) {
 			nr = pid->nr;
-		else {
+			hlist_del_rcu(&pid->pid_chain);
+		} else {
 			pid_next = list_entry(pid->pid_list.next,
 						struct pid, pid_list);
 			/* insert next pid from pid_list to hash */
-			hlist_add_head(&pid_next->pid_chain,
-				&pid_hash[type][pid_hashfn(pid_next->nr)]);
+			hlist_replace_rcu(&pid->pid_chain,
+					  &pid_next->pid_chain);
 		}
 	}
 
-	list_del(&pid->pid_list);
+	list_del_rcu(&pid->pid_list);
 	pid->nr = 0;
 
 	return nr;

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-10-31  2:05 [PATCH] Fixes for RCU handling of task_struct Paul E. McKenney
@ 2005-10-31 14:04 ` Ingo Molnar
  2005-10-31 14:08   ` Ingo Molnar
                     ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Ingo Molnar @ 2005-10-31 14:04 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: linux-kernel, oleg, dipankar, suzannew, Andrew Morton


* Paul E. McKenney <paulmck@us.ibm.com> wrote:

> Hello!
> 
> My earlier code that applies RCU to the task list (in PREEMPT_RT) was 
> missing some rcu_dereference() and rcu_assign_pointer() calls. This 
> patch fixes these problems.
> 
> Signed-off-by: <paulmck@us.ibm.com>

thanks, applied - will show up in -rt2. Find below the rollup of the 
sighand-RCU feature in the -rt tree. Andrew, Paul, could/should we try 
this in -mm?

	Ingo

----

RCU signal handling: send signals RCU-read-locked instead of 
tasklist_lock read-locked. This is a scalability improvement on SMP and 
a preemption-latency improvement under PREEMPT_RCU.

Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/percpu.h>
 #include <linux/topology.h>
 #include <linux/seccomp.h>
+#include <linux/rcupdate.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
@@ -408,8 +409,16 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
+	struct rcu_head		rcu;
 };
 
+static inline void sighand_free(struct sighand_struct *sp)
+{
+	extern void sighand_free_cb(struct rcu_head *rhp);
+
+	call_rcu(&sp->rcu, sighand_free_cb);
+}
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -913,6 +922,7 @@ struct task_struct {
 	int cpuset_mems_generation;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
+	struct rcu_head rcu;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
@@ -936,8 +946,29 @@ static inline int pid_alive(struct task_
 extern void free_task(struct task_struct *tsk);
 extern void __put_task_struct(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
-#define put_task_struct(tsk) \
-do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
+
+static inline int get_task_struct_rcu(struct task_struct *t)
+{
+	int oldusage;
+
+	do {
+		oldusage = atomic_read(&t->usage);
+		if (oldusage == 0) {
+			return 0;
+		}
+	} while (cmpxchg(&t->usage.counter,
+		 oldusage, oldusage + 1) != oldusage);
+	return 1;
+}
+
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+	if (atomic_dec_and_test(&t->usage)) {
+		call_rcu(&t->rcu, __put_task_struct_cb);
+	}
+}
 
 /*
  * Per process flags
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	__put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
 /*
  * These are the runqueue data structures:
  */
Index: linux/kernel/signal.c
===================================================================
--- linux.orig/kernel/signal.c
+++ linux/kernel/signal.c
@@ -330,13 +330,20 @@ void __exit_sighand(struct task_struct *
 	/* Ok, we're done with the signal handlers */
 	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
-		kmem_cache_free(sighand_cachep, sighand);
+		sighand_free(sighand);
 }
 
 void exit_sighand(struct task_struct *tsk)
 {
 	write_lock_irq(&tasklist_lock);
-	__exit_sighand(tsk);
+	rcu_read_lock();
+	if (tsk->sighand != NULL) {
+		struct sighand_struct *sighand = tsk->sighand;
+		spin_lock(&sighand->siglock);
+		__exit_sighand(tsk);
+		spin_unlock(&sighand->siglock);
+	}
+	rcu_read_unlock();
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -352,6 +359,7 @@ void __exit_signal(struct task_struct *t
 		BUG();
 	if (!atomic_read(&sig->count))
 		BUG();
+	rcu_read_lock();
 	spin_lock(&sighand->siglock);
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count)) {
@@ -359,6 +367,7 @@ void __exit_signal(struct task_struct *t
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		__exit_sighand(tsk);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 	} else {
@@ -390,9 +399,11 @@ void __exit_signal(struct task_struct *t
 		sig->nvcsw += tsk->nvcsw;
 		sig->nivcsw += tsk->nivcsw;
 		sig->sched_time += tsk->sched_time;
+		__exit_sighand(tsk);
 		spin_unlock(&sighand->siglock);
 		sig = NULL;	/* Marker for below.  */
 	}
+	rcu_read_unlock();
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
@@ -1119,13 +1130,24 @@ void zap_other_threads(struct task_struc
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	unsigned long flags;
+	struct sighand_struct *sp;
 	int ret;
 
+retry:
 	ret = check_kill_permission(sig, info, p);
-	if (!ret && sig && p->sighand) {
-		spin_lock_irqsave(&p->sighand->siglock, flags);
+	if (!ret && sig && (sp = p->sighand)) {
+		if (!get_task_struct_rcu(p)) {
+			return -ESRCH;
+		}
+		spin_lock_irqsave(&sp->siglock, flags);
+		if (p->sighand != sp) {
+			spin_unlock_irqrestore(&sp->siglock, flags);
+			put_task_struct(p);
+			goto retry;
+		}
 		ret = __group_send_sig_info(sig, info, p);
-		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+		spin_unlock_irqrestore(&sp->siglock, flags);
+		put_task_struct(p);
 	}
 
 	return ret;
@@ -1172,12 +1194,12 @@ kill_proc_info(int sig, struct siginfo *
 	int error;
 	struct task_struct *p;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_task_by_pid(pid);
 	error = -ESRCH;
 	if (p)
 		error = group_send_sig_info(sig, info, p);
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -1385,16 +1407,47 @@ send_sigqueue(int sig, struct sigqueue *
 {
 	unsigned long flags;
 	int ret = 0;
+	struct sighand_struct *sh = p->sighand;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);
+
+	/*
+	 * The rcu based delayed sighand destroy makes it possible to
+	 * run this without tasklist lock held. The task struct itself
+	 * cannot go away as create_timer did get_task_struct().
+	 *
+	 * We return -1, when the task is marked exiting, so
+	 * posix_timer_event can redirect it to the group leader
+	 *
+	 */
+	rcu_read_lock();
 
 	if (unlikely(p->flags & PF_EXITING)) {
 		ret = -1;
 		goto out_err;
 	}
 
-	spin_lock_irqsave(&p->sighand->siglock, flags);
+	spin_lock_irqsave(&sh->siglock, flags);
+
+	/*
+	 * We do the check here again to handle the following scenario:
+	 *
+	 * CPU 0		CPU 1
+	 * send_sigqueue
+	 * check PF_EXITING
+	 * interrupt		exit code running
+	 *			__exit_signal
+	 *			lock sighand->siglock
+	 *			unlock sighand->siglock
+	 * lock sh->siglock
+	 * add(tsk->pending) 	flush_sigqueue(tsk->pending)
+	 *
+	 */
+
+	if (unlikely(p->flags & PF_EXITING)) {
+		ret = -1;
+		goto out;
+	}
 
 	if (unlikely(!list_empty(&q->list))) {
 		/*
@@ -1412,17 +1465,16 @@ send_sigqueue(int sig, struct sigqueue *
 		goto out;
 	}
 
-	q->lock = &p->sighand->siglock;
+	q->lock = &sh->siglock;
 	list_add_tail(&q->list, &p->pending.list);
 	sigaddset(&p->pending.signal, sig);
 	if (!sigismember(&p->blocked, sig))
 		signal_wake_up(p, sig == SIGKILL);
 
 out:
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	spin_unlock_irqrestore(&sh->siglock, flags);
 out_err:
-	read_unlock(&tasklist_lock);
-
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -1433,7 +1485,16 @@ send_group_sigqueue(int sig, struct sigq
 	int ret = 0;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);
+
+	while(!read_trylock(&tasklist_lock)) {
+		if (!p->sighand)
+			return -1;
+		cpu_relax();
+	}
+	if (unlikely(!p->sighand)) {
+		ret = -1;
+		goto out_err;
+	}
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 	handle_stop_signal(sig, p);
 
@@ -1467,8 +1528,9 @@ send_group_sigqueue(int sig, struct sigq
 	__group_complete_signal(sig, p);
 out:
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+out_err:
 	read_unlock(&tasklist_lock);
-	return(ret);
+	return ret;
 }
 
 /*
Index: linux/fs/exec.c
===================================================================
--- linux.orig/fs/exec.c
+++ linux/fs/exec.c
@@ -779,7 +779,7 @@ no_thread_group:
 		write_unlock_irq(&tasklist_lock);
 
 		if (atomic_dec_and_test(&oldsighand->count))
-			kmem_cache_free(sighand_cachep, oldsighand);
+			sighand_free(oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(current));
Index: linux/kernel/exit.c
===================================================================
--- linux.orig/kernel/exit.c
+++ linux/kernel/exit.c
@@ -71,7 +71,6 @@ repeat: 
 		__ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
-	__exit_sighand(p);
 	/*
 	 * Note that the fastpath in sys_times depends on __exit_signal having
 	 * updated the counters before a task is removed from the tasklist of
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c
+++ linux/kernel/fork.c
@@ -754,6 +754,14 @@ int unshare_files(void)
 
 EXPORT_SYMBOL(unshare_files);
 
+void sighand_free_cb(struct rcu_head *rhp)
+{
+	struct sighand_struct *sp =
+		container_of(rhp, struct sighand_struct, rcu);
+
+	kmem_cache_free(sighand_cachep, sp);
+}
+
 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct sighand_struct *sig;
Index: linux/kernel/rcupdate.c
===================================================================
--- linux.orig/kernel/rcupdate.c
+++ linux/kernel/rcupdate.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
+#include <linux/rcupdate.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <asm/atomic.h>
Index: linux/include/linux/list.h
===================================================================
--- linux.orig/include/linux/list.h
+++ linux/include/linux/list.h
@@ -208,6 +208,7 @@ static inline void list_replace_rcu(stru
 	smp_wmb();
 	new->next->prev = new;
 	new->prev->next = new;
+	old->prev = LIST_POISON2;
 }
 
 /**
@@ -578,6 +579,25 @@ static inline void hlist_del_init(struct
 	}
 }
 
+/*
+ * hlist_replace_rcu - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * The old entry will be replaced with the new entry atomically.
+ */
+static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new){
+	struct hlist_node *next = old->next;
+
+	new->next = next;
+	new->pprev = old->pprev;
+	smp_wmb();
+	if (next)
+		new->next->pprev = &new->next;
+	*new->pprev = new;
+	old->pprev = LIST_POISON2;
+}
+
 static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
 {
 	struct hlist_node *first = h->first;
Index: linux/kernel/pid.c
===================================================================
--- linux.orig/kernel/pid.c
+++ linux/kernel/pid.c
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_
 	struct hlist_node *elem;
 	struct pid *pid;
 
-	hlist_for_each_entry(pid, elem,
+	hlist_for_each_entry_rcu(pid, elem,
 			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
 		if (pid->nr == nr)
 			return pid;
@@ -151,12 +151,12 @@ int fastcall attach_pid(task_t *task, en
 	task_pid = &task->pids[type];
 	pid = find_pid(type, nr);
 	if (pid == NULL) {
-		hlist_add_head(&task_pid->pid_chain,
-				&pid_hash[type][pid_hashfn(nr)]);
 		INIT_LIST_HEAD(&task_pid->pid_list);
+		hlist_add_head_rcu(&task_pid->pid_chain,
+				   &pid_hash[type][pid_hashfn(nr)]);
 	} else {
 		INIT_HLIST_NODE(&task_pid->pid_chain);
-		list_add_tail(&task_pid->pid_list, &pid->pid_list);
+		list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
 	}
 	task_pid->nr = nr;
 
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t 
 
 	pid = &task->pids[type];
 	if (!hlist_unhashed(&pid->pid_chain)) {
-		hlist_del(&pid->pid_chain);
 
-		if (list_empty(&pid->pid_list))
+		if (list_empty(&pid->pid_list)) {
 			nr = pid->nr;
-		else {
+			hlist_del_rcu(&pid->pid_chain);
+		} else {
 			pid_next = list_entry(pid->pid_list.next,
 						struct pid, pid_list);
 			/* insert next pid from pid_list to hash */
-			hlist_add_head(&pid_next->pid_chain,
-				&pid_hash[type][pid_hashfn(pid_next->nr)]);
+			hlist_replace_rcu(&pid->pid_chain,
+					  &pid_next->pid_chain);
 		}
 	}
 
-	list_del(&pid->pid_list);
+	list_del_rcu(&pid->pid_list);
 	pid->nr = 0;
 
 	return nr;

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-10-31 14:04 ` Ingo Molnar
@ 2005-10-31 14:08   ` Ingo Molnar
  2005-11-01  4:51   ` Andrew Morton
  2005-11-06 21:49   ` Andrew Morton
  2 siblings, 0 replies; 19+ messages in thread
From: Ingo Molnar @ 2005-10-31 14:08 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: linux-kernel, oleg, dipankar, suzannew, Andrew Morton


* Ingo Molnar <mingo@elte.hu> wrote:

> RCU signal handling: send signals RCU-read-locked instead of 
> tasklist_lock read-locked. This is a scalability improvement on SMP 
> and a preemption-latency improvement under PREEMPT_RCU.

this should read:

RCU tasklist_lock and RCU signal handling: send signals RCU-read-locked 
instead of tasklist_lock read-locked. This is a scalability improvement 
on SMP and a preemption-latency improvement under PREEMPT_RCU.

	Ingo

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-10-31 14:04 ` Ingo Molnar
  2005-10-31 14:08   ` Ingo Molnar
@ 2005-11-01  4:51   ` Andrew Morton
  2005-11-03 19:09     ` Paul E. McKenney
  2005-11-06 21:49   ` Andrew Morton
  2 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2005-11-01  4:51 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: paulmck, linux-kernel, oleg, dipankar, suzannew

Ingo Molnar <mingo@elte.hu> wrote:
>
> @@ -1433,7 +1485,16 @@ send_group_sigqueue(int sig, struct sigq
>   	int ret = 0;
>   
>   	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
>  -	read_lock(&tasklist_lock);
>  +
>  +	while(!read_trylock(&tasklist_lock)) {
>  +		if (!p->sighand)
>  +			return -1;
>  +		cpu_relax();
>  +	}

This looks kind of ugly and quite unobvious.

What's going on there?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-01  4:51   ` Andrew Morton
@ 2005-11-03 19:09     ` Paul E. McKenney
  2005-11-04 17:41       ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2005-11-03 19:09 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Molnar, linux-kernel, oleg, dipankar, suzannew

On Mon, Oct 31, 2005 at 08:51:19PM -0800, Andrew Morton wrote:
> Ingo Molnar <mingo@elte.hu> wrote:
> >
> > @@ -1433,7 +1485,16 @@ send_group_sigqueue(int sig, struct sigq
> >   	int ret = 0;
> >   
> >   	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
> >  -	read_lock(&tasklist_lock);
> >  +
> >  +	while(!read_trylock(&tasklist_lock)) {
> >  +		if (!p->sighand)
> >  +			return -1;
> >  +		cpu_relax();
> >  +	}
> 
> This looks kind of ugly and quite unobvious.
> 
> What's going on there?

This was discussed in the following thread:

	http://marc.theaimsgroup.com/?l=linux-kernel&m=112756875713008&w=2

Looks like its author asked for it to be withdrawn in favor of Roland's
"[PATCH] Call exit_itimers from do_exit, not __exit_signal" patch:

	http://marc.theaimsgroup.com/?l=linux-kernel&m=113008567108608&w=2

My guess is that "Roland" is "Roland McGrath", but I cannot find the
referenced patch.  Oleg, any enlightenment?

						Thanx, Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-03 19:09     ` Paul E. McKenney
@ 2005-11-04 17:41       ` Oleg Nesterov
  2005-11-04 20:08         ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2005-11-04 17:41 UTC (permalink / raw)
  To: paulmck; +Cc: Andrew Morton, Ingo Molnar, linux-kernel, dipankar, suzannew

"Paul E. McKenney" wrote:
> 
> On Mon, Oct 31, 2005 at 08:51:19PM -0800, Andrew Morton wrote:
> > Ingo Molnar <mingo@elte.hu> wrote:
> > >
> > > @@ -1433,7 +1485,16 @@ send_group_sigqueue(int sig, struct sigq
> > >     int ret = 0;
> > >
> > >     BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
> > >  -  read_lock(&tasklist_lock);
> > >  +
> > >  +  while(!read_trylock(&tasklist_lock)) {
> > >  +          if (!p->sighand)
> > >  +                  return -1;
> > >  +          cpu_relax();
> > >  +  }
> >
> > This looks kind of ugly and quite unobvious.
> >
> > What's going on there?
> 
> This was discussed in the following thread:
> 
>         http://marc.theaimsgroup.com/?l=linux-kernel&m=112756875713008&w=2
> 
> Looks like its author asked for it to be withdrawn in favor of Roland's
> "[PATCH] Call exit_itimers from do_exit, not __exit_signal" patch:
> 
>         http://marc.theaimsgroup.com/?l=linux-kernel&m=113008567108608&w=2
> 
> My guess is that "Roland" is "Roland McGrath", but I cannot find the
> referenced patch.  Oleg, any enlightenment?

Yes, it was from Roland McGrath:

	http://kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=25f407f0b668f5e4ebd5d13e1fb4306ba6427ead

So I think you can remove ->sighand == NULL re-check and do s/read_trylock/read_lock/.
Posix timers are destroyed from do_exit()->exit_itimers(), after that nobody can send
SI_TIMER to this dying thread group (even if cpu-timer was attached to another process).

send_sigqueue() is different,

> @@ -1385,16 +1407,47 @@ send_sigqueue(int sig, struct sigqueue *
>  {
>         unsigned long flags;
>         int ret = 0;
> +       struct sighand_struct *sh = p->sighand;
> 
>         BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
> -       read_lock(&tasklist_lock);
> +
> +       /*
> +        * The rcu based delayed sighand destroy makes it possible to
> +        * run this without tasklist lock held. The task struct itself
> +        * cannot go away as create_timer did get_task_struct().
> +        *
> +        * We return -1, when the task is marked exiting, so
> +        * posix_timer_event can redirect it to the group leader
> +        *
> +        */
> +       rcu_read_lock();
> 
>         if (unlikely(p->flags & PF_EXITING)) {
>                 ret = -1;
>                 goto out_err;

Is it really needed? You are doing this check again below.

> -       spin_lock_irqsave(&p->sighand->siglock, flags);
> +       spin_lock_irqsave(&sh->siglock, flags);

But 'sh' can be NULL, no? Yes, you already checked PF_EXITING, so this is
very unlikely, but I think it is still possible in theory. 'sh' was loaded
before reading p->flags, but rcu_read_lock() does not imply memory barrier.

Paul, currently I have no time to read the patch carefully, so probably
this all is my misunderstanding.

> @@ -352,6 +359,7 @@ void __exit_signal(struct task_struct *t
>                 BUG();
>         if (!atomic_read(&sig->count))
>                 BUG();
> +       rcu_read_lock();
>         spin_lock(&sighand->siglock);

Why rcu_read_lock() here?

> +static inline int get_task_struct_rcu(struct task_struct *t)
> +{
> +       int oldusage;
> +
> +       do {
> +               oldusage = atomic_read(&t->usage);
> +               if (oldusage == 0) {
> +                       return 0;
> +               }
> +       } while (cmpxchg(&t->usage.counter,
> +                oldusage, oldusage + 1) != oldusage);
> +       return 1;
> +}
> 

I still don't understand the purpose of get_task_struct_rcu().

Oleg.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-04 17:41       ` Oleg Nesterov
@ 2005-11-04 20:08         ` Paul E. McKenney
  2005-11-05 16:32           ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2005-11-04 20:08 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, Ingo Molnar, linux-kernel, dipankar, suzannew

On Fri, Nov 04, 2005 at 08:41:49PM +0300, Oleg Nesterov wrote:
> "Paul E. McKenney" wrote:
> > 
> > On Mon, Oct 31, 2005 at 08:51:19PM -0800, Andrew Morton wrote:
> > > Ingo Molnar <mingo@elte.hu> wrote:
> > > >
> > > > @@ -1433,7 +1485,16 @@ send_group_sigqueue(int sig, struct sigq
> > > >     int ret = 0;
> > > >
> > > >     BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
> > > >  -  read_lock(&tasklist_lock);
> > > >  +
> > > >  +  while(!read_trylock(&tasklist_lock)) {
> > > >  +          if (!p->sighand)
> > > >  +                  return -1;
> > > >  +          cpu_relax();
> > > >  +  }
> > >
> > > This looks kind of ugly and quite unobvious.
> > >
> > > What's going on there?
> > 
> > This was discussed in the following thread:
> > 
> >         http://marc.theaimsgroup.com/?l=linux-kernel&m=112756875713008&w=2
> > 
> > Looks like its author asked for it to be withdrawn in favor of Roland's
> > "[PATCH] Call exit_itimers from do_exit, not __exit_signal" patch:
> > 
> >         http://marc.theaimsgroup.com/?l=linux-kernel&m=113008567108608&w=2
> > 
> > My guess is that "Roland" is "Roland McGrath", but I cannot find the
> > referenced patch.  Oleg, any enlightenment?
> 
> Yes, it was from Roland McGrath:
> 
> 	http://kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=25f407f0b668f5e4ebd5d13e1fb4306ba6427ead
> 
> So I think you can remove ->sighand == NULL re-check and do s/read_trylock/read_lock/.
> Posix timers are destroyed from do_exit()->exit_itimers(), after that nobody can send
> SI_TIMER to this dying thread group (even if cpu-timer was attached to another process).

This one is not my patch, but, in the spirit of getting things resolved,
I will respond as best I can.

I agree with the change from read_trylock() to read_lock(), given
that Roland's change has been merged.  I also agree with removing the
check against NULL ("if (!p->sighand)").

Andrew, if you would be willing to send me the current state of your
kernel/signal.c, I would be happy to provide the appropriate patch
to get this straightened out.  If there is somewhere I should be
looking to figure this out myself, please let me know.

> send_sigqueue() is different,
> 
> > @@ -1385,16 +1407,47 @@ send_sigqueue(int sig, struct sigqueue *
> >  {
> >         unsigned long flags;
> >         int ret = 0;
> > +       struct sighand_struct *sh = p->sighand;
> > 
> >         BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
> > -       read_lock(&tasklist_lock);
> > +
> > +       /*
> > +        * The rcu based delayed sighand destroy makes it possible to
> > +        * run this without tasklist lock held. The task struct itself
> > +        * cannot go away as create_timer did get_task_struct().
> > +        *
> > +        * We return -1, when the task is marked exiting, so
> > +        * posix_timer_event can redirect it to the group leader
> > +        *
> > +        */
> > +       rcu_read_lock();
> > 
> >         if (unlikely(p->flags & PF_EXITING)) {
> >                 ret = -1;
> >                 goto out_err;
> 
> Is it really needed? You are doing this check again below.

This one looks to be an efficiency hack.  It avoids the lock overhead
in the case that the task is exiting.  Since this case should be rare,
I agree that it makes sense to remove it.

> > -       spin_lock_irqsave(&p->sighand->siglock, flags);
> > +       spin_lock_irqsave(&sh->siglock, flags);
>
> But 'sh' can be NULL, no? Yes, you already checked PF_EXITING, so this is
> very unlikely, but I think it is still possible in theory. 'sh' was loaded
> before reading p->flags, but rcu_read_lock() does not imply memory barrier.

'sh' cannot be NULL, because the caller holds ->it_lock and has checked
for timer deletion under that lock, and because the exiting process
quiesces and deletes timers before freeing sighand.

I do need to look at races with exec(), which could potentially cause
us to be working with the wrong instance of sighand.  I will do this
for -rt, and also for -mm as soon as I get access to it.

> Paul, currently I have no time to read the patch carefully, so probably
> this all is my misunderstanding.

Oleg, I very much appreciate your identifying the patch!  The exec()
issue might well be real, and deserves a look in any case.

> > @@ -352,6 +359,7 @@ void __exit_signal(struct task_struct *t
> >                 BUG();
> >         if (!atomic_read(&sig->count))
> >                 BUG();
> > +       rcu_read_lock();
> >         spin_lock(&sighand->siglock);
> 
> Why rcu_read_lock() here?

In theory unneeded due to the fact that we are holding a reference
count and only doing atomic operations on the sighand_struct.
In practice, very cheap operation and easier documentation.  I don't
want to add strange rules to Documentation/RCU about leaving out
rcu_read_lock() and rcu_dereference() in the special case where
all operations on the RCU-protected structure are atomic, especially
since they are sometimes non-atomic in UP kernels.

The documentation is long enough as it is!  ;-)

> > +static inline int get_task_struct_rcu(struct task_struct *t)
> > +{
> > +       int oldusage;
> > +
> > +       do {
> > +               oldusage = atomic_read(&t->usage);
> > +               if (oldusage == 0) {
> > +                       return 0;
> > +               }
> > +       } while (cmpxchg(&t->usage.counter,
> > +                oldusage, oldusage + 1) != oldusage);
> > +       return 1;
> > +}
> > 
> 
> I still don't understand the purpose of get_task_struct_rcu().

And you are right -- I recently submitted a patch to remove the only
use of it, but it does not appear to have made it into 2.6.14-rt6.
I will check -rt6 and submit a consolidated patch against it.

						Thanx, Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-04 20:08         ` Paul E. McKenney
@ 2005-11-05 16:32           ` Oleg Nesterov
  2005-11-05 23:20             ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2005-11-05 16:32 UTC (permalink / raw)
  To: paulmck; +Cc: Andrew Morton, Ingo Molnar, linux-kernel, dipankar, suzannew

"Paul E. McKenney" wrote:
>
> > > -       spin_lock_irqsave(&p->sighand->siglock, flags);
> > > +       spin_lock_irqsave(&sh->siglock, flags);
> >
> > But 'sh' can be NULL, no? Yes, you already checked PF_EXITING, so this is
> > very unlikely, but I think it is still possible in theory. 'sh' was loaded
> > before reading p->flags, but rcu_read_lock() does not imply memory barrier.
>
> 'sh' cannot be NULL, because the caller holds ->it_lock and has checked
> for timer deletion under that lock, and because the exiting process
> quiesces and deletes timers before freeing sighand.
               ^^^^^^^^^^^^^^

Exiting process (thread group) - yes, but exiting thread - no. That is why
send_sigqueue() should check that the target thread is not exiting now. If
we do not take tasklist_lock we can't be sure that ->sighand != NULL. The
caller holds ->it_lock, yes, but this can't help.

Please don't be confused by the 'posix_cpu_timers_exit(tsk)' in __exit_signal()
which is called under sighand->siglock before clearing ->signal/->sighand. This
is completely different, it detaches (but not destroys) cpu-timers, these timers
can have another thread/process as a target for signal sending.

Oleg.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-05 16:32           ` Oleg Nesterov
@ 2005-11-05 23:20             ` Paul E. McKenney
  2005-11-06 12:01               ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2005-11-05 23:20 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, Ingo Molnar, linux-kernel, dipankar, suzannew

On Sat, Nov 05, 2005 at 07:32:44PM +0300, Oleg Nesterov wrote:
> "Paul E. McKenney" wrote:
> >
> > > > -       spin_lock_irqsave(&p->sighand->siglock, flags);
> > > > +       spin_lock_irqsave(&sh->siglock, flags);
> > >
> > > But 'sh' can be NULL, no? Yes, you already checked PF_EXITING, so this is
> > > very unlikely, but I think it is still possible in theory. 'sh' was loaded
> > > before reading p->flags, but rcu_read_lock() does not imply memory barrier.
> >
> > 'sh' cannot be NULL, because the caller holds ->it_lock and has checked
> > for timer deletion under that lock, and because the exiting process
> > quiesces and deletes timers before freeing sighand.
>                ^^^^^^^^^^^^^^
> 
> Exiting process (thread group) - yes, but exiting thread - no. That is why
> send_sigqueue() should check that the target thread is not exiting now. If
> we do not take tasklist_lock we can't be sure that ->sighand != NULL. The
> caller holds ->it_lock, yes, but this can't help.
> 
> Please don't be confused by the 'posix_cpu_timers_exit(tsk)' in __exit_signal()
> which is called under sighand->siglock before clearing ->signal/->sighand. This
> is completely different, it detaches (but not destroys) cpu-timers, these timers
> can have another thread/process as a target for signal sending.

OK, thank you for catching this!

So the idea is to error out of send_sigqueue() so that posix_timer_event()
will instead call send_group_sigqueue().  But that could suffer from
the same race if the new leader thread also exits -- or if the exiting
thread was the leader thread to begin with.

But once send_group_sigqueue() read-acquires tasklist_lock, threads
and processes must stay put.  So it should be possible to follow the
->group_leader chain at that point.  We know that there must still be a
group leader because we hold it_lock (so that the entire process cannot
have vanished prior to acquiring tasklist_lock), and the group leader
cannot change because we are read-holding tasklist_lock.  Hence traversing
all of the group_leader pointers has to get us somewhere safe to dump
the signal.

Except that the group leader could do an exec(), right?  If it does so,
it must do so before tasklist_lock is read-acquired.  So the nightmare
case is where all but one thread exits, and then that one thread does
an exec().  If this case can really happen, we want to drop the signal
on the floor, right?

Attached is an incremental patch for everything except for the possible
race with exec().

Thoughts?

						Thanx, Paul

Signed-off-by: <paulmck@us.ibm.com>

---

 signal.c |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletion(-)

diff -urpNa -X dontdiff linux-2.6.14-mm0-fix/kernel/signal.c linux-2.6.14-mm0-fix-2/kernel/signal.c
--- linux-2.6.14-mm0-fix/kernel/signal.c	2005-11-04 17:23:40.000000000 -0800
+++ linux-2.6.14-mm0-fix-2/kernel/signal.c	2005-11-05 15:05:38.000000000 -0800
@@ -1408,6 +1408,11 @@ send_sigqueue(int sig, struct sigqueue *
 
 retry:
 	sh = rcu_dereference(p->sighand);
+	if (sh == NULL) {
+		/* We raced with pthread_exit()... */
+		ret = -1;
+		goto out_err;
+	}
 
 	spin_lock_irqsave(&sh->siglock, flags);
 	if (p->sighand != sh) {
@@ -1474,7 +1479,8 @@ send_group_sigqueue(int sig, struct sigq
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
 
 	read_lock(&tasklist_lock);
-	/* Since it_lock is held, p->sighand cannot be NULL. */
+	while (p->group_leader != p)
+		p = p->group_leader;
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 	handle_stop_signal(sig, p);
 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-05 23:20             ` Paul E. McKenney
@ 2005-11-06 12:01               ` Oleg Nesterov
  2005-11-06 22:59                 ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2005-11-06 12:01 UTC (permalink / raw)
  To: paulmck; +Cc: Andrew Morton, Ingo Molnar, linux-kernel, dipankar, suzannew

"Paul E. McKenney" wrote:
> 
> So the idea is to error out of send_sigqueue() so that posix_timer_event()
> will instead call send_group_sigqueue().  But that could suffer from
> the same race if the new leader thread also exits -- or if the exiting
> thread was the leader thread to begin with.

The case when leader exits is ok. If it is the only (last) thread - it will
call exit_itimers(). If not - it (or sys_wait4 from parent) will not call
release_task(), but will stay TASK_ZOMBIE with valid ->signal/sighand until
the last thread in same thread group exits (and call exit_itimers).

> But once send_group_sigqueue() read-acquires tasklist_lock, threads
> and processes must stay put.  So it should be possible to follow the
> ->group_leader chain at that point.

Not quite so, I think. See below.

> Except that the group leader could do an exec(), right?  If it does so,
> it must do so before tasklist_lock is read-acquired.  So the nightmare
> case is where all but one thread exits, and then that one thread does
> an exec().

... and that thread is not group leader. Actually, it does not matter
if other threads exited or not, execing thread will kill other threads.

> If this case can really happen, we want to drop the signal
> on the floor, right?

I think yes.

> diff -urpNa -X dontdiff linux-2.6.14-mm0-fix/kernel/signal.c linux-2.6.14-mm0-fix-2/kernel/signal.c
> --- linux-2.6.14-mm0-fix/kernel/signal.c        2005-11-04 17:23:40.000000000 -0800
> +++ linux-2.6.14-mm0-fix-2/kernel/signal.c      2005-11-05 15:05:38.000000000 -0800
> @@ -1408,6 +1408,11 @@ send_sigqueue(int sig, struct sigqueue *
> 
>  retry:
>         sh = rcu_dereference(p->sighand);
> +       if (sh == NULL) {
> +               /* We raced with pthread_exit()... */
> +               ret = -1;
> +               goto out_err;
> +       }

I lost the plot. Because I can't apply this and previous patches (rejects)
and can't imagine how send_sigqueue() looks now. I think this is ok, but
we also need to re-check ->signal != NULL after lock(->sighand) or check
PF_EXITING (iirc we do have such a check).

> @@ -1474,7 +1479,8 @@ send_group_sigqueue(int sig, struct sigq
>         BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
> 
>         read_lock(&tasklist_lock);
> -       /* Since it_lock is held, p->sighand cannot be NULL. */
> +       while (p->group_leader != p)
> +               p = p->group_leader;

No, this is definitely not right. de_thread() does not change leader->group_leader
when non-leader execs, so p->group_leader == p always.

Oleg.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-10-31 14:04 ` Ingo Molnar
  2005-10-31 14:08   ` Ingo Molnar
  2005-11-01  4:51   ` Andrew Morton
@ 2005-11-06 21:49   ` Andrew Morton
  2005-11-06 22:43     ` Paul E. McKenney
  2005-11-07  1:12     ` Nick Piggin
  2 siblings, 2 replies; 19+ messages in thread
From: Andrew Morton @ 2005-11-06 21:49 UTC (permalink / raw)
  To: Ingo Molnar, Nick Piggin; +Cc: paulmck, linux-kernel, oleg, dipankar, suzannew

Ingo Molnar <mingo@elte.hu> wrote:
>
> ...
> 
> RCU signal handling: send signals RCU-read-locked instead of 
> tasklist_lock read-locked. This is a scalability improvement on SMP and 
> a preemption-latency improvement under PREEMPT_RCU.
> 
> Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
> 
> ...
> +static inline int get_task_struct_rcu(struct task_struct *t)
> +{
> +	int oldusage;
> +
> +	do {
> +		oldusage = atomic_read(&t->usage);
> +		if (oldusage == 0) {
> +			return 0;
> +		}
> +	} while (cmpxchg(&t->usage.counter,
> +		 oldusage, oldusage + 1) != oldusage);
> +	return 1;
> +}

arm (at least) does not implement cmpxchg.

I think Nick is working on patches which implement cmpxchg on all
architectures?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-06 21:49   ` Andrew Morton
@ 2005-11-06 22:43     ` Paul E. McKenney
  2005-11-07  1:12     ` Nick Piggin
  1 sibling, 0 replies; 19+ messages in thread
From: Paul E. McKenney @ 2005-11-06 22:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ingo Molnar, Nick Piggin, linux-kernel, oleg, dipankar, suzannew

On Sun, Nov 06, 2005 at 01:49:45PM -0800, Andrew Morton wrote:
> Ingo Molnar <mingo@elte.hu> wrote:
> >
> > ...
> > 
> > RCU signal handling: send signals RCU-read-locked instead of 
> > tasklist_lock read-locked. This is a scalability improvement on SMP and 
> > a preemption-latency improvement under PREEMPT_RCU.
> > 
> > Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
> > Signed-off-by: Ingo Molnar <mingo@elte.hu>
> > 
> > ...
> > +static inline int get_task_struct_rcu(struct task_struct *t)
> > +{
> > +	int oldusage;
> > +
> > +	do {
> > +		oldusage = atomic_read(&t->usage);
> > +		if (oldusage == 0) {
> > +			return 0;
> > +		}
> > +	} while (cmpxchg(&t->usage.counter,
> > +		 oldusage, oldusage + 1) != oldusage);
> > +	return 1;
> > +}
> 
> arm (at least) does not implement cmpxchg.
> 
> I think Nick is working on patches which implement cmpxchg on all
> architectures?

That he is, but the latest set of signal-RCU patches does not use
get_task_struct_rcu().  Attached is a patch that removes it.

That said, it would be good if there was a set of cmpxchg functions
for all architectures.

						Thanx, Paul

Signed-off-by: <paulmck@us.ibm.com>

---

diff -urpNa -X dontdiff linux-2.6.14-rt6/include/linux/sched.h linux-2.6.14-rt6-nocmpxchg/include/linux/sched.h
--- linux-2.6.14-rt6/include/linux/sched.h	2005-11-04 12:38:05.000000000 -0800
+++ linux-2.6.14-rt6-nocmpxchg/include/linux/sched.h	2005-11-06 14:41:26.000000000 -0800
@@ -1057,20 +1057,6 @@ extern void free_task(struct task_struct
 extern void __put_task_struct(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
-static inline int get_task_struct_rcu(struct task_struct *t)
-{
-	int oldusage;
-
-	do {
-		oldusage = atomic_read(&t->usage);
-		if (oldusage == 0) {
-			return 0;
-		}
-	} while (cmpxchg(&t->usage.counter,
-		 oldusage, oldusage + 1) != oldusage);
-	return 1;
-}
-
 extern void __put_task_struct_cb(struct rcu_head *rhp);
 
 static inline void put_task_struct(struct task_struct *t)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-06 12:01               ` Oleg Nesterov
@ 2005-11-06 22:59                 ` Paul E. McKenney
  2005-11-07 13:17                   ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2005-11-06 22:59 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, Ingo Molnar, linux-kernel, dipankar, suzannew

On Sun, Nov 06, 2005 at 03:01:42PM +0300, Oleg Nesterov wrote:
> "Paul E. McKenney" wrote:
> > 
> > So the idea is to error out of send_sigqueue() so that posix_timer_event()
> > will instead call send_group_sigqueue().  But that could suffer from
> > the same race if the new leader thread also exits -- or if the exiting
> > thread was the leader thread to begin with.
> 
> The case when leader exits is ok. If it is the only (last) thread - it will
> call exit_itimers(). If not - it (or sys_wait4 from parent) will not call
> release_task(), but will stay TASK_ZOMBIE with valid ->signal/sighand until
> the last thread in same thread group exits (and call exit_itimers).
> 
> > But once send_group_sigqueue() read-acquires tasklist_lock, threads
> > and processes must stay put.  So it should be possible to follow the
> > ->group_leader chain at that point.
> 
> Not quite so, I think. See below.
> 
> > Except that the group leader could do an exec(), right?  If it does so,
> > it must do so before tasklist_lock is read-acquired.  So the nightmare
> > case is where all but one thread exits, and then that one thread does
> > an exec().
> 
> ... and that thread is not group leader. Actually, it does not matter
> if other threads exited or not, execing thread will kill other threads.
> 
> > If this case can really happen, we want to drop the signal
> > on the floor, right?
> 
> I think yes.
> 
> > diff -urpNa -X dontdiff linux-2.6.14-mm0-fix/kernel/signal.c linux-2.6.14-mm0-fix-2/kernel/signal.c
> > --- linux-2.6.14-mm0-fix/kernel/signal.c        2005-11-04 17:23:40.000000000 -0800
> > +++ linux-2.6.14-mm0-fix-2/kernel/signal.c      2005-11-05 15:05:38.000000000 -0800
> > @@ -1408,6 +1408,11 @@ send_sigqueue(int sig, struct sigqueue *
> > 
> >  retry:
> >         sh = rcu_dereference(p->sighand);
> > +       if (sh == NULL) {
> > +               /* We raced with pthread_exit()... */
> > +               ret = -1;
> > +               goto out_err;
> > +       }
> 
> I lost the plot. Because I can't apply this and previous patches (rejects)
> and can't imagine how send_sigqueue() looks now. I think this is ok, but
> we also need to re-check ->signal != NULL after lock(->sighand) or check
> PF_EXITING (iirc we do have such a check).

I lost the plot as well.  There were apparently a very large number of
changes awaiting 2.6.14 coming out.  ;-)

I also believe we have such a check.

> > @@ -1474,7 +1479,8 @@ send_group_sigqueue(int sig, struct sigq
> >         BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
> > 
> >         read_lock(&tasklist_lock);
> > -       /* Since it_lock is held, p->sighand cannot be NULL. */
> > +       while (p->group_leader != p)
> > +               p = p->group_leader;
> 
> No, this is definitely not right. de_thread() does not change leader->group_leader
> when non-leader execs, so p->group_leader == p always.

This was intended for the case where the group leader does pthread_exit,
which would cause some other thread to assume group leadership.  Or am
I missing something from that code path?  (Quite likely that I am...)

						Thanx, Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-06 21:49   ` Andrew Morton
  2005-11-06 22:43     ` Paul E. McKenney
@ 2005-11-07  1:12     ` Nick Piggin
  2005-11-07  4:58       ` Paul E. McKenney
  1 sibling, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2005-11-07  1:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ingo Molnar, paulmck, linux-kernel, oleg, dipankar, suzannew

Andrew Morton wrote:

>>+static inline int get_task_struct_rcu(struct task_struct *t)
>>+{
>>+	int oldusage;
>>+
>>+	do {
>>+		oldusage = atomic_read(&t->usage);
>>+		if (oldusage == 0) {
>>+			return 0;
>>+		}
>>+	} while (cmpxchg(&t->usage.counter,
>>+		 oldusage, oldusage + 1) != oldusage);
>>+	return 1;
>>+}
> 
> 
> arm (at least) does not implement cmpxchg.
> 

Yes, and using atomic_t.counter in generic code is ugly, albeit
compatible with all current implementations.

> I think Nick is working on patches which implement cmpxchg on all
> architectures?
> 

Yes, it is basically ready to go.

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-07  1:12     ` Nick Piggin
@ 2005-11-07  4:58       ` Paul E. McKenney
  2005-11-07  5:51         ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2005-11-07  4:58 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Ingo Molnar, linux-kernel, oleg, dipankar, suzannew

On Mon, Nov 07, 2005 at 12:12:25PM +1100, Nick Piggin wrote:
> Andrew Morton wrote:
> 
> >>+static inline int get_task_struct_rcu(struct task_struct *t)
> >>+{
> >>+	int oldusage;
> >>+
> >>+	do {
> >>+		oldusage = atomic_read(&t->usage);
> >>+		if (oldusage == 0) {
> >>+			return 0;
> >>+		}
> >>+	} while (cmpxchg(&t->usage.counter,
> >>+		 oldusage, oldusage + 1) != oldusage);
> >>+	return 1;
> >>+}
> >
> >
> >arm (at least) does not implement cmpxchg.
> >
> 
> Yes, and using atomic_t.counter in generic code is ugly, albeit
> compatible with all current implementations.
> 
> >I think Nick is working on patches which implement cmpxchg on all
> >architectures?
> 
> Yes, it is basically ready to go.

Would it simplify the rcuref.h code?  Or lib/dec_and_lock.c?

						Thanx, Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-07  4:58       ` Paul E. McKenney
@ 2005-11-07  5:51         ` Nick Piggin
  2005-11-07 18:10           ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2005-11-07  5:51 UTC (permalink / raw)
  To: paulmck
  Cc: Andrew Morton, Ingo Molnar, linux-kernel, oleg, dipankar, suzannew

Paul E. McKenney wrote:
> On Mon, Nov 07, 2005 at 12:12:25PM +1100, Nick Piggin wrote:

>>Yes, it is basically ready to go.
> 
> 
> Would it simplify the rcuref.h code?  Or lib/dec_and_lock.c?
> 

Yep, I recently posted it to lkml... rcuref.h disappears, and
dec_and_lock becomes simplified not to mention more efficient
on those architectures which do not define HAVE_ARCH_CMPXCHG.

http://marc.theaimsgroup.com/?l=linux-kernel&m=113117753625350&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113117753629218&w=2

I need the infrastructure for lockless pagecache, but fortunately
it is very useful for other things as well. Especially lockless
algorithms it seems.

Nick

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-06 22:59                 ` Paul E. McKenney
@ 2005-11-07 13:17                   ` Oleg Nesterov
  2005-11-07 18:28                     ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2005-11-07 13:17 UTC (permalink / raw)
  To: paulmck; +Cc: Andrew Morton, Ingo Molnar, linux-kernel, dipankar, suzannew

"Paul E. McKenney" wrote:
> 
> > > +       while (p->group_leader != p)
> > > +               p = p->group_leader;
> >
> > No, this is definitely not right. de_thread() does not change leader->group_leader
> > when non-leader execs, so p->group_leader == p always.
> 
> This was intended for the case where the group leader does pthread_exit,
> which would cause some other thread to assume group leadership.  Or am
> I missing something from that code path?  (Quite likely that I am...)

When group leader exits it goes into TASK_ZOMBIE state (if it is not the
only one thread in the same group). It is still the ->group_leader for all
threads including itself. Only when release_task(last_thread_in_thread_group)
happens, it will notice not yet released group_leader, and release it, see
'repeat:' path in release_task().

The ->group_leader is changed only when non-leader thread does exec, it kills
other threads and becomes ->group_leader for itself.

So, I think send_group_sigqueue() should do:

	read_lock(tasklist_lock);

	if (!tsk->signal) {
		// Can happen only if de_thread did release_task(tsk)
		// while switching to new leader.
		// We can't figure out the new leader, but it does not
		// matter - we should drop the signal anyway.
		unlock(tasklist);
		return;
	}

Oleg.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-07  5:51         ` Nick Piggin
@ 2005-11-07 18:10           ` Paul E. McKenney
  0 siblings, 0 replies; 19+ messages in thread
From: Paul E. McKenney @ 2005-11-07 18:10 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Ingo Molnar, linux-kernel, oleg, dipankar, suzannew

On Mon, Nov 07, 2005 at 04:51:26PM +1100, Nick Piggin wrote:
> Paul E. McKenney wrote:
> >On Mon, Nov 07, 2005 at 12:12:25PM +1100, Nick Piggin wrote:
> 
> >>Yes, it is basically ready to go.
> >
> >Would it simplify the rcuref.h code?  Or lib/dec_and_lock.c?
> 
> Yep, I recently posted it to lkml... rcuref.h disappears, and
> dec_and_lock becomes simplified not to mention more efficient
> on those architectures which do not define HAVE_ARCH_CMPXCHG.
> 
> http://marc.theaimsgroup.com/?l=linux-kernel&m=113117753625350&w=2
> http://marc.theaimsgroup.com/?l=linux-kernel&m=113117753629218&w=2
> 
> I need the infrastructure for lockless pagecache, but fortunately
> it is very useful for other things as well. Especially lockless
> algorithms it seems.

I have been inconvenienced by lack of cmpxchg across all architectures
a number of times -- one can often work around it, but...

Here are the places that check __HAVE_ARCH_CMPXCHG in 2.6.14:

o	arch/i386/kernel/acpi/boot.c <global> 85 #ifndef __HAVE_ARCH_CMPXCHG

	Warning message saying that ACPI must use 486 or later.
	Should be able to get rid of this, though not sure what you
	have to do to ACPI.

o	include/asm-i386/mc146818rtc.h <global> 16 #ifdef __HAVE_ARCH_CMPXCHG

	Provides __cmpxchg()-based code for 486 and better, and assumes
	that 386 is UP.  Should be able to eliminate the 386 code and
	the #ifdef.

o	include/linux/rcuref.h <global> 47 #ifdef __HAVE_ARCH_CMPXCHG
o	kernel/rcupdate.c <global> 76 #ifndef __HAVE_ARCH_CMPXCHG
o	lib/dec_and_lock.c <global> 6 #ifdef __HAVE_ARCH_CMPXCHG

	Looks like you have these three covered.

There are about 50 uses of cmpxchg, and it is possible that some of them
would be simplified by your primitives, as well.

So having cmpxchg available would be a very good thing!

						Thanx, Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Fixes for RCU handling of task_struct
  2005-11-07 13:17                   ` Oleg Nesterov
@ 2005-11-07 18:28                     ` Oleg Nesterov
  0 siblings, 0 replies; 19+ messages in thread
From: Oleg Nesterov @ 2005-11-07 18:28 UTC (permalink / raw)
  To: paulmck, Andrew Morton, Ingo Molnar, linux-kernel, dipankar, suzannew

Oleg Nesterov wrote:
> 
> When group leader exits it goes into TASK_ZOMBIE state (if it is not the
> only one thread in the same group).

Just to clarify, single-thread process can go to TASK_ZOMBIE state too,
of course. But group leader can't be released (by itself or via sys_wait4)
while there are other threads in the same group.

> So, I think send_group_sigqueue() should do:
> 
>         read_lock(tasklist_lock);
> 
>         if (!tsk->signal) {
>                 // Can happen only if de_thread did release_task(tsk)
>                 // while switching to new leader.
>                 // We can't figure out the new leader, but it does not
>                 // matter - we should drop the signal anyway.
>                 unlock(tasklist);
>                 return;

No, I was wrong. This is not enough. This 'tsk' can be already freed!
sys_timer_create() bumps tsk->usage only when the signal is sent via
send_sigqueue(), it does not do get_task_struct(leader) when the signal
is not thread specific, but goes to the thread group.

Oleg.

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2005-11-07 18:10 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-10-31  2:05 [PATCH] Fixes for RCU handling of task_struct Paul E. McKenney
2005-10-31 14:04 ` Ingo Molnar
2005-10-31 14:08   ` Ingo Molnar
2005-11-01  4:51   ` Andrew Morton
2005-11-03 19:09     ` Paul E. McKenney
2005-11-04 17:41       ` Oleg Nesterov
2005-11-04 20:08         ` Paul E. McKenney
2005-11-05 16:32           ` Oleg Nesterov
2005-11-05 23:20             ` Paul E. McKenney
2005-11-06 12:01               ` Oleg Nesterov
2005-11-06 22:59                 ` Paul E. McKenney
2005-11-07 13:17                   ` Oleg Nesterov
2005-11-07 18:28                     ` Oleg Nesterov
2005-11-06 21:49   ` Andrew Morton
2005-11-06 22:43     ` Paul E. McKenney
2005-11-07  1:12     ` Nick Piggin
2005-11-07  4:58       ` Paul E. McKenney
2005-11-07  5:51         ` Nick Piggin
2005-11-07 18:10           ` Paul E. McKenney

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.