linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Race condition in ptrace
@ 2005-02-03 12:51 Bodo Stroesser
  2005-02-04  0:27 ` Nick Piggin
  0 siblings, 1 reply; 12+ messages in thread
From: Bodo Stroesser @ 2005-02-03 12:51 UTC (permalink / raw)
  To: Roland Mc Grath
  Cc: Jeff Dike, BlaisorBlade, user-mode-linux devel, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1210 bytes --]

Working with the new UML skas0 mode on my Xeon HT host, sporadically I saw
some processes on UML segfaulting.

In all cases, I could track this down to be caused by a gs segment register,
that had the wrong contents.

This again is caused by a problem in the host linux: A ptraced child going to
stop and having woken up its parent, will save some of its registers (on i386
they are fs, gs and the fp-registers) very late in switch_to. The parent is
granted access to child's registers as soon, as the child is removed from
the runqueue. Thus, in rare cases, the parent might access child's register
savearea before the registers really are saved.

This problem might also be the reason for problems with floatpoint on UML,
that were reported some time ago.

I've written a test program, that reproduces the problem on my 2.6.9 vanilla
host quite quick. Using SuSE kernel 2.6.5-7.97-smp, I can't reproduce the
problem, although the relevant parts seem to be unchanged. Maybe not related
changes modify the timing?

I also created a patch, that fixes the problem on my 2.6.9 host. This probably
isn't a sane patch, but is enough to demonstrate, where I think, the bug is.
Both files are attached.

        Bodo

[-- Attachment #2: fix-ptrace-race.patch --]
[-- Type: text/x-diff, Size: 1425 bytes --]

--- a/include/linux/sched.h	2005-02-02 22:15:51.000000000 +0100
+++ b/include/linux/sched.h	2005-02-02 22:22:54.000000000 +0100
@@ -584,6 +584,7 @@ struct task_struct {
   	struct mempolicy *mempolicy;
   	short il_next;		/* could be shared with used_math */
 #endif
+	volatile long saving;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
--- a/kernel/sched.c	2005-02-02 21:32:51.000000000 +0100
+++ b/kernel/sched.c	2005-02-02 22:12:14.000000000 +0100
@@ -2689,8 +2689,10 @@ need_resched:
 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
-		else
+		else {
+			prev->saving = 1;
 			deactivate_task(prev, rq);
+		}
 	}
 
 	cpu = smp_processor_id();
--- a/kernel/ptrace.c	2005-02-02 22:12:33.000000000 +0100
+++ b/kernel/ptrace.c	2005-02-02 22:20:46.000000000 +0100
@@ -96,6 +96,7 @@ int ptrace_check_attach(struct task_stru
 
 	if (!ret && !kill) {
 		wait_task_inactive(child);
+		while ( child->saving ) ;
 	}
 
 	/* All systems go.. */
--- a/arch/i386/kernel/process.c	2005-02-02 22:18:29.000000000 +0100
+++ b/arch/i386/kernel/process.c	2005-02-02 22:19:22.000000000 +0100
@@ -577,6 +577,9 @@ struct task_struct fastcall * __switch_t
 	asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
 	asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
 
+	wmb();
+	prev_p->saving=0;
+
 	/*
 	 * Restore %fs and %gs if needed.
 	 */

[-- Attachment #3: test_ptrace.c --]
[-- Type: text/plain, Size: 2488 bytes --]

#include <signal.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <asm/ptrace.h>
#include <asm/unistd.h>
#include <asm/ldt.h>

void
write_ldt( int number)
{
	struct user_desc desc;
	int ret;

	memset(&desc, 0, sizeof( desc));
	desc.entry_number = number;
	desc.base_addr = 0x400179a0;
	desc.limit = 0xffffffff;
	desc.seg_32bit = 1;
	desc.limit_in_pages = 1;
	desc.useable = 1;
	ret = modify_ldt(1, &desc, sizeof( desc));
	if ( ret )
		printf("modify_ldt(write): ret=%d, errno=%d\n", ret, errno);
}

int
child_fn( void)
{
	unsigned int fs=7, new_fs;

	write_ldt(0);

	asm volatile ("movl %0,%%fs": : "m" (fs));

	if ( ptrace( PTRACE_TRACEME, 0, (void *)0, (void *)0) ) {
		perror("ptrace( PTRACE_TRACEME, 0, 0, 0)");
		exit(1);
	}
	asm volatile ("int $3");

	asm volatile("movl %%fs,%0":"=m" (new_fs));
	new_fs &= 0xffff;
	if ( new_fs != (fs ^ 7) ) {
		printf("Child: wrong fs = %d\n", new_fs);
		exit(1);
	}

	fs = new_fs;

	printf("Child: fs changed to %d\n", fs);

	asm volatile ("int $3");
}


int
parent_fn( pid_t child)
{
	int ret, status;
	unsigned long regs[FRAME_SIZE];

	ret = waitpid( child, &status, 0);
	if ( ret != child ) {
		fprintf( stderr, "Parent: ");
		perror("waitpid");
		exit(1);
	}
	if ( !WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP ) {
		printf("\nParent: Childs status is %x: exiting\n", status);
		fflush( stdout);
		return (status != 0);
	}
	if ( ptrace( PTRACE_GETREGS, child, 0, regs) ) {
		fprintf( stderr, "Parent: ");
		perror("ptrace(GETREGS)");
		exit(1);
	}

	while (1) {
		regs[FS] ^= 7;
		if ( ptrace( PTRACE_SETREGS, child, 0, regs) ) {
			fprintf( stderr, "Parent: ");
			perror("ptrace(GETREGS)");
			exit(1);
		}
		if ( ptrace( PTRACE_CONT, child, 0, 0) < 0 ) {
			fprintf( stderr, "Parent: ");
			perror("ptrace( PTRACE_CONT, 0, 0, 0)");
			exit(1);
		}
		ret = waitpid( child, &status, 0);
		if ( ret != child ) {
			fprintf( stderr, "Parent: ");
			perror("waitpid");
			exit(1);
		}
		if ( !WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP ) {
			printf("\nParent: Childs status is %x: exiting\n", status);
			fflush( stdout);
			return (status != 0);
		}
	}
}


int
main( int argc, char ** argv)
{
	int res;
	pid_t child;

	child = fork();

	if ( child < 0 ) {
		perror("fork");
		exit(1);
	}
	else if ( child ) {
		res = parent_fn( child);
		return res;
	} else {
		res = child_fn();
		return res;
	}
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Race condition in ptrace
  2005-02-03 12:51 Race condition in ptrace Bodo Stroesser
@ 2005-02-04  0:27 ` Nick Piggin
  2005-02-04 12:35   ` Bodo Stroesser
  0 siblings, 1 reply; 12+ messages in thread
From: Nick Piggin @ 2005-02-04  0:27 UTC (permalink / raw)
  To: Bodo Stroesser
  Cc: Roland Mc Grath, Jeff Dike, BlaisorBlade, user-mode-linux devel,
	linux-kernel

Bodo Stroesser wrote:
> Working with the new UML skas0 mode on my Xeon HT host, sporadically I saw
> some processes on UML segfaulting.
> 
> In all cases, I could track this down to be caused by a gs segment 
> register,
> that had the wrong contents.
> 
> This again is caused by a problem in the host linux: A ptraced child 
> going to
> stop and having woken up its parent, will save some of its registers (on 
> i386
> they are fs, gs and the fp-registers) very late in switch_to. The parent is
> granted access to child's registers as soon, as the child is removed from
> the runqueue. Thus, in rare cases, the parent might access child's register
> savearea before the registers really are saved.
> 
> This problem might also be the reason for problems with floatpoint on UML,
> that were reported some time ago.
> 
> I've written a test program, that reproduces the problem on my 2.6.9 
> vanilla
> host quite quick. Using SuSE kernel 2.6.5-7.97-smp, I can't reproduce the
> problem, although the relevant parts seem to be unchanged. Maybe not 
> related
> changes modify the timing?
> 
> I also created a patch, that fixes the problem on my 2.6.9 host. This 
> probably
> isn't a sane patch, but is enough to demonstrate, where I think, the bug 
> is.
> Both files are attached.
> 
>        Bodo
> 
> 
> ------------------------------------------------------------------------
> 
> --- a/include/linux/sched.h	2005-02-02 22:15:51.000000000 +0100
> +++ b/include/linux/sched.h	2005-02-02 22:22:54.000000000 +0100
> @@ -584,6 +584,7 @@ struct task_struct {
>    	struct mempolicy *mempolicy;
>    	short il_next;		/* could be shared with used_math */
>  #endif
> +	volatile long saving;
>  };
>  
>  static inline pid_t process_group(struct task_struct *tsk)
> --- a/kernel/sched.c	2005-02-02 21:32:51.000000000 +0100
> +++ b/kernel/sched.c	2005-02-02 22:12:14.000000000 +0100
> @@ -2689,8 +2689,10 @@ need_resched:
>  		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
>  				unlikely(signal_pending(prev))))
>  			prev->state = TASK_RUNNING;
> -		else
> +		else {
> +			prev->saving = 1;
>  			deactivate_task(prev, rq);
> +		}
>  	}
>  
>  	cpu = smp_processor_id();
> --- a/kernel/ptrace.c	2005-02-02 22:12:33.000000000 +0100
> +++ b/kernel/ptrace.c	2005-02-02 22:20:46.000000000 +0100
> @@ -96,6 +96,7 @@ int ptrace_check_attach(struct task_stru
>  
>  	if (!ret && !kill) {
>  		wait_task_inactive(child);
> +		while ( child->saving ) ;
>  	}
>  
>  	/* All systems go.. */
> --- a/arch/i386/kernel/process.c	2005-02-02 22:18:29.000000000 +0100
> +++ b/arch/i386/kernel/process.c	2005-02-02 22:19:22.000000000 +0100
> @@ -577,6 +577,9 @@ struct task_struct fastcall * __switch_t
>  	asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
>  	asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
>  
> +	wmb();
> +	prev_p->saving=0;
> +
>  	/*
>  	 * Restore %fs and %gs if needed.
>  	 */
> 

I don't see how this could help because AFAIKS, child->saving is only
set and cleared while the runqueue is locked. And the same runqueue lock
is taken by wait_task_inactive.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Race condition in ptrace
  2005-02-04  0:27 ` Nick Piggin
@ 2005-02-04 12:35   ` Bodo Stroesser
  2005-02-04 22:15     ` Nick Piggin
  0 siblings, 1 reply; 12+ messages in thread
From: Bodo Stroesser @ 2005-02-04 12:35 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Roland Mc Grath, Jeff Dike, BlaisorBlade, user-mode-linux devel,
	linux-kernel

Nick Piggin wrote:
> Bodo Stroesser wrote:
> 
>> Working with the new UML skas0 mode on my Xeon HT host, sporadically I 
>> saw
>> some processes on UML segfaulting.
>>
>> In all cases, I could track this down to be caused by a gs segment 
>> register,
>> that had the wrong contents.
>>
>> This again is caused by a problem in the host linux: A ptraced child 
>> going to
>> stop and having woken up its parent, will save some of its registers 
>> (on i386
>> they are fs, gs and the fp-registers) very late in switch_to. The 
>> parent is
>> granted access to child's registers as soon, as the child is removed from
>> the runqueue. Thus, in rare cases, the parent might access child's 
>> register
>> savearea before the registers really are saved.
>>
>> This problem might also be the reason for problems with floatpoint on 
>> UML,
>> that were reported some time ago.
>>
>> I've written a test program, that reproduces the problem on my 2.6.9 
>> vanilla
>> host quite quick. Using SuSE kernel 2.6.5-7.97-smp, I can't reproduce the
>> problem, although the relevant parts seem to be unchanged. Maybe not 
>> related
>> changes modify the timing?
>>
>> I also created a patch, that fixes the problem on my 2.6.9 host. This 
>> probably
>> isn't a sane patch, but is enough to demonstrate, where I think, the 
>> bug is.
>> Both files are attached.
>>
>>        Bodo
>>
>>
>> ------------------------------------------------------------------------
>>
>> --- a/include/linux/sched.h    2005-02-02 22:15:51.000000000 +0100
>> +++ b/include/linux/sched.h    2005-02-02 22:22:54.000000000 +0100
>> @@ -584,6 +584,7 @@ struct task_struct {
>>        struct mempolicy *mempolicy;
>>        short il_next;        /* could be shared with used_math */
>>  #endif
>> +    volatile long saving;
>>  };
>>  
>>  static inline pid_t process_group(struct task_struct *tsk)
>> --- a/kernel/sched.c    2005-02-02 21:32:51.000000000 +0100
>> +++ b/kernel/sched.c    2005-02-02 22:12:14.000000000 +0100
>> @@ -2689,8 +2689,10 @@ need_resched:
>>          if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
>>                  unlikely(signal_pending(prev))))
>>              prev->state = TASK_RUNNING;
>> -        else
>> +        else {
>> +            prev->saving = 1;
>>              deactivate_task(prev, rq);
>> +        }
>>      }
>>  
>>      cpu = smp_processor_id();
>> --- a/kernel/ptrace.c    2005-02-02 22:12:33.000000000 +0100
>> +++ b/kernel/ptrace.c    2005-02-02 22:20:46.000000000 +0100
>> @@ -96,6 +96,7 @@ int ptrace_check_attach(struct task_stru
>>  
>>      if (!ret && !kill) {
>>          wait_task_inactive(child);
>> +        while ( child->saving ) ;
>>      }
>>  
>>      /* All systems go.. */
>> --- a/arch/i386/kernel/process.c    2005-02-02 22:18:29.000000000 +0100
>> +++ b/arch/i386/kernel/process.c    2005-02-02 22:19:22.000000000 +0100
>> @@ -577,6 +577,9 @@ struct task_struct fastcall * __switch_t
>>      asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
>>      asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
>>  
>> +    wmb();
>> +    prev_p->saving=0;
>> +
>>      /*
>>       * Restore %fs and %gs if needed.
>>       */
>>
> 
> I don't see how this could help because AFAIKS, child->saving is only
> set and cleared while the runqueue is locked. And the same runqueue lock
> is taken by wait_task_inactive.
> 

Sorry, that not right. There are some routines called by sched(), that release
and reacquire the runqueue lock.

Bodo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Race condition in ptrace
  2005-02-04 12:35   ` Bodo Stroesser
@ 2005-02-04 22:15     ` Nick Piggin
  2005-02-04 22:39       ` Andrew Morton
  0 siblings, 1 reply; 12+ messages in thread
From: Nick Piggin @ 2005-02-04 22:15 UTC (permalink / raw)
  To: Bodo Stroesser
  Cc: Roland Mc Grath, Jeff Dike, BlaisorBlade, user-mode-linux devel,
	linux-kernel, Andrew Morton

Bodo Stroesser wrote:
> Nick Piggin wrote:
> 
>> Bodo Stroesser wrote:

>> I don't see how this could help because AFAIKS, child->saving is only
>> set and cleared while the runqueue is locked. And the same runqueue lock
>> is taken by wait_task_inactive.
>>
> 
> Sorry, that not right. There are some routines called by sched(), that 
> release
> and reacquire the runqueue lock.
> 

Oh yeah, it is the wake_sleeping_dependent / dependent_sleeper crap.
Sorry, you are right. And that's definitely a bug in sched.c, because
it breaks wait_task_inactive, as you've rightly observed.

Andrew, IMO this is another bug to hold 2.6.11 for.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Race condition in ptrace
  2005-02-04 22:15     ` Nick Piggin
@ 2005-02-04 22:39       ` Andrew Morton
  2005-02-04 23:15         ` Nick Piggin
  0 siblings, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2005-02-04 22:39 UTC (permalink / raw)
  To: Nick Piggin
  Cc: bstroesser, roland, jdike, blaisorblade_spam,
	user-mode-linux-devel, linux-kernel

Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Bodo Stroesser wrote:
> > Nick Piggin wrote:
> > 
> >> Bodo Stroesser wrote:
> 
> >> I don't see how this could help because AFAIKS, child->saving is only
> >> set and cleared while the runqueue is locked. And the same runqueue lock
> >> is taken by wait_task_inactive.
> >>
> > 
> > Sorry, that not right. There are some routines called by sched(), that 
> > release
> > and reacquire the runqueue lock.
> > 
> 
> Oh yeah, it is the wake_sleeping_dependent / dependent_sleeper crap.
> Sorry, you are right. And that's definitely a bug in sched.c, because
> it breaks wait_task_inactive, as you've rightly observed.
> 
> Andrew, IMO this is another bug to hold 2.6.11 for.

Sure.  I wouldn't consider Bodo's patch to be the one to use though..

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Race condition in ptrace
  2005-02-04 22:39       ` Andrew Morton
@ 2005-02-04 23:15         ` Nick Piggin
  2005-02-05  4:35           ` Nick Piggin
  0 siblings, 1 reply; 12+ messages in thread
From: Nick Piggin @ 2005-02-04 23:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: bstroesser, roland, jdike, blaisorblade_spam,
	user-mode-linux-devel, linux-kernel

Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
>>Bodo Stroesser wrote:
>>
>>>Nick Piggin wrote:
>>>
>>>
>>>>Bodo Stroesser wrote:
>>
>>>>I don't see how this could help because AFAIKS, child->saving is only
>>>>set and cleared while the runqueue is locked. And the same runqueue lock
>>>>is taken by wait_task_inactive.
>>>>
>>>
>>>Sorry, that not right. There are some routines called by sched(), that 
>>>release
>>>and reacquire the runqueue lock.
>>>
>>
>>Oh yeah, it is the wake_sleeping_dependent / dependent_sleeper crap.
>>Sorry, you are right. And that's definitely a bug in sched.c, because
>>it breaks wait_task_inactive, as you've rightly observed.
>>
>>Andrew, IMO this is another bug to hold 2.6.11 for.
> 
> 
> Sure.  I wouldn't consider Bodo's patch to be the one to use though..

No. Something similar could be done that works on all architectures
and all wait_task_inactive callers (and is confined to sched.c). That
would still be more or less a hack to work around smtnice's unfortunate
locking though.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Race condition in ptrace
  2005-02-04 23:15         ` Nick Piggin
@ 2005-02-05  4:35           ` Nick Piggin
  2005-02-06  3:26             ` [PATCH] fix wait_task_inactive race (was Re: Race condition in ptrace) Nick Piggin
  0 siblings, 1 reply; 12+ messages in thread
From: Nick Piggin @ 2005-02-05  4:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: bstroesser, roland, jdike, blaisorblade_spam,
	user-mode-linux-devel, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 801 bytes --]

Nick Piggin wrote:
> Andrew Morton wrote:
> 
>> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>

>>> Andrew, IMO this is another bug to hold 2.6.11 for.
>>
>>
>>
>> Sure.  I wouldn't consider Bodo's patch to be the one to use though..
> 
> 
> No. Something similar could be done that works on all architectures
> and all wait_task_inactive callers (and is confined to sched.c). That
> would still be more or less a hack to work around smtnice's unfortunate
> locking though.
> 

Something like the following (untested) extension of Bodo's work
could be the minimal fix for 2.6.11. As I've said though, I'd
consider it a hack and prefer to do something about the locking.
That could be done after 2.6.11 though. Depends how you feel.

Bodo, I wonder if this looks like a suitable fix for your problem?

[-- Attachment #2: sched-fixup-locking.patch --]
[-- Type: text/plain, Size: 2795 bytes --]




---

 linux-2.6-npiggin/include/linux/init_task.h |    1 +
 linux-2.6-npiggin/include/linux/sched.h     |    1 +
 linux-2.6-npiggin/kernel/sched.c            |   12 ++++++++++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff -puN include/linux/sched.h~sched-fixup-locking include/linux/sched.h
--- linux-2.6/include/linux/sched.h~sched-fixup-locking	2005-02-05 15:24:00.000000000 +1100
+++ linux-2.6-npiggin/include/linux/sched.h	2005-02-05 15:24:39.000000000 +1100
@@ -533,6 +533,7 @@ struct task_struct {
 	unsigned long ptrace;
 
 	int lock_depth;		/* Lock depth */
+	int on_cpu;		/* Is the task on the CPU, or in a ctxsw */
 
 	int prio, static_prio;
 	struct list_head run_list;
diff -puN kernel/sched.c~sched-fixup-locking kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-fixup-locking	2005-02-05 15:24:02.000000000 +1100
+++ linux-2.6-npiggin/kernel/sched.c	2005-02-05 15:34:37.000000000 +1100
@@ -294,7 +294,7 @@ static DEFINE_PER_CPU(struct runqueue, r
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(rq, next)	do { } while (0)
 # define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)		((rq)->curr == (p))
+# define task_running(rq, p)		((p)->on_cpu)
 #endif
 
 /*
@@ -867,7 +867,7 @@ void wait_task_inactive(task_t * p)
 repeat:
 	rq = task_rq_lock(p, &flags);
 	/* Must be off runqueue entirely, not preempted. */
-	if (unlikely(p->array)) {
+	if (unlikely(p->array || p->on_cpu)) {
 		/* If it's preempted, we yield.  It could be a while. */
 		preempted = !task_running(rq, p);
 		task_rq_unlock(rq, &flags);
@@ -2805,11 +2805,18 @@ switch_tasks:
 		rq->curr = next;
 		++*switch_count;
 
+		next->on_cpu = 1;
 		prepare_arch_switch(rq, next);
 		prev = context_switch(rq, prev, next);
 		barrier();
 
 		finish_task_switch(prev);
+		/*
+		 * Some architectures release the runqueue lock before
+		 * context switching. Make sure this isn't reordered.
+		 */
+		smp_wmb();
+		prev->on_cpu = 0;
 	} else
 		spin_unlock_irq(&rq->lock);
 
@@ -4055,6 +4062,7 @@ void __devinit init_idle(task_t *idle, i
 	set_task_cpu(idle, cpu);
 
 	spin_lock_irqsave(&rq->lock, flags);
+	idle->on_cpu = 1;
 	rq->curr = rq->idle = idle;
 	set_tsk_need_resched(idle);
 	spin_unlock_irqrestore(&rq->lock, flags);
diff -puN include/linux/init_task.h~sched-fixup-locking include/linux/init_task.h
--- linux-2.6/include/linux/init_task.h~sched-fixup-locking	2005-02-05 15:24:56.000000000 +1100
+++ linux-2.6-npiggin/include/linux/init_task.h	2005-02-05 15:25:07.000000000 +1100
@@ -73,6 +73,7 @@ extern struct group_info init_groups;
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= 0,						\
 	.lock_depth	= -1,						\
+	.on_cpu		= 0,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\

_

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] fix wait_task_inactive race (was Re: Race condition in ptrace)
  2005-02-05  4:35           ` Nick Piggin
@ 2005-02-06  3:26             ` Nick Piggin
  2005-02-06  7:19               ` Ingo Molnar
  0 siblings, 1 reply; 12+ messages in thread
From: Nick Piggin @ 2005-02-06  3:26 UTC (permalink / raw)
  To: Andrew Morton
  Cc: bstroesser, roland, jdike, blaisorblade_spam,
	user-mode-linux-devel, linux-kernel, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 1260 bytes --]

Nick Piggin wrote:

> Something like the following (untested) extension of Bodo's work
> could be the minimal fix for 2.6.11. As I've said though, I'd
> consider it a hack and prefer to do something about the locking.
> That could be done after 2.6.11 though. Depends how you feel.
> 

I think this is the right fix.

When a task is put to sleep, it is dequeued from the runqueue
while it is still running. The problem is that the runqueue
lock can be dropped and retaken in schedule() before the task
actually schedules off, and wait_task_inactive did not account
for this.

I introduced a new function to resolve this state, fixed
wait_task_inactive, and converted over an open coded test.

I did a quick audit of sched.c, and nothing else seems to have
made the same mistake.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>


Question: why does wait_task_inactive have different semantics
for UP && PREEMPT than SMP && PREEMPT? I can see that the kthread
caler probably isn't used in the UP case, but technically it is
relying on behaviour that it doesn't get with UP and PREEMPT.
Looks like the ptrace.c caller won't care.

But still, can we either fix it or put a nice comment there?
Preferably fix, if this isn't a very performance critical path?


[-- Attachment #2: sched-fixup-races.patch --]
[-- Type: text/plain, Size: 1416 bytes --]




---

 linux-2.6-npiggin/kernel/sched.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff -puN kernel/sched.c~sched-fixup-races kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-fixup-races	2005-02-06 14:03:53.000000000 +1100
+++ linux-2.6-npiggin/kernel/sched.c	2005-02-06 14:06:43.000000000 +1100
@@ -298,6 +298,14 @@ static DEFINE_PER_CPU(struct runqueue, r
 #endif
 
 /*
+ * Is the task currently running or on the runqueue
+ */
+static int task_onqueue(runqueue_t *rq, task_t *p)
+{
+	return (p->array || task_running(rq, p));
+}
+
+/*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
@@ -836,7 +844,7 @@ static int migrate_task(task_t *p, int d
 	 * If the task is not on a runqueue (and not running), then
 	 * it is sufficient to simply update the task's cpu field.
 	 */
-	if (!p->array && !task_running(rq, p)) {
+	if (!task_onqueue(rq, p)) {
 		set_task_cpu(p, dest_cpu);
 		return 0;
 	}
@@ -867,7 +875,7 @@ void wait_task_inactive(task_t * p)
 repeat:
 	rq = task_rq_lock(p, &flags);
 	/* Must be off runqueue entirely, not preempted. */
-	if (unlikely(p->array)) {
+	if (unlikely(task_onqueue(rq, p))) {
 		/* If it's preempted, we yield.  It could be a while. */
 		preempted = !task_running(rq, p);
 		task_rq_unlock(rq, &flags);

_

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] fix wait_task_inactive race (was Re: Race condition in ptrace)
  2005-02-06  3:26             ` [PATCH] fix wait_task_inactive race (was Re: Race condition in ptrace) Nick Piggin
@ 2005-02-06  7:19               ` Ingo Molnar
  2005-02-06  7:36                 ` Nick Piggin
  0 siblings, 1 reply; 12+ messages in thread
From: Ingo Molnar @ 2005-02-06  7:19 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, bstroesser, roland, jdike, blaisorblade_spam,
	user-mode-linux-devel, linux-kernel


* Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> When a task is put to sleep, it is dequeued from the runqueue
> while it is still running. The problem is that the runqueue
> lock can be dropped and retaken in schedule() before the task
> actually schedules off, and wait_task_inactive did not account
> for this.

ugh. This has been the Nth time we got bitten by the fundamental
unrobustness of non-atomic scheduling on some architectures ...
(And i'll say the N+1th time that this is not good.)

> +static int task_onqueue(runqueue_t *rq, task_t *p)
> +{
> +	return (p->array || task_running(rq, p));
> +}

the fix looks good, but i'd go for the simplified oneliner patch below. 
I dont like the name 'task_onqueue()', a because a task is 'on the
queue' when it has p->array != NULL. The task is running when it's
task_running().  On architectures with nonatomic scheduling a task may
be running while not on the queue and external observers with the
runqueue lock might notice this - and wait_task_inactive() has to take
care of this case. I'm not sure how introducing a function named
"task_onqueue()" will make the situation any clearer.

ok?

	Ingo

--
When a task is put to sleep, it is dequeued from the runqueue
while it is still running. The problem is that one some arches
that has non-atomic scheduling, the runqueue lock can be
dropped and retaken in schedule() before the task actually
schedules off, and wait_task_inactive did not account for this.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

---

 linux/kernel/sched.c |    2 +-
 1 files changed, 1 insertion(+), 1 deletion(-)

--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -867,7 +875,7 @@ void wait_task_inactive(task_t * p)
 repeat:
 	rq = task_rq_lock(p, &flags);
 	/* Must be off runqueue entirely, not preempted. */
-	if (unlikely(p->array)) {
+	if (unlikely(p->array || task_running(rq, p))) {
 		/* If it's preempted, we yield.  It could be a while. */
 		preempted = !task_running(rq, p);
 		task_rq_unlock(rq, &flags);


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] fix wait_task_inactive race (was Re: Race condition in ptrace)
  2005-02-06  7:19               ` Ingo Molnar
@ 2005-02-06  7:36                 ` Nick Piggin
  2005-02-06  7:47                   ` Nick Piggin
  2005-02-14 16:07                   ` Bodo Stroesser
  0 siblings, 2 replies; 12+ messages in thread
From: Nick Piggin @ 2005-02-06  7:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, bstroesser, roland, jdike, blaisorblade_spam,
	user-mode-linux-devel, linux-kernel

Ingo Molnar wrote:
> * Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
> 
>>When a task is put to sleep, it is dequeued from the runqueue
>>while it is still running. The problem is that the runqueue
>>lock can be dropped and retaken in schedule() before the task
>>actually schedules off, and wait_task_inactive did not account
>>for this.
> 
> 
> ugh. This has been the Nth time we got bitten by the fundamental
> unrobustness of non-atomic scheduling on some architectures ...
> (And i'll say the N+1th time that this is not good.)
> 

This is actually due to wake_sleeping_dependent and
dependent_sleeper dropping the runqueue lock.

Actually idle_balance can (in rare cases) drop the lock as well,
which I didn't notice earlier, so it is something that we
have been doing forever. The smtnice locking has just exposed
the problem for us, so I wrongfully bashed it earlier *blush*.

> 
>>+static int task_onqueue(runqueue_t *rq, task_t *p)
>>+{
>>+	return (p->array || task_running(rq, p));
>>+}
> 
> 
> the fix looks good, but i'd go for the simplified oneliner patch below. 
> I dont like the name 'task_onqueue()', a because a task is 'on the
> queue' when it has p->array != NULL. The task is running when it's
> task_running().  On architectures with nonatomic scheduling a task may
> be running while not on the queue and external observers with the
> runqueue lock might notice this - and wait_task_inactive() has to take
> care of this case. I'm not sure how introducing a function named
> "task_onqueue()" will make the situation any clearer.
> 
> ok?
> 

Well just because there is a specific condition that both callsites
require. That is, the task is neither running nor on the runqueue.

While task_onqueue is technically wrong if you're looking down into
the fine details of the priority queue management, I think it is OK
to go up a level of abstraction and say that the task is
"on the runqueue" if it is either running or waiting to run.

It is really the one condition that is made un-intuitive due to the
locking in schedule(), so I thought formalising it would be better.
Suggestions for a better name welcome? ;)

Your one liner would fix the problem too, of course. The important
thing at this stage is that it gets fixed for 2.6.11.

Thanks,
Nick



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] fix wait_task_inactive race (was Re: Race condition in ptrace)
  2005-02-06  7:36                 ` Nick Piggin
@ 2005-02-06  7:47                   ` Nick Piggin
  2005-02-14 16:07                   ` Bodo Stroesser
  1 sibling, 0 replies; 12+ messages in thread
From: Nick Piggin @ 2005-02-06  7:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, bstroesser, roland, jdike, blaisorblade_spam,
	user-mode-linux-devel, linux-kernel

Nick Piggin wrote:
> Ingo Molnar wrote:
> 
>> * Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>
>>
>>> When a task is put to sleep, it is dequeued from the runqueue
>>> while it is still running. The problem is that the runqueue
>>> lock can be dropped and retaken in schedule() before the task
>>> actually schedules off, and wait_task_inactive did not account
>>> for this.
>>
>>
>>
>> ugh. This has been the Nth time we got bitten by the fundamental
>> unrobustness of non-atomic scheduling on some architectures ...
>> (And i'll say the N+1th time that this is not good.)
>>
> 
> This is actually due to wake_sleeping_dependent and
> dependent_sleeper dropping the runqueue lock.
> 

Hmph, *and* unlocked context switch architectures as you say.
In fact, I'm surprised those haven't been bitten by this problem
earlier.

So that makes us each half right! :)


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] fix wait_task_inactive race (was Re: Race condition in ptrace)
  2005-02-06  7:36                 ` Nick Piggin
  2005-02-06  7:47                   ` Nick Piggin
@ 2005-02-14 16:07                   ` Bodo Stroesser
  1 sibling, 0 replies; 12+ messages in thread
From: Bodo Stroesser @ 2005-02-14 16:07 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Andrew Morton, roland, jdike, blaisorblade_spam,
	user-mode-linux-devel, linux-kernel

Nick Piggin wrote:
> Your one liner would fix the problem too, of course. The important
> thing at this stage is that it gets fixed for 2.6.11.

Sorry, have been off the net last week.

Thank you for the patches. Have tested Ingo's one liner.
It works fine for me, as expected.

Bodo

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2005-02-14 16:08 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-02-03 12:51 Race condition in ptrace Bodo Stroesser
2005-02-04  0:27 ` Nick Piggin
2005-02-04 12:35   ` Bodo Stroesser
2005-02-04 22:15     ` Nick Piggin
2005-02-04 22:39       ` Andrew Morton
2005-02-04 23:15         ` Nick Piggin
2005-02-05  4:35           ` Nick Piggin
2005-02-06  3:26             ` [PATCH] fix wait_task_inactive race (was Re: Race condition in ptrace) Nick Piggin
2005-02-06  7:19               ` Ingo Molnar
2005-02-06  7:36                 ` Nick Piggin
2005-02-06  7:47                   ` Nick Piggin
2005-02-14 16:07                   ` Bodo Stroesser

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).