linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* VM fixes [4/4]
@ 2004-12-24 17:41 Andrea Arcangeli
  2004-12-24 18:01 ` David S. Miller
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2004-12-24 17:41 UTC (permalink / raw)
  To: linux-kernel; +Cc: Thomas Gleixner, Andrew Morton

This is the core of the oom-killer fixes I developed partly taking the
idea from Thomas's patches of getting feedback from the exit path, plus
I moved the oom killer into page_alloc.c as it should to be able to
check the watermarks before killing more stuff. This also tweaks the
badness to take thread bombs more into account (that change to badness
is from Thomas, from my part I'd rather rewrite badness from scratch
instead, but that's an orthogonal issue ;). This also merges your write
throttling fix. With this applied the oom killer is very sane, no more 5
sec waits and spurious oom kills.

Write throttling works perfectly here with these 4 patches applied to
2.6.10-rc3, no idea what other breakage Rik has in his kernel that kills
tasks with a cp /dev/zero /tmp. I doubt it's related to the harddisk
timings, my hd is fast, but the deviation between HD speed and ram speed
is so huge that it shouldn't matter.

Rik I'd recommend to try with 2.6.10-rc3 mainline with only these 4
patches applied and see if you can reproduce. If you can really still
reproduce (I mean without tweaking the sysctl default values) this will
still make it a good base to start working on to fix more bugs.

Please apply all 4 patches to mainline, thanks! They're all against
2.6.10-rc3 (bkcvs is broken and I'm not yet adapted the bk snapshots,
I still have the hope that bkcvs will be eventually fixed)

From: Andrea Arcangeli <andrea@suse.de>
Subject: fix several oom killer bugs, most important avoid spurious oom kills
 badness algorithm tweaked by Thomas Gleixner and throttling fix from
 Andrew

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

From: Andrea Arcangeli <andrea@suse.de>
Subject: fix several oom killer bugs, most important avoid spurious oom
 kills, badness algorithm tweaked by Thomas Gleixner and write throttling 
 from Andrew

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- x/include/linux/sched.h.orig	2004-12-24 17:53:50.806536304 +0100
+++ x/include/linux/sched.h	2004-12-24 18:00:58.267552352 +0100
@@ -601,6 +601,11 @@ struct task_struct {
 	struct key *thread_keyring;	/* keyring private to this thread */
 #endif
 /*
+ * All archs should support atomic ops with
+ * 1 byte granularity.
+ */
+	unsigned char memdie;
+/*
  * Must be changed atomically so it shouldn't be
  * be a shareable bitflag.
  */
@@ -715,8 +720,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_DUMPCORE	0x00000200	/* dumped core */
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
-#define PF_MEMDIE	0x00001000	/* Killed for out-of-memory */
-#define PF_FLUSHER	0x00002000	/* responsible for disk writeback */
+#define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
 
 #define PF_FREEZE	0x00004000	/* this task should be frozen for suspend */
 #define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
--- x/kernel/exit.c.orig	2004-12-04 08:56:33.000000000 +0100
+++ x/kernel/exit.c	2004-12-24 18:00:58.270551896 +0100
@@ -242,9 +242,8 @@ void reparent_to_init(void)
 	memcpy(current->signal->rlim, init_task.signal->rlim,
 	       sizeof(current->signal->rlim));
 	atomic_inc(&(INIT_USER->__count));
-	switch_uid(INIT_USER);
-
 	write_unlock_irq(&tasklist_lock);
+	switch_uid(INIT_USER);
 }
 
 void __set_special_pids(pid_t session, pid_t pgrp)
--- x/mm/oom_kill.c.orig	2004-12-24 17:53:50.807536152 +0100
+++ x/mm/oom_kill.c	2004-12-24 18:01:19.903263224 +0100
@@ -45,18 +45,30 @@
 unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
+	struct list_head *tsk;
 
 	if (!p->mm)
 		return 0;
 
-	if (p->flags & PF_MEMDIE)
-		return 0;
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
 	points = p->mm->total_vm;
 
 	/*
+	 * Processes which fork a lot of child processes are likely 
+	 * a good choice. We add the vmsize of the childs if they
+	 * have an own mm. This prevents forking servers to flood the
+	 * machine with an endless amount of childs
+	 */
+	list_for_each(tsk, &p->children) {
+		struct task_struct *chld;
+		chld = list_entry(tsk, struct task_struct, sibling);
+		if (chld->mm != p->mm && chld->mm)
+			points += chld->mm->total_vm;
+	}
+
+	/*
 	 * CPU time is in tens of seconds and run time is in thousands
          * of seconds. There is no particular reason for this other than
          * that it turned out to work very well in practice.
@@ -131,14 +143,24 @@ static struct task_struct * select_bad_p
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p)
-		if (p->pid) {
-			unsigned long points = badness(p, uptime.tv_sec);
-			if (points > maxpoints) {
+		/* skip the init task with pid == 1 */
+		if (p->pid > 1) {
+			unsigned long points;
+
+			/*
+			 * This is in the process of releasing memory so wait it
+			 * to finish before killing some other task by mistake.
+			 */
+			if ((p->memdie || (p->flags & PF_EXITING)) && !(p->flags & PF_DEAD))
+				return ERR_PTR(-1UL);
+			if (p->flags & PF_SWAPOFF)
+				return p;
+
+			points = badness(p, uptime.tv_sec);
+			if (points > maxpoints || !chosen) {
 				chosen = p;
 				maxpoints = points;
 			}
-			if (p->flags & PF_SWAPOFF)
-				return p;
 		}
 	while_each_thread(g, p);
 	return chosen;
@@ -151,6 +173,12 @@ static struct task_struct * select_bad_p
  */
 static void __oom_kill_task(task_t *p)
 {
+	if (p->pid == 1) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill init!\n");
+		return;
+	}
+
 	task_lock(p);
 	if (!p->mm || p->mm == &init_mm) {
 		WARN_ON(1);
@@ -167,7 +195,7 @@ static void __oom_kill_task(task_t *p)
 	 * exit() and clear out its resources quickly...
 	 */
 	p->time_slice = HZ;
-	p->flags |= PF_MEMALLOC | PF_MEMDIE;
+	p->memdie = 1;
 
 	/* This process has hardware access, be more careful. */
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
@@ -180,12 +208,45 @@ static void __oom_kill_task(task_t *p)
 static struct mm_struct *oom_kill_task(task_t *p)
 {
 	struct mm_struct *mm = get_task_mm(p);
-	if (!mm || mm == &init_mm)
+	task_t * g, * q;
+
+	if (!mm)
 		return NULL;
+	if (mm == &init_mm) {
+		mmput(mm);
+		return NULL;
+	}
+
 	__oom_kill_task(p);
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group
+	 */
+	do_each_thread(g, q)
+		if (q->mm == mm && q->tgid != p->tgid)
+			__oom_kill_task(q);
+	while_each_thread(g, q);
+
 	return mm;
 }
 
+static struct mm_struct *oom_kill_process(task_t *p)
+{
+ 	struct mm_struct *mm;
+	struct task_struct *c;
+	struct list_head *tsk;
+
+	/* Try to kill a child first */
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_task(c);
+		if (mm)
+			return mm;
+	}
+	return oom_kill_task(p);
+}
 
 /**
  * oom_kill - kill the "best" process when we run out of memory
@@ -195,117 +256,40 @@ static struct mm_struct *oom_kill_task(t
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-static void oom_kill(void)
+void out_of_memory(int gfp_mask)
 {
-	struct mm_struct *mm;
-	struct task_struct *g, *p, *q;
-	
+	struct mm_struct *mm = NULL;
+	task_t * p;
+
 	read_lock(&tasklist_lock);
 retry:
 	p = select_bad_process();
 
+	if (PTR_ERR(p) == -1UL)
+		goto out;
+
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
+		read_unlock(&tasklist_lock);
 		show_free_areas();
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	mm = oom_kill_task(p);
-	if (!mm)
-		goto retry;
-	/*
-	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
-	 */
-	do_each_thread(g, q)
-		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
-	while_each_thread(g, q);
-	if (!p->mm)
-		printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
-	read_unlock(&tasklist_lock);
-	mmput(mm);
-
-	/*
-	 * Make kswapd go out of the way, so "p" has a good chance of
-	 * killing itself before someone else gets the chance to ask
-	 * for more memory.
-	 */
-	yield();
-	return;
-}
-
-/**
- * out_of_memory - is the system out of memory?
- */
-void out_of_memory(int gfp_mask)
-{
-	/*
-	 * oom_lock protects out_of_memory()'s static variables.
-	 * It's a global lock; this is not performance-critical.
-	 */
-	static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
-	static unsigned long first, last, count, lastkill;
-	unsigned long now, since;
-
-	spin_lock(&oom_lock);
-	now = jiffies;
-	since = now - last;
-	last = now;
-
-	/*
-	 * If it's been a long time since last failure,
-	 * we're not oom.
-	 */
-	if (since > 5*HZ)
-		goto reset;
-
-	/*
-	 * If we haven't tried for at least one second,
-	 * we're not really oom.
-	 */
-	since = now - first;
-	if (since < HZ)
-		goto out_unlock;
-
-	/*
-	 * If we have gotten only a few failures,
-	 * we're not really oom. 
-	 */
-	if (++count < 10)
-		goto out_unlock;
-
-	/*
-	 * If we just killed a process, wait a while
-	 * to give that task a chance to exit. This
-	 * avoids killing multiple processes needlessly.
-	 */
-	since = now - lastkill;
-	if (since < HZ*5)
-		goto out_unlock;
-
-	/*
-	 * Ok, really out of memory. Kill something.
-	 */
-	lastkill = now;
-
 	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
 	show_free_areas();
+	mm = oom_kill_process(p);
+	if (!mm)
+		goto retry;
 
-	/* oom_kill() sleeps */
-	spin_unlock(&oom_lock);
-	oom_kill();
-	spin_lock(&oom_lock);
+ out:
+	read_unlock(&tasklist_lock);
+	if (mm)
+		mmput(mm);
 
-reset:
 	/*
-	 * We dropped the lock above, so check to be sure the variable
-	 * first only ever increases to prevent false OOM's.
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory.
 	 */
-	if (time_after(now, first))
-		first = now;
-	count = 0;
-
-out_unlock:
-	spin_unlock(&oom_lock);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(1);
 }
--- x/mm/page_alloc.c.orig	2004-12-24 17:59:36.182031248 +0100
+++ x/mm/page_alloc.c	2004-12-24 18:00:58.276550984 +0100
@@ -606,6 +606,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
+	int did_some_progress;
 
 	might_sleep_if(wait);
 
@@ -625,6 +626,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 
 	classzone_idx = zone_idx(zones[0]);
 
+ restart:
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		min = z->pages_low + (1<<order) + z->lowmem_reserve[classzone_idx];
@@ -661,7 +663,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	}
 
 	/* This allocation should allow future memory freeing. */
-	if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
+	if (((p->flags & PF_MEMALLOC) || p->memdie) && !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; (z = zones[i]) != NULL; i++) {
 			page = buffered_rmqueue(z, order, gfp_mask);
@@ -676,31 +678,58 @@ __alloc_pages(unsigned int gfp_mask, uns
 		goto nopage;
 
 rebalance:
+	cond_resched();
+
 	/* We now go into synchronous reclaim */
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	try_to_free_pages(zones, gfp_mask, order);
+	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
-	/* go through the zonelist yet one more time */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		min = z->pages_min;
-		if (gfp_mask & __GFP_HIGH)
-			min /= 2;
-		if (can_try_harder)
-			min -= min / 4;
-		min += (1<<order) + z->lowmem_reserve[classzone_idx];
+	cond_resched();
 
-		if (z->free_pages < min)
-			continue;
+	if (likely(did_some_progress)) {
+		/* go through the zonelist yet one more time */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			min = z->pages_min;
+			if (gfp_mask & __GFP_HIGH)
+				min /= 2;
+			if (can_try_harder)
+				min -= min / 4;
+			min += (1<<order) + z->lowmem_reserve[classzone_idx];
 
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
+			if (z->free_pages < min)
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+		/*
+		 * Go through the zonelist yet one more time, keep
+		 * very high watermark here, this is only to catch
+		 * a parallel oom killing, we must fail if we're still
+		 * under heavy pressure.
+		 */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			min = z->pages_high;
+			min += (1<<order) + z->lowmem_reserve[classzone_idx];
+
+			if (z->free_pages < min)
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+
+		out_of_memory(gfp_mask);
+		goto restart;
 	}
 
 	/*
--- x/mm/swap_state.c.orig	2004-12-04 08:56:33.000000000 +0100
+++ x/mm/swap_state.c	2004-12-24 18:00:58.277550832 +0100
@@ -59,6 +59,8 @@ void show_swap_cache_info(void)
 		swap_cache_info.add_total, swap_cache_info.del_total,
 		swap_cache_info.find_success, swap_cache_info.find_total,
 		swap_cache_info.noent_race, swap_cache_info.exist_race);
+	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
 
 /*
--- x/mm/vmscan.c.orig	2004-12-04 08:56:33.000000000 +0100
+++ x/mm/vmscan.c	2004-12-24 18:00:58.280550376 +0100
@@ -935,8 +935,6 @@ int try_to_free_pages(struct zone **zone
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
-	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
-		out_of_memory(gfp_mask);
 out:
 	for (i = 0; zones[i] != 0; i++)
 		zones[i]->prev_priority = zones[i]->temp_priority;
@@ -1063,6 +1061,7 @@ scan:
 			shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
 			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_reclaimed += sc.nr_reclaimed;
+			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
 				continue;
 			if (zone->pages_scanned >= (zone->nr_active +

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 17:41 VM fixes [4/4] Andrea Arcangeli
@ 2004-12-24 18:01 ` David S. Miller
  2004-12-24 18:22   ` Andrea Arcangeli
  2004-12-24 23:32 ` Linus Torvalds
  2004-12-27 13:38 ` Rik van Riel
  2 siblings, 1 reply; 21+ messages in thread
From: David S. Miller @ 2004-12-24 18:01 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, tglx, akpm

On Fri, 24 Dec 2004 18:41:56 +0100
Andrea Arcangeli <andrea@suse.de> wrote:

> + * All archs should support atomic ops with
> + * 1 byte granularity.
> + */
> +	unsigned char memdie;

Again, older Alpha's do not.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 18:01 ` David S. Miller
@ 2004-12-24 18:22   ` Andrea Arcangeli
  2004-12-24 20:55     ` David S. Miller
                       ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2004-12-24 18:22 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel, tglx, akpm

On Fri, Dec 24, 2004 at 10:01:47AM -0800, David S. Miller wrote:
> On Fri, 24 Dec 2004 18:41:56 +0100
> Andrea Arcangeli <andrea@suse.de> wrote:
> 
> > + * All archs should support atomic ops with
> > + * 1 byte granularity.
> > + */
> > +	unsigned char memdie;
> 
> Again, older Alpha's do not.

If those old cpus really supported smp in linux, then fixing this bit is
trivial, just change it to short. Do they support short at least?

Thanks.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 18:22   ` Andrea Arcangeli
@ 2004-12-24 20:55     ` David S. Miller
  2004-12-24 21:25       ` William Lee Irwin III
  2004-12-24 23:41     ` Linus Torvalds
  2004-12-25  0:06     ` VM fixes [4/4] Mitchell Blank Jr
  2 siblings, 1 reply; 21+ messages in thread
From: David S. Miller @ 2004-12-24 20:55 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, tglx, akpm

On Fri, 24 Dec 2004 19:22:19 +0100
Andrea Arcangeli <andrea@suse.de> wrote:

> On Fri, Dec 24, 2004 at 10:01:47AM -0800, David S. Miller wrote:
> > On Fri, 24 Dec 2004 18:41:56 +0100
> > Andrea Arcangeli <andrea@suse.de> wrote:
> > 
> > > + * All archs should support atomic ops with
> > > + * 1 byte granularity.
> > > + */
> > > +	unsigned char memdie;
> > 
> > Again, older Alpha's do not.
> 
> If those old cpus really supported smp in linux, then fixing this bit is
> trivial, just change it to short. Do they support short at least?

No, they do not.  The smallest atomic unit is one 32-bit word.
And yes there are SMP systems using these chips.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 20:55     ` David S. Miller
@ 2004-12-24 21:25       ` William Lee Irwin III
  2004-12-24 23:52         ` William Lee Irwin III
  2004-12-24 23:55         ` David S. Miller
  0 siblings, 2 replies; 21+ messages in thread
From: William Lee Irwin III @ 2004-12-24 21:25 UTC (permalink / raw)
  To: David S. Miller; +Cc: Andrea Arcangeli, linux-kernel, tglx, akpm

On Fri, 24 Dec 2004 19:22:19 +0100 Andrea Arcangeli <andrea@suse.de> wrote:
>> If those old cpus really supported smp in linux, then fixing this bit is
>> trivial, just change it to short. Do they support short at least?

On Fri, Dec 24, 2004 at 12:55:04PM -0800, David S. Miller wrote:
> No, they do not.  The smallest atomic unit is one 32-bit word.
> And yes there are SMP systems using these chips.

Would systems described as ev56 by /proc/cpuinfo have such chips?


-- wli

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 17:41 VM fixes [4/4] Andrea Arcangeli
  2004-12-24 18:01 ` David S. Miller
@ 2004-12-24 23:32 ` Linus Torvalds
  2004-12-27 13:38 ` Rik van Riel
  2 siblings, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2004-12-24 23:32 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Kernel Mailing List, Thomas Gleixner, Andrew Morton



On Fri, 24 Dec 2004, Andrea Arcangeli wrote:
>
>  /*
> + * All archs should support atomic ops with
> + * 1 byte granularity.
> + */
> +	unsigned char memdie;

This simply fundamentally isn't true, last I looked.

At least older alphas do _not_ support atomic byte accesses, and if you
want atomic accesses you need to either use the defined smp-atomic
functions (ie things like the bit set operations), or you need to use 
"int", which afaik all architectures _do_ support atomic accesses to. 

		Linus

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 18:22   ` Andrea Arcangeli
  2004-12-24 20:55     ` David S. Miller
@ 2004-12-24 23:41     ` Linus Torvalds
  2004-12-25  2:27       ` Andrea Arcangeli
  2004-12-25  0:06     ` VM fixes [4/4] Mitchell Blank Jr
  2 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2004-12-24 23:41 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: David S. Miller, linux-kernel, tglx, akpm



On Fri, 24 Dec 2004, Andrea Arcangeli wrote:
> 
> If those old cpus really supported smp in linux, then fixing this bit is
> trivial, just change it to short. Do they support short at least?

It's not even about SMP. "byte" and "short" are not IRQ-safe or even 
preemption-safe (although I guess alpha doesn't support CONFIG_PREEMPT 
right now anyway) on pre-byte-access alphas.

Just don't do it. Maybe we'll never see another chip try what alpha did 
(it was arguably the single biggest mistake the early alphas had, and 
caused tons of system design trouble), but just use an "int".

That said, I'd suggest putting it in the thread structure instead. We 
already have thread-safe flags there, just use one of the bits. Yes, 
you'll need to use locked accesses to set it, but hey, how often does 
something like this get set anyway? And then you just do it _right_, using 
set_thread_flag/clear_thread_flag etc..

		Linus

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 21:25       ` William Lee Irwin III
@ 2004-12-24 23:52         ` William Lee Irwin III
  2004-12-24 23:55         ` David S. Miller
  1 sibling, 0 replies; 21+ messages in thread
From: William Lee Irwin III @ 2004-12-24 23:52 UTC (permalink / raw)
  To: David S. Miller; +Cc: Andrea Arcangeli, linux-kernel, tglx, akpm

On Fri, 24 Dec 2004 19:22:19 +0100 Andrea Arcangeli <andrea@suse.de> wrote:
>>> If those old cpus really supported smp in linux, then fixing this bit is
>>> trivial, just change it to short. Do they support short at least?

On Fri, Dec 24, 2004 at 12:55:04PM -0800, David S. Miller wrote:
>> No, they do not.  The smallest atomic unit is one 32-bit word.
>> And yes there are SMP systems using these chips.

On Fri, Dec 24, 2004 at 01:25:13PM -0800, William Lee Irwin III wrote:
> Would systems described as ev56 by /proc/cpuinfo have such chips?

I had this one in particular in mind:

# cat /proc/cpuinfo
cpu                     : Alpha
cpu model               : EV56
cpu variation           : 7
cpu revision            : 0
cpu serial number       :
system type             : Rawhide
system variation        : Tincup
system revision         : 0
system serial number    : NI93009695
cycle frequency [Hz]    : 532819266 est.
timer frequency [Hz]    : 1200.00
page size [bytes]       : 8192
phys. address bits      : 40
max. addr. space #      : 127
BogoMIPS                : 910.04
kernel unaligned acc    : 29 (pc=fffffc00003158a8,va=fffffc004dafbd51)
user unaligned acc      : 0 (pc=0,va=0)
platform string         : AlphaServer 1200 5/533 4MB
cpus detected           : 2
cpus active             : 2
cpu active mask         : 0000000000000003
L1 Icache               : 8K, 1-way, 32b line
L1 Dcache               : 8K, 1-way, 32b line
L2 cache                : 96K, 3-way, 64b line
L3 cache                : 4096K, 1-way, 64b line
#


-- wli

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 21:25       ` William Lee Irwin III
  2004-12-24 23:52         ` William Lee Irwin III
@ 2004-12-24 23:55         ` David S. Miller
  1 sibling, 0 replies; 21+ messages in thread
From: David S. Miller @ 2004-12-24 23:55 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: andrea, linux-kernel, tglx, akpm

On Fri, 24 Dec 2004 13:25:13 -0800
William Lee Irwin III <wli@holomorphy.com> wrote:

> On Fri, 24 Dec 2004 19:22:19 +0100 Andrea Arcangeli <andrea@suse.de> wrote:
> >> If those old cpus really supported smp in linux, then fixing this bit is
> >> trivial, just change it to short. Do they support short at least?
> 
> On Fri, Dec 24, 2004 at 12:55:04PM -0800, David S. Miller wrote:
> > No, they do not.  The smallest atomic unit is one 32-bit word.
> > And yes there are SMP systems using these chips.
> 
> Would systems described as ev56 by /proc/cpuinfo have such chips?

No.  ev4 and earlier have the word or larger load/store limitation.
Only ev5 and later have byte and half-word sized load/store support.

I didn't actually know this when I read your question, so I snooped
around the asm-alpha/ headers and found the comments around the
__alpha_bwx__ ifdef checks. :-)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 18:22   ` Andrea Arcangeli
  2004-12-24 20:55     ` David S. Miller
  2004-12-24 23:41     ` Linus Torvalds
@ 2004-12-25  0:06     ` Mitchell Blank Jr
  2004-12-25  2:37       ` Andrea Arcangeli
  2 siblings, 1 reply; 21+ messages in thread
From: Mitchell Blank Jr @ 2004-12-25  0:06 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: David S. Miller, linux-kernel

Andrea Arcangeli wrote:
> > Again, older Alpha's do not.
> 
> If those old cpus really supported smp in linux,

The question isn't whether those CPUs support SMP; the question is whether
it's possible to build a kernel that supports both SMP boxes and older
CPUs.

-Mitch

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 23:41     ` Linus Torvalds
@ 2004-12-25  2:27       ` Andrea Arcangeli
  2004-12-25  3:24         ` VM fixes [PF_MEMDIE to TIF_MEMDIE] [5/4] Andrea Arcangeli
  0 siblings, 1 reply; 21+ messages in thread
From: Andrea Arcangeli @ 2004-12-25  2:27 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: David S. Miller, linux-kernel, tglx, akpm

On Fri, Dec 24, 2004 at 03:41:37PM -0800, Linus Torvalds wrote:
> 
> 
> On Fri, 24 Dec 2004, Andrea Arcangeli wrote:
> > 
> > If those old cpus really supported smp in linux, then fixing this bit is
> > trivial, just change it to short. Do they support short at least?
> 
> It's not even about SMP. "byte" and "short" are not IRQ-safe or even 
> preemption-safe (although I guess alpha doesn't support CONFIG_PREEMPT 
> right now anyway) on pre-byte-access alphas.

What I meant in this specific case being UP w/o preempt is enough to be
safe, because irq cannot modify memdie/used_math/oomadj. Only normal
kernel context can (or at most used_math can be modified by an exception
running on top of normal kernel context that we know doesn't touch
memdie/oomadj).

If these variables were to be modified from irqs then of course being UP
w/o preempt wouldn't be enough. But it was enough in this specific case.

So the only trouble here is SMP or PREEMPT.

> Just don't do it. Maybe we'll never see another chip try what alpha did 
> (it was arguably the single biggest mistake the early alphas had, and 
> caused tons of system design trouble), but just use an "int".
> 
> That said, I'd suggest putting it in the thread structure instead. We 
> already have thread-safe flags there, just use one of the bits. Yes, 
> you'll need to use locked accesses to set it, but hey, how often does 
> something like this get set anyway? And then you just do ti _right_, using 
> set_thread_flag/clear_thread_flag etc..

Actually I wonder if used_math should really become a PF_USED_MATH and
not the set_thread_flag/clear_thread_flag type of bitflag. The PF_ flags
have the property that they can only be modified by the "current" task.
But the current "short used_math" has the same requirement of the
PF_USED_MATH in this respect. So unless used_math is already racy (since
it's not being modified by locked ins), it should be correct to convert
it to a PF_ bitflag, which is not using locks.

memdie instead really should become a
set_thread_flags/clear_thread_flag (currently it's racy, while we set the
bitflag, the other cpu may be exiting already and we may be preventing
PF_EXITING or PF_DEAD to be set on a exited task with this current
race).

Note also that used_math is currently a short, it might not be a bug but
that's already misleading, it really shall be an int at the light of
your suggestions. I take the blame for making it worse (i.e. a char ;)

So my current plan is to make used_math a PF_USED_MATH, and memdie a
TIF_MEMDIE. And of course oomtaskadj an int (that one requires more than
1 bit of info ;). This change should be optimal for all archs and it
will fix the alpha arch with smp or preempt enabled on older cpus too.

I'd like to make those changes incrementally to the other patches I
already posted, so I avoid rejects fixing work (more than one patch
modified that code).

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-25  0:06     ` VM fixes [4/4] Mitchell Blank Jr
@ 2004-12-25  2:37       ` Andrea Arcangeli
  0 siblings, 0 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2004-12-25  2:37 UTC (permalink / raw)
  To: Mitchell Blank Jr; +Cc: David S. Miller, linux-kernel

On Fri, Dec 24, 2004 at 04:06:05PM -0800, Mitchell Blank Jr wrote:
> Andrea Arcangeli wrote:
> > > Again, older Alpha's do not.
> > 
> > If those old cpus really supported smp in linux,
> 
> The question isn't whether those CPUs support SMP; the question is whether
> it's possible to build a kernel that supports both SMP boxes and older
> CPUs.

If you want to support those, you must as well tell gcc with a special
flag to never do byte access. I doubt it'd be a good idea to ship a
single kernel that runs on both those ancient cpus and on the more
recent ones too.

The race was mostly theoretical even for alpha, the current PF_MEMDIE
race in mainline triggering in all x86 and all other actual
architectures, is more likely to trigger infact and it has a huge
priority compared to the alpha ev4 SMP race.

But I'm going to fix the ev4 SMP alpha cpus soon by following Linus's
suggestion to reuse the bitflag arrays we already allocated for similar
stuff.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* VM fixes [PF_MEMDIE to TIF_MEMDIE] [5/4]
  2004-12-25  2:27       ` Andrea Arcangeli
@ 2004-12-25  3:24         ` Andrea Arcangeli
  2004-12-25 14:53           ` VM fixes [->used_math to PF_USED_MATH] [6/4] Andrea Arcangeli
  0 siblings, 1 reply; 21+ messages in thread
From: Andrea Arcangeli @ 2004-12-25  3:24 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: David S. Miller, linux-kernel, tglx, akpm

On Sat, Dec 25, 2004 at 03:27:21AM +0100, Andrea Arcangeli wrote:
> So my current plan is to make used_math a PF_USED_MATH, and memdie a
> TIF_MEMDIE. And of course oomtaskadj an int (that one requires more than

Here it is the first part. This makes memdie a TIF_MEMDIE. It's
incremental with the last 4 patches (so I call this one 5/4).

From: Andrea Arcangeli <andrea@suse.de>
Subject: convert memdie to an atomic thread bitflag

memdie will not be modified by the current task, so it cannot be a
PF_MEMDIE but it must be a TIF_MEMDIE.

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- x/include/asm-alpha/thread_info.h.~1~	2004-12-04 08:55:03.000000000 +0100
+++ x/include/asm-alpha/thread_info.h	2004-12-25 03:52:04.377884048 +0100
@@ -77,6 +77,7 @@ register struct thread_info *__current_t
 #define TIF_UAC_NOPRINT		6	/* see sysinfo.h */
 #define TIF_UAC_NOFIX		7
 #define TIF_UAC_SIGBUS		8
+#define TIF_MEMDIE		9
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- x/include/asm-arm/thread_info.h.~1~	2004-12-04 08:56:31.000000000 +0100
+++ x/include/asm-arm/thread_info.h	2004-12-25 03:52:50.000000000 +0100
@@ -128,6 +128,7 @@ extern void iwmmxt_task_release(struct t
 #define TIF_SYSCALL_TRACE	8
 #define TIF_POLLING_NRFLAG	16
 #define TIF_USING_IWMMXT	17
+#define TIF_MEMDIE		18
 
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
--- x/include/asm-arm26/thread_info.h.~1~	2004-04-04 08:09:28.000000000 +0200
+++ x/include/asm-arm26/thread_info.h	2004-12-25 03:53:01.000000000 +0100
@@ -125,6 +125,7 @@ extern void free_thread_info(struct thre
 #define TIF_SYSCALL_TRACE	8
 #define TIF_USED_FPU		16
 #define TIF_POLLING_NRFLAG	17
+#define TIF_MEMDIE		18
 
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
--- x/include/asm-cris/thread_info.h.~1~	2003-07-10 19:33:07.000000000 +0200
+++ x/include/asm-cris/thread_info.h	2004-12-25 03:58:28.000000000 +0100
@@ -85,6 +85,7 @@ struct thread_info {
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		17
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- x/include/asm-h8300/thread_info.h.~1~	2004-08-25 02:47:35.000000000 +0200
+++ x/include/asm-h8300/thread_info.h	2004-12-25 03:53:45.000000000 +0100
@@ -93,6 +93,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
+#define TIF_MEMDIE		5
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- x/include/asm-i386/thread_info.h.~1~	2004-12-04 08:56:31.000000000 +0100
+++ x/include/asm-i386/thread_info.h	2004-12-25 03:54:03.000000000 +0100
@@ -141,6 +141,7 @@ register unsigned long current_stack_poi
 #define TIF_IRET		5	/* return with iret */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		17
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- x/include/asm-ia64/thread_info.h.~1~	2004-12-04 08:55:04.000000000 +0100
+++ x/include/asm-ia64/thread_info.h	2004-12-25 03:54:38.000000000 +0100
@@ -67,6 +67,7 @@ struct thread_info {
 #define TIF_SYSCALL_TRACE	3	/* syscall trace active */
 #define TIF_SYSCALL_AUDIT	4	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		17
 
 #define TIF_WORK_MASK		0x7	/* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE */
 #define TIF_ALLWORK_MASK	0x1f	/* bits 0..4 are "work to do on user-return" bits */
--- x/include/asm-m68k/thread_info.h.~1~	2004-08-25 02:47:35.000000000 +0200
+++ x/include/asm-m68k/thread_info.h	2004-12-25 03:55:32.000000000 +0100
@@ -48,6 +48,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME	2	/* resumption notification requested */
 #define TIF_SIGPENDING		3	/* signal pending */
 #define TIF_NEED_RESCHED	4	/* rescheduling necessary */
+#define TIF_MEMDIE		5
 
 extern int thread_flag_fixme(void);
 
--- x/include/asm-m68knommu/thread_info.h.~1~	2004-12-04 08:56:32.000000000 +0100
+++ x/include/asm-m68knommu/thread_info.h	2004-12-25 03:55:44.000000000 +0100
@@ -91,6 +91,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
+#define TIF_MEMDIE		5
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- x/include/asm-mips/thread_info.h.~1~	2004-12-04 08:56:32.000000000 +0100
+++ x/include/asm-mips/thread_info.h	2004-12-25 03:55:59.000000000 +0100
@@ -116,6 +116,7 @@ register struct thread_info *__current_t
 #define TIF_SYSCALL_AUDIT	4	/* syscall auditing active */
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		18
 #define TIF_SYSCALL_TRACE	31	/* syscall trace active */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- x/include/asm-parisc/thread_info.h.~1~	2004-12-04 08:56:32.000000000 +0100
+++ x/include/asm-parisc/thread_info.h	2004-12-25 03:56:10.000000000 +0100
@@ -63,6 +63,7 @@ struct thread_info {
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_32BIT               5       /* 32 bit binary */
+#define TIF_MEMDIE		6
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
--- x/include/asm-ppc/thread_info.h.~1~	2004-12-04 08:56:32.000000000 +0100
+++ x/include/asm-ppc/thread_info.h	2004-12-25 03:56:23.000000000 +0100
@@ -76,6 +76,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
+#define TIF_MEMDIE		5
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- x/include/asm-ppc64/thread_info.h.~1~	2004-12-04 08:56:32.000000000 +0100
+++ x/include/asm-ppc64/thread_info.h	2004-12-25 03:56:34.000000000 +0100
@@ -97,6 +97,7 @@ static inline struct thread_info *curren
 #define TIF_RUN_LIGHT		6	/* iSeries run light */
 #define TIF_ABI_PENDING		7	/* 32/64 bit switch needed */
 #define TIF_SYSCALL_AUDIT	8	/* syscall auditing active */
+#define TIF_MEMDIE		9
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- x/include/asm-s390/thread_info.h.~1~	2004-12-04 08:55:04.000000000 +0100
+++ x/include/asm-s390/thread_info.h	2004-12-25 03:56:45.000000000 +0100
@@ -100,6 +100,7 @@ static inline struct thread_info *curren
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling 
 					   TIF_NEED_RESCHED */
 #define TIF_31BIT		18	/* 32bit process */ 
+#define TIF_MEMDIE		19
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- x/include/asm-sh/thread_info.h.~1~	2004-12-04 08:56:32.000000000 +0100
+++ x/include/asm-sh/thread_info.h	2004-12-25 03:56:58.000000000 +0100
@@ -83,6 +83,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		18
 #define TIF_USERSPACE		31	/* true if FS sets userspace */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- x/include/asm-sh64/thread_info.h.~1~	2004-08-25 02:47:51.000000000 +0200
+++ x/include/asm-sh64/thread_info.h	2004-12-25 03:57:08.000000000 +0100
@@ -74,6 +74,7 @@ static inline struct thread_info *curren
 #define TIF_SYSCALL_TRACE	0	/* syscall trace active */
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
+#define TIF_MEMDIE		4
 
 #define THREAD_SIZE	16384
 
--- x/include/asm-sparc/thread_info.h.~1~	2004-08-25 02:47:35.000000000 +0200
+++ x/include/asm-sparc/thread_info.h	2004-12-25 03:57:18.000000000 +0100
@@ -138,6 +138,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, 
 					 * this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	9	/* true if poll_idle() is polling
 					 * TIF_NEED_RESCHED */
+#define TIF_MEMDIE		10
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- x/include/asm-sparc64/thread_info.h.~1~	2004-08-25 02:47:57.000000000 +0200
+++ x/include/asm-sparc64/thread_info.h	2004-12-25 03:57:29.000000000 +0100
@@ -228,6 +228,7 @@ register struct thread_info *current_thr
  *       an immediate value in instructions such as andcc.
  */
 #define TIF_ABI_PENDING		12
+#define TIF_MEMDIE		13
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- x/include/asm-um/thread_info.h.~1~	2004-12-04 08:56:32.000000000 +0100
+++ x/include/asm-um/thread_info.h	2004-12-25 03:57:42.000000000 +0100
@@ -71,6 +71,7 @@ static inline struct thread_info *curren
 					 * TIF_NEED_RESCHED 
 					 */
 #define TIF_RESTART_BLOCK 	4
+#define TIF_MEMDIE	 	5
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
--- x/include/asm-v850/thread_info.h.~1~	2003-06-17 11:31:42.000000000 +0200
+++ x/include/asm-v850/thread_info.h	2004-12-25 03:57:51.000000000 +0100
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
+#define TIF_MEMDIE		5
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- x/include/asm-x86_64/thread_info.h.~1~	2004-12-04 08:56:32.000000000 +0100
+++ x/include/asm-x86_64/thread_info.h	2004-12-25 03:58:00.000000000 +0100
@@ -106,6 +106,7 @@ static inline struct thread_info *stack_
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_ABI_PENDING		19
+#define TIF_MEMDIE		20
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- x/include/linux/sched.h.~1~	2004-12-25 03:49:05.883019392 +0100
+++ x/include/linux/sched.h	2004-12-25 03:59:16.000000000 +0100
@@ -601,11 +601,6 @@ struct task_struct {
 	struct key *thread_keyring;	/* keyring private to this thread */
 #endif
 /*
- * All archs should support atomic ops with
- * 1 byte granularity.
- */
-	unsigned char memdie;
-/*
  * Must be changed atomically so it shouldn't be
  * be a shareable bitflag.
  */
--- x/include/asm-m32r/thread_info.h.~1~	2004-12-04 08:55:04.000000000 +0100
+++ x/include/asm-m32r/thread_info.h	2004-12-25 03:55:23.000000000 +0100
@@ -123,6 +123,7 @@ static inline struct thread_info *curren
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		17
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- x/mm/oom_kill.c.~1~	2004-12-25 03:49:05.000000000 +0100
+++ x/mm/oom_kill.c	2004-12-25 04:16:46.300597552 +0100
@@ -151,7 +151,8 @@ static struct task_struct * select_bad_p
 			 * This is in the process of releasing memory so wait it
 			 * to finish before killing some other task by mistake.
 			 */
-			if ((p->memdie || (p->flags & PF_EXITING)) && !(p->flags & PF_DEAD))
+			if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
+			    !(p->flags & PF_DEAD))
 				return ERR_PTR(-1UL);
 			if (p->flags & PF_SWAPOFF)
 				return p;
@@ -195,7 +196,7 @@ static void __oom_kill_task(task_t *p)
 	 * exit() and clear out its resources quickly...
 	 */
 	p->time_slice = HZ;
-	p->memdie = 1;
+	set_tsk_thread_flag(p, TIF_MEMDIE);
 
 	/* This process has hardware access, be more careful. */
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
--- x/mm/page_alloc.c.~1~	2004-12-25 03:49:05.000000000 +0100
+++ x/mm/page_alloc.c	2004-12-25 04:01:41.000000000 +0100
@@ -663,7 +663,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	}
 
 	/* This allocation should allow future memory freeing. */
-	if (((p->flags & PF_MEMALLOC) || p->memdie) && !in_interrupt()) {
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; (z = zones[i]) != NULL; i++) {
 			page = buffered_rmqueue(z, order, gfp_mask);

^ permalink raw reply	[flat|nested] 21+ messages in thread

* VM fixes [->used_math to PF_USED_MATH] [6/4]
  2004-12-25  3:24         ` VM fixes [PF_MEMDIE to TIF_MEMDIE] [5/4] Andrea Arcangeli
@ 2004-12-25 14:53           ` Andrea Arcangeli
  2004-12-27  7:03             ` Andy Isaacson
  0 siblings, 1 reply; 21+ messages in thread
From: Andrea Arcangeli @ 2004-12-25 14:53 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: David S. Miller, linux-kernel, tglx, akpm

[-- Attachment #1: Type: text/plain, Size: 36181 bytes --]

On Sat, Dec 25, 2004 at 04:24:30AM +0100, Andrea Arcangeli wrote:
> Here it is the first part. This makes memdie a TIF_MEMDIE. It's

And here is the final incremental part converting ->used_math to
PF_USED_MATH.

All combined patches work for me. I'm not going to apply these last two
(5/4 and 6/4) to the stable suse tree though, I'll apply 5/4 and 6/4
only to the future ones based on 2.6.10+, since the ev4 race on
SMP/PREEMPT is not relevant for the suse tree (those last two patches
are a bit too big to take any risk for a _purely_theoretical_ race on
ev4 + SMP or ev4 + PREEMPT ;). The PF_MEMDIE was instead a more practical
race (Wli said he triggered it in practice too) and it was triggering on
all archs, not just on ev4 + SMP or ev4 + PREEMPT, that's fixed with
[1-4]/4.

The below headers are from the SUSE patch format. I attached the script
I use to generate those just in case somebody can find it useful. It's
quite generic and it avoids destroying the patch headers every time. If
you use quilt probably you don't need it. mkpatch.py works pretty fast
in combination with patch -p1 -b or alternatively in combination with
emacs autofile save.

Merry Christmas and Happy new Year to everyone, I'll be on vacations
until 2 Jan, so if there's any problem with this stuff we'll talk about
it next year ;). I just attempted to finish it before leaving.

From: Andrea Arcangeli <andrea@suse.de>
Subject: Convert the unsafe signed (16bit) used_math to a safe and optimal PF_USED_MATH

... and declare oomkilladj as an int since it can be changed via /proc.

I might have broken arm, see the very first change in the patch to
asm-offsets.c, rest looks ok at first glance.

If you want used_math to return 0 or 1 (instead of 0 or PF_USED_MATH),
just s/!!// in the below patch and place !! in sched.h::*used_math()
accordingly after applying the patch, it should work just fine. Using !!
only when necessary as the below is optimal.

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- x/arch/arm26/kernel/asm-offsets.c.~1~	2003-07-17 01:52:38.000000000 +0200
+++ x/arch/arm26/kernel/asm-offsets.c	2004-12-25 15:18:54.589979544 +0100
@@ -42,7 +42,6 @@
 
 int main(void)
 {
-  DEFINE(TSK_USED_MATH,		offsetof(struct task_struct, used_math));
   DEFINE(TSK_ACTIVE_MM,		offsetof(struct task_struct, active_mm));
   BLANK();
   DEFINE(VMA_VM_MM,		offsetof(struct vm_area_struct, vm_mm));
--- x/arch/arm26/kernel/process.c.~1~	2004-08-25 02:47:33.000000000 +0200
+++ x/arch/arm26/kernel/process.c	2004-12-25 15:18:54.591979240 +0100
@@ -296,7 +296,7 @@ void flush_thread(void)
 	memset(&tsk->thread.debug, 0, sizeof(struct debug_info));
 	memset(&thread->fpstate, 0, sizeof(union fp_state));
 
-	current->used_math = 0;
+	clear_used_math();
 }
 
 void release_thread(struct task_struct *dead_task)
@@ -330,7 +330,7 @@ copy_thread(int nr, unsigned long clone_
 int dump_fpu (struct pt_regs *regs, struct user_fp *fp)
 {
 	struct thread_info *thread = current_thread_info();
-	int used_math = current->used_math;
+	int used_math = !!used_math();
 
 	if (used_math)
 		memcpy(fp, &thread->fpstate.soft, sizeof (*fp));
--- x/arch/arm26/kernel/ptrace.c.~1~	2004-12-04 08:56:19.000000000 +0100
+++ x/arch/arm26/kernel/ptrace.c	2004-12-25 15:18:54.592979088 +0100
@@ -540,7 +540,7 @@ static int ptrace_getfpregs(struct task_
  */
 static int ptrace_setfpregs(struct task_struct *tsk, void *ufp)
 {
-	tsk->used_math = 1;
+	set_stopped_child_used_math(tsk);
 	return copy_from_user(&tsk->thread_info->fpstate, ufp,
 			      sizeof(struct user_fp)) ? -EFAULT : 0;
 }
--- x/arch/i386/kernel/cpu/common.c.~1~	2004-12-04 08:54:53.000000000 +0100
+++ x/arch/i386/kernel/cpu/common.c	2004-12-25 15:18:54.593978936 +0100
@@ -580,6 +580,6 @@ void __init cpu_init (void)
 	 * Force FPU initialization:
 	 */
 	current_thread_info()->status = 0;
-	current->used_math = 0;
+	clear_used_math();
 	mxcsr_feature_mask_init();
 }
--- x/arch/i386/kernel/i387.c.~1~	2004-12-04 08:56:19.000000000 +0100
+++ x/arch/i386/kernel/i387.c	2004-12-25 15:18:54.594978784 +0100
@@ -60,7 +60,8 @@ void init_fpu(struct task_struct *tsk)
 		tsk->thread.i387.fsave.twd = 0xffffffffu;
 		tsk->thread.i387.fsave.fos = 0xffff0000u;
 	}
-	tsk->used_math = 1;
+	/* only the device not available exception or ptrace can call init_fpu */
+	set_stopped_child_used_math(tsk);
 }
 
 /*
@@ -330,13 +331,13 @@ static int save_i387_fxsave( struct _fps
 
 int save_i387( struct _fpstate __user *buf )
 {
-	if ( !current->used_math )
+	if ( !used_math() )
 		return 0;
 
 	/* This will cause a "finit" to be triggered by the next
 	 * attempted FPU operation by the 'current' process.
 	 */
-	current->used_math = 0;
+	clear_used_math();
 
 	if ( HAVE_HWFP ) {
 		if ( cpu_has_fxsr ) {
@@ -382,7 +383,7 @@ int restore_i387( struct _fpstate __user
 	} else {
 		err = restore_i387_soft( &current->thread.i387.soft, buf );
 	}
-	current->used_math = 1;
+	set_used_math();
 	return err;
 }
 
@@ -506,7 +507,7 @@ int dump_fpu( struct pt_regs *regs, stru
 	int fpvalid;
 	struct task_struct *tsk = current;
 
-	fpvalid = tsk->used_math;
+	fpvalid = !!used_math();
 	if ( fpvalid ) {
 		unlazy_fpu( tsk );
 		if ( cpu_has_fxsr ) {
@@ -521,7 +522,7 @@ int dump_fpu( struct pt_regs *regs, stru
 
 int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
 {
-	int fpvalid = tsk->used_math;
+	int fpvalid = !!tsk_used_math(tsk);
 
 	if (fpvalid) {
 		if (tsk == current)
@@ -536,7 +537,7 @@ int dump_task_fpu(struct task_struct *ts
 
 int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
 {
-	int fpvalid = tsk->used_math && cpu_has_fxsr;
+	int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
 
 	if (fpvalid) {
 		if (tsk == current)
--- x/arch/i386/kernel/process.c.~1~	2004-12-04 08:56:19.000000000 +0100
+++ x/arch/i386/kernel/process.c	2004-12-25 15:18:54.596978480 +0100
@@ -333,7 +333,7 @@ void flush_thread(void)
 	 * Forget coprocessor state..
 	 */
 	clear_fpu(tsk);
-	tsk->used_math = 0;
+	clear_used_math();
 }
 
 void release_thread(struct task_struct *dead_task)
--- x/arch/i386/kernel/ptrace.c.~1~	2004-12-04 08:56:19.000000000 +0100
+++ x/arch/i386/kernel/ptrace.c	2004-12-25 15:18:54.597978328 +0100
@@ -491,7 +491,7 @@ asmlinkage int sys_ptrace(long request, 
 			break;
 		}
 		ret = 0;
-		if (!child->used_math)
+		if (!tsk_used_math(child))
 			init_fpu(child);
 		get_fpregs((struct user_i387_struct __user *)data, child);
 		break;
@@ -503,7 +503,7 @@ asmlinkage int sys_ptrace(long request, 
 			ret = -EIO;
 			break;
 		}
-		child->used_math = 1;
+		set_stopped_child_used_math(child);
 		set_fpregs(child, (struct user_i387_struct __user *)data);
 		ret = 0;
 		break;
@@ -515,7 +515,7 @@ asmlinkage int sys_ptrace(long request, 
 			ret = -EIO;
 			break;
 		}
-		if (!child->used_math)
+		if (!tsk_used_math(child))
 			init_fpu(child);
 		ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
 		break;
@@ -527,7 +527,7 @@ asmlinkage int sys_ptrace(long request, 
 			ret = -EIO;
 			break;
 		}
-		child->used_math = 1;
+		set_stopped_child_used_math(child);
 		ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
 		break;
 	}
--- x/arch/i386/kernel/traps.c.~1~	2004-12-04 08:56:19.000000000 +0100
+++ x/arch/i386/kernel/traps.c	2004-12-25 15:18:54.598978176 +0100
@@ -920,7 +920,7 @@ asmlinkage void math_state_restore(struc
 	struct task_struct *tsk = thread->task;
 
 	clts();		/* Allow maths ops (or we recurse) */
-	if (!tsk->used_math)
+	if (!tsk_used_math(tsk))
 		init_fpu(tsk);
 	restore_fpu(tsk);
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
--- x/arch/i386/math-emu/fpu_entry.c.~1~	2004-08-25 02:47:49.000000000 +0200
+++ x/arch/i386/math-emu/fpu_entry.c	2004-12-25 15:18:54.599978024 +0100
@@ -155,10 +155,10 @@ asmlinkage void math_emulate(long arg)
   RE_ENTRANT_CHECK_ON;
 #endif /* RE_ENTRANT_CHECKING */
 
-  if (!current->used_math)
+  if (!used_math())
     {
       finit();
-      current->used_math = 1;
+      set_used_math();
     }
 
   SETUP_DATA_AREA(arg);
--- x/arch/ia64/ia32/elfcore32.h.~1~	2004-12-04 08:56:19.000000000 +0100
+++ x/arch/ia64/ia32/elfcore32.h	2004-12-25 15:18:54.600977872 +0100
@@ -106,7 +106,7 @@ elf_core_copy_task_fpregs(struct task_st
 	struct ia32_user_i387_struct *fpstate = (void*)fpu;
 	mm_segment_t old_fs;
 
-	if (!tsk->used_math)
+	if (!tsk_used_math(tsk))
 		return 0;
 	
 	old_fs = get_fs();
@@ -124,7 +124,7 @@ elf_core_copy_task_xfpregs(struct task_s
 	struct ia32_user_fxsr_struct *fpxstate = (void*) xfpu;
 	mm_segment_t old_fs;
 
-	if (!tsk->used_math)
+	if (!tsk_used_math(tsk))
 		return 0;
 
 	old_fs = get_fs();
--- x/arch/mips/kernel/irixsig.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/mips/kernel/irixsig.c	2004-12-25 15:18:54.601977720 +0100
@@ -100,7 +100,7 @@ static void setup_irix_frame(struct k_si
 	__put_user((u64) regs->hi, &ctx->hi);
 	__put_user((u64) regs->lo, &ctx->lo);
 	__put_user((u64) regs->cp0_epc, &ctx->pc);
-	__put_user(current->used_math, &ctx->usedfp);
+	__put_user(!!used_math(), &ctx->usedfp);
 	__put_user((u64) regs->cp0_cause, &ctx->cp0_cause);
 	__put_user((u64) regs->cp0_badvaddr, &ctx->cp0_badvaddr);
 
@@ -728,7 +728,7 @@ asmlinkage int irix_getcontext(struct pt
 	__put_user(regs->cp0_epc, &ctx->regs[35]);
 
 	flags = 0x0f;
-	if(!current->used_math) {
+	if(!used_math()) {
 		flags &= ~(0x08);
 	} else {
 		/* XXX wheee... */
--- x/arch/mips/kernel/process.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/mips/kernel/process.c	2004-12-25 15:18:54.602977568 +0100
@@ -76,7 +76,7 @@ void start_thread(struct pt_regs * regs,
 #endif
 	status |= KU_USER;
 	regs->cp0_status = status;
-	current->used_math = 0;
+	clear_used_math();
 	lose_fpu();
 	regs->cp0_epc = pc;
 	regs->regs[29] = sp;
--- x/arch/mips/kernel/ptrace.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/mips/kernel/ptrace.c	2004-12-25 15:18:54.603977416 +0100
@@ -119,7 +119,7 @@ asmlinkage int sys_ptrace(long request, 
 			tmp = regs->regs[addr];
 			break;
 		case FPR_BASE ... FPR_BASE + 31:
-			if (child->used_math) {
+			if (tsk_used_math(child)) {
 				fpureg_t *fregs = get_fpu_regs(child);
 
 #ifdef CONFIG_MIPS32
@@ -205,7 +205,7 @@ asmlinkage int sys_ptrace(long request, 
 		case FPR_BASE ... FPR_BASE + 31: {
 			fpureg_t *fregs = get_fpu_regs(child);
 
-			if (!child->used_math) {
+			if (!tsk_used_math(child)) {
 				/* FP not yet used  */
 				memset(&child->thread.fpu.hard, ~0,
 				       sizeof(child->thread.fpu.hard));
--- x/arch/mips/kernel/ptrace32.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/mips/kernel/ptrace32.c	2004-12-25 15:18:54.604977264 +0100
@@ -112,7 +112,7 @@ asmlinkage int sys32_ptrace(int request,
 			tmp = regs->regs[addr];
 			break;
 		case FPR_BASE ... FPR_BASE + 31:
-			if (child->used_math) {
+			if (tsk_used_math(child)) {
 				fpureg_t *fregs = get_fpu_regs(child);
 
 				/*
@@ -193,7 +193,7 @@ asmlinkage int sys32_ptrace(int request,
 		case FPR_BASE ... FPR_BASE + 31: {
 			fpureg_t *fregs = get_fpu_regs(child);
 
-			if (!child->used_math) {
+			if (!tsk_used_math(child)) {
 				/* FP not yet used  */
 				memset(&child->thread.fpu.hard, ~0,
 				       sizeof(child->thread.fpu.hard));
--- x/arch/mips/kernel/signal.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/mips/kernel/signal.c	2004-12-25 15:18:54.605977112 +0100
@@ -179,11 +179,11 @@ asmlinkage int restore_sigcontext(struct
 	restore_gp_reg(31);
 #undef restore_gp_reg
 
-	err |= __get_user(current->used_math, &sc->sc_used_math);
+	err |= __get_user(!!used_math(), &sc->sc_used_math);
 
 	preempt_disable();
 
-	if (current->used_math) {
+	if (used_math()) {
 		/* restore fpu context if we have used it before */
 		own_fpu();
 		err |= restore_fp_context(sc);
@@ -324,9 +324,9 @@ inline int setup_sigcontext(struct pt_re
 	err |= __put_user(regs->cp0_cause, &sc->sc_cause);
 	err |= __put_user(regs->cp0_badvaddr, &sc->sc_badvaddr);
 
-	err |= __put_user(current->used_math, &sc->sc_used_math);
+	err |= __put_user(!!used_math(), &sc->sc_used_math);
 
-	if (!current->used_math)
+	if (!used_math())
 		goto out;
 
 	/*
--- x/arch/mips/kernel/signal32.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/mips/kernel/signal32.c	2004-12-25 15:18:54.606976960 +0100
@@ -361,11 +361,11 @@ static asmlinkage int restore_sigcontext
 	restore_gp_reg(31);
 #undef restore_gp_reg
 
-	err |= __get_user(current->used_math, &sc->sc_used_math);
+	err |= __get_user(!!used_math(), &sc->sc_used_math);
 
 	preempt_disable();
 
-	if (current->used_math) {
+	if (used_math()) {
 		/* restore fpu context if we have used it before */
 		own_fpu();
 		err |= restore_fp_context32(sc);
@@ -552,9 +552,9 @@ static inline int setup_sigcontext32(str
 	err |= __put_user(regs->cp0_cause, &sc->sc_cause);
 	err |= __put_user(regs->cp0_badvaddr, &sc->sc_badvaddr);
 
-	err |= __put_user(current->used_math, &sc->sc_used_math);
+	err |= __put_user(!!used_math(), &sc->sc_used_math);
 
-	if (!current->used_math)
+	if (!used_math())
 		goto out;
 
 	/* 
--- x/arch/mips/kernel/traps.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/mips/kernel/traps.c	2004-12-25 15:18:54.608976656 +0100
@@ -655,11 +655,11 @@ asmlinkage void do_cpu(struct pt_regs *r
 		preempt_disable();
 
 		own_fpu();
-		if (current->used_math) {	/* Using the FPU again.  */
+		if (used_math()) {	/* Using the FPU again.  */
 			restore_fp(current);
 		} else {			/* First time FPU user.  */
 			init_fpu();
-			current->used_math = 1;
+			set_used_math();
 		}
 
 		if (!cpu_has_fpu) {
--- x/arch/s390/kernel/process.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/s390/kernel/process.c	2004-12-25 15:18:54.609976504 +0100
@@ -216,8 +216,7 @@ void exit_thread(void)
 
 void flush_thread(void)
 {
-
-        current->used_math = 0;
+	clear_used_math();
 	clear_tsk_thread_flag(current, TIF_USEDFPU);
 }
 
--- x/arch/s390/kernel/setup.c.~1~	2004-12-04 08:54:54.000000000 +0100
+++ x/arch/s390/kernel/setup.c	2004-12-25 15:18:54.610976352 +0100
@@ -96,7 +96,7 @@ void __devinit cpu_init (void)
          * Force FPU initialization:
          */
         clear_thread_flag(TIF_USEDFPU);
-        current->used_math = 0;
+        clear_used_math();
 
         /* Setup active_mm for idle_task  */
         atomic_inc(&init_mm.mm_count);
--- x/arch/sh/kernel/cpu/sh4/fpu.c.~1~	2004-02-20 17:26:36.000000000 +0100
+++ x/arch/sh/kernel/cpu/sh4/fpu.c	2004-12-25 15:18:54.611976200 +0100
@@ -323,13 +323,13 @@ do_fpu_state_restore(unsigned long r4, u
 		return;
 	}
 
-	if (tsk->used_math) {
+	if (used_math()) {
 		/* Using the FPU again.  */
 		restore_fpu(tsk);
 	} else	{
 		/* First time FPU user.  */
 		fpu_init();
-		tsk->used_math = 1;
+		set_used_math();
 	}
 	set_tsk_thread_flag(tsk, TIF_USEDFPU);
 }
--- x/arch/sh/kernel/cpu/init.c.~1~	2004-12-04 08:56:20.000000000 +0100
+++ x/arch/sh/kernel/cpu/init.c	2004-12-25 15:18:54.611976200 +0100
@@ -194,7 +194,7 @@ asmlinkage void __init sh_cpu_init(void)
 	/* FPU initialization */
 	if ((cpu_data->flags & CPU_HAS_FPU)) {
 		clear_thread_flag(TIF_USEDFPU);
-		current->used_math = 0;
+		clear_used_math();
 	}
 
 #ifdef CONFIG_SH_DSP
--- x/arch/sh/kernel/process.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/sh/kernel/process.c	2004-12-25 15:18:54.612976048 +0100
@@ -208,7 +208,7 @@ void flush_thread(void)
 
 	/* Forget lazy FPU state */
 	clear_fpu(tsk, regs);
-	tsk->used_math = 0;
+	clear_used_math();
 #endif
 }
 
@@ -225,7 +225,7 @@ int dump_fpu(struct pt_regs *regs, elf_f
 #if defined(CONFIG_SH_FPU)
 	struct task_struct *tsk = current;
 
-	fpvalid = tsk->used_math;
+	fpvalid = !!tsk_used_math(tsk);
 	if (fpvalid) {
 		unlazy_fpu(tsk, regs);
 		memcpy(fpu, &tsk->thread.fpu.hard, sizeof(*fpu));
@@ -260,7 +260,7 @@ dump_task_fpu (struct task_struct *tsk, 
 	int fpvalid = 0;
 
 #if defined(CONFIG_SH_FPU)
-	fpvalid = tsk->used_math;
+	fpvalid = !!tsk_used_math(tsk);
 	if (fpvalid) {
 		struct pt_regs *regs = (struct pt_regs *)
 					((unsigned long)tsk->thread_info
@@ -286,7 +286,7 @@ int copy_thread(int nr, unsigned long cl
 
 	unlazy_fpu(tsk, regs);
 	p->thread.fpu = tsk->thread.fpu;
-	p->used_math = tsk->used_math;
+	copy_to_stopped_child_used_math(p);
 #endif
 
 	childregs = ((struct pt_regs *)
--- x/arch/sh/kernel/ptrace.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/sh/kernel/ptrace.c	2004-12-25 15:18:54.613975896 +0100
@@ -150,7 +150,7 @@ asmlinkage int sys_ptrace(long request, 
 			tmp = get_stack_long(child, addr);
 		else if (addr >= (long) &dummy->fpu &&
 			 addr < (long) &dummy->u_fpvalid) {
-			if (!child->used_math) {
+			if (!tsk_used_math(child)) {
 				if (addr == (long)&dummy->fpu.fpscr)
 					tmp = FPSCR_INIT;
 				else
@@ -159,7 +159,7 @@ asmlinkage int sys_ptrace(long request, 
 				tmp = ((long *)&child->thread.fpu)
 					[(addr - (long)&dummy->fpu) >> 2];
 		} else if (addr == (long) &dummy->u_fpvalid)
-			tmp = child->used_math;
+			tmp = !!tsk_used_math(child);
 		else
 			tmp = 0;
 		ret = put_user(tmp, (unsigned long *)data);
@@ -185,12 +185,12 @@ asmlinkage int sys_ptrace(long request, 
 			ret = put_stack_long(child, addr, data);
 		else if (addr >= (long) &dummy->fpu &&
 			 addr < (long) &dummy->u_fpvalid) {
-			child->used_math = 1;
+			set_stopped_child_used_math(child);
 			((long *)&child->thread.fpu)
 				[(addr - (long)&dummy->fpu) >> 2] = data;
 			ret = 0;
 		} else if (addr == (long) &dummy->u_fpvalid) {
-			child->used_math = data?1:0;
+			conditional_stopped_child_used_math(data, child);
 			ret = 0;
 		}
 		break;
--- x/arch/sh/kernel/signal.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/sh/kernel/signal.c	2004-12-25 15:18:54.614975744 +0100
@@ -163,7 +163,7 @@ static inline int restore_sigcontext_fpu
 	if (!(cpu_data->flags & CPU_HAS_FPU))
 		return 0;
 
-	tsk->used_math = 1;
+	set_used_math();
 	return __copy_from_user(&tsk->thread.fpu.hard, &sc->sc_fpregs[0],
 				sizeof(long)*(16*2+2));
 }
@@ -176,7 +176,7 @@ static inline int save_sigcontext_fpu(st
 	if (!(cpu_data->flags & CPU_HAS_FPU))
 		return 0;
 
-	if (!tsk->used_math) {
+	if (!used_math()) {
 		__put_user(0, &sc->sc_ownedfp);
 		return 0;
 	}
@@ -186,7 +186,7 @@ static inline int save_sigcontext_fpu(st
 	/* This will cause a "finit" to be triggered by the next
 	   attempted FPU operation by the 'current' process.
 	   */
-	tsk->used_math = 0;
+	clear_used_math();
 
 	unlazy_fpu(tsk, regs);
 	return __copy_to_user(&sc->sc_fpregs[0], &tsk->thread.fpu.hard,
@@ -220,7 +220,7 @@ restore_sigcontext(struct pt_regs *regs,
 
 		regs->sr |= SR_FD; /* Release FPU */
 		clear_fpu(tsk, regs);
-		tsk->used_math = 0;
+		clear_used_math();
 		__get_user (owned_fp, &sc->sc_ownedfp);
 		if (owned_fp)
 			err |= restore_sigcontext_fpu(sc);
--- x/arch/sh64/kernel/fpu.c.~1~	2004-08-25 02:47:49.000000000 +0200
+++ x/arch/sh64/kernel/fpu.c	2004-12-25 15:18:54.615975592 +0100
@@ -158,12 +158,12 @@ do_fpu_state_restore(unsigned long ex, s
 		fpsave(&last_task_used_math->thread.fpu.hard);
         }
         last_task_used_math = current;
-        if (current->used_math) {
+        if (used_math()) {
                 fpload(&current->thread.fpu.hard);
         } else {
 		/* First time FPU user.  */
 		fpload(&init_fpuregs.hard);
-                current->used_math = 1;
+                set_used_math();
         }
 	release_fpu();
 }
--- x/arch/sh64/kernel/process.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/sh64/kernel/process.c	2004-12-25 15:18:54.616975440 +0100
@@ -688,7 +688,7 @@ void flush_thread(void)
 		last_task_used_math = NULL;
 	}
 	/* Force FPU state to be reinitialised after exec */
-	current->used_math = 0;
+	clear_used_math();
 #endif
 
 	/* if we are a kernel thread, about to change to user thread,
@@ -713,7 +713,7 @@ int dump_fpu(struct pt_regs *regs, elf_f
 	int fpvalid;
 	struct task_struct *tsk = current;
 
-	fpvalid = tsk->used_math;
+	fpvalid = !!tsk_used_math(tsk);
 	if (fpvalid) {
 		if (current == last_task_used_math) {
 			grab_fpu();
--- x/arch/sh64/kernel/ptrace.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/sh64/kernel/ptrace.c	2004-12-25 15:18:54.617975288 +0100
@@ -63,7 +63,7 @@ get_fpu_long(struct task_struct *task, u
 	struct pt_regs *regs;
 	regs = (struct pt_regs*)((unsigned char *)task + THREAD_SIZE) - 1;
 
-	if (!task->used_math) {
+	if (!tsk_used_math(task)) {
 		if (addr == offsetof(struct user_fpu_struct, fpscr)) {
 			tmp = FPSCR_INIT;
 		} else {
@@ -105,9 +105,9 @@ put_fpu_long(struct task_struct *task, u
 
 	regs = (struct pt_regs*)((unsigned char *)task + THREAD_SIZE) - 1;
 
-	if (!task->used_math) {
+	if (!tsk_used_math(task)) {
 		fpinit(&task->thread.fpu.hard);
-		task->used_math = 1;
+		set_stopped_child_used_math(task);
 	} else if (last_task_used_math == task) {
 		grab_fpu();
 		fpsave(&task->thread.fpu.hard);
@@ -187,7 +187,7 @@ asmlinkage int sys_ptrace(long request, 
 			 (addr <  offsetof(struct user, u_fpvalid))) {
 			tmp = get_fpu_long(child, addr - offsetof(struct user, fpu));
 		} else if (addr == offsetof(struct user, u_fpvalid)) {
-			tmp = child->used_math;
+			tmp = !!tsk_used_math(child);
 		} else {
 			break;
 		}
--- x/arch/sh64/kernel/signal.c.~1~	2004-12-04 08:54:54.000000000 +0100
+++ x/arch/sh64/kernel/signal.c	2004-12-25 15:18:54.618975136 +0100
@@ -186,7 +186,7 @@ restore_sigcontext_fpu(struct pt_regs *r
 	int fpvalid;
 
 	err |= __get_user (fpvalid, &sc->sc_fpvalid);
-	current->used_math = fpvalid;
+	conditional_used_math(fpvalid);
 	if (! fpvalid)
 		return err;
 
@@ -207,7 +207,7 @@ setup_sigcontext_fpu(struct pt_regs *reg
 	int err = 0;
 	int fpvalid;
 
-	fpvalid = current->used_math;
+	fpvalid = !!used_math();
 	err |= __put_user(fpvalid, &sc->sc_fpvalid);
 	if (! fpvalid)
 		return err;
@@ -222,7 +222,7 @@ setup_sigcontext_fpu(struct pt_regs *reg
 
 	err |= __copy_to_user(&sc->sc_fpregs[0], &current->thread.fpu.hard,
 			      (sizeof(long long) * 32) + (sizeof(int) * 1));
-	current->used_math = 0;
+	clear_used_math();
 
 	return err;
 }
--- x/arch/sparc/kernel/process.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/sparc/kernel/process.c	2004-12-25 15:18:54.619974984 +0100
@@ -602,7 +602,7 @@ void dump_thread(struct pt_regs * regs, 
  */
 int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs)
 {
-	if (current->used_math == 0) {
+	if (!used_math()) {
 		memset(fpregs, 0, sizeof(*fpregs));
 		fpregs->pr_q_entrysize = 8;
 		return 1;
--- x/arch/sparc/kernel/signal.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/sparc/kernel/signal.c	2004-12-25 15:18:54.621974680 +0100
@@ -202,7 +202,7 @@ restore_fpu_state(struct pt_regs *regs, 
 		regs->psr &= ~PSR_EF;
 	}
 #endif
-	current->used_math = 1;
+	set_used_math();
 	clear_tsk_thread_flag(current, TIF_USEDFPU);
 
 	if (verify_area(VERIFY_READ, fpu, sizeof(*fpu)))
@@ -584,7 +584,7 @@ save_fpu_state(struct pt_regs *regs, __s
 				      &current->thread.fpqueue[0],
 				      ((sizeof(unsigned long) +
 				      (sizeof(unsigned long *)))*16));
-	current->used_math = 0;
+	clear_used_math();
 	return err;
 }
 
@@ -599,7 +599,7 @@ new_setup_frame(struct k_sigaction *ka, 
 	synchronize_user_stack();
 
 	sigframe_size = NF_ALIGNEDSZ;
-	if (!current->used_math)
+	if (!used_math())
 		sigframe_size -= sizeof(__siginfo_fpu_t);
 
 	sf = (struct new_signal_frame __user *)
@@ -616,7 +616,7 @@ new_setup_frame(struct k_sigaction *ka, 
 	
 	err |= __put_user(0, &sf->extra_size);
 
-	if (current->used_math) {
+	if (used_math()) {
 		err |= save_fpu_state(regs, &sf->fpu_state);
 		err |= __put_user(&sf->fpu_state, &sf->fpu_save);
 	} else {
@@ -677,7 +677,7 @@ new_setup_rt_frame(struct k_sigaction *k
 
 	synchronize_user_stack();
 	sigframe_size = RT_ALIGNEDSZ;
-	if (!current->used_math)
+	if (!used_math())
 		sigframe_size -= sizeof(__siginfo_fpu_t);
 	sf = (struct rt_signal_frame __user *)
 		get_sigframe(&ka->sa, regs, sigframe_size);
@@ -690,7 +690,7 @@ new_setup_rt_frame(struct k_sigaction *k
 	err |= __put_user(regs->npc, &sf->regs.npc);
 	err |= __put_user(regs->y, &sf->regs.y);
 	psr = regs->psr;
-	if (current->used_math)
+	if (used_math())
 		psr |= PSR_EF;
 	err |= __put_user(psr, &sf->regs.psr);
 	err |= __copy_to_user(&sf->regs.u_regs, regs->u_regs, sizeof(regs->u_regs));
--- x/arch/sparc/kernel/traps.c.~1~	2004-08-25 02:47:49.000000000 +0200
+++ x/arch/sparc/kernel/traps.c	2004-12-25 15:18:54.622974528 +0100
@@ -246,17 +246,17 @@ void do_fpd_trap(struct pt_regs *regs, u
 		       &fptask->thread.fpqueue[0], &fptask->thread.fpqdepth);
 	}
 	last_task_used_math = current;
-	if(current->used_math) {
+	if(used_math()) {
 		fpload(&current->thread.float_regs[0], &current->thread.fsr);
 	} else {
 		/* Set initial sane state. */
 		fpload(&init_fregs[0], &init_fsr);
-		current->used_math = 1;
+		set_used_math();
 	}
 #else
-	if(!current->used_math) {
+	if(!used_math()) {
 		fpload(&init_fregs[0], &init_fsr);
-		current->used_math = 1;
+		set_used_math();
 	} else {
 		fpload(&current->thread.float_regs[0], &current->thread.fsr);
 	}
--- x/arch/x86_64/ia32/fpu32.c.~1~	2004-08-25 02:47:33.000000000 +0200
+++ x/arch/x86_64/ia32/fpu32.c	2004-12-25 15:18:54.623974376 +0100
@@ -156,7 +156,7 @@ int restore_i387_ia32(struct task_struct
 				     sizeof(struct i387_fxsave_struct)))
 			return -1;
 		tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
-		tsk->used_math = 1;
+		set_stopped_child_used_math(tsk);
 	} 
 	return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
 }  
--- x/arch/x86_64/ia32/ia32_binfmt.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/x86_64/ia32/ia32_binfmt.c	2004-12-25 15:18:54.624974224 +0100
@@ -214,7 +214,7 @@ elf_core_copy_task_fpregs(struct task_st
 	struct _fpstate_ia32 *fpstate = (void*)fpu; 
 	mm_segment_t oldfs = get_fs();
 
-	if (!tsk->used_math) 
+	if (!tsk_used_math(tsk)) 
 		return 0;
 	if (!regs)
 		regs = (struct pt_regs *)tsk->thread.rsp0;
@@ -235,7 +235,7 @@ static inline int 
 elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
 {
 	struct pt_regs *regs = ((struct pt_regs *)(t->thread.rsp0))-1; 
-	if (!t->used_math) 
+	if (!tsk_used_math(t)) 
 		return 0;
 	if (t == current)
 		unlazy_fpu(t); 
--- x/arch/x86_64/ia32/ia32_signal.c.~1~	2004-12-04 08:54:54.000000000 +0100
+++ x/arch/x86_64/ia32/ia32_signal.c	2004-12-25 15:18:54.625974072 +0100
@@ -383,7 +383,7 @@ ia32_setup_sigcontext(struct sigcontext_
 	if (tmp < 0)
 	  err = -EFAULT;
 	else { 
-		current->used_math = 0;
+		clear_used_math();
 		stts();
 	  err |= __put_user((u32)(u64)(tmp ? fpstate : NULL), &sc->fpstate);
 	}
--- x/arch/x86_64/ia32/ptrace32.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/x86_64/ia32/ptrace32.c	2004-12-25 15:18:54.625974072 +0100
@@ -358,7 +358,7 @@ asmlinkage long sys32_ptrace(long reques
 			break;
 		/* no checking to be bug-to-bug compatible with i386 */
 		__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u));
-		child->used_math = 1;
+		set_stopped_child_used_math(child);
 		child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
 		ret = 0; 
 		break;
--- x/arch/x86_64/kernel/i387.c.~1~	2004-08-25 02:47:33.000000000 +0200
+++ x/arch/x86_64/kernel/i387.c	2004-12-25 15:18:54.626973920 +0100
@@ -57,12 +57,12 @@ void __init fpu_init(void)
 	mxcsr_feature_mask_init();
 	/* clean state in init */
 	current_thread_info()->status = 0;
-	current->used_math = 0;
+	clear_used_math();
 }
 
 void init_fpu(struct task_struct *child)
 {
-	if (child->used_math) { 
+	if (tsk_used_math(child)) { 
 		if (child == current)
 			unlazy_fpu(child);
 		return;
@@ -70,7 +70,8 @@ void init_fpu(struct task_struct *child)
 	memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
 	child->thread.i387.fxsave.cwd = 0x37f;
 	child->thread.i387.fxsave.mxcsr = 0x1f80;
-	child->used_math = 1;
+	/* only the device not available exception or ptrace can call init_fpu */
+	set_stopped_child_used_math(child);
 }
 
 /*
@@ -91,9 +92,9 @@ int save_i387(struct _fpstate __user *bu
 	if ((unsigned long)buf % 16) 
 		printk("save_i387: bad fpstate %p\n",buf); 
 
-	if (!tsk->used_math) 
+	if (!used_math()) 
 		return 0;
-	tsk->used_math = 0; /* trigger finit */ 
+	clear_used_math(); /* trigger finit */ 
 	if (tsk->thread_info->status & TS_USEDFPU) {
 		err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
 		if (err) return err;
@@ -133,7 +134,7 @@ int dump_fpu( struct pt_regs *regs, stru
 {
 	struct task_struct *tsk = current;
 
-	if (!tsk->used_math) 
+	if (!used_math()) 
 		return 0;
 
 	unlazy_fpu(tsk);
@@ -143,7 +144,7 @@ int dump_fpu( struct pt_regs *regs, stru
 
 int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
 {
-	int fpvalid = tsk->used_math;
+	int fpvalid = !!tsk_used_math(tsk);
 
 	if (fpvalid) {
 		if (tsk == current)
--- x/arch/x86_64/kernel/process.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/x86_64/kernel/process.c	2004-12-25 15:18:54.627973768 +0100
@@ -297,7 +297,7 @@ void flush_thread(void)
 	 * Forget coprocessor state..
 	 */
 	clear_fpu(tsk);
-	tsk->used_math = 0;
+	clear_used_math();
 }
 
 void release_thread(struct task_struct *dead_task)
--- x/arch/x86_64/kernel/ptrace.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/x86_64/kernel/ptrace.c	2004-12-25 15:18:54.628973616 +0100
@@ -480,7 +480,7 @@ asmlinkage long sys_ptrace(long request,
 			ret = -EIO;
 			break;
 		}
-		child->used_math = 1;
+		set_stopped_child_used_math(child);
 		ret = set_fpregs(child, (struct user_i387_struct __user *)data);
 		break;
 	}
--- x/arch/x86_64/kernel/signal.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/x86_64/kernel/signal.c	2004-12-25 15:18:54.629973464 +0100
@@ -246,7 +246,7 @@ static void setup_rt_frame(int sig, stru
 	int err = 0;
 	struct task_struct *me = current;
 
-	if (me->used_math) {
+	if (used_math()) {
 		fp = get_stack(ka, regs, sizeof(struct _fpstate)); 
 		frame = (void __user *)round_down((unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
 
--- x/arch/x86_64/kernel/traps.c.~1~	2004-12-04 08:56:21.000000000 +0100
+++ x/arch/x86_64/kernel/traps.c	2004-12-25 15:18:54.630973312 +0100
@@ -894,7 +894,7 @@ asmlinkage void math_state_restore(void)
 	struct task_struct *me = current;
 	clts();			/* Allow maths ops (or we recurse) */
 
-	if (!me->used_math)
+	if (!used_math())
 		init_fpu(me);
 	restore_fpu_checking(&me->thread.i387.fxsave);
 	me->thread_info->status |= TS_USEDFPU;
--- x/arch/m32r/kernel/ptrace.c.~1~	2004-12-04 08:56:19.000000000 +0100
+++ x/arch/m32r/kernel/ptrace.c	2004-12-25 15:22:37.271126864 +0100
@@ -130,7 +130,7 @@ static int ptrace_read_user(struct task_
 #ifndef NO_FPU
 		else if (off >= (long)(&dummy->fpu >> 2) &&
 			 off < (long)(&dummy->u_fpvalid >> 2)) {
-			if (!tsk->used_math) {
+			if (!tsk_used_math(tsk)) {
 				if (off == (long)(&dummy->fpu.fpscr >> 2))
 					tmp = FPSCR_INIT;
 				else
@@ -139,7 +139,7 @@ static int ptrace_read_user(struct task_
 				tmp = ((long *)(&tsk->thread.fpu >> 2))
 					[off - (long)&dummy->fpu];
 		} else if (off == (long)(&dummy->u_fpvalid >> 2))
-			tmp = tsk->used_math;
+			tmp = !!tsk_used_math(tsk);
 #endif /* not NO_FPU */
 		else
 			tmp = 0;
@@ -187,12 +187,12 @@ static int ptrace_write_user(struct task
 #ifndef NO_FPU
 		else if (off >= (long)(&dummy->fpu >> 2) &&
 			 off < (long)(&dummy->u_fpvalid >> 2)) {
-			tsk->used_math = 1;
+			set_stopped_child_used_math(tsk);
 			((long *)&tsk->thread.fpu)
 				[off - (long)&dummy->fpu] = data;
 			ret = 0;
 		} else if (off == (long)(&dummy->u_fpvalid >> 2)) {
-			tsk->used_math = data ? 1 : 0;
+			conditional_stopped_child_used_math(data, tsk);
 			ret = 0;
 		}
 #endif /* not NO_FPU */
--- x/arch/m32r/kernel/setup.c.~1~	2004-12-04 08:54:53.000000000 +0100
+++ x/arch/m32r/kernel/setup.c	2004-12-25 15:18:54.632973008 +0100
@@ -389,7 +389,7 @@ void __init cpu_init (void)
 
 	/* Force FPU initialization */
 	current_thread_info()->status = 0;
-	current->used_math = 0;
+	clear_used_math();
 
 #ifdef CONFIG_MMU
 	/* Set up MMU */
--- x/include/asm-arm26/constants.h.~1~	2003-06-08 18:21:42.000000000 +0200
+++ x/include/asm-arm26/constants.h	2004-12-25 15:18:54.633972856 +0100
@@ -7,7 +7,6 @@
  *
  */
 
-#define TSK_USED_MATH 788 /* offsetof(struct task_struct, used_math) */
 #define TSK_ACTIVE_MM 96 /* offsetof(struct task_struct, active_mm) */
 
 #define VMA_VM_MM 0 /* offsetof(struct vm_area_struct, vm_mm) */
--- x/include/asm-x86_64/i387.h.~1~	2004-12-04 08:55:04.000000000 +0100
+++ x/include/asm-x86_64/i387.h	2004-12-25 15:18:54.633972856 +0100
@@ -25,16 +25,6 @@ extern void mxcsr_feature_mask_init(void
 extern void init_fpu(struct task_struct *child);
 extern int save_i387(struct _fpstate __user *buf);
 
-static inline int need_signal_i387(struct task_struct *me) 
-{ 
-	if (!me->used_math)
-		return 0;
-	me->used_math = 0; 
-	if (me->thread_info->status & TS_USEDFPU)
-		return 0;
-	return 1;
-} 
-
 /*
  * FPU lazy state save handling...
  */
--- x/include/linux/sched.h.~1~	2004-12-25 03:59:16.000000000 +0100
+++ x/include/linux/sched.h	2004-12-25 15:18:54.635972552 +0100
@@ -600,19 +600,7 @@ struct task_struct {
 	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */
 	struct key *thread_keyring;	/* keyring private to this thread */
 #endif
-/*
- * Must be changed atomically so it shouldn't be
- * be a shareable bitflag.
- */
-	unsigned char used_math;
-/*
- * OOM kill score adjustment (bit shift).
- * Cannot live together with used_math since
- * used_math and oomkilladj can be changed at the
- * same time, so they would race if they're in the
- * same atomic block.
- */
-	short oomkilladj;
+	int oomkilladj; /* OOM kill score adjustment (bit shift). */
 	char comm[16];
 /* file system info */
 	int link_count, total_link_count;
@@ -674,7 +662,7 @@ struct task_struct {
 	wait_queue_t *io_wait;
 #ifdef CONFIG_NUMA
   	struct mempolicy *mempolicy;
-  	short il_next;		/* could be shared with used_math */
+  	short il_next;
 #endif
 };
 
@@ -716,7 +704,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
 #define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
-
+#define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
 #define PF_FREEZE	0x00004000	/* this task should be frozen for suspend */
 #define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
 #define PF_FROZEN	0x00010000	/* frozen for system suspend */
@@ -727,6 +715,31 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 
+/*
+ * Only the _current_ task can read/write to tsk->flags, but other
+ * tasks can access tsk->flags in readonly mode for example
+ * with tsk_used_math (like during threaded core dumping).
+ * There is however an exception to this rule during ptrace
+ * or during fork: the ptracer task is allowed to write to the
+ * child->flags of its traced child (same goes for fork, the parent
+ * can write to the child->flags), because we're guaranteed the
+ * child is not running and in turn not changing child->flags
+ * at the same time the parent does it.
+ */
+#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
+#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
+#define clear_used_math() clear_stopped_child_used_math(current)
+#define set_used_math() set_stopped_child_used_math(current)
+#define conditional_stopped_child_used_math(condition, child) \
+	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
+#define conditional_used_math(condition) \
+	conditional_stopped_child_used_math(condition, current)
+#define copy_to_stopped_child_used_math(child) \
+	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
+/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
+#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
+#define used_math() tsk_used_math(current)
+
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
 #else

[-- Attachment #2: mkpatch.py --]
[-- Type: text/plain, Size: 11509 bytes --]

#!/usr/bin/env python

# Copyright (C) 2004 Andrea Arcangeli <andrea@suse.de> SUSE
# $Id: mkpatch.py,v 1.17 2004/12/04 04:22:09 andrea Exp $

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# You can copy or symlink this script into ~/bin and
# your ~/.signedoffby file should contain a string like this:
# "Signed-off-by: Andrea Arcangeli <andrea@suse.de>"

# Usage is intuitive like this:
#	./mkpatch.py # without parameter search the backup in current dir
#	./mkpatch.py dir2 # this search the backups in dir2
#	./mkpatch.py dir1 dir2
#	./mkpatch.py dir2 destination-patchfile
#	./mkpatch.py dir1 dir2 destination-patchfile
#	./mkpatch.py destination-patchfile # this will only parse patchfile

# There are three options: -n, -s, -a (alias respectively to
# --no-signoff, --signoff and --acked). If you're only rediffing
# the patch you can use '-n' to avoid altering the signoff list.
# If you're instead only reviewing the patch you can use '-a'
# to add an Acked-by, instead of a Signed-off-by. You can use
# bash alias with bash 'alias mkpatch.py=mkpatch.py -a' if you
# only review patches, or you can use -n instead if you only
# regenerate patches without even reviewing them. You can always
# force a signoff by using -a or -s. The last option mode overrides
# any previous signoff mode. The default is '-s' (aka '--signoff').

# If you miss the ~/.signedoffby file, '-n' (aka '--no-signoff')
# behaviour will be forced.

import sys, os, re, readline, getopt
from rfc822 import Message

# Metadata headers recognized at the top of a patch file; they are parsed
# via rfc822.Message and re-emitted (in this order) when the patch is
# rewritten by patch_class.write().
TAGS = (
	'From',
	'Subject',
	'Patch-mainline',
	'References',
	)

# diff invocation used both to generate patches and to recognize (and strip)
# stray "diff ..." command lines inside an existing patch (see cleanup_patch).
DIFF_CMD = 'diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids --exclude .svn'
# File holding the user's own "Signed-off-by: Name <email>" line.
SIGNOFF_FILE = '~/.signedoffby'

class signoff_mode_class(object):
	"""Tracks which attribution line generated patches receive:
	a Signed-off-by (mode 0), nothing (mode 1), or an Acked-by
	(mode 2). Also remembers the user's own signoff identity."""
	signedoffby = 'Signed-off-by: '
	ackedby = 'Acked-by: '

	def __init__(self):
		self.mode = 0		# default: signing off
		self.my_signoff = None	# our own "Name <email>", if known

	def signoff(self):
		self.mode = 0

	def no_signoff(self):
		self.mode = 1

	def acked(self):
		self.mode = 2

	def is_acked(self):
		return self.mode == 2

	def is_signingoff(self):
		return self.mode == 0

	def is_enabled(self):
		# Enabled means some attribution line will be emitted.
		return self.is_acked() or self.is_signingoff()

	def change_prefix(self, prefix, signoff):
		# Only our own identity gets its prefix rewritten, and only
		# when a signoff/acked mode is active; everyone else keeps
		# whatever prefix they already had.
		if self.my_signoff is not None and signoff == self.my_signoff:
			if self.is_signingoff():
				return self.signedoffby
			if self.is_acked():
				return self.ackedby
		return prefix

class tag_class(object):
	"""One metadata header (e.g. "Subject") together with its
	parsed or interactively entered value."""

	def __init__(self, name):
		self.name = name
		self.regexp = re.compile(name + r': (.*)', re.I)
		self.value = ''

	def parse(self, line, message):
		# If `line` opens this tag's header, capture its (possibly
		# folded) value and report how many physical lines it spans
		# so the caller can skip the continuation lines.
		hdr = message.isheader(line)
		if not (hdr and hdr.lower() == self.name.lower()):
			return None
		body = message.getheader(hdr)
		if not body:
			return None
		self.value = body
		return len(body.split('\n'))

	def ask_value(self):
		# Prompt repeatedly until an empty line is entered;
		# continuation lines are folded with a leading space.
		pieces = []
		while 1:
			entered = raw_input('%s: ' % self.name)
			if not entered:
				break
			pieces.append(entered)
		self.value = '\n '.join(pieces)

class patch_class(object):
	"""In-memory representation of one patch file: its metadata tags
	(From/Subject/...), free-form description, Signed-off-by/Acked-by
	lines, and the diff payload. Handles reading an existing patch,
	collecting signoffs, and rewriting the file.

	NOTE(review): Python 2 only — relies on file(), xrange, rfc822
	and a string exception ('unknown state')."""

	def __init__(self, patchfile, signoff_mode):
		self.patchfile = patchfile
		try:
			# Parse existing rfc822-style headers, if the file exists.
			self.message = Message(file(patchfile))
		except IOError:
			self.message = None
		self.signoff_mode = signoff_mode
		self.prepare()
		self.read()

	def prepare(self):
		"""Build regexps and tag parsers, seed readline history, and
		load the user's own signoff identity from SIGNOFF_FILE."""
		readline.add_history(os.path.basename(self.patchfile))
		readline.add_history('yes'); readline.add_history('no')

		my_signoff = None
		# Match e.g. "Signed-off-by: Name <user@host>" (must contain '@').
		self.re_signoff = re.compile(self.signoff_mode.signedoffby + r'(.*@.*)', re.I)
		self.re_ackedby = re.compile(self.signoff_mode.ackedby + r'(.*@.*)', re.I)
		try:
			signoff = file(os.path.expanduser(SIGNOFF_FILE)).readline()
		except IOError:
			# No ~/.signedoffby: force no-signoff mode below.
			pass
		else:
			m = self.re_signoff.search(signoff)
			if m:
				my_signoff = m.group(1)
				readline.add_history(my_signoff)

		if not my_signoff:
			self.signoff_mode.no_signoff()
		else:
			self.signoff_mode.my_signoff = my_signoff

		self.tags = []
		for tag in TAGS:
			self.tags.append(tag_class(tag))

		# Local shorthands for the two attribution prefixes.
		self.signedoffby = self.signoff_mode.signedoffby
		self.ackedby = self.signoff_mode.ackedby

	def parse_metadata(self, line):
		"""Classify one metadata line: collect it as a signoff/acked
		entry, or hand it to a tag parser. Returns the number of
		lines a matched tag value spans (so the caller can skip the
		continuation lines), or None for plain description text."""
		# grab bk metadata and convert into valid header
		m = self.re_signoff.search(line)
		prefix = self.signedoffby
		if not m:
			prefix = self.ackedby
			m = self.re_ackedby.search(line)
		if m:
			this_signoff = m.group(1)
			# Deduplicate while preserving first-seen order.
			if this_signoff not in self.signoff:
				self.signoff[this_signoff] = prefix
				self.signoff_order.append(this_signoff)
			return

		if self.message:
			for tag in self.tags:
				ret = tag.parse(line, self.message)
				if ret:
					return ret

	def read(self):
		"""Parse the patch file into metadata, signoff list and diff
		payload, using a small 3-state line classifier."""
		self.metadata = ''
		self.signoff = {}	# signoff text -> prefix ("Signed-off-by: "/"Acked-by: ")
		self.signoff_order = []	# preserves first-seen order for re-emission
		self.__payload = ''	# name-mangled to _patch_class__payload

		try:
			patch = file(self.patchfile, 'r')
		except IOError:
			# Nonexistent patch file: leave everything empty.
			pass
		else:
			# Markers that indicate the diff payload has started.
			re_index = re.compile(r'Index: .*')
			re_bk = re.compile(r'=====.*vs.*=====')
			re_diff = re.compile(r'diff .*')
			re_plus = re.compile(r'--- .*')
			re_empty = re.compile(r'^\s*$')

			re_signoff = self.re_signoff
			re_ackedby = self.re_ackedby

			# Pending blank lines; only flushed into the metadata once
			# we are past the rfc822-style headers at the top.
			emptylines = ''
			headers = 1
			state = 'is_metadata'
			while 1:
				line = patch.readline()
				if not line:
					break

				# State transitions: a diff marker switches (and stays)
				# to payload; a signoff line ends the metadata section.
				if re_diff.match(line) or re_plus.match(line) or \
				       re_index.match(line) or re_bk.match(line):
					state = 'is_payload'
				elif state == 'is_metadata' and (re_signoff.match(line) or
								 re_ackedby.match(line)):
					state = 'is_signoff'

				if state == 'is_metadata':
					if re_empty.search(line):
						emptylines += '\n'
					else:
						nr_lines = self.parse_metadata(line)
						if type(nr_lines) == int:
							# A tag value spanned nr_lines lines;
							# skip its continuation lines.
							for i in xrange(1, nr_lines):
								patch.readline()
						if not headers or not nr_lines:
							if headers:
								# First non-header line: drop blanks
								# accumulated under the headers.
								emptylines = ''
								headers = 0
							self.metadata += emptylines + line
							emptylines = ''
				elif state == 'is_signoff':
					# Same dedup logic as parse_metadata, but anchored
					# with match() rather than search().
					m = self.re_signoff.match(line)
					prefix = self.signedoffby
					if not m:
						prefix = self.ackedby
						m = self.re_ackedby.match(line)
					if m:
						this_signoff = m.group(1)
						if this_signoff not in self.signoff:
							self.signoff[this_signoff] = prefix
							self.signoff_order.append(this_signoff)
				elif state == 'is_payload':
					self.__payload += line
				else:
					# Unreachable; string exception is py2-only.
					raise 'unknown state'

	def ask_empty_tags(self):
		"""Interactively prompt for every tag still missing a value."""
		for tag in self.tags:
			if not tag.value:
				tag.ask_value()

	def get_tags(self):
		"""Render the non-empty tags as "Name: value" header lines."""
		ret = ''
		for tag in self.tags:
			if tag.value:
				ret += tag.name + ': ' + tag.value + '\n'
		return ret

	def get_signoff(self):
		"""Render the signoff block, normalizing our own entry's
		prefix and appending our signoff if it is not present."""
		ret = ''
		for signoff in self.signoff_order:
			prefix = self.signoff[signoff]
			prefix = self.signoff_mode.change_prefix(prefix, signoff)
			ret += prefix + signoff + '\n'
		my_signoff = self.signoff_mode.my_signoff
		if self.signoff_mode.is_enabled() and \
		       my_signoff and my_signoff not in self.signoff:
			# change_prefix always substitutes here (identity matches
			# and a mode is enabled), so the None placeholder is safe.
			prefix = self.signoff_mode.change_prefix(None, my_signoff)
			ret += prefix + my_signoff + '\n'
		return ret

	def write(self):
		"""Rewrite the patch file: tags, description, signoffs, diff —
		each section followed by a blank line when non-empty."""
		tags = self.get_tags()
		if tags:
			tags += '\n'
		metadata = self.metadata
		if metadata:
			metadata += '\n'
		signoff = self.get_signoff()
		if signoff:
			signoff += '\n'
		payload = self.payload
		try:
			os.unlink(self.patchfile) # handle links
		except OSError:
			pass
		file(self.patchfile, 'w').write(tags + metadata + signoff + payload)

	def get_payload(self):
		return self.__payload

	def set_payload(self, value):
		# Assigning a new diff runs it through cleanup_patch first.
		if value is not None:
			self.__payload = cleanup_patch(value)

	# payload property: read raw, write via cleanup_patch.
	payload = property(get_payload, set_payload)

def cleanup_patch(patch):
	# Drop the "diff ..." invocation lines and any empty lines from a
	# raw diff string, leaving only the hunks themselves.
	drop = re.compile(DIFF_CMD + r'.*')
	kept = []
	for text_line in patch.split('\n'):
		if text_line and not drop.match(text_line):
			kept.append(text_line + '\n')
	return ''.join(kept)

def replace_diff(diff, patchfile, signoff_mode):
	# Re-read `patchfile`, splice the freshly generated diff in as its
	# payload, prompt for any metadata tags still missing, and write
	# the merged result back out.
	target = patch_class(patchfile, signoff_mode)
	target.payload = diff
	target.ask_empty_tags()
	target.write()

def mkpatch(*args):
	"""Main driver: parse options/arguments, generate a diff (from
	backup files or between two trees) and either merge it into a
	patch file or print it.

	Raises the string 'EINVAL' (py2-only string exception, caught by
	the __main__ block) on any usage error."""
	# parse opts
	try:
		opts, args = getopt.getopt(args, 'nas', ( 'no-signoff', 'acked', 'signoff', ))
	except getopt.GetoptError:
		raise 'EINVAL'
	signoff_mode = signoff_mode_class()
	# Last signoff-mode option on the command line wins.
	for opt, arg in opts:
		if opt in ('-n', '--no-signoff', ):
			signoff_mode.no_signoff()
		elif opt in ('-a', '--acked', ):
			signoff_mode.acked()
		elif opt in ('-s', '--signoff', ):
			signoff_mode.signoff()

	# parse args
	nr_args = len(args)
	def cleanup_path(args):
		# Normalize each path and expand '~'.
		return map(os.path.normpath, map(os.path.expanduser, args))
	# Positional forms: [], [newdir], [newdir patchfile],
	# [olddir newdir patchfile] — disambiguated further below.
	if nr_args > 3:
		raise 'EINVAL'
	elif nr_args == 0:
		olddir = None
		newdir = '.'
		patchfile = None
	elif nr_args == 1:
		olddir = None
		newdir, = cleanup_path(args)
		patchfile = None
	elif nr_args == 2:
		olddir = None
		newdir, patchfile = cleanup_path(args)
	elif nr_args == 3:
		olddir, newdir, patchfile = cleanup_path(args)

	#print olddir, newdir, patchfile
	if olddir and not os.path.isdir(olddir):
		print >>sys.stderr, 'olddir must be a directory'
		raise 'EINVAL'
	elif not os.path.isdir(newdir):
		if not os.path.isfile(newdir):
			print >>sys.stderr, 'newdir must be a directory or a file'
			raise 'EINVAL'
		# A single regular-file argument is the patchfile itself
		# (./mkpatch.py destination-patchfile): just reparse it.
		olddir, newdir, patchfile = (None, None, newdir, )
	elif patchfile and os.path.isdir(patchfile):
		# Two directories given: shift them into (olddir, newdir).
		olddir = newdir
		newdir = patchfile
		patchfile = None
	#print olddir, newdir, patchfile

	diff = None
	if not olddir and newdir:
		# use backup files
		print >>sys.stderr, 'Searching backup files in %s ...' % newdir,
		# NOTE(review): newdir is interpolated unquoted into a shell
		# command — paths with spaces/metacharacters will misbehave.
		find = os.popen('find %s -type f \( -name \*~ -or -name \*.orig \) 2>/dev/null' % newdir, 'r')
		files = find.readlines()
		if files:
			print >>sys.stderr, 'done.'
		else:
			print >>sys.stderr, 'none found.'

		diff = ''
		already_diffed = {}
		for backup_f in files:
			new_f = None
			# Strip the trailing newline from the find output.
			backup_f = backup_f[:-1]
			# Derive the live file name from the backup suffix.
			if backup_f[-5:] == '.orig':
				new_f = backup_f[:-5]
			elif backup_f[-4:] == '.~1~':
				new_f = backup_f[:-4]
			elif backup_f[-1:] == '~':
				new_f = backup_f[:-1]

			if new_f:
				if not os.path.isfile(new_f):
					continue
				# Skip files already diffed via another backup suffix.
				if new_f in already_diffed:
					continue
				already_diffed[new_f] = 0
				print >>sys.stderr, 'Diffing %s...' % new_f,
				this_diff = os.popen(DIFF_CMD + ' %s %s' % (backup_f, new_f) + ' 2>/dev/null').read()
				diff += this_diff
				if this_diff:
					print >>sys.stderr, 'done.'
				else:
					print >>sys.stderr, 'unchanged.'
	elif olddir and newdir:
		# use two directories
		print >>sys.stderr, 'Creating diff between %s and %s ...' % (olddir, newdir),
		diff = os.popen(DIFF_CMD + ' %s %s' % (olddir, newdir) + ' 2>/dev/null', 'r').read()
		print >>sys.stderr, 'done.'

	if patchfile:
		# Merge the diff into the patch file, then open it in vi
		# (replaces this process; nothing runs afterwards).
		replace_diff(diff, patchfile, signoff_mode)
		os.execvp('vi', ('vi', '-c', 'set tw=72', patchfile, ))
	else:
		# No destination: print the cleaned diff to stdout.
		if diff:
			print cleanup_patch(diff),

if __name__ == '__main__':
	try:
		mkpatch(*sys.argv[1:])
	# NOTE(review): 'EINVAL' is a py2-only string exception raised by
	# mkpatch() on any usage error; any such error prints the usage text.
	except 'EINVAL':
		print >>sys.stderr, 'Usage:', sys.argv[0], \
		      '[-a|--acked] [-n|--no-signoff] [-s|--signoff] [olddir] [newdir] [patch]'

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [->used_math to PF_USED_MATH] [6/4]
  2004-12-25 14:53           ` VM fixes [->used_math to PF_USED_MATH] [6/4] Andrea Arcangeli
@ 2004-12-27  7:03             ` Andy Isaacson
  2005-01-02 15:41               ` Andrea Arcangeli
  0 siblings, 1 reply; 21+ messages in thread
From: Andy Isaacson @ 2004-12-27  7:03 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Linus Torvalds, David S. Miller, linux-kernel, tglx, akpm

On Sat, Dec 25, 2004 at 03:53:21PM +0100, Andrea Arcangeli wrote:
> only to the future ones based on 2.6.10+, since the ev4 race on
> SMP/PREEMPT is not relevant for the suse tree (those last two patches
> are a bit too big to take any risk for a _purerly_theoretical_ race on
ev4 + SMP or ev4 + PREEMPT ;). The PF_MEMDIE was instead a more practical
> race (Wli said he triggered it in practice too) and it was triggering on
all archs, not just on ev4 + SMP or ev4 + PREEMPT, that's fixed with
> [1-4]/4.

FWIW, BWX showed up in ev56.  So ev5 is also missing atomic byte
instructions, and there definitely are (were?) SMP ev5 machines
supported by Linux.

I can't find any authoritative source for that assertion, but google
supports it:
http://sources.redhat.com/ml/libc-alpha/2002-09/msg00328.html

-andy

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-24 17:41 VM fixes [4/4] Andrea Arcangeli
  2004-12-24 18:01 ` David S. Miller
  2004-12-24 23:32 ` Linus Torvalds
@ 2004-12-27 13:38 ` Rik van Riel
  2004-12-28  9:42   ` Thomas Gleixner
  2 siblings, 1 reply; 21+ messages in thread
From: Rik van Riel @ 2004-12-27 13:38 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, Thomas Gleixner, Andrew Morton

On Fri, 24 Dec 2004, Andrea Arcangeli wrote:

> --- x/mm/oom_kill.c.orig	2004-12-24 17:53:50.807536152 +0100
> +++ x/mm/oom_kill.c	2004-12-24 18:01:19.903263224 +0100
> @@ -45,18 +45,30 @@
> unsigned long badness(struct task_struct *p, unsigned long uptime)
> {

> 	/*
> +	 * Processes which fork a lot of child processes are likely
> +	 * a good choice. We add the vmsize of the childs if they
> +	 * have an own mm. This prevents forking servers to flood the
> +	 * machine with an endless amount of childs
> +	 */

I'm not sure about this one.  You'll end up killing the
parent httpd and sshd, instead of letting them hang around
so the system can recover by itself after the memory use
spike is over.

I guess it all depends on whether your OOM situation is a
spike or a deliberately caused problem...

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-27 13:38 ` Rik van Riel
@ 2004-12-28  9:42   ` Thomas Gleixner
  2005-01-02 15:51     ` Andrea Arcangeli
  0 siblings, 1 reply; 21+ messages in thread
From: Thomas Gleixner @ 2004-12-28  9:42 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Andrea Arcangeli, LKML, Andrew Morton

On Mon, 2004-12-27 at 08:38 -0500, Rik van Riel wrote:
> On Fri, 24 Dec 2004, Andrea Arcangeli wrote:
> 
> > --- x/mm/oom_kill.c.orig	2004-12-24 17:53:50.807536152 +0100
> > +++ x/mm/oom_kill.c	2004-12-24 18:01:19.903263224 +0100
> > @@ -45,18 +45,30 @@
> > unsigned long badness(struct task_struct *p, unsigned long uptime)
> > {
> 
> > 	/*
> > +	 * Processes which fork a lot of child processes are likely
> > +	 * a good choice. We add the vmsize of the childs if they
> > +	 * have an own mm. This prevents forking servers to flood the
> > +	 * machine with an endless amount of childs
> > +	 */
> 
> I'm not sure about this one.  You'll end up killing the
> parent httpd and sshd, instead of letting them hang around
> so the system can recover by itself after the memory use
> spike is over.

The selection is adding the child VM size, but the killer itself kills a
child process first, so the parent is not the one which is killed in the
first place.

tglx



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [->used_math to PF_USED_MATH] [6/4]
  2004-12-27  7:03             ` Andy Isaacson
@ 2005-01-02 15:41               ` Andrea Arcangeli
  2005-01-08 17:17                 ` William Lee Irwin III
  0 siblings, 1 reply; 21+ messages in thread
From: Andrea Arcangeli @ 2005-01-02 15:41 UTC (permalink / raw)
  To: Andy Isaacson; +Cc: Linus Torvalds, David S. Miller, linux-kernel, tglx, akpm

On Sun, Dec 26, 2004 at 11:03:09PM -0800, Andy Isaacson wrote:
> I can't find any authoritative source for that assertion, but google
> supports it:
> http://sources.redhat.com/ml/libc-alpha/2002-09/msg00328.html

This is a nice reference, thanks for the info (I personally only worked
with ev6 so I probably never run in the exact features and history of
the older chips).

All issues with ev4+smp/preempt and ev5+smp/preempt should be fixed with
the two incremental patches I posted last week that make PF_MEMDIE a
TIF_MEMDIE and used_math a PF_USED_MATH and that fixes the PF_MEMDIE
race that existed on all archs in mainline 2.6 and that AFAIK Wli even
managed to reproduce once.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2004-12-28  9:42   ` Thomas Gleixner
@ 2005-01-02 15:51     ` Andrea Arcangeli
  2005-01-02 16:44       ` Rik van Riel
  0 siblings, 1 reply; 21+ messages in thread
From: Andrea Arcangeli @ 2005-01-02 15:51 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Rik van Riel, LKML, Andrew Morton

On Tue, Dec 28, 2004 at 10:42:40AM +0100, Thomas Gleixner wrote:
> On Mon, 2004-12-27 at 08:38 -0500, Rik van Riel wrote:
> > On Fri, 24 Dec 2004, Andrea Arcangeli wrote:
> > 
> > > --- x/mm/oom_kill.c.orig	2004-12-24 17:53:50.807536152 +0100
> > > +++ x/mm/oom_kill.c	2004-12-24 18:01:19.903263224 +0100
> > > @@ -45,18 +45,30 @@
> > > unsigned long badness(struct task_struct *p, unsigned long uptime)
> > > {
> > 
> > > 	/*
> > > +	 * Processes which fork a lot of child processes are likely
> > > +	 * a good choice. We add the vmsize of the childs if they
> > > +	 * have an own mm. This prevents forking servers to flood the
> > > +	 * machine with an endless amount of childs
> > > +	 */
> > 
> > I'm not sure about this one.  You'll end up killing the
> > parent httpd and sshd, instead of letting them hang around
> > so the system can recover by itself after the memory use
> > spike is over.
> 
> The selection is adding the child VM size, but the killer itself kills a
> child process first, so the parent is not the one which is killed in the
> first place.

The other part of Thomas's change is this one:

+static struct mm_struct *oom_kill_process(task_t *p)
+{
+ 	struct mm_struct *mm;
+	struct task_struct *c;
+	struct list_head *tsk;
+
+	/* Try to kill a child first */
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_task(c);
+		if (mm)
+			return mm;
+	}
+	return oom_kill_task(p);
+}

Thomas's changes worked better than the previous code so far; they can
clearly identify forkbombs or services spread across multiple processes.
Without these changes it was trivial to fool the oom killer and lead to
sshd and other services being killed, so his changes make lots of sense
to me. It certainly seems better than the current code.

Actually the only question I asked him is why he doesn't kill the
"parent" instead of adding the above code, which is what is being
discussed here, but I was suggesting it as a good thing, not as a bad
thing. If we have a fork bomb, killing the master parent would be
optimal. What he does above by killing the children first is a lot more
conservative and I'm fine with it as well.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [4/4]
  2005-01-02 15:51     ` Andrea Arcangeli
@ 2005-01-02 16:44       ` Rik van Riel
  0 siblings, 0 replies; 21+ messages in thread
From: Rik van Riel @ 2005-01-02 16:44 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Thomas Gleixner, LKML, Andrew Morton

On Sun, 2 Jan 2005, Andrea Arcangeli wrote:

> The other part of Thomas's change is this one:

> Thomas's changes worked better than previous code so far, he can clearly
> identify forkbombs or services spread across multiple processes.

> optimal. What he does above by killing the childs first is a lot more
> conservative and I'm fine with it as well.

I like it a lot, especially when thinking about overloaded
web servers and other loads that are common but not malicious.

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: VM fixes [->used_math to PF_USED_MATH] [6/4]
  2005-01-02 15:41               ` Andrea Arcangeli
@ 2005-01-08 17:17                 ` William Lee Irwin III
  0 siblings, 0 replies; 21+ messages in thread
From: William Lee Irwin III @ 2005-01-08 17:17 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andy Isaacson, Linus Torvalds, David S. Miller, linux-kernel, tglx, akpm

On Sun, Dec 26, 2004 at 11:03:09PM -0800, Andy Isaacson wrote:
>> I can't find any authoritative source for that assertion, but google
>> supports it:
>> http://sources.redhat.com/ml/libc-alpha/2002-09/msg00328.html

On Sun, Jan 02, 2005 at 04:41:48PM +0100, Andrea Arcangeli wrote:
> This is a nice reference, thanks for the info (I personally only worked
> with ev6 so I probably never run in the exact features and history of
> the older chips).
> All issues with ev4+smp/preempt and ev5+smp/preempt should be fixed with
> the two incremental patches I posted last week that make PF_MEMDIE a
> TIF_MEMDIE and used_math a PF_USED_MATH and that fixes the PF_MEMDIE
> race that existed on all archs in mainline 2.6 and that AFIK Wli even
> managed to reproduce once.

Reproduced only once ever on x86-64, but reliably reproducible on all
other 64-bit architectures. 32-bit architectures have resource
scalability issues that make the particular exploit I wrote ineffective.


-- wli

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2005-01-08 17:18 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-12-24 17:41 VM fixes [4/4] Andrea Arcangeli
2004-12-24 18:01 ` David S. Miller
2004-12-24 18:22   ` Andrea Arcangeli
2004-12-24 20:55     ` David S. Miller
2004-12-24 21:25       ` William Lee Irwin III
2004-12-24 23:52         ` William Lee Irwin III
2004-12-24 23:55         ` David S. Miller
2004-12-24 23:41     ` Linus Torvalds
2004-12-25  2:27       ` Andrea Arcangeli
2004-12-25  3:24         ` VM fixes [PF_MEMDIE to TIF_MEMDIE] [5/4] Andrea Arcangeli
2004-12-25 14:53           ` VM fixes [->used_math to PF_USED_MATH] [6/4] Andrea Arcangeli
2004-12-27  7:03             ` Andy Isaacson
2005-01-02 15:41               ` Andrea Arcangeli
2005-01-08 17:17                 ` William Lee Irwin III
2004-12-25  0:06     ` VM fixes [4/4] Mitchell Blank Jr
2004-12-25  2:37       ` Andrea Arcangeli
2004-12-24 23:32 ` Linus Torvalds
2004-12-27 13:38 ` Rik van Riel
2004-12-28  9:42   ` Thomas Gleixner
2005-01-02 15:51     ` Andrea Arcangeli
2005-01-02 16:44       ` Rik van Riel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).