linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] oom killer (Core)
@ 2004-12-01  9:49 tglx
  2004-12-01 21:16 ` Andrea Arcangeli
                   ` (2 more replies)
  0 siblings, 3 replies; 66+ messages in thread
From: tglx @ 2004-12-01  9:49 UTC (permalink / raw)
  To: akpm; +Cc: andrea, marcelo.tosatti, linux-kernel

The oom killer has currently some strange effects when triggered.
It gets invoked multiple times and the selection of the task to kill
does not take processes into account which fork a lot of child processes.

The patch solves this by
- Preventing reentrancy
- Checking for memory threshold before selection and kill.
- Taking child processes into account when selecting the process to kill

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 oom_kill.c   |  113 ++++++++++++++++++++++++++++++++++++++++++-----------------
 page_alloc.c |    5 ++
 2 files changed, 86 insertions(+), 32 deletions(-)
---
diff -urN 2.6.10-rc2-mm4.orig/mm/oom_kill.c 2.6.10-rc2-mm4/mm/oom_kill.c
--- 2.6.10-rc2-mm4.orig/mm/oom_kill.c	2004-12-01 09:33:44.000000000 +0100
+++ 2.6.10-rc2-mm4/mm/oom_kill.c	2004-12-01 09:37:41.628396951 +0100
@@ -45,8 +45,10 @@
 static unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
+        struct list_head *tsk;
 
-	if (!p->mm)
+	/* Ignore mm-less tasks and init */
+	if (!p->mm || p->pid == 1)
 		return 0;
 
 	if (p->flags & PF_MEMDIE)
@@ -57,6 +59,19 @@
 	points = p->mm->total_vm;
 
 	/*
+	 * Processes which fork a lot of child processes are likely 
+	 * a good choice. We add the vmsize of the childs if they
+	 * have an own mm. This prevents forking servers to flood the
+	 * machine with an endless amount of childs
+	 */
+	list_for_each(tsk, &p->children) {
+		struct task_struct *chld;
+		chld = list_entry(tsk, struct task_struct, sibling);
+		if (chld->mm != p->mm && chld->mm)
+			points += chld->mm->total_vm;
+	}
+
+	/*
 	 * CPU time is in tens of seconds and run time is in thousands
          * of seconds. There is no particular reason for this other than
          * that it turned out to work very well in practice.
@@ -176,6 +191,27 @@
 	return mm;
 }
 
+static struct mm_struct *oom_kill_process(task_t *p)
+{
+	struct mm_struct *mm;
+	struct task_struct *g, *q;
+
+	mm = oom_kill_task(p);
+	if (!mm)
+		return NULL;
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group
+	 */
+	do_each_thread(g, q)
+		if (q->mm == mm && q->tgid != p->tgid)
+			__oom_kill_task(q);
+
+	while_each_thread(g, q);
+	if (!p->mm)
+		printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+	return mm;
+}
 
 /**
  * oom_kill - kill the "best" process when we run out of memory
@@ -188,7 +224,9 @@
 void oom_kill(void)
 {
 	struct mm_struct *mm;
-	struct task_struct *g, *p, *q;
+	struct task_struct *c, *p;
+	struct list_head *tsk;
+	int mmcnt = 0;
 	
 	read_lock(&tasklist_lock);
 retry:
@@ -200,21 +238,32 @@
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	mm = oom_kill_task(p);
-	if (!mm)
-		goto retry;
 	/*
-	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
+	 * Kill the forked child processes first
 	 */
-	do_each_thread(g, q)
-		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
-	while_each_thread(g, q);
-	if (!p->mm)
-		printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		/* Do not touch threads, as they get killed later */
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_process(c);
+		if (mm) {
+			mmcnt ++;
+			mmput(mm);
+		}
+	}
+
+	/*
+	 * If we managed to kill one or more child processes
+	 * then let the parent live for now
+	 */
+	if (!mmcnt) {
+		mm = oom_kill_process(p);
+		if (!mm)
+			goto retry;
+		mmput(mm);
+	}
 	read_unlock(&tasklist_lock);
-	mmput(mm);
 	return;
 }
 
@@ -224,14 +273,21 @@
 void out_of_memory(int gfp_mask)
 {
 	/*
-	 * oom_lock protects out_of_memory()'s static variables.
-	 * It's a global lock; this is not performance-critical.
-	 */
-	static DEFINE_SPINLOCK(oom_lock);
+ 	 * inprogress protects out_of_memory()'s static variables
+	 * and prevents reentrancy.
+  	 */
+ 	static unsigned long inprogress;
+ 	static unsigned int  freepages = 1000000;
 	static unsigned long first, last, count, lastkill;
 	unsigned long now, since;
 
-	spin_lock(&oom_lock);
+ 	if (test_and_set_bit(0, &inprogress))
+ 		return;
+ 	
+ 	/* Check, if memory was freed since the last oom kill */
+ 	if (freepages < nr_free_pages())
+ 		goto out_unlock;
+
 	now = jiffies;
 	since = now - last;
 	last = now;
@@ -271,12 +327,11 @@
 	 * Ok, really out of memory. Kill something.
 	 */
 	lastkill = now;
-
-	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+	printk(KERN_ERR "oom-killer: gfp_mask=0x%x\n", gfp_mask);
 	show_free_areas();
-
-	/* oom_kill() sleeps */
-	spin_unlock(&oom_lock);
+	dump_stack();
+	/* Store free pages  * 2 for the check above */
+	freepages = (nr_free_pages() << 1);
 	oom_kill();
 	/*
 	 * Make kswapd go out of the way, so "p" has a good chance of
@@ -284,17 +339,11 @@
 	 * for more memory.
 	 */
 	yield();
-	spin_lock(&oom_lock);
 
 reset:
-	/*
-	 * We dropped the lock above, so check to be sure the variable
-	 * first only ever increases to prevent false OOM's.
-	 */
-	if (time_after(now, first))
-		first = now;
+	first = jiffies;
 	count = 0;
 
 out_unlock:
-	spin_unlock(&oom_lock);
+	clear_bit(0, &inprogress);
 }
diff -urN 2.6.10-rc2-mm4.orig/mm/page_alloc.c 2.6.10-rc2-mm4/mm/page_alloc.c
--- 2.6.10-rc2-mm4.orig/mm/page_alloc.c	2004-12-01 09:33:44.000000000 +0100
+++ 2.6.10-rc2-mm4/mm/page_alloc.c	2004-12-01 09:36:55.010034604 +0100
@@ -872,6 +872,11 @@
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
+	/* Check if try_to_free_pages called out_of_memory and
+	 * if the current task is the sacrifice  */
+	if ((p->flags & PF_MEMDIE) && !in_interrupt())
+		goto nopage;
+
 	/* go through the zonelist yet one more time */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-01  9:49 [PATCH] oom killer (Core) tglx
@ 2004-12-01 21:16 ` Andrea Arcangeli
  2004-12-01 22:06   ` Thomas Gleixner
  2004-12-05  2:52 ` William Lee Irwin III
  2004-12-10 16:32 ` William Lee Irwin III
  2 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-01 21:16 UTC (permalink / raw)
  To: tglx; +Cc: akpm, marcelo.tosatti, linux-kernel

On Wed, Dec 01, 2004 at 10:49:03AM +0100, tglx@linutronix.de wrote:
> It gets invoked multiple times [..]

You didn't move the invocation in page_alloc.c which is the major bug I
can see (besides the other hacks in oom_kill.c). I'd try fixing the
major bug first.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-01 21:16 ` Andrea Arcangeli
@ 2004-12-01 22:06   ` Thomas Gleixner
  2004-12-01 22:33     ` Andrea Arcangeli
  2004-12-02  3:36     ` Andrea Arcangeli
  0 siblings, 2 replies; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-01 22:06 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, marcelo.tosatti, LKML

On Wed, 2004-12-01 at 22:16 +0100, Andrea Arcangeli wrote:
> On Wed, Dec 01, 2004 at 10:49:03AM +0100, tglx@linutronix.de wrote:
> > It gets invoked multiple times [..]
> 
> You didn't move the invocation in page_alloc.c which is the major bug I
> can see (besides the other hacks in oom_kill.c). I'd try fixing the
> major bug first.

Where do you want to move it ? 

I don't buy that moving the invocation to any place will solve the
problem of multiple invocations.

The multiple invocations happen with SMP and UP + PREEMPT, when the lock
is dropped in out_of_memory()

	spin_unlock(&oom_lock);
	oom_kill();
	spin_lock(&oom_lock);

How does moving the invocation change this ? 

tglx





^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-01 22:06   ` Thomas Gleixner
@ 2004-12-01 22:33     ` Andrea Arcangeli
  2004-12-02  3:36     ` Andrea Arcangeli
  1 sibling, 0 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-01 22:33 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, marcelo.tosatti, LKML

On Wed, Dec 01, 2004 at 11:06:07PM +0100, Thomas Gleixner wrote:
> On Wed, 2004-12-01 at 22:16 +0100, Andrea Arcangeli wrote:
> > On Wed, Dec 01, 2004 at 10:49:03AM +0100, tglx@linutronix.de wrote:
> > > It gets invoked multiple times [..]
> > 
> > You didn't move the invocation in page_alloc.c which is the major bug I
> > can see (besides the other hacks in oom_kill.c). I'd try fixing the
> > major bug first.
> 
> Where do you want to move it ? 
> 
> I don't buy that moving the invocation to any place will solve the
> problem of multiple invocations.

It has to check the levels of free memory before calling oom_kill.c and
that's the usual check that alloc_pages does.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-01 22:06   ` Thomas Gleixner
  2004-12-01 22:33     ` Andrea Arcangeli
@ 2004-12-02  3:36     ` Andrea Arcangeli
  2004-12-02 11:09       ` Thomas Gleixner
  1 sibling, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-02  3:36 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, marcelo.tosatti, LKML, Nick Piggin

Could you please test this? Perhaps it won't be as effective as the
hacks I nuked, but it fixed basic things for me in a quick test so I
like it more. It's important to use pages_high to avoid livelocks.

Thanks.

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

Index: x/mm/oom_kill.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/oom_kill.c,v
retrieving revision 1.31
diff -u -p -r1.31 oom_kill.c
--- x/mm/oom_kill.c	14 Oct 2004 04:27:49 -0000	1.31
+++ x/mm/oom_kill.c	2 Dec 2004 03:34:05 -0000
@@ -229,72 +229,8 @@ retry:
  */
 void out_of_memory(int gfp_mask)
 {
-	/*
-	 * oom_lock protects out_of_memory()'s static variables.
-	 * It's a global lock; this is not performance-critical.
-	 */
-	static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
-	static unsigned long first, last, count, lastkill;
-	unsigned long now, since;
-
-	spin_lock(&oom_lock);
-	now = jiffies;
-	since = now - last;
-	last = now;
-
-	/*
-	 * If it's been a long time since last failure,
-	 * we're not oom.
-	 */
-	if (since > 5*HZ)
-		goto reset;
-
-	/*
-	 * If we haven't tried for at least one second,
-	 * we're not really oom.
-	 */
-	since = now - first;
-	if (since < HZ)
-		goto out_unlock;
-
-	/*
-	 * If we have gotten only a few failures,
-	 * we're not really oom. 
-	 */
-	if (++count < 10)
-		goto out_unlock;
-
-	/*
-	 * If we just killed a process, wait a while
-	 * to give that task a chance to exit. This
-	 * avoids killing multiple processes needlessly.
-	 */
-	since = now - lastkill;
-	if (since < HZ*5)
-		goto out_unlock;
-
-	/*
-	 * Ok, really out of memory. Kill something.
-	 */
-	lastkill = now;
-
 	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
 	show_free_areas();
-
-	/* oom_kill() sleeps */
-	spin_unlock(&oom_lock);
+	dump_stack();
 	oom_kill();
-	spin_lock(&oom_lock);
-
-reset:
-	/*
-	 * We dropped the lock above, so check to be sure the variable
-	 * first only ever increases to prevent false OOM's.
-	 */
-	if (time_after(now, first))
-		first = now;
-	count = 0;
-
-out_unlock:
-	spin_unlock(&oom_lock);
 }
Index: x/mm/page_alloc.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/page_alloc.c,v
retrieving revision 1.236
diff -u -p -r1.236 page_alloc.c
--- x/mm/page_alloc.c	16 Nov 2004 03:53:53 -0000	1.236
+++ x/mm/page_alloc.c	2 Dec 2004 02:21:18 -0000
@@ -611,6 +611,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	int alloc_type;
 	int do_retry;
 	int can_try_harder;
+	int did_some_progress;
 
 	might_sleep_if(wait);
 
@@ -686,18 +687,19 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	try_to_free_pages(zones, gfp_mask, order);
+	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
-	/* go through the zonelist yet one more time */
+	/*
+	 * Go through the zonelist yet one more time, keep
+	 * very high watermark here, this is only to catch
+	 * a parallel oom killing, we must fail if we're still
+	 * under heavy pressure.
+	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
-		min = z->pages_min;
-		if (gfp_mask & __GFP_HIGH)
-			min /= 2;
-		if (can_try_harder)
-			min -= min / 4;
+		min = z->pages_high;
 		min += (1<<order) + z->protection[alloc_type];
 
 		if (z->free_pages < min)
@@ -708,6 +710,9 @@ rebalance:
 			goto got_pg;
 	}
 
+	if (unlikely(!did_some_progress) && (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
+		out_of_memory(gfp_mask);
+
 	/*
 	 * Don't let big-order allocations loop unless the caller explicitly
 	 * requests that.  Wait for some write requests to complete then retry.
Index: x/mm/vmscan.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/vmscan.c,v
retrieving revision 1.225
diff -u -p -r1.225 vmscan.c
--- x/mm/vmscan.c	19 Nov 2004 22:54:22 -0000	1.225
+++ x/mm/vmscan.c	2 Dec 2004 01:56:50 -0000
@@ -935,8 +935,6 @@ int try_to_free_pages(struct zone **zone
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
-	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
-		out_of_memory(gfp_mask);
 out:
 	for (i = 0; zones[i] != 0; i++)
 		zones[i]->prev_priority = zones[i]->temp_priority;

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02  3:36     ` Andrea Arcangeli
@ 2004-12-02 11:09       ` Thomas Gleixner
  2004-12-02 13:48         ` Thomas Gleixner
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-02 11:09 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, marcelo.tosatti, LKML, Nick Piggin

On Thu, 2004-12-02 at 04:36 +0100, Andrea Arcangeli wrote:
> Could you please test this? Perhaps it won't be as effective as the
> hacks I nuked, but it fixed basic things for me in a quick test so I
> like it more. It's important to use pages_high to avoid livelocks.
> 

Sorry man. The behaviour is worse than before, but I expected that :)

It kills sshd, the application, the controlling terminal and some other
innocent processes like hotplug. 

Take a decent UP machine enable PREEMPT and start # hackbench NN , where
NN is a number of processes, which exhausts the memory of the machine
and see yourself.

If there is the point where memory is exhausted you have to kill a
process _and_ wait until the memory is freed before even thinking about
another invocation of oom_kill().

There is usually more than one process on a machine which requests
memory. So as long as the first kill did not proceed to the point where
the memory is actually freed, you will call out_of_memory() from
different processes or kernel functions and make the machine less usable
than neccecary.

The oom_kill is invoked about 100 times. It ends up killing hotplug when
already 112MB memory are available. See below.

I agree, that the timed hack stuff in out_of_memory is ugly, but we need
some mechanism to avoid 

	1. reentrancy into oom_kill
	2. invocation of oom_kill until the already killed process
	   has finally released memory.

Further we need a better selection of whom to kill. 

tglx

<SNIP>
Free pages:      112360kB (0kB HighMem)
<SNIP>
 [<c0132151>] out_of_memory+0x21/0x30
 [<c0132f96>] __alloc_pages+0x2f6/0x380
 [<c013303f>] __get_free_pages+0x1f/0x40
 [<c013665f>] kmem_getpages+0x1f/0xd0
 [<c0137339>] cache_grow+0xb9/0x180
 [<c0137573>] cache_alloc_refill+0x173/0x220
 [<c01378d4>] __kmalloc+0x74/0x80
 [<c02534d7>] alloc_skb+0x47/0xf0
 [<c0252717>] sock_alloc_send_pskb+0xc7/0x1d0
 [<c025284d>] sock_alloc_send_skb+0x2d/0x40
 [<c02ad466>] unix_stream_sendmsg+0x196/0x400
 [<c012f987>] filemap_nopage+0x207/0x3a0
 [<c024fa66>] sock_aio_write+0xf6/0x120
 [<c014c967>] do_sync_write+0xb7/0xf0
 [<c016048b>] do_pollfd+0x5b/0xa0
 [<c0126f90>] autoremove_wake_function+0x0/0x60
 [<c016053a>] do_poll+0x6a/0xd0
 [<c014ca9f>] vfs_write+0xff/0x130
 [<c014cba1>] sys_write+0x51/0x80
 [<c01027cb>] syscall_call+0x7/0xb
Out of Memory: Killed process 3037 (hotplug).




^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 16:55             ` Andrew Morton
@ 2004-12-02 11:18               ` Marcelo Tosatti
  2004-12-02 17:17               ` Thomas Gleixner
  2004-12-02 18:08               ` Andrea Arcangeli
  2 siblings, 0 replies; 66+ messages in thread
From: Marcelo Tosatti @ 2004-12-02 11:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, tglx, linux-kernel, nickpiggin

On Thu, Dec 02, 2004 at 08:55:18AM -0800, Andrew Morton wrote:
> Andrea Arcangeli <andrea@suse.de> wrote:
> >
> > I believe the thing you're hiding with the callback, is some screwup in
> >  the VM. It shouldn't fire oom 300 times in a row.
> 
> Well no ;) 

I bet zone->all_unreclaimable is one of the main issues here as Andrea notes.

> Thomas, could you please put together a description of how to reproduce
> this behaviour?

A simple "fillmem" works for me - the OOM killer kills the hog and the bash 
which its being ran from. Have you tried that? 

If you fire up a few fillmem's at the same time I bet you'll see the problem in a 
greater degree.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 11:09       ` Thomas Gleixner
@ 2004-12-02 13:48         ` Thomas Gleixner
  2004-12-02 16:47           ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-02 13:48 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, marcelo.tosatti, LKML, Nick Piggin

[-- Attachment #1: Type: text/plain, Size: 1610 bytes --]

On Thu, 2004-12-02 at 12:09 +0100, Thomas Gleixner wrote:
> I agree, that the timed hack stuff in out_of_memory is ugly, but we need
> some mechanism to avoid 
> 
> 	1. reentrancy into oom_kill
> 	2. invocation of oom_kill until the already killed process
> 	   has finally released memory.
> 
> Further we need a better selection of whom to kill. 

Attached patch solves this problems more elegant than the previous
attempts and removes the ugly time, count, threshold stuff.

Reentrancy and follow up calls of oom_kill() are blocked until the task
which was killed by the first oom_kill() has actually released the
resources. I added a callback which is called from do_exit() when the
PF_MEMDIE flag is set. The reentrancy blocking is released inside the
callback.

The whom to select modification is still neccecary to prevent that
innocent processes are killed.

It now kills exactly one process and comes back to the terminal without
further damages. Also other scenarios which I used to test the oom-
killer are not showing any strange side effects.

>From the first call to out_of_memory, which activates the reentrancy
blocking until the blocking is released in the callback, out_of_memory
is called more than 300 times.

Interesting is that the patch applied to the current code in 2.6.10-rc2-
mm4, where the call to out_of_memory is still in vmscan.c, has the same
effect. 

So it's up to you VM guys to fight out from which place you want call
out_of_memory(). I don't care as both places have exactly the same bad
side effects.

My concern is to make it reliable when it is finally invoked.

tglx


[-- Attachment #2: 2.6.10-rc2.oom.diff --]
[-- Type: text/x-patch, Size: 6101 bytes --]

diff -urN 2.6.10-rc2.orig/include/linux/swap.h 2.6.10-rc2.oom/include/linux/swap.h
--- 2.6.10-rc2.orig/include/linux/swap.h	2004-12-02 13:45:55.000000000 +0100
+++ 2.6.10-rc2.oom/include/linux/swap.h	2004-12-02 14:13:16.000000000 +0100
@@ -149,6 +149,7 @@
 
 /* linux/mm/oom_kill.c */
 extern void out_of_memory(int gfp_mask);
+extern void oom_task_killed(void);
 
 /* linux/mm/memory.c */
 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
diff -urN 2.6.10-rc2.orig/kernel/exit.c 2.6.10-rc2.oom/kernel/exit.c
--- 2.6.10-rc2.orig/kernel/exit.c	2004-12-02 13:45:55.000000000 +0100
+++ 2.6.10-rc2.oom/kernel/exit.c	2004-12-02 12:13:50.000000000 +0100
@@ -820,6 +820,9 @@
 	exit_thread();
 	exit_keys(tsk);
 
+	if (current->flags & PF_MEMDIE)
+		oom_task_killed();
+
 	if (group_dead && tsk->signal->leader)
 		disassociate_ctty(1);
 
diff -urN 2.6.10-rc2.orig/mm/oom_kill.c 2.6.10-rc2.oom/mm/oom_kill.c
--- 2.6.10-rc2.orig/mm/oom_kill.c	2004-12-02 13:45:55.000000000 +0100
+++ 2.6.10-rc2.oom/mm/oom_kill.c	2004-12-02 14:22:21.000000000 +0100
@@ -45,8 +45,10 @@
 static unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
+        struct list_head *tsk;
 
-	if (!p->mm)
+	/* Ignore mm-less tasks and init */
+	if (!p->mm || p->pid == 1)
 		return 0;
 
 	if (p->flags & PF_MEMDIE)
@@ -57,6 +59,19 @@
 	points = p->mm->total_vm;
 
 	/*
+	 * Processes which fork a lot of child processes are likely 
+	 * a good choice. We add the vmsize of the childs if they
+	 * have an own mm. This prevents forking servers to flood the
+	 * machine with an endless amount of childs
+	 */
+	list_for_each(tsk, &p->children) {
+		struct task_struct *chld;
+		chld = list_entry(tsk, struct task_struct, sibling);
+		if (chld->mm != p->mm && chld->mm)
+			points += chld->mm->total_vm;
+	}
+
+	/*
 	 * CPU time is in tens of seconds and run time is in thousands
          * of seconds. There is no particular reason for this other than
          * that it turned out to work very well in practice.
@@ -175,6 +190,27 @@
 	return mm;
 }
 
+static struct mm_struct *oom_kill_process(task_t *p)
+{
+	struct mm_struct *mm;
+	struct task_struct *g, *q;
+
+	mm = oom_kill_task(p);
+	if (!mm)
+		return NULL;
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group
+	 */
+	do_each_thread(g, q)
+		if (q->mm == mm && q->tgid != p->tgid)
+			__oom_kill_task(q);
+
+	while_each_thread(g, q);
+	if (!p->mm)
+		printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+	return mm;
+}
 
 /**
  * oom_kill - kill the "best" process when we run out of memory
@@ -184,10 +220,11 @@
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-static void oom_kill(void)
+void oom_kill(void)
 {
 	struct mm_struct *mm;
-	struct task_struct *g, *p, *q;
+	struct task_struct *c, *p;
+	struct list_head *tsk;
 	
 	read_lock(&tasklist_lock);
 retry:
@@ -199,102 +236,57 @@
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	mm = oom_kill_task(p);
-	if (!mm)
-		goto retry;
 	/*
-	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
+	 * Kill the forked child processes first
 	 */
-	do_each_thread(g, q)
-		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
-	while_each_thread(g, q);
-	if (!p->mm)
-		printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
-	read_unlock(&tasklist_lock);
-	mmput(mm);
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		/* Do not touch threads, as they get killed later */
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_process(c);
+		if (mm) {
+			mmput(mm);
+			goto out;
+		}
+	}
 
 	/*
-	 * Make kswapd go out of the way, so "p" has a good chance of
-	 * killing itself before someone else gets the chance to ask
-	 * for more memory.
+	 * If we managed to kill one or more child processes
+	 * then let the parent live for now
 	 */
-	yield();
+	mm = oom_kill_process(p);
+	if (!mm)
+		goto retry;
+	mmput(mm);
+out:
+	read_unlock(&tasklist_lock);
 	return;
 }
 
+static unsigned long kill_inprogress;
+
 /**
  * out_of_memory - is the system out of memory?
  */
 void out_of_memory(int gfp_mask)
 {
-	/*
-	 * oom_lock protects out_of_memory()'s static variables.
-	 * It's a global lock; this is not performance-critical.
-	 */
-	static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
-	static unsigned long first, last, count, lastkill;
-	unsigned long now, since;
-
-	spin_lock(&oom_lock);
-	now = jiffies;
-	since = now - last;
-	last = now;
-
-	/*
-	 * If it's been a long time since last failure,
-	 * we're not oom.
-	 */
-	if (since > 5*HZ)
-		goto reset;
-
-	/*
-	 * If we haven't tried for at least one second,
-	 * we're not really oom.
-	 */
-	since = now - first;
-	if (since < HZ)
-		goto out_unlock;
-
-	/*
-	 * If we have gotten only a few failures,
-	 * we're not really oom. 
-	 */
-	if (++count < 10)
-		goto out_unlock;
-
-	/*
-	 * If we just killed a process, wait a while
-	 * to give that task a chance to exit. This
-	 * avoids killing multiple processes needlessly.
-	 */
-	since = now - lastkill;
-	if (since < HZ*5)
-		goto out_unlock;
-
-	/*
-	 * Ok, really out of memory. Kill something.
-	 */
-	lastkill = now;
-
+ 	if (test_and_set_bit(0, &kill_inprogress))
+ 		return;
+		
 	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
 	show_free_areas();
-
-	/* oom_kill() sleeps */
-	spin_unlock(&oom_lock);
+	dump_stack();
 	oom_kill();
-	spin_lock(&oom_lock);
-
-reset:
-	/*
-	 * We dropped the lock above, so check to be sure the variable
-	 * first only ever increases to prevent false OOM's.
-	 */
-	if (time_after(now, first))
-		first = now;
-	count = 0;
-
-out_unlock:
-	spin_unlock(&oom_lock);
 }
+
+/**
+ * oom_task_killed - the killed task has released the resources
+ *
+ * So we actually have freed memory. Now it's safe to reenable
+ * the oom killer.
+ */
+void oom_task_killed(void)
+{
+	clear_bit(0, &kill_inprogress);
+} 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 13:48         ` Thomas Gleixner
@ 2004-12-02 16:47           ` Andrea Arcangeli
  2004-12-02 16:55             ` Andrew Morton
  0 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-02 16:47 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, marcelo.tosatti, LKML, Nick Piggin

On Thu, Dec 02, 2004 at 02:48:00PM +0100, Thomas Gleixner wrote:
> Reentrancy and follow up calls of oom_kill() are blocked until the task
> which was killed by the first oom_kill() has actually released the
> resources. I added a callback which is called from do_exit() when the
> PF_MEMDIE flag is set. The reentrancy blocking is released inside the
> callback.

This logic will certainly fix it, and I'm not against this logic to fix
the problem in a definitive way. It's only unfortunate PF_MEMDIE is smp
racy (plus having to check for PF_MEMDIE in the fast path).

> >From the first call to out_of_memory, which activates the reentrancy
> blocking until the blocking is released in the callback, out_of_memory
> is called more than 300 times.

I believe the thing you're hiding with the callback, is some screwup in
the VM. It shouldn't fire oom 300 times in a row.

zone->all_unreclaimable and zone->pages_scanned _must_ be set to 0 by
the oom_killer invocation, did you try to fix that? 

> So it's up to you VM guys to fight out from which place you want call
> out_of_memory(). I don't care as both places have exactly the same bad
> side effects.

the reason for oom in page_alloc.c, is that you must check the free
pages levels correctly, your previous nr_free_pages() attempt was the
unreliable way to do that (it couldn't take into account all the
lowmem_reserve levels etc..).

> My concern is to make it reliable when it is finally invoked.

This approach of using a callback will sure work fine for that but the
300 times invocation of oom kill shows something is wrong in the VM, and
I believe the screwup is zone->all_unreclaimable and zone->pages_scanned
not being resetted by the oom killer invocation. I suspect if you fix
that single bit, things will start working even without the callback.

Note that in 2.4 only one task gets killed, and there's no need of any
callback in any fast path to make it work. I'm not conceptually against
the callback to "guarantee" oom-killing racing avoidance, but right now
it sounds like it's hiding some more fundamental problem in 2.6.

Thanks.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 16:47           ` Andrea Arcangeli
@ 2004-12-02 16:55             ` Andrew Morton
  2004-12-02 11:18               ` Marcelo Tosatti
                                 ` (2 more replies)
  0 siblings, 3 replies; 66+ messages in thread
From: Andrew Morton @ 2004-12-02 16:55 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: tglx, marcelo.tosatti, linux-kernel, nickpiggin

Andrea Arcangeli <andrea@suse.de> wrote:
>
> I believe the thing you're hiding with the callback, is some screwup in
>  the VM. It shouldn't fire oom 300 times in a row.

Well no ;)

Thomas, could you please put together a description of how to reproduce
this behaviour?


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 16:55             ` Andrew Morton
  2004-12-02 11:18               ` Marcelo Tosatti
@ 2004-12-02 17:17               ` Thomas Gleixner
  2004-12-02 17:27                 ` Andrew Morton
  2004-12-02 18:08               ` Andrea Arcangeli
  2 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-02 17:17 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, marcelo.tosatti, LKML, nickpiggin

On Thu, 2004-12-02 at 08:55 -0800, Andrew Morton wrote:
> Andrea Arcangeli <andrea@suse.de> wrote:
> >
> > I believe the thing you're hiding with the callback, is some screwup in
> >  the VM. It shouldn't fire oom 300 times in a row.
> 
> Well no ;)

:)

> Thomas, could you please put together a description of how to reproduce
> this behaviour?
> 

As I mentioned before. I'm using a PIII,500Mhz,128MB Machine. Kernel
compiled with PREEMPT=y.
I boot into runlevel 3 and start
# hackbench 40
from the shell.

Just adjust the number so hackbench eats up all the memory.

I have some more test cases with real applications, but they are not so
easy to reproduce. Hackbench resembles the forking server quite well.

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 17:17               ` Thomas Gleixner
@ 2004-12-02 17:27                 ` Andrew Morton
  0 siblings, 0 replies; 66+ messages in thread
From: Andrew Morton @ 2004-12-02 17:27 UTC (permalink / raw)
  To: tglx; +Cc: andrea, marcelo.tosatti, linux-kernel, nickpiggin

Thomas Gleixner <tglx@linutronix.de> wrote:
>
> > Thomas, could you please put together a description of how to reproduce
>  > this behaviour?
>  > 
> 
>  As I mentioned before. I'm using a PIII,500Mhz,128MB Machine. Kernel
>  compiled with PREEMPT=y.
>  I boot into runlevel 3 and start
>  # hackbench 40
>  from the shell.
> 
>  Just adjust the number so hackbench eats up all the memory.

How much swap space is online?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 16:55             ` Andrew Morton
  2004-12-02 11:18               ` Marcelo Tosatti
  2004-12-02 17:17               ` Thomas Gleixner
@ 2004-12-02 18:08               ` Andrea Arcangeli
  2004-12-02 18:29                 ` Andrew Morton
  2004-12-02 18:55                 ` Thomas Gleixner
  2 siblings, 2 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-02 18:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: tglx, marcelo.tosatti, linux-kernel, nickpiggin

On Thu, Dec 02, 2004 at 08:55:18AM -0800, Andrew Morton wrote:
> Andrea Arcangeli <andrea@suse.de> wrote:
> >
> > I believe the thing you're hiding with the callback, is some screwup in
> >  the VM. It shouldn't fire oom 300 times in a row.
> 
> Well no ;)

Could you explain why do we need all_unreclaimable?  What is the point
of all_unreclaimable if we bypass it at priority = 0?  Just to loop a
few times (between DEF_PRIORITY and 1) at 100% cpu load?

OTOH we must not forget 2.4(-aa) calls do_exit synchronously and it
never sends signals. That might be why 2.4 doesn't kill more than one
task by mistake, even without a callback-wakeup. So if we keep sending
signals I certainly agree with Thomas that using a callback to stop the
VM until the task is rescheduled is a good idea, and potentially it may
be even the only possible fix when the oom killer is enabled like in 2.6
(though the 300 kills in between SIGKILL and the reschedule sounds like
the VM isn't even trying anymore).  Otherwise perhaps his workload is
spawning enough tasks, that it takes an huge time for the rechedule
(that would explain it too).

Actually this should fix it too btw:

-	if (p->flags & PF_MEMDIE)
-		return 0;

Thomas can you try the above?

I'd rather fix this by removing buggy code, than by adding additional
logics on top of already buggy code (i.e. setting PF_MEMDIE is a smp
race and can corrupt other bitflags), but at least the
oom-wakeup-callback from do_exit still makes a lot of sense (even if
PF_MEMDIE must be dropped since it's buggy, and something else should be
used instead).

Whatever we change I'd like to change it on top of my last patch that
already removes the 5 seconds fixed waits, and does the right watermark
checks before killing the next task (Thomas already attempted that with
a not accurate nr_free_pages check, so he at least agrees about the need
of checking watermarks before firing up the oom killer).

BTW, checking for pid == 1 like in Thomas's patch is a must, I advocated
it for years but nobody listened yet, hope Thomas will be better at
convincing the VM mainline folks than me.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 18:08               ` Andrea Arcangeli
@ 2004-12-02 18:29                 ` Andrew Morton
  2004-12-02 19:01                   ` Thomas Gleixner
  2004-12-02 18:55                 ` Thomas Gleixner
  1 sibling, 1 reply; 66+ messages in thread
From: Andrew Morton @ 2004-12-02 18:29 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: tglx, marcelo.tosatti, linux-kernel, nickpiggin

Andrea Arcangeli <andrea@suse.de> wrote:
>
> On Thu, Dec 02, 2004 at 08:55:18AM -0800, Andrew Morton wrote:
> > Andrea Arcangeli <andrea@suse.de> wrote:
> > >
> > > I believe the thing you're hiding with the callback, is some screwup in
> > >  the VM. It shouldn't fire oom 300 times in a row.
> > 
> > Well no ;)
> 
> Could you explain why do we need all_unreclaimable?  What is the point
> of all_unreclaimable if we bypass it at priority = 0?  Just to loop a
> few times (between DEF_PRIORITY and 1) at 100% cpu load?

It addresses the situation where all of a zone's pages are pinned down for
some reason (say, they're all mlocked, or we're out of swapspace).  There's
no point in chewing tons of CPU scanning this zone on every reclaim attempt,
so the VM marks the zone as being full of unreclaimable pages.

When the zone is marked all_unreclaimable, we *logically* don't scan it at
all - just skip it.  But in practice we do need to detect the situation
where some of the zone's pages have become reclaimable again.  That could
happen because more swapspace has become available, or an app ran munlock()
or whatever.  So to perform this detection we perform tiny scans to just
poll the zone.  If that tiny scan results in a page getting reclaimed then
we clear all_reclaimable and the zone returns to normal state.

As an alternative to the tiny-scan-polling we could clear all_unreclaimable
in all zones when releasing swapspace, when running munlock(), etc.

Probably free_pages_bulk() should only be clearing all_unreclaimable if
current->reclaim_state != NULL, because random page freeings in the zone
don't necessarily mean that any pages have become reclaimable.  Or clear
all_unreclaimable in shrink_list() rather than free_pages_bulk().

> OTOH we must not forget 2.4(-aa) calls do_exit synchronously and it
> never sends signals. That might be why 2.4 doesn't kill more than one
> task by mistake, even without a callback-wakeup. So if we keep sending
> signals I certainly agree with Thomas that using a callback to stop the
> VM until the task is rescheduled is a good idea, and potentially it may
> be even the only possible fix when the oom killer is enabled like in 2.6
> (though the 300 kills in between SIGKILL and the reschedule sounds like
> the VM isn't even trying anymore).  Otherwise perhaps his workload is
> spawning enough tasks, that it takes an huge time for the rechedule
> (that would explain it too).

Could be, I dunno.  I've never done any work on the oom-killer and I tend
to regard it as a stupid thing which is only there so you can get back into
the machine to shut down and restart everything.

I mean, if you ran the machine out of memory then it is underprovisioned
and it *will* become unreliable whatever we do.  The answer is to Not Do
That.  As long as the oom-killer allows you to get in and admin the machine
later on then that's good enough as far as I'm concerned.  Others probably
disagree ;)

> Actually this should fix it too btw:
> 
> -	if (p->flags & PF_MEMDIE)
> -		return 0;
> 
> Thomas can you try the above?
> 
> I'd rather fix this by removing buggy code, than by adding additional
> logics on top of already buggy code (i.e. setting PF_MEMDIE is a smp
> race and can corrupt other bitflags),

Yes, that needs a separate task_struct field.

> but at least the
> oom-wakeup-callback from do_exit still makes a lot of sense (even if
> PF_MEMDIE must be dropped since it's buggy, and something else should be
> used instead).

Probably.

> does the right watermark checks before killing the next task

yes, we should do that.

> BTW, checking for pid == 1 like in Thomas's patch is a must, I advocated
> it for years but nobody listened yet, hope Thomas will be better at
> convincing the VM mainline folks than me.

hm.  I thought we were already doing that, but it seems not.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 18:08               ` Andrea Arcangeli
  2004-12-02 18:29                 ` Andrew Morton
@ 2004-12-02 18:55                 ` Thomas Gleixner
  2004-12-02 19:07                   ` Andrew Morton
  2004-12-02 23:35                   ` Andrea Arcangeli
  1 sibling, 2 replies; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-02 18:55 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Thu, 2004-12-02 at 19:08 +0100, Andrea Arcangeli wrote:
> OTOH we must not forget 2.4(-aa) calls do_exit synchronously and it
> never sends signals. That might be why 2.4 doesn't kill more than one
> task by mistake, even without a callback-wakeup. 

I just run the same test on 2.4.27 and the behaviour is completely
different.

The box seems to be stuck in a swap in/out loop for quite a long time.
During this time the box is not responsive at all. It finally stops the
forking after quite a long time of swapping with
fork() (error: resource temporarily not available). 

There is no output in dmesg, but I'm not able to remove the remaining
hackbench processes as even a kill -SIGKILL returns with 
fork() (error: resource temporarily not available)

I'm not sure, which of the two scenarios I like better :)

FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
box just gets stuck in an endless swap in/swap out and does not respond
to anything else than SysRq-T and the reset button.

With the callback the machine did not come back after 20 Minutes.

Without the callback the machine dies after about 10 Minutes and killing
all available processes except init with:
Kernel panic - not syncing: Out of memory and no killable processes...

> So if we keep sending
> signals I certainly agree with Thomas that using a callback to stop the
> VM until the task is rescheduled is a good idea, and potentially it may
> be even the only possible fix when the oom killer is enabled like in 2.6
> (though the 300 kills in between SIGKILL and the reschedule sounds like
> the VM isn't even trying anymore).  Otherwise perhaps his workload is
> spawning enough tasks, that it takes an huge time for the rechedule
> (that would explain it too).

Yeah, there are enough tasks and with preempt enabled more than one
tasks requests memory. That explains the repetitive calls to
out_of_memory(). This only happens on UP + PREEMPT=y and SMP. See above.

> Actually this should fix it too btw:
> 
> -	if (p->flags & PF_MEMDIE)
> -		return 0;
> 
> Thomas can you try the above?

You meant the one in badness() right ?
Well it makes it better, but I was able to have a second invocation
before the first killed tasks exited. That's simple to explain. The task
is on the way out and releases resources, so the VM size is reduced and
the killer picks another process. 

> I'd rather fix this by removing buggy code, than by adding additional
> logics on top of already buggy code (i.e. setting PF_MEMDIE is a smp
> race and can corrupt other bitflags), but at least the
> oom-wakeup-callback from do_exit still makes a lot of sense (even if
> PF_MEMDIE must be dropped since it's buggy, and something else should be
> used instead).

I think the callback is the only safe way to fix that. If PF_MEMDIE is
racy then I'm sure we will find a different indicator for that.

> Whatever we change I'd like to change it on top of my last patch that
> already removes the 5 seconds fixed waits, and does the right watermark
> checks before killing the next task (Thomas already attempted that with
> a not accurate nr_free_pages check, so he at least agrees about the need
> of checking watermarks before firing up the oom killer).

Yep, but the reentrancy blocking with the callback makes the time, count
crap and the watermark check go away, as it is safe to reenable the
killer at this point because we definitely freed memory resources. So
the watermark comes for free.

> BTW, checking for pid == 1 like in Thomas's patch is a must, I advocated
> it for years but nobody listened yet, hope Thomas will be better at
> convincing the VM mainline folks than me.

:)

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 18:29                 ` Andrew Morton
@ 2004-12-02 19:01                   ` Thomas Gleixner
  0 siblings, 0 replies; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-02 19:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, marcelo.tosatti, LKML, nickpiggin

On Thu, 2004-12-02 at 10:29 -0800, Andrew Morton wrote:
> Could be, I dunno.  I've never done any work on the oom-killer and I tend
> to regard it as a stupid thing which is only there so you can get back into
> the machine to shut down and restart everything.
> 
> I mean, if you ran the machine out of memory then it is underprovisioned
> and it *will* become unreliable whatever we do.  The answer is to Not Do
> That.  As long as the oom-killer allows you to get in and admin the machine
> later on then that's good enough as far as I'm concerned.  Others probably
> disagree ;)

I agree, but the current situation made me drive 150km, because sshd or
even init was killed.

I hit this problem, when a forking server application got out of control
because there were suddenly connecting hundreds of clients due to a
different problem.

As long as I can log into the machine and fix the crap it's ok. There's
no need for anything else than accessability and a half way
deterministic behaviour.

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 18:55                 ` Thomas Gleixner
@ 2004-12-02 19:07                   ` Andrew Morton
  2004-12-02 19:08                     ` Thomas Gleixner
  2004-12-02 23:35                   ` Andrea Arcangeli
  1 sibling, 1 reply; 66+ messages in thread
From: Andrew Morton @ 2004-12-02 19:07 UTC (permalink / raw)
  To: tglx; +Cc: andrea, marcelo.tosatti, linux-kernel, nickpiggin

Thomas Gleixner <tglx@linutronix.de> wrote:
>
> FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
> box just gets stuck in an endless swap in/swap out and does not respond
> to anything else than SysRq-T and the reset button.

There's a patch in -mm which causes the oom-killer to be invoked each time
you hit sysrq-F, which sounds like a fine idea to me.

Unfortunately it calls the oom-killer from interrupt context, and I need to
fix that up before the patch goes any further.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 19:07                   ` Andrew Morton
@ 2004-12-02 19:08                     ` Thomas Gleixner
  2004-12-02 19:22                       ` Andrew Morton
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-02 19:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: andrea, marcelo.tosatti, LKML, nickpiggin

On Thu, 2004-12-02 at 11:07 -0800, Andrew Morton wrote:
> Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
> > box just gets stuck in an endless swap in/swap out and does not respond
> > to anything else than SysRq-T and the reset button.
> 
> There's a patch in -mm which causes the oom-killer to be invoked each time
> you hit sysrq-F, which sounds like a fine idea to me.

Can you please explain, how I can hit sysrq-F when I can't log into the
remote machine ?

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 19:08                     ` Thomas Gleixner
@ 2004-12-02 19:22                       ` Andrew Morton
  2004-12-02 19:24                         ` Thomas Gleixner
  0 siblings, 1 reply; 66+ messages in thread
From: Andrew Morton @ 2004-12-02 19:22 UTC (permalink / raw)
  To: tglx; +Cc: andrea, marcelo.tosatti, linux-kernel, nickpiggin

Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Thu, 2004-12-02 at 11:07 -0800, Andrew Morton wrote:
> > Thomas Gleixner <tglx@linutronix.de> wrote:
> > >
> > > FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
> > > box just gets stuck in an endless swap in/swap out and does not respond
> > > to anything else than SysRq-T and the reset button.
> > 
> > There's a patch in -mm which causes the oom-killer to be invoked each time
> > you hit sysrq-F, which sounds like a fine idea to me.
> 
> Can you please explain, how I can hit sysrq-F when I can't log into the
> remote machine ?
> 

umm, in the same way you're using "SysRq-T and the reset button"?

You can issue sysrq commands over serial consoles too.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 19:22                       ` Andrew Morton
@ 2004-12-02 19:24                         ` Thomas Gleixner
  2004-12-02 20:11                           ` Andre Tomt
                                             ` (2 more replies)
  0 siblings, 3 replies; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-02 19:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: andrea, marcelo.tosatti, LKML, nickpiggin

On Thu, 2004-12-02 at 11:22 -0800, Andrew Morton wrote:
> Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > On Thu, 2004-12-02 at 11:07 -0800, Andrew Morton wrote:
> > > Thomas Gleixner <tglx@linutronix.de> wrote:
> > > >
> > > > FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
> > > > box just gets stuck in an endless swap in/swap out and does not respond
> > > > to anything else than SysRq-T and the reset button.
> > > 
> > > There's a patch in -mm which causes the oom-killer to be invoked each time
> > > you hit sysrq-F, which sounds like a fine idea to me.
> > 
> > Can you please explain, how I can hit sysrq-F when I can't log into the
> > remote machine ?
> > 
> 
> umm, in the same way you're using "SysRq-T and the reset button"?
> 
> You can issue sysrq commands over serial consoles too.

I know, but the console and the reset button are 150km away. When I dial
into the machine or try to connect via the network, I cannot connect
with the current kernels. Neither 2.4, because the fork fails, nor 2.6
because oom killed sshd. So I cannot send anything except a service man,
who drives 150km to hit sysrq-F or the reset button.

If this happens the kernel just should make sure to make the machine
accessible and let me repair the problem. Nothing else.

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 19:24                         ` Thomas Gleixner
@ 2004-12-02 20:11                           ` Andre Tomt
  2004-12-03 22:45                             ` Thomas Gleixner
  2004-12-02 23:47                           ` Andrea Arcangeli
  2004-12-03 14:41                           ` Helge Hafting
  2 siblings, 1 reply; 66+ messages in thread
From: Andre Tomt @ 2004-12-02 20:11 UTC (permalink / raw)
  To: tglx; +Cc: Andrew Morton, andrea, marcelo.tosatti, LKML, nickpiggin

Thomas Gleixner wrote:
> On Thu, 2004-12-02 at 11:22 -0800, Andrew Morton wrote:
>>You can issue sysrq commands over serial consoles too.
> 
> I know, but the console and the reset button are 150km away. When I dial
> into the machine or try to connect via the network, I cannot connect
> with the current kernels. Neither 2.4, because the fork fails, nor 2.6
> because oom killed sshd. So I cannot send anything except a service man,
> who drives 150km to hit sysrq-F or the reset button.

Get one of those terminal server/concentrators that export the serial 
consoles over IP. Or one of those KVM-over-IP extenders. Worth every penny.

[sorry thomas, forgot reply all first time aound, so you'll get a dupe.]

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 18:55                 ` Thomas Gleixner
  2004-12-02 19:07                   ` Andrew Morton
@ 2004-12-02 23:35                   ` Andrea Arcangeli
  2004-12-03  2:28                     ` Andrea Arcangeli
  2004-12-03 21:10                     ` Thomas Gleixner
  1 sibling, 2 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-02 23:35 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Thu, Dec 02, 2004 at 07:55:16PM +0100, Thomas Gleixner wrote:
> On Thu, 2004-12-02 at 19:08 +0100, Andrea Arcangeli wrote:
> > OTOH we must not forget 2.4(-aa) calls do_exit synchronously and it
> > never sends signals. That might be why 2.4 doesn't kill more than one
> > task by mistake, even without a callback-wakeup. 
> 
> I just run the same test on 2.4.27 and the behaviour is completely
> different.
> 
> The box seems to be stuck in a swap in/out loop for quite a long time.
> During this time the box is not responsive at all. It finally stops the
> forking after quite a long time of swapping with
> fork() (error: resource temporarily not available). 

Fork eventually failing is very reasonable if you're executing a fork
loop.

> 
> There is no output in dmesg, but I'm not able to remove the remaining
> hackbench processes as even a kill -SIGKILL returns with 
> fork() (error: resource temporarily not available)
> 
> I'm not sure, which of the two scenarios I like better :)

Please try with 2.4.23aa3, I think there was some oom killer change
after I had no resources to track 2.4 anymore. I'm not saying 2.4.23aa3
will work better though, but I would like to know if there's some corner
case still open in 2.4-aa. Careful, 2.4.23aa3 has security bugs (only
local security of course, i.e. normally not a big issue, sure good
enough for a quick test).

I doubt your testcase simulates anything remotely realistic but
anyway it's still informative.

What I'm simulating here is very real life scenario with a couple of
apps allocating more memory than ram.

> FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
> box just gets stuck in an endless swap in/swap out and does not respond
> to anything else than SysRq-T and the reset button.
> 
> With the callback the machine did not come back after 20 Minutes.

Was the oom killer invoked at all? If yes, and it works with preempt,
that could mean a cond_resched is simply missing...

> You meant the one in badness() right ?

yes.

> Well it makes it better, but I was able to have a second invocation
> before the first killed tasks exited. That's simple to explain. The task
> is on the way out and releases resources, so the VM size is reduced and
> the killer picks another process. 

That's trivial to fix checking for PF_DEAD/PF_EXITING.

> > I'd rather fix this by removing buggy code, than by adding additional
> > logics on top of already buggy code (i.e. setting PF_MEMDIE is a smp
> > race and can corrupt other bitflags), but at least the
> > oom-wakeup-callback from do_exit still makes a lot of sense (even if
> > PF_MEMDIE must be dropped since it's buggy, and something else should be
> > used instead).
> 
> I think the callback is the only safe way to fix that. If PF_MEMDIE is
> racy then I'm sure we will find a different indicator for that.

The callback adds overhead to the exit path. Plus strictly speaking it's
not actually a callback, you're just "polling" for the bitflag :)

> Yep, but the reentrancy blocking with the callback makes the time, count
> crap and the watermark check go away, as it is safe to reenable the
> killer at this point because we definitely freed memory resources. So
> the watermark comes for free.

You can get an I/O race where your program is about to finish a failing
try_to_free_pages pass (note that a task exiting won't make
try_to_free_pages work any easier, try_to_free_pages has to free
allocated memory, it doesn't care if there's 1M or 100M of free memory).
If you don't check the watermarks after waiting for I/O, you're going to
generate a suprious oom-killing. Your changes can't help.

Note that even the watermark checks leaves a race window open, but at
least it's not an I/O window. While try_to_free_pages can wait for I/O
and then fail.

I'll add to my last patch the removal of the PF_MEMDIE check in oom_kill
plus I'll fix the remaining race with PF_EXITING/DEAD, and I'll add a
cond_resched. Then you can try again with my simple way (w/ and w/o
PREEMPT ;).

Thanks for the great feedback.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 19:24                         ` Thomas Gleixner
  2004-12-02 20:11                           ` Andre Tomt
@ 2004-12-02 23:47                           ` Andrea Arcangeli
  2004-12-03 14:41                           ` Helge Hafting
  2 siblings, 0 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-02 23:47 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Thu, Dec 02, 2004 at 08:24:10PM +0100, Thomas Gleixner wrote:
> because oom killed sshd. So I cannot send anything except a service man,
> who drives 150km to hit sysrq-F or the reset button.

You'd need a little old computer next to your server with serial console
attached to it for it to be effective. But it sounds more like a last
resort and it makes economical sense only if you're not hosting your
server and you've your own server room. Normally one doesn't need to
drive 150km just because you can call somebody at the phone to click
reboot (worse than SYSRQ+F [unless it was the critical app itself
hitting a memleak], but much cheaper than hosting a serial console
server too).

The SYSRQ+F sounds more useful on a desktop usage, if you've tons and
tons of swap. Don't forget that with an huge amount of swap, you're
telling the kernel "please make my machine extremely slow if all apps
uses the ram at the same time". In many situations it may be more
efficicient for you to kill one of these apps, wait the other to finish,
and restart the killed app from scratch (to avoid trashing, and the
kernel _can't_ avoid trashing or it wouldn't be fair anymore). So if you
do a mistake and you want to recover responsiveness in less than a
second, the SYSRQ+F trick should make it.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 23:35                   ` Andrea Arcangeli
@ 2004-12-03  2:28                     ` Andrea Arcangeli
  2004-12-03 22:37                       ` Thomas Gleixner
                                         ` (2 more replies)
  2004-12-03 21:10                     ` Thomas Gleixner
  1 sibling, 3 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-03  2:28 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 03, 2004 at 12:34:59AM +0100, Andrea Arcangeli wrote:
> I'll add to my last patch the removal of the PF_MEMDIE check in oom_kill
> plus I'll fix the remaining race with PF_EXITING/DEAD, and I'll add a
> cond_resched. Then you can try again with my simple way (w/ and w/o
> PREEMPT ;).

Ok, I expect this patch to fix the problem completely. The biggest
difference is that it doesn't affect the exit fast path and it doesn't need
notification. It's not even slower than your patch since you were
really polling too. This schedule properly, if you get any PREEMPT=n
problem I'd really like to know. This is on top of my previous patch so
it does the checks for the watermarks correctly (those are not obsoleted
by whatever thing we do in out_of_memory()). This still make use of the
PF_MEMDIE info but not in kernel/, only in mm/oom_kill.c where it born
and dies there, so the race is somewhat hidden in there. As you said
converting PF_MEMDIE to a set_tsk_thread_flag or some other non-racy
thing is a due change but I'm not doing it in the below patch.

With this thing, I doubt any wrong task will ever be killed again and
you won't risk to drive 150km again (or at least not for this problem ;).
Please test and apply to mainline if you agree.

 oom_kill.c   |  158 ++++++++++++++++++++---------------------------------------
 page_alloc.c |   60 ++++++++++++++++------
 swap_state.c |    2 
 vmscan.c     |    2 
 4 files changed, 103 insertions(+), 119 deletions(-)

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

Index: x/mm/oom_kill.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/oom_kill.c,v
retrieving revision 1.31
diff -u -p -r1.31 oom_kill.c
--- x/mm/oom_kill.c	14 Oct 2004 04:27:49 -0000	1.31
+++ x/mm/oom_kill.c	3 Dec 2004 02:12:29 -0000
@@ -49,8 +49,6 @@ static unsigned long badness(struct task
 	if (!p->mm)
 		return 0;
 
-	if (p->flags & PF_MEMDIE)
-		return 0;
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
@@ -120,14 +118,25 @@ static struct task_struct * select_bad_p
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p)
-		if (p->pid) {
-			unsigned long points = badness(p, uptime.tv_sec);
+		/* skip the init task with pid == 1 */
+		if (p->pid > 1) {
+			unsigned long points;
+
+			/*
+			 * This is in the process of releasing memory so wait it
+			 * to finish before killing some other task by mistake.
+			 */
+			if ((p->flags & PF_MEMDIE) ||
+			    ((p->flags & PF_EXITING) && !(p->flags & PF_DEAD)))
+				return ERR_PTR(-1UL);
+			if (p->flags & PF_SWAPOFF)
+				return p;
+
+			points = badness(p, uptime.tv_sec);
 			if (points > maxpoints) {
 				chosen = p;
 				maxpoints = points;
 			}
-			if (p->flags & PF_SWAPOFF)
-				return p;
 		}
 	while_each_thread(g, p);
 	return chosen;
@@ -140,6 +149,12 @@ static struct task_struct * select_bad_p
  */
 static void __oom_kill_task(task_t *p)
 {
+	if (p->pid == 1) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill init!\n");
+		return;
+	}
+
 	task_lock(p);
 	if (!p->mm || p->mm == &init_mm) {
 		WARN_ON(1);
@@ -169,9 +184,25 @@ static void __oom_kill_task(task_t *p)
 static struct mm_struct *oom_kill_task(task_t *p)
 {
 	struct mm_struct *mm = get_task_mm(p);
-	if (!mm || mm == &init_mm)
+	task_t * g, * q;
+
+	if (!mm)
 		return NULL;
+	if (mm == &init_mm) {
+		mmput(mm);
+		return NULL;
+	}
+
 	__oom_kill_task(p);
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group
+	 */
+	do_each_thread(g, q)
+		if (q->mm == mm && q->tgid != p->tgid)
+			__oom_kill_task(q);
+	while_each_thread(g, q);
+
 	return mm;
 }
 
@@ -184,117 +215,40 @@ static struct mm_struct *oom_kill_task(t
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-static void oom_kill(void)
+void out_of_memory(int gfp_mask)
 {
-	struct mm_struct *mm;
-	struct task_struct *g, *p, *q;
-	
+	struct mm_struct *mm = NULL;
+	task_t * p;
+
 	read_lock(&tasklist_lock);
 retry:
 	p = select_bad_process();
 
+	if (PTR_ERR(p) == -1UL)
+		goto out;
+
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
+		read_unlock(&tasklist_lock);
 		show_free_areas();
 		panic("Out of memory and no killable processes...\n");
 	}
 
+	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+	show_free_areas();
 	mm = oom_kill_task(p);
 	if (!mm)
 		goto retry;
-	/*
-	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
-	 */
-	do_each_thread(g, q)
-		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
-	while_each_thread(g, q);
-	if (!p->mm)
-		printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
-	read_unlock(&tasklist_lock);
-	mmput(mm);
 
-	/*
-	 * Make kswapd go out of the way, so "p" has a good chance of
-	 * killing itself before someone else gets the chance to ask
-	 * for more memory.
-	 */
-	yield();
-	return;
-}
-
-/**
- * out_of_memory - is the system out of memory?
- */
-void out_of_memory(int gfp_mask)
-{
-	/*
-	 * oom_lock protects out_of_memory()'s static variables.
-	 * It's a global lock; this is not performance-critical.
-	 */
-	static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
-	static unsigned long first, last, count, lastkill;
-	unsigned long now, since;
-
-	spin_lock(&oom_lock);
-	now = jiffies;
-	since = now - last;
-	last = now;
-
-	/*
-	 * If it's been a long time since last failure,
-	 * we're not oom.
-	 */
-	if (since > 5*HZ)
-		goto reset;
-
-	/*
-	 * If we haven't tried for at least one second,
-	 * we're not really oom.
-	 */
-	since = now - first;
-	if (since < HZ)
-		goto out_unlock;
-
-	/*
-	 * If we have gotten only a few failures,
-	 * we're not really oom. 
-	 */
-	if (++count < 10)
-		goto out_unlock;
-
-	/*
-	 * If we just killed a process, wait a while
-	 * to give that task a chance to exit. This
-	 * avoids killing multiple processes needlessly.
-	 */
-	since = now - lastkill;
-	if (since < HZ*5)
-		goto out_unlock;
+ out:
+	read_unlock(&tasklist_lock);
+	if (mm)
+		mmput(mm);
 
 	/*
-	 * Ok, really out of memory. Kill something.
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory.
 	 */
-	lastkill = now;
-
-	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
-	show_free_areas();
-
-	/* oom_kill() sleeps */
-	spin_unlock(&oom_lock);
-	oom_kill();
-	spin_lock(&oom_lock);
-
-reset:
-	/*
-	 * We dropped the lock above, so check to be sure the variable
-	 * first only ever increases to prevent false OOM's.
-	 */
-	if (time_after(now, first))
-		first = now;
-	count = 0;
-
-out_unlock:
-	spin_unlock(&oom_lock);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(1);
 }
Index: x/mm/page_alloc.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/page_alloc.c,v
retrieving revision 1.236
diff -u -p -r1.236 page_alloc.c
--- x/mm/page_alloc.c	16 Nov 2004 03:53:53 -0000	1.236
+++ x/mm/page_alloc.c	3 Dec 2004 01:58:19 -0000
@@ -611,8 +611,10 @@ __alloc_pages(unsigned int gfp_mask, uns
 	int alloc_type;
 	int do_retry;
 	int can_try_harder;
+	int did_some_progress;
 
-	might_sleep_if(wait);
+	if (wait)
+		cond_resched();
 
 	/*
 	 * The caller may dip into page reserves a bit more if the caller
@@ -645,6 +647,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	for (i = 0; (z = zones[i]) != NULL; i++)
 		wakeup_kswapd(z);
 
+ restart:
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks to go deeper into reserves
@@ -681,31 +684,58 @@ __alloc_pages(unsigned int gfp_mask, uns
 		goto nopage;
 
 rebalance:
+	cond_resched();
+
 	/* We now go into synchronous reclaim */
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	try_to_free_pages(zones, gfp_mask, order);
+	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
-	/* go through the zonelist yet one more time */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		min = z->pages_min;
-		if (gfp_mask & __GFP_HIGH)
-			min /= 2;
-		if (can_try_harder)
-			min -= min / 4;
-		min += (1<<order) + z->protection[alloc_type];
+	cond_resched();
 
-		if (z->free_pages < min)
-			continue;
+	if (likely(did_some_progress)) {
+		/* go through the zonelist yet one more time */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			min = z->pages_min;
+			if (gfp_mask & __GFP_HIGH)
+				min /= 2;
+			if (can_try_harder)
+				min -= min / 4;
+			min += (1<<order) + z->protection[alloc_type];
 
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
+			if (z->free_pages < min)
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+		/*
+		 * Go through the zonelist yet one more time, keep
+		 * very high watermark here, this is only to catch
+		 * a parallel oom killing, we must fail if we're still
+		 * under heavy pressure.
+		 */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			min = z->pages_high;
+			min += (1<<order) + z->protection[alloc_type];
+
+			if (z->free_pages < min)
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+
+		out_of_memory(gfp_mask);
+		goto restart;
 	}
 
 	/*
Index: x/mm/swap_state.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/swap_state.c,v
retrieving revision 1.79
diff -u -p -r1.79 swap_state.c
--- x/mm/swap_state.c	29 Oct 2004 20:21:14 -0000	1.79
+++ x/mm/swap_state.c	3 Dec 2004 01:48:22 -0000
@@ -59,6 +59,8 @@ void show_swap_cache_info(void)
 		swap_cache_info.add_total, swap_cache_info.del_total,
 		swap_cache_info.find_success, swap_cache_info.find_total,
 		swap_cache_info.noent_race, swap_cache_info.exist_race);
+	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
 
 /*
Index: x/mm/vmscan.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/vmscan.c,v
retrieving revision 1.225
diff -u -p -r1.225 vmscan.c
--- x/mm/vmscan.c	19 Nov 2004 22:54:22 -0000	1.225
+++ x/mm/vmscan.c	2 Dec 2004 01:56:50 -0000
@@ -935,8 +935,6 @@ int try_to_free_pages(struct zone **zone
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
-	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
-		out_of_memory(gfp_mask);
 out:
 	for (i = 0; zones[i] != 0; i++)
 		zones[i]->prev_priority = zones[i]->temp_priority;

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 19:24                         ` Thomas Gleixner
  2004-12-02 20:11                           ` Andre Tomt
  2004-12-02 23:47                           ` Andrea Arcangeli
@ 2004-12-03 14:41                           ` Helge Hafting
  2004-12-03 21:20                             ` Thomas Gleixner
  2 siblings, 1 reply; 66+ messages in thread
From: Helge Hafting @ 2004-12-03 14:41 UTC (permalink / raw)
  To: tglx; +Cc: Andrew Morton, andrea, marcelo.tosatti, LKML, nickpiggin

Thomas Gleixner wrote:

> I know, but the console and the reset button are 150km away. When I dial
>
>into the machine or try to connect via the network, I cannot connect
>with the current kernels. Neither 2.4, because the fork fails, nor 2.6
>because oom killed sshd. So I cannot send anything except a service man,
>who drives 150km to hit sysrq-F or the reset button.
>  
>

The case of OOM killed sshd is fixable without touching the kernel:
Make sure sshd is started from init, init will then restart sshd whenever
it quits for some reason.  This will get you your essential sshd back
assuming the machine is still running and the OOM killer managed
to free up some memory by killing some other processes.

One might still wish for better OOM behaviour, but it is a case
where something has to give.

Helge Hafting

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 23:35                   ` Andrea Arcangeli
  2004-12-03  2:28                     ` Andrea Arcangeli
@ 2004-12-03 21:10                     ` Thomas Gleixner
  2004-12-03 22:21                       ` Andrea Arcangeli
  1 sibling, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-03 21:10 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, 2004-12-03 at 00:35 +0100, Andrea Arcangeli wrote:
> Fork eventually failing is very reasonable if you're executing a fork
> loop.

Yes, it's reasonable, but the effect that any consequent command is
aborted then is not so reasonable.
 
> > There is no output in dmesg, but I'm not able to remove the remaining
> > hackbench processes as even a kill -SIGKILL returns with 
> > fork() (error: resource temporarily not available)
> > 
> > I'm not sure, which of the two scenarios I like better :)
> 
> Please try with 2.4.23aa3, I think there was some oom killer change
> after I had no resources to track 2.4 anymore. I'm not saying 2.4.23aa3
> will work better though, but I would like to know if there's some corner
> case still open in 2.4-aa. Careful, 2.4.23aa3 has security bugs (only
> local security of course, i.e. normally not a big issue, sure good
> enough for a quick test).

I try, if I find some time, but I'm not too interested in 2.4 :)

> I doubt your testcase simulates anything remotely realistic but
> anyway it's still informative.

The hackbench testcase resembles the real life problem I encountered
quite realistic. Of course we have fixed the application since then, but
I bet that I'm not the only one who uses forking servers.

> What I'm simulating here is very real life scenario with a couple of
> apps allocating more memory than ram.

Use a forking server, connect a lot of clients and it is real life. :)

> > FYI, I tried with 2.6 UP and PREEMPT=n. The result is more horrible. The
> > box just gets stuck in an endless swap in/swap out and does not respond
> > to anything else than SysRq-T and the reset button.
> > 
> > With the callback the machine did not come back after 20 Minutes.
> 
> Was the oom killer invoked at all? If yes, and it works with preempt,
> that could mean a cond_resched is simply missing...

Yes, it was invoked

> > I think the callback is the only safe way to fix that. If PF_MEMDIE is
> > racy then I'm sure we will find a different indicator for that.
> 
> The callback adds overhead to the exit path. Plus strictly speaking it's
> not actually a callback, you're just "polling" for the bitflag :)

I know :)

> > Yep, but the reentrancy blocking with the callback makes the time, count
> > crap and the watermark check go away, as it is safe to reenable the
> > killer at this point because we definitely freed memory resources. So
> > the watermark comes for free.
> 
> You can get an I/O race where your program is about to finish a failing
> try_to_free_pages pass (note that a task exiting won't make
> try_to_free_pages work any easier, try_to_free_pages has to free
> allocated memory, it doesn't care if there's 1M or 100M of free memory).
> If you don't check the watermarks after waiting for I/O, you're going to
> generate a suprious oom-killing. Your changes can't help.

True, I did not take the I/O into account.

> Note that even the watermark checks leaves a race window open, but at
> least it's not an I/O window. While try_to_free_pages can wait for I/O
> and then fail.
> 
> I'll add to my last patch the removal of the PF_MEMDIE check in oom_kill
> plus I'll fix the remaining race with PF_EXITING/DEAD, and I'll add a
> cond_resched. Then you can try again with my simple way (w/ and w/o
> PREEMPT ;).
> 
> Thanks for the great feedback.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-03 14:41                           ` Helge Hafting
@ 2004-12-03 21:20                             ` Thomas Gleixner
  2004-12-05 21:14                               ` Helge Hafting
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-03 21:20 UTC (permalink / raw)
  To: Helge Hafting; +Cc: Andrew Morton, andrea, marcelo.tosatti, LKML, nickpiggin

On Fri, 2004-12-03 at 15:41 +0100, Helge Hafting wrote:
> The case of OOM killed sshd is fixable without touching the kernel:
> Make sure sshd is started from init, init will then restart sshd whenever
> it quits for some reason.  This will get you your essential sshd back
> assuming the machine is still running and the OOM killer managed
> to free up some memory by killing some other processes.
> 
> One might still wish for better OOM behaviour, but it is a case
> where something has to give.
> 

Hey, are you kidding ?

2.4 lets me not in, because the fork of sshd fails. How do you fix this
with changing the userspace ?

2.6 oom is plain buggy

I have no problem to help myself, but I want to get this fixed in a
reliable way which meets the comment in oom_kill.c: "least surprise"

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-03 21:10                     ` Thomas Gleixner
@ 2004-12-03 22:21                       ` Andrea Arcangeli
  0 siblings, 0 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-03 22:21 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 03, 2004 at 10:10:06PM +0100, Thomas Gleixner wrote:
> On Fri, 2004-12-03 at 00:35 +0100, Andrea Arcangeli wrote:
> > Fork eventually failing is very reasonable if you're executing a fork
> > loop.
> 
> Yes, it's reasonable, but the effect that any consequent command is
> aborted then is not so reasonable.

Did you use the 4k stacks on 2.6 btw?

> Use a forking server, connect a lot of clients and it is real life. :)

;)

> Yes, it was invoked

Ok good.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-03  2:28                     ` Andrea Arcangeli
@ 2004-12-03 22:37                       ` Thomas Gleixner
  2004-12-03 22:51                         ` Thomas Gleixner
  2004-12-10 16:36                       ` William Lee Irwin III
  2004-12-10 16:51                       ` William Lee Irwin III
  2 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-03 22:37 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, 2004-12-03 at 03:28 +0100, Andrea Arcangeli wrote:
> On Fri, Dec 03, 2004 at 12:34:59AM +0100, Andrea Arcangeli wrote:
> > I'll add to my last patch the removal of the PF_MEMDIE check in oom_kill
> > plus I'll fix the remaining race with PF_EXITING/DEAD, and I'll add a
> > cond_resched. Then you can try again with my simple way (w/ and w/o
> > PREEMPT ;).
> 
> Ok, I expect this patch to fix the problem completely. 
> <SNIP>
> With this thing, I doubt any wrong task will ever be killed again...

You're right. oom-kill() did not do anything wrong. See log below

This is w/o PREEMPT. Is it neccecary to verify w/ PREEMPT too ?

If it would have booted it still would have killed sshd instead of the
application which was forking a lot of childs.

tglx

Dentry cache hash table entries: 32768 (order: 5, 131072 bytes)
Inode-cache hash table entries: 16384 (order: 4, 65536 bytes)
Memory: 126476k/131060k available (1690k kernel code, 4044k reserved,
732k data)Checking if this processor honours the WP bit even in
supervisor mode... Ok.
Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
CPU: L1 I cache: 16K, L1 D cache: 16K
CPU: L2 cache: 128K
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#0.
CPU: Intel Celeron (Mendocino) stepping 00
Enabling fast FPU save and restore... done.
Checking 'hlt' instruction... OK.

END OF LOG



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-02 20:11                           ` Andre Tomt
@ 2004-12-03 22:45                             ` Thomas Gleixner
  0 siblings, 0 replies; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-03 22:45 UTC (permalink / raw)
  To: Andre Tomt; +Cc: Andrew Morton, andrea, marcelo.tosatti, LKML, nickpiggin

On Thu, 2004-12-02 at 21:11 +0100, Andre Tomt wrote:
> Thomas Gleixner wrote:
> > On Thu, 2004-12-02 at 11:22 -0800, Andrew Morton wrote:
> >>You can issue sysrq commands over serial consoles too.
> > 
> > I know, but the console and the reset button are 150km away. When I dial
> > into the machine or try to connect via the network, I cannot connect
> > with the current kernels. Neither 2.4, because the fork fails, nor 2.6
> > because oom killed sshd. So I cannot send anything except a service man,
> > who drives 150km to hit sysrq-F or the reset button.
> 
> Get one of those terminal server/concentrators that export the serial 
> consoles over IP. Or one of those KVM-over-IP extenders. Worth every penny.

Makes totally sense. 

I add 1000$ equipment to a 300$ embedded box because there is a software
bug.

Is it possible that we are living in a different universe ?

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-03 22:37                       ` Thomas Gleixner
@ 2004-12-03 22:51                         ` Thomas Gleixner
  2004-12-03 23:08                           ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-03 22:51 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

Ooops, sorry it did add something to the Log after 10 minutes

On Fri, 2004-12-03 at 23:37 +0100, Thomas Gleixner wrote:
> You're right. oom-kill() did not do anything wrong. See log below
> 
> This is w/o PREEMPT. Is it neccecary to verify w/ PREEMPT too ?
> 
> If it would have booted it still would have killed sshd instead of the
> application which was forking a lot of childs.
> 
> Enabling fast FPU save and restore... done.
> Checking 'hlt' instruction... OK.
> 
> END OF LOG

Unable to handle kernel NULL pointer dereference at virtual address
0000000c
 printing eip:
c011fe30
*pde = 00000000
Oops: 0000 [#1]
Modules linked in:
CPU:    0
EIP:    0060:[<c011fe30>]    Not tainted VLI
EFLAGS: 00010082   (2.6.10-rc2)
EIP is at __queue_work+0x20/0x60
eax: 00000000   ebx: c032cc80   ecx: 00000000   edx: 00000008
esi: c03acd14   edi: 00000282   ebp: c035ff64   esp: c035ff34
ds: 007b   es: 007b   ss: 0068
Process swapper (pid: 0, threadinfo=c035e000 task=c02ebb00)
Stack: c035ffa4 00000000 c03acd14 c035ff64 c011fe99 00000000 c032cc80
c01dd8b0
       c0119c20 00000000 00000000 c035ffa4 c035ff64 c035ff64 00000000
00000001
       c03a5868 0000000a 003d9007 c0115f3b c03a5868 00000046 00099100
c039e120
Call Trace:
 [<c011fe99>] queue_work+0x29/0x50
 [<c01dd8b0>] blank_screen_t+0x0/0x20
 [<c0119c20>] run_timer_softirq+0xb0/0x170
 [<c0115f3b>] __do_softirq+0x7b/0x90
 [<c0115f77>] do_softirq+0x27/0x30
 [<c010411e>] do_IRQ+0x1e/0x30
 [<c010271e>] common_interrupt+0x1a/0x20
 [<c0100623>] default_idle+0x23/0x40
 [<c01006b4>] cpu_idle+0x34/0x40
 [<c03607d8>] start_kernel+0x168/0x1b0
 [<c0360370>] unknown_bootoption+0x0/0x1e0
Code: eb e5 90 90 90 90 90 90 90 90 90 83 ec 10 8b 44 24 14 89 5c 24 04
8b 5c 2
 <0>Kernel panic - not syncing: Fatal exception in interrupt



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-03 22:51                         ` Thomas Gleixner
@ 2004-12-03 23:08                           ` Andrea Arcangeli
  0 siblings, 0 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-03 23:08 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 03, 2004 at 11:51:50PM +0100, Thomas Gleixner wrote:
> Ooops, sorry it did add something to the Log after 10 minutes

You mean my patch is preventing your machine to boot? Then you're doing
something else wrong because it's impossible my patch is preventing your
machine to boot. I also tested it before posting. You must have changed
something more than the PREEMPT bit after applying my patch.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-01  9:49 [PATCH] oom killer (Core) tglx
  2004-12-01 21:16 ` Andrea Arcangeli
@ 2004-12-05  2:52 ` William Lee Irwin III
  2004-12-05 13:38   ` Thomas Gleixner
  2004-12-10 16:32 ` William Lee Irwin III
  2 siblings, 1 reply; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-05  2:52 UTC (permalink / raw)
  To: tglx; +Cc: akpm, andrea, marcelo.tosatti, linux-kernel

On Wed, Dec 01, 2004 at 10:49:03AM +0100, tglx@linutronix.de wrote:
> The oom killer has currently some strange effects when triggered.
> It gets invoked multiple times and the selection of the task to kill
> does not take processes into account which fork a lot of child processes.
> The patch solves this by
> - Preventing reentrancy
> - Checking for memory threshold before selection and kill.
> - Taking child processes into account when selecting the process to kill

Hmm, this thread seems to be serious. I'll audit the policy adjustments
for issues with the mechanisms (e.g. killing kernel threads, races with
timeouts).


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-05  2:52 ` William Lee Irwin III
@ 2004-12-05 13:38   ` Thomas Gleixner
  2004-12-05 15:22     ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-05 13:38 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Andrew Morton, andrea, marcelo.tosatti, LKML

On Sat, 2004-12-04 at 18:52 -0800, William Lee Irwin III wrote:
> On Wed, Dec 01, 2004 at 10:49:03AM +0100, tglx@linutronix.de wrote:
> > The oom killer has currently some strange effects when triggered.
> > It gets invoked multiple times and the selection of the task to kill
> > does not take processes into account which fork a lot of child processes.
> > The patch solves this by
> > - Preventing reentrancy
> > - Checking for memory threshold before selection and kill.
> > - Taking child processes into account when selecting the process to kill
> 
> Hmm, this thread seems to be serious. 

:)

> I'll audit the policy adjustments
> for issues with the mechanisms (e.g. killing kernel threads, races with
> timeouts).

Andrea provided a quite good fix for the invokation problem. I'm just
tracking down a different problem which was revieled by his changes.

The selection mechanism is currently taking a couple of things into
account:

 - VM size
 - nice value
 - CPU time
 - owner == root ?
 - CAP_SYS_RAWIO

I found out, that it is neccecary to take the child processes into
account to detect processes which fork a lot of childs.

The current mechanism rather kills sshd or portmap as they happen to
have a bigger VM size than the process which forked a lot of childs.

So I added a check for the child processes with an own VM. This solved
the problem quite well. Of course it does not count the childs of PID <
2.

tglx





^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-05 13:38   ` Thomas Gleixner
@ 2004-12-05 15:22     ` Andrea Arcangeli
  0 siblings, 0 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-05 15:22 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: William Lee Irwin III, Andrew Morton, marcelo.tosatti, LKML

On Sun, Dec 05, 2004 at 02:38:22PM +0100, Thomas Gleixner wrote:
> Andrea provided a quite good fix for the invokation problem. I'm just
> tracking down a different problem which was revieled by his changes.
> 
> The selection mechanism is currently taking a couple of things into
> account:
> 
>  - VM size
>  - nice value
>  - CPU time
>  - owner == root ?
>  - CAP_SYS_RAWIO
> 
> I found out, that it is neccecary to take the child processes into
> account to detect processes which fork a lot of childs.
> 
> The current mechanism rather kills sshd or portmap as they happen to
> have a bigger VM size than the process which forked a lot of childs.

Taking childs into account looks fine to me.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-03 21:20                             ` Thomas Gleixner
@ 2004-12-05 21:14                               ` Helge Hafting
  0 siblings, 0 replies; 66+ messages in thread
From: Helge Hafting @ 2004-12-05 21:14 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Helge Hafting, Andrew Morton, andrea, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 03, 2004 at 10:20:22PM +0100, Thomas Gleixner wrote:
> On Fri, 2004-12-03 at 15:41 +0100, Helge Hafting wrote:
> > The case of OOM killed sshd is fixable without touching the kernel:
> > Make sure sshd is started from init, init will then restart sshd whenever
> > it quits for some reason.  This will get you your essential sshd back
> > assuming the machine is still running and the OOM killer managed
> > to free up some memory by killing some other processes.
> > 
> > One might still wish for better OOM behaviour, but it is a case
> > where something has to give.
> > 
> 
> Hey, are you kidding ?
> 
Please don't misunderstand.  I'm not saing that 2.6 is fine,
only that there is a way to automatically restart a sshd accidentally 
killed by the OOM killer.  This could probably save you some of those
trips, if you keep experimenting with 2.6.

> 2.4 lets me not in, because the fork of sshd fails. How do you fix this
> with changing the userspace ?
> 
Fork failing is another issue than a killed sshd.  It is usually fixed
by using ulimit so a buggy process simply cant fork off way too
many children. (Or use up too much memory.)

> 2.6 oom is plain buggy
> 
Quite possible, but be aware that these things can happen with 2.4 too.
It is possible to get 2.4 into a shortage where fork fails,
you should use ulimit anyway to avoid that.  Also, having
a sshd that is restarted by "init" is a good idea anyway
on such a remote machine.  2.4 might not kill sshd by kernel bug, but
who knows what some future exploit or future bug could do. 

> I have no problem to help myself, but I want to get this fixed in a
> reliable way which meets the comment in oom_kill.c: "least surprise"

I too want a well-behaved OOM killer.  Workarounds are available though,
until this happens.

Helge Hafting

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-01  9:49 [PATCH] oom killer (Core) tglx
  2004-12-01 21:16 ` Andrea Arcangeli
  2004-12-05  2:52 ` William Lee Irwin III
@ 2004-12-10 16:32 ` William Lee Irwin III
  2004-12-10 16:52   ` Thomas Gleixner
  2 siblings, 1 reply; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 16:32 UTC (permalink / raw)
  To: tglx; +Cc: akpm, andrea, marcelo.tosatti, linux-kernel

On Wed, Dec 01, 2004 at 10:49:03AM +0100, tglx@linutronix.de wrote:
> The oom killer has currently some strange effects when triggered.
> It gets invoked multiple times and the selection of the task to kill
> does not take processes into account which fork a lot of child processes.
> The patch solves this by
> - Preventing reentrancy
> - Checking for memory threshold before selection and kill.
> - Taking child processes into account when selecting the process to kill

It appears the net functional change here is the reentrancy prevention;
the choice of tasks is policy. The functional change may be accomplished
with the following:


Index: linux-2.6.9/mm/oom_kill.c
===================================================================
--- linux-2.6.9.orig/mm/oom_kill.c	2004-10-18 14:54:30.000000000 -0700
+++ linux-2.6.9/mm/oom_kill.c	2004-12-10 08:21:31.000000000 -0800
@@ -237,7 +237,8 @@
 	static unsigned long first, last, count, lastkill;
 	unsigned long now, since;
 
-	spin_lock(&oom_lock);
+	if (!spin_trylock(&oom_lock))
+		return;
 	now = jiffies;
 	since = now - last;
 	last = now;
@@ -282,9 +283,7 @@
 	show_free_areas();
 
 	/* oom_kill() sleeps */
-	spin_unlock(&oom_lock);
 	oom_kill();
-	spin_lock(&oom_lock);
 
 reset:
 	/*

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-03  2:28                     ` Andrea Arcangeli
  2004-12-03 22:37                       ` Thomas Gleixner
@ 2004-12-10 16:36                       ` William Lee Irwin III
  2004-12-10 17:35                         ` Andrea Arcangeli
  2004-12-10 16:51                       ` William Lee Irwin III
  2 siblings, 1 reply; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 16:36 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 03, 2004 at 03:28:54AM +0100, Andrea Arcangeli wrote:
> +	if (mm == &init_mm) {
> +		mmput(mm);
> +		return NULL;
> +	}

On Fri, Dec 03, 2004 at 03:28:54AM +0100, Andrea Arcangeli wrote:
> +	if (PTR_ERR(p) == -1UL)
> +		goto out;
> +
>  	/* Found nothing?!?! Either we hang forever, or we panic. */
>  	if (!p) {
> +		read_unlock(&tasklist_lock);
>  		show_free_areas();
>  		panic("Out of memory and no killable processes...\n");
>  	}

Maybe the mm == &init_mm case should return an ERR_PTR also, as that is
a sign of a transient error, not cause for a hard panic.


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-03  2:28                     ` Andrea Arcangeli
  2004-12-03 22:37                       ` Thomas Gleixner
  2004-12-10 16:36                       ` William Lee Irwin III
@ 2004-12-10 16:51                       ` William Lee Irwin III
  2 siblings, 0 replies; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 16:51 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 03, 2004 at 03:28:54AM +0100, Andrea Arcangeli wrote:
> Ok, I expect this patch to fix the problem completely. The biggest
> difference is that it doesn't affect the exit fast path and it doesn't need
> notification. It's not even slower than your patch since you were
> really polling too. This schedule properly, if you get any PREEMPT=n
> problem I'd really like to know. This is on top of my previous patch so
> it does the checks for the watermarks correctly (those are not obsoleted
> by whatever thing we do in out_of_memory()). This still make use of the
> PF_MEMDIE info but not in kernel/, only in mm/oom_kill.c where it born
> and dies there, so the race is somewhat hidden in there. As you said
> converting PF_MEMDIE to a set_tsk_thread_flag or some other non-racy
> thing is a due change but I'm not doing it in the below patch.

I see some real bugfixes in here. I'll start up a fresh thread with some
of those broken out and cc: the relevant people shortly. It's not really
any "new material", just reorganizing things into smaller patches, in
particular the ones I see needing merging ASAP that aren't controversial.


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 16:32 ` William Lee Irwin III
@ 2004-12-10 16:52   ` Thomas Gleixner
  2004-12-10 17:43     ` William Lee Irwin III
                       ` (3 more replies)
  0 siblings, 4 replies; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-10 16:52 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Andrew Morton, andrea, marcelo.tosatti, LKML

[-- Attachment #1: Type: text/plain, Size: 1196 bytes --]

On Fri, 2004-12-10 at 08:32 -0800, William Lee Irwin III wrote:
> On Wed, Dec 01, 2004 at 10:49:03AM +0100, tglx@linutronix.de wrote:
> > The oom killer has currently some strange effects when triggered.
> > It gets invoked multiple times and the selection of the task to kill
> > does not take processes into account which fork a lot of child processes.
> > The patch solves this by
> > - Preventing reentrancy
> > - Checking for memory threshold before selection and kill.
> > - Taking child processes into account when selecting the process to kill
> 
> It appears the net functional change here is the reentrancy prevention;
> the choice of tasks is policy. The functional change may be accomplished
> with the following:

Your patch would call yield() with the lock held. On an UP machine you
end up in the same code, as spin_locks are NOPs except for the preempt
part.

It's now obsolete by the fixes which were done by Andrea. 

I'm wondering why he did not post the final version. Andrea ???

Attached is the latest working and tested patch. It contains Andrea's
fixes to the oom invocation and my modifications to the selection whom
to kill.

This should really go into mainline.

tglx


[-- Attachment #2: oom-final.diff --]
[-- Type: text/x-patch, Size: 9934 bytes --]

diff -urN 2.6.10-rc2.orig/mm/oom_kill.c 2.6.10-rc2.oom/mm/oom_kill.c
--- 2.6.10-rc2.orig/mm/oom_kill.c	2004-12-10 17:39:07.000000000 +0100
+++ 2.6.10-rc2.oom/mm/oom_kill.c	2004-12-05 16:46:42.000000000 +0100
@@ -45,18 +45,30 @@
 static unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
+	struct list_head *tsk;
 
 	if (!p->mm)
 		return 0;
 
-	if (p->flags & PF_MEMDIE)
-		return 0;
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
 	points = p->mm->total_vm;
 
 	/*
+	 * Processes which fork a lot of child processes are likely 
+	 * a good choice. We add the vmsize of the childs if they
+	 * have an own mm. This prevents forking servers to flood the
+	 * machine with an endless amount of childs
+	 */
+	list_for_each(tsk, &p->children) {
+		struct task_struct *chld;
+		chld = list_entry(tsk, struct task_struct, sibling);
+		if (chld->mm != p->mm && chld->mm)
+			points += chld->mm->total_vm;
+	}
+
+	/*
 	 * CPU time is in tens of seconds and run time is in thousands
          * of seconds. There is no particular reason for this other than
          * that it turned out to work very well in practice.
@@ -120,14 +132,25 @@
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p)
-		if (p->pid) {
-			unsigned long points = badness(p, uptime.tv_sec);
+		/* skip the init task with pid == 1 */
+		if (p->pid > 1) {
+			unsigned long points;
+
+			/*
+			 * This is in the process of releasing memory so wait it
+			 * to finish before killing some other task by mistake.
+			 */
+			if ((p->flags & PF_MEMDIE) ||
+			    ((p->flags & PF_EXITING) && !(p->flags & PF_DEAD)))
+				return ERR_PTR(-1UL);
+			if (p->flags & PF_SWAPOFF)
+				return p;
+
+			points = badness(p, uptime.tv_sec);
 			if (points > maxpoints) {
 				chosen = p;
 				maxpoints = points;
 			}
-			if (p->flags & PF_SWAPOFF)
-				return p;
 		}
 	while_each_thread(g, p);
 	return chosen;
@@ -140,6 +163,12 @@
  */
 static void __oom_kill_task(task_t *p)
 {
+	if (p->pid == 1) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill init!\n");
+		return;
+	}
+
 	task_lock(p);
 	if (!p->mm || p->mm == &init_mm) {
 		WARN_ON(1);
@@ -169,12 +198,45 @@
 static struct mm_struct *oom_kill_task(task_t *p)
 {
 	struct mm_struct *mm = get_task_mm(p);
-	if (!mm || mm == &init_mm)
+	task_t * g, * q;
+
+	if (!mm)
 		return NULL;
+	if (mm == &init_mm) {
+		mmput(mm);
+		return NULL;
+	}
+
 	__oom_kill_task(p);
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group
+	 */
+	do_each_thread(g, q)
+		if (q->mm == mm && q->tgid != p->tgid)
+			__oom_kill_task(q);
+	while_each_thread(g, q);
+
 	return mm;
 }
 
+static struct mm_struct *oom_kill_process(task_t *p)
+{
+ 	struct mm_struct *mm;
+	struct task_struct *c;
+	struct list_head *tsk;
+
+	/* Try to kill a child first */
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_task(c);
+		if (mm)
+			return mm;
+	}
+	return oom_kill_task(p);
+}
 
 /**
  * oom_kill - kill the "best" process when we run out of memory
@@ -184,117 +246,40 @@
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-static void oom_kill(void)
+void out_of_memory(int gfp_mask)
 {
-	struct mm_struct *mm;
-	struct task_struct *g, *p, *q;
-	
+	struct mm_struct *mm = NULL;
+	task_t * p;
+
 	read_lock(&tasklist_lock);
 retry:
 	p = select_bad_process();
 
+	if (PTR_ERR(p) == -1UL)
+		goto out;
+
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
+		read_unlock(&tasklist_lock);
 		show_free_areas();
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	mm = oom_kill_task(p);
-	if (!mm)
-		goto retry;
-	/*
-	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
-	 */
-	do_each_thread(g, q)
-		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
-	while_each_thread(g, q);
-	if (!p->mm)
-		printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
-	read_unlock(&tasklist_lock);
-	mmput(mm);
-
-	/*
-	 * Make kswapd go out of the way, so "p" has a good chance of
-	 * killing itself before someone else gets the chance to ask
-	 * for more memory.
-	 */
-	yield();
-	return;
-}
-
-/**
- * out_of_memory - is the system out of memory?
- */
-void out_of_memory(int gfp_mask)
-{
-	/*
-	 * oom_lock protects out_of_memory()'s static variables.
-	 * It's a global lock; this is not performance-critical.
-	 */
-	static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
-	static unsigned long first, last, count, lastkill;
-	unsigned long now, since;
-
-	spin_lock(&oom_lock);
-	now = jiffies;
-	since = now - last;
-	last = now;
-
-	/*
-	 * If it's been a long time since last failure,
-	 * we're not oom.
-	 */
-	if (since > 5*HZ)
-		goto reset;
-
-	/*
-	 * If we haven't tried for at least one second,
-	 * we're not really oom.
-	 */
-	since = now - first;
-	if (since < HZ)
-		goto out_unlock;
-
-	/*
-	 * If we have gotten only a few failures,
-	 * we're not really oom. 
-	 */
-	if (++count < 10)
-		goto out_unlock;
-
-	/*
-	 * If we just killed a process, wait a while
-	 * to give that task a chance to exit. This
-	 * avoids killing multiple processes needlessly.
-	 */
-	since = now - lastkill;
-	if (since < HZ*5)
-		goto out_unlock;
-
-	/*
-	 * Ok, really out of memory. Kill something.
-	 */
-	lastkill = now;
-
 	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
 	show_free_areas();
+	mm = oom_kill_process(p);
+	if (!mm)
+		goto retry;
 
-	/* oom_kill() sleeps */
-	spin_unlock(&oom_lock);
-	oom_kill();
-	spin_lock(&oom_lock);
+ out:
+	read_unlock(&tasklist_lock);
+	if (mm)
+		mmput(mm);
 
-reset:
 	/*
-	 * We dropped the lock above, so check to be sure the variable
-	 * first only ever increases to prevent false OOM's.
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory.
 	 */
-	if (time_after(now, first))
-		first = now;
-	count = 0;
-
-out_unlock:
-	spin_unlock(&oom_lock);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(1);
 }
diff -urN 2.6.10-rc2.orig/mm/page_alloc.c 2.6.10-rc2.oom/mm/page_alloc.c
--- 2.6.10-rc2.orig/mm/page_alloc.c	2004-12-10 17:39:07.000000000 +0100
+++ 2.6.10-rc2.oom/mm/page_alloc.c	2004-12-05 16:42:38.000000000 +0100
@@ -611,9 +611,10 @@
 	int alloc_type;
 	int do_retry;
 	int can_try_harder;
+	int did_some_progress;
 
 	might_sleep_if(wait);
-
+	
 	/*
 	 * The caller may dip into page reserves a bit more if the caller
 	 * cannot run direct reclaim, or is the caller has realtime scheduling
@@ -638,13 +639,15 @@
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
+		if (page) {
 			goto got_pg;
+		}
 	}
 
 	for (i = 0; (z = zones[i]) != NULL; i++)
 		wakeup_kswapd(z);
 
+ restart:
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks to go deeper into reserves
@@ -681,31 +684,58 @@
 		goto nopage;
 
 rebalance:
+	cond_resched();
+
 	/* We now go into synchronous reclaim */
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	try_to_free_pages(zones, gfp_mask, order);
+	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
-	/* go through the zonelist yet one more time */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		min = z->pages_min;
-		if (gfp_mask & __GFP_HIGH)
-			min /= 2;
-		if (can_try_harder)
-			min -= min / 4;
-		min += (1<<order) + z->protection[alloc_type];
+	cond_resched();
 
-		if (z->free_pages < min)
-			continue;
+	if (likely(did_some_progress)) {
+		/* go through the zonelist yet one more time */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			min = z->pages_min;
+			if (gfp_mask & __GFP_HIGH)
+				min /= 2;
+			if (can_try_harder)
+				min -= min / 4;
+			min += (1<<order) + z->protection[alloc_type];
 
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
+			if (z->free_pages < min)
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+		/*
+		 * Go through the zonelist yet one more time, keep
+		 * very high watermark here, this is only to catch
+		 * a parallel oom killing, we must fail if we're still
+		 * under heavy pressure.
+		 */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			min = z->pages_high;
+			min += (1<<order) + z->protection[alloc_type];
+
+			if (z->free_pages < min)
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+
+		out_of_memory(gfp_mask);
+		goto restart;
 	}
 
 	/*
diff -urN 2.6.10-rc2.orig/mm/swap_state.c 2.6.10-rc2.oom/mm/swap_state.c
--- 2.6.10-rc2.orig/mm/swap_state.c	2004-12-10 17:39:07.000000000 +0100
+++ 2.6.10-rc2.oom/mm/swap_state.c	2004-12-04 18:52:38.000000000 +0100
@@ -59,6 +59,8 @@
 		swap_cache_info.add_total, swap_cache_info.del_total,
 		swap_cache_info.find_success, swap_cache_info.find_total,
 		swap_cache_info.noent_race, swap_cache_info.exist_race);
+	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
 
 /*
diff -urN 2.6.10-rc2.orig/mm/vmscan.c 2.6.10-rc2.oom/mm/vmscan.c
--- 2.6.10-rc2.orig/mm/vmscan.c	2004-12-10 17:39:07.000000000 +0100
+++ 2.6.10-rc2.oom/mm/vmscan.c	2004-12-04 18:52:38.000000000 +0100
@@ -935,8 +935,6 @@
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
-	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
-		out_of_memory(gfp_mask);
 out:
 	for (i = 0; zones[i] != 0; i++)
 		zones[i]->prev_priority = zones[i]->temp_priority;

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 16:36                       ` William Lee Irwin III
@ 2004-12-10 17:35                         ` Andrea Arcangeli
  2004-12-10 17:43                           ` William Lee Irwin III
  0 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-10 17:35 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 10, 2004 at 08:36:14AM -0800, William Lee Irwin III wrote:
> On Fri, Dec 03, 2004 at 03:28:54AM +0100, Andrea Arcangeli wrote:
> > +	if (mm == &init_mm) {
> > +		mmput(mm);
> > +		return NULL;
> > +	}
> 
> On Fri, Dec 03, 2004 at 03:28:54AM +0100, Andrea Arcangeli wrote:
> > +	if (PTR_ERR(p) == -1UL)
> > +		goto out;
> > +
> >  	/* Found nothing?!?! Either we hang forever, or we panic. */
> >  	if (!p) {
> > +		read_unlock(&tasklist_lock);
> >  		show_free_areas();
> >  		panic("Out of memory and no killable processes...\n");
> >  	}
> 
> Maybe the mm == &init_mm case should return an ERR_PTR also, as that is
> a sign of a transient error, not cause for a hard panic.

It can't be a transient error as far as I can tell, it's just like the
issue of alloc_pages returning NULL (and potentially scheduling first)
before mounting the root fs.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 17:35                         ` Andrea Arcangeli
@ 2004-12-10 17:43                           ` William Lee Irwin III
  2004-12-10 17:55                             ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 17:43 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 10, 2004 at 08:36:14AM -0800, William Lee Irwin III wrote:
>> Maybe the mm == &init_mm case should return an ERR_PTR also, as that is
>> a sign of a transient error, not cause for a hard panic.

On Fri, Dec 10, 2004 at 06:35:54PM +0100, Andrea Arcangeli wrote:
> It can't be a transient error as far as I can tell, it's just like the
> issue of alloc_pages returning NULL (and potentially scheduling first)
> before mounting the root fs.

Well, the only way I see this happening is the process exiting followed
by use_mm() on init_mm for unobvious reasons (perhaps reasons not in
the tree).


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 16:52   ` Thomas Gleixner
@ 2004-12-10 17:43     ` William Lee Irwin III
  2004-12-10 17:47     ` William Lee Irwin III
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 17:43 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, andrea, marcelo.tosatti, LKML

On Fri, 2004-12-10 at 08:32 -0800, William Lee Irwin III wrote:
>> It appears the net functional change here is the reentrancy prevention;
>> the choice of tasks is policy. The functional change may be accomplished
>> with the following:

On Fri, Dec 10, 2004 at 05:52:33PM +0100, Thomas Gleixner wrote:
> Your patch would call yield() with the lock held. On an UP machine you
> end up in the same code, as spin_locks are NOPs except for the preempt
> part.
> It's now obsolete by the fixes which were done by Andrea. 
> I'm wondering why he did not post the final version. Andrea ???
> Attached is the latest working and tested patch. It contains Andrea's
> fixes to the oom invocation and my modifications to the selection whom
> to kill.
> This should really go into mainline.

ARGH, preempt.


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 16:52   ` Thomas Gleixner
  2004-12-10 17:43     ` William Lee Irwin III
@ 2004-12-10 17:47     ` William Lee Irwin III
  2004-12-10 17:49     ` Andrea Arcangeli
  2004-12-24  1:18     ` Andrea Arcangeli
  3 siblings, 0 replies; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 17:47 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Andrew Morton, andrea, marcelo.tosatti, LKML

On Fri, 2004-12-10 at 08:32 -0800, William Lee Irwin III wrote:
>> It appears the net functional change here is the reentrancy prevention;
>> the choice of tasks is policy. The functional change may be accomplished
>> with the following:

On Fri, Dec 10, 2004 at 05:52:33PM +0100, Thomas Gleixner wrote:
> Your patch would call yield() with the lock held. On an UP machine you
> end up in the same code, as spin_locks are NOPs except for the preempt
> part.
> It's now obsolete by the fixes which were done by Andrea. 
> I'm wondering why he did not post the final version. Andrea ???
> Attached is the latest working and tested patch. It contains Andrea's
> fixes to the oom invocation and my modifications to the selection whom
> to kill.
> This should really go into mainline.

This should all be split up; it's doing a dozen things at a time.


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 16:52   ` Thomas Gleixner
  2004-12-10 17:43     ` William Lee Irwin III
  2004-12-10 17:47     ` William Lee Irwin III
@ 2004-12-10 17:49     ` Andrea Arcangeli
  2004-12-10 17:57       ` William Lee Irwin III
  2004-12-24  1:18     ` Andrea Arcangeli
  3 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-10 17:49 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: William Lee Irwin III, Andrew Morton, marcelo.tosatti, LKML

On Fri, Dec 10, 2004 at 05:52:33PM +0100, Thomas Gleixner wrote:
> I'm wondering why he did not post the final version. Andrea ???

I already posted the final version since it had no bugs and I asked to
get it merged twice. The only bugs are obviously in the drivers (or the
callers) and they needs urgent fixing and additionally the
might_sleep_if must stop checking if the system is running so these bugs
can see the light of the day. Not being allowed to schedule in
alloc_pages with __GFP_WAIT set is a mistake.

Your patch was orthogonal to mine, so I didn't merge it. Go figure that
every time I post something it gets splitted into trivial pieces, so
it's a waste of time to try to merge any additional patch and post a
final one since it'll never be final anyway.

I am about to merge the things together for some other tree (not
mainline), that is a worthwhile effort but with the split behaviour of
mainline, for mainline it'd be a waste of time.

One last thing worth discussing on my side is if we should worry about
the tiny race between the watermark checks and the entering of the oom
killing. In theory we could wrap the thing around a semaphore and close
the race completely, though current code is simpler and as you find
it works fine in practice.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 17:43                           ` William Lee Irwin III
@ 2004-12-10 17:55                             ` Andrea Arcangeli
  2004-12-10 18:00                               ` William Lee Irwin III
  0 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-10 17:55 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 10, 2004 at 09:43:36AM -0800, William Lee Irwin III wrote:
> Well, the only way I see this happening is the process exiting followed
> by use_mm() on init_mm for unobvious reasons (perhaps reasons not in
> the tree).

I don't see the problem with use_mm. use_mm has either the mm set to
ctx->mm or to NULL, and ctx->mm is set to the mm of the process calling
io_setup.

The only thing using init_mm is the idle task/swapper as far as I can
tell, kernel threads and exiting tasks have a NULL mm.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 17:49     ` Andrea Arcangeli
@ 2004-12-10 17:57       ` William Lee Irwin III
  2004-12-12  0:12         ` William Lee Irwin III
  0 siblings, 1 reply; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 17:57 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML

On Fri, Dec 10, 2004 at 06:49:38PM +0100, Andrea Arcangeli wrote:
> Your patch was orthogonal to mine, so I didn't merge it. Go figure that
> every time I post something it gets splitted into trivial pieces, so
> it's a waste of time to try to merge any additional patch and post a
> final one since it'll never be final anyway.
> I am about to merge the things together for some other tree (not
> mainline), that is a worthwhile effort but with the split behaviour of
> mainline, for mainline it'd be a waste of time.
> One last thing worth discussing on my side is if we should worry about
> the tiny race between the watermark checks and the entering of the oom
> killing. In theory we could wrap the thing around a semaphore and close
> the race completely, though current code is simpler and as you find
> it works fine in practice.

The easy way to fix that issue is to take the whole diff and break off
pieces, with the remainder always as the last patch. That way the whole
set of changes stays pending and appears intact at the end of the series.

I will personally be held responsible for identifying the causes of
behavioral changes in the OOM killer, and am having to investigate
several instances of bad OOM killer behavior already, so I have to do
this anyway, and so it might as well be done for mainline.


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 17:55                             ` Andrea Arcangeli
@ 2004-12-10 18:00                               ` William Lee Irwin III
  2004-12-10 18:15                                 ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 18:00 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 10, 2004 at 09:43:36AM -0800, William Lee Irwin III wrote:
>> Well, the only way I see this happening is the process exiting followed
>> by use_mm() on init_mm for unobvious reasons (perhaps reasons not in
>> the tree).

On Fri, Dec 10, 2004 at 06:55:04PM +0100, Andrea Arcangeli wrote:
> I don't see the problem with use_mm. use_mm has either the mm set to
> ctx->mm or to NULL, and ctx->mm is set to the mm of the process calling
> io_setup.
> The only thing using init_mm is the idle task/swapper as far as I can
> tell, kernel threads and exiting tasks have a NULL mm.

Yet those don't appear in the tasklist, so some task in the tasklist
has to get ->mm set to &init_mm. The notion above was that the init_mm
check was to handle some out-of-tree attempt to do aio from kernel threads.


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 18:00                               ` William Lee Irwin III
@ 2004-12-10 18:15                                 ` Andrea Arcangeli
  2004-12-10 18:19                                   ` William Lee Irwin III
  0 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-10 18:15 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 10, 2004 at 10:00:31AM -0800, William Lee Irwin III wrote:
> Yet those don't appear in the tasklist, so some task in the tasklist
> has to get ->mm set to &init_mm. The notion above was that the init_mm
> check was to handle some out-of-tree attempt to do aio from kernel threads.

Not sure to understand correctly but, aio has always been done through
kernel threads, and that's the whole thing about aio. Not sure what
you're doing out-of-tree, but you don't need to use init_mm to deal with
kernel threads, infact kernel threads can only have ->mm = NULL. When
switching mm with use_mm the aio thread is only going to use a real mm
with mappings in userspace, so even in that case you don't need init_mm.
I didn't see the out of tree code though.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 18:15                                 ` Andrea Arcangeli
@ 2004-12-10 18:19                                   ` William Lee Irwin III
  2004-12-10 19:05                                     ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-10 18:19 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 10, 2004 at 10:00:31AM -0800, William Lee Irwin III wrote:
>> Yet those don't appear in the tasklist, so some task in the tasklist
>> has to get ->mm set to &init_mm. The notion above was that the init_mm
>> check was to handle some out-of-tree attempt to do aio from kernel threads.

On Fri, Dec 10, 2004 at 07:15:29PM +0100, Andrea Arcangeli wrote:
> Not sure to understand correctly but, aio has always been done through
> kernel threads, and that's the whole thing about aio. Not sure what
> you're doing out-of-tree, but you don't need to use init_mm to deal with
> kernel threads, infact kernel threads can only have ->mm = NULL. When
> switching mm with use_mm the aio thread is only going to use a real mm
> with mappings in userspace, so even in that case you don't need init_mm.
> I didn't see the out of tree code though.

Maybe the whole init_mm check should die since nothing in-tree could
cause it?


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 18:19                                   ` William Lee Irwin III
@ 2004-12-10 19:05                                     ` Andrea Arcangeli
  0 siblings, 0 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-10 19:05 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML, nickpiggin

On Fri, Dec 10, 2004 at 10:19:54AM -0800, William Lee Irwin III wrote:
> Maybe the whole init_mm check should die since nothing in-tree could
> cause it?

Well it's a bugcheck after all so it certainly can be removed. I
wouldn't mind to remove it completely, but this is just to be sure
nothing is going wrong, though I agree it isn't going to help very much ;).

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 17:57       ` William Lee Irwin III
@ 2004-12-12  0:12         ` William Lee Irwin III
  0 siblings, 0 replies; 66+ messages in thread
From: William Lee Irwin III @ 2004-12-12  0:12 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Thomas Gleixner, Andrew Morton, marcelo.tosatti, LKML

On Fri, Dec 10, 2004 at 09:57:06AM -0800, William Lee Irwin III wrote:
> The easy way to fix that issue is to take the whole diff and break off
> pieces, with the remainder always as the last patch. That way the whole
> set of changes stays pending and appears intact at the end of the series.
> I will personally be held responsible for identifying the causes of
> behavioral changes in the OOM killer, and am having to investigate
> several instances of bad OOM killer behavior already, so I have to do
> this anyway, and so it might as well be done for mainline.

Actually, I'm dropping my whole public effort and am going to stick to
using what I do in this respect for internal testing etc.


-- wli

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-10 16:52   ` Thomas Gleixner
                       ` (2 preceding siblings ...)
  2004-12-10 17:49     ` Andrea Arcangeli
@ 2004-12-24  1:18     ` Andrea Arcangeli
  3 siblings, 0 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-24  1:18 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: William Lee Irwin III, Andrew Morton, marcelo.tosatti, LKML

On Fri, Dec 10, 2004 at 05:52:33PM +0100, Thomas Gleixner wrote:
> +			if ((p->flags & PF_MEMDIE) ||
> +			    ((p->flags & PF_EXITING) && !(p->flags & PF_DEAD)))

this had to be:

			if (((p->flags & PF_MEMDIE) || (p->flags & PF_EXITING)) &&
			    !(p->flags & PF_DEAD))

I didn't take zombies into account. A task may be in memdie state and zombie at
the same time. We must not wait a PF_MEMDIE to go away completely, we must wait
only until PF_DEAD is not set. After PF_DEAD is set we know all ram we
were waiting for is already in the free list.

I noticed some deadlocks during an oom-torture-test before applying the stuff
into the suse kernel. The above change fixed all my deadlocks. Everything else
was working fine already.

Actually before finding the above bug I fixed PF_MEMDIE too and I converted it
to p->memdie, an unsigned char. All archs should support 1 byte granularity
for per-process atomic instructions, it's near used_math that I also converted
to a char to signal it cannot be a bitflag sharing the same char with globals.

Struct layout looks like this.

	/*
	 * All archs should support atomic ops with
	 * 1 byte granularity.
	 */
	unsigned char memdie;
	/*
	 * Must be changed atomically so it shouldn't be
	 * be a shareable bitflag.
	 */
	unsigned char used_math;
	/*
	 * OOM kill score adjustment (bit shift).
	 * Cannot live together with used_math since
	 * used_math and oomkilladj can be changed at the
	 * same time, so they would race if they're in the
	 * same atomic block.
	 */
	short oomkilladj;

As for the |= PF_MEMALLOC in oom_kill_task that was a gratuitous smp breakage,
I didn't need to do anything in PF_MEMALLOC since alloc_pages checks _both_
PF_MEMALLOC and p->memdie already.

I also added a !chosen in the below code to make that logic more self
contained and less dependent on the badness implementation (should never
be necessary though):

			if (points > maxpoints || !chosen) {
				chosen = p;
				maxpoints = points;
			}

I can port the final patch (including fixage of PF_MEMDIE races) to 2.6.10-rc
after I finished.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-05 15:34           ` Andrea Arcangeli
@ 2004-12-05 16:29             ` Thomas Gleixner
  0 siblings, 0 replies; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-05 16:29 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Voluspa, LKML

[-- Attachment #1: Type: text/plain, Size: 1577 bytes --]

On Sun, 2004-12-05 at 16:34 +0100, Andrea Arcangeli wrote:
> On Sun, Dec 05, 2004 at 03:55:07PM +0100, Thomas Gleixner wrote:
> > Yes, the modification are not interfering with your patch. They just add
> > the accounting of child processes to the selection.
> 
> Could you post your modifications on top of my patch so we can combine
> them easily?

The patch is against 2.6.10-rc2-your-latest-patch

It adds the VM size of the child processes and tries to kill a child of
the selected process first. It could be discussed, if we just kill the
parent, but it turned out that the kill child first approach has less
unexpected results.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

> > That makes sense, but it does not catch processes forking a lot of
> > childs, because the allocation rate is not accounted to the parent.
> 
> Not sure, the child could easily inherit the allocation rate of the parent.
> So if the fork bomb spreads the last leaf in the process tree would
> easily get accounted for every kernel stack/fds/etc.. and userspace
> allocation done from its previous parents too.

I think the current solution is just fine. I tested it thoroughly with
different scenarios and was not able to break it. The ugly time, count
hacks are gone and I have the impression that it tries harder to swap
out stuff now. I also tested with swap off and it works without hassle.
There may be some remaining "surprise" hidden in the selection, but I'm
not sure if it is worth the time to optimize it further or even rewrite
it with some other builtin surprises.

tglx



[-- Attachment #2: oom-select.diff --]
[-- Type: text/x-patch, Size: 2262 bytes --]

diff -urN 2.6.10-rc2.orig/mm/oom_kill.c 2.6.10-rc2.oom/mm/oom_kill.c
--- 2.6.10-rc2.orig/mm/oom_kill.c	2004-12-05 16:36:12.000000000 +0100
+++ 2.6.10-rc2.oom/mm/oom_kill.c	2004-12-05 16:46:42.000000000 +0100
@@ -45,6 +45,7 @@
 static unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
+	struct list_head *tsk;
 
 	if (!p->mm)
 		return 0;
@@ -55,6 +56,19 @@
 	points = p->mm->total_vm;
 
 	/*
+	 * Processes which fork a lot of child processes are likely 
+	 * a good choice. We add the vmsize of the childs if they
+	 * have an own mm. This prevents forking servers to flood the
+	 * machine with an endless amount of childs
+	 */
+	list_for_each(tsk, &p->children) {
+		struct task_struct *chld;
+		chld = list_entry(tsk, struct task_struct, sibling);
+		if (chld->mm != p->mm && chld->mm)
+			points += chld->mm->total_vm;
+	}
+
+	/*
 	 * CPU time is in tens of seconds and run time is in thousands
          * of seconds. There is no particular reason for this other than
          * that it turned out to work very well in practice.
@@ -206,6 +220,23 @@
 	return mm;
 }
 
+static struct mm_struct *oom_kill_process(task_t *p)
+{
+ 	struct mm_struct *mm;
+	struct task_struct *c;
+	struct list_head *tsk;
+
+	/* Try to kill a child first */
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_task(c);
+		if (mm)
+			return mm;
+	}
+	return oom_kill_task(p);
+}
 
 /**
  * oom_kill - kill the "best" process when we run out of memory
@@ -236,7 +267,7 @@
 
 	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
 	show_free_areas();
-	mm = oom_kill_task(p);
+	mm = oom_kill_process(p);
 	if (!mm)
 		goto retry;
 
diff -urN 2.6.10-rc2.orig/mm/page_alloc.c 2.6.10-rc2.oom/mm/page_alloc.c
--- 2.6.10-rc2.orig/mm/page_alloc.c	2004-12-05 16:36:12.000000000 +0100
+++ 2.6.10-rc2.oom/mm/page_alloc.c	2004-12-05 16:42:38.000000000 +0100
@@ -613,9 +613,8 @@
 	int can_try_harder;
 	int did_some_progress;
 
-	if (wait)
-		cond_resched();
-
+	might_sleep_if(wait);
+	
 	/*
 	 * The caller may dip into page reserves a bit more if the caller
 	 * cannot run direct reclaim, or is the caller has realtime scheduling

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-05 14:55         ` Thomas Gleixner
@ 2004-12-05 15:34           ` Andrea Arcangeli
  2004-12-05 16:29             ` Thomas Gleixner
  0 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-05 15:34 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Voluspa, LKML

On Sun, Dec 05, 2004 at 03:55:07PM +0100, Thomas Gleixner wrote:
> Yes, the modification are not interfering with your patch. They just add
> the accounting of child processes to the selection.

Could you post your modifications on top of my patch so we can combine
them easily?

> That makes sense, but it does not catch processes forking a lot of
> childs, because the allocation rate is not accounted to the parent.

Not sure, the child could easily inherit the allocation rate of the parent.
So if the fork bomb spreads the last leaf in the process tree would
easily get accounted for every kernel stack/fds/etc.. and userspace
allocation done from its previous parents too.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-05  0:27       ` Andrea Arcangeli
@ 2004-12-05 14:55         ` Thomas Gleixner
  2004-12-05 15:34           ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-05 14:55 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Voluspa, LKML

On Sun, 2004-12-05 at 01:27 +0100, Andrea Arcangeli wrote:
> On Sat, Dec 04, 2004 at 10:02:03PM +0100, Thomas Gleixner wrote:
> > On Sat, 2004-12-04 at 19:33 +0100, Thomas Gleixner wrote:

> Ok, so some piece of code is buggy: somebody is using GFP_KERNEL instead
> of GFP_ATOMIC.  Reverting my change will only hide the real bug so I
> wouldn't recommend it (except for testing purposes).
> 
> Would be very nice to find the real bug.

Hmm, most of the allocations in early init have the GFP_WAIT bit set.
When I block the call to cond_resched() up to cpu_idle() in rest_init()
everything works. Enabling the call to cond_resched() at any place
before brings the problem back.

> So you mean there's a separate issue with the task selection right? I
> didn't touch the task selection at all.

I know.

> > Can you agree to add the selection patch, which takes the multi child
> > forking process into account ? I don't explain again why it makes
> > sense :)
> 
> I didn't recall that part of your patch, but it seems very orthogonal. I

Yes, the modification are not interfering with your patch. They just add
the accounting of child processes to the selection.

> didn't want to change the process selection at the same time. If I will
> touch the task selection I'll probably rewrite it from scratch to choose
> tasks only in function of the allocation rate, 

That makes sense, but it does not catch processes forking a lot of
childs, because the allocation rate is not accounted to the parent.

> sure not with anything
> similar to the current algorithm which is close to a DoS with big
> database servers if some other smaller app hits a memleak and allocates
> in a loop.

The comment above badness() is the best part of it, especially the
"least surprise part" :)

 * 5) we try to kill the process the user expects us to kill, this
 *    algorithm has been meticulously tuned to meet the principle
 *    of least surprise ... (be careful when you change it)

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
@ 2004-12-05  8:32 Voluspa
  0 siblings, 0 replies; 66+ messages in thread
From: Voluspa @ 2004-12-05  8:32 UTC (permalink / raw)
  To: andrea; +Cc: tglx, linux-kernel


I know... Said that I wouldn't come back unless there was a problem. But 
with these results! The patch is fantastic.

My problem app, blender, allocated all remaining physical memory and 360 
megs (of the 1 gig) swap but remained running. No oom-kill involved at 
all. This is a first with that app (in such a mode, with such a large 
working set of pictures). 500 1.2 meg uncompressed targa pictures 
animated in the graphical window of "Sequence Editor" on my system (256 
meg real mem). Wow.

The oom-kill/swap/memory handling/whatever must have been really sick.

Thank You!
Mats Johannesson (time to pay back by trying 2.6.10-rc3)


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
@ 2004-12-05  2:22 Voluspa
  0 siblings, 0 replies; 66+ messages in thread
From: Voluspa @ 2004-12-05  2:22 UTC (permalink / raw)
  To: tglx; +Cc: andrea, linux-kernel


On 2004-12-04 21:02:03 Thomas Gleixner wrote:

> Mats. I don't understand why this did not work for you. The change has
> to be reverted to the original line "might_sleep_if(wait)" !

Well, I'm no coder and therefore follow instructions to the letter, 
almost (; <-- ehum), which meant that "put back" became lines 613-615 in 
mm/page_alloc.c

might_sleep_if(wait);
if (wait)
        cond_resched();

Now interpreting it as the lines should be

might_sleep_if(wait);

/*

the kernel boots without problems. Have yet to test the oom-killer with 
my rogue app. Won't return unless there's problems.

Mvh
Mats Johannesson


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-04 21:02     ` Thomas Gleixner
@ 2004-12-05  0:27       ` Andrea Arcangeli
  2004-12-05 14:55         ` Thomas Gleixner
  0 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-05  0:27 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Voluspa, LKML

On Sat, Dec 04, 2004 at 10:02:03PM +0100, Thomas Gleixner wrote:
> On Sat, 2004-12-04 at 19:33 +0100, Thomas Gleixner wrote:
> >
> > I added some debug output and it calls __alloc_pages a couple of times.
> > All those calls get out from the first goto got_pg as expected.
> > 
> > I will try to add some more debug later
> > 
> 
> Your assumption that reverting the 
> 
> -       might_sleep_if(wait);
> +       if (wait)
> +               cond_resched();
> 
> change does solve the problem is correct. Looking at the diffs its the
> only change which can have any influence at this point.
> 
> Mats. I don't understand why this did not work for you. The change has
> to be reverted to the original line "might_sleep_if(wait)" !

Ok, so some piece of code is buggy: somebody is using GFP_KERNEL instead
of GFP_ATOMIC.  Reverting my change will only hide the real bug so I
wouldn't recommend it (except for testing purposes).

Would be very nice to find the real bug.

> It then works so far except that it kills the wrong process (sshd), but
> I did expect that from the previous experience. 
> 
> There is no multi kill or other strange things happening. I tested it
> with hackbench and the real application _after_ adding my "whom to kill
> patch" on top.
> 
> Looks much better now. 

So you mean there's a separate issue with the task selection right? I
didn't touch the task selection at all.

> Can you agree to add the selection patch, which takes the multi child
> forking process into account ? I don't explain again why it makes
> sense :)

I didn't recall that part of your patch, but it seems very orthogonal. I
didn't want to change the process selection at the same time. If I will
touch the task selection I'll probably rewrite it from scratch to choose
tasks only in function of the allocation rate, sure not with anything
similar to the current algorithm which is close to a DoS with big
database servers if some other smaller app hits a memleak and allocates
in a loop.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-04 18:33   ` Thomas Gleixner
@ 2004-12-04 21:02     ` Thomas Gleixner
  2004-12-05  0:27       ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-04 21:02 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Voluspa, LKML

On Sat, 2004-12-04 at 19:33 +0100, Thomas Gleixner wrote:
>
> I added some debug output and it calls __alloc_pages a couple of times.
> All those calls get out from the first goto got_pg as expected.
> 
> I will try to add some more debug later
> 

Your assumption that reverting the 

-       might_sleep_if(wait);
+       if (wait)
+               cond_resched();

change does solve the problem is correct. Looking at the diffs its the
only change which can have any influence at this point.

Mats. I don't understand why this did not work for you. The change has
to be reverted to the original line "might_sleep_if(wait)" !

Scheduling in this init stage causes the breakage. 
might_sleep_if() is a nop and only does a state check when compiled with
CONFIG_DEBUG_SPINLOCK_SLEEP=y, but it does neither sleep nor schedule. 

It then works so far except that it kills the wrong process (sshd), but
I did expect that from the previous experience. 

There is no multi kill or other strange things happening. I tested it
with hackbench and the real application _after_ adding my "whom to kill
patch" on top.

Looks much better now. 

Can you agree to add the selection patch, which takes the multi child
forking process into account ? I don't explain again why it makes
sense :)

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-04 16:43 ` Andrea Arcangeli
@ 2004-12-04 18:33   ` Thomas Gleixner
  2004-12-04 21:02     ` Thomas Gleixner
  0 siblings, 1 reply; 66+ messages in thread
From: Thomas Gleixner @ 2004-12-04 18:33 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Voluspa, LKML

On Sat, 2004-12-04 at 17:43 +0100, Andrea Arcangeli wrote:
> On Sat, Dec 04, 2004 at 01:42:54PM +0100, Voluspa wrote:
>
> If try_to_free_pages is being recalled during boot them we've a problem
> somewhere else, it should never happen!
> 
> Plus it works like a charm here.
> 
> Can you send me your .config so that I will try to send you privately a
> kernel image built on my machine? (and before sending I'll try to boot
> it locally ;) My .config sure is happily running.
> 

You want my .config too ? :)

I tried again from scratch and the kernel is booting without your patch.
Adding your patch with the same config still does not boot. It does not
depend on PREEMPT=y/n.

I added some debug output and it calls __alloc_pages a couple of times.
All those calls get out from the first goto got_pg as expected.

I will try to add some more debug later

tglx



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-04 12:42 Voluspa
@ 2004-12-04 16:43 ` Andrea Arcangeli
  2004-12-04 18:33   ` Thomas Gleixner
  0 siblings, 1 reply; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-04 16:43 UTC (permalink / raw)
  To: Voluspa; +Cc: linux-kernel

On Sat, Dec 04, 2004 at 01:42:54PM +0100, Voluspa wrote:
> 
> On 2004-12-04 8:08:40 Andrea Arcangeli wrote:
> 
> > You can try to put back a might_slee_if(wait), but if it deadlocks 
> with
> > that change sure it's not a bug in my patch, it's instead a bug
> > somewhere else that calls alloc_pages w/o GFP_ATOMIC. Ingo's
> > lowlatency patch would expose the same bug too since they're aliasing
> > the might_sleep to cond_resched.
> 
> Putting it back doesn't alter the outcome - hanging. And the original 
> patch, (hope it was the right one) from:
> 
> http://marc.theaimsgroup.com/?l=linux-kernel&m=110204117506557&w=2

yes it's the right one ;)

> root:loke:/usr/src/linux-2.6.9-oomkill# patch -Np1 -i ../oomkill.patch 
> patching file mm/oom_kill.c
> patching file mm/page_alloc.c
> Hunk #1 succeeded at 608 (offset -3 lines).
> Hunk #3 succeeded at 681 (offset -3 lines).
> patching file mm/swap_state.c
> patching file mm/vmscan.c
> 
> has been tried with the following variations. With and without 
> optimizing for size, with and without preempt, with and without kernel 
> boot params (cfq, lapic), cold and hot starts, and then I threw in a smp 
> compile for measure. All have the same behaviour:
> 
> [...]
> Checking 'hlt' instruction... OK.
> 
> [10 minutes wait. Then a long callback trace
>  scrolls off the screen ending like Thomas']
> 
> <0>Kernel panic - not syncing: Fatal exception in interrupt
> 
> My toolchain (well, the whole software system) is quite contemporary 
> within the stable branches. Built from scratch with gcc-3.4.3, glibc-
> 20041011 (nptl) and binutils-2.15.92.0.2
> 
> No energy control, acpi-pm or whatever it's called, is used here. The 
> machine is extremely stable. Running with 100 percent utilization 24/7.
> 
> Don't shoot the messenger ;)

I trust you of course but I've absolutely no idea how can my patch ever
change any code that runs at that point during boot. mm/oom_kill.c can
be obviously ruled out. The changes in mm/swap_state.c (two printk in
show_swap_cache_info) as well can be obviously ruled out. The change in
mm/vmscan.c as well only makes a difference during an oom condition.

This mean it has to be the change in mm/page_alloc.c that broke
something. But even that should never run during boot (except for the
cond_resched instead of might_sleep_if that you already tried to backout
separately from the rest). There's simply not enough memory pressure at
boot in order to recall try_to_free_pages and run the modified code.

If try_to_free_pages is being recalled during boot them we've a problem
somewhere else, it should never happen!

Plus it works like a charm here.

Can you send me your .config so that I will try to send you privately a
kernel image built on my machine? (and before sending I'll try to boot
it locally ;) My .config sure is happily running.

Many thanks.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
@ 2004-12-04 12:42 Voluspa
  2004-12-04 16:43 ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: Voluspa @ 2004-12-04 12:42 UTC (permalink / raw)
  To: andrea; +Cc: linux-kernel


On 2004-12-04 8:08:40 Andrea Arcangeli wrote:

> You can try to put back a might_slee_if(wait), but if it deadlocks 
with
> that change sure it's not a bug in my patch, it's instead a bug
> somewhere else that calls alloc_pages w/o GFP_ATOMIC. Ingo's
> lowlatency patch would expose the same bug too since they're aliasing
> the might_sleep to cond_resched.

Putting it back doesn't alter the outcome - hanging. And the original 
patch, (hope it was the right one) from:

http://marc.theaimsgroup.com/?l=linux-kernel&m=110204117506557&w=2

root:loke:/usr/src/linux-2.6.9-oomkill# patch -Np1 -i ../oomkill.patch 
patching file mm/oom_kill.c
patching file mm/page_alloc.c
Hunk #1 succeeded at 608 (offset -3 lines).
Hunk #3 succeeded at 681 (offset -3 lines).
patching file mm/swap_state.c
patching file mm/vmscan.c

has been tried with the following variations. With and without 
optimizing for size, with and without preempt, with and without kernel 
boot params (cfq, lapic), cold and hot starts, and then I threw in a smp 
compile for measure. All have the same behaviour:

[...]
Checking 'hlt' instruction... OK.

[10 minutes wait. Then a long callback trace
 scrolls off the screen ending like Thomas']

<0>Kernel panic - not syncing: Fatal exception in interrupt

My toolchain (well, the whole software system) is quite contemporary 
within the stable branches. Built from scratch with gcc-3.4.3, glibc-
20041011 (nptl) and binutils-2.15.92.0.2

No energy control, acpi-pm or whatever it's called, is used here. The 
machine is extremely stable. Running with 100 percent utilization 24/7.

Don't shoot the messenger ;)
Mats Johannesson


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
  2004-12-04  7:00 Voluspa
@ 2004-12-04  8:08 ` Andrea Arcangeli
  0 siblings, 0 replies; 66+ messages in thread
From: Andrea Arcangeli @ 2004-12-04  8:08 UTC (permalink / raw)
  To: Voluspa; +Cc: linux-kernel

On Sat, Dec 04, 2004 at 08:00:04AM +0100, Voluspa wrote:
> 
> On 2004-12-03 23:08:55 Andrea Arcangeli wrote:
> 
> >You mean my patch is preventing your machine to boot? Then you're doing
> >something else wrong because it's impossible my patch is preventing 
> >your machine to boot.
> 
> Same experience as Thomas here. Full stop like his first log (no errors)
> . PIII (Celeron) 900@1 gig, 256 meg mem, 1 gig swap, preempt enabled.

The places I modified are never invoked during boot (except the below
one). It works like a charm here. I tried again building on top of
2.6.10-rc3 and it works again just fine here (previously I was on top of
the kernel CVS out of sync currently). No idea what's preventing you to
boot, but it's very hard to believe that my patch is to blame for it.

The only modified piece of code that may run during boot is this:

-       might_sleep_if(wait);
+       if (wait)
+               cond_resched();


You can try to put back a might_slee_if(wait), but if it deadlocks with
that change sure it's not a bug in my patch, it's instead a bug
somewhere else that calls alloc_pages w/o GFP_ATOMIC. Ingo's
lowlatency patch would expose the same bug too since they're aliasing
the might_sleep to cond_resched.

But I can't reproduce anything wrong here.

Enabling unmasked SIMD FPU exception support... done.
Checking 'hlt' instruction... OK.
 tbxface-0118 [02] acpi_load_tables      : ACPI Tables successfully
acquired
Parsing all Control
Methods:...........................................................................................................................................
Table [DSDT](id F004) - 502 Objects with 45 Devices 139 Methods 29
Regions
ACPI Namespace successfully loaded at root c0503d40
evxfevnt-0094 [03] acpi_enable           : Transition to ACPI mode
successful
CPU0: Intel(R) XEON(TM) CPU 2.40GHz stepping 04
per-CPU timeslice cutoff: 1462.70 usecs.
[..]

machine boots and run just fine and the patch is definitely applied.

PREEMPT=n but it can't be a bug in my patch even if PREEMPT=y breaks.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
@ 2004-12-04  7:00 Voluspa
  2004-12-04  8:08 ` Andrea Arcangeli
  0 siblings, 1 reply; 66+ messages in thread
From: Voluspa @ 2004-12-04  7:00 UTC (permalink / raw)
  To: andrea; +Cc: linux-kernel


On 2004-12-03 23:08:55 Andrea Arcangeli wrote:

>You mean my patch is preventing your machine to boot? Then you're doing
>something else wrong because it's impossible my patch is preventing 
>your machine to boot.

Same experience as Thomas here. Full stop like his first log (no errors)
. PIII (Celeron) 900@1 gig, 256 meg mem, 1 gig swap, preempt enabled.

Tried your patch since the oom killer slaughtered a very important app 
here when another one ran amok. Not fork spawnings, just ram-eating. Was 
blender (3d renderer) in "Sequence Editor" mode when i hit alt-a (for 
animate) on a pretty large set of stills. Eventually blender got killed 
also, twice...

Kernel 2.6.9 with nick p-s? patch for the buggy kswapd (100 percent cpu, 
without using any swap).

Mvh
Mats Johannesson


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] oom killer (Core)
@ 2004-12-01 10:21 tvrtko.ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: tvrtko.ursulin @ 2004-12-01 10:21 UTC (permalink / raw)
  To: tglx; +Cc: akpm, andrea, linux-kernel, marcelo.tosatti

On 01/12/2004 09:49:03 linux-kernel-owner wrote:

>The oom killer has currently some strange effects when triggered.
>It gets invoked multiple times and the selection of the task to kill
>does not take processes into account which fork a lot of child processes.
>
>The patch solves this by
>- Preventing reentrancy
>- Checking for memory threshold before selection and kill.
>- Taking child processes into account when selecting the process to kill

Ah, again. :) Rusty Lynch and me tried something similar at leat twice but 
with no avail. 

We had a modular OOM killers infrastructure with two new killers. One 
would just panic on OOM situation, and other did account for parents which 
repeatedly spawned 'bad' children. Some people called it 'a bit right 
winged'. :)

Take a look at http://linux.ursulin.net if you are interested. I did not 
sync it with the latest kernel since nobody was interested. It worked for 
me. 


^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, other threads:[~2004-12-24  1:19 UTC | newest]

Thread overview: 66+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-12-01  9:49 [PATCH] oom killer (Core) tglx
2004-12-01 21:16 ` Andrea Arcangeli
2004-12-01 22:06   ` Thomas Gleixner
2004-12-01 22:33     ` Andrea Arcangeli
2004-12-02  3:36     ` Andrea Arcangeli
2004-12-02 11:09       ` Thomas Gleixner
2004-12-02 13:48         ` Thomas Gleixner
2004-12-02 16:47           ` Andrea Arcangeli
2004-12-02 16:55             ` Andrew Morton
2004-12-02 11:18               ` Marcelo Tosatti
2004-12-02 17:17               ` Thomas Gleixner
2004-12-02 17:27                 ` Andrew Morton
2004-12-02 18:08               ` Andrea Arcangeli
2004-12-02 18:29                 ` Andrew Morton
2004-12-02 19:01                   ` Thomas Gleixner
2004-12-02 18:55                 ` Thomas Gleixner
2004-12-02 19:07                   ` Andrew Morton
2004-12-02 19:08                     ` Thomas Gleixner
2004-12-02 19:22                       ` Andrew Morton
2004-12-02 19:24                         ` Thomas Gleixner
2004-12-02 20:11                           ` Andre Tomt
2004-12-03 22:45                             ` Thomas Gleixner
2004-12-02 23:47                           ` Andrea Arcangeli
2004-12-03 14:41                           ` Helge Hafting
2004-12-03 21:20                             ` Thomas Gleixner
2004-12-05 21:14                               ` Helge Hafting
2004-12-02 23:35                   ` Andrea Arcangeli
2004-12-03  2:28                     ` Andrea Arcangeli
2004-12-03 22:37                       ` Thomas Gleixner
2004-12-03 22:51                         ` Thomas Gleixner
2004-12-03 23:08                           ` Andrea Arcangeli
2004-12-10 16:36                       ` William Lee Irwin III
2004-12-10 17:35                         ` Andrea Arcangeli
2004-12-10 17:43                           ` William Lee Irwin III
2004-12-10 17:55                             ` Andrea Arcangeli
2004-12-10 18:00                               ` William Lee Irwin III
2004-12-10 18:15                                 ` Andrea Arcangeli
2004-12-10 18:19                                   ` William Lee Irwin III
2004-12-10 19:05                                     ` Andrea Arcangeli
2004-12-10 16:51                       ` William Lee Irwin III
2004-12-03 21:10                     ` Thomas Gleixner
2004-12-03 22:21                       ` Andrea Arcangeli
2004-12-05  2:52 ` William Lee Irwin III
2004-12-05 13:38   ` Thomas Gleixner
2004-12-05 15:22     ` Andrea Arcangeli
2004-12-10 16:32 ` William Lee Irwin III
2004-12-10 16:52   ` Thomas Gleixner
2004-12-10 17:43     ` William Lee Irwin III
2004-12-10 17:47     ` William Lee Irwin III
2004-12-10 17:49     ` Andrea Arcangeli
2004-12-10 17:57       ` William Lee Irwin III
2004-12-12  0:12         ` William Lee Irwin III
2004-12-24  1:18     ` Andrea Arcangeli
2004-12-01 10:21 tvrtko.ursulin
2004-12-04  7:00 Voluspa
2004-12-04  8:08 ` Andrea Arcangeli
2004-12-04 12:42 Voluspa
2004-12-04 16:43 ` Andrea Arcangeli
2004-12-04 18:33   ` Thomas Gleixner
2004-12-04 21:02     ` Thomas Gleixner
2004-12-05  0:27       ` Andrea Arcangeli
2004-12-05 14:55         ` Thomas Gleixner
2004-12-05 15:34           ` Andrea Arcangeli
2004-12-05 16:29             ` Thomas Gleixner
2004-12-05  2:22 Voluspa
2004-12-05  8:32 Voluspa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).