All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-21  5:59 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-21  5:59 UTC (permalink / raw)
  To: linux-mm; +Cc: akpm, rientjes, minchan.kim, linux-kernel

A patch for avoiding oom-serial-killer at lowmem shortage.
Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
Tested on x86-64/SMP + debug module(to allocated lowmem), works well.

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

One cause of OOM-Killer is memory shortage in lower zones.
(If memory is enough, lowmem_reserve_ratio works well. but..)

In lowmem-shortage oom-kill, oom-killer choses a vicitim process
on their vm size. But this kills a process which has lowmem memory
only if it's lucky. At last, there will be an oom-serial-killer.

Now, we have per-mm lowmem usage counter. We can make use of it
to select a good? victim.

This patch does
  - add CONSTRAINT_LOWMEM to oom's constraint type.
  - pass constraint to __badness()
  - change calculation based on constraint. If CONSTRAINT_LOWMEM,
    use low_rss instead of vmsize.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/oom.h |    1 
 mm/oom_kill.c       |   69 +++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 48 insertions(+), 22 deletions(-)

Index: mmotm-2.6.33-Jan15/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15/include/linux/oom.h
@@ -20,6 +20,7 @@ struct notifier_block;
  */
 enum oom_constraint {
 	CONSTRAINT_NONE,
+	CONSTRAINT_LOWMEM,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
 };
Index: mmotm-2.6.33-Jan15/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Jan15/mm/oom_kill.c
@@ -55,6 +55,7 @@ static int has_intersects_mems_allowed(s
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
+ * @constraint: context of badness calculation.
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -70,7 +71,8 @@ static int has_intersects_mems_allowed(s
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+			int constraint)
 {
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
@@ -89,11 +91,16 @@ unsigned long badness(struct task_struct
 		task_unlock(p);
 		return 0;
 	}
-
-	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = mm->total_vm;
+	switch  (constraint) {
+	case CONSTRAINT_LOWMEM:
+		/* use lowmem usage as the basis for the badness */
+		points = get_low_rss(mm);
+		break;
+	default:
+		/* use virtual memory size as the basis for the badness */
+		points = mm->total_vm;
+		break;
+	}
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -113,12 +120,16 @@ unsigned long badness(struct task_struct
 	 * machine with an endless amount of children. In case a single
 	 * child is eating the vast majority of memory, adding only half
 	 * to the parents will make the child our kill candidate of choice.
+	 *
+	 * At lowmem shortage, ignore this part.
 	 */
-	list_for_each_entry(child, &p->children, sibling) {
-		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
-		task_unlock(child);
+	if (constraint != CONSTRAINT_LOWMEM) {
+		list_for_each_entry(child, &p->children, sibling) {
+			task_lock(child);
+			if (child->mm != mm && child->mm)
+				points += child->mm->total_vm/2 + 1;
+			task_unlock(child);
+		}
 	}
 
 	/*
@@ -212,6 +223,9 @@ static enum oom_constraint constrained_a
 	if (gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
+	if (high_zoneidx <= lowmem_zone)
+		return CONSTRAINT_LOWMEM;
+
 	/*
 	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
 	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
@@ -233,6 +247,10 @@ static enum oom_constraint constrained_a
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int zone_idx = gfp_zone(gfp_mask);
+
+	if (zone_idx <= lowmem_zone)
+		return CONSTRAINT_LOWMEM;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -244,7 +262,7 @@ static enum oom_constraint constrained_a
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+				struct mem_cgroup *mem, int constraint)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -300,7 +318,7 @@ static struct task_struct *select_bad_pr
 		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = badness(p, uptime.tv_sec, constraint);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -455,7 +473,7 @@ static int oom_kill_process(struct task_
 	}
 
 	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, task_pid_nr(p), p->comm, points);
+				message, task_pid_nr(p), p->comm, points);
 
 	/* Try to kill a child first */
 	list_for_each_entry(c, &p->children, sibling) {
@@ -475,7 +493,7 @@ void mem_cgroup_out_of_memory(struct mem
 
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -557,7 +575,7 @@ void clear_zonelist_oom(struct zonelist 
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int constraint)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -571,7 +589,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -583,9 +601,16 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+	switch (constraint) {
+	case CONSTRAINT_LOWMEM:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory (in lowmem)"))
+			goto retry;
+	default:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
 			     "Out of memory"))
-		goto retry;
+			goto retry;
+	}
 }
 
 /*
@@ -612,7 +637,7 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE); /* unknown gfp_mask and order */
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -663,7 +688,7 @@ void out_of_memory(struct zonelist *zone
 		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
-
+	case CONSTRAINT_LOWMEM:
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
 			dump_header(NULL, gfp_mask, order, NULL);
@@ -671,7 +696,7 @@ void out_of_memory(struct zonelist *zone
 		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
+		__out_of_memory(gfp_mask, order, constraint);
 		break;
 	}
 


^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-21  5:59 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-21  5:59 UTC (permalink / raw)
  To: linux-mm; +Cc: akpm, rientjes, minchan.kim, linux-kernel

A patch for avoiding oom-serial-killer at lowmem shortage.
Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
Tested on x86-64/SMP + debug module(to allocated lowmem), works well.

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

One cause of OOM-Killer is memory shortage in lower zones.
(If memory is enough, lowmem_reserve_ratio works well. but..)

In lowmem-shortage oom-kill, oom-killer choses a vicitim process
on their vm size. But this kills a process which has lowmem memory
only if it's lucky. At last, there will be an oom-serial-killer.

Now, we have per-mm lowmem usage counter. We can make use of it
to select a good? victim.

This patch does
  - add CONSTRAINT_LOWMEM to oom's constraint type.
  - pass constraint to __badness()
  - change calculation based on constraint. If CONSTRAINT_LOWMEM,
    use low_rss instead of vmsize.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/oom.h |    1 
 mm/oom_kill.c       |   69 +++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 48 insertions(+), 22 deletions(-)

Index: mmotm-2.6.33-Jan15/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15/include/linux/oom.h
@@ -20,6 +20,7 @@ struct notifier_block;
  */
 enum oom_constraint {
 	CONSTRAINT_NONE,
+	CONSTRAINT_LOWMEM,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
 };
Index: mmotm-2.6.33-Jan15/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Jan15/mm/oom_kill.c
@@ -55,6 +55,7 @@ static int has_intersects_mems_allowed(s
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
+ * @constraint: context of badness calculation.
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -70,7 +71,8 @@ static int has_intersects_mems_allowed(s
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+			int constraint)
 {
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
@@ -89,11 +91,16 @@ unsigned long badness(struct task_struct
 		task_unlock(p);
 		return 0;
 	}
-
-	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = mm->total_vm;
+	switch  (constraint) {
+	case CONSTRAINT_LOWMEM:
+		/* use lowmem usage as the basis for the badness */
+		points = get_low_rss(mm);
+		break;
+	default:
+		/* use virtual memory size as the basis for the badness */
+		points = mm->total_vm;
+		break;
+	}
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -113,12 +120,16 @@ unsigned long badness(struct task_struct
 	 * machine with an endless amount of children. In case a single
 	 * child is eating the vast majority of memory, adding only half
 	 * to the parents will make the child our kill candidate of choice.
+	 *
+	 * At lowmem shortage, ignore this part.
 	 */
-	list_for_each_entry(child, &p->children, sibling) {
-		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
-		task_unlock(child);
+	if (constraint != CONSTRAINT_LOWMEM) {
+		list_for_each_entry(child, &p->children, sibling) {
+			task_lock(child);
+			if (child->mm != mm && child->mm)
+				points += child->mm->total_vm/2 + 1;
+			task_unlock(child);
+		}
 	}
 
 	/*
@@ -212,6 +223,9 @@ static enum oom_constraint constrained_a
 	if (gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
+	if (high_zoneidx <= lowmem_zone)
+		return CONSTRAINT_LOWMEM;
+
 	/*
 	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
 	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
@@ -233,6 +247,10 @@ static enum oom_constraint constrained_a
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int zone_idx = gfp_zone(gfp_mask);
+
+	if (zone_idx <= lowmem_zone)
+		return CONSTRAINT_LOWMEM;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -244,7 +262,7 @@ static enum oom_constraint constrained_a
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+				struct mem_cgroup *mem, int constraint)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -300,7 +318,7 @@ static struct task_struct *select_bad_pr
 		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = badness(p, uptime.tv_sec, constraint);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -455,7 +473,7 @@ static int oom_kill_process(struct task_
 	}
 
 	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, task_pid_nr(p), p->comm, points);
+				message, task_pid_nr(p), p->comm, points);
 
 	/* Try to kill a child first */
 	list_for_each_entry(c, &p->children, sibling) {
@@ -475,7 +493,7 @@ void mem_cgroup_out_of_memory(struct mem
 
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -557,7 +575,7 @@ void clear_zonelist_oom(struct zonelist 
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int constraint)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -571,7 +589,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -583,9 +601,16 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+	switch (constraint) {
+	case CONSTRAINT_LOWMEM:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory (in lowmem)"))
+			goto retry;
+	default:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
 			     "Out of memory"))
-		goto retry;
+			goto retry;
+	}
 }
 
 /*
@@ -612,7 +637,7 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE); /* unknown gfp_mask and order */
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -663,7 +688,7 @@ void out_of_memory(struct zonelist *zone
 		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
-
+	case CONSTRAINT_LOWMEM:
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
 			dump_header(NULL, gfp_mask, order, NULL);
@@ -671,7 +696,7 @@ void out_of_memory(struct zonelist *zone
 		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
+		__out_of_memory(gfp_mask, order, constraint);
 		break;
 	}
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
  2010-01-21  5:59 ` KAMEZAWA Hiroyuki
@ 2010-01-21 15:18   ` Minchan Kim
  -1 siblings, 0 replies; 75+ messages in thread
From: Minchan Kim @ 2010-01-21 15:18 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, linux-kernel

Hi, Kame. 

On Thu, 2010-01-21 at 14:59 +0900, KAMEZAWA Hiroyuki wrote:
> A patch for avoiding oom-serial-killer at lowmem shortage.
> Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)
> 
> In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> on their vm size. But this kills a process which has lowmem memory
> only if it's lucky. At last, there will be an oom-serial-killer.
> 
> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good? victim.
> 
> This patch does
>   - add CONSTRAINT_LOWMEM to oom's constraint type.
>   - pass constraint to __badness()
>   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>     use low_rss instead of vmsize.

As far as low memory, it would be better to consider lowmem counter.
But as you know, {vmsize VS rss} is debatable topic.
Maybe someone doesn't like this idea. 

So don't we need any test result at least?
If we don't have this patch, it happens several innocent process
killing. but we can't prevent it by this patch. 

Sorry for bothering you.

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-21 15:18   ` Minchan Kim
  0 siblings, 0 replies; 75+ messages in thread
From: Minchan Kim @ 2010-01-21 15:18 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, linux-kernel

Hi, Kame. 

On Thu, 2010-01-21 at 14:59 +0900, KAMEZAWA Hiroyuki wrote:
> A patch for avoiding oom-serial-killer at lowmem shortage.
> Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)
> 
> In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> on their vm size. But this kills a process which has lowmem memory
> only if it's lucky. At last, there will be an oom-serial-killer.
> 
> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good? victim.
> 
> This patch does
>   - add CONSTRAINT_LOWMEM to oom's constraint type.
>   - pass constraint to __badness()
>   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>     use low_rss instead of vmsize.

As far as low memory, it would be better to consider lowmem counter.
But as you know, {vmsize VS rss} is debatable topic.
Maybe someone doesn't like this idea. 

So don't we need any test result at least?
If we don't have this patch, it happens several innocent process
killing. but we can't prevent it by this patch. 

Sorry for bothering you.

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
  2010-01-21  5:59 ` KAMEZAWA Hiroyuki
@ 2010-01-21 15:29   ` Balbir Singh
  -1 siblings, 0 replies; 75+ messages in thread
From: Balbir Singh @ 2010-01-21 15:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, minchan.kim, linux-kernel

On Thursday 21 January 2010 11:29 AM, KAMEZAWA Hiroyuki wrote:
> A patch for avoiding oom-serial-killer at lowmem shortage.
> Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)
> 
> In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> on their vm size. But this kills a process which has lowmem memory
> only if it's lucky. At last, there will be an oom-serial-killer.
> 
> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good? victim.

Have you seen any use cases that need this change? Or is it mostly via
code review and to utilize the availability of lowmem rss? Do we often
run into lowmem shortage triggering OOM?


-- 
Three Cheers,
Balbir Singh

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-21 15:29   ` Balbir Singh
  0 siblings, 0 replies; 75+ messages in thread
From: Balbir Singh @ 2010-01-21 15:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, minchan.kim, linux-kernel

On Thursday 21 January 2010 11:29 AM, KAMEZAWA Hiroyuki wrote:
> A patch for avoiding oom-serial-killer at lowmem shortage.
> Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)
> 
> In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> on their vm size. But this kills a process which has lowmem memory
> only if it's lucky. At last, there will be an oom-serial-killer.
> 
> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good? victim.

Have you seen any use cases that need this change? Or is it mostly via
code review and to utilize the availability of lowmem rss? Do we often
run into lowmem shortage triggering OOM?


-- 
Three Cheers,
Balbir Singh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
  2010-01-21 15:18   ` Minchan Kim
@ 2010-01-21 23:48     ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-21 23:48 UTC (permalink / raw)
  To: Minchan Kim; +Cc: linux-mm, akpm, rientjes, linux-kernel

On Fri, 22 Jan 2010 00:18:44 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> Hi, Kame. 
> 
> On Thu, 2010-01-21 at 14:59 +0900, KAMEZAWA Hiroyuki wrote:
> > A patch for avoiding oom-serial-killer at lowmem shortage.
> > Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> > Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> > 
> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > One cause of OOM-Killer is memory shortage in lower zones.
> > (If memory is enough, lowmem_reserve_ratio works well. but..)
> > 
> > In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> > on their vm size. But this kills a process which has lowmem memory
> > only if it's lucky. At last, there will be an oom-serial-killer.
> > 
> > Now, we have per-mm lowmem usage counter. We can make use of it
> > to select a good? victim.
> > 
> > This patch does
> >   - add CONSTRAINT_LOWMEM to oom's constraint type.
> >   - pass constraint to __badness()
> >   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
> >     use low_rss instead of vmsize.
> 
> As far as low memory, it would be better to consider lowmem counter.
> But as you know, {vmsize VS rss} is debatable topic.
> Maybe someone doesn't like this idea. 
> 
About lowmem, vmsize never work well.

> So don't we need any test result at least?
My test result was very artificial, so I didn't attach the result.

 - Before this patch, sshd was killed at first.
 - After this patch, memory consumer of low-rss was killed.

> If we don't have this patch, it happens several innocent process
> killing. but we can't prevent it by this patch. 
> 
I can't catch what you mean.

> Sorry for bothering you.
> 

Hmm, boot option or CONFIG ? (CONFIG_OOMKILLER_EXTENSION ?)

I'm now writing fork-bomb detector again and want to remove current
"gathering child's vm_size" heuristics. I'd like to put that under
the same config, too.

Thanks,
-Kame







^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-21 23:48     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-21 23:48 UTC (permalink / raw)
  To: Minchan Kim; +Cc: linux-mm, akpm, rientjes, linux-kernel

On Fri, 22 Jan 2010 00:18:44 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> Hi, Kame. 
> 
> On Thu, 2010-01-21 at 14:59 +0900, KAMEZAWA Hiroyuki wrote:
> > A patch for avoiding oom-serial-killer at lowmem shortage.
> > Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> > Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> > 
> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > One cause of OOM-Killer is memory shortage in lower zones.
> > (If memory is enough, lowmem_reserve_ratio works well. but..)
> > 
> > In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> > on their vm size. But this kills a process which has lowmem memory
> > only if it's lucky. At last, there will be an oom-serial-killer.
> > 
> > Now, we have per-mm lowmem usage counter. We can make use of it
> > to select a good? victim.
> > 
> > This patch does
> >   - add CONSTRAINT_LOWMEM to oom's constraint type.
> >   - pass constraint to __badness()
> >   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
> >     use low_rss instead of vmsize.
> 
> As far as low memory, it would be better to consider lowmem counter.
> But as you know, {vmsize VS rss} is debatable topic.
> Maybe someone doesn't like this idea. 
> 
About lowmem, vmsize never work well.

> So don't we need any test result at least?
My test result was very artificial, so I didn't attach the result.

 - Before this patch, sshd was killed at first.
 - After this patch, memory consumer of low-rss was killed.

> If we don't have this patch, it happens several innocent process
> killing. but we can't prevent it by this patch. 
> 
I can't catch what you mean.

> Sorry for bothering you.
> 

Hmm, boot option or CONFIG ? (CONFIG_OOMKILLER_EXTENSION ?)

I'm now writing fork-bomb detector again and want to remove current
"gathering child's vm_size" heuristics. I'd like to put that under
the same config, too.

Thanks,
-Kame






--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
  2010-01-21 15:29   ` Balbir Singh
@ 2010-01-21 23:54     ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-21 23:54 UTC (permalink / raw)
  To: balbir; +Cc: linux-mm, akpm, rientjes, minchan.kim, linux-kernel

On Thu, 21 Jan 2010 20:59:48 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> On Thursday 21 January 2010 11:29 AM, KAMEZAWA Hiroyuki wrote:
> > A patch for avoiding oom-serial-killer at lowmem shortage.
> > Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> > Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> > 
> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > One cause of OOM-Killer is memory shortage in lower zones.
> > (If memory is enough, lowmem_reserve_ratio works well. but..)
> > 
> > In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> > on their vm size. But this kills a process which has lowmem memory
> > only if it's lucky. At last, there will be an oom-serial-killer.
> > 
> > Now, we have per-mm lowmem usage counter. We can make use of it
> > to select a good? victim.
> 
> Have you seen any use cases that need this change? Or is it mostly via
> code review and to utilize the availability of lowmem rss? Do we often
> run into lowmem shortage triggering OOM?
> 
  - I saw lowmem OOM killer very frequently on x86-32 box in my cusotmers.
  - I saw lowmem OOM killer somemtimes on ia64 box in my customers.

I know this helps oom handling in x86-32+Highmem environments.

For my _new_ customers, almost all devices are connected to 64bit PCI bus.
So, this is not for my customers ;) But OOM-Killer should handle this case.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-21 23:54     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-21 23:54 UTC (permalink / raw)
  To: balbir; +Cc: linux-mm, akpm, rientjes, minchan.kim, linux-kernel

On Thu, 21 Jan 2010 20:59:48 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> On Thursday 21 January 2010 11:29 AM, KAMEZAWA Hiroyuki wrote:
> > A patch for avoiding oom-serial-killer at lowmem shortage.
> > Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> > Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> > 
> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > One cause of OOM-Killer is memory shortage in lower zones.
> > (If memory is enough, lowmem_reserve_ratio works well. but..)
> > 
> > In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> > on their vm size. But this kills a process which has lowmem memory
> > only if it's lucky. At last, there will be an oom-serial-killer.
> > 
> > Now, we have per-mm lowmem usage counter. We can make use of it
> > to select a good? victim.
> 
> Have you seen any use cases that need this change? Or is it mostly via
> code review and to utilize the availability of lowmem rss? Do we often
> run into lowmem shortage triggering OOM?
> 
  - I saw lowmem OOM killer very frequently on x86-32 box in my cusotmers.
  - I saw lowmem OOM killer somemtimes on ia64 box in my customers.

I know this helps oom handling in x86-32+Highmem environments.

For my _new_ customers, almost all devices are connected to 64bit PCI bus.
So, this is not for my customers ;) But OOM-Killer should handle this case.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
  2010-01-21 23:48     ` KAMEZAWA Hiroyuki
@ 2010-01-22  0:40       ` Minchan Kim
  -1 siblings, 0 replies; 75+ messages in thread
From: Minchan Kim @ 2010-01-22  0:40 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, linux-kernel

On Fri, Jan 22, 2010 at 8:48 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Fri, 22 Jan 2010 00:18:44 +0900
> Minchan Kim <minchan.kim@gmail.com> wrote:
>
>> Hi, Kame.
>>
>> On Thu, 2010-01-21 at 14:59 +0900, KAMEZAWA Hiroyuki wrote:
>> > A patch for avoiding oom-serial-killer at lowmem shortage.
>> > Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
>> > Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
>> >
>> > ==
>> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>> >
>> > One cause of OOM-Killer is memory shortage in lower zones.
>> > (If memory is enough, lowmem_reserve_ratio works well. but..)
>> >
>> > In lowmem-shortage oom-kill, oom-killer choses a vicitim process
>> > on their vm size. But this kills a process which has lowmem memory
>> > only if it's lucky. At last, there will be an oom-serial-killer.
>> >
>> > Now, we have per-mm lowmem usage counter. We can make use of it
>> > to select a good? victim.
>> >
>> > This patch does
>> >   - add CONSTRAINT_LOWMEM to oom's constraint type.
>> >   - pass constraint to __badness()
>> >   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>> >     use low_rss instead of vmsize.
>>
>> As far as low memory, it would be better to consider lowmem counter.
>> But as you know, {vmsize VS rss} is debatable topic.
>> Maybe someone doesn't like this idea.
>>
> About lowmem, vmsize never work well.
>

Tend to agree with you.
I am just worried about "vmsize lovers".

You removed considering vmsize totally.
In case of LOWMEM, lowcount considering make sense.
But never considering vmsize might be debatable.

So personllay, I thouhg we could add more weight lowcount
in case of LOWMEM. But I chaged my mind.
I think it make OOM heurisic more complated without big benefit.

Simple is best.

>> So don't we need any test result at least?
> My test result was very artificial, so I didn't attach the result.
>
>  - Before this patch, sshd was killed at first.
>  - After this patch, memory consumer of low-rss was killed.

Okay. You already anwsered my question by Balbir's reply.
I had a question it's real problem and how often it happens.

>
>> If we don't have this patch, it happens several innocent process
>> killing. but we can't prevent it by this patch.
>>
> I can't catch what you mean.

I just said your patch's benefit.

>> Sorry for bothering you.
>>
>
> Hmm, boot option or CONFIG ? (CONFIG_OOMKILLER_EXTENSION ?)
>
> I'm now writing fork-bomb detector again and want to remove current
> "gathering child's vm_size" heuristics. I'd like to put that under
> the same config, too.

Totally, I don't like CONFIG option for that.
But vmsize lovers also don't want to change current behavior.
So it's desirable until your fork-form detector become mature and
prove it's good.

One more questions about below.

+       if (constraint != CONSTRAINT_LOWMEM) {
+               list_for_each_entry(child, &p->children, sibling) {
+                       task_lock(child);
+                       if (child->mm != mm && child->mm)
+                               points += child->mm->total_vm/2 + 1;
+                       task_unlock(child);
+               }

Why didn't you consider child's lowmem counter in case of LOWMEM?

>
> Thanks,
> -Kame
>
>
>
>
>
>
>



-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-22  0:40       ` Minchan Kim
  0 siblings, 0 replies; 75+ messages in thread
From: Minchan Kim @ 2010-01-22  0:40 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, linux-kernel

On Fri, Jan 22, 2010 at 8:48 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Fri, 22 Jan 2010 00:18:44 +0900
> Minchan Kim <minchan.kim@gmail.com> wrote:
>
>> Hi, Kame.
>>
>> On Thu, 2010-01-21 at 14:59 +0900, KAMEZAWA Hiroyuki wrote:
>> > A patch for avoiding oom-serial-killer at lowmem shortage.
>> > Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
>> > Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
>> >
>> > ==
>> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>> >
>> > One cause of OOM-Killer is memory shortage in lower zones.
>> > (If memory is enough, lowmem_reserve_ratio works well. but..)
>> >
>> > In lowmem-shortage oom-kill, oom-killer choses a vicitim process
>> > on their vm size. But this kills a process which has lowmem memory
>> > only if it's lucky. At last, there will be an oom-serial-killer.
>> >
>> > Now, we have per-mm lowmem usage counter. We can make use of it
>> > to select a good? victim.
>> >
>> > This patch does
>> >   - add CONSTRAINT_LOWMEM to oom's constraint type.
>> >   - pass constraint to __badness()
>> >   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>> >     use low_rss instead of vmsize.
>>
>> As far as low memory, it would be better to consider lowmem counter.
>> But as you know, {vmsize VS rss} is debatable topic.
>> Maybe someone doesn't like this idea.
>>
> About lowmem, vmsize never work well.
>

Tend to agree with you.
I am just worried about "vmsize lovers".

You removed considering vmsize totally.
In case of LOWMEM, lowcount considering make sense.
But never considering vmsize might be debatable.

So personllay, I thouhg we could add more weight lowcount
in case of LOWMEM. But I chaged my mind.
I think it make OOM heurisic more complated without big benefit.

Simple is best.

>> So don't we need any test result at least?
> My test result was very artificial, so I didn't attach the result.
>
>  - Before this patch, sshd was killed at first.
>  - After this patch, memory consumer of low-rss was killed.

Okay. You already anwsered my question by Balbir's reply.
I had a question it's real problem and how often it happens.

>
>> If we don't have this patch, it happens several innocent process
>> killing. but we can't prevent it by this patch.
>>
> I can't catch what you mean.

I just said your patch's benefit.

>> Sorry for bothering you.
>>
>
> Hmm, boot option or CONFIG ? (CONFIG_OOMKILLER_EXTENSION ?)
>
> I'm now writing fork-bomb detector again and want to remove current
> "gathering child's vm_size" heuristics. I'd like to put that under
> the same config, too.

Totally, I don't like CONFIG option for that.
But vmsize lovers also don't want to change current behavior.
So it's desirable until your fork-form detector become mature and
prove it's good.

One more questions about below.

+       if (constraint != CONSTRAINT_LOWMEM) {
+               list_for_each_entry(child, &p->children, sibling) {
+                       task_lock(child);
+                       if (child->mm != mm && child->mm)
+                               points += child->mm->total_vm/2 + 1;
+                       task_unlock(child);
+               }

Why didn't you consider child's lowmem counter in case of LOWMEM?

>
> Thanks,
> -Kame
>
>
>
>
>
>
>



-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
  2010-01-22  0:40       ` Minchan Kim
@ 2010-01-22  1:06         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-22  1:06 UTC (permalink / raw)
  To: Minchan Kim; +Cc: linux-mm, akpm, rientjes, linux-kernel

On Fri, 22 Jan 2010 09:40:17 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> On Fri, Jan 22, 2010 at 8:48 AM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Fri, 22 Jan 2010 00:18:44 +0900
> > Minchan Kim <minchan.kim@gmail.com> wrote:
> >
> >> Hi, Kame.
> >>
> >> On Thu, 2010-01-21 at 14:59 +0900, KAMEZAWA Hiroyuki wrote:
> >> > A patch for avoiding oom-serial-killer at lowmem shortage.
> >> > Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> >> > Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> >> >
> >> > ==
> >> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >> >
> >> > One cause of OOM-Killer is memory shortage in lower zones.
> >> > (If memory is enough, lowmem_reserve_ratio works well. but..)
> >> >
> >> > In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> >> > on their vm size. But this kills a process which has lowmem memory
> >> > only if it's lucky. At last, there will be an oom-serial-killer.
> >> >
> >> > Now, we have per-mm lowmem usage counter. We can make use of it
> >> > to select a good? victim.
> >> >
> >> > This patch does
> >> >   - add CONSTRAINT_LOWMEM to oom's constraint type.
> >> >   - pass constraint to __badness()
> >> >   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
> >> >     use low_rss instead of vmsize.
> >>
> >> As far as low memory, it would be better to consider lowmem counter.
> >> But as you know, {vmsize VS rss} is debatable topic.
> >> Maybe someone doesn't like this idea.
> >>
> > About lowmem, vmsize never work well.
> >
> 
> Tend to agree with you.
> I am just worried about "vmsize lovers".
> 
> You removed considering vmsize totally.
> In case of LOWMEM, lowcount considering make sense.
> But never considering vmsize might be debatable.
> 
> So personllay, I thouhg we could add more weight lowcount
> in case of LOWMEM. But I chaged my mind.
> I think it make OOM heurisic more complated without big benefit.
> 
thanks. I don't want patch-drop again, either :)

> Simple is best.
> 
> >> So don't we need any test result at least?
> > My test result was very artificial, so I didn't attach the result.
> >
> >  - Before this patch, sshd was killed at first.
> >  - After this patch, memory consumer of low-rss was killed.
> 
> Okay. You already anwsered my question by Balbir's reply.
> I had a question it's real problem and how often it happens.
> 
> >
> >> If we don't have this patch, it happens several innocent process
> >> killing. but we can't prevent it by this patch.
> >>
> > I can't catch what you mean.
> 
> I just said your patch's benefit.
> 
> >> Sorry for bothering you.
> >>
> >
> > Hmm, boot option or CONFIG ? (CONFIG_OOMKILLER_EXTENSION ?)
> >
> > I'm now writing fork-bomb detector again and want to remove current
> > "gathering child's vm_size" heuristics. I'd like to put that under
> > the same config, too.
> 
> Totally, I don't like CONFIG option for that.
> But vmsize lovers also don't want to change current behavior.
> So it's desirable until your fork-form detector become mature and
> prove it's good.
> 
Hmm, Okay, I'll add some. Kosaki told me sysctl is better. I'll check
how it looks.

> One more questions about below.
> 
> +       if (constraint != CONSTRAINT_LOWMEM) {
> +               list_for_each_entry(child, &p->children, sibling) {
> +                       task_lock(child);
> +                       if (child->mm != mm && child->mm)
> +                               points += child->mm->total_vm/2 + 1;
> +                       task_unlock(child);
> +               }
> 
> Why didn't you consider child's lowmem counter in case of LOWMEM?
> 
Assume process A, B, C, D. B and C are children of A.
 
  A (low_rss = 0)
  B (low_rss = 20)
  C (low_rss = 20)
  D (low_rss = 20)

When we caluculate A's socre by above logic, A's score may be greater than
B and C, D. We do targetted oom-kill as sniper, not as genocider. So, ignoreing
children here is better, I think.
I'll add some explanation to changelog.

Thanks,
-Kame






^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-22  1:06         ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-22  1:06 UTC (permalink / raw)
  To: Minchan Kim; +Cc: linux-mm, akpm, rientjes, linux-kernel

On Fri, 22 Jan 2010 09:40:17 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> On Fri, Jan 22, 2010 at 8:48 AM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Fri, 22 Jan 2010 00:18:44 +0900
> > Minchan Kim <minchan.kim@gmail.com> wrote:
> >
> >> Hi, Kame.
> >>
> >> On Thu, 2010-01-21 at 14:59 +0900, KAMEZAWA Hiroyuki wrote:
> >> > A patch for avoiding oom-serial-killer at lowmem shortage.
> >> > Patch is onto mmotm-2010/01/15 (depends on mm-count-lowmem-rss.patch)
> >> > Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> >> >
> >> > ==
> >> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >> >
> >> > One cause of OOM-Killer is memory shortage in lower zones.
> >> > (If memory is enough, lowmem_reserve_ratio works well. but..)
> >> >
> >> > In lowmem-shortage oom-kill, oom-killer choses a vicitim process
> >> > on their vm size. But this kills a process which has lowmem memory
> >> > only if it's lucky. At last, there will be an oom-serial-killer.
> >> >
> >> > Now, we have per-mm lowmem usage counter. We can make use of it
> >> > to select a good? victim.
> >> >
> >> > This patch does
> >> > A  - add CONSTRAINT_LOWMEM to oom's constraint type.
> >> > A  - pass constraint to __badness()
> >> > A  - change calculation based on constraint. If CONSTRAINT_LOWMEM,
> >> > A  A  use low_rss instead of vmsize.
> >>
> >> As far as low memory, it would be better to consider lowmem counter.
> >> But as you know, {vmsize VS rss} is debatable topic.
> >> Maybe someone doesn't like this idea.
> >>
> > About lowmem, vmsize never work well.
> >
> 
> Tend to agree with you.
> I am just worried about "vmsize lovers".
> 
> You removed considering vmsize totally.
> In case of LOWMEM, lowcount considering make sense.
> But never considering vmsize might be debatable.
> 
> So personllay, I thouhg we could add more weight lowcount
> in case of LOWMEM. But I chaged my mind.
> I think it make OOM heurisic more complated without big benefit.
> 
thanks. I don't want patch-drop again, either :)

> Simple is best.
> 
> >> So don't we need any test result at least?
> > My test result was very artificial, so I didn't attach the result.
> >
> > A - Before this patch, sshd was killed at first.
> > A - After this patch, memory consumer of low-rss was killed.
> 
> Okay. You already anwsered my question by Balbir's reply.
> I had a question it's real problem and how often it happens.
> 
> >
> >> If we don't have this patch, it happens several innocent process
> >> killing. but we can't prevent it by this patch.
> >>
> > I can't catch what you mean.
> 
> I just said your patch's benefit.
> 
> >> Sorry for bothering you.
> >>
> >
> > Hmm, boot option or CONFIG ? (CONFIG_OOMKILLER_EXTENSION ?)
> >
> > I'm now writing fork-bomb detector again and want to remove current
> > "gathering child's vm_size" heuristics. I'd like to put that under
> > the same config, too.
> 
> Totally, I don't like CONFIG option for that.
> But vmsize lovers also don't want to change current behavior.
> So it's desirable until your fork-form detector become mature and
> prove it's good.
> 
Hmm, Okay, I'll add some. Kosaki told me sysctl is better. I'll check
how it looks.

> One more questions about below.
> 
> +       if (constraint != CONSTRAINT_LOWMEM) {
> +               list_for_each_entry(child, &p->children, sibling) {
> +                       task_lock(child);
> +                       if (child->mm != mm && child->mm)
> +                               points += child->mm->total_vm/2 + 1;
> +                       task_unlock(child);
> +               }
> 
> Why didn't you consider child's lowmem counter in case of LOWMEM?
> 
Assume process A, B, C, D. B and C are children of A.
 
  A (low_rss = 0)
  B (low_rss = 20)
  C (low_rss = 20)
  D (low_rss = 20)

When we caluculate A's socre by above logic, A's score may be greater than
B and C, D. We do targetted oom-kill as sniper, not as genocider. So, ignoreing
children here is better, I think.
I'll add some explanation to changelog.

Thanks,
-Kame





--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v2] oom-kill: add lowmem usage aware oom kill handling
  2010-01-21  5:59 ` KAMEZAWA Hiroyuki
@ 2010-01-22  6:23   ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-22  6:23 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, akpm, rientjes, minchan.kim, linux-kernel, balbir

updated. thank you for review.

The patch is onto mmotm-Jan15 (depends on mm-count-lowmem-rss.patch)
Tested on x86-64/SMP + debug module(to allocated lowmem), works well.

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Default oom-killer uses badness calculation based on process's vm_size
and some amounts of heuristics. Some users see proc->oom_score and
proc->oom_adj to control oom-killed tendency under their server.

Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
enhancements are demanded. But such enhancements for oom-killer makes
incomaptibility to oom-controls in enterprise world. So, this patch
adds sysctl for extensions for oom-killer. Main purpose is for
making a chance for wider test for new scheme.

One cause of OOM-Killer is memory shortage in lower zones.
(If memory is enough, lowmem_reserve_ratio works well. but..)
I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
my cusotmer support jobs. If we just see process's vm_size at oom,
we can never kill a process which has lowmem.
At last, there will be an oom-serial-killer.

Now, we have per-mm lowmem usage counter. We can make use of it
to select a good victim.

This patch does
  - add sysctl for new bahavior.
  - add CONSTRAINT_LOWMEM to oom's constraint type.
  - pass constraint to __badness()
  - change calculation based on constraint. If CONSTRAINT_LOWMEM,
    use low_rss instead of vmsize.

Changelog 2010/01/22:
 - added sysctl
 - fixed !CONFIG_MMU
 - fixed fs/proc/base.c breakacge.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/sysctl/vm.txt |   16 ++++++++
 fs/proc/base.c              |    5 +-
 include/linux/oom.h         |    1 
 kernel/sysctl.c             |   10 ++++-
 mm/oom_kill.c               |   87 ++++++++++++++++++++++++++++++++------------
 5 files changed, 94 insertions(+), 25 deletions(-)

Index: mmotm-2.6.33-Jan15/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15/include/linux/oom.h
@@ -20,6 +20,7 @@ struct notifier_block;
  */
 enum oom_constraint {
 	CONSTRAINT_NONE,
+	CONSTRAINT_LOWMEM,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
 };
Index: mmotm-2.6.33-Jan15/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Jan15/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
+int sysctl_oom_kill_extension_mask;
+enum {
+	EXT_LOWMEM_OOM,
+};
+
+#ifdef CONFIG_MMU
+static int oom_extension(int idx)
+{
+	return sysctl_oom_kill_extension_mask & (1 << idx);
+}
+#else
+static int oom_extension(int idx)
+{
+	return 0;
+}
+#endif
+
 /*
  * Is all threads of the target process nodes overlap ours?
  */
@@ -55,6 +72,7 @@ static int has_intersects_mems_allowed(s
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
+ * @constraint: context of badness calculation.
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -70,7 +88,8 @@ static int has_intersects_mems_allowed(s
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+			int constraint)
 {
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
@@ -89,11 +108,16 @@ unsigned long badness(struct task_struct
 		task_unlock(p);
 		return 0;
 	}
-
-	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = mm->total_vm;
+	switch  (constraint) {
+	case CONSTRAINT_LOWMEM:
+		/* use lowmem usage as the basis for the badness */
+		points = get_low_rss(mm);
+		break;
+	default:
+		/* use virtual memory size as the basis for the badness */
+		points = mm->total_vm;
+		break;
+	}
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -113,12 +137,17 @@ unsigned long badness(struct task_struct
 	 * machine with an endless amount of children. In case a single
 	 * child is eating the vast majority of memory, adding only half
 	 * to the parents will make the child our kill candidate of choice.
+	 *
+	 * At lowmem shortage, this part is skipped because children's lowmem
+	 * usage is not related to its parent.
 	 */
-	list_for_each_entry(child, &p->children, sibling) {
-		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
-		task_unlock(child);
+	if (constraint != CONSTRAINT_LOWMEM) {
+		list_for_each_entry(child, &p->children, sibling) {
+			task_lock(child);
+			if (child->mm != mm && child->mm)
+				points += child->mm->total_vm/2 + 1;
+			task_unlock(child);
+		}
 	}
 
 	/*
@@ -212,6 +241,9 @@ static enum oom_constraint constrained_a
 	if (gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
+	if (oom_extension(EXT_LOWMEM_OOM) && (high_zoneidx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
+
 	/*
 	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
 	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
@@ -233,6 +265,10 @@ static enum oom_constraint constrained_a
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int zone_idx = gfp_zone(gfp_mask);
+
+	if (oom_extension(EXT_LOWMEM_OOM) && (zone_idx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -244,7 +280,7 @@ static enum oom_constraint constrained_a
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+				struct mem_cgroup *mem, int constraint)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -300,7 +336,7 @@ static struct task_struct *select_bad_pr
 		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = badness(p, uptime.tv_sec, constraint);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -455,7 +491,7 @@ static int oom_kill_process(struct task_
 	}
 
 	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, task_pid_nr(p), p->comm, points);
+				message, task_pid_nr(p), p->comm, points);
 
 	/* Try to kill a child first */
 	list_for_each_entry(c, &p->children, sibling) {
@@ -475,7 +511,7 @@ void mem_cgroup_out_of_memory(struct mem
 
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -557,7 +593,7 @@ void clear_zonelist_oom(struct zonelist 
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int constraint)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -571,7 +607,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -583,9 +619,16 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+	switch (constraint) {
+	case CONSTRAINT_LOWMEM:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory (in lowmem)"))
+			goto retry;
+	default:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
 			     "Out of memory"))
-		goto retry;
+			goto retry;
+	}
 }
 
 /*
@@ -612,7 +655,7 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE); /* unknown gfp_mask and order */
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -663,7 +706,7 @@ void out_of_memory(struct zonelist *zone
 		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
-
+	case CONSTRAINT_LOWMEM:
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
 			dump_header(NULL, gfp_mask, order, NULL);
@@ -671,7 +714,7 @@ void out_of_memory(struct zonelist *zone
 		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
+		__out_of_memory(gfp_mask, order, constraint);
 		break;
 	}
 
Index: mmotm-2.6.33-Jan15/kernel/sysctl.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/kernel/sysctl.c
+++ mmotm-2.6.33-Jan15/kernel/sysctl.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
-#include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
@@ -72,6 +71,7 @@ extern int sysctl_overcommit_ratio;
 extern int sysctl_panic_on_oom;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_oom_dump_tasks;
+extern int sysctl_oom_kill_extension_mask;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
@@ -202,6 +202,7 @@ extern struct ctl_table epoll_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+
 extern int prove_locking;
 extern int lock_stat;
 
@@ -1282,6 +1283,13 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "oom_kill_extension_mask",
+		.data		= &sysctl_oom_kill_extension_mask,
+		.maxlen		= sizeof(sysctl_oom_kill_extension_mask),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 
 /*
  * NOTE: do not add new entries to this table unless you have read
Index: mmotm-2.6.33-Jan15/Documentation/sysctl/vm.txt
===================================================================
--- mmotm-2.6.33-Jan15.orig/Documentation/sysctl/vm.txt
+++ mmotm-2.6.33-Jan15/Documentation/sysctl/vm.txt
@@ -45,6 +45,7 @@ Currently, these files are in /proc/sys/
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
+- oom_kill_extension_mask
 - overcommit_memory
 - overcommit_ratio
 - page-cluster
@@ -511,6 +512,21 @@ The default value is 0.
 
 ==============================================================
 
+oom_kill_extension_mask:
+
+This is a mask for oom-killer extension features.
+Setting these flags may cause incompatibility for proc->oom_score and
+proc->oom_adj controls. So, please set carefully.
+
+bit 0....lowmem aware oom-killing.
+    If set, at lowmem shortage oom killing (for example, exhausting NORMAL_ZONE
+    under x86-32 HIGHMEM host), oom-killer will see lowmem rss usage of
+    processes instead of vmsize. Works only when CONFIG_MMU=y.
+
+The default value is 0
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
Index: mmotm-2.6.33-Jan15/fs/proc/base.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/fs/proc/base.c
+++ mmotm-2.6.33-Jan15/fs/proc/base.c
@@ -458,7 +458,8 @@ static const struct file_operations proc
 #endif
 
 /* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
+unsigned long badness(struct task_struct *p,
+	unsigned long uptime, int constraint);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points;
@@ -466,7 +467,7 @@ static int proc_oom_score(struct task_st
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	points = badness(task->group_leader, uptime.tv_sec, CONSTRAINT_NONE);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }


^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v2] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-22  6:23   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-22  6:23 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, akpm, rientjes, minchan.kim, linux-kernel, balbir

updated. thank you for review.

The patch is onto mmotm-Jan15 (depends on mm-count-lowmem-rss.patch)
Tested on x86-64/SMP + debug module(to allocated lowmem), works well.

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Default oom-killer uses badness calculation based on process's vm_size
and some amounts of heuristics. Some users see proc->oom_score and
proc->oom_adj to control oom-killed tendency under their server.

Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
enhancements are demanded. But such enhancements for oom-killer makes
incomaptibility to oom-controls in enterprise world. So, this patch
adds sysctl for extensions for oom-killer. Main purpose is for
making a chance for wider test for new scheme.

One cause of OOM-Killer is memory shortage in lower zones.
(If memory is enough, lowmem_reserve_ratio works well. but..)
I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
my cusotmer support jobs. If we just see process's vm_size at oom,
we can never kill a process which has lowmem.
At last, there will be an oom-serial-killer.

Now, we have per-mm lowmem usage counter. We can make use of it
to select a good victim.

This patch does
  - add sysctl for new bahavior.
  - add CONSTRAINT_LOWMEM to oom's constraint type.
  - pass constraint to __badness()
  - change calculation based on constraint. If CONSTRAINT_LOWMEM,
    use low_rss instead of vmsize.

Changelog 2010/01/22:
 - added sysctl
 - fixed !CONFIG_MMU
 - fixed fs/proc/base.c breakacge.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/sysctl/vm.txt |   16 ++++++++
 fs/proc/base.c              |    5 +-
 include/linux/oom.h         |    1 
 kernel/sysctl.c             |   10 ++++-
 mm/oom_kill.c               |   87 ++++++++++++++++++++++++++++++++------------
 5 files changed, 94 insertions(+), 25 deletions(-)

Index: mmotm-2.6.33-Jan15/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15/include/linux/oom.h
@@ -20,6 +20,7 @@ struct notifier_block;
  */
 enum oom_constraint {
 	CONSTRAINT_NONE,
+	CONSTRAINT_LOWMEM,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
 };
Index: mmotm-2.6.33-Jan15/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Jan15/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
+int sysctl_oom_kill_extension_mask;
+enum {
+	EXT_LOWMEM_OOM,
+};
+
+#ifdef CONFIG_MMU
+static int oom_extension(int idx)
+{
+	return sysctl_oom_kill_extension_mask & (1 << idx);
+}
+#else
+static int oom_extension(int idx)
+{
+	return 0;
+}
+#endif
+
 /*
  * Is all threads of the target process nodes overlap ours?
  */
@@ -55,6 +72,7 @@ static int has_intersects_mems_allowed(s
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
+ * @constraint: context of badness calculation.
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -70,7 +88,8 @@ static int has_intersects_mems_allowed(s
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+			int constraint)
 {
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
@@ -89,11 +108,16 @@ unsigned long badness(struct task_struct
 		task_unlock(p);
 		return 0;
 	}
-
-	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = mm->total_vm;
+	switch  (constraint) {
+	case CONSTRAINT_LOWMEM:
+		/* use lowmem usage as the basis for the badness */
+		points = get_low_rss(mm);
+		break;
+	default:
+		/* use virtual memory size as the basis for the badness */
+		points = mm->total_vm;
+		break;
+	}
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -113,12 +137,17 @@ unsigned long badness(struct task_struct
 	 * machine with an endless amount of children. In case a single
 	 * child is eating the vast majority of memory, adding only half
 	 * to the parents will make the child our kill candidate of choice.
+	 *
+	 * At lowmem shortage, this part is skipped because children's lowmem
+	 * usage is not related to its parent.
 	 */
-	list_for_each_entry(child, &p->children, sibling) {
-		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
-		task_unlock(child);
+	if (constraint != CONSTRAINT_LOWMEM) {
+		list_for_each_entry(child, &p->children, sibling) {
+			task_lock(child);
+			if (child->mm != mm && child->mm)
+				points += child->mm->total_vm/2 + 1;
+			task_unlock(child);
+		}
 	}
 
 	/*
@@ -212,6 +241,9 @@ static enum oom_constraint constrained_a
 	if (gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
+	if (oom_extension(EXT_LOWMEM_OOM) && (high_zoneidx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
+
 	/*
 	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
 	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
@@ -233,6 +265,10 @@ static enum oom_constraint constrained_a
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int zone_idx = gfp_zone(gfp_mask);
+
+	if (oom_extension(EXT_LOWMEM_OOM) && (zone_idx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -244,7 +280,7 @@ static enum oom_constraint constrained_a
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+				struct mem_cgroup *mem, int constraint)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -300,7 +336,7 @@ static struct task_struct *select_bad_pr
 		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = badness(p, uptime.tv_sec, constraint);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -455,7 +491,7 @@ static int oom_kill_process(struct task_
 	}
 
 	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, task_pid_nr(p), p->comm, points);
+				message, task_pid_nr(p), p->comm, points);
 
 	/* Try to kill a child first */
 	list_for_each_entry(c, &p->children, sibling) {
@@ -475,7 +511,7 @@ void mem_cgroup_out_of_memory(struct mem
 
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -557,7 +593,7 @@ void clear_zonelist_oom(struct zonelist 
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int constraint)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -571,7 +607,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -583,9 +619,16 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+	switch (constraint) {
+	case CONSTRAINT_LOWMEM:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory (in lowmem)"))
+			goto retry;
+	default:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
 			     "Out of memory"))
-		goto retry;
+			goto retry;
+	}
 }
 
 /*
@@ -612,7 +655,7 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE); /* unknown gfp_mask and order */
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -663,7 +706,7 @@ void out_of_memory(struct zonelist *zone
 		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
-
+	case CONSTRAINT_LOWMEM:
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
 			dump_header(NULL, gfp_mask, order, NULL);
@@ -671,7 +714,7 @@ void out_of_memory(struct zonelist *zone
 		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
+		__out_of_memory(gfp_mask, order, constraint);
 		break;
 	}
 
Index: mmotm-2.6.33-Jan15/kernel/sysctl.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/kernel/sysctl.c
+++ mmotm-2.6.33-Jan15/kernel/sysctl.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
-#include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
@@ -72,6 +71,7 @@ extern int sysctl_overcommit_ratio;
 extern int sysctl_panic_on_oom;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_oom_dump_tasks;
+extern int sysctl_oom_kill_extension_mask;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
@@ -202,6 +202,7 @@ extern struct ctl_table epoll_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+
 extern int prove_locking;
 extern int lock_stat;
 
@@ -1282,6 +1283,13 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "oom_kill_extension_mask",
+		.data		= &sysctl_oom_kill_extension_mask,
+		.maxlen		= sizeof(sysctl_oom_kill_extension_mask),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 
 /*
  * NOTE: do not add new entries to this table unless you have read
Index: mmotm-2.6.33-Jan15/Documentation/sysctl/vm.txt
===================================================================
--- mmotm-2.6.33-Jan15.orig/Documentation/sysctl/vm.txt
+++ mmotm-2.6.33-Jan15/Documentation/sysctl/vm.txt
@@ -45,6 +45,7 @@ Currently, these files are in /proc/sys/
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
+- oom_kill_extension_mask
 - overcommit_memory
 - overcommit_ratio
 - page-cluster
@@ -511,6 +512,21 @@ The default value is 0.
 
 ==============================================================
 
+oom_kill_extension_mask:
+
+This is a mask for oom-killer extension features.
+Setting these flags may cause incompatibility for proc->oom_score and
+proc->oom_adj controls. So, please set carefully.
+
+bit 0....lowmem aware oom-killing.
+    If set, at lowmem shortage oom killing (for example, exhausting NORMAL_ZONE
+    under x86-32 HIGHMEM host), oom-killer will see lowmem rss usage of
+    processes instead of vmsize. Works only when CONFIG_MMU=y.
+
+The default value is 0
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
Index: mmotm-2.6.33-Jan15/fs/proc/base.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/fs/proc/base.c
+++ mmotm-2.6.33-Jan15/fs/proc/base.c
@@ -458,7 +458,8 @@ static const struct file_operations proc
 #endif
 
 /* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
+unsigned long badness(struct task_struct *p,
+	unsigned long uptime, int constraint);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points;
@@ -466,7 +467,7 @@ static int proc_oom_score(struct task_st
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	points = badness(task->group_leader, uptime.tv_sec, CONSTRAINT_NONE);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v2] oom-kill: add lowmem usage aware oom kill handling
  2010-01-22  6:23   ` KAMEZAWA Hiroyuki
@ 2010-01-22 14:00     ` Minchan Kim
  -1 siblings, 0 replies; 75+ messages in thread
From: Minchan Kim @ 2010-01-22 14:00 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, linux-kernel, balbir

On Fri, 2010-01-22 at 15:23 +0900, KAMEZAWA Hiroyuki wrote:
> updated. thank you for review.
> 
> The patch is onto mmotm-Jan15 (depends on mm-count-lowmem-rss.patch)
> Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Default oom-killer uses badness calculation based on process's vm_size
> and some amounts of heuristics. Some users see proc->oom_score and
> proc->oom_adj to control oom-killed tendency under their server.
> 
> Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
> enhancements are demanded. But such enhancements for oom-killer makes
> incomaptibility to oom-controls in enterprise world. So, this patch
> adds sysctl for extensions for oom-killer. Main purpose is for
> making a chance for wider test for new scheme.
> 
> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)
> I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
> my cusotmer support jobs. If we just see process's vm_size at oom,
> we can never kill a process which has lowmem.
> At last, there will be an oom-serial-killer.
> 
> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good victim.
> 
> This patch does
>   - add sysctl for new bahavior.
>   - add CONSTRAINT_LOWMEM to oom's constraint type.
>   - pass constraint to __badness()
>   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>     use low_rss instead of vmsize.
> 
> Changelog 2010/01/22:
>  - added sysctl
>  - fixed !CONFIG_MMU
>  - fixed fs/proc/base.c breakacge.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  Documentation/sysctl/vm.txt |   16 ++++++++
>  fs/proc/base.c              |    5 +-
>  include/linux/oom.h         |    1 
>  kernel/sysctl.c             |   10 ++++-
>  mm/oom_kill.c               |   87 ++++++++++++++++++++++++++++++++------------
>  5 files changed, 94 insertions(+), 25 deletions(-)
> 
> Index: mmotm-2.6.33-Jan15/include/linux/oom.h
> ===================================================================
> --- mmotm-2.6.33-Jan15.orig/include/linux/oom.h
> +++ mmotm-2.6.33-Jan15/include/linux/oom.h
> @@ -20,6 +20,7 @@ struct notifier_block;
>   */
>  enum oom_constraint {
>  	CONSTRAINT_NONE,
> +	CONSTRAINT_LOWMEM,
>  	CONSTRAINT_CPUSET,
>  	CONSTRAINT_MEMORY_POLICY,
>  };

<snip>
> @@ -475,7 +511,7 @@ void mem_cgroup_out_of_memory(struct mem
>  
>  	read_lock(&tasklist_lock);
>  retry:
> -	p = select_bad_process(&points, mem);
> +	p = select_bad_process(&points, mem, CONSTRAINT_NONE);

Why do you fix this with only CONSTRAINT_NONE?
I think we can know CONSTRAINT_LOWMEM with gfp_mask in here. 

Any problem?

Otherwise, Looks good to me. :)

Reviewed-by: Minchan Kim <minchan.kim@gmail.com>

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v2] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-22 14:00     ` Minchan Kim
  0 siblings, 0 replies; 75+ messages in thread
From: Minchan Kim @ 2010-01-22 14:00 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, linux-kernel, balbir

On Fri, 2010-01-22 at 15:23 +0900, KAMEZAWA Hiroyuki wrote:
> updated. thank you for review.
> 
> The patch is onto mmotm-Jan15 (depends on mm-count-lowmem-rss.patch)
> Tested on x86-64/SMP + debug module(to allocated lowmem), works well.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Default oom-killer uses badness calculation based on process's vm_size
> and some amounts of heuristics. Some users see proc->oom_score and
> proc->oom_adj to control oom-killed tendency under their server.
> 
> Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
> enhancements are demanded. But such enhancements for oom-killer makes
> incomaptibility to oom-controls in enterprise world. So, this patch
> adds sysctl for extensions for oom-killer. Main purpose is for
> making a chance for wider test for new scheme.
> 
> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)
> I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
> my cusotmer support jobs. If we just see process's vm_size at oom,
> we can never kill a process which has lowmem.
> At last, there will be an oom-serial-killer.
> 
> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good victim.
> 
> This patch does
>   - add sysctl for new bahavior.
>   - add CONSTRAINT_LOWMEM to oom's constraint type.
>   - pass constraint to __badness()
>   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>     use low_rss instead of vmsize.
> 
> Changelog 2010/01/22:
>  - added sysctl
>  - fixed !CONFIG_MMU
>  - fixed fs/proc/base.c breakacge.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  Documentation/sysctl/vm.txt |   16 ++++++++
>  fs/proc/base.c              |    5 +-
>  include/linux/oom.h         |    1 
>  kernel/sysctl.c             |   10 ++++-
>  mm/oom_kill.c               |   87 ++++++++++++++++++++++++++++++++------------
>  5 files changed, 94 insertions(+), 25 deletions(-)
> 
> Index: mmotm-2.6.33-Jan15/include/linux/oom.h
> ===================================================================
> --- mmotm-2.6.33-Jan15.orig/include/linux/oom.h
> +++ mmotm-2.6.33-Jan15/include/linux/oom.h
> @@ -20,6 +20,7 @@ struct notifier_block;
>   */
>  enum oom_constraint {
>  	CONSTRAINT_NONE,
> +	CONSTRAINT_LOWMEM,
>  	CONSTRAINT_CPUSET,
>  	CONSTRAINT_MEMORY_POLICY,
>  };

<snip>
> @@ -475,7 +511,7 @@ void mem_cgroup_out_of_memory(struct mem
>  
>  	read_lock(&tasklist_lock);
>  retry:
> -	p = select_bad_process(&points, mem);
> +	p = select_bad_process(&points, mem, CONSTRAINT_NONE);

Why do you fix this with only CONSTRAINT_NONE?
I think we can know CONSTRAINT_LOWMEM with gfp_mask in here. 

Any problem?

Otherwise, Looks good to me. :)

Reviewed-by: Minchan Kim <minchan.kim@gmail.com>

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v2] oom-kill: add lowmem usage aware oom kill handling
  2010-01-22 14:00     ` Minchan Kim
@ 2010-01-22 15:16       ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-22 15:16 UTC (permalink / raw)
  To: Minchan Kim
  Cc: KAMEZAWA Hiroyuki, linux-mm, akpm, rientjes, linux-kernel, balbir

Minchan Kim wrote:
> On Fri, 2010-01-22 at 15:23 +0900, KAMEZAWA Hiroyuki wrote:
>> updated. thank you for review.

>>  	CONSTRAINT_MEMORY_POLICY,
>>  };
>
> <snip>
>> @@ -475,7 +511,7 @@ void mem_cgroup_out_of_memory(struct mem
>>
>>  	read_lock(&tasklist_lock);
>>  retry:
>> -	p = select_bad_process(&points, mem);
>> +	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
>
> Why do you fix this with only CONSTRAINT_NONE?
> I think we can know CONSTRAINT_LOWMEM with gfp_mask in here.
>
memcg is just for accounting anon/file pages. Then, it's never
cause lowmem oom problem (any memory is ok for memcg).



> Any problem?
>
> Otherwise, Looks good to me. :)
>
> Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
>
Thank you.

-Kame

> --
> Kind regards,
> Minchan Kim
>
>



^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v2] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-22 15:16       ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-22 15:16 UTC (permalink / raw)
  To: Minchan Kim
  Cc: KAMEZAWA Hiroyuki, linux-mm, akpm, rientjes, linux-kernel, balbir

Minchan Kim wrote:
> On Fri, 2010-01-22 at 15:23 +0900, KAMEZAWA Hiroyuki wrote:
>> updated. thank you for review.

>>  	CONSTRAINT_MEMORY_POLICY,
>>  };
>
> <snip>
>> @@ -475,7 +511,7 @@ void mem_cgroup_out_of_memory(struct mem
>>
>>  	read_lock(&tasklist_lock);
>>  retry:
>> -	p = select_bad_process(&points, mem);
>> +	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
>
> Why do you fix this with only CONSTRAINT_NONE?
> I think we can know CONSTRAINT_LOWMEM with gfp_mask in here.
>
memcg is just for accounting anon/file pages. Then, it's never
cause lowmem oom problem (any memory is ok for memcg).



> Any problem?
>
> Otherwise, Looks good to me. :)
>
> Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
>
Thank you.

-Kame

> --
> Kind regards,
> Minchan Kim
>
>


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v2] oom-kill: add lowmem usage aware oom kill handling
  2010-01-22 15:16       ` KAMEZAWA Hiroyuki
@ 2010-01-22 15:41         ` Minchan Kim
  -1 siblings, 0 replies; 75+ messages in thread
From: Minchan Kim @ 2010-01-22 15:41 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, linux-kernel, balbir

2010/1/23 KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>:
>> <snip>
>>> @@ -475,7 +511,7 @@ void mem_cgroup_out_of_memory(struct mem
>>>
>>>      read_lock(&tasklist_lock);
>>>  retry:
>>> -    p = select_bad_process(&points, mem);
>>> +    p = select_bad_process(&points, mem, CONSTRAINT_NONE);
>>
>> Why do you fix this with only CONSTRAINT_NONE?
>> I think we can know CONSTRAINT_LOWMEM with gfp_mask in here.
>>
> memcg is just for accounting anon/file pages. Then, it's never
> cause lowmem oom problem (any memory is ok for memcg).

Okay. Thanks for the explanation.
-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v2] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-22 15:41         ` Minchan Kim
  0 siblings, 0 replies; 75+ messages in thread
From: Minchan Kim @ 2010-01-22 15:41 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, akpm, rientjes, linux-kernel, balbir

2010/1/23 KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>:
>> <snip>
>>> @@ -475,7 +511,7 @@ void mem_cgroup_out_of_memory(struct mem
>>>
>>>      read_lock(&tasklist_lock);
>>>  retry:
>>> -    p = select_bad_process(&points, mem);
>>> +    p = select_bad_process(&points, mem, CONSTRAINT_NONE);
>>
>> Why do you fix this with only CONSTRAINT_NONE?
>> I think we can know CONSTRAINT_LOWMEM with gfp_mask in here.
>>
> memcg is just for accounting anon/file pages. Then, it's never
> cause lowmem oom problem (any memory is ok for memcg).

Okay. Thanks for the explanation.
-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-22  6:23   ` KAMEZAWA Hiroyuki
@ 2010-01-25  6:15     ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-25  6:15 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, akpm, rientjes, minchan.kim, linux-kernel, balbir

Did several tests on x86-32 and I felt that sysctl value should be
printed on oom log... this is v3.

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Default oom-killer uses badness calculation based on process's vm_size
and some amounts of heuristics. Some users see proc->oom_score and
proc->oom_adj to control oom-killed tendency under their server.

Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
enhancements are demanded. But such enhancements for oom-killer makes
incomaptibility to oom-controls in enterprise world. So, this patch
adds sysctl for extensions for oom-killer. Main purpose is for
making a chance for wider test for new scheme.

One cause of OOM-Killer is memory shortage in lower zones.
(If memory is enough, lowmem_reserve_ratio works well. but..)
I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
my cusotmer support jobs. If we just see process's vm_size at oom,
we can never kill a process which has lowmem.
At last, there will be an oom-serial-killer.

Now, we have per-mm lowmem usage counter. We can make use of it
to select a good victim.

This patch does
  - add sysctl for new bahavior.
  - add CONSTRAINT_LOWMEM to oom's constraint type.
  - pass constraint to __badness()
  - change calculation based on constraint. If CONSTRAINT_LOWMEM,
    use low_rss instead of vmsize.

Changelog 2010/01/25
 - showing extension_mask value in OOM kill main log header.
Changelog 2010/01/22:
 - added sysctl
 - fixed !CONFIG_MMU
 - fixed fs/proc/base.c breakacge.

Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/sysctl/vm.txt |   16 +++++++
 fs/proc/base.c              |    5 +-
 include/linux/oom.h         |    1 
 kernel/sysctl.c             |   10 ++++
 mm/oom_kill.c               |   92 ++++++++++++++++++++++++++++++++------------
 5 files changed, 97 insertions(+), 27 deletions(-)

Index: mmotm-2.6.33-Jan15/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15/include/linux/oom.h
@@ -20,6 +20,7 @@ struct notifier_block;
  */
 enum oom_constraint {
 	CONSTRAINT_NONE,
+	CONSTRAINT_LOWMEM,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
 };
Index: mmotm-2.6.33-Jan15/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Jan15/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
+int sysctl_oom_kill_extension_mask;
+enum {
+	EXT_LOWMEM_OOM,
+};
+
+#ifdef CONFIG_MMU
+static int oom_extension(int idx)
+{
+	return sysctl_oom_kill_extension_mask & (1 << idx);
+}
+#else
+static int oom_extension(int idx)
+{
+	return 0;
+}
+#endif
+
 /*
  * Is all threads of the target process nodes overlap ours?
  */
@@ -55,6 +72,7 @@ static int has_intersects_mems_allowed(s
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
+ * @constraint: context of badness calculation.
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -70,7 +88,8 @@ static int has_intersects_mems_allowed(s
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+			int constraint)
 {
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
@@ -89,11 +108,16 @@ unsigned long badness(struct task_struct
 		task_unlock(p);
 		return 0;
 	}
-
-	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = mm->total_vm;
+	switch  (constraint) {
+	case CONSTRAINT_LOWMEM:
+		/* use lowmem usage as the basis for the badness */
+		points = get_low_rss(mm);
+		break;
+	default:
+		/* use virtual memory size as the basis for the badness */
+		points = mm->total_vm;
+		break;
+	}
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -113,12 +137,17 @@ unsigned long badness(struct task_struct
 	 * machine with an endless amount of children. In case a single
 	 * child is eating the vast majority of memory, adding only half
 	 * to the parents will make the child our kill candidate of choice.
+	 *
+	 * At lowmem shortage, this part is skipped because children's lowmem
+	 * usage is not related to its parent.
 	 */
-	list_for_each_entry(child, &p->children, sibling) {
-		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
-		task_unlock(child);
+	if (constraint != CONSTRAINT_LOWMEM) {
+		list_for_each_entry(child, &p->children, sibling) {
+			task_lock(child);
+			if (child->mm != mm && child->mm)
+				points += child->mm->total_vm/2 + 1;
+			task_unlock(child);
+		}
 	}
 
 	/*
@@ -212,6 +241,9 @@ static enum oom_constraint constrained_a
 	if (gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
+	if (oom_extension(EXT_LOWMEM_OOM) && (high_zoneidx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
+
 	/*
 	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
 	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
@@ -233,6 +265,10 @@ static enum oom_constraint constrained_a
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int zone_idx = gfp_zone(gfp_mask);
+
+	if (oom_extension(EXT_LOWMEM_OOM) && (zone_idx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -244,7 +280,7 @@ static enum oom_constraint constrained_a
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+				struct mem_cgroup *mem, int constraint)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -300,7 +336,7 @@ static struct task_struct *select_bad_pr
 		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = badness(p, uptime.tv_sec, constraint);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -360,8 +396,9 @@ static void dump_header(struct task_stru
 							struct mem_cgroup *mem)
 {
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-		"oom_adj=%d\n",
-		current->comm, gfp_mask, order, current->signal->oom_adj);
+		"oom_adj=%d extesion=%x\n",
+		current->comm, gfp_mask, order,
+		current->signal->oom_adj, sysctl_oom_kill_extension_mask);
 	task_lock(current);
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
@@ -455,7 +492,7 @@ static int oom_kill_process(struct task_
 	}
 
 	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, task_pid_nr(p), p->comm, points);
+				message, task_pid_nr(p), p->comm, points);
 
 	/* Try to kill a child first */
 	list_for_each_entry(c, &p->children, sibling) {
@@ -475,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem
 
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -557,7 +594,7 @@ void clear_zonelist_oom(struct zonelist 
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int constraint)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -571,7 +608,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -583,9 +620,16 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+	switch (constraint) {
+	case CONSTRAINT_LOWMEM:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory (in lowmem)"))
+			goto retry;
+	default:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
 			     "Out of memory"))
-		goto retry;
+			goto retry;
+	}
 }
 
 /*
@@ -612,7 +656,7 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE); /* unknown gfp_mask and order */
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -663,7 +707,7 @@ void out_of_memory(struct zonelist *zone
 		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
-
+	case CONSTRAINT_LOWMEM:
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
 			dump_header(NULL, gfp_mask, order, NULL);
@@ -671,7 +715,7 @@ void out_of_memory(struct zonelist *zone
 		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
+		__out_of_memory(gfp_mask, order, constraint);
 		break;
 	}
 
Index: mmotm-2.6.33-Jan15/kernel/sysctl.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/kernel/sysctl.c
+++ mmotm-2.6.33-Jan15/kernel/sysctl.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
-#include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
@@ -72,6 +71,7 @@ extern int sysctl_overcommit_ratio;
 extern int sysctl_panic_on_oom;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_oom_dump_tasks;
+extern int sysctl_oom_kill_extension_mask;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
@@ -202,6 +202,7 @@ extern struct ctl_table epoll_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+
 extern int prove_locking;
 extern int lock_stat;
 
@@ -1282,6 +1283,13 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "oom_kill_extension_mask",
+		.data		= &sysctl_oom_kill_extension_mask,
+		.maxlen		= sizeof(sysctl_oom_kill_extension_mask),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 
 /*
  * NOTE: do not add new entries to this table unless you have read
Index: mmotm-2.6.33-Jan15/Documentation/sysctl/vm.txt
===================================================================
--- mmotm-2.6.33-Jan15.orig/Documentation/sysctl/vm.txt
+++ mmotm-2.6.33-Jan15/Documentation/sysctl/vm.txt
@@ -45,6 +45,7 @@ Currently, these files are in /proc/sys/
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
+- oom_kill_extension_mask
 - overcommit_memory
 - overcommit_ratio
 - page-cluster
@@ -511,6 +512,21 @@ The default value is 0.
 
 ==============================================================
 
+oom_kill_extension_mask:
+
+This is a mask for oom-killer extension features.
+Setting these flags may cause incompatibility for proc->oom_score and
+proc->oom_adj controls. So, please set carefully.
+
+bit 0....lowmem aware oom-killing.
+    If set, at lowmem shortage oom killing (for example, exhausting NORMAL_ZONE
+    under x86-32 HIGHMEM host), oom-killer will see lowmem rss usage of
+    processes instead of vmsize. Works only when CONFIG_MMU=y.
+
+The default value is 0
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
Index: mmotm-2.6.33-Jan15/fs/proc/base.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/fs/proc/base.c
+++ mmotm-2.6.33-Jan15/fs/proc/base.c
@@ -458,7 +458,8 @@ static const struct file_operations proc
 #endif
 
 /* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
+unsigned long badness(struct task_struct *p,
+	unsigned long uptime, int constraint);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points;
@@ -466,7 +467,7 @@ static int proc_oom_score(struct task_st
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	points = badness(task->group_leader, uptime.tv_sec, CONSTRAINT_NONE);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }


^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-25  6:15     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-25  6:15 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, akpm, rientjes, minchan.kim, linux-kernel, balbir

Did several tests on x86-32 and I felt that sysctl value should be
printed on oom log... this is v3.

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Default oom-killer uses badness calculation based on process's vm_size
and some amounts of heuristics. Some users see proc->oom_score and
proc->oom_adj to control oom-killed tendency under their server.

Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
enhancements are demanded. But such enhancements for oom-killer makes
incomaptibility to oom-controls in enterprise world. So, this patch
adds sysctl for extensions for oom-killer. Main purpose is for
making a chance for wider test for new scheme.

One cause of OOM-Killer is memory shortage in lower zones.
(If memory is enough, lowmem_reserve_ratio works well. but..)
I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
my cusotmer support jobs. If we just see process's vm_size at oom,
we can never kill a process which has lowmem.
At last, there will be an oom-serial-killer.

Now, we have per-mm lowmem usage counter. We can make use of it
to select a good victim.

This patch does
  - add sysctl for new bahavior.
  - add CONSTRAINT_LOWMEM to oom's constraint type.
  - pass constraint to __badness()
  - change calculation based on constraint. If CONSTRAINT_LOWMEM,
    use low_rss instead of vmsize.

Changelog 2010/01/25
 - showing extension_mask value in OOM kill main log header.
Changelog 2010/01/22:
 - added sysctl
 - fixed !CONFIG_MMU
 - fixed fs/proc/base.c breakacge.

Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/sysctl/vm.txt |   16 +++++++
 fs/proc/base.c              |    5 +-
 include/linux/oom.h         |    1 
 kernel/sysctl.c             |   10 ++++
 mm/oom_kill.c               |   92 ++++++++++++++++++++++++++++++++------------
 5 files changed, 97 insertions(+), 27 deletions(-)

Index: mmotm-2.6.33-Jan15/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15/include/linux/oom.h
@@ -20,6 +20,7 @@ struct notifier_block;
  */
 enum oom_constraint {
 	CONSTRAINT_NONE,
+	CONSTRAINT_LOWMEM,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
 };
Index: mmotm-2.6.33-Jan15/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Jan15/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
+int sysctl_oom_kill_extension_mask;
+enum {
+	EXT_LOWMEM_OOM,
+};
+
+#ifdef CONFIG_MMU
+static int oom_extension(int idx)
+{
+	return sysctl_oom_kill_extension_mask & (1 << idx);
+}
+#else
+static int oom_extension(int idx)
+{
+	return 0;
+}
+#endif
+
 /*
  * Is all threads of the target process nodes overlap ours?
  */
@@ -55,6 +72,7 @@ static int has_intersects_mems_allowed(s
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
+ * @constraint: context of badness calculation.
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -70,7 +88,8 @@ static int has_intersects_mems_allowed(s
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+			int constraint)
 {
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
@@ -89,11 +108,16 @@ unsigned long badness(struct task_struct
 		task_unlock(p);
 		return 0;
 	}
-
-	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = mm->total_vm;
+	switch  (constraint) {
+	case CONSTRAINT_LOWMEM:
+		/* use lowmem usage as the basis for the badness */
+		points = get_low_rss(mm);
+		break;
+	default:
+		/* use virtual memory size as the basis for the badness */
+		points = mm->total_vm;
+		break;
+	}
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -113,12 +137,17 @@ unsigned long badness(struct task_struct
 	 * machine with an endless amount of children. In case a single
 	 * child is eating the vast majority of memory, adding only half
 	 * to the parents will make the child our kill candidate of choice.
+	 *
+	 * At lowmem shortage, this part is skipped because children's lowmem
+	 * usage is not related to its parent.
 	 */
-	list_for_each_entry(child, &p->children, sibling) {
-		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
-		task_unlock(child);
+	if (constraint != CONSTRAINT_LOWMEM) {
+		list_for_each_entry(child, &p->children, sibling) {
+			task_lock(child);
+			if (child->mm != mm && child->mm)
+				points += child->mm->total_vm/2 + 1;
+			task_unlock(child);
+		}
 	}
 
 	/*
@@ -212,6 +241,9 @@ static enum oom_constraint constrained_a
 	if (gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
+	if (oom_extension(EXT_LOWMEM_OOM) && (high_zoneidx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
+
 	/*
 	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
 	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
@@ -233,6 +265,10 @@ static enum oom_constraint constrained_a
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int zone_idx = gfp_zone(gfp_mask);
+
+	if (oom_extension(EXT_LOWMEM_OOM) && (zone_idx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -244,7 +280,7 @@ static enum oom_constraint constrained_a
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+				struct mem_cgroup *mem, int constraint)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -300,7 +336,7 @@ static struct task_struct *select_bad_pr
 		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = badness(p, uptime.tv_sec, constraint);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -360,8 +396,9 @@ static void dump_header(struct task_stru
 							struct mem_cgroup *mem)
 {
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-		"oom_adj=%d\n",
-		current->comm, gfp_mask, order, current->signal->oom_adj);
+		"oom_adj=%d extesion=%x\n",
+		current->comm, gfp_mask, order,
+		current->signal->oom_adj, sysctl_oom_kill_extension_mask);
 	task_lock(current);
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
@@ -455,7 +492,7 @@ static int oom_kill_process(struct task_
 	}
 
 	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, task_pid_nr(p), p->comm, points);
+				message, task_pid_nr(p), p->comm, points);
 
 	/* Try to kill a child first */
 	list_for_each_entry(c, &p->children, sibling) {
@@ -475,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem
 
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -557,7 +594,7 @@ void clear_zonelist_oom(struct zonelist 
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int constraint)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -571,7 +608,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -583,9 +620,16 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+	switch (constraint) {
+	case CONSTRAINT_LOWMEM:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory (in lowmem)"))
+			goto retry;
+	default:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
 			     "Out of memory"))
-		goto retry;
+			goto retry;
+	}
 }
 
 /*
@@ -612,7 +656,7 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE); /* unknown gfp_mask and order */
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -663,7 +707,7 @@ void out_of_memory(struct zonelist *zone
 		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
-
+	case CONSTRAINT_LOWMEM:
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
 			dump_header(NULL, gfp_mask, order, NULL);
@@ -671,7 +715,7 @@ void out_of_memory(struct zonelist *zone
 		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
+		__out_of_memory(gfp_mask, order, constraint);
 		break;
 	}
 
Index: mmotm-2.6.33-Jan15/kernel/sysctl.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/kernel/sysctl.c
+++ mmotm-2.6.33-Jan15/kernel/sysctl.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
-#include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
@@ -72,6 +71,7 @@ extern int sysctl_overcommit_ratio;
 extern int sysctl_panic_on_oom;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_oom_dump_tasks;
+extern int sysctl_oom_kill_extension_mask;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
@@ -202,6 +202,7 @@ extern struct ctl_table epoll_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+
 extern int prove_locking;
 extern int lock_stat;
 
@@ -1282,6 +1283,13 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "oom_kill_extension_mask",
+		.data		= &sysctl_oom_kill_extension_mask,
+		.maxlen		= sizeof(sysctl_oom_kill_extension_mask),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 
 /*
  * NOTE: do not add new entries to this table unless you have read
Index: mmotm-2.6.33-Jan15/Documentation/sysctl/vm.txt
===================================================================
--- mmotm-2.6.33-Jan15.orig/Documentation/sysctl/vm.txt
+++ mmotm-2.6.33-Jan15/Documentation/sysctl/vm.txt
@@ -45,6 +45,7 @@ Currently, these files are in /proc/sys/
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
+- oom_kill_extension_mask
 - overcommit_memory
 - overcommit_ratio
 - page-cluster
@@ -511,6 +512,21 @@ The default value is 0.
 
 ==============================================================
 
+oom_kill_extension_mask:
+
+This is a mask for oom-killer extension features.
+Setting these flags may cause incompatibility for proc->oom_score and
+proc->oom_adj controls. So, please set carefully.
+
+bit 0....lowmem aware oom-killing.
+    If set, at lowmem shortage oom killing (for example, exhausting NORMAL_ZONE
+    under x86-32 HIGHMEM host), oom-killer will see lowmem rss usage of
+    processes instead of vmsize. Works only when CONFIG_MMU=y.
+
+The default value is 0
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
Index: mmotm-2.6.33-Jan15/fs/proc/base.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/fs/proc/base.c
+++ mmotm-2.6.33-Jan15/fs/proc/base.c
@@ -458,7 +458,8 @@ static const struct file_operations proc
 #endif
 
 /* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
+unsigned long badness(struct task_struct *p,
+	unsigned long uptime, int constraint);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points;
@@ -466,7 +467,7 @@ static int proc_oom_score(struct task_st
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	points = badness(task->group_leader, uptime.tv_sec, CONSTRAINT_NONE);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-25  6:15     ` KAMEZAWA Hiroyuki
@ 2010-01-26 23:12       ` Andrew Morton
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrew Morton @ 2010-01-26 23:12 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Mon, 25 Jan 2010 15:15:03 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> Did several tests on x86-32 and I felt that sysctl value should be
> printed on oom log... this is v3.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Default oom-killer uses badness calculation based on process's vm_size
> and some amounts of heuristics. Some users see proc->oom_score and
> proc->oom_adj to control oom-killed tendency under their server.
> 
> Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
> enhancements are demanded. But such enhancements for oom-killer makes
> incomaptibility to oom-controls in enterprise world. So, this patch
> adds sysctl for extensions for oom-killer. Main purpose is for
> making a chance for wider test for new scheme.
> 
> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)
> I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
> my cusotmer support jobs. If we just see process's vm_size at oom,
> we can never kill a process which has lowmem.
> At last, there will be an oom-serial-killer.
> 
> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good victim.
> 
> This patch does
>   - add sysctl for new bahavior.
>   - add CONSTRAINT_LOWMEM to oom's constraint type.
>   - pass constraint to __badness()
>   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>     use low_rss instead of vmsize.
> 
> Changelog 2010/01/25
>  - showing extension_mask value in OOM kill main log header.
> Changelog 2010/01/22:
>  - added sysctl
>  - fixed !CONFIG_MMU
>  - fixed fs/proc/base.c breakacge.

It'd be nice to see some testing results for this.  Perhaps "here's a
test case and here's the before-and-after behaviour".

I don't like the sysctl knob much.  Hardly anyone will know to enable
it so the feature won't get much testing and this binary decision
fractures the testing effort.  It would be much better if we can get
everyone running the same code.  I mean, if there are certain workloads
on certain machines with which the oom-killer doesn't behave correctly
then fix it!

Why was the '#include <linux/sysctl.h>" removed from sysctl.c?

The patch adds a random newline to sysctl.c.

It was never a good idea to add extern declarations to sysctl.c.  It's
better to add them to a subsystem-specific header file (ie:
mm-sysctl.h) and then include that file from the mm files that define
or use sysctl_foo, and include it into sysctl.c.  Oh well.


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-26 23:12       ` Andrew Morton
  0 siblings, 0 replies; 75+ messages in thread
From: Andrew Morton @ 2010-01-26 23:12 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Mon, 25 Jan 2010 15:15:03 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> Did several tests on x86-32 and I felt that sysctl value should be
> printed on oom log... this is v3.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Default oom-killer uses badness calculation based on process's vm_size
> and some amounts of heuristics. Some users see proc->oom_score and
> proc->oom_adj to control oom-killed tendency under their server.
> 
> Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
> enhancements are demanded. But such enhancements for oom-killer makes
> incomaptibility to oom-controls in enterprise world. So, this patch
> adds sysctl for extensions for oom-killer. Main purpose is for
> making a chance for wider test for new scheme.
> 
> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)
> I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
> my cusotmer support jobs. If we just see process's vm_size at oom,
> we can never kill a process which has lowmem.
> At last, there will be an oom-serial-killer.
> 
> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good victim.
> 
> This patch does
>   - add sysctl for new bahavior.
>   - add CONSTRAINT_LOWMEM to oom's constraint type.
>   - pass constraint to __badness()
>   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>     use low_rss instead of vmsize.
> 
> Changelog 2010/01/25
>  - showing extension_mask value in OOM kill main log header.
> Changelog 2010/01/22:
>  - added sysctl
>  - fixed !CONFIG_MMU
>  - fixed fs/proc/base.c breakacge.

It'd be nice to see some testing results for this.  Perhaps "here's a
test case and here's the before-and-after behaviour".

I don't like the sysctl knob much.  Hardly anyone will know to enable
it so the feature won't get much testing and this binary decision
fractures the testing effort.  It would be much better if we can get
everyone running the same code.  I mean, if there are certain workloads
on certain machines with which the oom-killer doesn't behave correctly
then fix it!

Why was the '#include <linux/sysctl.h>" removed from sysctl.c?

The patch adds a random newline to sysctl.c.

It was never a good idea to add extern declarations to sysctl.c.  It's
better to add them to a subsystem-specific header file (ie:
mm-sysctl.h) and then include that file from the mm files that define
or use sysctl_foo, and include it into sysctl.c.  Oh well.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-25  6:15     ` KAMEZAWA Hiroyuki
@ 2010-01-26 23:16       ` Andrew Morton
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrew Morton @ 2010-01-26 23:16 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Mon, 25 Jan 2010 15:15:03 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> -unsigned long badness(struct task_struct *p, unsigned long uptime)
> +unsigned long badness(struct task_struct *p, unsigned long uptime,
> +			int constraint)

And badness() should be renamed to something better (eg, oom_badness), and
the declaration should be placed in a mm-specific header file.

Yes, the code was already like that.  But please don't leave crappiness in
place when you come across it - take the opportunity to fix it up.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-26 23:16       ` Andrew Morton
  0 siblings, 0 replies; 75+ messages in thread
From: Andrew Morton @ 2010-01-26 23:16 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Mon, 25 Jan 2010 15:15:03 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> -unsigned long badness(struct task_struct *p, unsigned long uptime)
> +unsigned long badness(struct task_struct *p, unsigned long uptime,
> +			int constraint)

And badness() should be renamed to something better (eg, oom_badness), and
the declaration should be placed in a mm-specific header file.

Yes, the code was already like that.  But please don't leave crappiness in
place when you come across it - take the opportunity to fix it up.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-26 23:16       ` Andrew Morton
@ 2010-01-26 23:44         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-26 23:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Tue, 26 Jan 2010 15:16:06 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Mon, 25 Jan 2010 15:15:03 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > -unsigned long badness(struct task_struct *p, unsigned long uptime)
> > +unsigned long badness(struct task_struct *p, unsigned long uptime,
> > +			int constraint)
> 
> And badness() should be renamed to something better (eg, oom_badness), and
> the declaration should be placed in a mm-specific header file.
> 
> Yes, the code was already like that.  But please don't leave crappiness in
> place when you come across it - take the opportunity to fix it up.
> 
Sure. I'll write v4.

-Kame


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-26 23:44         ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-26 23:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Tue, 26 Jan 2010 15:16:06 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Mon, 25 Jan 2010 15:15:03 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > -unsigned long badness(struct task_struct *p, unsigned long uptime)
> > +unsigned long badness(struct task_struct *p, unsigned long uptime,
> > +			int constraint)
> 
> And badness() should be renamed to something better (eg, oom_badness), and
> the declaration should be placed in a mm-specific header file.
> 
> Yes, the code was already like that.  But please don't leave crappiness in
> place when you come across it - take the opportunity to fix it up.
> 
Sure. I'll write v4.

-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-26 23:12       ` Andrew Morton
@ 2010-01-26 23:53         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-26 23:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Tue, 26 Jan 2010 15:12:02 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Mon, 25 Jan 2010 15:15:03 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> > This patch does
> >   - add sysctl for new bahavior.
> >   - add CONSTRAINT_LOWMEM to oom's constraint type.
> >   - pass constraint to __badness()
> >   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
> >     use low_rss instead of vmsize.
> > 
> > Changelog 2010/01/25
> >  - showing extension_mask value in OOM kill main log header.
> > Changelog 2010/01/22:
> >  - added sysctl
> >  - fixed !CONFIG_MMU
> >  - fixed fs/proc/base.c breakacge.
> 
> It'd be nice to see some testing results for this.  Perhaps "here's a
> test case and here's the before-and-after behaviour".
> 
Hm. posting test case module is O.K ?
At leaset, I'll add what test was done and /var/log/message output to the log.


> I don't like the sysctl knob much. 
me, too.

> Hardly anyone will know to enable
> it so the feature won't get much testing and this binary decision
> fractures the testing effort.  It would be much better if we can get
> everyone running the same code.  I mean, if there are certain workloads
> on certain machines with which the oom-killer doesn't behave correctly
> then fix it!
Yes, I think you're right. But "breaking current behaviro of our servers!"
arguments kills all proposal to this area and this oom-killer or vmscan is
a feature should be tested by real users. (I'll write fork-bomb detector
and RSS based OOM again.)

Then, I'd like to use sysctl. Distro/users can select default value of this
by /etc/sysctl.conf file, at least.


> 
> Why was the '#include <linux/sysctl.h>" removed from sysctl.c?
> 
> The patch adds a random newline to sysctl.c.
> 
Sorry. my bad.


> It was never a good idea to add extern declarations to sysctl.c.  It's
> better to add them to a subsystem-specific header file (ie:
> mm-sysctl.h) and then include that file from the mm files that define
> or use sysctl_foo, and include it into sysctl.c.  Oh well.
> 
Hmm. Okay. I'll consider about that.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-26 23:53         ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-26 23:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Tue, 26 Jan 2010 15:12:02 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Mon, 25 Jan 2010 15:15:03 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> > This patch does
> >   - add sysctl for new bahavior.
> >   - add CONSTRAINT_LOWMEM to oom's constraint type.
> >   - pass constraint to __badness()
> >   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
> >     use low_rss instead of vmsize.
> > 
> > Changelog 2010/01/25
> >  - showing extension_mask value in OOM kill main log header.
> > Changelog 2010/01/22:
> >  - added sysctl
> >  - fixed !CONFIG_MMU
> >  - fixed fs/proc/base.c breakacge.
> 
> It'd be nice to see some testing results for this.  Perhaps "here's a
> test case and here's the before-and-after behaviour".
> 
Hm. posting test case module is O.K ?
At leaset, I'll add what test was done and /var/log/message output to the log.


> I don't like the sysctl knob much. 
me, too.

> Hardly anyone will know to enable
> it so the feature won't get much testing and this binary decision
> fractures the testing effort.  It would be much better if we can get
> everyone running the same code.  I mean, if there are certain workloads
> on certain machines with which the oom-killer doesn't behave correctly
> then fix it!
Yes, I think you're right. But "breaking current behaviro of our servers!"
arguments kills all proposal to this area and this oom-killer or vmscan is
a feature should be tested by real users. (I'll write fork-bomb detector
and RSS based OOM again.)

Then, I'd like to use sysctl. Distro/users can select default value of this
by /etc/sysctl.conf file, at least.


> 
> Why was the '#include <linux/sysctl.h>" removed from sysctl.c?
> 
> The patch adds a random newline to sysctl.c.
> 
Sorry. my bad.


> It was never a good idea to add extern declarations to sysctl.c.  It's
> better to add them to a subsystem-specific header file (ie:
> mm-sysctl.h) and then include that file from the mm files that define
> or use sysctl_foo, and include it into sysctl.c.  Oh well.
> 
Hmm. Okay. I'll consider about that.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-26 23:53         ` KAMEZAWA Hiroyuki
@ 2010-01-27  0:19           ` Andrew Morton
  -1 siblings, 0 replies; 75+ messages in thread
From: Andrew Morton @ 2010-01-27  0:19 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Wed, 27 Jan 2010 08:53:55 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> > Hardly anyone will know to enable
> > it so the feature won't get much testing and this binary decision
> > fractures the testing effort.  It would be much better if we can get
> > everyone running the same code.  I mean, if there are certain workloads
> > on certain machines with which the oom-killer doesn't behave correctly
> > then fix it!
> Yes, I think you're right. But "breaking current behaviro of our servers!"
> arguments kills all proposal to this area and this oom-killer or vmscan is
> a feature should be tested by real users. (I'll write fork-bomb detector
> and RSS based OOM again.)

Well don't break their servers then ;)

What I'm not understanding is: why is it not possible to improve the
behaviour on the affected machines without affecting the behaviour on
other machines?

What are these "servers" to which you refer?  x86_32 servers, I assume
- the patch shouldn't affect 64-bit machines.  Why don't they also want
this treatment and in what way does the patch "break" them?


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-27  0:19           ` Andrew Morton
  0 siblings, 0 replies; 75+ messages in thread
From: Andrew Morton @ 2010-01-27  0:19 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Wed, 27 Jan 2010 08:53:55 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> > Hardly anyone will know to enable
> > it so the feature won't get much testing and this binary decision
> > fractures the testing effort.  It would be much better if we can get
> > everyone running the same code.  I mean, if there are certain workloads
> > on certain machines with which the oom-killer doesn't behave correctly
> > then fix it!
> Yes, I think you're right. But "breaking current behaviro of our servers!"
> arguments kills all proposal to this area and this oom-killer or vmscan is
> a feature should be tested by real users. (I'll write fork-bomb detector
> and RSS based OOM again.)

Well don't break their servers then ;)

What I'm not understanding is: why is it not possible to improve the
behaviour on the affected machines without affecting the behaviour on
other machines?

What are these "servers" to which you refer?  x86_32 servers, I assume
- the patch shouldn't affect 64-bit machines.  Why don't they also want
this treatment and in what way does the patch "break" them?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-27  0:19           ` Andrew Morton
@ 2010-01-27  0:58             ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-27  0:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Tue, 26 Jan 2010 16:19:52 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Wed, 27 Jan 2010 08:53:55 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > > Hardly anyone will know to enable
> > > it so the feature won't get much testing and this binary decision
> > > fractures the testing effort.  It would be much better if we can get
> > > everyone running the same code.  I mean, if there are certain workloads
> > > on certain machines with which the oom-killer doesn't behave correctly
> > > then fix it!
> > Yes, I think you're right. But "breaking current behaviro of our servers!"
> > arguments kills all proposal to this area and this oom-killer or vmscan is
> > a feature should be tested by real users. (I'll write fork-bomb detector
> > and RSS based OOM again.)
> 
> Well don't break their servers then ;)
> 
> What I'm not understanding is: why is it not possible to improve the
> behaviour on the affected machines without affecting the behaviour on
> other machines?
> 

Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.
After this patch, badness() returns different value based on given context.
Changing format of them was an idea, but, as David said, using "RSS" values
will show unstable oom_score. So, I didn't modify oom_score (for this time).

To be honest, all my work are for guys who don't tweak oom_adj based on oom_score.
IOW, this is for usual novice people. And I don't wan't to break servers which
depends on oom black magic currently supported.

It may be better to show lowmem_rss via /proc/<pid>/statm or somewhere. But
I didn't do that because usual people doesn't check that in periodic and
tweak oom_adj.

For my customers, I don't like oom black magic. I'd like to recommend to
use memcg, of course ;) But lowmem oom cannot be handled by memcg, well.
So I started from this. 


> What are these "servers" to which you refer?
Almost all servers/PCs/laptops which have multiple zones in memory layout.


> x86_32 servers, I assume
> - the patch shouldn't affect 64-bit machines.  Why don't they also want
> this treatment and in what way does the patch "break" them?

Ah, explanation was not enough.

This patch depends on mm-add-lowmem-detection-logic.patch
The lowmem is
   - ZONE_NORMAL in x86-32 which has HIGHMEM
   - ZONE_DMA32  in x86-64 which has ZONE_NORMAL
   - ZONE_DMA    in x86-64 which doesn't have ZONE_NORMAL(memory < 4G)
   - ZONE_DMA    in ia64 which has ZONE_NORMAL(memory > 4G)
   - no zone     in ppc. all zone are DMA. (lowmem_zone=-1)

So, this affects x86-64 hosts, especially when it has 4 Gbytes of memory
and 32bit pci cards.

Thanks,
-Kame















^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-27  0:58             ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-27  0:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Tue, 26 Jan 2010 16:19:52 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Wed, 27 Jan 2010 08:53:55 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > > Hardly anyone will know to enable
> > > it so the feature won't get much testing and this binary decision
> > > fractures the testing effort.  It would be much better if we can get
> > > everyone running the same code.  I mean, if there are certain workloads
> > > on certain machines with which the oom-killer doesn't behave correctly
> > > then fix it!
> > Yes, I think you're right. But "breaking current behaviro of our servers!"
> > arguments kills all proposal to this area and this oom-killer or vmscan is
> > a feature should be tested by real users. (I'll write fork-bomb detector
> > and RSS based OOM again.)
> 
> Well don't break their servers then ;)
> 
> What I'm not understanding is: why is it not possible to improve the
> behaviour on the affected machines without affecting the behaviour on
> other machines?
> 

Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.
After this patch, badness() returns different value based on given context.
Changing format of them was an idea, but, as David said, using "RSS" values
will show unstable oom_score. So, I didn't modify oom_score (for this time).

To be honest, all my work are for guys who don't tweak oom_adj based on oom_score.
IOW, this is for usual novice people. And I don't wan't to break servers which
depends on oom black magic currently supported.

It may be better to show lowmem_rss via /proc/<pid>/statm or somewhere. But
I didn't do that because usual people doesn't check that in periodic and
tweak oom_adj.

For my customers, I don't like oom black magic. I'd like to recommend to
use memcg, of course ;) But lowmem oom cannot be handled by memcg, well.
So I started from this. 


> What are these "servers" to which you refer?
Almost all servers/PCs/laptops which have multiple zones in memory layout.


> x86_32 servers, I assume
> - the patch shouldn't affect 64-bit machines.  Why don't they also want
> this treatment and in what way does the patch "break" them?

Ah, explanation was not enough.

This patch depends on mm-add-lowmem-detection-logic.patch
The lowmem is
   - ZONE_NORMAL in x86-32 which has HIGHMEM
   - ZONE_DMA32  in x86-64 which has ZONE_NORMAL
   - ZONE_DMA    in x86-64 which doesn't have ZONE_NORMAL(memory < 4G)
   - ZONE_DMA    in ia64 which has ZONE_NORMAL(memory > 4G)
   - no zone     in ppc. all zone are DMA. (lowmem_zone=-1)

So, this affects x86-64 hosts, especially when it has 4 Gbytes of memory
and 32bit pci cards.

Thanks,
-Kame














--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v4 0/2] oom-kill: add lowmem usage aware oom kill handling
  2010-01-27  0:58             ` KAMEZAWA Hiroyuki
@ 2010-01-27  6:30               ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-27  6:30 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

I updated oom-kill-add-lowmem-usage-aware-oom-kill-handling.patch.

I added "clean up" patch before new feature. I divided it into 2 patches.

[1/2] clean up vm related sysctl variable declaration
[2/2] lowmem aware oom-kill.

I rewrote patch description for 2/2 much. Plz point out if not enough yet.
And 2/2 includes some bug fixes.

Thank you for all helps.

Regards,
-Kame


^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v4 0/2] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-27  6:30               ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-27  6:30 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

I updated oom-kill-add-lowmem-usage-aware-oom-kill-handling.patch.

I added "clean up" patch before new feature. I divided it into 2 patches.

[1/2] clean up vm related sysctl variable declaration
[2/2] lowmem aware oom-kill.

I rewrote patch description for 2/2 much. Plz point out if not enough yet.
And 2/2 includes some bug fixes.

Thank you for all helps.

Regards,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v4 1/2] sysctl clean up vm related variable declarations
  2010-01-27  6:30               ` KAMEZAWA Hiroyuki
@ 2010-01-27  6:32                 ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-27  6:32 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, there are many "extern" declaration in kernel/sysctl.c. "extern"
declaration in *.c file is not appreciated in general.
And Hmm...it seems there are a few redundant declarations.

Because most of sysctl variables are defined in its own header file,
they should be declared in the same style, be done in its own *.h file.

This patch removes some VM(memory management) related sysctl's
variable declaration from kernel/sysctl.c and move them to
proper places.

Change log:
 - 2010/01/27 (new)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/mm.h     |    5 +++++
 include/linux/mmzone.h |    1 +
 include/linux/oom.h    |    5 +++++
 kernel/sysctl.c        |   16 ++--------------
 mm/mmap.c              |    5 +++++
 5 files changed, 18 insertions(+), 14 deletions(-)

Index: mmotm-2.6.33-Jan15-2/include/linux/mm.h
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/include/linux/mm.h
+++ mmotm-2.6.33-Jan15-2/include/linux/mm.h
@@ -1432,6 +1432,7 @@ int in_gate_area_no_task(unsigned long a
 #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
+extern int sysctl_drop_caches;
 int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
@@ -1476,5 +1477,9 @@ extern int soft_offline_page(struct page
 
 extern void dump_page(struct page *page);
 
+#ifndef CONFIG_NOMMU
+extern int sysctl_nr_trim_pages;
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
Index: mmotm-2.6.33-Jan15-2/include/linux/mmzone.h
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/include/linux/mmzone.h
+++ mmotm-2.6.33-Jan15-2/include/linux/mmzone.h
@@ -747,6 +747,7 @@ int min_free_kbytes_sysctl_handler(struc
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
+extern int percpu_pagelist_fraction; /* for sysctl */
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
Index: mmotm-2.6.33-Jan15-2/kernel/sysctl.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/kernel/sysctl.c
+++ mmotm-2.6.33-Jan15-2/kernel/sysctl.c
@@ -51,6 +51,8 @@
 #include <linux/slow-work.h>
 #include <linux/perf_event.h>
 #include <linux/rcustring.h>
+#include <linux/mman.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -67,11 +69,6 @@
 /* External variables not in a header file. */
 extern int C_A_D;
 extern int print_fatal_signals;
-extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
-extern int sysctl_panic_on_oom;
-extern int sysctl_oom_kill_allocating_task;
-extern int sysctl_oom_dump_tasks;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
@@ -80,14 +77,9 @@ extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
-extern int sysctl_drop_caches;
-extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
-#ifndef CONFIG_MMU
-extern int sysctl_nr_trim_pages;
-#endif
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -198,10 +190,6 @@ extern struct ctl_table inotify_table[];
 extern struct ctl_table epoll_table[];
 #endif
 
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
-int sysctl_legacy_va_layout;
-#endif
-
 extern int prove_locking;
 extern int lock_stat;
 
Index: mmotm-2.6.33-Jan15-2/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15-2/include/linux/oom.h
@@ -43,5 +43,10 @@ static inline void oom_killer_enable(voi
 {
 	oom_killer_disabled = false;
 }
+/* for sysctl */
+extern int sysctl_panic_on_oom;
+extern int sysctl_oom_kill_allocating_task;
+extern int sysctl_oom_dump_tasks;
+
 #endif /* __KERNEL__*/
 #endif /* _INCLUDE_LINUX_OOM_H */
Index: mmotm-2.6.33-Jan15-2/mm/mmap.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/mm/mmap.c
+++ mmotm-2.6.33-Jan15-2/mm/mmap.c
@@ -87,6 +87,11 @@ int sysctl_overcommit_ratio = 50;	/* def
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 struct percpu_counter vm_committed_as;
 
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+/* Used by each architecture's private code and sysctl. */
+int sysctl_legacy_va_layout;
+#endif
+
 /*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to


^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v4 1/2] sysctl clean up vm related variable declarations
@ 2010-01-27  6:32                 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-27  6:32 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, there are many "extern" declaration in kernel/sysctl.c. "extern"
declaration in *.c file is not appreciated in general.
And Hmm...it seems there are a few redundant declarations.

Because most of sysctl variables are defined in its own header file,
they should be declared in the same style, be done in its own *.h file.

This patch removes some VM(memory management) related sysctl's
variable declaration from kernel/sysctl.c and move them to
proper places.

Change log:
 - 2010/01/27 (new)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/mm.h     |    5 +++++
 include/linux/mmzone.h |    1 +
 include/linux/oom.h    |    5 +++++
 kernel/sysctl.c        |   16 ++--------------
 mm/mmap.c              |    5 +++++
 5 files changed, 18 insertions(+), 14 deletions(-)

Index: mmotm-2.6.33-Jan15-2/include/linux/mm.h
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/include/linux/mm.h
+++ mmotm-2.6.33-Jan15-2/include/linux/mm.h
@@ -1432,6 +1432,7 @@ int in_gate_area_no_task(unsigned long a
 #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
+extern int sysctl_drop_caches;
 int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
@@ -1476,5 +1477,9 @@ extern int soft_offline_page(struct page
 
 extern void dump_page(struct page *page);
 
+#ifndef CONFIG_NOMMU
+extern int sysctl_nr_trim_pages;
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
Index: mmotm-2.6.33-Jan15-2/include/linux/mmzone.h
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/include/linux/mmzone.h
+++ mmotm-2.6.33-Jan15-2/include/linux/mmzone.h
@@ -747,6 +747,7 @@ int min_free_kbytes_sysctl_handler(struc
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
+extern int percpu_pagelist_fraction; /* for sysctl */
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
Index: mmotm-2.6.33-Jan15-2/kernel/sysctl.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/kernel/sysctl.c
+++ mmotm-2.6.33-Jan15-2/kernel/sysctl.c
@@ -51,6 +51,8 @@
 #include <linux/slow-work.h>
 #include <linux/perf_event.h>
 #include <linux/rcustring.h>
+#include <linux/mman.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -67,11 +69,6 @@
 /* External variables not in a header file. */
 extern int C_A_D;
 extern int print_fatal_signals;
-extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
-extern int sysctl_panic_on_oom;
-extern int sysctl_oom_kill_allocating_task;
-extern int sysctl_oom_dump_tasks;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
@@ -80,14 +77,9 @@ extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
-extern int sysctl_drop_caches;
-extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
-#ifndef CONFIG_MMU
-extern int sysctl_nr_trim_pages;
-#endif
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -198,10 +190,6 @@ extern struct ctl_table inotify_table[];
 extern struct ctl_table epoll_table[];
 #endif
 
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
-int sysctl_legacy_va_layout;
-#endif
-
 extern int prove_locking;
 extern int lock_stat;
 
Index: mmotm-2.6.33-Jan15-2/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15-2/include/linux/oom.h
@@ -43,5 +43,10 @@ static inline void oom_killer_enable(voi
 {
 	oom_killer_disabled = false;
 }
+/* for sysctl */
+extern int sysctl_panic_on_oom;
+extern int sysctl_oom_kill_allocating_task;
+extern int sysctl_oom_dump_tasks;
+
 #endif /* __KERNEL__*/
 #endif /* _INCLUDE_LINUX_OOM_H */
Index: mmotm-2.6.33-Jan15-2/mm/mmap.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/mm/mmap.c
+++ mmotm-2.6.33-Jan15-2/mm/mmap.c
@@ -87,6 +87,11 @@ int sysctl_overcommit_ratio = 50;	/* def
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 struct percpu_counter vm_committed_as;
 
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+/* Used by each architecture's private code and sysctl. */
+int sysctl_legacy_va_layout;
+#endif
+
 /*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v4 2/2] oom-kill: add lowmem usage aware oom kill handling v4
  2010-01-27  6:30               ` KAMEZAWA Hiroyuki
@ 2010-01-27  6:33                 ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-27  6:33 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Default oom-killer uses badness calculation based on process's vm_size
and some amounts of heuristics. Some users see proc->oom_score and
proc->oom_adj to control oom-killed tendency under their server.

Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
enhancements are demanded. But such enhancements for oom-killer makes
incomaptibility to oom-controls in enterprise world. So, this patch
adds sysctl for extensions for oom-killer. Main purpose is for
making a chance for wider test for new scheme.

One cause of OOM-Killer is memory shortage in lower zones.
(If memory is enough, lowmem_reserve_ratio works well. but..)
I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
my cusotmer support jobs. If we just see process's vm_size at oom,
we can never kill a process which has lowmem.
At last, there will be an oom-serial-killer.

(*) Lowmem is small zones in archs for special purpose.
   for example
   ZONE_DMA32 .. in x86-64 4G+ memmory
   ZONE_DMA   .. in ia64 4G+ memory 
   ZONE_NORMAL ..in x86-32 with highmemory.

Now, we have per-mm lowmem usage counter. We can make use of it
to select a good victim.

This patch does
 - add sysctl for new bahavior.
 - add CONSTRAINT_LOWMEM to oom's constraint type.
 - pass constraint to __badness()
 - change calculation based on constraint. If CONSTRAINT_LOWMEM,
   use low_rss instead of vmsize.

[excuse]
Why sysctl is added is because this feature breaks assumption
in oom_score/oom_adj, a knob from users. Even if oom_adj is enough
high, it the process is the only user of lowmem, it will be killed.
and oom_score can't show this special situaion. I think this patch is for
users who don't tweak oom_adj etc...making default behavior better.
(If sysctl is set)

[a side effect]
 Now, the kernel panics if oom killer can't find a process to be killed.
 So, if lowmem-oom occurs and there is no process which has no lowmem,
 the kerenl panics before killing tasks after this patch.
 Anyway, only we can do in such case is killing innocent processes
 at random and gamble on freed kernel memory by process kill will help
 oom situaion.....But oom-killer is not a gamble logic.
 So, panic is okay in such extreme situation, I think.

[test result]
A test case and output sample in /var/log/messages.
  (test on x86-64)
    1. do swap off. (for easy test)
    2. use up memory > 4G by some logic(I used hugepage.)
    3. run an memory-consume program with 800M memory.
    4. run an kernel module which alloc memory with GFP_DMA32
       repeatedly.

 (Before patch)
[  739.620583] cat invoked oom-killer: gfp_mask=0xd4, order=3, oom_adj=0
[  739.620588] cat cpuset=/ mems_allowed=0
[  739.620591] Pid: 12875, comm: cat Tainted: P           2.6.33-rc4-mm1 #10
<snip>
...
[  739.739887] 28184 pages shared
[  739.739890] 494948 pages non-shared
[  739.739895] Out of memory: kill process 2716 (gnome-session)
score 191570 or a child
[  739.739901] Killed process 2726 (gnome-power-man) vsz:319232kB
anon-rss:8156kB, file-rss:2764kB lowmem 0kB

A process of no lowmem usage  is killed. And some innocent ones
were killed in the sequence of oom-killer.

 (After patch)
At 1st oom killer,
[  579.384021] cat invoked oom-killer: gfp_mask=0xd4, order=3, oom_adj=0
[  579.384026] cat cpuset=/ mems_allowed=0 extension=1
[  579.384029] Pid: 11438, comm: cat Tainted: P           2.6.33-rc4-mm1
<snip>
[  579.384008] Out of memory (in lowmem): kill process 2919 (malloc)
	       score 4323 or a child
[  579.384012] Killed process 2919 (malloc) vsz:1442976kB, anon-
rss:312kB, file-rss:1439004kB lowmem 1383556kB

Then, lowmem memory eater (malloc) was killed. And no more oom-kill.

Changelog 2010/01/27
 - removed declaraiont of badness() from fs/proc/base.c and added a
   oom_badness() declaration in oom.h
 - plarecment of sysctl_oom_kill_extenstion_mask declaration was
   moved to oom.h
 - updated patch description.
 - fixed spell miss.
 - fixed bug of lacked "break" in switch().
 - added printk before panic.

Changelog 2010/01/25
 - showing extension value in OOM kill main log header.

Changelog 2010/01/22:
 - added sysctl
 - fixed !CONFIG_MMU
 - fixed fs/proc/base.c breakacge.

Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/sysctl/vm.txt |   16 +++++++
 fs/proc/base.c              |    4 -
 include/linux/oom.h         |    7 +++
 kernel/sysctl.c             |    7 +++
 mm/oom_kill.c               |   97 ++++++++++++++++++++++++++++++++------------
 5 files changed, 104 insertions(+), 27 deletions(-)

Index: mmotm-2.6.33-Jan15-2/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15-2/include/linux/oom.h
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/nodemask.h>
+#include <linux/sched.h>
 
 struct zonelist;
 struct notifier_block;
@@ -20,6 +21,7 @@ struct notifier_block;
  */
 enum oom_constraint {
 	CONSTRAINT_NONE,
+	CONSTRAINT_LOWMEM,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
 };
@@ -47,6 +49,11 @@ static inline void oom_killer_enable(voi
 extern int sysctl_panic_on_oom;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_oom_dump_tasks;
+extern int sysctl_oom_kill_extension_mask;
+
+/* Called by fs/proc/base.c */
+unsigned long oom_badness(struct task_struct *p,
+		unsigned long uptime, int constraint);
 
 #endif /* __KERNEL__*/
 #endif /* _INCLUDE_LINUX_OOM_H */
Index: mmotm-2.6.33-Jan15-2/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Jan15-2/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
+int sysctl_oom_kill_extension_mask;
+enum {
+	EXT_LOWMEM_OOM,
+};
+
+#ifdef CONFIG_MMU
+static int oom_extension(int idx)
+{
+	return sysctl_oom_kill_extension_mask & (1 << idx);
+}
+#else
+static int oom_extension(int idx)
+{
+	return 0;
+}
+#endif
+
 /*
  * Is all threads of the target process nodes overlap ours?
  */
@@ -52,9 +69,10 @@ static int has_intersects_mems_allowed(s
 }
 
 /**
- * badness - calculate a numeric value for how bad this task has been
+ * oom_badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
+ * @constraint: context of badness calculation.
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -70,7 +88,8 @@ static int has_intersects_mems_allowed(s
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long oom_badness(struct task_struct *p, unsigned long uptime,
+			int constraint)
 {
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
@@ -89,11 +108,16 @@ unsigned long badness(struct task_struct
 		task_unlock(p);
 		return 0;
 	}
-
-	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = mm->total_vm;
+	switch  (constraint) {
+	case CONSTRAINT_LOWMEM:
+		/* use lowmem usage as the basis for the badness */
+		points = get_low_rss(mm);
+		break;
+	default:
+		/* use virtual memory size as the basis for the badness */
+		points = mm->total_vm;
+		break;
+	}
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -113,12 +137,17 @@ unsigned long badness(struct task_struct
 	 * machine with an endless amount of children. In case a single
 	 * child is eating the vast majority of memory, adding only half
 	 * to the parents will make the child our kill candidate of choice.
+	 *
+	 * At lowmem shortage, this part is skipped because children's lowmem
+	 * usage is not related to its parent.
 	 */
-	list_for_each_entry(child, &p->children, sibling) {
-		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
-		task_unlock(child);
+	if (constraint != CONSTRAINT_LOWMEM) {
+		list_for_each_entry(child, &p->children, sibling) {
+			task_lock(child);
+			if (child->mm != mm && child->mm)
+				points += child->mm->total_vm/2 + 1;
+			task_unlock(child);
+		}
 	}
 
 	/*
@@ -212,6 +241,9 @@ static enum oom_constraint constrained_a
 	if (gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
+	if (oom_extension(EXT_LOWMEM_OOM) && (high_zoneidx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
+
 	/*
 	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
 	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
@@ -233,6 +265,10 @@ static enum oom_constraint constrained_a
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int zone_idx = gfp_zone(gfp_mask);
+
+	if (oom_extension(EXT_LOWMEM_OOM) && (zone_idx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -244,7 +280,7 @@ static enum oom_constraint constrained_a
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+				struct mem_cgroup *mem, int constraint)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -300,7 +336,7 @@ static struct task_struct *select_bad_pr
 		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = oom_badness(p, uptime.tv_sec, constraint);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -360,8 +396,9 @@ static void dump_header(struct task_stru
 							struct mem_cgroup *mem)
 {
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-		"oom_adj=%d\n",
-		current->comm, gfp_mask, order, current->signal->oom_adj);
+		"oom_adj=%d extension=%x\n",
+		current->comm, gfp_mask, order,
+		current->signal->oom_adj, sysctl_oom_kill_extension_mask);
 	task_lock(current);
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
@@ -455,7 +492,7 @@ static int oom_kill_process(struct task_
 	}
 
 	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, task_pid_nr(p), p->comm, points);
+				message, task_pid_nr(p), p->comm, points);
 
 	/* Try to kill a child first */
 	list_for_each_entry(c, &p->children, sibling) {
@@ -475,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem
 
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -557,7 +594,7 @@ void clear_zonelist_oom(struct zonelist 
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int constraint)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -571,7 +608,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -579,13 +616,23 @@ retry:
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
+		if (constraint == CONSTRAINT_LOWMEM)
+			printk("Lowmem is exhausted and no process has lowmem\n");
 		dump_header(NULL, gfp_mask, order, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+	switch (constraint) {
+	case CONSTRAINT_LOWMEM:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory (in lowmem)"))
+			goto retry;
+		break;
+	default:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
 			     "Out of memory"))
-		goto retry;
+			goto retry;
+	}
 }
 
 /*
@@ -612,7 +659,7 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE); /* unknown gfp_mask and order */
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -663,7 +710,7 @@ void out_of_memory(struct zonelist *zone
 		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
-
+	case CONSTRAINT_LOWMEM:
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
 			dump_header(NULL, gfp_mask, order, NULL);
@@ -671,7 +718,7 @@ void out_of_memory(struct zonelist *zone
 		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
+		__out_of_memory(gfp_mask, order, constraint);
 		break;
 	}
 
Index: mmotm-2.6.33-Jan15-2/kernel/sysctl.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/kernel/sysctl.c
+++ mmotm-2.6.33-Jan15-2/kernel/sysctl.c
@@ -1270,6 +1270,13 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "oom_kill_extension_mask",
+		.data		= &sysctl_oom_kill_extension_mask,
+		.maxlen		= sizeof(sysctl_oom_kill_extension_mask),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 
 /*
  * NOTE: do not add new entries to this table unless you have read
Index: mmotm-2.6.33-Jan15-2/Documentation/sysctl/vm.txt
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/Documentation/sysctl/vm.txt
+++ mmotm-2.6.33-Jan15-2/Documentation/sysctl/vm.txt
@@ -45,6 +45,7 @@ Currently, these files are in /proc/sys/
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
+- oom_kill_extension_mask
 - overcommit_memory
 - overcommit_ratio
 - page-cluster
@@ -511,6 +512,21 @@ The default value is 0.
 
 ==============================================================
 
+oom_kill_extension_mask:
+
+This is a mask for oom-killer extension features.
+Setting these flags may cause incompatibility for proc->oom_score and
+proc->oom_adj controls. So, please set carefully.
+
+bit 0....lowmem aware oom-killing.
+    If set, at lowmem shortage oom killing (for example, exhausting NORMAL_ZONE
+    under x86-32 HIGHMEM host), oom-killer will see lowmem rss usage of
+    processes instead of vmsize. Works only when CONFIG_MMU=y.
+
+The default value is 0
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
Index: mmotm-2.6.33-Jan15-2/fs/proc/base.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/fs/proc/base.c
+++ mmotm-2.6.33-Jan15-2/fs/proc/base.c
@@ -458,7 +458,6 @@ static const struct file_operations proc
 #endif
 
 /* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points;
@@ -466,7 +465,8 @@ static int proc_oom_score(struct task_st
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	points = oom_badness(task->group_leader,
+			uptime.tv_sec, CONSTRAINT_NONE);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }


^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v4 2/2] oom-kill: add lowmem usage aware oom kill handling v4
@ 2010-01-27  6:33                 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-27  6:33 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Default oom-killer uses badness calculation based on process's vm_size
and some amounts of heuristics. Some users see proc->oom_score and
proc->oom_adj to control oom-killed tendency under their server.

Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
enhancements are demanded. But such enhancements for oom-killer makes
incomaptibility to oom-controls in enterprise world. So, this patch
adds sysctl for extensions for oom-killer. Main purpose is for
making a chance for wider test for new scheme.

One cause of OOM-Killer is memory shortage in lower zones.
(If memory is enough, lowmem_reserve_ratio works well. but..)
I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
my cusotmer support jobs. If we just see process's vm_size at oom,
we can never kill a process which has lowmem.
At last, there will be an oom-serial-killer.

(*) Lowmem is small zones in archs for special purpose.
   for example
   ZONE_DMA32 .. in x86-64 4G+ memmory
   ZONE_DMA   .. in ia64 4G+ memory 
   ZONE_NORMAL ..in x86-32 with highmemory.

Now, we have per-mm lowmem usage counter. We can make use of it
to select a good victim.

This patch does
 - add sysctl for new bahavior.
 - add CONSTRAINT_LOWMEM to oom's constraint type.
 - pass constraint to __badness()
 - change calculation based on constraint. If CONSTRAINT_LOWMEM,
   use low_rss instead of vmsize.

[excuse]
Why sysctl is added is because this feature breaks assumption
in oom_score/oom_adj, a knob from users. Even if oom_adj is enough
high, it the process is the only user of lowmem, it will be killed.
and oom_score can't show this special situaion. I think this patch is for
users who don't tweak oom_adj etc...making default behavior better.
(If sysctl is set)

[a side effect]
 Now, the kernel panics if oom killer can't find a process to be killed.
 So, if lowmem-oom occurs and there is no process which has no lowmem,
 the kerenl panics before killing tasks after this patch.
 Anyway, only we can do in such case is killing innocent processes
 at random and gamble on freed kernel memory by process kill will help
 oom situaion.....But oom-killer is not a gamble logic.
 So, panic is okay in such extreme situation, I think.

[test result]
A test case and output sample in /var/log/messages.
  (test on x86-64)
    1. do swap off. (for easy test)
    2. use up memory > 4G by some logic(I used hugepage.)
    3. run an memory-consume program with 800M memory.
    4. run an kernel module which alloc memory with GFP_DMA32
       repeatedly.

 (Before patch)
[  739.620583] cat invoked oom-killer: gfp_mask=0xd4, order=3, oom_adj=0
[  739.620588] cat cpuset=/ mems_allowed=0
[  739.620591] Pid: 12875, comm: cat Tainted: P           2.6.33-rc4-mm1 #10
<snip>
...
[  739.739887] 28184 pages shared
[  739.739890] 494948 pages non-shared
[  739.739895] Out of memory: kill process 2716 (gnome-session)
score 191570 or a child
[  739.739901] Killed process 2726 (gnome-power-man) vsz:319232kB
anon-rss:8156kB, file-rss:2764kB lowmem 0kB

A process of no lowmem usage  is killed. And some innocent ones
were killed in the sequence of oom-killer.

 (After patch)
At 1st oom killer,
[  579.384021] cat invoked oom-killer: gfp_mask=0xd4, order=3, oom_adj=0
[  579.384026] cat cpuset=/ mems_allowed=0 extension=1
[  579.384029] Pid: 11438, comm: cat Tainted: P           2.6.33-rc4-mm1
<snip>
[  579.384008] Out of memory (in lowmem): kill process 2919 (malloc)
	       score 4323 or a child
[  579.384012] Killed process 2919 (malloc) vsz:1442976kB, anon-
rss:312kB, file-rss:1439004kB lowmem 1383556kB

Then, lowmem memory eater (malloc) was killed. And no more oom-kill.

Changelog 2010/01/27
 - removed declaraiont of badness() from fs/proc/base.c and added a
   oom_badness() declaration in oom.h
 - plarecment of sysctl_oom_kill_extenstion_mask declaration was
   moved to oom.h
 - updated patch description.
 - fixed spell miss.
 - fixed bug of lacked "break" in switch().
 - added printk before panic.

Changelog 2010/01/25
 - showing extension value in OOM kill main log header.

Changelog 2010/01/22:
 - added sysctl
 - fixed !CONFIG_MMU
 - fixed fs/proc/base.c breakacge.

Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/sysctl/vm.txt |   16 +++++++
 fs/proc/base.c              |    4 -
 include/linux/oom.h         |    7 +++
 kernel/sysctl.c             |    7 +++
 mm/oom_kill.c               |   97 ++++++++++++++++++++++++++++++++------------
 5 files changed, 104 insertions(+), 27 deletions(-)

Index: mmotm-2.6.33-Jan15-2/include/linux/oom.h
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/include/linux/oom.h
+++ mmotm-2.6.33-Jan15-2/include/linux/oom.h
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/nodemask.h>
+#include <linux/sched.h>
 
 struct zonelist;
 struct notifier_block;
@@ -20,6 +21,7 @@ struct notifier_block;
  */
 enum oom_constraint {
 	CONSTRAINT_NONE,
+	CONSTRAINT_LOWMEM,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
 };
@@ -47,6 +49,11 @@ static inline void oom_killer_enable(voi
 extern int sysctl_panic_on_oom;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_oom_dump_tasks;
+extern int sysctl_oom_kill_extension_mask;
+
+/* Called by fs/proc/base.c */
+unsigned long oom_badness(struct task_struct *p,
+		unsigned long uptime, int constraint);
 
 #endif /* __KERNEL__*/
 #endif /* _INCLUDE_LINUX_OOM_H */
Index: mmotm-2.6.33-Jan15-2/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Jan15-2/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
+int sysctl_oom_kill_extension_mask;
+enum {
+	EXT_LOWMEM_OOM,
+};
+
+#ifdef CONFIG_MMU
+static int oom_extension(int idx)
+{
+	return sysctl_oom_kill_extension_mask & (1 << idx);
+}
+#else
+static int oom_extension(int idx)
+{
+	return 0;
+}
+#endif
+
 /*
  * Is all threads of the target process nodes overlap ours?
  */
@@ -52,9 +69,10 @@ static int has_intersects_mems_allowed(s
 }
 
 /**
- * badness - calculate a numeric value for how bad this task has been
+ * oom_badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
+ * @constraint: context of badness calculation.
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -70,7 +88,8 @@ static int has_intersects_mems_allowed(s
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long oom_badness(struct task_struct *p, unsigned long uptime,
+			int constraint)
 {
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
@@ -89,11 +108,16 @@ unsigned long badness(struct task_struct
 		task_unlock(p);
 		return 0;
 	}
-
-	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = mm->total_vm;
+	switch  (constraint) {
+	case CONSTRAINT_LOWMEM:
+		/* use lowmem usage as the basis for the badness */
+		points = get_low_rss(mm);
+		break;
+	default:
+		/* use virtual memory size as the basis for the badness */
+		points = mm->total_vm;
+		break;
+	}
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -113,12 +137,17 @@ unsigned long badness(struct task_struct
 	 * machine with an endless amount of children. In case a single
 	 * child is eating the vast majority of memory, adding only half
 	 * to the parents will make the child our kill candidate of choice.
+	 *
+	 * At lowmem shortage, this part is skipped because children's lowmem
+	 * usage is not related to its parent.
 	 */
-	list_for_each_entry(child, &p->children, sibling) {
-		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
-		task_unlock(child);
+	if (constraint != CONSTRAINT_LOWMEM) {
+		list_for_each_entry(child, &p->children, sibling) {
+			task_lock(child);
+			if (child->mm != mm && child->mm)
+				points += child->mm->total_vm/2 + 1;
+			task_unlock(child);
+		}
 	}
 
 	/*
@@ -212,6 +241,9 @@ static enum oom_constraint constrained_a
 	if (gfp_mask & __GFP_THISNODE)
 		return CONSTRAINT_NONE;
 
+	if (oom_extension(EXT_LOWMEM_OOM) && (high_zoneidx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
+
 	/*
 	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
 	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
@@ -233,6 +265,10 @@ static enum oom_constraint constrained_a
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	int zone_idx = gfp_zone(gfp_mask);
+
+	if (oom_extension(EXT_LOWMEM_OOM) && (zone_idx <= lowmem_zone))
+		return CONSTRAINT_LOWMEM;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -244,7 +280,7 @@ static enum oom_constraint constrained_a
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+				struct mem_cgroup *mem, int constraint)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -300,7 +336,7 @@ static struct task_struct *select_bad_pr
 		if (p->signal->oom_adj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = oom_badness(p, uptime.tv_sec, constraint);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -360,8 +396,9 @@ static void dump_header(struct task_stru
 							struct mem_cgroup *mem)
 {
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-		"oom_adj=%d\n",
-		current->comm, gfp_mask, order, current->signal->oom_adj);
+		"oom_adj=%d extension=%x\n",
+		current->comm, gfp_mask, order,
+		current->signal->oom_adj, sysctl_oom_kill_extension_mask);
 	task_lock(current);
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
@@ -455,7 +492,7 @@ static int oom_kill_process(struct task_
 	}
 
 	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
-					message, task_pid_nr(p), p->comm, points);
+				message, task_pid_nr(p), p->comm, points);
 
 	/* Try to kill a child first */
 	list_for_each_entry(c, &p->children, sibling) {
@@ -475,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem
 
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -557,7 +594,7 @@ void clear_zonelist_oom(struct zonelist 
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int constraint)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -571,7 +608,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -579,13 +616,23 @@ retry:
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
+		if (constraint == CONSTRAINT_LOWMEM)
+			printk("Lowmem is exhausted and no process has lowmem\n");
 		dump_header(NULL, gfp_mask, order, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+	switch (constraint) {
+	case CONSTRAINT_LOWMEM:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory (in lowmem)"))
+			goto retry;
+		break;
+	default:
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
 			     "Out of memory"))
-		goto retry;
+			goto retry;
+	}
 }
 
 /*
@@ -612,7 +659,7 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE); /* unknown gfp_mask and order */
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -663,7 +710,7 @@ void out_of_memory(struct zonelist *zone
 		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
-
+	case CONSTRAINT_LOWMEM:
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
 			dump_header(NULL, gfp_mask, order, NULL);
@@ -671,7 +718,7 @@ void out_of_memory(struct zonelist *zone
 		}
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
+		__out_of_memory(gfp_mask, order, constraint);
 		break;
 	}
 
Index: mmotm-2.6.33-Jan15-2/kernel/sysctl.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/kernel/sysctl.c
+++ mmotm-2.6.33-Jan15-2/kernel/sysctl.c
@@ -1270,6 +1270,13 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "oom_kill_extension_mask",
+		.data		= &sysctl_oom_kill_extension_mask,
+		.maxlen		= sizeof(sysctl_oom_kill_extension_mask),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 
 /*
  * NOTE: do not add new entries to this table unless you have read
Index: mmotm-2.6.33-Jan15-2/Documentation/sysctl/vm.txt
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/Documentation/sysctl/vm.txt
+++ mmotm-2.6.33-Jan15-2/Documentation/sysctl/vm.txt
@@ -45,6 +45,7 @@ Currently, these files are in /proc/sys/
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
+- oom_kill_extension_mask
 - overcommit_memory
 - overcommit_ratio
 - page-cluster
@@ -511,6 +512,21 @@ The default value is 0.
 
 ==============================================================
 
+oom_kill_extension_mask:
+
+This is a mask for oom-killer extension features.
+Setting these flags may cause incompatibility for proc->oom_score and
+proc->oom_adj controls. So, please set carefully.
+
+bit 0....lowmem aware oom-killing.
+    If set, at lowmem shortage oom killing (for example, exhausting NORMAL_ZONE
+    under x86-32 HIGHMEM host), oom-killer will see lowmem rss usage of
+    processes instead of vmsize. Works only when CONFIG_MMU=y.
+
+The default value is 0
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
Index: mmotm-2.6.33-Jan15-2/fs/proc/base.c
===================================================================
--- mmotm-2.6.33-Jan15-2.orig/fs/proc/base.c
+++ mmotm-2.6.33-Jan15-2/fs/proc/base.c
@@ -458,7 +458,6 @@ static const struct file_operations proc
 #endif
 
 /* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points;
@@ -466,7 +465,8 @@ static int proc_oom_score(struct task_st
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	points = oom_badness(task->group_leader,
+			uptime.tv_sec, CONSTRAINT_NONE);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-25  6:15     ` KAMEZAWA Hiroyuki
@ 2010-01-27 23:40       ` David Rientjes
  -1 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-27 23:40 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Balbir Singh, minchan.kim, linux-kernel, linux-mm

On Mon, 25 Jan 2010, KAMEZAWA Hiroyuki wrote:

> Default oom-killer uses badness calculation based on process's vm_size
> and some amounts of heuristics. Some users see proc->oom_score and
> proc->oom_adj to control oom-killed tendency under their server.
> 
> Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
> enhancements are demanded. But such enhancements for oom-killer makes
> incomaptibility to oom-controls in enterprise world. So, this patch
> adds sysctl for extensions for oom-killer. Main purpose is for
> making a chance for wider test for new scheme.
> 

That's insufficient for inclusion in mainline, we don't add new sysctls so 
that new heuristics can be tried out.  It's fine to propose a new sysctl 
to define how the oom killer behaves, but the main purpose would not be 
for testing; rather, it would be to enable options that users within the 
minority would want to use.

I disagree that we should be doing this as a bitmask that defines certain 
oom killer options; we already have three seperate sysctls which also 
enable options: panic_on_oom, oom_kill_allocating_task, and 
oom_dump_tasks.  Either these existing sysctls need to be converted to the 
bitmask, breaking the long-standing legacy support, or you simply need to 
clutter procfs a little more.  I'm slightly biased toward the latter since 
it doesn't require any userspace change and tunables such as panic_on_oom 
have been around for a long time.

 [ Note: it may be possible to consolidate two of these existing sysctls
   down into one: oom_dump_tasks can be enabled by default if the tasklist
   is sufficiently short and the only use-case for oom_kill_allocating_task
   is for machines with enormously long tasklists to prevent unnecessary
   delays in selecting a bad process to kill.  Thus, we could probably
   consolidate these into one sysctl: oom_kill_quick, which would disable
   the tasklist dump and always kill current when invoked. ]

> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)

I don't understand the reference to lowmem_reserve_ratio here, it may 
reserve lowmem from ~GFP_DMA requests but it does nothing to prevent oom 
conditions from excessive DMA page allocations.

> I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
> my cusotmer support jobs. If we just see process's vm_size at oom,
> we can never kill a process which has lowmem.

That's not always true, it may end up killing a large consumer of DMA 
memory by chance simply because the heuristics work out that way.  In 
other words, we can't say it will "never" work correctly as it is 
currently implemented.  I agree we can make it smarter, however.

> At last, there will be an oom-serial-killer.
> 

Heh.

> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good victim.
> 
> This patch does
>   - add sysctl for new bahavior.
>   - add CONSTRAINT_LOWMEM to oom's constraint type.
>   - pass constraint to __badness()

You mean badness()?  Passing the constraint works well for my 
CONSTRAINT_MEMPOLICY patch as well.

>   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>     use low_rss instead of vmsize.
> 

Nack, we can't simply use the lowmem rss as a baseline because 
/proc/pid/oom_adj, the single most powerful heuristic in badness(), is not 
defined for these dual scenarios.  There may only be a single baseline to 
define for oom_adj, otherwise it will have erradic results depending on 
the context in which the oom killer is called.  It can be used to polarize 
the heuristic depending on the total VM size which may be disadvantageous 
when using lowmem rss as the baseline.

I think the best alternative would be to strongly penalize the badness() 
points for tasks that do not have a lowmem rss when we are constrained by 
CONSTRAINT_LOWMEM, similar to how we penalize tasks not sharing current's 
mems_allowed since it (usually) doesn't help.  We do not necessarily 
always want to kill the task that is consuming the most lowmem for a 
single page allocation; we need to decide how valuable lowmem is in 
relation to overall VM size, however.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-27 23:40       ` David Rientjes
  0 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-27 23:40 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Balbir Singh, minchan.kim, linux-kernel, linux-mm

On Mon, 25 Jan 2010, KAMEZAWA Hiroyuki wrote:

> Default oom-killer uses badness calculation based on process's vm_size
> and some amounts of heuristics. Some users see proc->oom_score and
> proc->oom_adj to control oom-killed tendency under their server.
> 
> Now, we know oom-killer don't work ideally in some situaion, in PCs. Some
> enhancements are demanded. But such enhancements for oom-killer makes
> incomaptibility to oom-controls in enterprise world. So, this patch
> adds sysctl for extensions for oom-killer. Main purpose is for
> making a chance for wider test for new scheme.
> 

That's insufficient for inclusion in mainline, we don't add new sysctls so 
that new heuristics can be tried out.  It's fine to propose a new sysctl 
to define how the oom killer behaves, but the main purpose would not be 
for testing; rather, it would be to enable options that users within the 
minority would want to use.

I disagree that we should be doing this as a bitmask that defines certain 
oom killer options; we already have three seperate sysctls which also 
enable options: panic_on_oom, oom_kill_allocating_task, and 
oom_dump_tasks.  Either these existing sysctls need to be converted to the 
bitmask, breaking the long-standing legacy support, or you simply need to 
clutter procfs a little more.  I'm slightly biased toward the latter since 
it doesn't require any userspace change and tunables such as panic_on_oom 
have been around for a long time.

 [ Note: it may be possible to consolidate two of these existing sysctls
   down into one: oom_dump_tasks can be enabled by default if the tasklist
   is sufficiently short and the only use-case for oom_kill_allocating_task
   is for machines with enormously long tasklists to prevent unnecessary
   delays in selecting a bad process to kill.  Thus, we could probably
   consolidate these into one sysctl: oom_kill_quick, which would disable
   the tasklist dump and always kill current when invoked. ]

> One cause of OOM-Killer is memory shortage in lower zones.
> (If memory is enough, lowmem_reserve_ratio works well. but..)

I don't understand the reference to lowmem_reserve_ratio here, it may 
reserve lowmem from ~GFP_DMA requests but it does nothing to prevent oom 
conditions from excessive DMA page allocations.

> I saw lowmem-oom frequently on x86-32 and sometimes on ia64 in
> my cusotmer support jobs. If we just see process's vm_size at oom,
> we can never kill a process which has lowmem.

That's not always true, it may end up killing a large consumer of DMA 
memory by chance simply because the heuristics work out that way.  In 
other words, we can't say it will "never" work correctly as it is 
currently implemented.  I agree we can make it smarter, however.

> At last, there will be an oom-serial-killer.
> 

Heh.

> Now, we have per-mm lowmem usage counter. We can make use of it
> to select a good victim.
> 
> This patch does
>   - add sysctl for new bahavior.
>   - add CONSTRAINT_LOWMEM to oom's constraint type.
>   - pass constraint to __badness()

You mean badness()?  Passing the constraint works well for my 
CONSTRAINT_MEMPOLICY patch as well.

>   - change calculation based on constraint. If CONSTRAINT_LOWMEM,
>     use low_rss instead of vmsize.
> 

Nack, we can't simply use the lowmem rss as a baseline because 
/proc/pid/oom_adj, the single most powerful heuristic in badness(), is not 
defined for these dual scenarios.  There may only be a single baseline to 
define for oom_adj, otherwise it will have erradic results depending on 
the context in which the oom killer is called.  It can be used to polarize 
the heuristic depending on the total VM size which may be disadvantageous 
when using lowmem rss as the baseline.

I think the best alternative would be to strongly penalize the badness() 
points for tasks that do not have a lowmem rss when we are constrained by 
CONSTRAINT_LOWMEM, similar to how we penalize tasks not sharing current's 
mems_allowed since it (usually) doesn't help.  We do not necessarily 
always want to kill the task that is consuming the most lowmem for a 
single page allocation; we need to decide how valuable lowmem is in 
relation to overall VM size, however.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-26 23:53         ` KAMEZAWA Hiroyuki
@ 2010-01-27 23:46           ` David Rientjes
  -1 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-27 23:46 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Balbir Singh, minchan.kim, linux-kernel, linux-mm

On Wed, 27 Jan 2010, KAMEZAWA Hiroyuki wrote:

> Yes, I think you're right. But "breaking current behaviro of our servers!"
> arguments kills all proposal to this area and this oom-killer or vmscan is
> a feature should be tested by real users.

Nobody has said we should discount lowmem rss when dealing with a GFP_DMA 
allocation, it simply wasn't possible until the lowmem rss counters were 
introduced in -mm.  It would prevent the needless killing of innocent 
tasks which would not allow the page allocation to succeed, so it's a good 
feature to have.  It doesn't need to be configurable at all, we just need 
to find a way to introduce it into the heuristic without mangling oom_adj.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-27 23:46           ` David Rientjes
  0 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-27 23:46 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Balbir Singh, minchan.kim, linux-kernel, linux-mm

On Wed, 27 Jan 2010, KAMEZAWA Hiroyuki wrote:

> Yes, I think you're right. But "breaking current behaviro of our servers!"
> arguments kills all proposal to this area and this oom-killer or vmscan is
> a feature should be tested by real users.

Nobody has said we should discount lowmem rss when dealing with a GFP_DMA 
allocation, it simply wasn't possible until the lowmem rss counters were 
introduced in -mm.  It would prevent the needless killing of innocent 
tasks which would not allow the page allocation to succeed, so it's a good 
feature to have.  It doesn't need to be configurable at all, we just need 
to find a way to introduce it into the heuristic without mangling oom_adj.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-27  0:58             ` KAMEZAWA Hiroyuki
@ 2010-01-27 23:56               ` David Rientjes
  -1 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-27 23:56 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, minchan.kim, linux-kernel, balbir

On Wed, 27 Jan 2010, KAMEZAWA Hiroyuki wrote:

> Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.

Nonsense, there are plenty of userspace applications such as udev that 
tune their own oom_adj value on their own!  oom_adj is used by anyone who 
wants to define oom killer priority by polarizing the badness heuristic 
for certain tasks to, for example, always prefer them or completely 
disable oom killing for them.

> After this patch, badness() returns different value based on given context.
> Changing format of them was an idea, but, as David said, using "RSS" values
> will show unstable oom_score. So, I didn't modify oom_score (for this time).
> 

That's a seperate issue: you cannot define the baseline of the heuristic 
in terms of rss because it does not allow userspace to define when a task 
has become "rogue", i.e. when it is consuming far more memory than 
expected, because it is a dynamic value that depends on the state of the 
VM at the time of oom.  That is one of the two most popular reasons for 
tuning oom_adj, the other aforementioned.

The issue with using lowmem rss for CONSTRAINT_LOWMEM is that it 
misinterprets oom_adj values given to tasks; users will tune their oom_adj 
based on global, system-wide ooms (or use /proc/pid/oom_score to reveal 
the priority) and will never understand how it affects the value of a 
resident page in lowmem for GFP_DMA allocations.

> To be honest, all my work are for guys who don't tweak oom_adj based on oom_score.
> IOW, this is for usual novice people. And I don't wan't to break servers which
> depends on oom black magic currently supported.
> 

Why can't you simply create your own heuristic, seperate from badness(), 
for CONSTRAINT_LOWMEM?  Define the criteria that you see as important in 
selecting a task in that scenario and then propose it as a seperate 
function, there is no requirement that we must have a single heuristic 
that works for all the various oom killer constraints.  It would be 
entirely appropriate to ignore oom_adj in that heuristic, as well, since 
its not defined for such oom conditions (OOM_DISABLE is already taken care 
of in the tasklist scan and needs no further support).

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-27 23:56               ` David Rientjes
  0 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-27 23:56 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, minchan.kim, linux-kernel, balbir

On Wed, 27 Jan 2010, KAMEZAWA Hiroyuki wrote:

> Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.

Nonsense, there are plenty of userspace applications such as udev that 
tune their own oom_adj value on their own!  oom_adj is used by anyone who 
wants to define oom killer priority by polarizing the badness heuristic 
for certain tasks to, for example, always prefer them or completely 
disable oom killing for them.

> After this patch, badness() returns different value based on given context.
> Changing format of them was an idea, but, as David said, using "RSS" values
> will show unstable oom_score. So, I didn't modify oom_score (for this time).
> 

That's a seperate issue: you cannot define the baseline of the heuristic 
in terms of rss because it does not allow userspace to define when a task 
has become "rogue", i.e. when it is consuming far more memory than 
expected, because it is a dynamic value that depends on the state of the 
VM at the time of oom.  That is one of the two most popular reasons for 
tuning oom_adj, the other aforementioned.

The issue with using lowmem rss for CONSTRAINT_LOWMEM is that it 
misinterprets oom_adj values given to tasks; users will tune their oom_adj 
based on global, system-wide ooms (or use /proc/pid/oom_score to reveal 
the priority) and will never understand how it affects the value of a 
resident page in lowmem for GFP_DMA allocations.

> To be honest, all my work are for guys who don't tweak oom_adj based on oom_score.
> IOW, this is for usual novice people. And I don't wan't to break servers which
> depends on oom black magic currently supported.
> 

Why can't you simply create your own heuristic, seperate from badness(), 
for CONSTRAINT_LOWMEM?  Define the criteria that you see as important in 
selecting a task in that scenario and then propose it as a seperate 
function, there is no requirement that we must have a single heuristic 
that works for all the various oom killer constraints.  It would be 
entirely appropriate to ignore oom_adj in that heuristic, as well, since 
its not defined for such oom conditions (OOM_DISABLE is already taken care 
of in the tasklist scan and needs no further support).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v4 0/2] oom-kill: add lowmem usage aware oom kill handling
  2010-01-27  6:30               ` KAMEZAWA Hiroyuki
@ 2010-01-28  0:12                 ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-28  0:12 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Wed, 27 Jan 2010 15:30:53 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> I updated oom-kill-add-lowmem-usage-aware-oom-kill-handling.patch.
> 
> I added "clean up" patch before new feature. I divided it into 2 patches.
> 
> [1/2] clean up vm related sysctl variable declaration
> [2/2] lowmem aware oom-kill.
> 
> I rewrote patch description for 2/2 much. Plz point out if not enough yet.
> And 2/2 includes some bug fixes.
> 
> Thank you for all helps.
> 
I stop all this work. 

Sorry for noise. Already I asked Andrew to drop lowmem_rss counting.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v4 0/2] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-28  0:12                 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-28  0:12 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

On Wed, 27 Jan 2010 15:30:53 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> I updated oom-kill-add-lowmem-usage-aware-oom-kill-handling.patch.
> 
> I added "clean up" patch before new feature. I divided it into 2 patches.
> 
> [1/2] clean up vm related sysctl variable declaration
> [2/2] lowmem aware oom-kill.
> 
> I rewrote patch description for 2/2 much. Plz point out if not enough yet.
> And 2/2 includes some bug fixes.
> 
> Thank you for all helps.
> 
I stop all this work. 

Sorry for noise. Already I asked Andrew to drop lowmem_rss counting.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-27  0:58             ` KAMEZAWA Hiroyuki
@ 2010-01-28  0:16               ` Alan Cox
  -1 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-28  0:16 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

> Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.

And embedded, and some desktops (including some neat experimental hacks
where windows slowly get to be bigger bigger oom targes the longer
they've been non-focussed)

> For my customers, I don't like oom black magic. I'd like to recommend to
> use memcg, of course ;) But lowmem oom cannot be handled by memcg, well.
> So I started from this. 

I can't help feeling this is the wrong approach. IFF we are running out
of low memory pages then killing stuff for that reason is wrong to begin
with except in extreme cases and those extreme cases are probably also
cases the kill won't help.

If we have a movable user page (even an mlocked one) then if there is
space in other parts of memory (ie the OOM is due to a single zone
problem) we should *never* be killing in the first place, we should be
moving the page. The mlock case is a bit hairy but the non mlock case is
exactly the same sequence of operations as a page out and page in
somewhere else skipping the widdling on the disk bit in the middle.

There are cases we can't do that - eg if the kernel has it pinned for
DMA, but in that case OOM isn't going to recover the page either - at
least not until the DMA or whatever unpins it (at which point you could
just move it).

Am I missing something fundamental here ?

Alan

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-28  0:16               ` Alan Cox
  0 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-28  0:16 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

> Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.

And embedded, and some desktops (including some neat experimental hacks
where windows slowly get to be bigger bigger oom targes the longer
they've been non-focussed)

> For my customers, I don't like oom black magic. I'd like to recommend to
> use memcg, of course ;) But lowmem oom cannot be handled by memcg, well.
> So I started from this. 

I can't help feeling this is the wrong approach. IFF we are running out
of low memory pages then killing stuff for that reason is wrong to begin
with except in extreme cases and those extreme cases are probably also
cases the kill won't help.

If we have a movable user page (even an mlocked one) then if there is
space in other parts of memory (ie the OOM is due to a single zone
problem) we should *never* be killing in the first place, we should be
moving the page. The mlock case is a bit hairy but the non mlock case is
exactly the same sequence of operations as a page out and page in
somewhere else skipping the widdling on the disk bit in the middle.

There are cases we can't do that - eg if the kernel has it pinned for
DMA, but in that case OOM isn't going to recover the page either - at
least not until the DMA or whatever unpins it (at which point you could
just move it).

Am I missing something fundamental here ?

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-28  0:16               ` Alan Cox
@ 2010-01-28  0:26                 ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-28  0:26 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

Thank you for comment. But I stoppped this already....

On Thu, 28 Jan 2010 00:16:36 +0000
Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:

> > Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.
> 
> And embedded, and some desktops (including some neat experimental hacks
> where windows slowly get to be bigger bigger oom targes the longer
> they've been non-focussed)
> 
Sure.

> > For my customers, I don't like oom black magic. I'd like to recommend to
> > use memcg, of course ;) But lowmem oom cannot be handled by memcg, well.
> > So I started from this. 
> 
> I can't help feeling this is the wrong approach. IFF we are running out
> of low memory pages then killing stuff for that reason is wrong to begin
> with except in extreme cases and those extreme cases are probably also
> cases the kill won't help.
> 
> If we have a movable user page (even an mlocked one) then if there is
> space in other parts of memory (ie the OOM is due to a single zone
> problem) we should *never* be killing in the first place, we should be
> moving the page. The mlock case is a bit hairy but the non mlock case is
> exactly the same sequence of operations as a page out and page in
> somewhere else skipping the widdling on the disk bit in the middle.
> 
> There are cases we can't do that - eg if the kernel has it pinned for
> DMA, but in that case OOM isn't going to recover the page either - at
> least not until the DMA or whatever unpins it (at which point you could
> just move it).
> 
> Am I missing something fundamental here ?
> 

I just wanted to make oom-killer shouldn't kill sshd or X-serivce or
task launcher IOW, oom-killer shouldn't do not-reasonalble selection.

If lowmem user is killed, I'll be satisfied with the cace "Oh, the process
is killed because lowmem was in short and it used lowmem, Hmmm..." and
never be satisfied with the cace "Ohch!, F*cking OOM killer killed X-server
and 10s of innocent processes!!!".

But year, I stop this. For me, panic_on_oom=1 is all and enough.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-28  0:26                 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-28  0:26 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrew Morton, linux-mm, rientjes, minchan.kim, linux-kernel, balbir

Thank you for comment. But I stoppped this already....

On Thu, 28 Jan 2010 00:16:36 +0000
Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:

> > Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.
> 
> And embedded, and some desktops (including some neat experimental hacks
> where windows slowly get to be bigger bigger oom targes the longer
> they've been non-focussed)
> 
Sure.

> > For my customers, I don't like oom black magic. I'd like to recommend to
> > use memcg, of course ;) But lowmem oom cannot be handled by memcg, well.
> > So I started from this. 
> 
> I can't help feeling this is the wrong approach. IFF we are running out
> of low memory pages then killing stuff for that reason is wrong to begin
> with except in extreme cases and those extreme cases are probably also
> cases the kill won't help.
> 
> If we have a movable user page (even an mlocked one) then if there is
> space in other parts of memory (ie the OOM is due to a single zone
> problem) we should *never* be killing in the first place, we should be
> moving the page. The mlock case is a bit hairy but the non mlock case is
> exactly the same sequence of operations as a page out and page in
> somewhere else skipping the widdling on the disk bit in the middle.
> 
> There are cases we can't do that - eg if the kernel has it pinned for
> DMA, but in that case OOM isn't going to recover the page either - at
> least not until the DMA or whatever unpins it (at which point you could
> just move it).
> 
> Am I missing something fundamental here ?
> 

I just wanted to make oom-killer shouldn't kill sshd or X-serivce or
task launcher IOW, oom-killer shouldn't do not-reasonalble selection.

If lowmem user is killed, I'll be satisfied with the cace "Oh, the process
is killed because lowmem was in short and it used lowmem, Hmmm..." and
never be satisfied with the cace "Ohch!, F*cking OOM killer killed X-server
and 10s of innocent processes!!!".

But year, I stop this. For me, panic_on_oom=1 is all and enough.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-28  0:16               ` Alan Cox
@ 2010-01-28  0:59                 ` David Rientjes
  -1 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-28  0:59 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, minchan.kim,
	linux-kernel, Balbir Singh, Mel Gorman

On Thu, 28 Jan 2010, Alan Cox wrote:

> > Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.
> 
> And embedded, and some desktops (including some neat experimental hacks
> where windows slowly get to be bigger bigger oom targes the longer
> they've been non-focussed)
> 

Right, oom_adj is used much more widely than described.

> I can't help feeling this is the wrong approach. IFF we are running out
> of low memory pages then killing stuff for that reason is wrong to begin
> with except in extreme cases and those extreme cases are probably also
> cases the kill won't help.
> 
> If we have a movable user page (even an mlocked one) then if there is
> space in other parts of memory (ie the OOM is due to a single zone
> problem) we should *never* be killing in the first place, we should be
> moving the page. The mlock case is a bit hairy but the non mlock case is
> exactly the same sequence of operations as a page out and page in
> somewhere else skipping the widdling on the disk bit in the middle.
> 

Mel Gorman's memory compaction patchset will preempt direct reclaim and 
the oom killer if it can defragment zones by page migration such that a 
higher order allocation would now succeed.

In this specific context, both compaction and direct reclaim will have 
failed so the oom killer is the only alternative.  For __GFP_NOFAIL, 
that's required.  However, there has been some long-standing debate (and 
not only for lowmem, but for all oom conditions) about when the page 
allocator should simply return NULL.  We've always killed something on 
blocking allocations to favor current at the expense of other memory hogs, 
but that may be changed soon: it may make sense to defer oom killing 
completely unless the badness() score reaches a certain threshold such 
that memory leakers really can be dealt with accordingly.

In the lowmem case, it certainly seems plausible to use the same behavior 
that we currently do for mempolicy-constrained ooms: kill current.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-28  0:59                 ` David Rientjes
  0 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-28  0:59 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, minchan.kim,
	linux-kernel, Balbir Singh, Mel Gorman

On Thu, 28 Jan 2010, Alan Cox wrote:

> > Now, /proc/<pid>/oom_score and /proc/<pid>/oom_adj are used by servers.
> 
> And embedded, and some desktops (including some neat experimental hacks
> where windows slowly get to be bigger bigger oom targes the longer
> they've been non-focussed)
> 

Right, oom_adj is used much more widely than described.

> I can't help feeling this is the wrong approach. IFF we are running out
> of low memory pages then killing stuff for that reason is wrong to begin
> with except in extreme cases and those extreme cases are probably also
> cases the kill won't help.
> 
> If we have a movable user page (even an mlocked one) then if there is
> space in other parts of memory (ie the OOM is due to a single zone
> problem) we should *never* be killing in the first place, we should be
> moving the page. The mlock case is a bit hairy but the non mlock case is
> exactly the same sequence of operations as a page out and page in
> somewhere else skipping the widdling on the disk bit in the middle.
> 

Mel Gorman's memory compaction patchset will preempt direct reclaim and 
the oom killer if it can defragment zones by page migration such that a 
higher order allocation would now succeed.

In this specific context, both compaction and direct reclaim will have 
failed so the oom killer is the only alternative.  For __GFP_NOFAIL, 
that's required.  However, there has been some long-standing debate (and 
not only for lowmem, but for all oom conditions) about when the page 
allocator should simply return NULL.  We've always killed something on 
blocking allocations to favor current at the expense of other memory hogs, 
but that may be changed soon: it may make sense to defer oom killing 
completely unless the badness() score reaches a certain threshold such 
that memory leakers really can be dealt with accordingly.

In the lowmem case, it certainly seems plausible to use the same behavior 
that we currently do for mempolicy-constrained ooms: kill current.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v4 1/2] sysctl clean up vm related variable declarations
  2010-01-27  6:32                 ` KAMEZAWA Hiroyuki
@ 2010-01-28  8:54                   ` David Rientjes
  -1 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-28  8:54 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Balbir Singh, minchan.kim, linux-kernel, linux-mm

On Wed, 27 Jan 2010, KAMEZAWA Hiroyuki wrote:

> Now, there are many "extern" declaration in kernel/sysctl.c. "extern"
> declaration in *.c file is not appreciated in general.
> And Hmm...it seems there are a few redundant declarations.
> 

sysctl_overcommit_memory and sysctl_overcommit_ratio, right?

> Because most of sysctl variables are defined in its own header file,
> they should be declared in the same style, be done in its own *.h file.
> 
> This patch removes some VM(memory management) related sysctl's
> variable declaration from kernel/sysctl.c and move them to
> proper places.
> 
> Change log:
>  - 2010/01/27 (new)
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

This is a very nice cleanup of the sysctl code, I hope you find the time 
to push it regardless of the future direction of the oom killer lowmem 
constraint.

One comment below.

> ---
>  include/linux/mm.h     |    5 +++++
>  include/linux/mmzone.h |    1 +
>  include/linux/oom.h    |    5 +++++
>  kernel/sysctl.c        |   16 ++--------------
>  mm/mmap.c              |    5 +++++
>  5 files changed, 18 insertions(+), 14 deletions(-)
> 
> Index: mmotm-2.6.33-Jan15-2/include/linux/mm.h
> ===================================================================
> --- mmotm-2.6.33-Jan15-2.orig/include/linux/mm.h
> +++ mmotm-2.6.33-Jan15-2/include/linux/mm.h
> @@ -1432,6 +1432,7 @@ int in_gate_area_no_task(unsigned long a
>  #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
>  #endif	/* __HAVE_ARCH_GATE_AREA */
>  
> +extern int sysctl_drop_caches;
>  int drop_caches_sysctl_handler(struct ctl_table *, int,
>  					void __user *, size_t *, loff_t *);
>  unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> @@ -1476,5 +1477,9 @@ extern int soft_offline_page(struct page
>  
>  extern void dump_page(struct page *page);
>  
> +#ifndef CONFIG_NOMMU
> +extern int sysctl_nr_trim_pages;

This should be #ifndef CONFIG_MMU.

> +#endif
> +
>  #endif /* __KERNEL__ */
>  #endif /* _LINUX_MM_H */

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v4 1/2] sysctl clean up vm related variable declarations
@ 2010-01-28  8:54                   ` David Rientjes
  0 siblings, 0 replies; 75+ messages in thread
From: David Rientjes @ 2010-01-28  8:54 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Balbir Singh, minchan.kim, linux-kernel, linux-mm

On Wed, 27 Jan 2010, KAMEZAWA Hiroyuki wrote:

> Now, there are many "extern" declaration in kernel/sysctl.c. "extern"
> declaration in *.c file is not appreciated in general.
> And Hmm...it seems there are a few redundant declarations.
> 

sysctl_overcommit_memory and sysctl_overcommit_ratio, right?

> Because most of sysctl variables are defined in its own header file,
> they should be declared in the same style, be done in its own *.h file.
> 
> This patch removes some VM(memory management) related sysctl's
> variable declaration from kernel/sysctl.c and move them to
> proper places.
> 
> Change log:
>  - 2010/01/27 (new)
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

This is a very nice cleanup of the sysctl code, I hope you find the time 
to push it regardless of the future direction of the oom killer lowmem 
constraint.

One comment below.

> ---
>  include/linux/mm.h     |    5 +++++
>  include/linux/mmzone.h |    1 +
>  include/linux/oom.h    |    5 +++++
>  kernel/sysctl.c        |   16 ++--------------
>  mm/mmap.c              |    5 +++++
>  5 files changed, 18 insertions(+), 14 deletions(-)
> 
> Index: mmotm-2.6.33-Jan15-2/include/linux/mm.h
> ===================================================================
> --- mmotm-2.6.33-Jan15-2.orig/include/linux/mm.h
> +++ mmotm-2.6.33-Jan15-2/include/linux/mm.h
> @@ -1432,6 +1432,7 @@ int in_gate_area_no_task(unsigned long a
>  #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
>  #endif	/* __HAVE_ARCH_GATE_AREA */
>  
> +extern int sysctl_drop_caches;
>  int drop_caches_sysctl_handler(struct ctl_table *, int,
>  					void __user *, size_t *, loff_t *);
>  unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> @@ -1476,5 +1477,9 @@ extern int soft_offline_page(struct page
>  
>  extern void dump_page(struct page *page);
>  
> +#ifndef CONFIG_NOMMU
> +extern int sysctl_nr_trim_pages;

This should be #ifndef CONFIG_MMU.

> +#endif
> +
>  #endif /* __KERNEL__ */
>  #endif /* _LINUX_MM_H */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v4 1/2] sysctl clean up vm related variable declarations
  2010-01-28  8:54                   ` David Rientjes
@ 2010-01-28 10:30                     ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-28 10:30 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Balbir Singh, minchan.kim, linux-kernel, linux-mm

On Thu, 28 Jan 2010 00:54:49 -0800 (PST)
David Rientjes <rientjes@google.com> wrote:

> > ---
> >  include/linux/mm.h     |    5 +++++
> >  include/linux/mmzone.h |    1 +
> >  include/linux/oom.h    |    5 +++++
> >  kernel/sysctl.c        |   16 ++--------------
> >  mm/mmap.c              |    5 +++++
> >  5 files changed, 18 insertions(+), 14 deletions(-)
> > 
> > Index: mmotm-2.6.33-Jan15-2/include/linux/mm.h
> > ===================================================================
> > --- mmotm-2.6.33-Jan15-2.orig/include/linux/mm.h
> > +++ mmotm-2.6.33-Jan15-2/include/linux/mm.h
> > @@ -1432,6 +1432,7 @@ int in_gate_area_no_task(unsigned long a
> >  #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
> >  #endif	/* __HAVE_ARCH_GATE_AREA */
> >  
> > +extern int sysctl_drop_caches;
> >  int drop_caches_sysctl_handler(struct ctl_table *, int,
> >  					void __user *, size_t *, loff_t *);
> >  unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> > @@ -1476,5 +1477,9 @@ extern int soft_offline_page(struct page
> >  
> >  extern void dump_page(struct page *page);
> >  
> > +#ifndef CONFIG_NOMMU
> > +extern int sysctl_nr_trim_pages;
> 
> This should be #ifndef CONFIG_MMU.
> 
yes...thank you for review.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v4 1/2] sysctl clean up vm related variable declarations
@ 2010-01-28 10:30                     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 75+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-28 10:30 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Balbir Singh, minchan.kim, linux-kernel, linux-mm

On Thu, 28 Jan 2010 00:54:49 -0800 (PST)
David Rientjes <rientjes@google.com> wrote:

> > ---
> >  include/linux/mm.h     |    5 +++++
> >  include/linux/mmzone.h |    1 +
> >  include/linux/oom.h    |    5 +++++
> >  kernel/sysctl.c        |   16 ++--------------
> >  mm/mmap.c              |    5 +++++
> >  5 files changed, 18 insertions(+), 14 deletions(-)
> > 
> > Index: mmotm-2.6.33-Jan15-2/include/linux/mm.h
> > ===================================================================
> > --- mmotm-2.6.33-Jan15-2.orig/include/linux/mm.h
> > +++ mmotm-2.6.33-Jan15-2/include/linux/mm.h
> > @@ -1432,6 +1432,7 @@ int in_gate_area_no_task(unsigned long a
> >  #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
> >  #endif	/* __HAVE_ARCH_GATE_AREA */
> >  
> > +extern int sysctl_drop_caches;
> >  int drop_caches_sysctl_handler(struct ctl_table *, int,
> >  					void __user *, size_t *, loff_t *);
> >  unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> > @@ -1476,5 +1477,9 @@ extern int soft_offline_page(struct page
> >  
> >  extern void dump_page(struct page *page);
> >  
> > +#ifndef CONFIG_NOMMU
> > +extern int sysctl_nr_trim_pages;
> 
> This should be #ifndef CONFIG_MMU.
> 
yes...thank you for review.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-28  0:16               ` Alan Cox
@ 2010-01-29  0:25                 ` Vedran Furač
  -1 siblings, 0 replies; 75+ messages in thread
From: Vedran Furač @ 2010-01-29  0:25 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

Alan Cox wrote:

> Am I missing something fundamental here ?

Yes, the fact linux mm currently sucks. How else would you explain
possibility of killing random (often root owned) processes using a 5
lines program started by an ordinary user? Killed process could be an
apache web server or X server on a desktop. I demonstrated this flaw few
months ago here and only Kame tried to find a way to fix it but
encountered noncooperation.

I don't know what to say, really. Sad... Actually funny, when you know
that competition OS, often ridiculed by linux users, doesn't suffer any
consequences when that same 5 line program is run.

Regards,
Vedran


-- 
http://vedranf.net | a8e7a7783ca0d460fee090cc584adc12

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-29  0:25                 ` Vedran Furač
  0 siblings, 0 replies; 75+ messages in thread
From: Vedran Furač @ 2010-01-29  0:25 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

Alan Cox wrote:

> Am I missing something fundamental here ?

Yes, the fact linux mm currently sucks. How else would you explain
possibility of killing random (often root owned) processes using a 5
lines program started by an ordinary user? Killed process could be an
apache web server or X server on a desktop. I demonstrated this flaw few
months ago here and only Kame tried to find a way to fix it but
encountered noncooperation.

I don't know what to say, really. Sad... Actually funny, when you know
that competition OS, often ridiculed by linux users, doesn't suffer any
consequences when that same 5 line program is run.

Regards,
Vedran


-- 
http://vedranf.net | a8e7a7783ca0d460fee090cc584adc12

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-29  0:25                 ` Vedran Furač
@ 2010-01-29  0:35                   ` Alan Cox
  -1 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-29  0:35 UTC (permalink / raw)
  To: vedran.furac
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

On Fri, 29 Jan 2010 01:25:18 +0100
Vedran Furač <vedran.furac@gmail.com> wrote:

> Alan Cox wrote:
> 
> > Am I missing something fundamental here ?
> 
> Yes, the fact linux mm currently sucks. How else would you explain
> possibility of killing random (often root owned) processes using a 5
> lines program started by an ordinary user? 

If you don't want to run with overcommit you turn it off. At that point
processes get memory allocations refused if they can overrun the
theoretical limit, but you generally need more swap (it's one of the
reasons why things like BSD historically have a '3 * memory' rule).

So sounds to me like a problem between the keyboard and screen (coupled
with the fact far too few desktop vendors include tools to easily set
this stuff up)

Alan

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-29  0:35                   ` Alan Cox
  0 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-29  0:35 UTC (permalink / raw)
  To: vedran.furac
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

On Fri, 29 Jan 2010 01:25:18 +0100
Vedran Furač <vedran.furac@gmail.com> wrote:

> Alan Cox wrote:
> 
> > Am I missing something fundamental here ?
> 
> Yes, the fact linux mm currently sucks. How else would you explain
> possibility of killing random (often root owned) processes using a 5
> lines program started by an ordinary user? 

If you don't want to run with overcommit you turn it off. At that point
processes get memory allocations refused if they can overrun the
theoretical limit, but you generally need more swap (it's one of the
reasons why things like BSD historically have a '3 * memory' rule).

So sounds to me like a problem between the keyboard and screen (coupled
with the fact far too few desktop vendors include tools to easily set
this stuff up)

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-29  0:35                   ` Alan Cox
@ 2010-01-29  0:57                     ` Vedran Furač
  -1 siblings, 0 replies; 75+ messages in thread
From: Vedran Furač @ 2010-01-29  0:57 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

[-- Attachment #1: Type: text/plain, Size: 1305 bytes --]

Alan Cox wrote:

> On Fri, 29 Jan 2010 01:25:18 +0100
> Vedran Furač <vedran.furac@gmail.com> wrote:
> 
>> Alan Cox wrote:
>>
>>> Am I missing something fundamental here ?
>> Yes, the fact linux mm currently sucks. How else would you explain
>> possibility of killing random (often root owned) processes using a 5
>> lines program started by an ordinary user? 
> 
> If you don't want to run with overcommit you turn it off. At that point
> processes get memory allocations refused if they can overrun the

I've started this discussion with question why overcommit isn't turned
off by default. Problem is that it breaks java and some other stuff that
allocates much more memory than it needs. Very quickly Committed_AS hits
CommitLimit and one cannot allocate any more while there is plenty of
memory still unused.

> theoretical limit, but you generally need more swap (it's one of the
> reasons why things like BSD historically have a '3 * memory' rule).

Say I have 8GB of memory and there's always some free, why would I need
swap?

> So sounds to me like a problem between the keyboard and screen (coupled

Unfortunately it is not. Give me ssh access to your computer (leave
overcommit on) and I'll kill your X with anything running on it.


-- 
http://vedranf.net | a8e7a7783ca0d460fee090cc584adc12

[-- Attachment #2: vedran_furac.vcf --]
[-- Type: text/x-vcard, Size: 220 bytes --]

begin:vcard
fn;quoted-printable:Vedran Fura=C4=8D
n;quoted-printable:Fura=C4=8D;Vedran
adr:;;;;;;Croatia
email;internet:vedran.furac@gmail.com
x-mozilla-html:FALSE
url:http://vedranf.net
version:2.1
end:vcard


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-29  0:57                     ` Vedran Furač
  0 siblings, 0 replies; 75+ messages in thread
From: Vedran Furač @ 2010-01-29  0:57 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

[-- Attachment #1: Type: text/plain, Size: 1305 bytes --]

Alan Cox wrote:

> On Fri, 29 Jan 2010 01:25:18 +0100
> Vedran FuraA? <vedran.furac@gmail.com> wrote:
> 
>> Alan Cox wrote:
>>
>>> Am I missing something fundamental here ?
>> Yes, the fact linux mm currently sucks. How else would you explain
>> possibility of killing random (often root owned) processes using a 5
>> lines program started by an ordinary user? 
> 
> If you don't want to run with overcommit you turn it off. At that point
> processes get memory allocations refused if they can overrun the

I've started this discussion with question why overcommit isn't turned
off by default. Problem is that it breaks java and some other stuff that
allocates much more memory than it needs. Very quickly Committed_AS hits
CommitLimit and one cannot allocate any more while there is plenty of
memory still unused.

> theoretical limit, but you generally need more swap (it's one of the
> reasons why things like BSD historically have a '3 * memory' rule).

Say I have 8GB of memory and there's always some free, why would I need
swap?

> So sounds to me like a problem between the keyboard and screen (coupled

Unfortunately it is not. Give me ssh access to your computer (leave
overcommit on) and I'll kill your X with anything running on it.


-- 
http://vedranf.net | a8e7a7783ca0d460fee090cc584adc12

[-- Attachment #2: vedran_furac.vcf --]
[-- Type: text/x-vcard, Size: 220 bytes --]

begin:vcard
fn;quoted-printable:Vedran Fura=C4=8D
n;quoted-printable:Fura=C4=8D;Vedran
adr:;;;;;;Croatia
email;internet:vedran.furac@gmail.com
x-mozilla-html:FALSE
url:http://vedranf.net
version:2.1
end:vcard


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-29  0:57                     ` Vedran Furač
@ 2010-01-29 11:03                       ` Alan Cox
  -1 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-29 11:03 UTC (permalink / raw)
  To: vedran.furac
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

> off by default. Problem is that it breaks java and some other stuff that
> allocates much more memory than it needs. Very quickly Committed_AS hits
> CommitLimit and one cannot allocate any more while there is plenty of
> memory still unused.

So how about you go and have a complain at the people who are causing
your problem, rather than the kernel.

> > theoretical limit, but you generally need more swap (it's one of the
> > reasons why things like BSD historically have a '3 * memory' rule).
> 
> Say I have 8GB of memory and there's always some free, why would I need
> swap?

So that all the applications that allocate tons of address space and
don't use it can swap when you hit that corner case, and as a result you
don't need to go OOM. You should only get an OOM when you run out of
memory + swap.

> > So sounds to me like a problem between the keyboard and screen (coupled
> 
> Unfortunately it is not. Give me ssh access to your computer (leave
> overcommit on) and I'll kill your X with anything running on it.

If you have overcommit on then you can cause stuff to get killed. Thats
what the option enables.

It's really very simple: overcommit off you must have enough RAM and swap
to hold all allocations requested. Overcommit on - you don't need this
but if you do use more than is available on the system something has to
go.

It's kind of like banking  overcommit off is proper banking, overcommit
on is modern western banking.

Alan

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-29 11:03                       ` Alan Cox
  0 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-29 11:03 UTC (permalink / raw)
  To: vedran.furac
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

> off by default. Problem is that it breaks java and some other stuff that
> allocates much more memory than it needs. Very quickly Committed_AS hits
> CommitLimit and one cannot allocate any more while there is plenty of
> memory still unused.

So how about you go and have a complain at the people who are causing
your problem, rather than the kernel.

> > theoretical limit, but you generally need more swap (it's one of the
> > reasons why things like BSD historically have a '3 * memory' rule).
> 
> Say I have 8GB of memory and there's always some free, why would I need
> swap?

So that all the applications that allocate tons of address space and
don't use it can swap when you hit that corner case, and as a result you
don't need to go OOM. You should only get an OOM when you run out of
memory + swap.

> > So sounds to me like a problem between the keyboard and screen (coupled
> 
> Unfortunately it is not. Give me ssh access to your computer (leave
> overcommit on) and I'll kill your X with anything running on it.

If you have overcommit on then you can cause stuff to get killed. Thats
what the option enables.

It's really very simple: overcommit off you must have enough RAM and swap
to hold all allocations requested. Overcommit on - you don't need this
but if you do use more than is available on the system something has to
go.

It's kind of like banking  overcommit off is proper banking, overcommit
on is modern western banking.

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-29 11:03                       ` Alan Cox
  (?)
@ 2010-01-30 12:33                       ` Vedran Furač
  2010-01-30 12:59                           ` Alan Cox
  -1 siblings, 1 reply; 75+ messages in thread
From: Vedran Furač @ 2010-01-30 12:33 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

[-- Attachment #1: Type: text/plain, Size: 2557 bytes --]

Alan Cox wrote:

>> off by default. Problem is that it breaks java and some other stuff that
>> allocates much more memory than it needs. Very quickly Committed_AS hits
>> CommitLimit and one cannot allocate any more while there is plenty of
>> memory still unused.
> 
> So how about you go and have a complain at the people who are causing
> your problem, rather than the kernel.

That would pass completely unnoticed and ignored as long as overcommit
is enabled by default.

>>> theoretical limit, but you generally need more swap (it's one of the
>>> reasons why things like BSD historically have a '3 * memory' rule).
>> Say I have 8GB of memory and there's always some free, why would I need
>> swap?
> 
> So that all the applications that allocate tons of address space and
> don't use it can swap when you hit that corner case, and as a result you
> don't need to go OOM. You should only get an OOM when you run out of
> memory + swap.

Yes, but unfortunately using swap makes machine crawl with huge disk IO
every time you access some application you haven't been using for a few
hours. So recently more and more people are disabling it completely with
positive experience.

>>> So sounds to me like a problem between the keyboard and screen (coupled
>> Unfortunately it is not. Give me ssh access to your computer (leave
>> overcommit on) and I'll kill your X with anything running on it.
> 
> If you have overcommit on then you can cause stuff to get killed. Thats
> what the option enables.

s/stuff/wrong stuff/

> It's really very simple: overcommit off you must have enough RAM and swap
> to hold all allocations requested. Overcommit on - you don't need this
> but if you do use more than is available on the system something has to
> go.
> 
> It's kind of like banking  overcommit off is proper banking, overcommit
> on is modern western banking.

Hehe, yes and you know the consequences.

If you look at malloc(3) you would see this:

"This means that when malloc() returns non-NULL there is no guarantee
that the memory really is available.  This is a really bad bug."

So, if you don't want to change the OOM algorithm why not fixing this
bug then? And after that change the proc(5) manpage entry for
/proc/sys/vm/overcommit_memory into something like:

0: heuristic overcommit (enable this if you have memory problems with
                          some buggy software)
1: always overcommit, never check
2: always check, never overcommit (this is the default)

Regards,
Vedran


-- 
http://vedranf.net | a8e7a7783ca0d460fee090cc584adc12

[-- Attachment #2: vedran_furac.vcf --]
[-- Type: text/x-vcard, Size: 220 bytes --]

begin:vcard
fn;quoted-printable:Vedran Fura=C4=8D
n;quoted-printable:Fura=C4=8D;Vedran
adr:;;;;;;Croatia
email;internet:vedran.furac@gmail.com
x-mozilla-html:FALSE
url:http://vedranf.net
version:2.1
end:vcard


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-30 12:33                       ` Vedran Furač
@ 2010-01-30 12:59                           ` Alan Cox
  0 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-30 12:59 UTC (permalink / raw)
  To: vedran.furac
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

> > So how about you go and have a complain at the people who are causing
> > your problem, rather than the kernel.
> 
> That would pass completely unnoticed and ignored as long as overcommit
> is enabled by default.

Defaults are set by the distributions. So you are still complaining to
the wrong people.

> So, if you don't want to change the OOM algorithm why not fixing this
> bug then? And after that change the proc(5) manpage entry for
> /proc/sys/vm/overcommit_memory into something like:
> 
> 0: heuristic overcommit (enable this if you have memory problems with
>                           some buggy software)
> 1: always overcommit, never check
> 2: always check, never overcommit (this is the default)

Because there are a lot of systems where heuristic overcommit makes
sense ?

Alan

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-30 12:59                           ` Alan Cox
  0 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-30 12:59 UTC (permalink / raw)
  To: vedran.furac
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

> > So how about you go and have a complain at the people who are causing
> > your problem, rather than the kernel.
> 
> That would pass completely unnoticed and ignored as long as overcommit
> is enabled by default.

Defaults are set by the distributions. So you are still complaining to
the wrong people.

> So, if you don't want to change the OOM algorithm why not fixing this
> bug then? And after that change the proc(5) manpage entry for
> /proc/sys/vm/overcommit_memory into something like:
> 
> 0: heuristic overcommit (enable this if you have memory problems with
>                           some buggy software)
> 1: always overcommit, never check
> 2: always check, never overcommit (this is the default)

Because there are a lot of systems where heuristic overcommit makes
sense ?

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-30 12:59                           ` Alan Cox
  (?)
@ 2010-01-30 17:30                           ` Vedran Furač
  2010-01-30 17:45                               ` Alan Cox
  -1 siblings, 1 reply; 75+ messages in thread
From: Vedran Furač @ 2010-01-30 17:30 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

[-- Attachment #1: Type: text/plain, Size: 1560 bytes --]

Alan Cox wrote:

>>> So how about you go and have a complain at the people who are causing
>>> your problem, rather than the kernel.
>> That would pass completely unnoticed and ignored as long as overcommit
>> is enabled by default.
> 
> Defaults are set by the distributions. So you are still complaining to
> the wrong people.

I can't say I'm able to correctly read kernel code, but I believe
default is set by:

int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
int sysctl_overcommit_ratio = 50;       /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;

in mmap.c.

>> So, if you don't want to change the OOM algorithm why not fixing this
>> bug then? And after that change the proc(5) manpage entry for
>> /proc/sys/vm/overcommit_memory into something like:
>>
>> 0: heuristic overcommit (enable this if you have memory problems with
>>                           some buggy software)
>> 1: always overcommit, never check
>> 2: always check, never overcommit (this is the default)
> 
> Because there are a lot of systems where heuristic overcommit makes
> sense ?

Ok, I won't argue any more. Just please watch this short (~1min)
screencast I made and tell me which behavior is good and which is bad
and should be fixed:

http://vedranf.net/tmp/oom.ogv  (you can watch it using VLC for example)

Actually anyone receiving this mail should see it. What do you think,
what will customers rather choose if they see this?

Regards,
Vedran


-- 
http://vedranf.net | a8e7a7783ca0d460fee090cc584adc12

[-- Attachment #2: vedran_furac.vcf --]
[-- Type: text/x-vcard, Size: 220 bytes --]

begin:vcard
fn;quoted-printable:Vedran Fura=C4=8D
n;quoted-printable:Fura=C4=8D;Vedran
adr:;;;;;;Croatia
email;internet:vedran.furac@gmail.com
x-mozilla-html:FALSE
url:http://vedranf.net
version:2.1
end:vcard


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-30 17:30                           ` Vedran Furač
@ 2010-01-30 17:45                               ` Alan Cox
  0 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-30 17:45 UTC (permalink / raw)
  To: vedran.furac
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

> I can't say I'm able to correctly read kernel code, but I believe
> default is set by:

It['s set by the distribution - the kerne has a default value but given
the same distributions are shipping the crap userspace that breaks with
no-overcommit do you think they'll set it to break their user apps ?

No.

> http://vedranf.net/tmp/oom.ogv  (you can watch it using VLC for example)
> 
> Actually anyone receiving this mail should see it. What do you think,
> what will customers rather choose if they see this?

Address that to the distributions. Their customers. Systems I set up for
people always have no overcommit enabled.

Alan

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
@ 2010-01-30 17:45                               ` Alan Cox
  0 siblings, 0 replies; 75+ messages in thread
From: Alan Cox @ 2010-01-30 17:45 UTC (permalink / raw)
  To: vedran.furac
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

> I can't say I'm able to correctly read kernel code, but I believe
> default is set by:

It['s set by the distribution - the kerne has a default value but given
the same distributions are shipping the crap userspace that breaks with
no-overcommit do you think they'll set it to break their user apps ?

No.

> http://vedranf.net/tmp/oom.ogv  (you can watch it using VLC for example)
> 
> Actually anyone receiving this mail should see it. What do you think,
> what will customers rather choose if they see this?

Address that to the distributions. Their customers. Systems I set up for
people always have no overcommit enabled.

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH v3] oom-kill: add lowmem usage aware oom kill handling
  2010-01-30 17:45                               ` Alan Cox
  (?)
@ 2010-01-30 18:17                               ` Vedran Furač
  -1 siblings, 0 replies; 75+ messages in thread
From: Vedran Furač @ 2010-01-30 18:17 UTC (permalink / raw)
  To: Alan Cox
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, rientjes,
	minchan.kim, linux-kernel, balbir

[-- Attachment #1: Type: text/plain, Size: 544 bytes --]

Alan Cox wrote:

>> http://vedranf.net/tmp/oom.ogv  (you can watch it using VLC for example)
>>
>> Actually anyone receiving this mail should see it. What do you think,
>> what will customers rather choose if they see this?
> 
> Address that to the distributions. Their customers. Systems I set up for
> people always have no overcommit enabled.

And distros say that's a bug in kernel. Result: nothing gets done and
users will continue to swear at linux after it kills their work...


-- 
http://vedranf.net | a8e7a7783ca0d460fee090cc584adc12

[-- Attachment #2: vedran_furac.vcf --]
[-- Type: text/x-vcard, Size: 220 bytes --]

begin:vcard
fn;quoted-printable:Vedran Fura=C4=8D
n;quoted-printable:Fura=C4=8D;Vedran
adr:;;;;;;Croatia
email;internet:vedran.furac@gmail.com
x-mozilla-html:FALSE
url:http://vedranf.net
version:2.1
end:vcard


^ permalink raw reply	[flat|nested] 75+ messages in thread

end of thread, other threads:[~2010-01-30 18:17 UTC | newest]

Thread overview: 75+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-21  5:59 [PATCH] oom-kill: add lowmem usage aware oom kill handling KAMEZAWA Hiroyuki
2010-01-21  5:59 ` KAMEZAWA Hiroyuki
2010-01-21 15:18 ` Minchan Kim
2010-01-21 15:18   ` Minchan Kim
2010-01-21 23:48   ` KAMEZAWA Hiroyuki
2010-01-21 23:48     ` KAMEZAWA Hiroyuki
2010-01-22  0:40     ` Minchan Kim
2010-01-22  0:40       ` Minchan Kim
2010-01-22  1:06       ` KAMEZAWA Hiroyuki
2010-01-22  1:06         ` KAMEZAWA Hiroyuki
2010-01-21 15:29 ` Balbir Singh
2010-01-21 15:29   ` Balbir Singh
2010-01-21 23:54   ` KAMEZAWA Hiroyuki
2010-01-21 23:54     ` KAMEZAWA Hiroyuki
2010-01-22  6:23 ` [PATCH v2] " KAMEZAWA Hiroyuki
2010-01-22  6:23   ` KAMEZAWA Hiroyuki
2010-01-22 14:00   ` Minchan Kim
2010-01-22 14:00     ` Minchan Kim
2010-01-22 15:16     ` KAMEZAWA Hiroyuki
2010-01-22 15:16       ` KAMEZAWA Hiroyuki
2010-01-22 15:41       ` Minchan Kim
2010-01-22 15:41         ` Minchan Kim
2010-01-25  6:15   ` [PATCH v3] " KAMEZAWA Hiroyuki
2010-01-25  6:15     ` KAMEZAWA Hiroyuki
2010-01-26 23:12     ` Andrew Morton
2010-01-26 23:12       ` Andrew Morton
2010-01-26 23:53       ` KAMEZAWA Hiroyuki
2010-01-26 23:53         ` KAMEZAWA Hiroyuki
2010-01-27  0:19         ` Andrew Morton
2010-01-27  0:19           ` Andrew Morton
2010-01-27  0:58           ` KAMEZAWA Hiroyuki
2010-01-27  0:58             ` KAMEZAWA Hiroyuki
2010-01-27  6:30             ` [PATCH v4 0/2] " KAMEZAWA Hiroyuki
2010-01-27  6:30               ` KAMEZAWA Hiroyuki
2010-01-27  6:32               ` [PATCH v4 1/2] sysctl clean up vm related variable declarations KAMEZAWA Hiroyuki
2010-01-27  6:32                 ` KAMEZAWA Hiroyuki
2010-01-28  8:54                 ` David Rientjes
2010-01-28  8:54                   ` David Rientjes
2010-01-28 10:30                   ` KAMEZAWA Hiroyuki
2010-01-28 10:30                     ` KAMEZAWA Hiroyuki
2010-01-27  6:33               ` [PATCH v4 2/2] oom-kill: add lowmem usage aware oom kill handling v4 KAMEZAWA Hiroyuki
2010-01-27  6:33                 ` KAMEZAWA Hiroyuki
2010-01-28  0:12               ` [PATCH v4 0/2] oom-kill: add lowmem usage aware oom kill handling KAMEZAWA Hiroyuki
2010-01-28  0:12                 ` KAMEZAWA Hiroyuki
2010-01-27 23:56             ` [PATCH v3] " David Rientjes
2010-01-27 23:56               ` David Rientjes
2010-01-28  0:16             ` Alan Cox
2010-01-28  0:16               ` Alan Cox
2010-01-28  0:26               ` KAMEZAWA Hiroyuki
2010-01-28  0:26                 ` KAMEZAWA Hiroyuki
2010-01-28  0:59               ` David Rientjes
2010-01-28  0:59                 ` David Rientjes
2010-01-29  0:25               ` Vedran Furač
2010-01-29  0:25                 ` Vedran Furač
2010-01-29  0:35                 ` Alan Cox
2010-01-29  0:35                   ` Alan Cox
2010-01-29  0:57                   ` Vedran Furač
2010-01-29  0:57                     ` Vedran Furač
2010-01-29 11:03                     ` Alan Cox
2010-01-29 11:03                       ` Alan Cox
2010-01-30 12:33                       ` Vedran Furač
2010-01-30 12:59                         ` Alan Cox
2010-01-30 12:59                           ` Alan Cox
2010-01-30 17:30                           ` Vedran Furač
2010-01-30 17:45                             ` Alan Cox
2010-01-30 17:45                               ` Alan Cox
2010-01-30 18:17                               ` Vedran Furač
2010-01-27 23:46         ` David Rientjes
2010-01-27 23:46           ` David Rientjes
2010-01-26 23:16     ` Andrew Morton
2010-01-26 23:16       ` Andrew Morton
2010-01-26 23:44       ` KAMEZAWA Hiroyuki
2010-01-26 23:44         ` KAMEZAWA Hiroyuki
2010-01-27 23:40     ` David Rientjes
2010-01-27 23:40       ` David Rientjes

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.