All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 1/3 v3] oom: add per-mm oom disable count
@ 2010-08-20 22:41 David Rientjes
  2010-08-20 22:41 ` [patch 2/3 v3] oom: avoid killing a task if a thread sharing its mm cannot be killed David Rientjes
                   ` (4 more replies)
  0 siblings, 5 replies; 17+ messages in thread
From: David Rientjes @ 2010-08-20 22:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, Ying Han, linux-mm

From: Ying Han <yinghan@google.com>

It's pointless to kill a task if another thread sharing its mm cannot be
killed to allow future memory freeing.  A subsequent patch will prevent
kills in such cases, but first it's necessary to have a way to flag a
task that shares memory with an OOM_DISABLE task that doesn't incur an
additional tasklist scan, which would make select_bad_process() an O(n^2)
function.

This patch adds an atomic counter to struct mm_struct that follows how
many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN.
They cannot be killed by the kernel, so their memory cannot be freed in
oom conditions.

This only requires task_lock() on the task that we're operating on, it
does not require mm->mmap_sem since task_lock() pins the mm and the
operation is atomic.

[rientjes@google.com: changelog and sys_unshare() code]
Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/exec.c                |    5 +++++
 fs/proc/base.c           |   30 ++++++++++++++++++++++++++++++
 include/linux/mm_types.h |    2 ++
 kernel/exit.c            |    3 +++
 kernel/fork.c            |   13 ++++++++++++-
 5 files changed, 52 insertions(+), 1 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -745,6 +746,10 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+		atomic_dec(&active_mm->oom_disable_count);
+		atomic_inc(&tsk->mm->oom_disable_count);
+	}
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EACCES;
 	}
 
+	task_lock(task);
+	if (!task->mm) {
+		task_unlock(task);
+		unlock_task_sighand(task, &flags);
+		put_task_struct(task);
+		return -EINVAL;
+	}
+
+	if (oom_adjust != task->signal->oom_adj) {
+		if (oom_adjust == OOM_DISABLE)
+			atomic_inc(&task->mm->oom_disable_count);
+		if (task->signal->oom_adj == OOM_DISABLE)
+			atomic_dec(&task->mm->oom_disable_count);
+	}
+
 	/*
 	 * Warn that /proc/pid/oom_adj is deprecated, see
 	 * Documentation/feature-removal-schedule.txt.
@@ -1065,6 +1080,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 								-OOM_DISABLE;
+	task_unlock(task);
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 
@@ -1133,6 +1149,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		return -EACCES;
 	}
 
+	task_lock(task);
+	if (!task->mm) {
+		task_unlock(task);
+		unlock_task_sighand(task, &flags);
+		put_task_struct(task);
+		return -EINVAL;
+	}
+	if (oom_score_adj != task->signal->oom_score_adj) {
+		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_inc(&task->mm->oom_disable_count);
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_dec(&task->mm->oom_disable_count);
+	}
 	task->signal->oom_score_adj = oom_score_adj;
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
@@ -1143,6 +1172,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
 							OOM_SCORE_ADJ_MAX;
+	task_unlock(task);
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 	return count;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -310,6 +310,8 @@ struct mm_struct {
 #ifdef CONFIG_MMU_NOTIFIER
 	struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
+	/* How many tasks sharing this mm are OOM_DISABLE */
+	atomic_t oom_disable_count;
 };
 
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
diff --git a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -689,6 +690,8 @@ static void exit_mm(struct task_struct * tsk)
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		atomic_dec(&mm->oom_disable_count);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
+#include <linux/oom.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -485,6 +486,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	mm->cached_hole_size = ~0UL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	atomic_set(&mm->oom_disable_count, 0);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
@@ -738,6 +740,8 @@ good_mm:
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
 	mm->last_interval = 0;
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		atomic_inc(&mm->oom_disable_count);
 
 	tsk->mm = mm;
 	tsk->active_mm = mm;
@@ -1296,8 +1300,11 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm)
+	if (p->mm) {
+		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_dec(&p->mm->oom_disable_count);
 		mmput(p->mm);
+	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		free_signal_struct(p->signal);
@@ -1690,6 +1697,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 			active_mm = current->active_mm;
 			current->mm = new_mm;
 			current->active_mm = new_mm;
+			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+				atomic_dec(&mm->oom_disable_count);
+				atomic_inc(&new_mm->oom_disable_count);
+			}
 			activate_mm(active_mm, new_mm);
 			new_mm = mm;
 		}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [patch 2/3 v3] oom: avoid killing a task if a thread sharing its mm cannot be killed
  2010-08-20 22:41 [patch 1/3 v3] oom: add per-mm oom disable count David Rientjes
@ 2010-08-20 22:41 ` David Rientjes
  2010-08-22  9:49   ` KOSAKI Motohiro
  2010-08-20 22:41 ` [patch 3/3 v3] oom: kill all threads sharing oom killed task's mm David Rientjes
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 17+ messages in thread
From: David Rientjes @ 2010-08-20 22:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, linux-mm


The oom killer's goal is to kill a memory-hogging task so that it may
exit, free its memory, and allow the current context to allocate the
memory that triggered it in the first place.  Thus, killing a task is
pointless if other threads sharing its mm cannot be killed because of its
/proc/pid/oom_adj or /proc/pid/oom_score_adj value.

This patch checks whether any other thread sharing p->mm has an
oom_score_adj of OOM_SCORE_ADJ_MIN.  If so, the thread cannot be killed
and oom_badness(p) returns 0, meaning it's unkillable.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/oom_kill.c |    9 +++++----
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 		return 0;
 
 	/*
-	 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
-	 * need to be executed for something that cannot be killed.
+	 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
+	 * so the entire heuristic doesn't need to be executed for something
+	 * that cannot be killed.
 	 */
-	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+	if (atomic_read(&p->mm->oom_disable_count)) {
 		task_unlock(p);
 		return 0;
 	}
@@ -675,7 +676,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	read_lock(&tasklist_lock);
 	if (sysctl_oom_kill_allocating_task &&
 	    !oom_unkillable_task(current, NULL, nodemask) &&
-	    (current->signal->oom_adj != OOM_DISABLE)) {
+	    current->mm && !atomic_read(&current->mm->oom_disable_count)) {
 		/*
 		 * oom_kill_process() needs tasklist_lock held.  If it returns
 		 * non-zero, current could not be killed so we must fallback to

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [patch 3/3 v3] oom: kill all threads sharing oom killed task's mm
  2010-08-20 22:41 [patch 1/3 v3] oom: add per-mm oom disable count David Rientjes
  2010-08-20 22:41 ` [patch 2/3 v3] oom: avoid killing a task if a thread sharing its mm cannot be killed David Rientjes
@ 2010-08-20 22:41 ` David Rientjes
  2010-08-20 23:52   ` David Rientjes
  2010-08-23 23:13 ` [patch 1/3 v3] oom: add per-mm oom disable count Andrew Morton
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 17+ messages in thread
From: David Rientjes @ 2010-08-20 22:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, linux-mm

It's necessary to kill all threads that share an oom killed task's mm if
the goal is to lead to future memory freeing.

This patch reintroduces the code removed in 8c5cd6f3 (oom: oom_kill
doesn't kill vfork parent (or child)) since it is obsoleted.

It's now guaranteed that any task passed to oom_kill_task() does not
share an mm with any thread that is unkillable.  Thus, we're safe to
issue a SIGKILL to any thread sharing the same mm.

This is especially necessary to solve an mm->mmap_sem livelock issue
whereas an oom killed thread must acquire the lock in the exit path while
another thread is holding it in the page allocator while trying to
allocate memory itself (and will preempt the oom killer since a task was
already killed).  Since tasks with pending fatal signals are now granted
access to memory reserves, the thread holding the lock may quickly
allocate and release the lock so that the oom killed task may exit.

This mainly is for threads that are cloned with CLONE_VM but not
CLONE_THREAD, so they are in a different thread group.  Non-NPTL threads
exist in the wild and this change is necessary to prevent the livelock in
such cases.  We care more about preventing the livelock than incurring
the additional tasklist in the oom killer when a task has been killed.
Systems that are sufficiently large to not want the tasklist scan in the
oom killer in the first place already have the option of enabling
/proc/sys/vm/oom_kill_allocating_task, which was designed specifically
for that purpose.

This code had existed in the oom killer for over eight years dating back
to the 2.4 kernel.

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/oom_kill.c |   22 ++++++++++++++++++++++
 1 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -401,16 +401,38 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 #define K(x) ((x) << (PAGE_SHIFT-10))
 static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 {
+	struct task_struct *q;
+	struct mm_struct *mm;
+
 	p = find_lock_task_mm(p);
 	if (!p)
 		return 1;
 
+	/* mm cannot be safely dereferenced after task_unlock(p) */
+	mm = p->mm;
+
 	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
 		task_pid_nr(p), p->comm, K(p->mm->total_vm),
 		K(get_mm_counter(p->mm, MM_ANONPAGES)),
 		K(get_mm_counter(p->mm, MM_FILEPAGES)));
 	task_unlock(p);
 
+	/*
+	 * Kill all processes sharing p->mm in other thread groups, if any.
+	 * They don't get access to memory reserves or a higher scheduler
+	 * priority, though, to avoid depletion of all memory or task
+	 * starvation.  This prevents mm->mmap_sem livelock when an oom killed
+	 * task cannot exit because it requires the semaphore and its contended
+	 * by another thread trying to allocate memory itself.  That thread will
+	 * now get access to memory reserves since it has a pending fatal
+	 * signal.
+	 */
+	for_each_process(q)
+		if (q->mm == mm && !same_thread_group(q, p)) {
+			pr_err("Kill process %d (%s) sharing same memory\n",
+				task_pid_nr(p), p->comm);
+			force_sig(SIGKILL, q);
+		}
 
 	set_tsk_thread_flag(p, TIF_MEMDIE);
 	force_sig(SIGKILL, p);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 3/3 v3] oom: kill all threads sharing oom killed task's mm
  2010-08-20 22:41 ` [patch 3/3 v3] oom: kill all threads sharing oom killed task's mm David Rientjes
@ 2010-08-20 23:52   ` David Rientjes
  2010-08-23 23:16     ` Andrew Morton
  0 siblings, 1 reply; 17+ messages in thread
From: David Rientjes @ 2010-08-20 23:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, linux-mm

Sorry, forgot to address KOSAKI-san's request to add a pr_err() when 
sending SIGKILLs to other tasks.  Updated patch here.


oom: kill all threads sharing oom killed task's mm

It's necessary to kill all threads that share an oom killed task's mm if
the goal is to lead to future memory freeing.

This patch reintroduces the code removed in 8c5cd6f3 (oom: oom_kill
doesn't kill vfork parent (or child)) since it is obsoleted.

It's now guaranteed that any task passed to oom_kill_task() does not
share an mm with any thread that is unkillable.  Thus, we're safe to
issue a SIGKILL to any thread sharing the same mm.

This is especially necessary to solve an mm->mmap_sem livelock issue
whereas an oom killed thread must acquire the lock in the exit path while
another thread is holding it in the page allocator while trying to
allocate memory itself (and will preempt the oom killer since a task was
already killed).  Since tasks with pending fatal signals are now granted
access to memory reserves, the thread holding the lock may quickly
allocate and release the lock so that the oom killed task may exit.

This mainly is for threads that are cloned with CLONE_VM but not
CLONE_THREAD, so they are in a different thread group.  Non-NPTL threads
exist in the wild and this change is necessary to prevent the livelock in
such cases.  We care more about preventing the livelock than incurring
the additional tasklist in the oom killer when a task has been killed.
Systems that are sufficiently large to not want the tasklist scan in the
oom killer in the first place already have the option of enabling
/proc/sys/vm/oom_kill_allocating_task, which was designed specifically
for that purpose.

This code had existed in the oom killer for over eight years dating back
to the 2.4 kernel.

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/oom_kill.c |   22 ++++++++++++++++++++++
 1 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -401,16 +401,38 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 #define K(x) ((x) << (PAGE_SHIFT-10))
 static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 {
+	struct task_struct *q;
+	struct mm_struct *mm;
+
 	p = find_lock_task_mm(p);
 	if (!p)
 		return 1;
 
+	/* mm cannot be safely dereferenced after task_unlock(p) */
+	mm = p->mm;
+
 	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
 		task_pid_nr(p), p->comm, K(p->mm->total_vm),
 		K(get_mm_counter(p->mm, MM_ANONPAGES)),
 		K(get_mm_counter(p->mm, MM_FILEPAGES)));
 	task_unlock(p);
 
+	/*
+	 * Kill all processes sharing p->mm in other thread groups, if any.
+	 * They don't get access to memory reserves or a higher scheduler
+	 * priority, though, to avoid depletion of all memory or task
+	 * starvation.  This prevents mm->mmap_sem livelock when an oom killed
+	 * task cannot exit because it requires the semaphore and its contended
+	 * by another thread trying to allocate memory itself.  That thread will
+	 * now get access to memory reserves since it has a pending fatal
+	 * signal.
+	 */
+	for_each_process(q)
+		if (q->mm == mm && !same_thread_group(q, p)) {
+			pr_err("Kill process %d (%s) sharing same memory\n",
+				task_pid_nr(q), q->comm);
+			force_sig(SIGKILL, q);
+		}
 
 	set_tsk_thread_flag(p, TIF_MEMDIE);
 	force_sig(SIGKILL, p);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 2/3 v3] oom: avoid killing a task if a thread sharing its mm cannot be killed
  2010-08-20 22:41 ` [patch 2/3 v3] oom: avoid killing a task if a thread sharing its mm cannot be killed David Rientjes
@ 2010-08-22  9:49   ` KOSAKI Motohiro
  2010-08-22 23:14     ` David Rientjes
  0 siblings, 1 reply; 17+ messages in thread
From: KOSAKI Motohiro @ 2010-08-22  9:49 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Andrew Morton, KAMEZAWA Hiroyuki, Rik van Riel,
	linux-mm

> 
> The oom killer's goal is to kill a memory-hogging task so that it may
> exit, free its memory, and allow the current context to allocate the
> memory that triggered it in the first place.  Thus, killing a task is
> pointless if other threads sharing its mm cannot be killed because of its
> /proc/pid/oom_adj or /proc/pid/oom_score_adj value.
> 
> This patch checks whether any other thread sharing p->mm has an
> oom_score_adj of OOM_SCORE_ADJ_MIN.  If so, the thread cannot be killed
> and oom_badness(p) returns 0, meaning it's unkillable.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  mm/oom_kill.c |    9 +++++----
>  1 files changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
>  		return 0;
>  
>  	/*
> -	 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
> -	 * need to be executed for something that cannot be killed.
> +	 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
> +	 * so the entire heuristic doesn't need to be executed for something
> +	 * that cannot be killed.
>  	 */
> -	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> +	if (atomic_read(&p->mm->oom_disable_count)) {
>  		task_unlock(p);
>  		return 0;
>  	}
> @@ -675,7 +676,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
>  	read_lock(&tasklist_lock);
>  	if (sysctl_oom_kill_allocating_task &&
>  	    !oom_unkillable_task(current, NULL, nodemask) &&
> -	    (current->signal->oom_adj != OOM_DISABLE)) {
> +	    current->mm && !atomic_read(&current->mm->oom_disable_count)) {
>  		/*
>  		 * oom_kill_process() needs tasklist_lock held.  If it returns
>  		 * non-zero, current could not be killed so we must fallback to

This seems significantly cleaner than previous. Of cource, even though I need 
to review [1/3] carefully. Unfortunatelly I'm very busy in this week, then
my responce might late a while. but it's not mean silinetly nak.

Thanks.



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 2/3 v3] oom: avoid killing a task if a thread sharing its mm cannot be killed
  2010-08-22  9:49   ` KOSAKI Motohiro
@ 2010-08-22 23:14     ` David Rientjes
  0 siblings, 0 replies; 17+ messages in thread
From: David Rientjes @ 2010-08-22 23:14 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, KAMEZAWA Hiroyuki, Rik van Riel, linux-mm

On Sun, 22 Aug 2010, KOSAKI Motohiro wrote:

> This seems significantly cleaner than previous. Of cource, even though I need 
> to review [1/3] carefully. Unfortunatelly I'm very busy in this week, then
> my responce might late a while. but it's not mean silinetly nak.
> 

I agree, it's much better than making select_bad_process() turn out to be 
O(n^2) when no threads are OOM_DISABLE.

Thanks!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 1/3 v3] oom: add per-mm oom disable count
  2010-08-20 22:41 [patch 1/3 v3] oom: add per-mm oom disable count David Rientjes
  2010-08-20 22:41 ` [patch 2/3 v3] oom: avoid killing a task if a thread sharing its mm cannot be killed David Rientjes
  2010-08-20 22:41 ` [patch 3/3 v3] oom: kill all threads sharing oom killed task's mm David Rientjes
@ 2010-08-23 23:13 ` Andrew Morton
  2010-08-24  0:53   ` David Rientjes
  2010-08-27 21:48 ` Andrew Morton
  2010-08-30  4:39 ` [patch 1/3 v3] oom: add per-mm oom disable count KOSAKI Motohiro
  4 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2010-08-23 23:13 UTC (permalink / raw)
  To: David Rientjes
  Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, Ying Han, linux-mm

On Fri, 20 Aug 2010 15:41:48 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> From: Ying Han <yinghan@google.com>
> 
> It's pointless to kill a task if another thread sharing its mm cannot be
> killed to allow future memory freeing.  A subsequent patch will prevent
> kills in such cases, but first it's necessary to have a way to flag a
> task that shares memory with an OOM_DISABLE task that doesn't incur an
> additional tasklist scan, which would make select_bad_process() an O(n^2)
> function.
> 
> This patch adds an atomic counter to struct mm_struct that follows how
> many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN.
> They cannot be killed by the kernel, so their memory cannot be freed in
> oom conditions.
> 
> This only requires task_lock() on the task that we're operating on, it
> does not require mm->mmap_sem since task_lock() pins the mm and the
> operation is atomic.
> 
>
> ...
>
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
>  		return -EACCES;
>  	}
>  
> +	task_lock(task);
> +	if (!task->mm) {
> +		task_unlock(task);
> +		unlock_task_sighand(task, &flags);
> +		put_task_struct(task);
> +		return -EINVAL;
> +	}
> +
> +	if (oom_adjust != task->signal->oom_adj) {
> +		if (oom_adjust == OOM_DISABLE)
> +			atomic_inc(&task->mm->oom_disable_count);
> +		if (task->signal->oom_adj == OOM_DISABLE)
> +			atomic_dec(&task->mm->oom_disable_count);
> +	}

scary function.  Wanna try converting oom_adjust_write() to the
single-exit-with-goto model sometime, see if the result looks more
maintainable?

>
> ...
>
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -65,6 +65,7 @@
>  #include <linux/perf_event.h>
>  #include <linux/posix-timers.h>
>  #include <linux/user-return-notifier.h>
> +#include <linux/oom.h>
>  
>  #include <asm/pgtable.h>
>  #include <asm/pgalloc.h>
> @@ -485,6 +486,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  	mm->cached_hole_size = ~0UL;
>  	mm_init_aio(mm);
>  	mm_init_owner(mm, p);
> +	atomic_set(&mm->oom_disable_count, 0);

So in fork() we zap this if !CLONE_VM?  Was the CLONE_VM case tested
nicely?


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 3/3 v3] oom: kill all threads sharing oom killed task's mm
  2010-08-20 23:52   ` David Rientjes
@ 2010-08-23 23:16     ` Andrew Morton
  2010-08-24  0:57       ` David Rientjes
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2010-08-23 23:16 UTC (permalink / raw)
  To: David Rientjes; +Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, linux-mm

On Fri, 20 Aug 2010 16:52:38 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> +			pr_err("Kill process %d (%s) sharing same memory\n",
> +				task_pid_nr(q), q->comm);

We're really supposed to use get_task_comm() when accessing another
tasks's comm[] to avoid races with that task altering its comm[] in
prctl().

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 1/3 v3] oom: add per-mm oom disable count
  2010-08-23 23:13 ` [patch 1/3 v3] oom: add per-mm oom disable count Andrew Morton
@ 2010-08-24  0:53   ` David Rientjes
  0 siblings, 0 replies; 17+ messages in thread
From: David Rientjes @ 2010-08-24  0:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, Ying Han, linux-mm

On Mon, 23 Aug 2010, Andrew Morton wrote:

> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
> >  		return -EACCES;
> >  	}
> >  
> > +	task_lock(task);
> > +	if (!task->mm) {
> > +		task_unlock(task);
> > +		unlock_task_sighand(task, &flags);
> > +		put_task_struct(task);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (oom_adjust != task->signal->oom_adj) {
> > +		if (oom_adjust == OOM_DISABLE)
> > +			atomic_inc(&task->mm->oom_disable_count);
> > +		if (task->signal->oom_adj == OOM_DISABLE)
> > +			atomic_dec(&task->mm->oom_disable_count);
> > +	}
> 
> scary function.  Wanna try converting oom_adjust_write() to the
> single-exit-with-goto model sometime, see if the result looks more
> maintainable?
> 

Ok, that would certainly be better.

> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -65,6 +65,7 @@
> >  #include <linux/perf_event.h>
> >  #include <linux/posix-timers.h>
> >  #include <linux/user-return-notifier.h>
> > +#include <linux/oom.h>
> >  
> >  #include <asm/pgtable.h>
> >  #include <asm/pgalloc.h>
> > @@ -485,6 +486,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> >  	mm->cached_hole_size = ~0UL;
> >  	mm_init_aio(mm);
> >  	mm_init_owner(mm, p);
> > +	atomic_set(&mm->oom_disable_count, 0);
> 
> So in fork() we zap this if !CLONE_VM?  Was the CLONE_VM case tested
> nicely?
> 

In clone(), yeah, fork() doesn't use CLONE_VM so the child thread will 
always have a different ->mm.  The new ->mm will get oom_disable_count 
incremented later if tsk has inherited a OOM_DISABLE value.

We've been running with this patch (minus the sys_unshare() code I added) 
for a year now internally.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 3/3 v3] oom: kill all threads sharing oom killed task's mm
  2010-08-23 23:16     ` Andrew Morton
@ 2010-08-24  0:57       ` David Rientjes
  0 siblings, 0 replies; 17+ messages in thread
From: David Rientjes @ 2010-08-24  0:57 UTC (permalink / raw)
  To: Andrew Morton; +Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, linux-mm

On Mon, 23 Aug 2010, Andrew Morton wrote:

> > +			pr_err("Kill process %d (%s) sharing same memory\n",
> > +				task_pid_nr(q), q->comm);
> 
> We're really supposed to use get_task_comm() when accessing another
> tasks's comm[] to avoid races with that task altering its comm[] in
> prctl().
> 

Ok!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 1/3 v3] oom: add per-mm oom disable count
  2010-08-20 22:41 [patch 1/3 v3] oom: add per-mm oom disable count David Rientjes
                   ` (2 preceding siblings ...)
  2010-08-23 23:13 ` [patch 1/3 v3] oom: add per-mm oom disable count Andrew Morton
@ 2010-08-27 21:48 ` Andrew Morton
  2010-08-28 22:25   ` [patch] oom: fix locking for oom_adj and oom_score_adj David Rientjes
  2010-08-30  4:39 ` [patch 1/3 v3] oom: add per-mm oom disable count KOSAKI Motohiro
  4 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2010-08-27 21:48 UTC (permalink / raw)
  To: David Rientjes
  Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, Ying Han, linux-mm

On Fri, 20 Aug 2010 15:41:48 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> From: Ying Han <yinghan@google.com>
> 
> It's pointless to kill a task if another thread sharing its mm cannot be
> killed to allow future memory freeing.  A subsequent patch will prevent
> kills in such cases, but first it's necessary to have a way to flag a
> task that shares memory with an OOM_DISABLE task that doesn't incur an
> additional tasklist scan, which would make select_bad_process() an O(n^2)
> function.
> 
> This patch adds an atomic counter to struct mm_struct that follows how
> many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN.
> They cannot be killed by the kernel, so their memory cannot be freed in
> oom conditions.
> 
> This only requires task_lock() on the task that we're operating on, it
> does not require mm->mmap_sem since task_lock() pins the mm and the
> operation is atomic.

I don't think lockdep likes us taking task_lock() inside
lock_task_sighand(), in oom_adjust_write():

[   78.185341] 
[   78.185341] =========================================================
[   78.185341] [ INFO: possible irq lock inversion dependency detected ]
[   78.185341] 2.6.36-rc2-mm1 #6
[   78.185341] ---------------------------------------------------------
[   78.185341] kworker/0:1/0 just changed the state of lock:
[   78.185341]  (&(&sighand->siglock)->rlock){-.....}, at: [<ffffffff81042d83>] lock_task_sighand+0x9a/0xda
[   78.185341] but this lock took another, HARDIRQ-unsafe lock in the past:
[   78.185341]  (&(&p->alloc_lock)->rlock){+.+...}
[   78.185341] 
[   78.185341] and interrupts could create inverse lock ordering between them.
[   78.185341] 
[   78.185341] 
[   78.185341] other info that might help us debug this:
[   78.185341] 2 locks held by kworker/0:1/0:
[   78.185341]  #0:  (rcu_read_lock){.+.+..}, at: [<ffffffff81044f28>] kill_pid_info+0x0/0x8d
[   78.185341]  #1:  (rcu_read_lock){.+.+..}, at: [<ffffffff81042ce9>] lock_task_sighand+0x0/0xda
[   78.185341] 
[   78.185341] the shortest dependencies between 2nd lock and 1st lock:
[   78.185341]  -> (&(&p->alloc_lock)->rlock){+.+...} ops: 221932 {
[   78.185341]     HARDIRQ-ON-W at:
[   78.185341]                                          [<ffffffff8105f595>] __lock_acquire+0x67c/0x865
[   78.185341]                                          [<ffffffff8105fbc7>] lock_acquire+0x8b/0xa8
[   78.185341]                                          [<ffffffff8137df6f>] _raw_spin_lock+0x2f/0x62
[   78.185341]                                          [<ffffffff810c48be>] set_task_comm+0x20/0x61
[   78.185341]                                          [<ffffffff8104eb0e>] kthreadd+0x21/0xf4
[   78.185341]                                          [<ffffffff81003814>] kernel_thread_helper+0x4/0x10
[   78.185341]     SOFTIRQ-ON-W at:
[   78.185341]                                          [<ffffffff8105f5b7>] __lock_acquire+0x69e/0x865
[   78.185341]                                          [<ffffffff8105fbc7>] lock_acquire+0x8b/0xa8
[   78.185341]                                          [<ffffffff8137df6f>] _raw_spin_lock+0x2f/0x62
[   78.185341]                                          [<ffffffff810c48be>] set_task_comm+0x20/0x61
[   78.185341]                                          [<ffffffff8104eb0e>] kthreadd+0x21/0xf4
[   78.185341]                                          [<ffffffff81003814>] kernel_thread_helper+0x4/0x10
[   78.185341]     INITIAL USE at:
[   78.185341]                                         [<ffffffff8105f602>] __lock_acquire+0x6e9/0x865
[   78.185341]                                         [<ffffffff8105fbc7>] lock_acquire+0x8b/0xa8
[   78.185341]                                         [<ffffffff8137df6f>] _raw_spin_lock+0x2f/0x62
[   78.185341]                                         [<ffffffff810c48be>] set_task_comm+0x20/0x61
[   78.185341]                                         [<ffffffff8104eb0e>] kthreadd+0x21/0xf4
[   78.185341]                                         [<ffffffff81003814>] kernel_thread_helper+0x4/0x10
[   78.185341]   }
[   78.185341]   ... key      at: [<ffffffff81b4d568>] __key.46765+0x0/0x8
[   78.185341]   ... acquired at:
[   78.185341]    [<ffffffff8105f701>] __lock_acquire+0x7e8/0x865
[   78.185341]    [<ffffffff8105fbc7>] lock_acquire+0x8b/0xa8
[   78.185341]    [<ffffffff8137df6f>] _raw_spin_lock+0x2f/0x62
[   78.185341]    [<ffffffff8110b910>] oom_adjust_write+0x144/0x2a9
[   78.185341]    [<ffffffff810c02c4>] vfs_write+0xb1/0x13d
[   78.185341]    [<ffffffff810c0873>] sys_write+0x4a/0x75
[   78.185341]    [<ffffffff810029eb>] system_call_fastpath+0x16/0x1b
[   78.185341] 
[   78.185341] -> (&(&sighand->siglock)->rlock){-.....} ops: 208770 {
[   78.185341]    IN-HARDIRQ-W at:
[   78.185341]                                        [<ffffffff8105f51f>] __lock_acquire+0x606/0x865
[   78.185341]                                        [<ffffffff8105fbc7>] lock_acquire+0x8b/0xa8
[   78.185341]                                        [<ffffffff8137e756>] _raw_spin_lock_irqsave+0x3b/0x73
[   78.185341]                                        [<ffffffff81042d83>] lock_task_sighand+0x9a/0xda
[   78.185341]                                        [<ffffffff81044bc9>] do_send_sig_info+0x2f/0x72
[   78.185341]                                        [<ffffffff81044f17>] group_send_sig_info+0x88/0x99
[   78.185341]                                        [<ffffffff81044f88>] kill_pid_info+0x60/0x8d
[   78.185341]                                        [<ffffffff8103b79c>] it_real_fn+0x17/0x1b
[   78.185341]                                        [<ffffffff81051efd>] hrtimer_run_queues+0x167/0x1cb
[   78.185341]                                        [<ffffffff81041bb3>] run_local_timers+0x9/0x15
[   78.185341]                                        [<ffffffff81041d7d>] update_process_times+0x2d/0x56
[   78.185341]                                        [<ffffffff810593ff>] tick_periodic+0x63/0x6f
[   78.185341]                                        [<ffffffff81059429>] tick_handle_periodic+0x1e/0x6b
[   78.185341]                                        [<ffffffff81019b2e>] smp_apic_timer_interrupt+0x83/0x96
[   78.185341]                                        [<ffffffff810033d3>] apic_timer_interrupt+0x13/0x20
[   78.185341]                                        [<ffffffff810014ca>] cpu_idle+0x48/0x66
[   78.185341]                                        [<ffffffff813772bd>] start_secondary+0x1b9/0x1bd
[   78.185341]    INITIAL USE at:
[   78.185341]                                       [<ffffffff8105f602>] __lock_acquire+0x6e9/0x865
[   78.185341]                                       [<ffffffff8105fbc7>] lock_acquire+0x8b/0xa8
[   78.185341]                                       [<ffffffff8137e756>] _raw_spin_lock_irqsave+0x3b/0x73
[   78.185341]                                       [<ffffffff81043122>] flush_signals+0x1d/0x43
[   78.185341]                                       [<ffffffff81043172>] ignore_signals+0x2a/0x2c
[   78.185341]                                       [<ffffffff8104eb16>] kthreadd+0x29/0xf4
[   78.185341]                                       [<ffffffff81003814>] kernel_thread_helper+0x4/0x10
[   78.185341]  }
[   78.185341]  ... key      at: [<ffffffff81b4d538>] __key.47081+0x0/0x8
[   78.185341]  ... acquired at:
[   78.185341]    [<ffffffff8105c566>] check_usage_forwards+0xc0/0xcf
[   78.185341]    [<ffffffff8105ce6f>] mark_lock+0x2f4/0x53f
[   78.185341]    [<ffffffff8105f51f>] __lock_acquire+0x606/0x865
[   78.185341]    [<ffffffff8105fbc7>] lock_acquire+0x8b/0xa8
[   78.185341]    [<ffffffff8137e756>] _raw_spin_lock_irqsave+0x3b/0x73
[   78.185341]    [<ffffffff81042d83>] lock_task_sighand+0x9a/0xda
[   78.185341]    [<ffffffff81044bc9>] do_send_sig_info+0x2f/0x72
[   78.185341]    [<ffffffff81044f17>] group_send_sig_info+0x88/0x99
[   78.185341]    [<ffffffff81044f88>] kill_pid_info+0x60/0x8d
[   78.185341]    [<ffffffff8103b79c>] it_real_fn+0x17/0x1b
[   78.185341]    [<ffffffff81051efd>] hrtimer_run_queues+0x167/0x1cb
[   78.185341]    [<ffffffff81041bb3>] run_local_timers+0x9/0x15
[   78.185341]    [<ffffffff81041d7d>] update_process_times+0x2d/0x56
[   78.185341]    [<ffffffff810593ff>] tick_periodic+0x63/0x6f
[   78.185341]    [<ffffffff81059429>] tick_handle_periodic+0x1e/0x6b
[   78.185341]    [<ffffffff81019b2e>] smp_apic_timer_interrupt+0x83/0x96
[   78.185341]    [<ffffffff810033d3>] apic_timer_interrupt+0x13/0x20
[   78.185341]    [<ffffffff810014ca>] cpu_idle+0x48/0x66
[   78.185341]    [<ffffffff813772bd>] start_secondary+0x1b9/0x1bd
[   78.185341] 
[   78.185341] 
[   78.185341] stack backtrace:
[   78.185341] Pid: 0, comm: kworker/0:1 Not tainted 2.6.36-rc2-mm1 #6
[   78.185341] Call Trace:
[   78.185341]  <IRQ>  [<ffffffff8105c497>] print_irq_inversion_bug+0x11e/0x12d
[   78.185341]  [<ffffffff8105c566>] check_usage_forwards+0xc0/0xcf
[   78.185341]  [<ffffffff8105c4a6>] ? check_usage_forwards+0x0/0xcf
[   78.185341]  [<ffffffff8105ce6f>] mark_lock+0x2f4/0x53f
[   78.185341]  [<ffffffff8105f51f>] __lock_acquire+0x606/0x865
[   78.185341]  [<ffffffff8105fbc7>] lock_acquire+0x8b/0xa8
[   78.185341]  [<ffffffff81042d83>] ? lock_task_sighand+0x9a/0xda
[   78.185341]  [<ffffffff8137e756>] _raw_spin_lock_irqsave+0x3b/0x73
[   78.185341]  [<ffffffff81042d83>] ? lock_task_sighand+0x9a/0xda
[   78.185341]  [<ffffffff81042d83>] lock_task_sighand+0x9a/0xda
[   78.185341]  [<ffffffff81042ce9>] ? lock_task_sighand+0x0/0xda
[   78.185341]  [<ffffffff81044bc9>] do_send_sig_info+0x2f/0x72
[   78.185341]  [<ffffffff81044f17>] group_send_sig_info+0x88/0x99
[   78.185341]  [<ffffffff81044e8f>] ? group_send_sig_info+0x0/0x99
[   78.185341]  [<ffffffff8103b785>] ? it_real_fn+0x0/0x1b
[   78.185341]  [<ffffffff81044f88>] kill_pid_info+0x60/0x8d
[   78.185341]  [<ffffffff81044f28>] ? kill_pid_info+0x0/0x8d
[   78.185341]  [<ffffffff8103b785>] ? it_real_fn+0x0/0x1b
[   78.185341]  [<ffffffff8103b79c>] it_real_fn+0x17/0x1b
[   78.185341]  [<ffffffff81051efd>] hrtimer_run_queues+0x167/0x1cb
[   78.185341]  [<ffffffff81041bb3>] run_local_timers+0x9/0x15
[   78.185341]  [<ffffffff81041d7d>] update_process_times+0x2d/0x56
[   78.185341]  [<ffffffff810593ff>] tick_periodic+0x63/0x6f
[   78.185341]  [<ffffffff81059429>] tick_handle_periodic+0x1e/0x6b
[   78.185341]  [<ffffffff81019b2e>] smp_apic_timer_interrupt+0x83/0x96
[   78.185341]  [<ffffffff810033d3>] apic_timer_interrupt+0x13/0x20
[   78.185341]  <EOI>  [<ffffffff81381c3a>] ? __atomic_notifier_call_chain+0x0/0x84
[   78.185341]  [<ffffffff81381c3a>] ? __atomic_notifier_call_chain+0x0/0x84
[   78.185341]  [<ffffffff81381c3a>] ? __atomic_notifier_call_chain+0x0/0x84
[   78.185341]  [<ffffffff81009afe>] ? mwait_idle+0x66/0x72
[   78.185341]  [<ffffffff81009af4>] ? mwait_idle+0x5c/0x72
[   78.185341]  [<ffffffff810014ca>] cpu_idle+0x48/0x66
[   78.185341]  [<ffffffff813772bd>] start_secondary+0x1b9/0x1bd
[   78.185341]  [<ffffffff81377104>] ? start_secondary+0x0/0x1bd

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [patch] oom: fix locking for oom_adj and oom_score_adj
  2010-08-27 21:48 ` Andrew Morton
@ 2010-08-28 22:25   ` David Rientjes
  0 siblings, 0 replies; 17+ messages in thread
From: David Rientjes @ 2010-08-28 22:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KAMEZAWA Hiroyuki, KOSAKI Motohiro, Rik van Riel, Ying Han, linux-mm

On Fri, 27 Aug 2010, Andrew Morton wrote:

> > From: Ying Han <yinghan@google.com>
> > 
> > It's pointless to kill a task if another thread sharing its mm cannot be
> > killed to allow future memory freeing.  A subsequent patch will prevent
> > kills in such cases, but first it's necessary to have a way to flag a
> > task that shares memory with an OOM_DISABLE task that doesn't incur an
> > additional tasklist scan, which would make select_bad_process() an O(n^2)
> > function.
> > 
> > This patch adds an atomic counter to struct mm_struct that follows how
> > many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN.
> > They cannot be killed by the kernel, so their memory cannot be freed in
> > oom conditions.
> > 
> > This only requires task_lock() on the task that we're operating on, it
> > does not require mm->mmap_sem since task_lock() pins the mm and the
> > operation is atomic.
> 
> I don't think lockdep likes us taking task_lock() inside
> lock_task_sighand(), in oom_adjust_write():
> 
> [   78.185341] 
> [   78.185341] =========================================================
> [   78.185341] [ INFO: possible irq lock inversion dependency detected ]
> [   78.185341] 2.6.36-rc2-mm1 #6
> [   78.185341] ---------------------------------------------------------
> [   78.185341] kworker/0:1/0 just changed the state of lock:
> [   78.185341]  (&(&sighand->siglock)->rlock){-.....}, at: [<ffffffff81042d83>] lock_task_sighand+0x9a/0xda
> [   78.185341] but this lock took another, HARDIRQ-unsafe lock in the past:
> [   78.185341]  (&(&p->alloc_lock)->rlock){+.+...}
> [   78.185341] 
> [   78.185341] and interrupts could create inverse lock ordering between them.



oom: fix locking for oom_adj and oom_score_adj

The locking order in oom_adjust_write() and oom_score_adj_write() for 
task->alloc_lock and task->sighand->siglock is reversed, and lockdep
notices that irqs could encounter an ABBA scenario.

This fixes the locking order so that we always take task_lock(task) prior
to lock_task_sighand(task).

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 Can't be folded into the offending patch, 
 oom-add-per-mm-oom-disable-count.patch, because later patch
 oom-rewrite-error-handling-for-oom_adj-and-oom_score_adj-tunables.patch 
 rewrites error handling in these functions.

 fs/proc/base.c |   40 +++++++++++++++++++++-------------------
 1 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1042,9 +1042,16 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		err = -ESRCH;
 		goto out;
 	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+	
 	if (!lock_task_sighand(task, &flags)) {
 		err = -ESRCH;
-		goto err_task_struct;
+		goto err_task_lock;
 	}
 
 	if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
@@ -1052,12 +1059,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		goto err_sighand;
 	}
 
-	task_lock(task);
-	if (!task->mm) {
-		err = -EINVAL;
-		goto err_task_lock;
-	}
-
 	if (oom_adjust != task->signal->oom_adj) {
 		if (oom_adjust == OOM_DISABLE)
 			atomic_inc(&task->mm->oom_disable_count);
@@ -1083,11 +1084,10 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 								-OOM_DISABLE;
-err_task_lock:
-	task_unlock(task);
 err_sighand:
 	unlock_task_sighand(task, &flags);
-err_task_struct:
+err_task_lock:
+	task_unlock(task);
 	put_task_struct(task);
 out:
 	return err < 0 ? err : count;
@@ -1150,21 +1150,24 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		err = -ESRCH;
 		goto out;
 	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
 	if (!lock_task_sighand(task, &flags)) {
 		err = -ESRCH;
-		goto err_task_struct;
+		goto err_task_lock;
 	}
+
 	if (oom_score_adj < task->signal->oom_score_adj &&
 			!capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
 	}
 
-	task_lock(task);
-	if (!task->mm) {
-		err = -EINVAL;
-		goto err_task_lock;
-	}
 	if (oom_score_adj != task->signal->oom_score_adj) {
 		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
 			atomic_inc(&task->mm->oom_disable_count);
@@ -1181,11 +1184,10 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
 							OOM_SCORE_ADJ_MAX;
-err_task_lock:
-	task_unlock(task);
 err_sighand:
 	unlock_task_sighand(task, &flags);
-err_task_struct:
+err_task_lock:
+	task_unlock(task);
 	put_task_struct(task);
 out:
 	return err < 0 ? err : count;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 1/3 v3] oom: add per-mm oom disable count
  2010-08-20 22:41 [patch 1/3 v3] oom: add per-mm oom disable count David Rientjes
                   ` (3 preceding siblings ...)
  2010-08-27 21:48 ` Andrew Morton
@ 2010-08-30  4:39 ` KOSAKI Motohiro
  2010-08-30 21:14   ` David Rientjes
  4 siblings, 1 reply; 17+ messages in thread
From: KOSAKI Motohiro @ 2010-08-30  4:39 UTC (permalink / raw)
  To: David Rientjes, Ying Han
  Cc: kosaki.motohiro, Andrew Morton, KAMEZAWA Hiroyuki, Rik van Riel,
	linux-mm

> From: Ying Han <yinghan@google.com>
> 
> It's pointless to kill a task if another thread sharing its mm cannot be
> killed to allow future memory freeing.  A subsequent patch will prevent
> kills in such cases, but first it's necessary to have a way to flag a
> task that shares memory with an OOM_DISABLE task that doesn't incur an
> additional tasklist scan, which would make select_bad_process() an O(n^2)
> function.
> 
> This patch adds an atomic counter to struct mm_struct that follows how
> many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN.
> They cannot be killed by the kernel, so their memory cannot be freed in
> oom conditions.
> 
> This only requires task_lock() on the task that we're operating on, it
> does not require mm->mmap_sem since task_lock() pins the mm and the
> operation is atomic.
> 
> [rientjes@google.com: changelog and sys_unshare() code]
> Signed-off-by: Ying Han <yinghan@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  fs/exec.c                |    5 +++++
>  fs/proc/base.c           |   30 ++++++++++++++++++++++++++++++
>  include/linux/mm_types.h |    2 ++
>  kernel/exit.c            |    3 +++
>  kernel/fork.c            |   13 ++++++++++++-
>  5 files changed, 52 insertions(+), 1 deletions(-)
> 
> diff --git a/fs/exec.c b/fs/exec.c
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -54,6 +54,7 @@
>  #include <linux/fsnotify.h>
>  #include <linux/fs_struct.h>
>  #include <linux/pipe_fs_i.h>
> +#include <linux/oom.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/mmu_context.h>
> @@ -745,6 +746,10 @@ static int exec_mmap(struct mm_struct *mm)
>  	tsk->mm = mm;
>  	tsk->active_mm = mm;
>  	activate_mm(active_mm, mm);
> +	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> +		atomic_dec(&active_mm->oom_disable_count);

When kernel thread makes user-land process (e.g. usermode-helper),
active_mm might point to unrelated process. active_mm is only meaningful
for scheduler code. please don't touch it. probably you intend to
change old_mm.



> +		atomic_inc(&tsk->mm->oom_disable_count);
> +	}
>  	task_unlock(tsk);
>  	arch_pick_mmap_layout(mm);
>  	if (old_mm) {
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
>  		return -EACCES;
>  	}
>  
> +	task_lock(task);
>
> +	if (!task->mm) {
> +		task_unlock(task);
> +		unlock_task_sighand(task, &flags);
> +		put_task_struct(task);
> +		return -EINVAL;
> +	}
> +
> +	if (oom_adjust != task->signal->oom_adj) {
> +		if (oom_adjust == OOM_DISABLE)
> +			atomic_inc(&task->mm->oom_disable_count);
> +		if (task->signal->oom_adj == OOM_DISABLE)
> +			atomic_dec(&task->mm->oom_disable_count);
> +	}
> +
>  	/*
>  	 * Warn that /proc/pid/oom_adj is deprecated, see
>  	 * Documentation/feature-removal-schedule.txt.
> @@ -1065,6 +1080,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
>  	else
>  		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
>  								-OOM_DISABLE;
> +	task_unlock(task);
>  	unlock_task_sighand(task, &flags);
>  	put_task_struct(task);
>  
> @@ -1133,6 +1149,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
>  		return -EACCES;
>  	}
>  
> +	task_lock(task);
> +	if (!task->mm) {
> +		task_unlock(task);
> +		unlock_task_sighand(task, &flags);
> +		put_task_struct(task);
> +		return -EINVAL;
> +	}
> +	if (oom_score_adj != task->signal->oom_score_adj) {
> +		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
> +			atomic_inc(&task->mm->oom_disable_count);
> +		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> +			atomic_dec(&task->mm->oom_disable_count);
> +	}
>  	task->signal->oom_score_adj = oom_score_adj;
>  	/*
>  	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
> @@ -1143,6 +1172,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
>  	else
>  		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
>  							OOM_SCORE_ADJ_MAX;
> +	task_unlock(task);
>  	unlock_task_sighand(task, &flags);
>  	put_task_struct(task);
>  	return count;
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -310,6 +310,8 @@ struct mm_struct {
>  #ifdef CONFIG_MMU_NOTIFIER
>  	struct mmu_notifier_mm *mmu_notifier_mm;
>  #endif
> +	/* How many tasks sharing this mm are OOM_DISABLE */
> +	atomic_t oom_disable_count;
>  };
>  
>  /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
> diff --git a/kernel/exit.c b/kernel/exit.c
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -50,6 +50,7 @@
>  #include <linux/perf_event.h>
>  #include <trace/events/sched.h>
>  #include <linux/hw_breakpoint.h>
> +#include <linux/oom.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/unistd.h>
> @@ -689,6 +690,8 @@ static void exit_mm(struct task_struct * tsk)
>  	enter_lazy_tlb(mm, current);
>  	/* We don't want this task to be frozen prematurely */
>  	clear_freeze_flag(tsk);
> +	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> +		atomic_dec(&mm->oom_disable_count);
>  	task_unlock(tsk);
>  	mm_update_next_owner(mm);
>  	mmput(mm);
> diff --git a/kernel/fork.c b/kernel/fork.c
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -65,6 +65,7 @@
>  #include <linux/perf_event.h>
>  #include <linux/posix-timers.h>
>  #include <linux/user-return-notifier.h>
> +#include <linux/oom.h>
>  
>  #include <asm/pgtable.h>
>  #include <asm/pgalloc.h>
> @@ -485,6 +486,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  	mm->cached_hole_size = ~0UL;
>  	mm_init_aio(mm);
>  	mm_init_owner(mm, p);
> +	atomic_set(&mm->oom_disable_count, 0);
>  
>  	if (likely(!mm_alloc_pgd(mm))) {
>  		mm->def_flags = 0;
> @@ -738,6 +740,8 @@ good_mm:
>  	/* Initializing for Swap token stuff */
>  	mm->token_priority = 0;
>  	mm->last_interval = 0;
> +	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> +		atomic_inc(&mm->oom_disable_count);
>  
>  	tsk->mm = mm;
>  	tsk->active_mm = mm;
> @@ -1296,8 +1300,11 @@ bad_fork_cleanup_io:
>  bad_fork_cleanup_namespaces:
>  	exit_task_namespaces(p);
>  bad_fork_cleanup_mm:
> -	if (p->mm)
> +	if (p->mm) {
> +		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> +			atomic_dec(&p->mm->oom_disable_count);
>  		mmput(p->mm);
> +	}

This place, we don't have any lock. so, checking signal->oom_score_adj and
change oom_disable_count seems inatomic.


>  bad_fork_cleanup_signal:
>  	if (!(clone_flags & CLONE_THREAD))
>  		free_signal_struct(p->signal);
> @@ -1690,6 +1697,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
>  			active_mm = current->active_mm;
>  			current->mm = new_mm;
>  			current->active_mm = new_mm;
> +			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> +				atomic_dec(&mm->oom_disable_count);
> +				atomic_inc(&new_mm->oom_disable_count);
> +			}
>  			activate_mm(active_mm, new_mm);
>  			new_mm = mm;
>  		}

This place, we are grabbing task_lock(), but task_lock don't prevent
to change signal->oom_score_adj from another thread. This seems racy.

Perhaps, almost place need something else. example,

	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
		task_lock_sighand()
		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { /* check again */
			atomic_dec(&p->mm->oom_disable_count);
		}
		task_unlock_sighand()
	}

But I'm not sure all place don't makes lock order inversion.



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 1/3 v3] oom: add per-mm oom disable count
  2010-08-30  4:39 ` [patch 1/3 v3] oom: add per-mm oom disable count KOSAKI Motohiro
@ 2010-08-30 21:14   ` David Rientjes
  2010-08-31 23:54     ` KOSAKI Motohiro
  0 siblings, 1 reply; 17+ messages in thread
From: David Rientjes @ 2010-08-30 21:14 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Ying Han, Andrew Morton, KAMEZAWA Hiroyuki, Rik van Riel, linux-mm

On Mon, 30 Aug 2010, KOSAKI Motohiro wrote:

> > diff --git a/fs/exec.c b/fs/exec.c
> > --- a/fs/exec.c
> > +++ b/fs/exec.c
> > @@ -54,6 +54,7 @@
> >  #include <linux/fsnotify.h>
> >  #include <linux/fs_struct.h>
> >  #include <linux/pipe_fs_i.h>
> > +#include <linux/oom.h>
> >  
> >  #include <asm/uaccess.h>
> >  #include <asm/mmu_context.h>
> > @@ -745,6 +746,10 @@ static int exec_mmap(struct mm_struct *mm)
> >  	tsk->mm = mm;
> >  	tsk->active_mm = mm;
> >  	activate_mm(active_mm, mm);
> > +	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> > +		atomic_dec(&active_mm->oom_disable_count);
> 
> When kernel thread makes user-land process (e.g. usermode-helper),
> active_mm might point to unrelated process. active_mm is only meaningful
> for scheduler code. please don't touch it. probably you intend to
> change old_mm.
> 

This is safe because kthreads never have non-zero 
p->signal->oom_score_adj.

> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -310,6 +310,8 @@ struct mm_struct {
> >  #ifdef CONFIG_MMU_NOTIFIER
> >  	struct mmu_notifier_mm *mmu_notifier_mm;
> >  #endif
> > +	/* How many tasks sharing this mm are OOM_DISABLE */
> > +	atomic_t oom_disable_count;
> >  };
> >  
> >  /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
> > diff --git a/kernel/exit.c b/kernel/exit.c
> > --- a/kernel/exit.c
> > +++ b/kernel/exit.c
> > @@ -50,6 +50,7 @@
> >  #include <linux/perf_event.h>
> >  #include <trace/events/sched.h>
> >  #include <linux/hw_breakpoint.h>
> > +#include <linux/oom.h>
> >  
> >  #include <asm/uaccess.h>
> >  #include <asm/unistd.h>
> > @@ -689,6 +690,8 @@ static void exit_mm(struct task_struct * tsk)
> >  	enter_lazy_tlb(mm, current);
> >  	/* We don't want this task to be frozen prematurely */
> >  	clear_freeze_flag(tsk);
> > +	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> > +		atomic_dec(&mm->oom_disable_count);
> >  	task_unlock(tsk);
> >  	mm_update_next_owner(mm);
> >  	mmput(mm);
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -65,6 +65,7 @@
> >  #include <linux/perf_event.h>
> >  #include <linux/posix-timers.h>
> >  #include <linux/user-return-notifier.h>
> > +#include <linux/oom.h>
> >  
> >  #include <asm/pgtable.h>
> >  #include <asm/pgalloc.h>
> > @@ -485,6 +486,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> >  	mm->cached_hole_size = ~0UL;
> >  	mm_init_aio(mm);
> >  	mm_init_owner(mm, p);
> > +	atomic_set(&mm->oom_disable_count, 0);
> >  
> >  	if (likely(!mm_alloc_pgd(mm))) {
> >  		mm->def_flags = 0;
> > @@ -738,6 +740,8 @@ good_mm:
> >  	/* Initializing for Swap token stuff */
> >  	mm->token_priority = 0;
> >  	mm->last_interval = 0;
> > +	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> > +		atomic_inc(&mm->oom_disable_count);
> >  
> >  	tsk->mm = mm;
> >  	tsk->active_mm = mm;
> > @@ -1296,8 +1300,11 @@ bad_fork_cleanup_io:
> >  bad_fork_cleanup_namespaces:
> >  	exit_task_namespaces(p);
> >  bad_fork_cleanup_mm:
> > -	if (p->mm)
> > +	if (p->mm) {
> > +		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> > +			atomic_dec(&p->mm->oom_disable_count);
> >  		mmput(p->mm);
> > +	}
> 
> This place, we don't have any lock. so, checking signal->oom_score_adj and
> change oom_disable_count seems inatomic.
> 

Ah, true, we need task_lock(p) around the conditional, thanks.

> >  bad_fork_cleanup_signal:
> >  	if (!(clone_flags & CLONE_THREAD))
> >  		free_signal_struct(p->signal);
> > @@ -1690,6 +1697,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
> >  			active_mm = current->active_mm;
> >  			current->mm = new_mm;
> >  			current->active_mm = new_mm;
> > +			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> > +				atomic_dec(&mm->oom_disable_count);
> > +				atomic_inc(&new_mm->oom_disable_count);
> > +			}
> >  			activate_mm(active_mm, new_mm);
> >  			new_mm = mm;
> >  		}
> 
> This place, we are grabbing task_lock(), but task_lock don't prevent
> to change signal->oom_score_adj from another thread. This seems racy.
> 

It does, task_lock(current) protects current->signal->oom_score_adj from 
changing in oom-add-per-mm-oom-disable-count.patch.

I'll add the task_lock(p) in mm_init(), thanks for the review!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 1/3 v3] oom: add per-mm oom disable count
  2010-08-30 21:14   ` David Rientjes
@ 2010-08-31 23:54     ` KOSAKI Motohiro
  2010-09-01 23:41       ` David Rientjes
  0 siblings, 1 reply; 17+ messages in thread
From: KOSAKI Motohiro @ 2010-08-31 23:54 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Ying Han, Andrew Morton, KAMEZAWA Hiroyuki,
	Rik van Riel, linux-mm

> > > @@ -745,6 +746,10 @@ static int exec_mmap(struct mm_struct *mm)
> > >  	tsk->mm = mm;
> > >  	tsk->active_mm = mm;
> > >  	activate_mm(active_mm, mm);
> > > +	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> > > +		atomic_dec(&active_mm->oom_disable_count);
> > 
> > When kernel thread makes user-land process (e.g. usermode-helper),
> > active_mm might point to unrelated process. active_mm is only meaningful
> > for scheduler code. please don't touch it. probably you intend to
> > change old_mm.
> 
> This is safe because kthreads never have non-zero 
> p->signal->oom_score_adj.

Hm? my example is wrong? my point is, you shouldn't touch active_mm.


> > > @@ -1690,6 +1697,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
> > >  			active_mm = current->active_mm;
> > >  			current->mm = new_mm;
> > >  			current->active_mm = new_mm;
> > > +			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> > > +				atomic_dec(&mm->oom_disable_count);
> > > +				atomic_inc(&new_mm->oom_disable_count);
> > > +			}
> > >  			activate_mm(active_mm, new_mm);
> > >  			new_mm = mm;
> > >  		}
> > 
> > This place, we are grabbing task_lock(), but task_lock don't prevent
> > to change signal->oom_score_adj from another thread. This seems racy.
> > 
> 
> It does, task_lock(current) protects current->signal->oom_score_adj from 
> changing in oom-add-per-mm-oom-disable-count.patch.
> 
> I'll add the task_lock(p) in mm_init(), thanks for the review!

Wait, can you please elabolate more? task_lock() only lock one thread.
Why can it protect multi-thread race?



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 1/3 v3] oom: add per-mm oom disable count
  2010-08-31 23:54     ` KOSAKI Motohiro
@ 2010-09-01 23:41       ` David Rientjes
  2010-09-02  0:50         ` KOSAKI Motohiro
  0 siblings, 1 reply; 17+ messages in thread
From: David Rientjes @ 2010-09-01 23:41 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Ying Han, Andrew Morton, KAMEZAWA Hiroyuki, Rik van Riel, linux-mm

On Wed, 1 Sep 2010, KOSAKI Motohiro wrote:

> > > > @@ -745,6 +746,10 @@ static int exec_mmap(struct mm_struct *mm)
> > > >  	tsk->mm = mm;
> > > >  	tsk->active_mm = mm;
> > > >  	activate_mm(active_mm, mm);
> > > > +	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> > > > +		atomic_dec(&active_mm->oom_disable_count);
> > > 
> > > When kernel thread makes user-land process (e.g. usermode-helper),
> > > active_mm might point to unrelated process. active_mm is only meaningful
> > > for scheduler code. please don't touch it. probably you intend to
> > > change old_mm.
> > 
> > This is safe because kthreads never have non-zero 
> > p->signal->oom_score_adj.
> 
> Hm? my example is wrong? my point is, you shouldn't touch active_mm.
> 

Ok, I'll use old_mm instead as a cleanup.  Thanks!

> > > > @@ -1690,6 +1697,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
> > > >  			active_mm = current->active_mm;
> > > >  			current->mm = new_mm;
> > > >  			current->active_mm = new_mm;
> > > > +			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> > > > +				atomic_dec(&mm->oom_disable_count);
> > > > +				atomic_inc(&new_mm->oom_disable_count);
> > > > +			}
> > > >  			activate_mm(active_mm, new_mm);
> > > >  			new_mm = mm;
> > > >  		}
> > > 
> > > This place, we are grabbing task_lock(), but task_lock don't prevent
> > > to change signal->oom_score_adj from another thread. This seems racy.
> > > 
> > 
> > It does, task_lock(current) protects current->signal->oom_score_adj from 
> > changing in oom-add-per-mm-oom-disable-count.patch.
> > 
> > I'll add the task_lock(p) in mm_init(), thanks for the review!
> 
> Wait, can you please elabolate more? task_lock() only lock one thread.
> Why can it protect multi-thread race?
> 

We take task_lock(tsk) whenever we change tsk->signal->oom_score_adj.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch 1/3 v3] oom: add per-mm oom disable count
  2010-09-01 23:41       ` David Rientjes
@ 2010-09-02  0:50         ` KOSAKI Motohiro
  0 siblings, 0 replies; 17+ messages in thread
From: KOSAKI Motohiro @ 2010-09-02  0:50 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Ying Han, Andrew Morton, KAMEZAWA Hiroyuki,
	Rik van Riel, linux-mm

> > > > > @@ -1690,6 +1697,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
> > > > >  			active_mm = current->active_mm;
> > > > >  			current->mm = new_mm;
> > > > >  			current->active_mm = new_mm;
> > > > > +			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
> > > > > +				atomic_dec(&mm->oom_disable_count);
> > > > > +				atomic_inc(&new_mm->oom_disable_count);
> > > > > +			}
> > > > >  			activate_mm(active_mm, new_mm);
> > > > >  			new_mm = mm;
> > > > >  		}
> > > > 
> > > > This place, we are grabbing task_lock(), but task_lock don't prevent
> > > > to change signal->oom_score_adj from another thread. This seems racy.
> > > > 
> > > 
> > > It does, task_lock(current) protects current->signal->oom_score_adj from 
> > > changing in oom-add-per-mm-oom-disable-count.patch.
> > > 
> > > I'll add the task_lock(p) in mm_init(), thanks for the review!
> > 
> > Wait, can you please elabolate more? task_lock() only lock one thread.
> > Why can it protect multi-thread race?
> > 
> 
> We take task_lock(tsk) whenever we change tsk->signal->oom_score_adj.

example, Process P1 has threads T1 and T2.
oom_score_adj_write() take task_lock(T1) and siglock(P1). unshare() take
task_lock(T2). How protect?





--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2010-09-02  0:50 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-20 22:41 [patch 1/3 v3] oom: add per-mm oom disable count David Rientjes
2010-08-20 22:41 ` [patch 2/3 v3] oom: avoid killing a task if a thread sharing its mm cannot be killed David Rientjes
2010-08-22  9:49   ` KOSAKI Motohiro
2010-08-22 23:14     ` David Rientjes
2010-08-20 22:41 ` [patch 3/3 v3] oom: kill all threads sharing oom killed task's mm David Rientjes
2010-08-20 23:52   ` David Rientjes
2010-08-23 23:16     ` Andrew Morton
2010-08-24  0:57       ` David Rientjes
2010-08-23 23:13 ` [patch 1/3 v3] oom: add per-mm oom disable count Andrew Morton
2010-08-24  0:53   ` David Rientjes
2010-08-27 21:48 ` Andrew Morton
2010-08-28 22:25   ` [patch] oom: fix locking for oom_adj and oom_score_adj David Rientjes
2010-08-30  4:39 ` [patch 1/3 v3] oom: add per-mm oom disable count KOSAKI Motohiro
2010-08-30 21:14   ` David Rientjes
2010-08-31 23:54     ` KOSAKI Motohiro
2010-09-01 23:41       ` David Rientjes
2010-09-02  0:50         ` KOSAKI Motohiro

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.