* [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
@ 2009-01-21  8:06 Miao Xie
  2009-01-21  8:30 ` Nick Piggin
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Miao Xie @ 2009-01-21  8:06 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton, Paul Menage; +Cc: Linux-Kernel

After a task's cpuset's mems are changed while 'memory_spread_page' is set,
the task still allocates page cache on the old nodes. This is caused by the
task's stale mems_allowed: the current kernel does not update it unless some
function invokes cpuset_update_task_memory_state(), which is sometimes too
late. We must update each task's mems_allowed promptly.

Slab has the same problem.

Fix the bug by updating each task's mems_allowed and spread flags as soon as
its cpuset's mems or spread flags change.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 include/linux/cpuset.h |    4 -
 include/linux/sched.h  |    1 -
 init/main.c            |    3 +-
 kernel/cpuset.c        |  204 +++++++++++++++--------------------------------
 kernel/kthread.c       |    1 +
 mm/mempolicy.c         |   12 ---
 mm/page_alloc.c        |    5 +-
 7 files changed, 69 insertions(+), 161 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 90c6074..c8155a6 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -17,7 +17,6 @@
 
 extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
-extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
@@ -26,7 +25,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p,
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
-void cpuset_update_task_memory_state(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
 extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
@@ -83,7 +81,6 @@ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
 #else /* !CONFIG_CPUSETS */
 
-static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
@@ -105,7 +102,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 
 #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
 static inline void cpuset_init_current_mems_allowed(void) {}
-static inline void cpuset_update_task_memory_state(void) {}
 
 static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b8..2c9a93c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1332,7 +1332,6 @@ struct task_struct {
 #endif
 #ifdef CONFIG_CPUSETS
 	nodemask_t mems_allowed;
-	int cpuset_mems_generation;
 	int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
diff --git a/init/main.c b/init/main.c
index 8442094..3ce3b5d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -635,7 +635,6 @@ asmlinkage void __init start_kernel(void)
 #endif
 	vmalloc_init();
 	vfs_caches_init_early();
-	cpuset_init_early();
 	page_cgroup_init();
 	mem_init();
 	enable_debug_pagealloc();
@@ -845,6 +844,8 @@ static int __init kernel_init(void * unused)
 	 */
 	init_pid_ns.child_reaper = current;
 
+	current->mems_allowed = node_possible_map;
+
 	cad_pid = task_pid(current);
 
 	smp_prepare_cpus(setup_max_cpus);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a856788..36436fc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -89,12 +89,6 @@ struct cpuset {
 
 	struct cpuset *parent;		/* my parent */
 
-	/*
-	 * Copy of global cpuset_mems_generation as of the most
-	 * recent time this cpuset changed its mems_allowed.
-	 */
-	int mems_generation;
-
 	struct fmeter fmeter;		/* memory_pressure filter */
 
 	/* partition number for rebuild_sched_domains() */
@@ -172,27 +166,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value.  Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement.  So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 };
@@ -224,8 +197,8 @@ static struct cpuset top_cpuset = {
  * If a task is only holding callback_mutex, then it has read-only
  * access to cpusets.
  *
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * The task_struct fields mems_allowed may only be accessed in the context
+ * of that task, so require no locks.
  *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
@@ -327,77 +300,6 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
 }
 
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held.  May be
- * called with or without cgroup_mutex held.  Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL.  This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation.  However this really only
- * matters on alpha systems using cpusets heavily.  If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant.  Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
- */
-
-void cpuset_update_task_memory_state(void)
-{
-	int my_cpusets_mem_gen;
-	struct task_struct *tsk = current;
-	struct cpuset *cs;
-
-	rcu_read_lock();
-	my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-	rcu_read_unlock();
-
-	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-		mutex_lock(&callback_mutex);
-		task_lock(tsk);
-		cs = task_cs(tsk); /* Maybe changed when task not locked */
-		guarantee_online_mems(cs, &tsk->mems_allowed);
-		tsk->cpuset_mems_generation = cs->mems_generation;
-		if (is_spread_page(cs))
-			tsk->flags |= PF_SPREAD_PAGE;
-		else
-			tsk->flags &= ~PF_SPREAD_PAGE;
-		if (is_spread_slab(cs))
-			tsk->flags |= PF_SPREAD_SLAB;
-		else
-			tsk->flags &= ~PF_SPREAD_SLAB;
-		task_unlock(tsk);
-		mutex_unlock(&callback_mutex);
-		mpol_rebind_task(tsk, &tsk->mems_allowed);
-	}
-}
-
 /*
  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
  *
@@ -990,14 +892,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  *    other task, the task_struct mems_allowed that we are hacking
  *    is for our current task, which must allocate new pages for that
  *    migrating memory region.
- *
- *    We call cpuset_update_task_memory_state() before hacking
- *    our tasks mems_allowed, so that we are assured of being in
- *    sync with our tasks cpuset, and in particular, callbacks to
- *    cpuset_update_task_memory_state() from nested page allocations
- *    won't see any mismatch of our cpuset and task mems_generation
- *    values, so won't overwrite our hacked tasks mems_allowed
- *    nodemask.
  */
 
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1005,8 +899,6 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 {
 	struct task_struct *tsk = current;
 
-	cpuset_update_task_memory_state();
-
 	mutex_lock(&callback_mutex);
 	tsk->mems_allowed = *to;
 	mutex_unlock(&callback_mutex);
@@ -1076,6 +968,10 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 				"Cpuset mempolicy rebind incomplete.\n");
 			break;
 		}
+		mutex_lock(&callback_mutex);
+		guarantee_online_mems(cs, &p->mems_allowed);
+		mutex_unlock(&callback_mutex);
+		mpol_rebind_task(p, &p->mems_allowed);
 		mm = get_task_mm(p);
 		if (!mm)
 			continue;
@@ -1118,10 +1014,9 @@ done:
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset.  Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * rebind any vma mempolicies and if the cpuset is marked
+ * 'memory_migrate', migrate the tasks pages to the new memory.
  *
  * Call with cgroup_mutex held.  May take callback_mutex during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1169,7 +1064,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	retval = update_tasks_nodemask(cs, &oldmem);
@@ -1197,6 +1091,33 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 	return 0;
 }
 
+static void cpuset_change_flag(struct task_struct *tsk,
+				struct cgroup_scanner *scan)
+{
+	struct cpuset *cs;
+
+	cs = task_cs(tsk);
+	if (is_spread_page(cs))
+		tsk->flags |= PF_SPREAD_PAGE;
+	else
+		tsk->flags &= ~PF_SPREAD_PAGE;
+	if (is_spread_slab(cs))
+		tsk->flags |= PF_SPREAD_SLAB;
+	else
+		tsk->flags &= ~PF_SPREAD_SLAB;
+}
+
+static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+{
+	struct cgroup_scanner scan;
+
+	scan.cg = cs->css.cgroup;
+	scan.test_task = NULL;
+	scan.process_task = cpuset_change_flag;
+	scan.heap = heap;
+	cgroup_scan_tasks(&scan);
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:		the bit to update (see cpuset_flagbits_t)
@@ -1212,6 +1133,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	struct cpuset *trialcs;
 	int err;
 	int balance_flag_changed;
+	int spread_changed;
+	struct ptr_heap heap;
 
 	trialcs = alloc_trial_cpuset(cs);
 	if (!trialcs)
@@ -1226,9 +1149,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	if (err < 0)
 		goto out;
 
+	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (err)
+		goto out;
+
 	balance_flag_changed = (is_sched_load_balance(cs) !=
 				is_sched_load_balance(trialcs));
 
+	spread_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+			|| (is_spread_page(cs) != is_spread_page(trialcs)));
+
 	mutex_lock(&callback_mutex);
 	cs->flags = trialcs->flags;
 	mutex_unlock(&callback_mutex);
@@ -1236,6 +1166,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		async_rebuild_sched_domains();
 
+	if (spread_changed)
+		update_tasks_flags(cs, &heap);
+
+	heap_free(&heap);
 out:
 	free_trial_cpuset(trialcs);
 	return err;
@@ -1374,15 +1308,29 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
+		tsk->mems_allowed = node_possible_map;
 	} else {
 		mutex_lock(&callback_mutex);
 		guarantee_online_cpus(cs, cpus_attach);
+		guarantee_online_mems(cs,&tsk->mems_allowed);
 		mutex_unlock(&callback_mutex);
 	}
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	if (err)
 		return;
 
+	mutex_lock(&callback_mutex);
+	if (is_spread_page(cs))
+		tsk->flags |= PF_SPREAD_PAGE;
+	else
+		tsk->flags &= ~PF_SPREAD_PAGE;
+	if (is_spread_slab(cs))
+		tsk->flags |= PF_SPREAD_SLAB;
+	else
+		tsk->flags &= ~PF_SPREAD_SLAB;
+	mutex_unlock(&callback_mutex);
+	mpol_rebind_task(tsk, &tsk->mems_allowed);
+
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
@@ -1444,11 +1392,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		break;
 	case FILE_SPREAD_PAGE:
 		retval = update_flag(CS_SPREAD_PAGE, cs, val);
-		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	case FILE_SPREAD_SLAB:
 		retval = update_flag(CS_SPREAD_SLAB, cs, val);
-		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	default:
 		retval = -EINVAL;
@@ -1787,8 +1733,6 @@ static struct cgroup_subsys_state *cpuset_create(
 	struct cpuset *parent;
 
 	if (!cont->parent) {
-		/* This is early initialization for the top cgroup */
-		top_cpuset.mems_generation = cpuset_mems_generation++;
 		return &top_cpuset.css;
 	}
 	parent = cgroup_cs(cont->parent);
@@ -1800,7 +1744,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (is_spread_page(parent))
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1809,7 +1752,6 @@ static struct cgroup_subsys_state *cpuset_create(
 	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
-	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
@@ -1828,8 +1770,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct cpuset *cs = cgroup_cs(cont);
 
-	cpuset_update_task_memory_state();
-
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
@@ -1850,21 +1790,6 @@ struct cgroup_subsys cpuset_subsys = {
 	.early_init = 1,
 };
 
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
-	alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
-
-	top_cpuset.mems_generation = cpuset_mems_generation++;
-	return 0;
-}
-
-
 /**
  * cpuset_init - initialize cpusets at system boot
  *
@@ -1875,11 +1800,12 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
+	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+		BUG();
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 
 	fmeter_init(&top_cpuset.fmeter);
-	top_cpuset.mems_generation = cpuset_mems_generation++;
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456..90469e6 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -242,6 +242,7 @@ int kthreadd(void *unused)
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
 
+	current->mems_allowed = node_possible_map;
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
 	for (;;) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6f..5912b03 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -222,10 +222,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy->flags = flags;
 
 	if (nodes) {
-		/*
-		 * cpuset related setup doesn't apply to local allocation
-		 */
-		cpuset_update_task_memory_state();
 		if (flags & MPOL_F_RELATIVE_NODES)
 			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 					       &cpuset_current_mems_allowed);
@@ -674,7 +670,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_task_memory_state();
 	if (flags &
 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 		return -EINVAL;
@@ -1545,8 +1540,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
-	cpuset_update_task_memory_state();
-
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
@@ -1585,16 +1578,11 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  *	interrupt context and apply the current process NUMA policy.
  *	Returns NULL when no page can be allocated.
  *
- *	Don't call cpuset_update_task_memory_state() unless
- *	1) it's ok to take cpuset_sem (can WAIT), and
- *	2) allocating for current task (not interrupt).
  */
 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
 	struct mempolicy *pol = current->mempolicy;
 
-	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5675b30..503219c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1573,10 +1573,7 @@ nofail_alloc:
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	/*
-	 * The task's cpuset might have expanded its set of allowable nodes
-	 */
-	cpuset_update_task_memory_state();
+
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
-- 
1.6.0.3




* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-01-21  8:06 [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set Miao Xie
@ 2009-01-21  8:30 ` Nick Piggin
  2009-01-21 10:41 ` Paul Menage
  2009-01-27 22:42 ` Andrew Morton
  2 siblings, 0 replies; 24+ messages in thread
From: Nick Piggin @ 2009-01-21  8:30 UTC (permalink / raw)
  To: miaox; +Cc: Ingo Molnar, Andrew Morton, Paul Menage, Linux-Kernel

On Wednesday 21 January 2009 19:06:20 Miao Xie wrote:
> After a task's cpuset's mems are changed while 'memory_spread_page' is set,
> the task still allocates page cache on the old nodes. This is caused by the
> task's stale mems_allowed: the current kernel does not update it unless
> some function invokes cpuset_update_task_memory_state(), which is sometimes
> too late. We must update each task's mems_allowed promptly.
>
> Slab has the same problem.
>
> Fix the bug by updating each task's mems_allowed and spread flags as soon as
> its cpuset's mems or spread flags change.

Thank you, this is a much better way to handle the problem, and keeps
those costly function calls out of fastpaths.

I don't know the cpuset code well enough to review it right now, but
the mm/ pieces thank you kindly for it :) Nice patch.



* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-01-21  8:06 [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set Miao Xie
  2009-01-21  8:30 ` Nick Piggin
@ 2009-01-21 10:41 ` Paul Menage
  2009-02-03  3:05   ` Miao Xie
  2009-01-27 22:42 ` Andrew Morton
  2 siblings, 1 reply; 24+ messages in thread
From: Paul Menage @ 2009-01-21 10:41 UTC (permalink / raw)
  To: miaox; +Cc: Ingo Molnar, Andrew Morton, Linux-Kernel

On Wed, Jan 21, 2009 at 12:06 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> After a task's cpuset's mems are changed while 'memory_spread_page' is set,
> the task still allocates page cache on the old nodes. This is caused by the
> task's stale mems_allowed: the current kernel does not update it unless some
> function invokes cpuset_update_task_memory_state(), which is sometimes too late.

Can you give a more concrete example of how the current code can break?

> We must update each task's mems_allowed promptly.

This is a fairly fundamental change to the way that mems_allowed is
handled - it dates back to fairly early in cpusets' history.

> - * The task_struct fields mems_allowed and mems_generation may only
> - * be accessed in the context of that task, so require no locks.
> + * The task_struct fields mems_allowed may only be accessed in the context
> + * of that task, so require no locks.

This comment is no longer true, since mems_allowed is now being
updated by other processes.

What's to stop a task reading current->mems_allowed and racing with an
update from update_tasks_nodemask() ? Or else, why can that not lead
to badness?

Paul


* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-01-21  8:06 [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set Miao Xie
  2009-01-21  8:30 ` Nick Piggin
  2009-01-21 10:41 ` Paul Menage
@ 2009-01-27 22:42 ` Andrew Morton
  2009-01-28 16:38   ` Christoph Lameter
  2009-02-03  3:25   ` Miao Xie
  2 siblings, 2 replies; 24+ messages in thread
From: Andrew Morton @ 2009-01-27 22:42 UTC (permalink / raw)
  To: miaox; +Cc: mingo, menage, linux-kernel, Christoph Lameter, Nick Piggin

On Wed, 21 Jan 2009 16:06:20 +0800
Miao Xie <miaox@cn.fujitsu.com> wrote:

> After a task's cpuset's mems are changed while 'memory_spread_page' is set,
> the task still allocates page cache on the old nodes. This is caused by the
> task's stale mems_allowed: the current kernel does not update it unless some
> function invokes cpuset_update_task_memory_state(), which is sometimes too
> late. We must update each task's mems_allowed promptly.
>
> Slab has the same problem.
>
> Fix the bug by updating each task's mems_allowed and spread flags as soon as
> its cpuset's mems or spread flags change.
> 
>
> ...
>
> --- a/kernel/kthread.c
> +++ b/kernel/kthread.c
> @@ -242,6 +242,7 @@ int kthreadd(void *unused)
>  	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
>  	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
>  
> +	current->mems_allowed = node_possible_map;
>  	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;

Why this change?  kthreadd() is called from rest_init(), before anyone
has had a chance to alter ->mems_allowed?


I queued the patch pending your response to Paul's questions, thanks.


* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-01-27 22:42 ` Andrew Morton
@ 2009-01-28 16:38   ` Christoph Lameter
  2009-02-03  3:25   ` Miao Xie
  1 sibling, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2009-01-28 16:38 UTC (permalink / raw)
  To: Andrew Morton; +Cc: miaox, mingo, menage, linux-kernel, Nick Piggin

What exactly is breaking? You do a memory spread in a set of nodes
determined by the cpuset? Or how do you restrict the nodes that memory
spread is allowed to cycle through?





* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-01-21 10:41 ` Paul Menage
@ 2009-02-03  3:05   ` Miao Xie
  0 siblings, 0 replies; 24+ messages in thread
From: Miao Xie @ 2009-02-03  3:05 UTC (permalink / raw)
  To: Paul Menage; +Cc: Ingo Molnar, Andrew Morton, Linux-Kernel

[-- Attachment #1: Type: text/plain, Size: 2970 bytes --]

on 2009-1-21 18:41 Paul Menage wrote:
> On Wed, Jan 21, 2009 at 12:06 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
>> After a task's cpuset's mems are changed while 'memory_spread_page' is set,
>> the task still allocates page cache on the old nodes. This is caused by the
>> task's stale mems_allowed: the current kernel does not update it unless some
>> function invokes cpuset_update_task_memory_state(), which is sometimes too late.
> 
> Can you give a more concrete example of how the current code can break?

I'm sorry for the late reply.

A test program that reproduces the problem on the current kernel is attached.
The program rereads the file "DATAFILE" each time it receives SIGUSR1, and
exits after receiving SIGINT.
	Test Task
	  sigsuspend();
	  while(!end) {
		read DATAFILE;
		sigsuspend();
	  }

Steps to reproduce:
# mkdir /dev/cpuset
# mount -t cpuset xxx /dev/cpuset
# mkdir /dev/cpuset/0
# echo 0 > /dev/cpuset/0/cpus
# echo 1-2 > /dev/cpuset/0/mems
# echo 1 > /dev/cpuset/0/memory_spread_page
# dd if=/dev/zero of=DATAFILE bs=4096 count=20480	<- create an 80M file
# ./mem-hog &
# tst_pid=$!
# cat /proc/$tst_pid/status
...
Mems_allowed:   00000000,00000007
Mems_allowed_list:      0-2
...
# echo $tst_pid > /dev/cpuset/0/tasks
# echo 3 > /proc/sys/vm/drop_caches
# find /sys/devices/system/node/ -name "meminfo" -exec cat {} + | grep "FilePages"
Node 0 FilePages:         15488 kB
Node 1 FilePages:             0 kB
Node 2 FilePages:             0 kB
# cat /proc/$tst_pid/status
...
Mems_allowed:   00000000,00000007		<- old memory list
Mems_allowed_list:      0-2
...
# kill -s sigusr1 $tst_pid

	...wait a moment...

# find /sys/devices/system/node/ -name "meminfo" -exec cat {} + | grep "FilePages"
Node 0 FilePages:         97548 kB
Node 1 FilePages:             0 kB
Node 2 FilePages:             0 kB
# cat /proc/$tst_pid/status
...
Mems_allowed:   00000000,00000007               <- old memory list
Mems_allowed_list:      0-2
...

>> We must update each task's mems_allowed promptly.
> 
> This is a fairly fundamental change to the way that mems_allowed is
> handled - it dates back to fairly early in cpusets' history.

I found Mr. Paul Jackson had discussed it with somebody:

  http://lkml.org/lkml/2004/8/11/211

According to what Mr. Paul Jackson said, my patch has a serious bug.
Maybe we can fix this bug just by choosing a good call site for
cpuset_update_task_memory_state().

Thanks!
Miao

>> - * The task_struct fields mems_allowed and mems_generation may only
>> - * be accessed in the context of that task, so require no locks.
>> + * The task_struct fields mems_allowed may only be accessed in the context
>> + * of that task, so require no locks.
> 
> This comment is no longer true, since mems_allowed is now being
> updated by other processes.
> 
> What's to stop a task reading current->mems_allowed and racing with an
> update from update_tasks_nodemask() ? Or else, why can that not lead
> to badness?
> 
> Paul
> 
> 
> 


[-- Attachment #2: mem-hog.c --]
[-- Type: text/x-csrc, Size: 1352 bytes --]

/* gcc mem-hog.c -g -o mem-hog -Wall -Wextra */

#include <stdio.h>
#include <unistd.h>
#include <err.h>
#include <signal.h>
#include <fcntl.h>

#define UNUSED __attribute__ ((unused))
#define BUFFER_SIZE 1024

volatile sig_atomic_t end;	/* set to 1 by the SIGINT handler */

/* SIGUSR1 handler: does nothing, just interrupts sigsuspend() */
void sighandler1(UNUSED int signo)
{
}

void sigint(UNUSED int signo)
{
	end = 1;
}

/* Read DATAFILE from start to end to pull it into the page cache. */
int page_cache_hog(void)
{
	int fd = -1;
	char buff[BUFFER_SIZE];
	char path[BUFFER_SIZE];
	int ret = 0;

	sprintf(path, "%s", "DATAFILE");
	fd = open(path, O_RDONLY);
	if (fd == -1) {
		warn("open %s failed", path);
		return -1;
	}

	while ((ret = read(fd, buff, sizeof(buff))) > 0);
	if (ret == -1)
		warn("read %s failed", path);

	close(fd);
	return ret;
}

/* Wait for SIGUSR1, then reread the file until SIGINT sets 'end'. */
int mem_hog(void)
{
	sigset_t sigset;
	int ret = 0;

	if (sigemptyset(&sigset) < 0)
		err(1, "sigemptyset()");
	sigsuspend(&sigset);

	while (!end) {
		ret = page_cache_hog();

		sigsuspend(&sigset);
	}

	return ret;
}

int main(UNUSED int argc, UNUSED char *argv[])
{
	struct sigaction sa1, sa2;

	sa1.sa_handler = sighandler1;
	if (sigemptyset(&sa1.sa_mask) < 0)
		err(1, "sigemptyset()");

	sa1.sa_flags = 0;
	if (sigaction(SIGUSR1, &sa1, NULL) < 0)
		err(1, "sigaction()");

	sa2.sa_handler = sigint;
	if (sigemptyset(&sa2.sa_mask) < 0)
		err(1, "sigemptyset()");

	sa2.sa_flags = 0;
	if (sigaction(SIGINT, &sa2, NULL) < 0)
		err(1, "sigaction()");

	return mem_hog();
}


* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-01-27 22:42 ` Andrew Morton
  2009-01-28 16:38   ` Christoph Lameter
@ 2009-02-03  3:25   ` Miao Xie
  2009-02-03 22:16     ` Andrew Morton
  1 sibling, 1 reply; 24+ messages in thread
From: Miao Xie @ 2009-02-03  3:25 UTC (permalink / raw)
  To: Andrew Morton; +Cc: mingo, menage, linux-kernel, Christoph Lameter, Nick Piggin

on 2009-1-28 6:42 Andrew Morton wrote:
> On Wed, 21 Jan 2009 16:06:20 +0800
> Miao Xie <miaox@cn.fujitsu.com> wrote:
> 
>> After a task's cpuset's mems are changed while 'memory_spread_page' is set,
>> the task still allocates page cache on the old nodes. This is caused by the
>> task's stale mems_allowed: the current kernel does not update it unless some
>> function invokes cpuset_update_task_memory_state(), which is sometimes too
>> late. We must update each task's mems_allowed promptly.
>>
>> Slab has the same problem.
>>
>> Fix the bug by updating each task's mems_allowed and spread flags as soon
>> as its cpuset's mems or spread flags change.
>>
>>
>> ...
>>
>> --- a/kernel/kthread.c
>> +++ b/kernel/kthread.c
>> @@ -242,6 +242,7 @@ int kthreadd(void *unused)
>>  	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
>>  	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
>>  
>> +	current->mems_allowed = node_possible_map;
>>  	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
> 
> Why this change?  kthreadd() is called from rest_init(), before anyone
> has had a chance to alter ->mems_allowed?

I found that kthreadd's mems_allowed was not initialized after applying this
patch; every bit of it is 1, so...
Maybe it is redundant.

Thanks!
Miao

> 
> I queued the patch pending your response to Paul's questions, thanks.
> 
> 
> 




* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-03  3:25   ` Miao Xie
@ 2009-02-03 22:16     ` Andrew Morton
  2009-02-03 22:49       ` Paul Menage
  2009-02-04  9:03       ` Miao Xie
  0 siblings, 2 replies; 24+ messages in thread
From: Andrew Morton @ 2009-02-03 22:16 UTC (permalink / raw)
  To: miaox; +Cc: mingo, menage, linux-kernel, cl, nickpiggin

On Tue, 03 Feb 2009 11:25:25 +0800
Miao Xie <miaox@cn.fujitsu.com> wrote:

> on 2009-1-28 6:42 Andrew Morton wrote:
> > On Wed, 21 Jan 2009 16:06:20 +0800
> > Miao Xie <miaox@cn.fujitsu.com> wrote:
> > 
> >> After a task's cpuset's mems are changed while 'memory_spread_page' is set,
> >> the task still allocates page cache on the old nodes. This is caused by the
> >> task's stale mems_allowed: the current kernel does not update it unless some
> >> function invokes cpuset_update_task_memory_state(), which is sometimes too
> >> late. We must update each task's mems_allowed promptly.
> >>
> >> Slab has the same problem.
> >>
> >> Fix the bug by updating each task's mems_allowed and spread flags as soon
> >> as its cpuset's mems or spread flags change.
> >>
> >>
> >> ...
> >>
> >> --- a/kernel/kthread.c
> >> +++ b/kernel/kthread.c
> >> @@ -242,6 +242,7 @@ int kthreadd(void *unused)
> >>  	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
> >>  	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
> >>  
> >> +	current->mems_allowed = node_possible_map;
> >>  	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
> > 
> > Why this change?  kthreadd() is called from rest_init(), before anyone
> > has had a chance to alter ->mems_allowed?
> 
> I found that kthreadd's mems_allowed was not initialized after applying this
> patch; every bit of it is 1, so...
> Maybe it is redundant.

I think it is redundant.  kthreadd's mems_allowed _should_ be all-ones.
Or at least, all-nodes-allowed.

I wasn't able to find out where the setting of init's mems_allowed
happens, after a bit of grepping and hunting.  It should be done within
INIT_TASK, but isn't.

Still, kthreadd is reliably parented by swapper, and there should be no
need to alter its mems_allowed.

Similarly, what was the reason for setting current->mems_allowed in
kernel_init()?  That also should be unneeded.

Finally, I've somewhat lost track of where we are with this patch. 
Paul, do you see any other remaining issues?



* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-03 22:16     ` Andrew Morton
@ 2009-02-03 22:49       ` Paul Menage
  2009-02-04  9:31         ` Miao Xie
  2009-02-04  9:03       ` Miao Xie
  1 sibling, 1 reply; 24+ messages in thread
From: Paul Menage @ 2009-02-03 22:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: miaox, mingo, linux-kernel, cl, nickpiggin

On Tue, Feb 3, 2009 at 2:16 PM, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Tue, 03 Feb 2009 11:25:25 +0800
> Miao Xie <miaox@cn.fujitsu.com> wrote:
>
>> on 2009-1-28 6:42 Andrew Morton wrote:
>> > On Wed, 21 Jan 2009 16:06:20 +0800
>> > Miao Xie <miaox@cn.fujitsu.com> wrote:
>> >
>> >> After a task's cpuset's mems are changed while 'memory_spread_page' is set,
>> >> the task still allocates page cache on the old nodes. This is caused by the
>> >> task's stale mems_allowed: the current kernel does not update it unless some
>> >> function invokes cpuset_update_task_memory_state(), which is sometimes too
>> >> late. We must update each task's mems_allowed promptly.
>> >>
>> >> Slab has the same problem.
>> >>
>> >> Fix the bug by updating each task's mems_allowed and spread flags as soon
>> >> as its cpuset's mems or spread flags change.
>> >>
>> >>
>> >> ...
>> >>
>> >> --- a/kernel/kthread.c
>> >> +++ b/kernel/kthread.c
>> >> @@ -242,6 +242,7 @@ int kthreadd(void *unused)
>> >>    set_user_nice(tsk, KTHREAD_NICE_LEVEL);
>> >>    set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
>> >>
>> >> +  current->mems_allowed = node_possible_map;
>> >>    current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
>> >
>> > Why this change?  kthreadd() is called from rest_init(), before anyone
>> > has had a chance to alter ->mems_allowed?
>>
>> I found that kthreadd's mems_allowed was not initialized after applying this
>> patch; every bit of it is 1, so...
>> Maybe it is redundant.
>
> I think it is redundant.  kthreadd's mems_allowed _should_ be all-ones.
> Or at least, all-nodes-allowed.
>
> I wasn't able to find out where the setting of init's mems_allowed
> happens, after a bit of grepping and hunting.  It should be done within
> INIT_TASK, but isn't.
>
> Still, kthreadd is reliably parented by swapper, and there should be no
> need to alter its mems_allowed.
>
> Similarly, what was the reason for setting current->mems_allowed in
> kernel_init()?  That also should be unneeded.
>
> Finally, I've somewhat lost track of where we are with this patch.
> Paul, do you see any other remaining issues?

AFAICS this patch still has a race between a thread reading its
mems_allowed, and another thread updating it. The current architecture
of having task->mems_allowed be only updatable by current was PaulJ's
code originally, and I'm a bit loathe to touch it. But if we're going
to, we'll need at the minimum to add a lock for any code that touches
current->mems_allowed.

Paul


* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-03 22:16     ` Andrew Morton
  2009-02-03 22:49       ` Paul Menage
@ 2009-02-04  9:03       ` Miao Xie
  1 sibling, 0 replies; 24+ messages in thread
From: Miao Xie @ 2009-02-04  9:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: mingo, menage, linux-kernel, cl, nickpiggin

on 2009-2-4 6:16 Andrew Morton wrote:
> On Tue, 03 Feb 2009 11:25:25 +0800
> Miao Xie <miaox@cn.fujitsu.com> wrote:
> 
>> on 2009-1-28 6:42 Andrew Morton wrote:
>>> On Wed, 21 Jan 2009 16:06:20 +0800
>>> Miao Xie <miaox@cn.fujitsu.com> wrote:
>>>
>>>> After a task's cpuset's mems are changed while 'memory_spread_page' is set,
>>>> the task still allocates page cache on the old nodes. This is caused by the
>>>> task's stale mems_allowed: the current kernel does not update it unless some
>>>> function invokes cpuset_update_task_memory_state(), which is sometimes too
>>>> late. We must update each task's mems_allowed promptly.
>>>>
>>>> Slab has the same problem.
>>>>
>>>> Fix the bug by updating each task's mems_allowed and spread flags as soon
>>>> as its cpuset's mems or spread flags change.
>>>>
>>>>
>>>> ...
>>>>
>>>> --- a/kernel/kthread.c
>>>> +++ b/kernel/kthread.c
>>>> @@ -242,6 +242,7 @@ int kthreadd(void *unused)
>>>>  	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
>>>>  	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
>>>>  
>>>> +	current->mems_allowed = node_possible_map;
>>>>  	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
>>> Why this change?  kthreadd() is called from rest_init(), before anyone
>>> has had a chance to alter ->mems_allowed?
>> I found that kthreadd's mems_allowed was not initialized after applying this
>> patch; every bit of it is 1, so...
>> Maybe it is redundant.
> 
> I think it is redundant.  kthreadd's mems_allowed _should_ be all-ones.
> Or at least, all-nodes-allowed.
> 
> I wasn't able to find out where the setting of init's mems_allowed
> happens, after a bit of grepping and hunting.  It should be done within
> INIT_TASK, but isn't.

I found it. The call trace is as follows:
start_kernel()
  ->build_all_zonelists()
      ->cpuset_init_current_mems_allowed()
          ->nodes_setall(current->mems_allowed);
Then it is inherited by kthreadd.

> 
> Still, kthreadd is reliably parented by swapper, and there should be no
> need to alter its mems_allowed.

But it looks strange to users, because the system does not actually have that
many nodes, so I think setting init's mems_allowed from node_possible_map is
better.

> Similarly, what was the reason for setting current->mems_allowed in
> kernel_init()?  That also should be unneeded.

The same as above.

> 
> Finally, I've somewhat lost track of where we are with this patch. 
> Paul, do you see any other remaining issues?
> 
> 
> 
> 




* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-03 22:49       ` Paul Menage
@ 2009-02-04  9:31         ` Miao Xie
  2009-02-06 19:19           ` Paul Menage
  0 siblings, 1 reply; 24+ messages in thread
From: Miao Xie @ 2009-02-04  9:31 UTC (permalink / raw)
  To: Paul Menage; +Cc: Andrew Morton, mingo, linux-kernel, cl, nickpiggin

on 2009-2-4 6:49 Paul Menage wrote:
> On Tue, Feb 3, 2009 at 2:16 PM, Andrew Morton <akpm@linux-foundation.org> wrote:
>> On Tue, 03 Feb 2009 11:25:25 +0800
>> Miao Xie <miaox@cn.fujitsu.com> wrote:
[snip]
>>
>> I wasn't able to find out where the setting of init'smems_allowed
>> happens, after a bit of grepping and hunting.  It should be done within
>> INIT_TASK, but isn't.
>>
>> Still, kthreadd is reliably parented by swapper, and there shold be no
>> need to alter its mems_allowed.
>>
>> Similarly, what was the reason for setting current->mems_allowed in
>> kernel_init()?  That also should be unneeded.
>>
>> Finally, I've somewhat lost track of where we are with this patch.
>> Paul, do you see any other remaining issues?
> 
> AFAICS this patch still has a race between a thread reading its
> mems_allowed, and another thread updating it. The current architecture
> of having task->mems_allowed be only updatable by current was PaulJ's
> code originally, and I'm a bit loathe to touch it. But if we're going
> to, we'll need at the minimum to add a lock for any code that touches
> current->mems_allowed.

Agreed! But mems_allowed is touched throughout the memory management code,
so adding a lock to protect mems_allowed may lead to a performance
regression.

> 
> Paul
> 
> 
> 




* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-04  9:31         ` Miao Xie
@ 2009-02-06 19:19           ` Paul Menage
  2009-02-09  4:02             ` Nick Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Paul Menage @ 2009-02-06 19:19 UTC (permalink / raw)
  To: miaox; +Cc: Andrew Morton, mingo, linux-kernel, cl, nickpiggin

On Wed, Feb 4, 2009 at 1:31 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
>> AFAICS this patch still has a race between a thread reading its
>> mems_allowed, and another thread updating it. The current architecture
>> of having task->mems_allowed be only updatable by current was PaulJ's
>> code originally, and I'm a bit loathe to touch it. But if we're going
>> to, we'll need at the minimum to add a lock for any code that touches
>> current->mems_allowed.
>
> Agreed! But mems_allowed is touched throughout the memory management code,
> so adding a lock to protect mems_allowed may lead to a performance
> regression.
>

I've asked Andrew to remove this from -mm until we can figure out:

- whether it's needed
- how to do it safely.

Basically the problem that you're seeing is that when you move a task
into a cpuset that has memory_spread_page=1, it fails to call
cpuset_update_task_memory_state()?

One thing missing from your test - can you verify that the test
program really did get migrated into the child cpuset? If it failed
(and echo failed to report the error) that would explain what you're
seeing.

Paul


* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-06 19:19           ` Paul Menage
@ 2009-02-09  4:02             ` Nick Piggin
  2009-02-10 11:37               ` Paul Menage
  0 siblings, 1 reply; 24+ messages in thread
From: Nick Piggin @ 2009-02-09  4:02 UTC (permalink / raw)
  To: Paul Menage; +Cc: miaox, Andrew Morton, mingo, linux-kernel, cl

On Saturday 07 February 2009 06:19:27 Paul Menage wrote:
> On Wed, Feb 4, 2009 at 1:31 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> >> AFAICS this patch still has a race between a thread reading its
> >> mems_allowed, and another thread updating it. The current architecture
> >> of having task->mems_allowed be only updatable by current was PaulJ's
> >> code originally, and I'm a bit loathe to touch it. But if we're going
> >> to, we'll need at the minimum to add a lock for any code that touches
> >> current->mems_allowed.
> >
> > Agreed! But mems_allowed is touched throughout the memory management code,
> > so adding a lock to protect mems_allowed may lead to a performance
> > regression.
>
> I've asked Andrew to remove this from -mm until we can figure out:
>
> - whether it's needed
> - how to do it safely.

Is it a problem if mems_allowed can get sampled in an unsafe way?
It will happen only quite rarely. This code seems to be far simpler
and more robust than the current fragile scheme, so it would be
nice to use it.



* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-09  4:02             ` Nick Piggin
@ 2009-02-10 11:37               ` Paul Menage
  2009-02-12  0:54                 ` Nick Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Paul Menage @ 2009-02-10 11:37 UTC (permalink / raw)
  To: Nick Piggin; +Cc: miaox, Andrew Morton, mingo, linux-kernel, cl

On Sun, Feb 8, 2009 at 8:02 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Is it a problem if mems_allowed can get sampled in an unsafe way?

It could cause an OOM to be incorrectly triggered if the task didn't
see all the mems that it was meant to have access to. (In the extreme
case, it could see no mems at all).

> It will happen only quite rarely. This code seems to be far simpler
> and more robust than the current fragile scheme, so it would be
> nice to use it.

Agreed, the existing task->mems_allowed / cpuset_update_task_memory_state()
code is a bit unwieldy. It's pretty much all inherited from the
original cpusets code.

A nicer way to do it might be:

- get rid of task->mems_allowed entirely

- have cpuset->mems_allowed be a pointer to an immutable RCU-protected
nodemask (so updating the nodemask for a cpuset would always replace
it with a fresh one and RCU-free the old one; see the sketch below)

- make cpuset_zone_allowed_*() use rcu_read_lock() and just check
*(task_cs(current)->mems_allowed) rather than current->mems_allowed

- ensure that get_page_from_freelist() is also RCU-safe (right now I
think it's not since cpuset_zone_allowed_softwall() can sleep, but I
think cgroups/cpusets is sufficiently RCU-safe now that we could quite
likely remove the mutex lock from cpuset_zone_allowed_*())

- add an rcu_read_lock() in hugetlb.c:cpuset_mems_nr()

- figure out where the mempolicy updates currently done in
cpuset_update_task_memory_state() need to occur - this is part of the
code that I'm pretty fuzzy on. (Maybe we can just copy the bits from
Miao's patch for this?)
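
To make the RCU-protected nodemask bullet concrete, here's a rough
sketch of the two sides. The function names are made up, and I'm
assuming the published mask is heap-allocated and never modified in
place; only the rcu_*() calls are the standard kernel primitives.

/* Writer, serialized by cgroup_mutex: publish a fresh mask and
 * free the old one after a grace period. */
static int cpuset_replace_mems(struct cpuset *cs, const nodemask_t *newmems)
{
	nodemask_t *fresh, *old;

	fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
	if (!fresh)
		return -ENOMEM;
	*fresh = *newmems;

	old = cs->mems_allowed;
	rcu_assign_pointer(cs->mems_allowed, fresh);
	synchronize_rcu();		/* wait out readers of 'old' */
	kfree(old);
	return 0;
}

/* Reader: sample the whole mask inside one RCU read-side section. */
static int cpuset_node_allowed_sketch(struct cpuset *cs, int node)
{
	int allowed;

	rcu_read_lock();
	allowed = node_isset(node, *rcu_dereference(cs->mems_allowed));
	rcu_read_unlock();
	return allowed;
}

The reader never blocks and always sees a complete mask (either the old
one or the new one); the cost is one extra pointer dereference per check.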

Anything else?

Paul


* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-10 11:37               ` Paul Menage
@ 2009-02-12  0:54                 ` Nick Piggin
  2009-02-12  1:19                   ` Paul Menage
  0 siblings, 1 reply; 24+ messages in thread
From: Nick Piggin @ 2009-02-12  0:54 UTC (permalink / raw)
  To: Paul Menage; +Cc: miaox, Andrew Morton, mingo, linux-kernel, cl

On Tuesday 10 February 2009 22:37:28 Paul Menage wrote:
> On Sun, Feb 8, 2009 at 8:02 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > Is it a problem if mems_allowed can get sampled in an unsafe way?
>
> It could cause an OOM to be incorrectly triggered if the task didn't
> see all the mems that it was meant to have access to. (In the extreme
> case, it could see no mems at all).

You can't be overly worried about concurrency cases. We're talking about
two threads here, one changing mems_allowed, and the other performing
an allocation.

It would be possible, depending on timing, for the allocating thread to
see either pre or post mems_allowed even if access was fully locked.

The only difference is that a partially changed mems_allowed could be
seen. But what does this really mean? Some combination of the new and
the old nodes. I don't think this is too much of a problem.


> > It will happen only quite rarely. This code seems to be far simpler
> > and more robust than the current fragile scheme, so it would be
> > nice to use it.
>
> Agreed, the existing task->mems_allowed / cpuset_update_memory_state()
> code is a bit unwieldy. It's pretty much all inherited from the
> original cpusets code.
>
> A nicer way to do it might be:
>
> - get rid of task->mems_allowed entirely
>
> - have cpuset->mems_allowed be a pointer to an immutable RCU-protected
> nodemask (so updating the nodemask for a cpuset would always replace
> it with a fresh one and RCU-free the old one)
>
> - make cpuset_zone_allowed_*() use rcu_read_lock() and just check
> *(task_cs(current)->mems_allowed) rather than current->mems_allowed
>
> - ensure that get_page_from_freelist() is also RCU-safe (right now I
> think it's not since cpuset_zone_allowed_softwall() can sleep, but I
> think cgroups/cpusets is sufficiently RCU-safe now that we could quite
> likely remove the mutex lock from cpuset_zone_allowed_*()
>
> - add an rcu_read_lock() in hugetlb.c:cpuset_mems_nr()

This could work if we *really* need an atomic snapshot of mems_allowed.
seqcount synchronisation would be an alternative too that could allow
sleeping more easily than SRCU (OTOH if you don't need sleeping, then
RCU should be faster than seqcount).

But I'm not convinced we do need this to be atomic.
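
For reference, the seqcount variant would look roughly like this
(tsk->mems_seq is a hypothetical seqcount_t field; the begin/retry
helpers are the usual <linux/seqlock.h> API):

/* Reader: retry the copy if an update raced with us. */
static nodemask_t get_mems_snapshot(struct task_struct *tsk)
{
	nodemask_t snap;
	unsigned seq;

	do {
		seq = read_seqcount_begin(&tsk->mems_seq);
		snap = tsk->mems_allowed;
	} while (read_seqcount_retry(&tsk->mems_seq, seq));
	return snap;
}

/* Writer, serialized by e.g. task_lock(tsk). */
static void set_mems_snapshot(struct task_struct *tsk,
			      const nodemask_t *newmems)
{
	write_seqcount_begin(&tsk->mems_seq);
	tsk->mems_allowed = *newmems;
	write_seqcount_end(&tsk->mems_seq);
}

That way readers always see either the whole old mask or the whole new
one, at the price of a possible retry loop in the allocator fastpath.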


> - figure out where the mempolicy updates currently done in
> cpuset_update_task_memory_state() need to occur - this is part of the
> code that I'm pretty fuzzy on. (Maybe we can just copy the bits from
> Miao's patch for this?)

Basically we want to push all that into the sites where the memory policy
is actually changed. So yes they should all go away.



* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-12  0:54                 ` Nick Piggin
@ 2009-02-12  1:19                   ` Paul Menage
  2009-02-12  1:55                     ` Nick Piggin
  2009-02-12  5:57                     ` Miao Xie
  0 siblings, 2 replies; 24+ messages in thread
From: Paul Menage @ 2009-02-12  1:19 UTC (permalink / raw)
  To: Nick Piggin, Paul Jackson; +Cc: miaox, Andrew Morton, mingo, linux-kernel, cl

On Wed, Feb 11, 2009 at 4:54 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> It would be possible, depending on timing, for the allocating thread to
> see either pre or post mems_allowed even if access was fully locked.

Right - seeing either the pre set or the post set is fine.

>
> The only difference is that a partially changed mems_allowed could be
> seen. But what does this really mean? Some combination of the new and
> the old nodes. I don't think this is too much of a problem.

But if the old and new nodes are disjoint, that could lead to seeing no nodes.

Also, having the results of cpuset_zone_allowed() and
cpuset_current_mems_allowed change at random times over the course of
a call to alloc_pages() might cause interesting effects (e.g. we make
progress freeing pages from one set of nodes, and then call
get_page_from_freelist() on a different set of nodes).

>
> This could work if we *really* need an atomic snapshot of mems_allowed.
> seqcount synchronisation would be an alternative too that could allow
> sleeping more easily than SRCU (OTOH if you don't need sleeping, then
> RCU should be faster than seqcount).
>
> But I'm not convinced we do need this to be atomic.

It's possible that I'm being overly-paranoid here. The decision to
make mems_allowed updates be purely pulled by the task itself predates
my involvement with cpusets code by a long time. Paul Jackson (CC'd)
may have opinions here, but I suspect his sgi.com email address no
longer works, and I don't have any more recent address for him.

Paul


* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-12  1:19                   ` Paul Menage
@ 2009-02-12  1:55                     ` Nick Piggin
  2009-02-12  1:58                       ` Paul Menage
                                         ` (2 more replies)
  2009-02-12  5:57                     ` Miao Xie
  1 sibling, 3 replies; 24+ messages in thread
From: Nick Piggin @ 2009-02-12  1:55 UTC (permalink / raw)
  To: Paul Menage; +Cc: Paul Jackson, miaox, Andrew Morton, mingo, linux-kernel, cl

On Thursday 12 February 2009 12:19:11 Paul Menage wrote:
> On Wed, Feb 11, 2009 at 4:54 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > It would be possible, depending on timing, for the allocating thread to
> > see either pre or post mems_allowed even if access was fully locked.
>
> Right - seeing either the pre set or the post set is fine.
>
> > The only difference is that a partially changed mems_allowed could be
> > seen. But what does this really mean? Some combination of the new and
> > the old nodes. I don't think this is too much of a problem.
>
> But if the old and new nodes are disjoint, that could lead to seeing no
> nodes.

Well we could structure updates as setting all new allowed nodes,
then clearing newly disallowed ones.
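
Something along these lines (the helper name is made up; writers are
assumed to be serialized against each other, and readers may see a mix
of old and new nodes but never an empty mask):

/* Move *mask to *newmask in two steps: old -> old|new -> new.
 * Every intermediate state is a superset of either the old or the
 * new mask, so a racing reader never sees no nodes at all. */
static void nodemask_update_nonempty(nodemask_t *mask,
				     const nodemask_t *newmask)
{
	nodes_or(*mask, *mask, *newmask);	/* grow to old | new */
	nodes_and(*mask, *mask, *newmask);	/* shrink to new */
}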


> Also, having the results of cpuset_zone_allowed() and
> cpuset_current_mems_allowed change at random times over the course of
> a call to alloc_pages() might cause interesting effects (e.g. we make
> progress freeing pages from one set of nodes, and then call
> get_page_from_freelist() on a different set of nodes).

But again, is this really a problem? We're talking about a tiny
possibility in a very uncommon case anyway when the cpuset is
changing.

If it can cause an outright error like OOM of course that's no
good, but if it just requires us to go around the reclaim loop
or allocate from another zone... I don't think that's so bad.


> > This could work if we *really* need an atomic snapshot of mems_allowed.
> > seqcount synchronisation would be an alternative too that could allow
> > sleeping more easily than SRCU (OTOH if you don't need sleeping, then
> > RCU should be faster than seqcount).
> >
> > But I'm not convinced we do need this to be atomic.
>
> It's possible that I'm being overly-paranoid here. The decision to
> make mems_allowed updates be purely pulled by the task itself predates
> my involvement with cpusets code by a long time.

It's not such a bad model, but the problem with it is that it needs
to be carefully spread over the VM, and in fastpaths too. Now if it
were something really critical, fine, but I'm hoping we can do
without.




* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-12  1:55                     ` Nick Piggin
@ 2009-02-12  1:58                       ` Paul Menage
  2009-02-12  8:23                       ` Miao Xie
  2009-02-12  8:27                       ` Miao Xie
  2 siblings, 0 replies; 24+ messages in thread
From: Paul Menage @ 2009-02-12  1:58 UTC (permalink / raw)
  To: Nick Piggin; +Cc: miaox, Andrew Morton, mingo, linux-kernel, cl

On Wed, Feb 11, 2009 at 5:55 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>> But if the old and new nodes are disjoint, that could lead to seeing no
>> nodes.
>
> Well we could structure updates as setting all new allowed nodes,
> then clearing newly disallowed ones.
>

OK, good point.

Paul


* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-12  1:19                   ` Paul Menage
  2009-02-12  1:55                     ` Nick Piggin
@ 2009-02-12  5:57                     ` Miao Xie
  2009-02-12 11:06                       ` Paul Jackson
  1 sibling, 1 reply; 24+ messages in thread
From: Miao Xie @ 2009-02-12  5:57 UTC (permalink / raw)
  To: Paul Menage
  Cc: Nick Piggin, Paul Jackson, Andrew Morton, mingo, linux-kernel,
	cl, Derek Fults

on 2009-2-12 9:19 Paul Menage wrote:
> On Wed, Feb 11, 2009 at 4:54 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>> It would be possible, depending on timing, for the allocating thread to
>> see either pre or post mems_allowed even if access was fully locked.
> 
> Right - seeing either the pre set or the post set is fine.
> 
>> The only difference is that a partially changed mems_allowed could be
>> seen. But what does this really mean? Some combination of the new and
>> the old nodes. I don't think this is too much of a problem.
> 
> But if the old and new nodes are disjoint, that could lead to seeing no nodes.
> 
> Also, having the results of cpuset_zone_allowed() and
> cpuset_current_mems_allowed change at random times over the course of
> a call to alloc_pages() might cause interesting effects (e.g. we make
> progress freeing pages from one set of nodes, and then call
> get_page_from_freelist() on a different set of nodes).
> 
>> This could work if we *really* need an atomic snapshot of mems_allowed.
>> seqcount synchronisation would be an alternative too that could allow
>> sleeping more easily than SRCU (OTOH if you don't need sleeping, then
>> RCU should be faster than seqcount).
>>
>> But I'm not convinced we do need this to be atomic.
> 
> It's possible that I'm being overly-paranoid here. The decision to
> make mems_allowed updates be purely pulled by the task itself predates
> my involvement with cpusets code by a long time. Paul Jackson (CC'd)
> may have opinions here, but I suspect his sgi.com email address no
> longer works, and I don't have any more recent address for him.

I think it's pj@usa.net (CC'd).

Author: Paul Jackson <pj@sgi.com>
Date:   Fri Oct 3 15:23:42 2008 -0700

    cpusets: remove pj from cpuset maintainers

    Remove myself from the kernel MAINTAINERS file for cpusets.  I am leaving
    SGI and probably will not be active in Linux kernel work.  I can be
    reached at <pj@usa.net>.  Contact Derek Fults <dfults@sgi.com> for future
    SGI+cpuset related issues.  I'm off to the next chapter of this good life.

    Signed-off-by: Paul Jackson <pj@sgi.com>
    Cc: Paul Menage <menage@google.com>
    Cc: Derek Fults <dfults@sgi.com>
    Cc: John Hesterberg <jh@sgi.com>
    Cc: Paul Jackson <pj@usa.net>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Thanks!
Miao

> 
> Paul
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-12  1:55                     ` Nick Piggin
  2009-02-12  1:58                       ` Paul Menage
@ 2009-02-12  8:23                       ` Miao Xie
  2009-02-12 21:53                         ` Paul Menage
  2009-02-12  8:27                       ` Miao Xie
  2 siblings, 1 reply; 24+ messages in thread
From: Miao Xie @ 2009-02-12  8:23 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Paul Menage, Paul Jackson, Andrew Morton, mingo, linux-kernel, cl

on 2009-2-12 9:55 Nick Piggin wrote:
> On Thursday 12 February 2009 12:19:11 Paul Menage wrote:
>> On Wed, Feb 11, 2009 at 4:54 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>> It would be possible, depending on timing, for the allocating thread to
>>> see either pre or post mems_allowed even if access was fully locked.
>> Right - seeing either the pre set or the post set is fine.
>>
>>> The only difference is that a partially changed mems_allowed could be
>>> seen. But what does this really mean? Some combination of the new and
>>> the old nodes. I don't think this is too much of a problem.
>> But if the old and new nodes are disjoint, that could lead to seeing no
>> nodes.
> 
> Well we could structure updates as setting all new allowed nodes,
> then clearing newly disallowed ones.

But it still has another problem, such as:
Task1							Task2
get_page_from_freelist()				while(1) {
{
	for_each_zone_zonelist_nodemask() {
								change Task1's mems_allowed
		if (!cpuset_zone_allowed_softwall())
			goto try_next_zone;
try_next_zone:
		...
	}
}							}

In the extreme case, Task1 may be completely unable to allocate memory;
at the least, its page allocations will be delayed. Though the
probability of this case is very low, we have to take it into account.
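
To make the window concrete, below is a rough sketch of the check
inside get_page_from_freelist() that is racing with the update
(simplified from the 2.6.29-era mm/page_alloc.c, details elided):

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					high_zoneidx, nodemask) {
		/*
		 * This re-reads current->mems_allowed on every
		 * iteration; if Task2 keeps rewriting the mask, each
		 * zone in turn can appear disallowed and we always
		 * take the try_next_zone path without allocating.
		 */
		if ((alloc_flags & ALLOC_CPUSET) &&
		    !cpuset_zone_allowed_softwall(zone, gfp_mask))
			goto try_next_zone;
		/* ... otherwise try to allocate from this zone ... */
try_next_zone:
		;
	}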

Thanks!
Miao

> 
> 
>> Also, having the results of cpuset_zone_allowed() and
>> cpuset_current_mems_allowed change at random times over the course of
>> a call to alloc_pages() might cause interesting effects (e.g. we make
>> progress freeing pages from one set of nodes, and then call
>> get_page_from_freelist() on a different set of nodes).
> 
> But again, is this really a problem? We're talking about a tiny
> possibility in a very uncommon case anyway when the cpuset is
> changing.
> 
> If it can cause an outright error like OOM of course that's no
> good, but if it just requires us to go around the reclaim loop
> or allocate from another zone... I don't think that's so bad.
> 
> 
>>> This could work if we *really* need an atomic snapshot of mems_allowed.
>>> seqcount synchronisation would be an alternative too that could allow
>>> sleeping more easily than SRCU (OTOH if you don't need sleeping, then
>>> RCU should be faster than seqcount).
>>>
>>> But I'm not convinced we do need this to be atomic.
>> It's possible that I'm being overly-paranoid here. The decision to
>> make mems_allowed updates be purely pulled by the task itself predates
>> my involvement with cpusets code by a long time.
> 
> It's not such a bad model, but the problem with it is that it needs
> to be carefully spread over the VM, and in fastpaths too. Now if it
> were something really critical, fine, but I'm hoping we can do
> without.
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-12  1:55                     ` Nick Piggin
  2009-02-12  1:58                       ` Paul Menage
  2009-02-12  8:23                       ` Miao Xie
@ 2009-02-12  8:27                       ` Miao Xie
  2009-02-12 10:40                         ` Nick Piggin
  2 siblings, 1 reply; 24+ messages in thread
From: Miao Xie @ 2009-02-12  8:27 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Paul Menage, Paul Jackson, Andrew Morton, mingo, linux-kernel,
	cl, Derek Fults

on 2009-2-12 9:55 Nick Piggin wrote:
> On Thursday 12 February 2009 12:19:11 Paul Menage wrote:
>> On Wed, Feb 11, 2009 at 4:54 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>> It would be possible, depending on timing, for the allocating thread to
>>> see either pre or post mems_allowed even if access was fully locked.
>> Right - seeing either the pre set or the post set is fine.
>>
>>> The only difference is that a partially changed mems_allowed could be
>>> seen. But what does this really mean? Some combination of the new and
>>> the old nodes. I don't think this is too much of a problem.
>> But if the old and new nodes are disjoint, that could lead to seeing no
>> nodes.
> 
> Well we could structure updates as setting all new allowed nodes,
> then clearing newly disallowed ones.

But it still has another problem, such as:
Task1							Task2
get_page_from_freelist()				while(1) {
{
	for_each_zone_zonelist_nodemask() {
								change Task1's mems_allowed
		if (!cpuset_zone_allowed_softwall())
			goto try_next_zone;
try_next_zone:
		...
	}
}							}

In the extreme case, Task1 may be completely unable to allocate memory;
at the least, its page allocations will be delayed. Though the
probability of this case is very low, we have to take it into account.

Thanks!
Miao

> 
> 
>> Also, having the results of cpuset_zone_allowed() and
>> cpuset_current_mems_allowed change at random times over the course of
>> a call to alloc_pages() might cause interesting effects (e.g. we make
>> progress freeing pages from one set of nodes, and then call
>> get_page_from_freelist() on a different set of nodes).
> 
> But again, is this really a problem? We're talking about a tiny
> possibility in a very uncommon case anyway when the cpuset is
> changing.
> 
> If it can cause an outright error like OOM of course that's no
> good, but if it just requires us to go around the reclaim loop
> or allocate from another zone... I don't think that's so bad.
> 
> 
>>> This could work if we *really* need an atomic snapshot of mems_allowed.
>>> seqcount synchronisation would be an alternative too that could allow
>>> sleeping more easily than SRCU (OTOH if you don't need sleeping, then
>>> RCU should be faster than seqcount).
>>>
>>> But I'm not convinced we do need this to be atomic.
>> It's possible that I'm being overly-paranoid here. The decision to
>> make mems_allowed updates be purely pulled by the task itself predates
>> my involvement with cpusets code by a long time.
> 
> It's not such a bad model, but the problem with it is that it needs
> to be carefully spread over the VM, and in fastpaths too. Now if it
> were something really critical, fine, but I'm hoping we can do
> without.
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-12  8:27                       ` Miao Xie
@ 2009-02-12 10:40                         ` Nick Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nick Piggin @ 2009-02-12 10:40 UTC (permalink / raw)
  To: miaox
  Cc: Paul Menage, Paul Jackson, Andrew Morton, mingo, linux-kernel,
	cl, Derek Fults

On Thursday 12 February 2009 19:27:16 Miao Xie wrote:
> on 2009-2-12 9:55 Nick Piggin wrote:
> > On Thursday 12 February 2009 12:19:11 Paul Menage wrote:
> >> On Wed, Feb 11, 2009 at 4:54 PM, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> >>> It would be possible, depending on timing, for the allocating thread to
> >>> see either pre or post mems_allowed even if access was fully locked.
> >>
> >> Right - seeing either the pre set or the post set is fine.
> >>
> >>> The only difference is that a partially changed mems_allowed could be
> >>> seen. But what does this really mean? Some combination of the new and
> >>> the old nodes. I don't think this is too much of a problem.
> >>
> >> But if the old and new nodes are disjoint, that could lead to seeing no
> >> nodes.
> >
> > Well we could structure updates as setting all new allowed nodes,
> > then clearing newly disallowed ones.
>
> But it still has another problem, such as:
> Task1							Task2
> get_page_from_freelist()				while(1) {
> {
> 	for_each_zone_zonelist_nodemask() {
> 								change Task1's mems_allowed
> 		if (!cpuset_zone_allowed_softwall())
> 			goto try_next_zone;
> try_next_zone:
> 		...
> 	}
> }							}
>
> In the extreme case, Task1 may be completely unable to allocate memory;
> at the least, its page allocations will be delayed. Though the
> probability of this case is very low, we have to take it into account.

Hmm, but a task with that permission could kill task1 in a number
of ways. Is it really worth worrying about?


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
  2009-02-12  5:57                     ` Miao Xie
@ 2009-02-12 11:06                       ` Paul Jackson
  0 siblings, 0 replies; 24+ messages in thread
From: Paul Jackson @ 2009-02-12 11:06 UTC (permalink / raw)
  To: miaox, Paul Menage
  Cc: Nick Piggin, Andrew Morton, mingo, linux-kernel, cl, Derek Fults

Paul M wrote:
> I think it's pj@usa.net

That's my email address, yes.

But my life has moved further and faster from
Linux kernels and cpusets than makes sense to
describe here.

Good luck with whatever you're discussing ;).
-- 
                Paul Jackson
                pj@usa.net


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] cpuset: fix allocating page cache/slab object on the  unallowed node when memory spread is set
  2009-02-12  8:23                       ` Miao Xie
@ 2009-02-12 21:53                         ` Paul Menage
  0 siblings, 0 replies; 24+ messages in thread
From: Paul Menage @ 2009-02-12 21:53 UTC (permalink / raw)
  To: miaox; +Cc: Nick Piggin, Paul Jackson, Andrew Morton, mingo, linux-kernel, cl

On Thu, Feb 12, 2009 at 12:23 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
>
> In the extreme case, Task1 may be completely unable to allocate memory;
> at the least, its page allocations will be delayed. Though the
> probability of this case is very low, we have to take it into account.

As Nick says, this is basically maliciousness on the part of
(privileged) Task2. I don't think it's a situation we need to even try
to deal with (other than not oopsing, and doing the right thing once
Task2 stops being an ass).

Paul

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2009-02-12 21:54 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-21  8:06 [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set Miao Xie
2009-01-21  8:30 ` Nick Piggin
2009-01-21 10:41 ` Paul Menage
2009-02-03  3:05   ` Miao Xie
2009-01-27 22:42 ` Andrew Morton
2009-01-28 16:38   ` Christoph Lameter
2009-02-03  3:25   ` Miao Xie
2009-02-03 22:16     ` Andrew Morton
2009-02-03 22:49       ` Paul Menage
2009-02-04  9:31         ` Miao Xie
2009-02-06 19:19           ` Paul Menage
2009-02-09  4:02             ` Nick Piggin
2009-02-10 11:37               ` Paul Menage
2009-02-12  0:54                 ` Nick Piggin
2009-02-12  1:19                   ` Paul Menage
2009-02-12  1:55                     ` Nick Piggin
2009-02-12  1:58                       ` Paul Menage
2009-02-12  8:23                       ` Miao Xie
2009-02-12 21:53                         ` Paul Menage
2009-02-12  8:27                       ` Miao Xie
2009-02-12 10:40                         ` Nick Piggin
2009-02-12  5:57                     ` Miao Xie
2009-02-12 11:06                       ` Paul Jackson
2009-02-04  9:03       ` Miao Xie
