* [PATCH] workqueue: update numa affinity when node hotplug
@ 2015-02-27 10:04 Gu Zheng
  2015-02-27 11:54 ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: Gu Zheng @ 2015-02-27 10:04 UTC (permalink / raw)
  To: tj
  Cc: linux-kernel, laijs, isimatu.yasuaki, kamezawa.hiroyu, tangchen,
	guz.fnst

Yasuaki Ishimatsu found that the cpu<->node relationship can change with
node online/offline. Workqueue relies on information that was established
at boot time, but node hotplug may change it.

Once pool->node points to a stale node, the following allocation failure
happens:
  ==
     SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
      cache: kmalloc-192, object size: 192, buffer size: 192, default order: 1, min order: 0
      node 0: slabs: 6172, objs: 259224, free: 245741
      node 1: slabs: 3261, objs: 136962, free: 127656
  ==
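
The failure is, in essence, a node-affine allocation made against the cached
pool->node.  Roughly this shape (an illustrative fragment, not the exact call
chain in workqueue.c; alloc_worker_on() is a made-up name):

/* Illustrative only: pool->node is cached at boot.  After node hotplug it
 * can name a node that no longer has memory, and node-affine allocations
 * against it start failing with the SLUB warning above. */
static struct worker *alloc_worker_on(struct worker_pool *pool)
{
	struct worker *worker;

	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, pool->node);
	if (!worker)
		return NULL;	/* "SLUB: Unable to allocate memory on node N" */

	return worker;
}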

This patch uses the present cpumask directly rather than the self-maintained
wq_numa_possible_cpumask, so that the workqueue becomes a simple consumer of
NUMA information, and it updates the per-cpu workqueue pools' node affinity
when a NUMA node changes, via a notifier callback registered for the node
on/down events raised from try_online_node()/try_offline_node().

Unbound workqueues' per-node pools are already updated by wq_update_unbound_numa()
at CPU_DOWN_PREPARE of the last CPU, by existing code.
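
For reference, the new notifier interface added to memory_hotplug.h is
consumed roughly like this (a minimal sketch; "foo" is a placeholder consumer
invented for illustration, the real user in this patch is the workqueue
callback in the diff below):

#include <linux/init.h>
#include <linux/memory_hotplug.h>
#include <linux/notifier.h>
#include <linux/printk.h>

/* placeholder consumer: refresh cached cpu<->node data on node hotplug */
static int foo_node_callback(struct notifier_block *self,
			     unsigned long action, void *arg)
{
	int node = (unsigned long)arg;

	switch (action) {
	case NODE_ON:		/* node was just onlined */
		pr_info("foo: node %d is now online\n", node);
		/* rebind cached per-node state to @node here */
		break;
	case NODE_DOWN:		/* node is going away */
		pr_info("foo: node %d is going down\n", node);
		/* drop references to @node, e.g. fall back to NUMA_NO_NODE */
		break;
	}

	return 0;
}

static struct notifier_block foo_node_nb = {
	.notifier_call	= foo_node_callback,
};

static int __init foo_init(void)
{
	return register_node_notifier(&foo_node_nb);
}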

Reported-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
---
 include/linux/memory_hotplug.h |    8 +++
 kernel/workqueue.c             |   93 +++++++++++++++++++++++++--------------
 mm/memory_hotplug.c            |   44 +++++++++++++++++++
 3 files changed, 111 insertions(+), 34 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8f1a419..07bb94e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,6 +98,14 @@ extern void __online_page_free(struct page *page);
 
 extern int try_online_node(int nid);
 
+/* event type of node on/down */
+#define NODE_ON		0x00001
+#define NODE_DOWN		0x00002
+
+extern int __ref register_node_notifier(struct notifier_block *nb);
+extern int __ref unregister_node_notifier(struct notifier_block *nb);
+extern void get_present_cpumask_of_node(cpumask_var_t cpumask, int node);
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
 extern int arch_remove_memory(u64 start, u64 size);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9..60c5d29 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,9 +265,6 @@ struct workqueue_struct {
 
 static struct kmem_cache *pwq_cache;
 
-static cpumask_var_t *wq_numa_possible_cpumask;
-					/* possible CPUs of each node */
-
 static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
@@ -3493,10 +3490,18 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 	pool->attrs->no_numa = false;
 
 	/* if cpumask is contained inside a NUMA node, we belong to that node */
-	if (wq_numa_enabled) {
+	if (wq_numa_enabled && !cpumask_empty(pool->attrs->cpumask)) {
 		for_each_node(node) {
-			if (cpumask_subset(pool->attrs->cpumask,
-					   wq_numa_possible_cpumask[node])) {
+			int cpu;
+			bool is_sub_set = true;
+
+			for_each_cpu(cpu, pool->attrs->cpumask)
+				if (cpu_to_node(cpu) != node) {
+					is_sub_set = false;
+					break;
+				}
+
+			if (is_sub_set) {
 				pool->node = node;
 				break;
 			}
@@ -3717,8 +3722,9 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
 	if (cpumask_empty(cpumask))
 		goto use_dfl;
 
-	/* yeap, return possible CPUs in @node that @attrs wants */
-	cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+	/* yeap, return present CPUs in @node that @attrs wants */
+	get_present_cpumask_of_node(cpumask, node);
+	cpumask_and(cpumask, attrs->cpumask, cpumask);
 	return !cpumask_equal(cpumask, attrs->cpumask);
 
 use_dfl:
@@ -4564,6 +4570,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 					       void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
+	int node = cpu_to_node(cpu);
 	struct worker_pool *pool;
 	struct workqueue_struct *wq;
 	int pi;
@@ -4571,6 +4578,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
 		for_each_cpu_worker_pool(pool, cpu) {
+			pool->node = node;
 			if (pool->nr_workers)
 				continue;
 			if (!create_worker(pool))
@@ -4787,11 +4795,50 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
-static void __init wq_numa_init(void)
+static int wq_numa_callback(struct notifier_block *self,
+			    unsigned long action, void *arg)
 {
-	cpumask_var_t *tbl;
-	int node, cpu;
+	int node = (unsigned long)arg;
+	struct worker_pool *pool;
+	int pi;
+	int cpu;
 
+	switch (action) {
+	case NODE_DOWN:
+		mutex_lock(&wq_pool_mutex);
+		for_each_pool(pool, pi) {
+			if (pool->node == node) {
+				pool->node = NUMA_NO_NODE;
+				if (pool->cpu < 0)
+					hash_del(&pool->hash_node);
+			}
+		}
+		mutex_unlock(&wq_pool_mutex);
+		break;
+	case NODE_ON:
+		mutex_lock(&wq_pool_mutex);
+		for_each_present_cpu(cpu) {
+			if (node != cpu_to_node(cpu))
+				continue;
+			for_each_cpu_worker_pool(pool, cpu)
+				pool->node = node;
+		}
+		mutex_unlock(&wq_pool_mutex);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct notifier_block wq_numa_nb = {
+	.notifier_call = wq_numa_callback,
+	.priority = 0
+};
+
+static void __init wq_numa_init(void)
+{
 	if (num_possible_nodes() <= 1)
 		return;
 
@@ -4803,29 +4850,7 @@ static void __init wq_numa_init(void)
 	wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
 	BUG_ON(!wq_update_unbound_numa_attrs_buf);
 
-	/*
-	 * We want masks of possible CPUs of each node which isn't readily
-	 * available.  Build one from cpu_to_node() which should have been
-	 * fully initialized by now.
-	 */
-	tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
-	BUG_ON(!tbl);
-
-	for_each_node(node)
-		BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
-				node_online(node) ? node : NUMA_NO_NODE));
-
-	for_each_possible_cpu(cpu) {
-		node = cpu_to_node(cpu);
-		if (WARN_ON(node == NUMA_NO_NODE)) {
-			pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
-			/* happens iff arch is bonkers, let's just proceed */
-			return;
-		}
-		cpumask_set_cpu(cpu, tbl[node]);
-	}
-
-	wq_numa_possible_cpumask = tbl;
+	register_node_notifier(&wq_numa_nb);
 	wq_numa_enabled = true;
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fab107..1778628 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1132,6 +1132,44 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 	return;
 }
 
+static RAW_NOTIFIER_HEAD(node_chain);
+
+int __ref register_node_notifier(struct notifier_block *nb)
+{
+	int ret;
+
+	ret = raw_notifier_chain_register(&node_chain, nb);
+	return ret;
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+int __ref unregister_node_notifier(struct notifier_block *nb)
+{
+	int ret;
+
+	ret = raw_notifier_chain_unregister(&node_chain, nb);
+	return ret;
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+static int call_node_notify(unsigned long val, void *v)
+{
+	int ret;
+
+	ret = __raw_notifier_call_chain(&node_chain, val, v, -1, NULL);
+
+	return notifier_to_errno(ret);
+}
+
+void get_present_cpumask_of_node(cpumask_var_t cpumask, int node)
+{
+	unsigned int cpu;
+
+	for_each_present_cpu(cpu)
+		if (node == cpu_to_node(cpu))
+			cpumask_set_cpu(cpu, cpumask);
+}
+EXPORT_SYMBOL(get_present_cpumask_of_node);
 
 /**
  * try_online_node - online a node if offlined
@@ -1157,6 +1195,9 @@ int try_online_node(int nid)
 	ret = register_one_node(nid);
 	BUG_ON(ret);
 
+	/* notify that the node is on */
+	call_node_notify(NODE_ON, (void *)(long)nid);
+
 	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
 		mutex_lock(&zonelists_mutex);
 		build_all_zonelists(NULL, NULL);
@@ -1978,6 +2019,9 @@ void try_offline_node(int nid)
 			vfree(zone->wait_table);
 	}
 
+	/* notify that the node is down */
+	call_node_notify(NODE_DOWN, (void *)(long)nid);
+
 	/*
 	 * Since there is no way to guarentee the address of pgdat/zone is not
 	 * on stack of any kernel threads or used by other kernel objects
-- 
1.7.7



* Re: [PATCH] workqueue: update numa affinity when node hotplug
  2015-02-27 10:04 [PATCH] workqueue: update numa affinity when node hotplug Gu Zheng
@ 2015-02-27 11:54 ` Tejun Heo
  2015-03-02  8:41   ` Kamezawa Hiroyuki
  0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2015-02-27 11:54 UTC (permalink / raw)
  To: Gu Zheng; +Cc: linux-kernel, laijs, isimatu.yasuaki, kamezawa.hiroyu, tangchen

Hello,

On Fri, Feb 27, 2015 at 06:04:52PM +0800, Gu Zheng wrote:
> Yasuaki Ishimatsu found that the cpu<->node relationship can change with
> node online/offline. Workqueue relies on information that was established
> at boot time, but node hotplug may change it.

I've asked this a couple times now but can somebody please justify why
cpu<->node relationship needs to change?  If it has to change, that's
okay but let's please first make sure that we understand why such a
thing is necessary so that we can figure out what kind of facilities
are necessary for such dynamism.

Thanks.

-- 
tejun


* Re: [PATCH] workqueue: update numa affinity when node hotplug
  2015-02-27 11:54 ` Tejun Heo
@ 2015-03-02  8:41   ` Kamezawa Hiroyuki
  2015-03-02 16:28     ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: Kamezawa Hiroyuki @ 2015-03-02  8:41 UTC (permalink / raw)
  To: Tejun Heo, Gu Zheng; +Cc: linux-kernel, laijs, isimatu.yasuaki, tangchen

On 2015/02/27 20:54, Tejun Heo wrote:
> Hello,
>
> On Fri, Feb 27, 2015 at 06:04:52PM +0800, Gu Zheng wrote:
>> Yasuaki Ishimatsu found that the cpu<->node relationship can change with
>> node online/offline. Workqueue relies on information that was established
>> at boot time, but node hotplug may change it.
>
> I've asked this a couple times now but can somebody please justify why
> cpu<->node relationship needs to change?  If it has to change, that's
> okay but let's please first make sure that we understand why such a
> thing is necessary so that we can figure out what kind of facilities
> are necessary for such dynamism.
>
> Thanks.
>

Let me start by explaining the current behavior.

- cpu-id is determined when a new processor (lapicid/x2apicid) is found.
   cpu-id<->nodeid relationship is _not_ recorded.

- node-id is determined when a new pxm (firmware info) is found.
   pxm<->nodeid relationship is recorded.

By this, there are 2 cases of cpu<->nodeid change.

Case A) On x86, cpus on memory-less nodes are all tied to existing nodes (round robin).
When memory hot-add happens and a new node comes online, the cpus are moved to the
newly added node based on their pxm.

Case B) When adding a node after removing another node, if their pxms differ from
each other, the cpu<->node relationship changes.

I personally think the proper fix is building a persistent cpu-id <-> lapicid relationship,
as pxm does, rather than creating a band-aid.

But I have no good idea how to handle case A), which is what happens now.
Hm, case A will not be a problem for workqueue's kmalloc because a node with
ZONE_NORMAL cannot be removed.

Thanks,
-Kame



* Re: [PATCH] workqueue: update numa affinity when node hotplug
  2015-03-02  8:41   ` Kamezawa Hiroyuki
@ 2015-03-02 16:28     ` Tejun Heo
  2015-03-03  6:53       ` Kamezawa Hiroyuki
  0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2015-03-02 16:28 UTC (permalink / raw)
  To: Kamezawa Hiroyuki
  Cc: Gu Zheng, linux-kernel, laijs, isimatu.yasuaki, tangchen

Hello,

On Mon, Mar 02, 2015 at 05:41:05PM +0900, Kamezawa Hiroyuki wrote:
> Let me start by explaining the current behavior.
> 
> - cpu-id is determined when a new processor (lapicid/x2apicid) is found.
>   cpu-id<->nodeid relationship is _not_ recorded.

Is this something from the firmware side or is it just that we aren't
maintaining the association right now?

> - node-id is determined when a new pxm (firmware info) is found.
>   pxm<->nodeid relationship is recorded.
> 
> By this, there are 2 cases of cpu<->nodeid change.
> 
> Case A) On x86, cpus on memory-less nodes are all tied to existing nodes (round robin).
> When memory hot-add happens and a new node comes online, the cpus are moved to the
> newly added node based on their pxm.

Ah, okay, so the firmware doesn't provide proximity information at all
for memory-less nodes so we end up putting all of them somewhere
random and when memory is added to one of the memory-less nodes, the
mapping information changes?

Am I understanding it correctly?  If so, it's super weird tho.  Why
wouldn't there be proximity information for a memless node?  Not
having memory doesn't mean it's at the same distance from all existing
nodes.

> Case B) When adding a node after removing another node, if their pxms differ from
> each other, the cpu<->node relationship changes.

I don't get this either.  Does proximity relationship actually change?
Or is it that we're assigning different IDs to the same thing?  Isn't
proximity pretty much hardwired to how the system is architected to
begin with?

> I personally think the proper fix is building a persistent cpu-id <-> lapicid relationship,
> as pxm does, rather than creating a band-aid.

Oh if this is possible, I agree that's the right direction too.

Thanks.

-- 
tejun


* Re: [PATCH] workqueue: update numa affinity when node hotplug
  2015-03-02 16:28     ` Tejun Heo
@ 2015-03-03  6:53       ` Kamezawa Hiroyuki
  2015-03-03 13:18         ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: Kamezawa Hiroyuki @ 2015-03-03  6:53 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Gu Zheng, linux-kernel, laijs, isimatu.yasuaki, tangchen

On 2015/03/03 1:28, Tejun Heo wrote:
> Hello,
>
> On Mon, Mar 02, 2015 at 05:41:05PM +0900, Kamezawa Hiroyuki wrote:
>> Let me start by explaining the current behavior.
>>
>> - cpu-id is determined when a new processor (lapicid/x2apicid) is found.
>>    cpu-id<->nodeid relationship is _not_ recorded.
>
> Is this something from the firmware side or is it just that we aren't
> maintaining the association right now?
>

I think it's just not maintained.

>> - node-id is determined when a new pxm (firmware info) is found.
>>    pxm<->nodeid relationship is recorded.
>>
>> By this, there are 2 cases of cpu<->nodeid change.
>>
>> Case A) On x86, cpus on memory-less nodes are all tied to existing nodes (round robin).
>> When memory hot-add happens and a new node comes online, the cpus are moved to the
>> newly added node based on their pxm.
>
> Ah, okay, so the firmware doesn't provide proximity information at all
> for memory-less nodes so we end up putting all of them somewhere
> random and when memory is added to one of the memory-less nodes, the
> mapping information changes?
>

With a memory-less node, the proximity domain for processors is given but ignored.
When memory (node) hotplug happens, the information is revisited and the
cpuid<->nodeid relationship is updated.

> Am I understanding it correctly?  If so, it's super weird tho.  Why
> wouldn't there be proximity information for a memless node?  Not
> having memory doesn't mean it's at the same distance from all existing
> nodes.
>
Firmware gives a pxm for the memory-less node, but it's ignored.
I'm not sure why the current implementation is like this.

>> Case B) When adding a node after removing another node, if their pxms differ from
>> each other, the cpu<->node relationship changes.
>
> I don't get this either.  Does proximity relationship actually change?
> Or is it that we're assigning different IDs to the same thing?  Isn't
> proximity pretty much hardwired to how the system is architected to
> begin with?
>

relationship between proximity domain and lapic id doesn't change.
relationship between lapic-id and cpu-id changes.

pxm <-> memory address  : no change
pxm <-> lapicid         : no change
pxm <-> node id         : no change
lapicid <-> cpu id      : change.

>> I personally think the proper fix is building a persistent cpu-id <-> lapicid relationship,
>> as pxm does, rather than creating a band-aid.
>
> Oh if this is possible, I agree that's the right direction too.
>

Implementation is a bit complicated now :(.

Thanks,
-Kame

> Thanks.
>




* Re: [PATCH] workqueue: update numa affinity when node hotplug
  2015-03-03  6:53       ` Kamezawa Hiroyuki
@ 2015-03-03 13:18         ` Tejun Heo
  2015-03-04  5:45           ` Kamezawa Hiroyuki
  0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2015-03-03 13:18 UTC (permalink / raw)
  To: Kamezawa Hiroyuki
  Cc: Gu Zheng, linux-kernel, laijs, isimatu.yasuaki, tangchen

Hello, Kame.

On Tue, Mar 03, 2015 at 03:53:46PM +0900, Kamezawa Hiroyuki wrote:
> relationship between proximity domain and lapic id doesn't change.
> relationship between lapic-id and cpu-id changes.
> 
> pxm <-> memory address  : no change
> pxm <-> lapicid         : no change
> pxm <-> node id         : no change
> lapicid <-> cpu id      : change.

So, we're changing the cpu ID to NUMA node mapping because current
NUMA code is ignoring PXM for memoryless nodes?  That's it?

> >>I personally think the proper fix is building a persistent cpu-id <-> lapicid relationship,
> >>as pxm does, rather than creating a band-aid.
> >
> >Oh if this is possible, I agree that's the right direction too.
> >
> 
> Implementation is a bit complicated now :(.

Ah well, even then, the obviously right thing to do is updating NUMA
code to always keep track of PXM information.  We don't really want to
pile NUMA hacks in random users of NUMA code.

Thanks.

-- 
tejun


* Re: [PATCH] workqueue: update numa affinity when node hotplug
  2015-03-03 13:18         ` Tejun Heo
@ 2015-03-04  5:45           ` Kamezawa Hiroyuki
  2015-03-05  1:23             ` Gu Zheng
  0 siblings, 1 reply; 9+ messages in thread
From: Kamezawa Hiroyuki @ 2015-03-04  5:45 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Gu Zheng, linux-kernel, laijs, isimatu.yasuaki, tangchen

On 2015/03/03 22:18, Tejun Heo wrote:
> Hello, Kame.
>
> On Tue, Mar 03, 2015 at 03:53:46PM +0900, Kamezawa Hiroyuki wrote:
>> relationship between proximity domain and lapic id doesn't change.
>> relationship between lapic-id and cpu-id changes.
>>
>> pxm <-> memory address  : no change
>> pxm <-> lapicid         : no change
>> pxm <-> node id         : no change
>> lapicid <-> cpu id      : change.
>
> So, we're changing the cpu ID to NUMA node mapping because current
> NUMA code is ignoring PXM for memoryless nodes?  That's it?
>

For memory-less node case, yes.
Another problem is that lapicid <-> cpuid relationship is not persistent.


>>>> I personally think the proper fix is building a persistent cpu-id <-> lapicid relationship,
>>>> as pxm does, rather than creating a band-aid.
>>>
>>> Oh if this is possible, I agree that's the right direction too.
>>>
>>
>> Implementation is a bit complicated now :(.
>
> Ah well, even then, the obviously right thing to do is updating NUMA
> code to always keep track of PXM information.  We don't really want to
> pile NUMA hacks in random users of NUMA code.
>

We'd like to start by making apicid <-> cpuid persistent, because the memory-less
node case doesn't cause a panic.
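
To make "persistent" concrete: the idea is to remember which cpu id a given
apicid was assigned, so that a re-hotplugged processor gets the same cpu id
(and therefore the same node) back.  A rough illustrative sketch only, not an
actual patch; all names below are placeholders:

/* Illustrative sketch: remember the cpu id ever handed to an apicid so a
 * re-hotplugged processor gets the same id back instead of a recycled one. */
#define NR_APICIDS	32768			/* assumed bound for the sketch */

static int apicid_to_cpuid[NR_APICIDS] = { [0 ... NR_APICIDS - 1] = -1 };

static int assign_cpuid(int apicid)
{
	static int next_cpuid;

	if (apicid_to_cpuid[apicid] < 0)	/* first time we see this apicid */
		apicid_to_cpuid[apicid] = next_cpuid++;

	return apicid_to_cpuid[apicid];		/* stable across remove/re-add */
}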

Gu-san, what do you think?

Thanks,
-Kame

P.S.
Finally, I want something like udev for cpuid/numaid...




* Re: [PATCH] workqueue: update numa affinity when node hotplug
  2015-03-04  5:45           ` Kamezawa Hiroyuki
@ 2015-03-05  1:23             ` Gu Zheng
  2015-03-05  7:16               ` Kamezawa Hiroyuki
  0 siblings, 1 reply; 9+ messages in thread
From: Gu Zheng @ 2015-03-05  1:23 UTC (permalink / raw)
  To: Kamezawa Hiroyuki
  Cc: Tejun Heo, linux-kernel, laijs, isimatu.yasuaki, tangchen

Hi Kamezawa-san,
On 03/04/2015 01:45 PM, Kamezawa Hiroyuki wrote:

> On 2015/03/03 22:18, Tejun Heo wrote:
>> Hello, Kame.
>>
>> On Tue, Mar 03, 2015 at 03:53:46PM +0900, Kamezawa Hiroyuki wrote:
>>> relationship between proximity domain and lapic id doesn't change.
>>> relationship between lapic-id and cpu-id changes.
>>>
>>> pxm <-> memory address  : no change
>>> pxm <-> lapicid         : no change
>>> pxm <-> node id         : no change
>>> lapicid <-> cpu id      : change.
>>
>> So, we're changing the cpu ID to NUMA node mapping because current
>> NUMA code is ignoring PXM for memoryless nodes?  That's it?
>>
> 
> For memory-less node case, yes.
> Another problem is that lapicid <-> cpuid relationship is not persistent.
> 
> 
>>>>> I personally think the proper fix is building a persistent cpu-id <-> lapicid relationship,
>>>>> as pxm does, rather than creating a band-aid.
>>>>
>>>> Oh if this is possible, I agree that's the right direction too.
>>>>
>>>
>>> Implementation is a bit complicated now :(.
>>
>> Ah well, even then, the obviously right thing to do is updating NUMA
>> code to always keep track of PXM information.  We don't really want to
>> pile NUMA hacks in random users of NUMA code.
>>
> 
> We'd like to start by making apicid <-> cpuid persistent, because the memory-less
> node case doesn't cause a panic.
> 
> Gu-san, what do you think?

Fine by me. But it seems that the change will break the re-use of freed cpuids when
hot-adding new cpus; I am afraid it may affect other sub-systems, though I cannot
point to a specific example.

Thanks,
Gu 

> 
> Thanks,
> -Kame
> 
> P.S.
> Finally, I want something like udev for cpuid/numaid...
> 
> 
> .
> 




* Re: [PATCH] workqueue: update numa affinity when node hotplug
  2015-03-05  1:23             ` Gu Zheng
@ 2015-03-05  7:16               ` Kamezawa Hiroyuki
  0 siblings, 0 replies; 9+ messages in thread
From: Kamezawa Hiroyuki @ 2015-03-05  7:16 UTC (permalink / raw)
  To: Gu Zheng; +Cc: Tejun Heo, linux-kernel, laijs, isimatu.yasuaki, tangchen

On 2015/03/05 10:23, Gu Zheng wrote:
> Hi Kamezawa-san,
> On 03/04/2015 01:45 PM, Kamezawa Hiroyuki wrote:
>
>> On 2015/03/03 22:18, Tejun Heo wrote:
>>> Hello, Kame.
>>>
>>> On Tue, Mar 03, 2015 at 03:53:46PM +0900, Kamezawa Hiroyuki wrote:
>>>> relationship between proximity domain and lapic id doesn't change.
>>>> relationship between lapic-id and cpu-id changes.
>>>>
>>>> pxm <-> memory address  : no change
>>>> pxm <-> lapicid         : no change
>>>> pxm <-> node id         : no change
>>>> lapicid <-> cpu id      : change.
>>>
>>> So, we're changing the cpu ID to NUMA node mapping because current
>>> NUMA code is ignoring PXM for memoryless nodes?  That's it?
>>>
>>
>> For memory-less node case, yes.
>> Another problem is that lapicid <-> cpuid relationship is not persistent.
>>
>>
>>>>>> I personally think the proper fix is building a persistent cpu-id <-> lapicid relationship,
>>>>>> as pxm does, rather than creating a band-aid.
>>>>>
>>>>> Oh if this is possible, I agree that's the right direction too.
>>>>>
>>>>
>>>> Implementation is a bit complicated now :(.
>>>
>>> Ah well, even then, the obviously right thing to do is updating NUMA
>>> code to always keep track of PXM information.  We don't really want to
>>> pile NUMA hacks in random users of NUMA code.
>>>
>>
>> We'd like to start by making apicid <-> cpuid persistent, because the memory-less
>> node case doesn't cause a panic.
>>
>> Gu-san, what do you think?
>
> Fine by me. But it seems that the change will break the re-use of freed cpuids when
> hot-adding new cpus; I am afraid it may affect other sub-systems, though I cannot
> point to a specific example.
>

That may be a concern. But, IMHO, applications should not expect a particular cpuid
for a newly added cpu.

If an application takes care of the placement of [thread, memory], the OS API should be
refreshed to allow the user to say "please place [thread, memory] near each other" rather
than specifying [cpu, memory] (as in Peter Z.'s schedNUMA).

If an application takes care of the physical placement of hardware (ethernet, SSD, etc.),
it should not depend on cpuid.

Some of the required features may not be achieved yet, but the cpuid of a newly added
cpu should not be a big problem for applications, I think.

Open discussion with the (x86/sched/mm) maintainers will be required, anyway.
I think it's worth trying.

Thanks,
-Kame






