linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] lib: optimize cpumask_local_spread()
@ 2019-10-31  6:03 Shaokun Zhang
  2019-10-31  7:39 ` Michal Hocko
  2019-11-01  0:37 ` Andrew Morton
  0 siblings, 2 replies; 5+ messages in thread
From: Shaokun Zhang @ 2019-10-31  6:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: yuqi jin, Andrew Morton, Mike Rapoport, Paul Burton,
	Michal Hocko, Michael Ellerman, Anshuman Khandual, Shaokun Zhang

From: yuqi jin <jinyuqi@huawei.com>

In the multi-processor and NUMA system, A device may have many numa
nodes belonging to multiple cpus. When we get a local numa, it is better
to find the node closest to the local numa node to return instead of
going to the online cpu immediately.

For example, In Huawei Kunpeng 920 system, there are 4 NUMA node(0 -3)
in the 2-socket system(0 - 1). If the I/O device is in socket1
and the local NUMA node is 2, we shall choose the non-local node3 in
the same socket when cpu core in NUMA node2 is less that I/O requirements.
If we directly pick one cpu core from all online ones, it may be in
the another socket and it is not friendly for performance.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: yuqi jin <jinyuqi@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
---
Changes from RFC:
     Address Michal Hocko's comment: Use GFP_ATOMIC instead of GFP_KERNEL

 lib/cpumask.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 65 insertions(+), 11 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 0cb672eb107c..c92177b0e095 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
+static void calc_node_distance(int *node_dist, int node)
+{
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++)
+		node_dist[i] = node_distance(node, i);
+}
+
+static int find_nearest_node(int *node_dist, bool *used_flag)
+{
+	int i, min_dist = node_dist[0], node_id = -1;
+
+	for (i = 0; i < nr_node_ids; i++)
+		if (used_flag[i] == 0) {
+			min_dist = node_dist[i];
+			node_id = i;
+			break;
+		}
+	for (i = 0; i < nr_node_ids; i++)
+		if (node_dist[i] < min_dist && used_flag[i] == 0) {
+			min_dist = node_dist[i];
+			node_id = i;
+		}
+
+	return node_id;
+}
+
 /**
  * cpumask_local_spread - select the i'th cpu with local numa cpu's first
  * @i: index number
@@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
  */
 unsigned int cpumask_local_spread(unsigned int i, int node)
 {
-	int cpu;
+	int cpu, j, id, *node_dist;
+	bool *used_flag;
 
 	/* Wrap: we always want a cpu. */
 	i %= num_online_cpus();
@@ -215,19 +243,45 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
 			if (i-- == 0)
 				return cpu;
 	} else {
-		/* NUMA first. */
-		for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
-			if (i-- == 0)
-				return cpu;
+		node_dist = kmalloc_array(nr_node_ids, sizeof(int), GFP_ATOMIC);
+		if (!node_dist)
+			for_each_cpu(cpu, cpu_online_mask)
+				if (i-- == 0)
+					return cpu;
 
-		for_each_cpu(cpu, cpu_online_mask) {
-			/* Skip NUMA nodes, done above. */
-			if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
-				continue;
+		used_flag = kmalloc_array(nr_node_ids, sizeof(bool), GFP_ATOMIC);
+		if (!used_flag)
+			for_each_cpu(cpu, cpu_online_mask)
+				if (i-- == 0) {
+					kfree(node_dist);
+					return cpu;
+				}
+		memset(used_flag, 0, nr_node_ids * sizeof(bool));
 
-			if (i-- == 0)
-				return cpu;
+		calc_node_distance(node_dist, node);
+		for (j = 0; j < nr_node_ids; j++) {
+			id = find_nearest_node(node_dist, used_flag);
+			if (id < 0)
+				break;
+			for_each_cpu_and(cpu,
+				cpumask_of_node(id), cpu_online_mask)
+				if (i-- == 0) {
+					kfree(node_dist);
+					kfree(used_flag);
+					return cpu;
+				}
+			used_flag[id] = 1;
 		}
+
+		for_each_cpu(cpu, cpu_online_mask)
+			if (i-- == 0) {
+				kfree(node_dist);
+				kfree(used_flag);
+				return cpu;
+			}
+
+		kfree(node_dist);
+		kfree(used_flag);
 	}
 	BUG();
 }
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] lib: optimize cpumask_local_spread()
  2019-10-31  6:03 [PATCH] lib: optimize cpumask_local_spread() Shaokun Zhang
@ 2019-10-31  7:39 ` Michal Hocko
  2019-11-04  3:37   ` Shaokun Zhang
  2019-11-01  0:37 ` Andrew Morton
  1 sibling, 1 reply; 5+ messages in thread
From: Michal Hocko @ 2019-10-31  7:39 UTC (permalink / raw)
  To: Shaokun Zhang
  Cc: linux-kernel, yuqi jin, Andrew Morton, Mike Rapoport,
	Paul Burton, Michael Ellerman, Anshuman Khandual

On Thu 31-10-19 14:03:33, Shaokun Zhang wrote:
> From: yuqi jin <jinyuqi@huawei.com>
> 
> In the multi-processor and NUMA system, A device may have many numa
> nodes belonging to multiple cpus. When we get a local numa, it is better
> to find the node closest to the local numa node to return instead of
> going to the online cpu immediately.
> 
> For example, In Huawei Kunpeng 920 system, there are 4 NUMA node(0 -3)
> in the 2-socket system(0 - 1). If the I/O device is in socket1
> and the local NUMA node is 2, we shall choose the non-local node3 in
> the same socket when cpu core in NUMA node2 is less that I/O requirements.
> If we directly pick one cpu core from all online ones, it may be in
> the another socket and it is not friendly for performance.

My previous review feedback included a request for a much better
description of the actual problem and how much of a performance gain we
are talking about along with a workoload description.

Besides that I do not think that the implementation is great either.
Relying on GFP_ATOMIC is very dubious. Is there any specific reason why
the data structure cannot pre reallocated? The comment for
cpumask_local_spread says that this is not the most efficient function
so users should better be prepared to not call it from hot paths AFAIU.
That would imply that an internal locking should be acceptable as well
so a preallocated data structure could be used.

> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Mike Rapoport <rppt@linux.ibm.com>
> Cc: Paul Burton <paul.burton@mips.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> Signed-off-by: yuqi jin <jinyuqi@huawei.com>
> Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
> ---
> Changes from RFC:
>      Address Michal Hocko's comment: Use GFP_ATOMIC instead of GFP_KERNEL
> 
>  lib/cpumask.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 65 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/cpumask.c b/lib/cpumask.c
> index 0cb672eb107c..c92177b0e095 100644
> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>  }
>  #endif
>  
> +static void calc_node_distance(int *node_dist, int node)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_node_ids; i++)
> +		node_dist[i] = node_distance(node, i);
> +}
> +
> +static int find_nearest_node(int *node_dist, bool *used_flag)
> +{
> +	int i, min_dist = node_dist[0], node_id = -1;
> +
> +	for (i = 0; i < nr_node_ids; i++)
> +		if (used_flag[i] == 0) {
> +			min_dist = node_dist[i];
> +			node_id = i;
> +			break;
> +		}
> +	for (i = 0; i < nr_node_ids; i++)
> +		if (node_dist[i] < min_dist && used_flag[i] == 0) {
> +			min_dist = node_dist[i];
> +			node_id = i;
> +		}
> +
> +	return node_id;
> +}
> +
>  /**
>   * cpumask_local_spread - select the i'th cpu with local numa cpu's first
>   * @i: index number
> @@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>   */
>  unsigned int cpumask_local_spread(unsigned int i, int node)
>  {
> -	int cpu;
> +	int cpu, j, id, *node_dist;
> +	bool *used_flag;
>  
>  	/* Wrap: we always want a cpu. */
>  	i %= num_online_cpus();
> @@ -215,19 +243,45 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
>  			if (i-- == 0)
>  				return cpu;
>  	} else {
> -		/* NUMA first. */
> -		for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
> -			if (i-- == 0)
> -				return cpu;
> +		node_dist = kmalloc_array(nr_node_ids, sizeof(int), GFP_ATOMIC);
> +		if (!node_dist)
> +			for_each_cpu(cpu, cpu_online_mask)
> +				if (i-- == 0)
> +					return cpu;
>  
> -		for_each_cpu(cpu, cpu_online_mask) {
> -			/* Skip NUMA nodes, done above. */
> -			if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
> -				continue;
> +		used_flag = kmalloc_array(nr_node_ids, sizeof(bool), GFP_ATOMIC);
> +		if (!used_flag)
> +			for_each_cpu(cpu, cpu_online_mask)
> +				if (i-- == 0) {
> +					kfree(node_dist);
> +					return cpu;
> +				}
> +		memset(used_flag, 0, nr_node_ids * sizeof(bool));
>  
> -			if (i-- == 0)
> -				return cpu;
> +		calc_node_distance(node_dist, node);
> +		for (j = 0; j < nr_node_ids; j++) {
> +			id = find_nearest_node(node_dist, used_flag);
> +			if (id < 0)
> +				break;
> +			for_each_cpu_and(cpu,
> +				cpumask_of_node(id), cpu_online_mask)
> +				if (i-- == 0) {
> +					kfree(node_dist);
> +					kfree(used_flag);
> +					return cpu;
> +				}
> +			used_flag[id] = 1;
>  		}
> +
> +		for_each_cpu(cpu, cpu_online_mask)
> +			if (i-- == 0) {
> +				kfree(node_dist);
> +				kfree(used_flag);
> +				return cpu;
> +			}
> +
> +		kfree(node_dist);
> +		kfree(used_flag);
>  	}
>  	BUG();
>  }
> -- 
> 2.7.4

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] lib: optimize cpumask_local_spread()
  2019-10-31  6:03 [PATCH] lib: optimize cpumask_local_spread() Shaokun Zhang
  2019-10-31  7:39 ` Michal Hocko
@ 2019-11-01  0:37 ` Andrew Morton
  2019-11-04  6:02   ` Shaokun Zhang
  1 sibling, 1 reply; 5+ messages in thread
From: Andrew Morton @ 2019-11-01  0:37 UTC (permalink / raw)
  To: Shaokun Zhang
  Cc: linux-kernel, yuqi jin, Mike Rapoport, Paul Burton, Michal Hocko,
	Michael Ellerman, Anshuman Khandual

On Thu, 31 Oct 2019 14:03:33 +0800 Shaokun Zhang <zhangshaokun@hisilicon.com> wrote:

> From: yuqi jin <jinyuqi@huawei.com>
> 
> In the multi-processor and NUMA system, A device may have many numa
> nodes belonging to multiple cpus. When we get a local numa, it is better
> to find the node closest to the local numa node to return instead of
> going to the online cpu immediately.
> 
> For example, In Huawei Kunpeng 920 system, there are 4 NUMA node(0 -3)
> in the 2-socket system(0 - 1). If the I/O device is in socket1
> and the local NUMA node is 2, we shall choose the non-local node3 in
> the same socket when cpu core in NUMA node2 is less that I/O requirements.
> If we directly pick one cpu core from all online ones, it may be in
> the another socket and it is not friendly for performance.
> 
> ...
>
> Changes from RFC:
>      Address Michal Hocko's comment: Use GFP_ATOMIC instead of GFP_KERNEL

Are you sure this is necessary?  cpumask_local_spread() is typically
called when a device driver is initializing irq affinities, and
sleeping allocations are usually OK at driver initialization time.  If
there is some driver which is calling cpumask_local_spread() from
atomic context, I bet it's pretty easy to fix.

> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>  }
>  #endif
>  
> +static void calc_node_distance(int *node_dist, int node)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_node_ids; i++)
> +		node_dist[i] = node_distance(node, i);
> +}
> +
> +static int find_nearest_node(int *node_dist, bool *used_flag)

The name "used_flag" is rather redundant for a thing of type bool - we
know it's a flag!  "used" would suffice.

> +{
> +	int i, min_dist = node_dist[0], node_id = -1;
> +
> +	for (i = 0; i < nr_node_ids; i++)
> +		if (used_flag[i] == 0) {
> +			min_dist = node_dist[i];
> +			node_id = i;
> +			break;
> +		}
> +	for (i = 0; i < nr_node_ids; i++)
> +		if (node_dist[i] < min_dist && used_flag[i] == 0) {
> +			min_dist = node_dist[i];
> +			node_id = i;
> +		}
> +
> +	return node_id;
> +}
> +
>  /**
>   * cpumask_local_spread - select the i'th cpu with local numa cpu's first
>   * @i: index number
> @@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>   */
>  unsigned int cpumask_local_spread(unsigned int i, int node)

Yes, this has become quite an expensive function.  That seems harmless
given the typical callsites.

>  {
> -	int cpu;
> +	int cpu, j, id, *node_dist;
> +	bool *used_flag;
>  
>  	/* Wrap: we always want a cpu. */
>  	i %= num_online_cpus();
> @@ -215,19 +243,45 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
>  			if (i-- == 0)
>  				return cpu;
>  	} else {
> -		/* NUMA first. */
> -		for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
> -			if (i-- == 0)
> -				return cpu;
> +		node_dist = kmalloc_array(nr_node_ids, sizeof(int), GFP_ATOMIC);
> +		if (!node_dist)
> +			for_each_cpu(cpu, cpu_online_mask)
> +				if (i-- == 0)
> +					return cpu;
>  
> -		for_each_cpu(cpu, cpu_online_mask) {
> -			/* Skip NUMA nodes, done above. */
> -			if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
> -				continue;
> +		used_flag = kmalloc_array(nr_node_ids, sizeof(bool), GFP_ATOMIC);

This could actually be an array of bits (include/linux/bitmap.h), but
it hardly seems important.

In fact with CONFIG_NODES_SHIFT <= 10, such a bitmap would have max
size of 128 bytes and could be a local.  But again, this is unimportant
as long as the other kmalloc is in there.


> +		if (!used_flag)
> +			for_each_cpu(cpu, cpu_online_mask)
> +				if (i-- == 0) {
> +					kfree(node_dist);
> +					return cpu;
> +				}
> +		memset(used_flag, 0, nr_node_ids * sizeof(bool));
>  
> -			if (i-- == 0)
> -				return cpu;
> +		calc_node_distance(node_dist, node);
> +		for (j = 0; j < nr_node_ids; j++) {
> +			id = find_nearest_node(node_dist, used_flag);
> +			if (id < 0)
> +				break;
> +			for_each_cpu_and(cpu,
> +				cpumask_of_node(id), cpu_online_mask)
> +				if (i-- == 0) {
> +					kfree(node_dist);
> +					kfree(used_flag);
> +					return cpu;
> +				}
> +			used_flag[id] = 1;
>  		}
> +
> +		for_each_cpu(cpu, cpu_online_mask)
> +			if (i-- == 0) {
> +				kfree(node_dist);
> +				kfree(used_flag);
> +				return cpu;
> +			}
> +
> +		kfree(node_dist);
> +		kfree(used_flag);
>  	}
>  	BUG();
>  }


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] lib: optimize cpumask_local_spread()
  2019-10-31  7:39 ` Michal Hocko
@ 2019-11-04  3:37   ` Shaokun Zhang
  0 siblings, 0 replies; 5+ messages in thread
From: Shaokun Zhang @ 2019-11-04  3:37 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, yuqi jin, Andrew Morton, Mike Rapoport,
	Paul Burton, Michael Ellerman, Anshuman Khandual

Hi Michal,

On 2019/10/31 15:39, Michal Hocko wrote:
> On Thu 31-10-19 14:03:33, Shaokun Zhang wrote:
>> From: yuqi jin <jinyuqi@huawei.com>
>>
>> In the multi-processor and NUMA system, A device may have many numa
>> nodes belonging to multiple cpus. When we get a local numa, it is better
>> to find the node closest to the local numa node to return instead of
>> going to the online cpu immediately.
>>
>> For example, In Huawei Kunpeng 920 system, there are 4 NUMA node(0 -3)
>> in the 2-socket system(0 - 1). If the I/O device is in socket1
>> and the local NUMA node is 2, we shall choose the non-local node3 in
>> the same socket when cpu core in NUMA node2 is less that I/O requirements.
>> If we directly pick one cpu core from all online ones, it may be in
>> the another socket and it is not friendly for performance.
> 
> My previous review feedback included a request for a much better
> description of the actual problem and how much of a performance gain we
> are talking about along with a workoload description.
> 

Ok, I will update both in next version.

> Besides that I do not think that the implementation is great either.

Ok, Agree, so I sent it as RFC firstly and wanted to discuss this issue,
I will fix it more reasonable.

> Relying on GFP_ATOMIC is very dubious. Is there any specific reason why
> the data structure cannot pre reallocated? The comment for

Ok, will do it.

> cpumask_local_spread says that this is not the most efficient function
> so users should better be prepared to not call it from hot paths AFAIU.
> That would imply that an internal locking should be acceptable as well
> so a preallocated data structure could be used.

Ok.

Thanks,
Shaokun

> 
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Mike Rapoport <rppt@linux.ibm.com>
>> Cc: Paul Burton <paul.burton@mips.com>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>> Cc: Anshuman Khandual <anshuman.khandual@arm.com>
>> Signed-off-by: yuqi jin <jinyuqi@huawei.com>
>> Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
>> ---
>> Changes from RFC:
>>      Address Michal Hocko's comment: Use GFP_ATOMIC instead of GFP_KERNEL
>>
>>  lib/cpumask.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
>>  1 file changed, 65 insertions(+), 11 deletions(-)
>>
>> diff --git a/lib/cpumask.c b/lib/cpumask.c
>> index 0cb672eb107c..c92177b0e095 100644
>> --- a/lib/cpumask.c
>> +++ b/lib/cpumask.c
>> @@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>>  }
>>  #endif
>>  
>> +static void calc_node_distance(int *node_dist, int node)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < nr_node_ids; i++)
>> +		node_dist[i] = node_distance(node, i);
>> +}
>> +
>> +static int find_nearest_node(int *node_dist, bool *used_flag)
>> +{
>> +	int i, min_dist = node_dist[0], node_id = -1;
>> +
>> +	for (i = 0; i < nr_node_ids; i++)
>> +		if (used_flag[i] == 0) {
>> +			min_dist = node_dist[i];
>> +			node_id = i;
>> +			break;
>> +		}
>> +	for (i = 0; i < nr_node_ids; i++)
>> +		if (node_dist[i] < min_dist && used_flag[i] == 0) {
>> +			min_dist = node_dist[i];
>> +			node_id = i;
>> +		}
>> +
>> +	return node_id;
>> +}
>> +
>>  /**
>>   * cpumask_local_spread - select the i'th cpu with local numa cpu's first
>>   * @i: index number
>> @@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>>   */
>>  unsigned int cpumask_local_spread(unsigned int i, int node)
>>  {
>> -	int cpu;
>> +	int cpu, j, id, *node_dist;
>> +	bool *used_flag;
>>  
>>  	/* Wrap: we always want a cpu. */
>>  	i %= num_online_cpus();
>> @@ -215,19 +243,45 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
>>  			if (i-- == 0)
>>  				return cpu;
>>  	} else {
>> -		/* NUMA first. */
>> -		for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
>> -			if (i-- == 0)
>> -				return cpu;
>> +		node_dist = kmalloc_array(nr_node_ids, sizeof(int), GFP_ATOMIC);
>> +		if (!node_dist)
>> +			for_each_cpu(cpu, cpu_online_mask)
>> +				if (i-- == 0)
>> +					return cpu;
>>  
>> -		for_each_cpu(cpu, cpu_online_mask) {
>> -			/* Skip NUMA nodes, done above. */
>> -			if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
>> -				continue;
>> +		used_flag = kmalloc_array(nr_node_ids, sizeof(bool), GFP_ATOMIC);
>> +		if (!used_flag)
>> +			for_each_cpu(cpu, cpu_online_mask)
>> +				if (i-- == 0) {
>> +					kfree(node_dist);
>> +					return cpu;
>> +				}
>> +		memset(used_flag, 0, nr_node_ids * sizeof(bool));
>>  
>> -			if (i-- == 0)
>> -				return cpu;
>> +		calc_node_distance(node_dist, node);
>> +		for (j = 0; j < nr_node_ids; j++) {
>> +			id = find_nearest_node(node_dist, used_flag);
>> +			if (id < 0)
>> +				break;
>> +			for_each_cpu_and(cpu,
>> +				cpumask_of_node(id), cpu_online_mask)
>> +				if (i-- == 0) {
>> +					kfree(node_dist);
>> +					kfree(used_flag);
>> +					return cpu;
>> +				}
>> +			used_flag[id] = 1;
>>  		}
>> +
>> +		for_each_cpu(cpu, cpu_online_mask)
>> +			if (i-- == 0) {
>> +				kfree(node_dist);
>> +				kfree(used_flag);
>> +				return cpu;
>> +			}
>> +
>> +		kfree(node_dist);
>> +		kfree(used_flag);
>>  	}
>>  	BUG();
>>  }
>> -- 
>> 2.7.4
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] lib: optimize cpumask_local_spread()
  2019-11-01  0:37 ` Andrew Morton
@ 2019-11-04  6:02   ` Shaokun Zhang
  0 siblings, 0 replies; 5+ messages in thread
From: Shaokun Zhang @ 2019-11-04  6:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, yuqi jin, Mike Rapoport, Paul Burton, Michal Hocko,
	Michael Ellerman, Anshuman Khandual

Hi Andrew,

On 2019/11/1 8:37, Andrew Morton wrote:
> On Thu, 31 Oct 2019 14:03:33 +0800 Shaokun Zhang <zhangshaokun@hisilicon.com> wrote:
> 
>> From: yuqi jin <jinyuqi@huawei.com>
>>
>> In the multi-processor and NUMA system, A device may have many numa
>> nodes belonging to multiple cpus. When we get a local numa, it is better
>> to find the node closest to the local numa node to return instead of
>> going to the online cpu immediately.
>>
>> For example, In Huawei Kunpeng 920 system, there are 4 NUMA node(0 -3)
>> in the 2-socket system(0 - 1). If the I/O device is in socket1
>> and the local NUMA node is 2, we shall choose the non-local node3 in
>> the same socket when cpu core in NUMA node2 is less that I/O requirements.
>> If we directly pick one cpu core from all online ones, it may be in
>> the another socket and it is not friendly for performance.
>>
>> ...
>>
>> Changes from RFC:
>>      Address Michal Hocko's comment: Use GFP_ATOMIC instead of GFP_KERNEL
> 
> Are you sure this is necessary?  cpumask_local_spread() is typically
> called when a device driver is initializing irq affinities, and
> sleeping allocations are usually OK at driver initialization time.  If

Got it and my limited realization, thanks for more explanations about it.

> there is some driver which is calling cpumask_local_spread() from
> atomic context, I bet it's pretty easy to fix.
> 
>> --- a/lib/cpumask.c
>> +++ b/lib/cpumask.c
>> @@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>>  }
>>  #endif
>>  
>> +static void calc_node_distance(int *node_dist, int node)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < nr_node_ids; i++)
>> +		node_dist[i] = node_distance(node, i);
>> +}
>> +
>> +static int find_nearest_node(int *node_dist, bool *used_flag)
> 
> The name "used_flag" is rather redundant for a thing of type bool - we
> know it's a flag!  "used" would suffice.
> 

Ok

>> +{
>> +	int i, min_dist = node_dist[0], node_id = -1;
>> +
>> +	for (i = 0; i < nr_node_ids; i++)
>> +		if (used_flag[i] == 0) {
>> +			min_dist = node_dist[i];
>> +			node_id = i;
>> +			break;
>> +		}
>> +	for (i = 0; i < nr_node_ids; i++)
>> +		if (node_dist[i] < min_dist && used_flag[i] == 0) {
>> +			min_dist = node_dist[i];
>> +			node_id = i;
>> +		}
>> +
>> +	return node_id;
>> +}
>> +
>>  /**
>>   * cpumask_local_spread - select the i'th cpu with local numa cpu's first
>>   * @i: index number
>> @@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>>   */
>>  unsigned int cpumask_local_spread(unsigned int i, int node)
> 
> Yes, this has become quite an expensive function.  That seems harmless
> given the typical callsites.
> 
>>  {
>> -	int cpu;
>> +	int cpu, j, id, *node_dist;
>> +	bool *used_flag;
>>  
>>  	/* Wrap: we always want a cpu. */
>>  	i %= num_online_cpus();
>> @@ -215,19 +243,45 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
>>  			if (i-- == 0)
>>  				return cpu;
>>  	} else {
>> -		/* NUMA first. */
>> -		for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
>> -			if (i-- == 0)
>> -				return cpu;
>> +		node_dist = kmalloc_array(nr_node_ids, sizeof(int), GFP_ATOMIC);
>> +		if (!node_dist)
>> +			for_each_cpu(cpu, cpu_online_mask)
>> +				if (i-- == 0)
>> +					return cpu;
>>  
>> -		for_each_cpu(cpu, cpu_online_mask) {
>> -			/* Skip NUMA nodes, done above. */
>> -			if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
>> -				continue;
>> +		used_flag = kmalloc_array(nr_node_ids, sizeof(bool), GFP_ATOMIC);
> 
> This could actually be an array of bits (include/linux/bitmap.h), but
> it hardly seems important.
> 

Ok,

> In fact with CONFIG_NODES_SHIFT <= 10, such a bitmap would have max
> size of 128 bytes and could be a local.  But again, this is unimportant
> as long as the other kmalloc is in there.
> 

Got it and I will follow it in next version.

Thanks,
Shaokun

> 
>> +		if (!used_flag)
>> +			for_each_cpu(cpu, cpu_online_mask)
>> +				if (i-- == 0) {
>> +					kfree(node_dist);
>> +					return cpu;
>> +				}
>> +		memset(used_flag, 0, nr_node_ids * sizeof(bool));
>>  
>> -			if (i-- == 0)
>> -				return cpu;
>> +		calc_node_distance(node_dist, node);
>> +		for (j = 0; j < nr_node_ids; j++) {
>> +			id = find_nearest_node(node_dist, used_flag);
>> +			if (id < 0)
>> +				break;
>> +			for_each_cpu_and(cpu,
>> +				cpumask_of_node(id), cpu_online_mask)
>> +				if (i-- == 0) {
>> +					kfree(node_dist);
>> +					kfree(used_flag);
>> +					return cpu;
>> +				}
>> +			used_flag[id] = 1;
>>  		}
>> +
>> +		for_each_cpu(cpu, cpu_online_mask)
>> +			if (i-- == 0) {
>> +				kfree(node_dist);
>> +				kfree(used_flag);
>> +				return cpu;
>> +			}
>> +
>> +		kfree(node_dist);
>> +		kfree(used_flag);
>>  	}
>>  	BUG();
>>  }
> 
> 
> .
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-11-04  6:03 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-31  6:03 [PATCH] lib: optimize cpumask_local_spread() Shaokun Zhang
2019-10-31  7:39 ` Michal Hocko
2019-11-04  3:37   ` Shaokun Zhang
2019-11-01  0:37 ` Andrew Morton
2019-11-04  6:02   ` Shaokun Zhang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).