* [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
@ 2020-01-19 19:05 Marc Zyngier
  2020-01-20 16:13 ` John Garry
  0 siblings, 1 reply; 9+ messages in thread
From: Marc Zyngier @ 2020-01-19 19:05 UTC (permalink / raw)
  To: linux-kernel; +Cc: Thomas Gleixner, Jason Cooper, John Garry

When mapping an LPI, the ITS driver picks the first possible
affinity, which is in most cases CPU0, assuming that if
that's not suitable, someone will come and set the affinity
to something more interesting.

That apparently isn't the case, and people complain of poor
performance when many interrupts are glued to the same CPU.
So let's place the interrupts by finding the "least loaded"
CPU (that is, the one that has the fewest LPIs mapped to it).
So-called 'managed' interrupts are an interesting case where
the affinity is actually dictated by the kernel itself, and
we should honor this.

Reported-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/1575642904-58295-1-git-send-email-john.garry@huawei.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 drivers/irqchip/irq-gic-v3-its.c | 120 +++++++++++++++++++++++--------
 1 file changed, 92 insertions(+), 28 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index e05673bcd52b..ec50cc1b11a3 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -177,6 +177,8 @@ static DEFINE_IDA(its_vpeid_ida);
 #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
 #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
 
+static DEFINE_PER_CPU(atomic_t, cpu_lpi_count);
+
 static u16 get_its_list(struct its_vm *vm)
 {
 	struct its_node *its;
@@ -1287,42 +1289,96 @@ static void its_unmask_irq(struct irq_data *d)
 	lpi_update_config(d, 0, LPI_PROP_ENABLED);
 }
 
+static int its_pick_target_cpu(const struct cpumask *cpu_mask)
+{
+	unsigned int cpu = nr_cpu_ids, tmp;
+	int count = S32_MAX;
+
+	/*
+	 * If we're only picking one of the online CPUs, just pick the
+	 * first one (there are drivers that depend on this behaviour).
+	 * At some point, we'll have to weed them out.
+	 */
+	if (cpu_mask == cpu_online_mask)
+		return cpumask_first(cpu_mask);
+
+	for_each_cpu(tmp, cpu_mask) {
+		int this_count = per_cpu(cpu_lpi_count, tmp).counter;
+		if (this_count < count) {
+			cpu = tmp;
+		        count = this_count;
+		}
+	}
+
+	return cpu;
+}
+
+static void its_compute_affinity(struct irq_data *d,
+				 const struct cpumask *requested,
+				 struct cpumask *computed)
+{
+	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
+
+	cpumask_and(computed, requested, cpu_online_mask);
+
+	/* LPI cannot be routed to a redistributor that is on a foreign node */
+	if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) &&
+	    its_dev->its->numa_node >= 0)
+		cpumask_and(computed, computed,
+			    cpumask_of_node(its_dev->its->numa_node));
+}
+
 static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
 			    bool force)
 {
-	unsigned int cpu;
-	const struct cpumask *cpu_mask = cpu_online_mask;
 	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
-	struct its_collection *target_col;
+	int ret = IRQ_SET_MASK_OK_DONE;
 	u32 id = its_get_event_id(d);
+	cpumask_var_t tmpmask;
+	struct cpumask *mask;
 
 	/* A forwarded interrupt should use irq_set_vcpu_affinity */
 	if (irqd_is_forwarded_to_vcpu(d))
 		return -EINVAL;
 
-       /* lpi cannot be routed to a redistributor that is on a foreign node */
-	if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
-		if (its_dev->its->numa_node >= 0) {
-			cpu_mask = cpumask_of_node(its_dev->its->numa_node);
-			if (!cpumask_intersects(mask_val, cpu_mask))
-				return -EINVAL;
-		}
+	if (!force) {
+		if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+			return -ENOMEM;
+
+		mask = tmpmask;
+		its_compute_affinity(d, mask_val, mask);
+	} else	{
+		mask = (struct cpumask *)mask_val;
 	}
 
-	cpu = cpumask_any_and(mask_val, cpu_mask);
+	if (cpumask_empty(mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
-	if (cpu >= nr_cpu_ids)
-		return -EINVAL;
+	if (!cpumask_test_cpu(its_dev->event_map.col_map[id], mask)) {
+		struct its_collection *target_col;
+		int cpu;
+
+		cpu = its_pick_target_cpu(mask);
+		if (cpu >= nr_cpu_ids) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-	/* don't set the affinity when the target cpu is same as current one */
-	if (cpu != its_dev->event_map.col_map[id]) {
+		atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
+		atomic_dec(per_cpu_ptr(&cpu_lpi_count,
+				       its_dev->event_map.col_map[id]));
 		target_col = &its_dev->its->collections[cpu];
 		its_send_movi(its_dev, target_col, id);
 		its_dev->event_map.col_map[id] = cpu;
 		irq_data_update_effective_affinity(d, cpumask_of(cpu));
 	}
 
-	return IRQ_SET_MASK_OK_DONE;
+out:
+	if (!force)
+		free_cpumask_var(tmpmask);
+	return ret;
 }
 
 static u64 its_irq_get_msi_base(struct its_device *its_dev)
@@ -2773,28 +2829,34 @@ static int its_irq_domain_activate(struct irq_domain *domain,
 {
 	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
 	u32 event = its_get_event_id(d);
-	const struct cpumask *cpu_mask = cpu_online_mask;
-	int cpu;
+	int ret = 0, cpu = nr_cpu_ids;
+	const struct cpumask *reqmask;
+	cpumask_var_t mask;
 
-	/* get the cpu_mask of local node */
-	if (its_dev->its->numa_node >= 0)
-		cpu_mask = cpumask_of_node(its_dev->its->numa_node);
+	if (irqd_affinity_is_managed(d))
+		reqmask = irq_data_get_affinity_mask(d);
+	else
+		reqmask = cpu_online_mask;
 
-	/* Bind the LPI to the first possible CPU */
-	cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
-	if (cpu >= nr_cpu_ids) {
-		if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
-			return -EINVAL;
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
 
-		cpu = cpumask_first(cpu_online_mask);
+	its_compute_affinity(d, reqmask, mask);
+	cpu = its_pick_target_cpu(mask);
+	if (cpu >= nr_cpu_ids) {
+		ret = -EINVAL;
+		goto out;
 	}
 
+	atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
 	its_dev->event_map.col_map[event] = cpu;
 	irq_data_update_effective_affinity(d, cpumask_of(cpu));
 
 	/* Map the GIC IRQ and event to the device */
 	its_send_mapti(its_dev, d->hwirq, event);
-	return 0;
+out:
+	free_cpumask_var(mask);
+	return ret;
 }
 
 static void its_irq_domain_deactivate(struct irq_domain *domain,
@@ -2803,6 +2865,8 @@ static void its_irq_domain_deactivate(struct irq_domain *domain,
 	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
 	u32 event = its_get_event_id(d);
 
+	atomic_dec(per_cpu_ptr(&cpu_lpi_count,
+			       its_dev->event_map.col_map[event]));
 	/* Stop the delivery of interrupts */
 	its_send_discard(its_dev, event);
 }
-- 
2.20.1



* Re: [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
  2020-01-19 19:05 [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs Marc Zyngier
@ 2020-01-20 16:13 ` John Garry
  2020-01-20 17:42   ` Marc Zyngier
  0 siblings, 1 reply; 9+ messages in thread
From: John Garry @ 2020-01-20 16:13 UTC (permalink / raw)
  To: Marc Zyngier, linux-kernel
  Cc: Thomas Gleixner, Jason Cooper, Ming Lei, chenxiang

On 19/01/2020 19:05, Marc Zyngier wrote:
> When mapping an LPI, the ITS driver picks the first possible
> affinity, which is in most cases CPU0, assuming that if
> that's not suitable, someone will come and set the affinity
> to something more interesting.
>
> That apparently isn't the case, and people complain of poor
> performance when many interrupts are glued to the same CPU.
> So let's place the interrupts by finding the "least loaded"
> CPU (that is, the one that has the fewest LPIs mapped to it).
> So-called 'managed' interrupts are an interesting case where
> the affinity is actually dictated by the kernel itself, and
> we should honor this.
>
> Reported-by: John Garry <john.garry@huawei.com>
> Link: https://lore.kernel.org/r/1575642904-58295-1-git-send-email-john.garry@huawei.com
> Signed-off-by: Marc Zyngier <maz@kernel.org>

So I tested on NVMe, and it showed a good performance boost - maybe ~20% 
for 2x NVMe disks on my D06. But a couple of comments below:

Thanks,
John

> ---
>   drivers/irqchip/irq-gic-v3-its.c | 120 +++++++++++++++++++++++--------
>   1 file changed, 92 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index e05673bcd52b..ec50cc1b11a3 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -177,6 +177,8 @@ static DEFINE_IDA(its_vpeid_ida);
>   #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
>   #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + SZ_128K)
>   
> +static DEFINE_PER_CPU(atomic_t, cpu_lpi_count);
> +
>   static u16 get_its_list(struct its_vm *vm)
>   {
>   	struct its_node *its;
> @@ -1287,42 +1289,96 @@ static void its_unmask_irq(struct irq_data *d)
>   	lpi_update_config(d, 0, LPI_PROP_ENABLED);
>   }
>   
> +static int its_pick_target_cpu(const struct cpumask *cpu_mask)
> +{
> +	unsigned int cpu = nr_cpu_ids, tmp;
> +	int count = S32_MAX;
> +
> +	/*
> +	 * If we're only picking one of the online CPUs, just pick the
> +	 * first one (there are drivers that depend on this behaviour).
> +	 * At some point, we'll have to weed them out.
> +	 */
> +	if (cpu_mask == cpu_online_mask)

should this use cpumask_equal()?
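
i.e. match by content rather than by pointer. Something like this
(untested) is what I mean:

	/*
	 * Catches any mask with the same bits set as cpu_online_mask,
	 * not only a caller that passes the cpu_online_mask pointer.
	 */
	if (cpumask_equal(cpu_mask, cpu_online_mask))
		return cpumask_first(cpu_mask);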

> +		return cpumask_first(cpu_mask);
> +
> +	for_each_cpu(tmp, cpu_mask) {
> +		int this_count = per_cpu(cpu_lpi_count, tmp).counter;
> +		if (this_count < count) {
> +			cpu = tmp;
> +		        count = this_count;
> +		}
> +	}
> +
> +	return cpu;
> +}
> +
> +static void its_compute_affinity(struct irq_data *d,
> +				 const struct cpumask *requested,
> +				 struct cpumask *computed)
> +{
> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
> +
> +	cpumask_and(computed, requested, cpu_online_mask);
> +
> +	/* LPI cannot be routed to a redistributor that is on a foreign node */
> +	if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) &&
> +	    its_dev->its->numa_node >= 0)
> +		cpumask_and(computed, computed,
> +			    cpumask_of_node(its_dev->its->numa_node));
> +}
> +
>   static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
>   			    bool force)
>   {
> -	unsigned int cpu;
> -	const struct cpumask *cpu_mask = cpu_online_mask;
>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
> -	struct its_collection *target_col;
> +	int ret = IRQ_SET_MASK_OK_DONE;
>   	u32 id = its_get_event_id(d);
> +	cpumask_var_t tmpmask;
> +	struct cpumask *mask;
>   
>   	/* A forwarded interrupt should use irq_set_vcpu_affinity */
>   	if (irqd_is_forwarded_to_vcpu(d))
>   		return -EINVAL;
>   
> -       /* lpi cannot be routed to a redistributor that is on a foreign node */
> -	if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
> -		if (its_dev->its->numa_node >= 0) {
> -			cpu_mask = cpumask_of_node(its_dev->its->numa_node);
> -			if (!cpumask_intersects(mask_val, cpu_mask))
> -				return -EINVAL;
> -		}
> +	if (!force) {
> +		if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
> +			return -ENOMEM;
> +
> +		mask = tmpmask;
> +		its_compute_affinity(d, mask_val, mask);
> +	} else	{
> +		mask = (struct cpumask *)mask_val;
>   	}
>   
> -	cpu = cpumask_any_and(mask_val, cpu_mask);
> +	if (cpumask_empty(mask)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
>   
> -	if (cpu >= nr_cpu_ids)
> -		return -EINVAL;
> +	if (!cpumask_test_cpu(its_dev->event_map.col_map[id], mask)) {
> +		struct its_collection *target_col;
> +		int cpu;
> +
> +		cpu = its_pick_target_cpu(mask);
> +		if (cpu >= nr_cpu_ids) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
>   
> -	/* don't set the affinity when the target cpu is same as current one */
> -	if (cpu != its_dev->event_map.col_map[id]) {
> +		atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
> +		atomic_dec(per_cpu_ptr(&cpu_lpi_count,
> +				       its_dev->event_map.col_map[id]));
>   		target_col = &its_dev->its->collections[cpu];
>   		its_send_movi(its_dev, target_col, id);
>   		its_dev->event_map.col_map[id] = cpu;
>   		irq_data_update_effective_affinity(d, cpumask_of(cpu));
>   	}
>   
> -	return IRQ_SET_MASK_OK_DONE;
> +out:
> +	if (!force)
> +		free_cpumask_var(tmpmask);
> +	return ret;
>   }
>   
>   static u64 its_irq_get_msi_base(struct its_device *its_dev)
> @@ -2773,28 +2829,34 @@ static int its_irq_domain_activate(struct irq_domain *domain,
>   {
>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>   	u32 event = its_get_event_id(d);
> -	const struct cpumask *cpu_mask = cpu_online_mask;
> -	int cpu;
> +	int ret = 0, cpu = nr_cpu_ids;
> +	const struct cpumask *reqmask;
> +	cpumask_var_t mask;
>   
> -	/* get the cpu_mask of local node */
> -	if (its_dev->its->numa_node >= 0)
> -		cpu_mask = cpumask_of_node(its_dev->its->numa_node);
> +	if (irqd_affinity_is_managed(d))
> +		reqmask = irq_data_get_affinity_mask(d);
> +	else
> +		reqmask = cpu_online_mask;
>   
> -	/* Bind the LPI to the first possible CPU */
> -	cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
> -	if (cpu >= nr_cpu_ids) {
> -		if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
> -			return -EINVAL;
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		return -ENOMEM;
>   
> -		cpu = cpumask_first(cpu_online_mask);
> +	its_compute_affinity(d, reqmask, mask);
> +	cpu = its_pick_target_cpu(mask);
> +	if (cpu >= nr_cpu_ids) {
> +		ret = -EINVAL;
> +		goto out;
>   	}
>   
> +	atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));

I wonder if we should only consider managed interrupts in this accounting?

So cpu0 is effectively going to be excluded from the balancing, as it 
will have so many lpis targeted.
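
Something like this rough, untested sketch is what I have in mind (the
helper name is made up); it just gates the accounting on the interrupt
being managed:

static void its_inc_lpi_count(struct irq_data *d, int cpu)
{
	/*
	 * Only managed LPIs influence the placement, so the pile of
	 * non-managed interrupts sitting on cpu0 no longer skews it.
	 */
	if (irqd_affinity_is_managed(d))
		atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
}

(The atomic_dec side in deactivate/set_affinity would obviously need
the same check.)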

And, for the others, even if we balance all the LPIs, won't irqbalance 
(if running, obviously) come along and fiddle with these non-managed 
interrupt affinities anyway?

>   	its_dev->event_map.col_map[event] = cpu;
>   	irq_data_update_effective_affinity(d, cpumask_of(cpu));
>   
>   	/* Map the GIC IRQ and event to the device */
>   	its_send_mapti(its_dev, d->hwirq, event);
> -	return 0;
> +out:
> +	free_cpumask_var(mask);
> +	return ret;
>   }
>   
>   static void its_irq_domain_deactivate(struct irq_domain *domain,
> @@ -2803,6 +2865,8 @@ static void its_irq_domain_deactivate(struct irq_domain *domain,
>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>   	u32 event = its_get_event_id(d);
>   
> +	atomic_dec(per_cpu_ptr(&cpu_lpi_count,
> +			       its_dev->event_map.col_map[event]));
>   	/* Stop the delivery of interrupts */
>   	its_send_discard(its_dev, event);
>   }
> 



* Re: [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
  2020-01-20 16:13 ` John Garry
@ 2020-01-20 17:42   ` Marc Zyngier
  2020-01-20 18:21     ` John Garry
  0 siblings, 1 reply; 9+ messages in thread
From: Marc Zyngier @ 2020-01-20 17:42 UTC (permalink / raw)
  To: John Garry
  Cc: linux-kernel, Thomas Gleixner, Jason Cooper, Ming Lei, chenxiang

On 2020-01-20 16:13, John Garry wrote:
> On 19/01/2020 19:05, Marc Zyngier wrote:
>> When mapping an LPI, the ITS driver picks the first possible
>> affinity, which is in most cases CPU0, assuming that if
>> that's not suitable, someone will come and set the affinity
>> to something more interesting.
>>
>> That apparently isn't the case, and people complain of poor
>> performance when many interrupts are glued to the same CPU.
>> So let's place the interrupts by finding the "least loaded"
>> CPU (that is, the one that has the fewest LPIs mapped to it).
>> So-called 'managed' interrupts are an interesting case where
>> the affinity is actually dictated by the kernel itself, and
>> we should honor this.
>>
>> Reported-by: John Garry <john.garry@huawei.com>
>> Link: https://lore.kernel.org/r/1575642904-58295-1-git-send-email-john.garry@huawei.com
>> Signed-off-by: Marc Zyngier <maz@kernel.org>
>
> So I tested on NVMe, and it showed a good performance boost - maybe
> ~20% for 2x NVMe disks on my D06. But a couple of comments below:
> 
> Thanks,
> John
> 
>> ---
>>   drivers/irqchip/irq-gic-v3-its.c | 120 
>> +++++++++++++++++++++++--------
>>   1 file changed, 92 insertions(+), 28 deletions(-)
>> 
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c 
>> b/drivers/irqchip/irq-gic-v3-its.c
>> index e05673bcd52b..ec50cc1b11a3 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -177,6 +177,8 @@ static DEFINE_IDA(its_vpeid_ida);
>>   #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
>>   #define gic_data_rdist_vlpi_base()	(gic_data_rdist_rd_base() + 
>> SZ_128K)
>>   +static DEFINE_PER_CPU(atomic_t, cpu_lpi_count);
>> +
>>   static u16 get_its_list(struct its_vm *vm)
>>   {
>>   	struct its_node *its;
>> @@ -1287,42 +1289,96 @@ static void its_unmask_irq(struct irq_data *d)
>>   	lpi_update_config(d, 0, LPI_PROP_ENABLED);
>>   }
>>   +static int its_pick_target_cpu(const struct cpumask *cpu_mask)
>> +{
>> +	unsigned int cpu = nr_cpu_ids, tmp;
>> +	int count = S32_MAX;
>> +
>> +	/*
>> +	 * If we're only picking one of the online CPUs, just pick the
>> +	 * first one (there are drivers that depend on this behaviour).
>> +	 * At some point, we'll have to weed them out.
>> +	 */
>> +	if (cpu_mask == cpu_online_mask)
> 
> should this use cpumask_equal()?

Oops. yes. Good point.

> 
>> +		return cpumask_first(cpu_mask);
>> +
>> +	for_each_cpu(tmp, cpu_mask) {
>> +		int this_count = per_cpu(cpu_lpi_count, tmp).counter;
>> +		if (this_count < count) {
>> +			cpu = tmp;
>> +		        count = this_count;
>> +		}
>> +	}
>> +
>> +	return cpu;
>> +}
>> +
>> +static void its_compute_affinity(struct irq_data *d,
>> +				 const struct cpumask *requested,
>> +				 struct cpumask *computed)
>> +{
>> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>> +
>> +	cpumask_and(computed, requested, cpu_online_mask);
>> +
>> +	/* LPI cannot be routed to a redistributor that is on a foreign node 
>> */
>> +	if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) &&
>> +	    its_dev->its->numa_node >= 0)
>> +		cpumask_and(computed, computed,
>> +			    cpumask_of_node(its_dev->its->numa_node));
>> +}
>> +
>>   static int its_set_affinity(struct irq_data *d, const struct cpumask 
>> *mask_val,
>>   			    bool force)
>>   {
>> -	unsigned int cpu;
>> -	const struct cpumask *cpu_mask = cpu_online_mask;
>>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>> -	struct its_collection *target_col;
>> +	int ret = IRQ_SET_MASK_OK_DONE;
>>   	u32 id = its_get_event_id(d);
>> +	cpumask_var_t tmpmask;
>> +	struct cpumask *mask;
>>     	/* A forwarded interrupt should use irq_set_vcpu_affinity */
>>   	if (irqd_is_forwarded_to_vcpu(d))
>>   		return -EINVAL;
>>   -       /* lpi cannot be routed to a redistributor that is on a 
>> foreign node */
>> -	if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
>> -		if (its_dev->its->numa_node >= 0) {
>> -			cpu_mask = cpumask_of_node(its_dev->its->numa_node);
>> -			if (!cpumask_intersects(mask_val, cpu_mask))
>> -				return -EINVAL;
>> -		}
>> +	if (!force) {
>> +		if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
>> +			return -ENOMEM;
>> +
>> +		mask = tmpmask;
>> +		its_compute_affinity(d, mask_val, mask);
>> +	} else	{
>> +		mask = (struct cpumask *)mask_val;
>>   	}
>>   -	cpu = cpumask_any_and(mask_val, cpu_mask);
>> +	if (cpumask_empty(mask)) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>>   -	if (cpu >= nr_cpu_ids)
>> -		return -EINVAL;
>> +	if (!cpumask_test_cpu(its_dev->event_map.col_map[id], mask)) {
>> +		struct its_collection *target_col;
>> +		int cpu;
>> +
>> +		cpu = its_pick_target_cpu(mask);
>> +		if (cpu >= nr_cpu_ids) {
>> +			ret = -EINVAL;
>> +			goto out;
>> +		}
>>   -	/* don't set the affinity when the target cpu is same as current 
>> one */
>> -	if (cpu != its_dev->event_map.col_map[id]) {
>> +		atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
>> +		atomic_dec(per_cpu_ptr(&cpu_lpi_count,
>> +				       its_dev->event_map.col_map[id]));
>>   		target_col = &its_dev->its->collections[cpu];
>>   		its_send_movi(its_dev, target_col, id);
>>   		its_dev->event_map.col_map[id] = cpu;
>>   		irq_data_update_effective_affinity(d, cpumask_of(cpu));
>>   	}
>>   -	return IRQ_SET_MASK_OK_DONE;
>> +out:
>> +	if (!force)
>> +		free_cpumask_var(tmpmask);
>> +	return ret;
>>   }
>>     static u64 its_irq_get_msi_base(struct its_device *its_dev)
>> @@ -2773,28 +2829,34 @@ static int its_irq_domain_activate(struct 
>> irq_domain *domain,
>>   {
>>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>>   	u32 event = its_get_event_id(d);
>> -	const struct cpumask *cpu_mask = cpu_online_mask;
>> -	int cpu;
>> +	int ret = 0, cpu = nr_cpu_ids;
>> +	const struct cpumask *reqmask;
>> +	cpumask_var_t mask;
>>   -	/* get the cpu_mask of local node */
>> -	if (its_dev->its->numa_node >= 0)
>> -		cpu_mask = cpumask_of_node(its_dev->its->numa_node);
>> +	if (irqd_affinity_is_managed(d))
>> +		reqmask = irq_data_get_affinity_mask(d);
>> +	else
>> +		reqmask = cpu_online_mask;
>>   -	/* Bind the LPI to the first possible CPU */
>> -	cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
>> -	if (cpu >= nr_cpu_ids) {
>> -		if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
>> -			return -EINVAL;
>> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
>> +		return -ENOMEM;
>>   -		cpu = cpumask_first(cpu_online_mask);
>> +	its_compute_affinity(d, reqmask, mask);
>> +	cpu = its_pick_target_cpu(mask);
>> +	if (cpu >= nr_cpu_ids) {
>> +		ret = -EINVAL;
>> +		goto out;
>>   	}
>>   +	atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
> 
> I wonder if we should only consider managed interrupts in this 
> accounting?
> 
> So cpu0 is effectively going to be excluded from the balancing, as it
> will have so many lpis targeted.

Maybe, but only if the provided managed affinity gives you the
opportunity of placing the LPI somewhere else. If the managed
affinity says CPU0 only, then that's where you end up.

> And, for the others, even if we balance all the LPIs, won't irqbalance
> (if running, obviously) come along and fiddle with these
> non-managed interrupt affinities anyway?

Of course, irqbalance will move things around. But that should be to
CPUs that do not have too many screaming interrupts.

Thanks,

         M.
-- 
Jazz is not dead. It just smells funny...


* Re: [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
  2020-01-20 17:42   ` Marc Zyngier
@ 2020-01-20 18:21     ` John Garry
  2020-01-20 18:45       ` Marc Zyngier
  0 siblings, 1 reply; 9+ messages in thread
From: John Garry @ 2020-01-20 18:21 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: linux-kernel, Thomas Gleixner, Jason Cooper, Ming Lei, chenxiang (M)

On 20/01/2020 17:42, Marc Zyngier wrote:

Hi Marc,

>>>      static u64 its_irq_get_msi_base(struct its_device *its_dev)
>>> @@ -2773,28 +2829,34 @@ static int its_irq_domain_activate(struct
>>> irq_domain *domain,
>>>    {
>>>    	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>>>    	u32 event = its_get_event_id(d);
>>> -	const struct cpumask *cpu_mask = cpu_online_mask;
>>> -	int cpu;
>>> +	int ret = 0, cpu = nr_cpu_ids;
>>> +	const struct cpumask *reqmask;
>>> +	cpumask_var_t mask;
>>>    -	/* get the cpu_mask of local node */
>>> -	if (its_dev->its->numa_node >= 0)
>>> -		cpu_mask = cpumask_of_node(its_dev->its->numa_node);
>>> +	if (irqd_affinity_is_managed(d))
>>> +		reqmask = irq_data_get_affinity_mask(d);
>>> +	else
>>> +		reqmask = cpu_online_mask;
>>>    -	/* Bind the LPI to the first possible CPU */
>>> -	cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
>>> -	if (cpu >= nr_cpu_ids) {
>>> -		if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
>>> -			return -EINVAL;
>>> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
>>> +		return -ENOMEM;
>>>    -		cpu = cpumask_first(cpu_online_mask);
>>> +	its_compute_affinity(d, reqmask, mask);
>>> +	cpu = its_pick_target_cpu(mask);
>>> +	if (cpu >= nr_cpu_ids) {
>>> +		ret = -EINVAL;
>>> +		goto out;
>>>    	}
>>>    +	atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
>>
>> I wonder if we should only consider managed interrupts in this
>> accounting?
>>
>> So cpu0 is effectively going to be excluded from the balancing, as it
>> will have so many lpis targeted.
> 
> Maybe, but only if the provided managed affinity gives you the
> opportunity of placing the LPI somewhere else. 

Of course, if there's no other cpu in the mask then so be it.

> If the managed affinity says CPU0 only, then that's where you end up.
> 

If my debug code is correct (with the above fix), cpu0 had 763 
interrupts targeted on my D06 initially :)

But it's not just cpu0. I find initial non-managed interrupt affinity 
masks are generally set on cpu cluster/numa node masks, so the first 
cpus in those masks are a bit over-subscribed, so then we may be spreading 
the managed interrupts over fewer cpus in the mask.

This is a taste of the LPI distribution on my 96-core system:
cpu0 763
cpu1 2
cpu3 1
cpu4 2
cpu5 2
cpu6 0
cpu7 0
cpu8 2
cpu9 1
cpu10 0
...
cpu16 2
...
cpu24 8
...
cpu48 10 (numa node boundary)
...
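
Something along these lines (untested, and not part of the patch) is
enough to dump the patch's per-cpu counter and get this kind of
distribution:

static void its_dump_lpi_counts(void)
{
	int cpu;

	/* Print how many LPIs are currently accounted to each CPU */
	for_each_online_cpu(cpu)
		pr_info("cpu%d %d\n", cpu,
			atomic_read(per_cpu_ptr(&cpu_lpi_count, cpu)));
}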


>> And, for the others, even if we balance all the LPIs, won't irqbalance
>> (if running, obviously) come along and fiddle with these
>> non-managed interrupt affinities anyway?
> 
> Of course, irqbalance will move things around. But that should be to
> CPUs that do not have too many screaming interrupts.
> 
> Thanks,
> 
>           M.

Cheers,
John

> 



* Re: [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
  2020-01-20 18:21     ` John Garry
@ 2020-01-20 18:45       ` Marc Zyngier
  2020-01-20 19:24         ` Thomas Gleixner
  0 siblings, 1 reply; 9+ messages in thread
From: Marc Zyngier @ 2020-01-20 18:45 UTC (permalink / raw)
  To: John Garry
  Cc: linux-kernel, Thomas Gleixner, Jason Cooper, Ming Lei, chenxiang (M)

Hi John,

On 2020-01-20 18:21, John Garry wrote:
> On 20/01/2020 17:42, Marc Zyngier wrote:
> 
> Hi Marc,
> 
>>>>      static u64 its_irq_get_msi_base(struct its_device *its_dev)
>>>> @@ -2773,28 +2829,34 @@ static int its_irq_domain_activate(struct
>>>> irq_domain *domain,
>>>>    {
>>>>    	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>>>>    	u32 event = its_get_event_id(d);
>>>> -	const struct cpumask *cpu_mask = cpu_online_mask;
>>>> -	int cpu;
>>>> +	int ret = 0, cpu = nr_cpu_ids;
>>>> +	const struct cpumask *reqmask;
>>>> +	cpumask_var_t mask;
>>>>    -	/* get the cpu_mask of local node */
>>>> -	if (its_dev->its->numa_node >= 0)
>>>> -		cpu_mask = cpumask_of_node(its_dev->its->numa_node);
>>>> +	if (irqd_affinity_is_managed(d))
>>>> +		reqmask = irq_data_get_affinity_mask(d);
>>>> +	else
>>>> +		reqmask = cpu_online_mask;
>>>>    -	/* Bind the LPI to the first possible CPU */
>>>> -	cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
>>>> -	if (cpu >= nr_cpu_ids) {
>>>> -		if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
>>>> -			return -EINVAL;
>>>> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
>>>> +		return -ENOMEM;
>>>>    -		cpu = cpumask_first(cpu_online_mask);
>>>> +	its_compute_affinity(d, reqmask, mask);
>>>> +	cpu = its_pick_target_cpu(mask);
>>>> +	if (cpu >= nr_cpu_ids) {
>>>> +		ret = -EINVAL;
>>>> +		goto out;
>>>>    	}
>>>>    +	atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
>>> 
>>> I wonder if we should only consider managed interrupts in this
>>> accounting?
>>> 
>>> So cpu0 is effectively going to be excluded from the balancing, as it
>>> will have so many lpis targeted.
>> 
>> Maybe, but only if the provided managed affinity gives you the
>> opportunity of placing the LPI somewhere else.
> 
> Of course, if there's no other cpu in the mask then so be it.
> 
>> If the managed affinity says CPU0 only, then that's where you end up.
>> 
> 
> If my debug code is correct (with the above fix), cpu0 had 763
> interrupts targeted on my D06 initially :)

You obviously have too many devices in this machine... ;-)

> But it's not just cpu0. I find initial non-managed interrupt affinity
> masks are generally set on cpu cluster/numa node masks, so the first
> cpus in those masks are a bit over-subscribed, so then we may be
> spreading the managed interrupts over fewer cpus in the mask.
> 
> This is a taste of the LPI distribution on my 96-core system:
> cpu0 763
> cpu1 2
> cpu3 1
> cpu4 2
> cpu5 2
> cpu6 0
> cpu7 0
> cpu8 2
> cpu9 1
> cpu10 0
> ...
> cpu16 2
> ...
> cpu24 8
> ...
> cpu48 10 (numa node boundary)
> ...

We're stuck between a rock and a hard place here:

(1) We place all interrupts on the least loaded CPU that matches
     the affinity -> results in performance issues on some funky
     HW (like D05's SAS controller).

(2) We place managed interrupts on the least loaded CPU that matches
     the affinity -> we have artificial load on NUMA boundaries, and
     reduced spread of overlapping managed interrupts.

(3) We don't account for non-managed LPIs, and we run the risk of
     unpredictable performance because we don't really know where
     the *other* interrupts are.

My personal preference would be to go for (1), as in my original post.
I find (3) the least appealing, because we don't track things anymore.
(2) feels like "the least of all evils", as it is a decent performance
gain, seems to give predictable performance, and doesn't regress lesser
systems...

I'm definitely open to suggestions here.

Thanks,

         M.
-- 
Jazz is not dead. It just smells funny...


* Re: [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
  2020-01-20 18:45       ` Marc Zyngier
@ 2020-01-20 19:24         ` Thomas Gleixner
  2020-01-21 10:46           ` John Garry
  2020-03-10 11:33           ` John Garry
  0 siblings, 2 replies; 9+ messages in thread
From: Thomas Gleixner @ 2020-01-20 19:24 UTC (permalink / raw)
  To: Marc Zyngier, John Garry
  Cc: linux-kernel, Jason Cooper, Ming Lei, chenxiang (M)

Marc,

Marc Zyngier <maz@kernel.org> writes:
> We're stuck between a rock and a hard place here:
>
> (1) We place all interrupts on the least loaded CPU that matches
>      the affinity -> results in performance issues on some funky
>      HW (like D05's SAS controller).
>
> (2) We place managed interrupts on the least loaded CPU that matches
>      the affinity -> we have artificial load on NUMA boundaries, and
>      reduced spread of overlapping managed interrupts.
>
> (3) We don't account for non-managed LPIs, and we run the risk of
>      unpredictable performance because we don't really know where
>      the *other* interrupts are.
>
> My personal preference would be to go for (1), as in my original post.
> I find (3) the least appealing, because we don't track things anymore.
> (2) feels like "the least of all evils", as it is a decent performance
> gain, seems to give predictable performance, and doesn't regress lesser
> systems...
>
> I'm definitely open to suggestions here.

The way x86 does it and that's mostly ok except for some really broken
setups is:

1) Non-managed interrupts:

   If the interrupt is bound to a node, then we try to find a target

     I)  in the intersection of affinity mask and node mask.

     II) in the nodemask itself

         Yes we ignore affinity mask there because that's pretty much
         the same as if the given affinity does not contain an online
         CPU.

     If all of that fails then we try the nodeless mode

   If the interrupt is not bound to a node, then we try to find a target

     I)  in the intersection of affinity mask and online mask.

     II) in the onlinemask itself

  Each step searches for the CPU in the searched mask which has the
  least number of total interrupts assigned.

2) Managed interrupts

  For managed interrupts we just search in the intersection of assigned
  mask and online CPUs for the CPU with the least number of managed
  interrupts.

  If no CPU is online then the interrupt is shutdown anyway, so no
  fallback required.
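
In rough, untested pseudo-code the non-managed search above boils down
to something like this, where pick_least_loaded() is essentially your
its_pick_target_cpu() minus the online-mask special case, @tmp is a
scratch cpumask, and the online filtering within each step is elided:

	if (node != NUMA_NO_NODE) {
		/* I)  intersection of affinity mask and node mask */
		cpumask_and(tmp, affinity, cpumask_of_node(node));
		cpu = pick_least_loaded(tmp);
		/* II) the node mask itself */
		if (cpu >= nr_cpu_ids)
			cpu = pick_least_loaded(cpumask_of_node(node));
		if (cpu < nr_cpu_ids)
			return cpu;
	}
	/* nodeless mode: affinity & online mask, then the online mask */
	cpumask_and(tmp, affinity, cpu_online_mask);
	cpu = pick_least_loaded(tmp);
	if (cpu >= nr_cpu_ids)
		cpu = pick_least_loaded(cpu_online_mask);
	return cpu;

The managed case is then just the assigned mask & online mask, with
only managed interrupts counted.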

Don't know whether that's something you can map to ARM64, but I assume
the principle of trying to enforce NUMA locality plus balancing the
number of interrupts makes sense in general.

Thanks,

        tglx




* Re: [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
  2020-01-20 19:24         ` Thomas Gleixner
@ 2020-01-21 10:46           ` John Garry
  2020-03-10 11:33           ` John Garry
  1 sibling, 0 replies; 9+ messages in thread
From: John Garry @ 2020-01-21 10:46 UTC (permalink / raw)
  To: Thomas Gleixner, Marc Zyngier
  Cc: linux-kernel, Jason Cooper, Ming Lei, chenxiang (M)

On 20/01/2020 19:24, Thomas Gleixner wrote:
> Marc,
> 
> Marc Zyngier <maz@kernel.org> writes:
>> We're stuck between a rock and a hard place here:
>>
>> (1) We place all interrupts on the least loaded CPU that matches
>>       the affinity -> results in performance issues on some funky
>>       HW (like D05's SAS controller).

I think that the software driver was more of the issue in that case, 
which I'm fixing in the driver by spreading the interrupts properly.

But I am not sure which other platforms rely on this behavior.

>>
>> (2) We place managed interrupts on the least loaded CPU that matches
>>       the affinity -> we have artificial load on NUMA boundaries, and
>>       reduced spread of overlapping managed interrupts.
>>
>> (3) We don't account for non-managed LPIs, and we run the risk of
>>       unpredictable performance because we don't really know where
>>       the *other* interrupts are.
>>
>> My personal preference would be to go for (1), as in my original post.

That seems reasonable, but I like how x86 accounts only for managed 
interrupt count per-cpu when choosing the target cpu (for a managed 
interrupt).

>> I find (3) the least appealing, because we don't track things anymore.
>> (2) feels like "the least of all evils", as it is a decent performance
>> gain, seems to give predictable performance, and doesn't regress lesser
>> systems...
>>
>> I'm definitely open to suggestions here.
> 
> The way x86 does it and that's mostly ok except for some really broken
> setups is:
> 
> 1) Non-managed interrupts:
> 
>     If the interrupt is bound to a node, then we try to find a target
> 
>       I)  in the intersection of affinity mask and node mask.
> 
>       II) in the nodemask itself
> 
>           Yes we ignore affinity mask there because that's pretty much
>           the same as if the given affinity does not contain an online
>           CPU.
> 
>       If all of that fails then we try the nodeless mode
> 
>     If the interrupt is not bound to a node, then we try to find a target
> 
>       I)  in the intersection of affinity mask and online mask.
> 
>       II) in the onlinemask itself
> 
>    Each step searches for the CPU in the searched mask which has the
>    least number of total interrupts assigned.
> 
> 2) Managed interrupts
> 
>    For managed interrupts we just search in the intersection of assigned
>    mask and online CPUs for the CPU with the least number of managed
>    interrupts.

As above, this is something which I prefer we do.

> 
>    If no CPU is online then the interrupt is shutdown anyway, so no
>    fallback required.
> 
> Don't know whether that's something you can map to ARM64, but I assume
> the principle of trying to enforce NUMA locality plus balancing the
> number of interrupts makes sense in general.
I guess that we could use irq matrix code directly if we wanted to go 
this way. That's why it is in a common location...

Cheers,
John


* Re: [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
  2020-01-20 19:24         ` Thomas Gleixner
  2020-01-21 10:46           ` John Garry
@ 2020-03-10 11:33           ` John Garry
  2020-03-11 13:18             ` Marc Zyngier
  1 sibling, 1 reply; 9+ messages in thread
From: John Garry @ 2020-03-10 11:33 UTC (permalink / raw)
  To: Thomas Gleixner, Marc Zyngier
  Cc: linux-kernel, Jason Cooper, Ming Lei, chenxiang (M)

On 20/01/2020 19:24, Thomas Gleixner wrote:
> Marc,
> 
> Marc Zyngier <maz@kernel.org> writes:
>> We're stuck between a rock and a hard place here:
>>
>> (1) We place all interrupts on the least loaded CPU that matches
>>       the affinity -> results in performance issues on some funky
>>       HW (like D05's SAS controller).
>>
>> (2) We place managed interrupts on the least loaded CPU that matches
>>       the affinity -> we have artificial load on NUMA boundaries, and
>>       reduced spread of overlapping managed interrupts.
>>
>> (3) We don't account for non-managed LPIs, and we run the risk of
>>       unpredictable performance because we don't really know where
>>       the *other* interrupts are.
>>
>> My personal preference would be to go for (1), as in my original post.
>> I find (3) the least appealing, because we don't track things anymore.
>> (2) feels like "the least of all evils", as it is a decent performance
>> gain, seems to give predictable performance, and doesn't regress lesser
>> systems...
>>
>> I'm definitely open to suggestions here.
> 
> The way x86 does it and that's mostly ok except for some really broken
> setups is:
> 
> 1) Non-managed interrupts:
> 
>     If the interrupt is bound to a node, then we try to find a target
> 
>       I)  in the intersection of affinity mask and node mask.
> 
>       II) in the nodemask itself
> 
>           Yes we ignore affinity mask there because that's pretty much
>           the same as if the given affinity does not contain an online
>           CPU.
> 
>       If all of that fails then we try the nodeless mode
> 
>     If the interrupt is not bound to a node, then we try to find a target
> 
>       I)  in the intersection of affinity mask and online mask.
> 
>       II) in the onlinemask itself
> 
>    Each step searches for the CPU in the searched mask which has the
>    least number of total interrupts assigned.
> 
> 2) Managed interrupts
> 
>    For managed interrupts we just search in the intersection of assigned
>    mask and online CPUs for the CPU with the least number of managed
>    interrupts.
> 
>    If no CPU is online then the interrupt is shutdown anyway, so no
>    fallback required.
> 
> Don't know whether that's something you can map to ARM64, but I assume
> the principle of trying to enforce NUMA locality plus balancing the
> number of interrupts makes sense in general.
> 

Hi Marc,

I was wondering if there is anything we can do to progress this patch?

Apart from being a good change in itself, I need to do some SMMU testing 
for nextgen product development and I would like to include this patch, 
most preferably from mainline.

Cheers,
John


* Re: [PATCH] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
  2020-03-10 11:33           ` John Garry
@ 2020-03-11 13:18             ` Marc Zyngier
  0 siblings, 0 replies; 9+ messages in thread
From: Marc Zyngier @ 2020-03-11 13:18 UTC (permalink / raw)
  To: John Garry
  Cc: Thomas Gleixner, linux-kernel, Jason Cooper, Ming Lei, chenxiang (M)

On 2020-03-10 11:33, John Garry wrote:

[...]

> Hi Marc,
> 
> I was wondering if there is anything we can do to progress this patch?
> 
> Apart from being a good change in itself, I need to do some SMMU
> testing for nextgen product development and I would like to include
> this patch, most preferably from mainline.

Let me revive it and see how to apply logic similar to what Thomas
was describing. Shouldn't take too long...

Thanks,

         M.
-- 
Jazz is not dead. It just smells funny...

