Re: [PATCH v3 2/2] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs

From: John Garry <john.garry@huawei.com>
To: Marc Zyngier <maz@kernel.org>, <linux-kernel@vger.kernel.org>,
	<linux-arm-kernel@lists.infradead.org>
Cc: chenxiang <chenxiang66@hisilicon.com>,
	Zhou Wang <wangzhou1@hisilicon.com>,
	Ming Lei <ming.lei@redhat.com>,
	Jason Cooper <jason@lakedaemon.net>,
	Thomas Gleixner <tglx@linutronix.de>,
	"luojiaxing@huawei.com" <luojiaxing@huawei.com>
Subject: Re: [PATCH v3 2/2] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs
Date: Mon, 16 Mar 2020 13:02:48 +0000	[thread overview]
Message-ID: <2c367508-f81b-342e-eb05-8bbd1b056279@huawei.com> (raw)
In-Reply-To: <20200316115433.9017-3-maz@kernel.org>

On 16/03/2020 11:54, Marc Zyngier wrote:
> When mapping a LPI, the ITS driver picks the first possible
> affinity, which is in most cases CPU0, assuming that if
> that's not suitable, someone will come and set the affinity
> to something more interesting.
> 
> It apparently isn't the case, and people complain of poor
> performance when many interrupts are glued to the same CPU.
> So let's place the interrupts by finding the "least loaded"
> CPU (that is, the one that has the fewer LPIs mapped to it).
> So called 'managed' interrupts are an interesting case where
> the affinity is actually dictated by the kernel itself, and
> we should honor this.
> 
> Reported-by: John Garry <john.garry@huawei.com>
> Link: https://lore.kernel.org/r/1575642904-58295-1-git-send-email-john.garry@huawei.com
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>   drivers/irqchip/irq-gic-v3-its.c | 118 ++++++++++++++++++++++++-------
>   1 file changed, 92 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 941786e1e8f7..7f1b731c04bb 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -1531,31 +1531,107 @@ static void its_dec_lpi_count(struct irq_data *d, int cpu)
>   		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
>   }
>   
> +static unsigned int cpumask_pick_least_loaded(struct irq_data *d,
> +					      const struct cpumask *cpu_mask)
> +{
> +	unsigned int cpu = nr_cpu_ids, tmp;
> +	int count = S32_MAX;
> +
> +	for_each_cpu(tmp, cpu_mask) {

Hi Marc,

> +		int this_count = its_read_lpi_count(d, tmp);

Not sure if it's intentional, but now there seems to be a subtle 
difference to what Thomas described for non-managed interrupts - for 
non-managed interrupts, x86 selects the CPU based on the total interrupt 
load per CPU (or, more specifically, lowest vector allocation count), 
and not just the non-managed load. Or maybe I misread it.

Anyway, we can test this now for NVMe with its managed interrupts.

Cheers,
John

> +		if (this_count < count) {
> +			cpu = tmp;
> +		        count = this_count;
> +		}
> +	}
> +
> +	return cpu;
> +}
> +
> +/*
> + * As suggested by Thomas Gleixner in:
> + * https://lore.kernel.org/r/87h80q2aoc.fsf@nanos.tec.linutronix.de
> + */
> +static int its_select_cpu(struct irq_data *d,
> +			  const struct cpumask *aff_mask)
> +{
> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
> +	cpumask_var_t tmpmask;
> +	int cpu, node;
> +
> +	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	node = its_dev->its->numa_node;
> +
> +	if (!irqd_affinity_is_managed(d)) {
> +		/* First try the NUMA node */
> +		if (node != NUMA_NO_NODE) {
> +			/*
> +			 * Try the intersection of the affinity mask and the
> +			 * node mask (and the online mask, just to be safe).
> +			 */
> +			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
> +			cpumask_and(tmpmask, tmpmask, cpu_online_mask);
> +
> +			/* If that doesn't work, try the nodemask itself */
> +			if (cpumask_empty(tmpmask))
> +				cpumask_and(tmpmask, cpumask_of_node(node), cpu_online_mask);
> +
> +			cpu = cpumask_pick_least_loaded(d, tmpmask);
> +			if (cpu < nr_cpu_ids)
> +				goto out;
> +
> +			/* If we can't cross sockets, give up */
> +			if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144))
> +				goto out;
> +
> +			/* If the above failed, expand the search */
> +		}
> +
> +		/* Try the intersection of the affinity and online masks */
> +		cpumask_and(tmpmask, aff_mask, cpu_online_mask);
> +
> +		/* If that doesn't fly, the online mask is the last resort */
> +		if (cpumask_empty(tmpmask))
> +			cpumask_copy(tmpmask, cpu_online_mask);
> +
> +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> +	} else {
> +		cpumask_and(tmpmask, irq_data_get_affinity_mask(d), cpu_online_mask);
> +
> +		/* If we cannot cross sockets, limit the search to that node */
> +		if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) &&
> +		    node != NUMA_NO_NODE)
> +			cpumask_and(tmpmask, tmpmask, cpumask_of_node(node));
> +
> +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> +	}
> +out:
> +	free_cpumask_var(tmpmask);
> +
> +	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, cpumask_pr_args(aff_mask), cpu);
> +	return cpu;
> +}
> +
>   static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
>   			    bool force)
>   {
> -	unsigned int cpu;
> -	const struct cpumask *cpu_mask = cpu_online_mask;
>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>   	struct its_collection *target_col;
>   	u32 id = its_get_event_id(d);
> +	int cpu;
>   
>   	/* A forwarded interrupt should use irq_set_vcpu_affinity */
>   	if (irqd_is_forwarded_to_vcpu(d))
>   		return -EINVAL;
>   
> -       /* lpi cannot be routed to a redistributor that is on a foreign node */
> -	if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
> -		if (its_dev->its->numa_node >= 0) {
> -			cpu_mask = cpumask_of_node(its_dev->its->numa_node);
> -			if (!cpumask_intersects(mask_val, cpu_mask))
> -				return -EINVAL;
> -		}
> -	}
> -
> -	cpu = cpumask_any_and(mask_val, cpu_mask);
> +	if (!force)
> +		cpu = its_select_cpu(d, mask_val);
> +	else
> +		cpu = cpumask_pick_least_loaded(d, mask_val);
>   
> -	if (cpu >= nr_cpu_ids)
> +	if (cpu < 0 || cpu >= nr_cpu_ids)
>   		return -EINVAL;
>   
>   	/* don't set the affinity when the target cpu is same as current one */
> @@ -3455,21 +3531,11 @@ static int its_irq_domain_activate(struct irq_domain *domain,
>   {
>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>   	u32 event = its_get_event_id(d);
> -	const struct cpumask *cpu_mask = cpu_online_mask;
>   	int cpu;
>   
> -	/* get the cpu_mask of local node */
> -	if (its_dev->its->numa_node >= 0)
> -		cpu_mask = cpumask_of_node(its_dev->its->numa_node);
> -
> -	/* Bind the LPI to the first possible CPU */
> -	cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
> -	if (cpu >= nr_cpu_ids) {
> -		if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
> -			return -EINVAL;
> -
> -		cpu = cpumask_first(cpu_online_mask);
> -	}
> +	cpu = its_select_cpu(d, cpu_online_mask);
> +	if (cpu < 0 || cpu >= nr_cpu_ids)
> +		return -EINVAL;
>   
>   	its_inc_lpi_count(d, cpu);
>   	its_dev->event_map.col_map[event] = cpu;
>