From: Anup Patel <anup@brainfault.org>
To: Guo Ren <guoren@kernel.org>
Cc: Anup Patel <anup.patel@wdc.com>,
	Atish Patra <atish.patra@wdc.com>,
	Palmer Dabbelt <palmerdabbelt@google.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Marc Zyngier <maz@kernel.org>,
	linux-riscv <linux-riscv@lists.infradead.org>,
	"linux-kernel@vger.kernel.org List"
	<linux-kernel@vger.kernel.org>,
	Guo Ren <guoren@linux.alibaba.com>,
	Greentime Hu <greentime.hu@sifive.com>
Subject: Re: [PATCH 1/2] irqchip/sifive-plic: Fix PLIC crash on touching offline CPU context
Date: Tue, 3 Aug 2021 10:43:10 +0530	[thread overview]
Message-ID: <CAAhSdy3ExFE_bTjxiGPh157ev89GEFypMcJ0OU63TQA5CRUONw@mail.gmail.com> (raw)
In-Reply-To: <1627953123-24248-1-git-send-email-guoren@kernel.org>

On Tue, Aug 3, 2021 at 6:42 AM <guoren@kernel.org> wrote:
>
> From: Guo Ren <guoren@linux.alibaba.com>
>
> The current PLIC driver can touch an offline CPU's context and cause
> a bus error on some chips in CPU hotplug scenarios.
>
> This patch fixes the problem by preventing the PLIC from accessing
> offline CPU contexts in plic_init() & plic_set_affinity().
>
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Cc: Anup Patel <anup@brainfault.org>
> Cc: Atish Patra <atish.patra@wdc.com>
> Cc: Greentime Hu <greentime.hu@sifive.com>
> Cc: Marc Zyngier <maz@kernel.org>
> ---
>  drivers/irqchip/irq-sifive-plic.c | 26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
> index cf74cfa..9c9bb20 100644
> --- a/drivers/irqchip/irq-sifive-plic.c
> +++ b/drivers/irqchip/irq-sifive-plic.c
> @@ -64,6 +64,7 @@ struct plic_priv {
>         struct cpumask lmask;
>         struct irq_domain *irqdomain;
>         void __iomem *regs;
> +       unsigned int nr_irqs;
>  };
>
>  struct plic_handler {
> @@ -150,7 +151,7 @@ static int plic_set_affinity(struct irq_data *d,
>         if (cpu >= nr_cpu_ids)
>                 return -EINVAL;
>
> -       plic_irq_toggle(&priv->lmask, d, 0);
> +       plic_irq_toggle(cpu_online_mask, d, 0);

This breaks RISC-V multi-socket platforms with multiple PLIC instances.

When we have multiple PLIC instances in a RISC-V platform, each PLIC
instance will target a different set of HARTs. The "priv->lmask" represents
the CPUs/HARTs targeted by a given PLIC instance.
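
If the goal is to avoid touching offline contexts here, one direction
that keeps multi-socket working is to mask "priv->lmask" with the
online mask rather than replace it. A minimal sketch of what I mean
(my own illustration, not the in-tree code; the on-stack "amask"
temporary is an assumption):

    static int plic_set_affinity(struct irq_data *d,
                                 const struct cpumask *mask_val, bool force)
    {
            struct plic_priv *priv = irq_data_get_irq_chip_data(d);
            struct cpumask amask;
            unsigned int cpu;

            /* Only consider CPUs this PLIC instance actually targets. */
            cpumask_and(&amask, &priv->lmask, mask_val);

            if (force)
                    cpu = cpumask_first(&amask);
            else
                    cpu = cpumask_any_and(&amask, cpu_online_mask);

            if (cpu >= nr_cpu_ids)
                    return -EINVAL;

            /*
             * Sketch: disable only on this instance's online CPUs, so
             * offline contexts are never accessed and other PLIC
             * instances are left alone.
             */
            cpumask_and(&amask, &priv->lmask, cpu_online_mask);
            plic_irq_toggle(&amask, d, 0);
            plic_irq_toggle(cpumask_of(cpu), d, !irqd_irq_masked(d));

            irq_data_update_effective_affinity(d, cpumask_of(cpu));

            return IRQ_SET_MASK_OK_DONE;
    }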

I am not sure how you are testing your patches, but you certainly need
to test more on QEMU. The QEMU virt machine supports multi-socket, so
make sure any patch that can potentially affect multi-socket support is
at least tested in a QEMU virt machine multi-socket configuration.
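
For reference, a multi-socket configuration can be brought up with
something along these lines (flags from memory, so the exact syntax
may differ across QEMU versions, and "Image" is a placeholder kernel
path):

    qemu-system-riscv64 -M virt -smp 8,sockets=2,cores=4,threads=1 \
            -m 2G -nographic -kernel Image

Each socket of the multi-socket virt machine gets its own PLIC
instance, which is exactly the case the "priv->lmask" handling is
there to protect.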

>         plic_irq_toggle(cpumask_of(cpu), d, !irqd_irq_masked(d));
>
>         irq_data_update_effective_affinity(d, cpumask_of(cpu));
> @@ -251,15 +252,25 @@ static void plic_set_threshold(struct plic_handler *handler, u32 threshold)
>
>  static int plic_dying_cpu(unsigned int cpu)
>  {
> +       struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
> +
>         if (plic_parent_irq)
>                 disable_percpu_irq(plic_parent_irq);
>
> +       handler->present = false;
> +

Drop these changes in plic_dying_cpu(); see comments below.

>         return 0;
>  }
>
>  static int plic_starting_cpu(unsigned int cpu)
>  {
>         struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
> +       irq_hw_number_t hwirq;
> +
> +       handler->present = true;

The "handler->present" flag indicates that we have PLIC context
associated with the given handler. It has nothing to do with CPU
hot-plug.
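
To illustrate the distinction (a hedged sketch of mine, not driver
code, and the helper name is hypothetical): whether a context exists
and whether its CPU is online are two independent conditions, so a
caller that needs both would have to check both:

    /*
     * Sketch: "present" is set once at probe time and means the
     * per-CPU handler was wired to a PLIC context; the hotplug
     * state is a separate question answered by cpu_online().
     */
    static bool plic_context_usable(unsigned int cpu)
    {
            struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu);

            return handler->present && cpu_online(cpu);
    }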

> +
> +       for (hwirq = 1; hwirq <= handler->priv->nr_irqs; hwirq++)
> +               plic_toggle(handler, hwirq, 0);
>
>         if (plic_parent_irq)
>                 enable_percpu_irq(plic_parent_irq,
> @@ -275,7 +286,6 @@ static int __init plic_init(struct device_node *node,
>                 struct device_node *parent)
>  {
>         int error = 0, nr_contexts, nr_handlers = 0, i;
> -       u32 nr_irqs;
>         struct plic_priv *priv;
>         struct plic_handler *handler;
>
> @@ -290,8 +300,8 @@ static int __init plic_init(struct device_node *node,
>         }
>
>         error = -EINVAL;
> -       of_property_read_u32(node, "riscv,ndev", &nr_irqs);
> -       if (WARN_ON(!nr_irqs))
> +       of_property_read_u32(node, "riscv,ndev", &priv->nr_irqs);
> +       if (WARN_ON(!priv->nr_irqs))
>                 goto out_iounmap;
>
>         nr_contexts = of_irq_count(node);
> @@ -299,14 +309,13 @@ static int __init plic_init(struct device_node *node,
>                 goto out_iounmap;
>
>         error = -ENOMEM;
> -       priv->irqdomain = irq_domain_add_linear(node, nr_irqs + 1,
> +       priv->irqdomain = irq_domain_add_linear(node, priv->nr_irqs + 1,
>                         &plic_irqdomain_ops, priv);
>         if (WARN_ON(!priv->irqdomain))
>                 goto out_iounmap;
>
>         for (i = 0; i < nr_contexts; i++) {
>                 struct of_phandle_args parent;
> -               irq_hw_number_t hwirq;
>                 int cpu, hartid;
>
>                 if (of_irq_parse_one(node, i, &parent)) {
> @@ -354,7 +363,8 @@ static int __init plic_init(struct device_node *node,
>                 }
>
>                 cpumask_set_cpu(cpu, &priv->lmask);
> -               handler->present = true;
> +               if (cpu == smp_processor_id())
> +                       handler->present = true;

Drop this change.

>                 handler->hart_base =
>                         priv->regs + CONTEXT_BASE + i * CONTEXT_PER_HART;
>                 raw_spin_lock_init(&handler->enable_lock);
> @@ -362,8 +372,6 @@ static int __init plic_init(struct device_node *node,
>                         priv->regs + ENABLE_BASE + i * ENABLE_PER_HART;
>                 handler->priv = priv;
>  done:
> -               for (hwirq = 1; hwirq <= nr_irqs; hwirq++)
> -                       plic_toggle(handler, hwirq, 0);

In plic_init(), we bring all interrupts of a PLIC context to a known
state, which is disabled by default. We don't need to do this every
time a HART/CPU is brought up, but I am okay with moving this to
plic_starting_cpu() if it helps fix issues on any RISC-V platform.
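
If it does get moved, roughly this shape is what I would expect in
plic_starting_cpu() (a sketch based on your hunk above, under the
assumption that "handler->priv" and "nr_irqs" are valid by the time
the CPU comes online, plus a guard for CPUs without a context):

    static int plic_starting_cpu(unsigned int cpu)
    {
            struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
            irq_hw_number_t hwirq;

            /* No PLIC context wired to this CPU: nothing to do. */
            if (!handler->present)
                    return 0;

            /*
             * Bring this context's interrupts to the default (disabled)
             * state; redundant on repeated onlining, but harmless.
             */
            for (hwirq = 1; hwirq <= handler->priv->nr_irqs; hwirq++)
                    plic_toggle(handler, hwirq, 0);

            if (plic_parent_irq)
                    enable_percpu_irq(plic_parent_irq,
                                      irq_get_trigger_type(plic_parent_irq));
            plic_set_threshold(handler, PLIC_ENABLE_THRESHOLD);

            return 0;
    }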

>                 nr_handlers++;
>         }
>
> --
> 2.7.4
>

Regards,
Anup
