From: Anup Patel <anup@brainfault.org>
To: Guo Ren <guoren@kernel.org>
Cc: Anup Patel <anup.patel@wdc.com>,
	Atish Patra <atish.patra@wdc.com>,
	Palmer Dabbelt <palmerdabbelt@google.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Marc Zyngier <maz@kernel.org>,
	linux-riscv <linux-riscv@lists.infradead.org>,
	"linux-kernel@vger.kernel.org List"
	<linux-kernel@vger.kernel.org>,
	Guo Ren <guoren@linux.alibaba.com>,
	Greentime Hu <greentime.hu@sifive.com>
Subject: Re: [PATCH 1/2] irqchip/sifive-plic: Fix PLIC crash on touching offline CPU context
Date: Tue, 3 Aug 2021 10:43:10 +0530	[thread overview]
Message-ID: <CAAhSdy3ExFE_bTjxiGPh157ev89GEFypMcJ0OU63TQA5CRUONw@mail.gmail.com> (raw)
In-Reply-To: <1627953123-24248-1-git-send-email-guoren@kernel.org>

On Tue, Aug 3, 2021 at 6:42 AM <guoren@kernel.org> wrote:
>
> From: Guo Ren <guoren@linux.alibaba.com>
>
> The current PLIC driver can touch an offline CPU's context and cause
> a bus error on some chips in CPU hotplug scenarios.
>
> This patch fixes the problem by preventing the PLIC from accessing
> offline CPU contexts in plic_init() & plic_set_affinity().
>
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Cc: Anup Patel <anup@brainfault.org>
> Cc: Atish Patra <atish.patra@wdc.com>
> Cc: Greentime Hu <greentime.hu@sifive.com>
> Cc: Marc Zyngier <maz@kernel.org>
> ---
>  drivers/irqchip/irq-sifive-plic.c | 26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
> index cf74cfa..9c9bb20 100644
> --- a/drivers/irqchip/irq-sifive-plic.c
> +++ b/drivers/irqchip/irq-sifive-plic.c
> @@ -64,6 +64,7 @@ struct plic_priv {
>         struct cpumask lmask;
>         struct irq_domain *irqdomain;
>         void __iomem *regs;
> +       unsigned int nr_irqs;
>  };
>
>  struct plic_handler {
> @@ -150,7 +151,7 @@ static int plic_set_affinity(struct irq_data *d,
>         if (cpu >= nr_cpu_ids)
>                 return -EINVAL;
>
> -       plic_irq_toggle(&priv->lmask, d, 0);
> +       plic_irq_toggle(cpu_online_mask, d, 0);

This breaks RISC-V multi-socket platforms with multiple PLIC instances.

When we have multiple PLIC instances in a RISC-V platform, each PLIC
instance will target a different set of HARTs. The "priv->lmask" represents
the CPUs/HARTs targeted by a given PLIC instance.
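
If the goal is to avoid touching offline contexts here, one direction
that keeps multi-socket working is to mask "priv->lmask" with the
online mask rather than replace it. A minimal sketch of what I mean
(my own illustration, not the in-tree code; the on-stack "amask"
temporary is an assumption):

    static int plic_set_affinity(struct irq_data *d,
                                 const struct cpumask *mask_val, bool force)
    {
            struct plic_priv *priv = irq_data_get_irq_chip_data(d);
            struct cpumask amask;
            unsigned int cpu;

            /* Only consider CPUs this PLIC instance actually targets. */
            cpumask_and(&amask, &priv->lmask, mask_val);

            if (force)
                    cpu = cpumask_first(&amask);
            else
                    cpu = cpumask_any_and(&amask, cpu_online_mask);

            if (cpu >= nr_cpu_ids)
                    return -EINVAL;

            /*
             * Sketch: disable only on this instance's online CPUs, so
             * offline contexts are never accessed and other PLIC
             * instances are left alone.
             */
            cpumask_and(&amask, &priv->lmask, cpu_online_mask);
            plic_irq_toggle(&amask, d, 0);
            plic_irq_toggle(cpumask_of(cpu), d, !irqd_irq_masked(d));

            irq_data_update_effective_affinity(d, cpumask_of(cpu));

            return IRQ_SET_MASK_OK_DONE;
    }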

I am not sure how you are testing your patches, but you certainly need
to test more on QEMU. The QEMU virt machine supports multi-socket, so
make sure any patch that can potentially affect multi-socket support is
at least tested in a QEMU virt machine multi-socket configuration.
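
For reference, a multi-socket configuration can be brought up with
something along these lines (flags from memory, so the exact syntax
may differ across QEMU versions, and "Image" is a placeholder kernel
path):

    qemu-system-riscv64 -M virt -smp 8,sockets=2,cores=4,threads=1 \
            -m 2G -nographic -kernel Image

Each socket of the multi-socket virt machine gets its own PLIC
instance, which is exactly the case the "priv->lmask" handling is
there to protect.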

>         plic_irq_toggle(cpumask_of(cpu), d, !irqd_irq_masked(d));
>
>         irq_data_update_effective_affinity(d, cpumask_of(cpu));
> @@ -251,15 +252,25 @@ static void plic_set_threshold(struct plic_handler *handler, u32 threshold)
>
>  static int plic_dying_cpu(unsigned int cpu)
>  {
> +       struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
> +
>         if (plic_parent_irq)
>                 disable_percpu_irq(plic_parent_irq);
>
> +       handler->present = false;
> +

Drop these changes in plic_dying_cpu(); see comments below.

>         return 0;
>  }
>
>  static int plic_starting_cpu(unsigned int cpu)
>  {
>         struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
> +       irq_hw_number_t hwirq;
> +
> +       handler->present = true;

The "handler->present" flag indicates that we have PLIC context
associated with the given handler. It has nothing to do with CPU
hot-plug.
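
To illustrate the distinction (a hedged sketch of mine, not driver
code, and the helper name is hypothetical): whether a context exists
and whether its CPU is online are two independent conditions, so a
caller that needs both would have to check both:

    /*
     * Sketch: "present" is set once at probe time and means the
     * per-CPU handler was wired to a PLIC context; the hotplug
     * state is a separate question answered by cpu_online().
     */
    static bool plic_context_usable(unsigned int cpu)
    {
            struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu);

            return handler->present && cpu_online(cpu);
    }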

> +
> +       for (hwirq = 1; hwirq <= handler->priv->nr_irqs; hwirq++)
> +               plic_toggle(handler, hwirq, 0);
>
>         if (plic_parent_irq)
>                 enable_percpu_irq(plic_parent_irq,
> @@ -275,7 +286,6 @@ static int __init plic_init(struct device_node *node,
>                 struct device_node *parent)
>  {
>         int error = 0, nr_contexts, nr_handlers = 0, i;
> -       u32 nr_irqs;
>         struct plic_priv *priv;
>         struct plic_handler *handler;
>
> @@ -290,8 +300,8 @@ static int __init plic_init(struct device_node *node,
>         }
>
>         error = -EINVAL;
> -       of_property_read_u32(node, "riscv,ndev", &nr_irqs);
> -       if (WARN_ON(!nr_irqs))
> +       of_property_read_u32(node, "riscv,ndev", &priv->nr_irqs);
> +       if (WARN_ON(!priv->nr_irqs))
>                 goto out_iounmap;
>
>         nr_contexts = of_irq_count(node);
> @@ -299,14 +309,13 @@ static int __init plic_init(struct device_node *node,
>                 goto out_iounmap;
>
>         error = -ENOMEM;
> -       priv->irqdomain = irq_domain_add_linear(node, nr_irqs + 1,
> +       priv->irqdomain = irq_domain_add_linear(node, priv->nr_irqs + 1,
>                         &plic_irqdomain_ops, priv);
>         if (WARN_ON(!priv->irqdomain))
>                 goto out_iounmap;
>
>         for (i = 0; i < nr_contexts; i++) {
>                 struct of_phandle_args parent;
> -               irq_hw_number_t hwirq;
>                 int cpu, hartid;
>
>                 if (of_irq_parse_one(node, i, &parent)) {
> @@ -354,7 +363,8 @@ static int __init plic_init(struct device_node *node,
>                 }
>
>                 cpumask_set_cpu(cpu, &priv->lmask);
> -               handler->present = true;
> +               if (cpu == smp_processor_id())
> +                       handler->present = true;

Drop this change.

>                 handler->hart_base =
>                         priv->regs + CONTEXT_BASE + i * CONTEXT_PER_HART;
>                 raw_spin_lock_init(&handler->enable_lock);
> @@ -362,8 +372,6 @@ static int __init plic_init(struct device_node *node,
>                         priv->regs + ENABLE_BASE + i * ENABLE_PER_HART;
>                 handler->priv = priv;
>  done:
> -               for (hwirq = 1; hwirq <= nr_irqs; hwirq++)
> -                       plic_toggle(handler, hwirq, 0);

In plic_init(), we bring all interrupts of a PLIC context to a known
state, which is disabled by default. We don't need to do this every
time a HART/CPU is brought up, but I am okay with moving this to
plic_starting_cpu() if it helps fix issues on any RISC-V platform.
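
If it does get moved, roughly this shape is what I would expect in
plic_starting_cpu() (a sketch based on your hunk above, under the
assumption that "handler->priv" and "nr_irqs" are valid by the time
the CPU comes online, plus a guard for CPUs without a context):

    static int plic_starting_cpu(unsigned int cpu)
    {
            struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
            irq_hw_number_t hwirq;

            /* No PLIC context wired to this CPU: nothing to do. */
            if (!handler->present)
                    return 0;

            /*
             * Bring this context's interrupts to the default (disabled)
             * state; redundant on repeated onlining, but harmless.
             */
            for (hwirq = 1; hwirq <= handler->priv->nr_irqs; hwirq++)
                    plic_toggle(handler, hwirq, 0);

            if (plic_parent_irq)
                    enable_percpu_irq(plic_parent_irq,
                                      irq_get_trigger_type(plic_parent_irq));
            plic_set_threshold(handler, PLIC_ENABLE_THRESHOLD);

            return 0;
    }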

>                 nr_handlers++;
>         }
>
> --
> 2.7.4
>

Regards,
Anup
