All of lore.kernel.org
 help / color / mirror / Atom feed
* NVME Kernel Panic due to nvme_free_queue running from call_rcu
@ 2014-07-01 16:48 Matthew Minter
  2014-07-01 17:47 ` Keith Busch
  0 siblings, 1 reply; 3+ messages in thread
From: Matthew Minter @ 2014-07-01 16:48 UTC (permalink / raw)


Hi Everyone,

First, I apologize if I have not sent this to the right people, but I
picked those who had signed off on the commit concerned and those in the
maintainers list for this driver.

I just upgraded my ARM based board to kernel 3.16-rc3 from 3.14 and am
now getting a kernel panic when I connect an NVME based SSD; it
appears the problem was introduced by the following commit:
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/drivers/block/nvme-core.c?id=5a92e700af2e5e0e6404988d6a7f2ed3dad3f46f

What seems to be happening is that the nvme_free_queue function is now
being called from call_rcu as changed in this part of the patch:

@@ -1075,10 +1088,13 @@ static void nvme_free_queues(struct nvme_dev
*dev, int lowest)
{
int i;
+ for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
+ rcu_assign_pointer(dev->queues[i], NULL);
for (i = dev->queue_count - 1; i >= lowest; i--) {
- nvme_free_queue(dev->queues[i]);
+ struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+ rcu_assign_pointer(dev->queues[i], NULL);
+ call_rcu(&nvmeq->r_head, nvme_free_queue);
dev->queue_count--;
- dev->queues[i] = NULL;
}
}

This means that nvme_free_queue is now being called within a soft
interrupt context, however nvme_free_queue calls dma_free_coherent
which is implemented as arm_dma_free on ARM and that calls vunmap.
vunmap cannot be called within an interrupt context so the kernel BUGs
and ends up panicking.

I have managed to work around this issue by not using an rcu callback
and instead using synchronize_rcu(); in the nvme_free_queues function.
The patch is below.

I am not sure if it is the correct solution but it definitely seems to
work on our board. Any comments would be appreciated.


From 7c926da62fc2c77f6e0948121938327f69d2194d Mon Sep 17 00:00:00 2001
From: matthew_minter <matthew_minter@xyratex.com>
Date: Tue, 1 Jul 2014 17:37:33 +0100
Subject: [PATCH] Prevented nvme_free_queue from being called in an
interrupt context
Fixes: 5a92e700af2e ("NVMe: RCU protected access to io queues")

Signed-off-by: matthew_minter <matthew_minter at xyratex.com>
---
 drivers/block/nvme-core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 28aec2d..a3755ea 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1225,7 +1225,8 @@ static void nvme_free_queues(struct nvme_dev
*dev, int lowest)
        for (i = dev->queue_count - 1; i >= lowest; i--) {
                struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
                rcu_assign_pointer(dev->queues[i], NULL);
-               call_rcu(&nvmeq->r_head, nvme_free_queue);
+               synchronize_rcu();
+               nvme_free_queue(&nvmeq->r_head);
                dev->queue_count--;
        }
 }
-- 
2.0.0

-- 


------------------------------
For additional information including the registered office and the treatment of Xyratex confidential information please visit www.xyratex.com

------------------------------

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* NVME Kernel Panic due to nvme_free_queue running from call_rcu
  2014-07-01 16:48 NVME Kernel Panic due to nvme_free_queue running from call_rcu Matthew Minter
@ 2014-07-01 17:47 ` Keith Busch
  2014-07-03 14:43   ` Matthew Minter
  0 siblings, 1 reply; 3+ messages in thread
From: Keith Busch @ 2014-07-01 17:47 UTC (permalink / raw)


On Tue, 1 Jul 2014, Matthew Minter wrote:
> Fixes: 5a92e700af2e ("NVMe: RCU protected access to io queues")
>
> Signed-off-by: matthew_minter <matthew_minter at xyratex.com>
> ---
> drivers/block/nvme-core.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
> index 28aec2d..a3755ea 100644
> --- a/drivers/block/nvme-core.c
> +++ b/drivers/block/nvme-core.c
> @@ -1225,7 +1225,8 @@ static void nvme_free_queues(struct nvme_dev
> *dev, int lowest)
>        for (i = dev->queue_count - 1; i >= lowest; i--) {
>                struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
>                rcu_assign_pointer(dev->queues[i], NULL);
> -               call_rcu(&nvmeq->r_head, nvme_free_queue);
> +               synchronize_rcu();
> +               nvme_free_queue(&nvmeq->r_head);
>                dev->queue_count--;
>        }
> }

Darn, that's my patch you're fixing, so that hurts. :( But thanks for
pointing out the arch differences.

We can have many dozens of queues and that many calls to synchronize_rcu()
can make the unload take a long time. Maybe replace the nvmeq's
rcu_head with a list_head, put the nvmeqs on a remove list after calling
rcu_assign_pointer(), call synchronize_rcu() once, then loop through
the list to free them, kinda like this:

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 02351e2..ccef911 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -90,7 +90,7 @@ struct async_cmd_info {
   * commands and one for I/O commands).
   */
  struct nvme_queue {
-	struct rcu_head r_head;
+	struct list_head r_head;
  	struct device *q_dmadev;
  	struct nvme_dev *dev;
  	char irqname[24];	/* nvme4294967295-65535\0 */
@@ -1172,10 +1172,8 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
  	}
  }

-static void nvme_free_queue(struct rcu_head *r)
+static void nvme_free_queue(struct nvme_queue *nvmeq)
  {
-	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
-
  	spin_lock_irq(&nvmeq->q_lock);
  	while (bio_list_peek(&nvmeq->sq_cong)) {
  		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
@@ -1206,13 +1204,21 @@ static void nvme_free_queue(struct rcu_head *r)
  static void nvme_free_queues(struct nvme_dev *dev, int lowest)
  {
  	int i;
+	LIST_HEAD(q_list);
+	struct nvme_queue *nvmeq, *next;

  	for (i = dev->queue_count - 1; i >= lowest; i--) {
-		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+		nvmeq = raw_nvmeq(dev, i);
  		rcu_assign_pointer(dev->queues[i], NULL);
-		call_rcu(&nvmeq->r_head, nvme_free_queue);
+		list_add_tail(&nvmeq->r_head, &q_list);
  		dev->queue_count--;
  	}
+
+	synchronize_rcu();
+	list_for_each_entry_safe(nvmeq, next, &q_list, r_head) {
+		list_del(&nvmeq->r_head);
+		nvme_free_queue(nvmeq);
+	}
  }

  /**
@@ -2857,7 +2863,6 @@ static void nvme_remove(struct pci_dev *pdev)
  	nvme_dev_remove(dev);
  	nvme_dev_shutdown(dev);
  	nvme_free_queues(dev, 0);
-	rcu_barrier();
  	nvme_release_instance(dev);
  	nvme_release_prp_pools(dev);
  	kref_put(&dev->kref, nvme_free_dev);

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* NVME Kernel Panic due to nvme_free_queue running from call_rcu
  2014-07-01 17:47 ` Keith Busch
@ 2014-07-03 14:43   ` Matthew Minter
  0 siblings, 0 replies; 3+ messages in thread
From: Matthew Minter @ 2014-07-03 14:43 UTC (permalink / raw)


Hi,

I can confirm that that patch works on all of our boards, and appears
to be a sensible way of solving this. The code looks good and seems to
work fine. I would certainly say this fixes the problem nicely.

The only possible improvement I could see would be to consider using
an llist which might optimize the removal process due to what looks
like faster deletion and iteration paths. However I doubt this is
speed critical enough to warrant such. Still, this works on all our
boards now so I definitely support applying it. (if you want an
acknowledged or 'tested by' I would be happy to add one.)

Many thanks

On 1 July 2014 18:47, Keith Busch <keith.busch@intel.com> wrote:
> On Tue, 1 Jul 2014, Matthew Minter wrote:
>>
>> Fixes: 5a92e700af2e ("NVMe: RCU protected access to io queues")
>>
>> Signed-off-by: matthew_minter <matthew_minter at xyratex.com>
>> ---
>> drivers/block/nvme-core.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
>> index 28aec2d..a3755ea 100644
>> --- a/drivers/block/nvme-core.c
>> +++ b/drivers/block/nvme-core.c
>> @@ -1225,7 +1225,8 @@ static void nvme_free_queues(struct nvme_dev
>> *dev, int lowest)
>>        for (i = dev->queue_count - 1; i >= lowest; i--) {
>>                struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
>>                rcu_assign_pointer(dev->queues[i], NULL);
>> -               call_rcu(&nvmeq->r_head, nvme_free_queue);
>> +               synchronize_rcu();
>> +               nvme_free_queue(&nvmeq->r_head);
>>                dev->queue_count--;
>>        }
>> }
>
>
> Darn, that's my patch you're fixing, so that hurts. :( But thanks for
> pointing out the arch differences.
>
> We can have many dozens of queues and that many calls to synchronize_rcu()
> can make the unload take a long time. Maybe replace the nvmeq's
> rcu_head with a list_head, put the nvmeqs on a remove list after calling
> rcu_assign_pointer(), call synchronize_rcu() once, then loop through
> the list to free them, kinda like this:
>
> diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
> index 02351e2..ccef911 100644
> --- a/drivers/block/nvme-core.c
> +++ b/drivers/block/nvme-core.c
> @@ -90,7 +90,7 @@ struct async_cmd_info {
>   * commands and one for I/O commands).
>   */
>  struct nvme_queue {
> -       struct rcu_head r_head;
> +       struct list_head r_head;
>         struct device *q_dmadev;
>         struct nvme_dev *dev;
>         char irqname[24];       /* nvme4294967295-65535\0 */
> @@ -1172,10 +1172,8 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq,
> bool timeout)
>         }
>  }
>
> -static void nvme_free_queue(struct rcu_head *r)
> +static void nvme_free_queue(struct nvme_queue *nvmeq)
>  {
> -       struct nvme_queue *nvmeq = container_of(r, struct nvme_queue,
> r_head);
> -
>         spin_lock_irq(&nvmeq->q_lock);
>         while (bio_list_peek(&nvmeq->sq_cong)) {
>                 struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
> @@ -1206,13 +1204,21 @@ static void nvme_free_queue(struct rcu_head *r)
>
>  static void nvme_free_queues(struct nvme_dev *dev, int lowest)
>  {
>         int i;
> +       LIST_HEAD(q_list);
> +       struct nvme_queue *nvmeq, *next;
>
>
>         for (i = dev->queue_count - 1; i >= lowest; i--) {
> -               struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
> +               nvmeq = raw_nvmeq(dev, i);
>
>                 rcu_assign_pointer(dev->queues[i], NULL);
> -               call_rcu(&nvmeq->r_head, nvme_free_queue);
> +               list_add_tail(&nvmeq->r_head, &q_list);
>                 dev->queue_count--;
>         }
> +
> +       synchronize_rcu();
> +       list_for_each_entry_safe(nvmeq, next, &q_list, r_head) {
> +               list_del(&nvmeq->r_head);
> +               nvme_free_queue(nvmeq);
> +       }
>  }
>
>  /**
> @@ -2857,7 +2863,6 @@ static void nvme_remove(struct pci_dev *pdev)
>         nvme_dev_remove(dev);
>         nvme_dev_shutdown(dev);
>         nvme_free_queues(dev, 0);
> -       rcu_barrier();
>         nvme_release_instance(dev);
>         nvme_release_prp_pools(dev);
>         kref_put(&dev->kref, nvme_free_dev);

-- 


------------------------------
For additional information including the registered office and the treatment of Xyratex confidential information please visit www.xyratex.com

------------------------------

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2014-07-03 14:43 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-01 16:48 NVME Kernel Panic due to nvme_free_queue running from call_rcu Matthew Minter
2014-07-01 17:47 ` Keith Busch
2014-07-03 14:43   ` Matthew Minter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.