* [PATCH 1/8] NVMe: Queue bio requests on device
@ 2013-02-20 23:52 Keith Busch
  2013-02-20 23:52 ` [PATCH 2/8] NVMe: Controller reset from user Keith Busch
                   ` (6 more replies)
  0 siblings, 7 replies; 10+ messages in thread
From: Keith Busch @ 2013-02-20 23:52 UTC (permalink / raw)


A bio request is not tied to a particular NVMe IO submission queue, so bio
requests can be resubmitted on any available queue. Instead of queueing bios
on the nvme_queue, this patch queues them on the nvme_dev. This should help
balance the load across the submission queues.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme.c |   57 +++++++++++++++++++++++--------------------------
 1 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 993c014..0a25765 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -80,6 +80,8 @@ struct nvme_dev {
 	char model[40];
 	char firmware_rev[8];
 	u32 max_hw_sectors;
+	struct bio_list bio_list;
+	spinlock_t dev_lock;
 };
 
 /*
@@ -109,8 +111,6 @@ struct nvme_queue {
 	dma_addr_t sq_dma_addr;
 	dma_addr_t cq_dma_addr;
 	wait_queue_head_t sq_full;
-	wait_queue_t sq_cong_wait;
-	struct bio_list sq_cong;
 	u32 __iomem *q_db;
 	u16 q_depth;
 	u16 cq_vector;
@@ -245,6 +245,8 @@ static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
 	info[cmdid].ctx = CMD_CTX_COMPLETED;
 	clear_bit(cmdid, nvmeq->cmdid_data);
 	wake_up(&nvmeq->sq_full);
+	if (!bio_list_empty(&nvmeq->dev->bio_list))
+		wake_up_process(nvme_thread);
 	return ctx;
 }
 
@@ -363,11 +365,10 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 
 static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
 {
-	struct nvme_queue *nvmeq = get_nvmeq(dev);
-	if (bio_list_empty(&nvmeq->sq_cong))
-		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
-	bio_list_add(&nvmeq->sq_cong, bio);
-	put_nvmeq(nvmeq);
+	unsigned long flags;
+	spin_lock_irqsave(&dev->dev_lock, flags);
+	bio_list_add(&dev->bio_list, bio);
+	spin_unlock_irqrestore(&dev->dev_lock, flags);
 	wake_up_process(nvme_thread);
 }
 
@@ -624,19 +625,17 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static void nvme_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct nvme_ns *ns = q->queuedata;
-	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
-	int result = -EBUSY;
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_queue *nvmeq = get_nvmeq(dev);
+	int result;
 
 	spin_lock_irq(&nvmeq->q_lock);
-	if (bio_list_empty(&nvmeq->sq_cong))
-		result = nvme_submit_bio_queue(nvmeq, ns, bio);
-	if (unlikely(result)) {
-		if (bio_list_empty(&nvmeq->sq_cong))
-			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
-		bio_list_add(&nvmeq->sq_cong, bio);
-	}
-
+	result = nvme_submit_bio_queue(nvmeq, ns, bio);
 	spin_unlock_irq(&nvmeq->q_lock);
+
+	if (unlikely(result))
+		requeue_bio(dev, bio);
+
 	put_nvmeq(nvmeq);
 }
 
@@ -912,10 +911,6 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
 
 	spin_lock_irq(&nvmeq->q_lock);
 	nvme_cancel_ios(nvmeq, false);
-	while (bio_list_peek(&nvmeq->sq_cong)) {
-		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
-		bio_endio(bio, -EIO);
-	}
 	spin_unlock_irq(&nvmeq->q_lock);
 
 	irq_set_affinity_hint(vector, NULL);
@@ -957,8 +952,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	init_waitqueue_head(&nvmeq->sq_full);
-	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
-	bio_list_init(&nvmeq->sq_cong);
 	nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
 	nvmeq->q_depth = depth;
 	nvmeq->cq_vector = vector;
@@ -1279,17 +1272,19 @@ static const struct block_device_operations nvme_fops = {
 
 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
 {
-	while (bio_list_peek(&nvmeq->sq_cong)) {
-		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
+	unsigned long flags;
+	struct bio_list *list = &nvmeq->dev->bio_list;
+
+	spin_lock_irqsave(&nvmeq->dev->dev_lock, flags);
+	while (bio_list_peek(list)) {
+		struct bio *bio = bio_list_pop(list);
 		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
 		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
-			bio_list_add_head(&nvmeq->sq_cong, bio);
+			bio_list_add_head(list, bio);
 			break;
 		}
-		if (bio_list_empty(&nvmeq->sq_cong))
-			remove_wait_queue(&nvmeq->sq_full,
-							&nvmeq->sq_cong_wait);
 	}
+	spin_unlock_irqrestore(&nvmeq->dev->dev_lock, flags);
 }
 
 static int nvme_kthread(void *data)
@@ -1309,7 +1304,8 @@ static int nvme_kthread(void *data)
 				if (nvme_process_cq(nvmeq))
 					printk("process_cq did something\n");
 				nvme_cancel_ios(nvmeq, true);
-				nvme_resubmit_bios(nvmeq);
+				if (i)
+					nvme_resubmit_bios(nvmeq);
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
 		}
@@ -1660,6 +1656,7 @@ static int __devinit nvme_probe(struct pci_dev *pdev,
 		goto disable;
 
 	INIT_LIST_HEAD(&dev->namespaces);
+	spin_lock_init(&dev->dev_lock);
 	dev->pci_dev = pdev;
 	pci_set_drvdata(pdev, dev);
 	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
-- 
1.7.0.4

* [PATCH 2/8] NVMe: Controller reset from user
  2013-02-20 23:52 [PATCH 1/8] NVMe: Queue bio requests on device Keith Busch
@ 2013-02-20 23:52 ` Keith Busch
  2013-02-26 14:04   ` Matthew Wilcox
  2013-02-20 23:52 ` [PATCH 3/8] NVMe: Suspend/resume power management Keith Busch
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 10+ messages in thread
From: Keith Busch @ 2013-02-20 23:52 UTC (permalink / raw)


Allow a user to issue a controller reset. A reset does not delete the
gendisks, so IO may continue and the namespaces may remain mounted. A user
may need to reset the controller for any reason, for example when a reset is
required as part of an activate firmware operation.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme.c |  155 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 147 insertions(+), 8 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 0a25765..28e014e 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -111,6 +111,7 @@ struct nvme_queue {
 	dma_addr_t sq_dma_addr;
 	dma_addr_t cq_dma_addr;
 	wait_queue_head_t sq_full;
+	atomic_t busy;
 	u32 __iomem *q_db;
 	u16 q_depth;
 	u16 cq_vector;
@@ -265,11 +266,18 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 
 static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
 {
-	return dev->queues[get_cpu() + 1];
+	struct nvme_queue *nvmeq;
+	spin_lock(&dev->dev_lock);
+	nvmeq = dev->queues[get_cpu() + 1];
+	if (nvmeq)
+		atomic_inc(&nvmeq->busy);
+	spin_unlock(&dev->dev_lock);
+	return nvmeq;
 }
 
 static void put_nvmeq(struct nvme_queue *nvmeq)
 {
+	atomic_dec(&nvmeq->busy);
 	put_cpu();
 }
 
@@ -629,6 +637,11 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio)
 	struct nvme_queue *nvmeq = get_nvmeq(dev);
 	int result;
 
+	if (!nvmeq) {
+		requeue_bio(dev, bio);
+		return;
+	}
+
 	spin_lock_irq(&nvmeq->q_lock);
 	result = nvme_submit_bio_queue(nvmeq, ns, bio);
 	spin_unlock_irq(&nvmeq->q_lock);
@@ -909,10 +922,15 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
 	struct nvme_queue *nvmeq = dev->queues[qid];
 	int vector = dev->entry[nvmeq->cq_vector].vector;
 
-	spin_lock_irq(&nvmeq->q_lock);
-	nvme_cancel_ios(nvmeq, false);
-	spin_unlock_irq(&nvmeq->q_lock);
+	spin_lock(&dev->dev_lock);
+	dev->queues[qid] = NULL;
+	spin_unlock(&dev->dev_lock);
+
+	while (atomic_read(&nvmeq->busy))
+		msleep(10);
+
 
+	synchronize_irq(vector);
 	irq_set_affinity_hint(vector, NULL);
 	free_irq(vector, nvmeq);
 
@@ -922,6 +940,11 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
 		adapter_delete_cq(dev, qid);
 	}
 
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_process_cq(nvmeq);
+	nvme_cancel_ios(nvmeq, false);
+	spin_unlock_irq(&nvmeq->q_lock);
+
 	nvme_free_queue_mem(nvmeq);
 }
 
@@ -1014,7 +1037,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
 	return ERR_PTR(result);
 }
 
-static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
+static int nvme_configure_admin_queue(struct nvme_dev *dev)
 {
 	int result = 0;
 	u32 aqa;
@@ -1177,6 +1200,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
 
 	nvmeq = get_nvmeq(dev);
+	if (!nvmeq) {
+		status = -EFAULT;
+		goto unmap_pages;
+	}
+
 	/*
 	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
 	 * disabled.  We may be preempted at any point, and be rescheduled
@@ -1189,6 +1217,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	else
 		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
 
+ unmap_pages:
 	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
 	nvme_free_iod(dev, iod);
 	return status;
@@ -1419,7 +1448,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 	return min(result & 0xffff, result >> 16) + 1;
 }
 
-static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
+static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
 
@@ -1490,6 +1519,7 @@ static void nvme_free_queues(struct nvme_dev *dev)
 
 	for (i = dev->queue_count - 1; i >= 0; i--)
 		nvme_free_queue(dev, i);
+	dev->queue_count = 0;
 }
 
 static int __devinit nvme_dev_add(struct nvme_dev *dev)
@@ -1630,6 +1660,108 @@ static void nvme_release_instance(struct nvme_dev *dev)
 	spin_unlock(&dev_list_lock);
 }
 
+static int nvme_shutdown_controller(struct nvme_dev *dev)
+{
+	int i;
+	unsigned long timeout;
+
+	spin_lock(&dev_list_lock);
+	list_del(&dev->node);
+	spin_unlock(&dev_list_lock);
+
+	spin_lock(&dev->dev_lock);
+	for (i = dev->queue_count; i < num_possible_cpus(); i++)
+		dev->queues[i] = NULL;
+	spin_unlock(&dev->dev_lock);
+	nvme_free_queues(dev);
+
+	dev->ctrl_config |= NVME_CC_SHN_NORMAL;
+	writel(dev->ctrl_config, &dev->bar->cc);
+	timeout = HZ + jiffies;
+
+	while (!(readl(&dev->bar->csts) & NVME_CSTS_SHST_CMPLT)) {
+		msleep(5);
+		if (fatal_signal_pending(current))
+			break;
+		if (time_after(jiffies, timeout)) {
+			dev_err(&dev->pci_dev->dev,
+				"Device still ready; aborting shutdown\n");
+			break;
+		}
+	}
+
+	pci_disable_msix(dev->pci_dev);
+	iounmap(dev->bar);
+	pci_disable_device(dev->pci_dev);
+	pci_release_regions(dev->pci_dev);
+
+	return 0;
+}
+
+static int nvme_restart_controller(struct nvme_dev *dev)
+{
+	int bars, result = -ENOMEM;
+
+	if (pci_enable_device_mem(dev->pci_dev))
+		return -ENOMEM;
+
+	pci_set_master(dev->pci_dev);
+	bars = pci_select_bars(dev->pci_dev, IORESOURCE_MEM);
+	if (pci_request_selected_regions(dev->pci_dev, bars, "nvme"))
+		goto disable_pci;
+
+	dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64));
+	dma_set_coherent_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64));
+	dev->entry[0].vector = dev->pci_dev->irq;
+	dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), 8192);
+	if (!dev->bar)
+		goto disable;
+
+	result = nvme_configure_admin_queue(dev);
+	if (result)
+		goto unmap;
+	dev->queue_count++;
+
+	spin_lock(&dev_list_lock);
+	list_add(&dev->node, &dev_list);
+	spin_unlock(&dev_list_lock);
+
+	result = nvme_setup_io_queues(dev);
+	if (result)
+		goto remove;
+
+	return 0;
+
+ remove:
+	nvme_dev_remove(dev);
+ unmap:
+	iounmap(dev->bar);
+ disable:
+	pci_release_regions(dev->pci_dev);
+ disable_pci:
+	pci_disable_device(dev->pci_dev);
+	return result;
+}
+
+static int nvme_reset_controller(struct nvme_dev *dev)
+{
+	int ret = nvme_shutdown_controller(dev);
+	if (ret)
+		return ret;
+	ret = nvme_restart_controller(dev);
+	return ret;
+}
+
+static ssize_t reset_controller(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct pci_dev  *pdev = container_of(dev, struct pci_dev, dev);
+	struct nvme_dev *ndev = pci_get_drvdata(pdev);
+	nvme_reset_controller(ndev);
+	return count;
+}
+static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, reset_controller);
+
 static int __devinit nvme_probe(struct pci_dev *pdev,
 						const struct pci_device_id *id)
 {
@@ -1686,13 +1818,19 @@ static int __devinit nvme_probe(struct pci_dev *pdev,
 	list_add(&dev->node, &dev_list);
 	spin_unlock(&dev_list_lock);
 
-	result = nvme_dev_add(dev);
+	result = device_create_file(&pdev->dev, &dev_attr_reset_controller);
 	if (result)
 		goto delete;
 
+	result = nvme_dev_add(dev);
+	if (result)
+		goto del_sysfs;
+
 	return 0;
 
- delete:
+ del_sysfs:
+	device_remove_file(&pdev->dev, &dev_attr_reset_controller);
+delete:
 	spin_lock(&dev_list_lock);
 	list_del(&dev->node);
 	spin_unlock(&dev_list_lock);
@@ -1718,6 +1856,7 @@ static void __devexit nvme_remove(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);
 	nvme_dev_remove(dev);
+	device_remove_file(&pdev->dev, &dev_attr_reset_controller);
 	pci_disable_msix(pdev);
 	iounmap(dev->bar);
 	nvme_release_instance(dev);
-- 
1.7.0.4

* [PATCH 3/8] NVMe: Suspend/resume power management
  2013-02-20 23:52 [PATCH 1/8] NVMe: Queue bio requests on device Keith Busch
  2013-02-20 23:52 ` [PATCH 2/8] NVMe: Controller reset from user Keith Busch
@ 2013-02-20 23:52 ` Keith Busch
  2013-02-20 23:52 ` [PATCH 4/8] NVMe: Add shutdown callback Keith Busch
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2013-02-20 23:52 UTC (permalink / raw)


Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme.c |   24 ++++++++++++++++++++++--
 1 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 28e014e..98ef07a 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1874,8 +1874,27 @@ static void __devexit nvme_remove(struct pci_dev *pdev)
 #define nvme_link_reset NULL
 #define nvme_slot_reset NULL
 #define nvme_error_resume NULL
+
+#ifdef CONFIG_PM
+static int nvme_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct nvme_dev *ndev = pci_get_drvdata(pdev);
+	return nvme_shutdown_controller(ndev);
+}
+
+static int nvme_resume(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct nvme_dev *ndev = pci_get_drvdata(pdev);
+	return nvme_restart_controller(ndev);
+}
+#else
 #define nvme_suspend NULL
 #define nvme_resume NULL
+#endif
+
+UNIVERSAL_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume, NULL);
 
 static const struct pci_error_handlers nvme_err_handler = {
 	.error_detected	= nvme_error_detected,
@@ -1899,8 +1918,9 @@ static struct pci_driver nvme_driver = {
 	.id_table	= nvme_id_table,
 	.probe		= nvme_probe,
 	.remove		= __devexit_p(nvme_remove),
-	.suspend	= nvme_suspend,
-	.resume		= nvme_resume,
+	.driver 	= {
+		.pm  	= &nvme_dev_pm_ops,
+	},
 	.err_handler	= &nvme_err_handler,
 };
 
-- 
1.7.0.4

* [PATCH 4/8] NVMe: Add shutdown callback
  2013-02-20 23:52 [PATCH 1/8] NVMe: Queue bio requests on device Keith Busch
  2013-02-20 23:52 ` [PATCH 2/8] NVMe: Controller reset from user Keith Busch
  2013-02-20 23:52 ` [PATCH 3/8] NVMe: Suspend/resume power management Keith Busch
@ 2013-02-20 23:52 ` Keith Busch
  2013-02-20 23:52 ` [PATCH 5/8] NVMe: End queued bio requests for removed disks Keith Busch
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2013-02-20 23:52 UTC (permalink / raw)


Shuts down the controller in a controlled way in accordance with the
NVMe spec when a system shutdown request occurs.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 98ef07a..aeaea1a 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1875,6 +1875,12 @@ static void __devexit nvme_remove(struct pci_dev *pdev)
 #define nvme_slot_reset NULL
 #define nvme_error_resume NULL
 
+static void nvme_shutdown(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+	nvme_shutdown_controller(dev);
+}
+
 #ifdef CONFIG_PM
 static int nvme_suspend(struct device *dev)
 {
@@ -1918,6 +1924,7 @@ static struct pci_driver nvme_driver = {
 	.id_table	= nvme_id_table,
 	.probe		= nvme_probe,
 	.remove		= __devexit_p(nvme_remove),
+	.shutdown	= nvme_shutdown,
 	.driver 	= {
 		.pm  	= &nvme_dev_pm_ops,
 	},
-- 
1.7.0.4

* [PATCH 5/8] NVMe: End queued bio requests for removed disks
  2013-02-20 23:52 [PATCH 1/8] NVMe: Queue bio requests on device Keith Busch
                   ` (2 preceding siblings ...)
  2013-02-20 23:52 ` [PATCH 4/8] NVMe: Add shutdown callback Keith Busch
@ 2013-02-20 23:52 ` Keith Busch
  2013-02-20 23:52 ` [PATCH 6/8] NVMe: Wait for controller status ready to clear Keith Busch
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2013-02-20 23:52 UTC (permalink / raw)


Bio requests queued in the driver should not be submitted to disks that are
being deleted. This patch checks that the disk is up before submitting, and
lets the kthread keep running on a device being removed until after the
namespaces have been cleared so that the queued bios can be ended.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme.c |   12 ++++++++----
 1 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index aeaea1a..199c182 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1308,6 +1308,10 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
 	while (bio_list_peek(list)) {
 		struct bio *bio = bio_list_pop(list);
 		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+		if (!(bio->bi_bdev->bd_disk->flags & GENHD_FL_UP)) {
+				bio_endio(bio, -EIO);
+				continue;
+		}
 		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
 			bio_list_add_head(list, bio);
 			break;
@@ -1592,16 +1596,16 @@ static int nvme_dev_remove(struct nvme_dev *dev)
 {
 	struct nvme_ns *ns, *next;
 
-	spin_lock(&dev_list_lock);
-	list_del(&dev->node);
-	spin_unlock(&dev_list_lock);
-
 	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
 		list_del(&ns->list);
 		del_gendisk(ns->disk);
 		nvme_ns_free(ns);
 	}
 
+	spin_lock(&dev_list_lock);
+	list_del(&dev->node);
+	spin_unlock(&dev_list_lock);
+
 	nvme_free_queues(dev);
 
 	return 0;
-- 
1.7.0.4

* [PATCH 6/8] NVMe: Wait for controller status ready to clear
  2013-02-20 23:52 [PATCH 1/8] NVMe: Queue bio requests on device Keith Busch
                   ` (3 preceding siblings ...)
  2013-02-20 23:52 ` [PATCH 5/8] NVMe: End queued bio requests for removed disks Keith Busch
@ 2013-02-20 23:52 ` Keith Busch
  2013-02-20 23:52 ` [PATCH 7/8] NVMe: Automatically reset failed controller Keith Busch
  2013-02-20 23:52 ` [PATCH 8/8] NVMe: Use schedule_timeout for sync commands Keith Busch
  6 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2013-02-20 23:52 UTC (permalink / raw)


The NVMe spec says the results are potentially undefined if we continue to
use the device after clearing enable without waiting for CSTS.RDY to clear.
This waits for the ready status to clear before proceeding.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme.c |   16 ++++++++++++++--
 1 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 199c182..0acccc1 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1045,6 +1045,20 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	unsigned long timeout;
 	struct nvme_queue *nvmeq;
 
+	cap = readq(&dev->bar->cap);
+	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+	writel(0, &dev->bar->cc);
+	while (!result && (readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
+		msleep(100);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (time_after(jiffies, timeout)) {
+			dev_err(&dev->pci_dev->dev,
+				"Device still ready; aborting initialisation\n");
+			return -ENODEV;
+		}
+	}
+
 	dev->dbs = ((void __iomem *)dev->bar) + 4096;
 
 	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
@@ -1059,13 +1073,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
 	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
 
-	writel(0, &dev->bar->cc);
 	writel(aqa, &dev->bar->aqa);
 	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
 	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
 	writel(dev->ctrl_config, &dev->bar->cc);
 
-	cap = readq(&dev->bar->cap);
 	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
 	dev->db_stride = NVME_CAP_STRIDE(cap);
 
-- 
1.7.0.4

* [PATCH 7/8] NVMe: Automatically reset failed controller
  2013-02-20 23:52 [PATCH 1/8] NVMe: Queue bio requests on device Keith Busch
                   ` (4 preceding siblings ...)
  2013-02-20 23:52 ` [PATCH 6/8] NVMe: Wait for controller status ready to clear Keith Busch
@ 2013-02-20 23:52 ` Keith Busch
  2013-02-20 23:52 ` [PATCH 8/8] NVMe: Use schedule_timeout for sync commands Keith Busch
  6 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2013-02-20 23:52 UTC (permalink / raw)


An NVMe controller may indicate that it has failed and requires a reset by
setting the CFS bit in its CSTS register. This patch has the polling thread
check that bit and automatically issue a reset when the controller reports
itself as failed.

Also, if the controller fails to return an IO, it is considered failed and
in need of a reset. Previously the polling thread would time out an IO and
free up its resources. This is dangerous because if the controller happens
to service that command in the future, the resources the driver allocated
for that IO are no longer valid, and the controller could end up accessing
memory that is now associated with something else, corrupting memory or
data.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme.c |   42 +++++++++++++++++++++++++++++++++++++++---
 1 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 0acccc1..120686c 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -58,6 +58,7 @@ module_param(use_threaded_interrupts, int, 0);
 static DEFINE_SPINLOCK(dev_list_lock);
 static LIST_HEAD(dev_list);
 static struct task_struct *nvme_thread;
+static struct workqueue_struct *nvme_workq;
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -82,6 +83,7 @@ struct nvme_dev {
 	u32 max_hw_sectors;
 	struct bio_list bio_list;
 	spinlock_t dev_lock;
+	struct work_struct ws;
 };
 
 /*
@@ -886,7 +888,7 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
  * @queue: The queue to cancel I/Os on
  * @timeout: True to only cancel I/Os which have timed out
  */
-static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
+static int nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
 {
 	int depth = nvmeq->q_depth - 1;
 	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
@@ -902,10 +904,15 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
 
 		if (timeout && !time_after(now, info[cmdid].timeout))
 			continue;
+		if (timeout)
+			return 1;
+
 		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
 		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
 		fn(nvmeq->dev, ctx, &cqe);
 	}
+
+	return 0;
 }
 
 static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
@@ -1341,6 +1348,12 @@ static int nvme_kthread(void *data)
 		spin_lock(&dev_list_lock);
 		list_for_each_entry(dev, &dev_list, node) {
 			int i;
+			if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
+				dev_warn(&dev->pci_dev->dev,
+					"failed status, reset controller\n");
+				queue_work(nvme_workq, &dev->ws);
+				continue;
+			}
 			for (i = 0; i < dev->queue_count; i++) {
 				struct nvme_queue *nvmeq = dev->queues[i];
 				if (!nvmeq)
@@ -1348,7 +1361,12 @@ static int nvme_kthread(void *data)
 				spin_lock_irq(&nvmeq->q_lock);
 				if (nvme_process_cq(nvmeq))
 					printk("process_cq did something\n");
-				nvme_cancel_ios(nvmeq, true);
+				if (nvme_cancel_ios(nvmeq, true)) {
+					dev_warn(&dev->pci_dev->dev,
+						"timed out I/O, reset controller\n");
+					queue_work(nvme_workq, &dev->ws);
+					break;
+				}
 				if (i)
 					nvme_resubmit_bios(nvmeq);
 				spin_unlock_irq(&nvmeq->q_lock);
@@ -1778,6 +1796,14 @@ static ssize_t reset_controller(struct device *dev,
 }
 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, reset_controller);
 
+static void nvme_reset_failed_dev(struct work_struct *ws)
+{
+	struct nvme_dev *dev = container_of(ws, struct nvme_dev, ws);
+	if (nvme_reset_controller(dev))
+		dev_warn(&dev->pci_dev->dev,
+					"failed to reset failed controller\n");
+}
+
 static int __devinit nvme_probe(struct pci_dev *pdev,
 						const struct pci_device_id *id)
 {
@@ -1830,6 +1856,8 @@ static int __devinit nvme_probe(struct pci_dev *pdev,
 		goto unmap;
 	dev->queue_count++;
 
+	INIT_WORK(&dev->ws, nvme_reset_failed_dev);
+
 	spin_lock(&dev_list_lock);
 	list_add(&dev->node, &dev_list);
 	spin_unlock(&dev_list_lock);
@@ -1955,9 +1983,14 @@ static int __init nvme_init(void)
 	if (IS_ERR(nvme_thread))
 		return PTR_ERR(nvme_thread);
 
+	result = -ENOMEM;
+	nvme_workq = create_workqueue("nvme");
+	if (!nvme_workq)
+		goto kill_kthread;
+
 	result = register_blkdev(nvme_major, "nvme");
 	if (result < 0)
-		goto kill_kthread;
+		goto kill_workq;
 	else if (result > 0)
 		nvme_major = result;
 
@@ -1968,6 +2001,8 @@ static int __init nvme_init(void)
 
  unregister_blkdev:
 	unregister_blkdev(nvme_major, "nvme");
+ kill_workq:
+	destroy_workqueue(nvme_workq);
  kill_kthread:
 	kthread_stop(nvme_thread);
 	return result;
@@ -1977,6 +2012,7 @@ static void __exit nvme_exit(void)
 {
 	pci_unregister_driver(&nvme_driver);
 	unregister_blkdev(nvme_major, "nvme");
+	destroy_workqueue(nvme_workq);
 	kthread_stop(nvme_thread);
 }
 
-- 
1.7.0.4

* [PATCH 8/8] NVMe: Use schedule_timeout for sync commands
  2013-02-20 23:52 [PATCH 1/8] NVMe: Queue bio requests on device Keith Busch
                   ` (5 preceding siblings ...)
  2013-02-20 23:52 ` [PATCH 7/8] NVMe: Automatically reset failed controller Keith Busch
@ 2013-02-20 23:52 ` Keith Busch
  6 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2013-02-20 23:52 UTC (permalink / raw)


Schedule a timeout for sync commands in case the controller fails to return
a completion and the device is not being polled for timed-out commands. This
can happen when a device's queues are being deleted.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/block/nvme.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 120686c..dfb2e57 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -755,7 +755,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
 
 	set_current_state(TASK_KILLABLE);
 	nvme_submit_cmd(nvmeq, cmd);
-	schedule();
+	schedule_timeout(timeout);
 
 	if (cmdinfo.status == -EINTR) {
 		nvme_abort_command(nvmeq, cmdid);
-- 
1.7.0.4

* [PATCH 2/8] NVMe: Controller reset from user
  2013-02-20 23:52 ` [PATCH 2/8] NVMe: Controller reset from user Keith Busch
@ 2013-02-26 14:04   ` Matthew Wilcox
  2013-03-11 17:02     ` Keith Busch
  0 siblings, 1 reply; 10+ messages in thread
From: Matthew Wilcox @ 2013-02-26 14:04 UTC (permalink / raw)


On Wed, Feb 20, 2013 at 04:52:39PM -0700, Keith Busch wrote:
> Allow a user to issue a controller reset. A reset does not delete the
> gendisks, so IO may continue and the namespaces may remain mounted. A user
> may need to reset the controller for any reason, for example when a reset is
> required as part of an activate firmware operation.

> @@ -265,11 +266,18 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
>  
>  static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
>  {
> -	return dev->queues[get_cpu() + 1];
> +	struct nvme_queue *nvmeq;
> +	spin_lock(&dev->dev_lock);
> +	nvmeq = dev->queues[get_cpu() + 1];
> +	if (nvmeq)
> +		atomic_inc(&nvmeq->busy);
> +	spin_unlock(&dev->dev_lock);
> +	return nvmeq;
>  }

OK, we can't do this.  Right now, submitting an I/O is entirely per-CPU
... this introduces cacheline contention for the dev_lock.  Even if it's
never contended, the cacheline has to be obtained from the other CPU,
so it'll be a major slowdown for large systems, and destroys scaling.

We have options to fix this.  Here are a few:

1. Use RCU.  You can find it documented in Documentation/RCU/ but the basic
idea is to use ordering to protect against future users of the queue, and
then wait for all existing users to be done with it before freeing it (a
rough sketch of this appears below, after option 2).

2. Avoid freeing the queue.  When resetting the controller, we don't
actually have to free the memory and then reallocate it; we can reprogram
the controller when it comes back up with the addresses that this memory
already has.  So what we could do is put a single bit in nvme_queue
... call it something like q_suspended, and check it thusly:

 static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
 {
-	return dev->queues[get_cpu() + 1];
+	struct nvme_queue *nvmeq = dev->queues[get_cpu() + 1];
+	if (nvmeq->q_suspended) {
+		put_cpu();
+		return NULL;
+ 	} else {
+ 		nvmeq->q_usage++;
+		return nvmeq;
+ 	}
 }

I think q_usage can be a simple variable at this point ... unless you
see a reason for it to be atomic?
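
For illustration, option 1 might look roughly like the following. This is a
hypothetical sketch, not from the posted patches: it assumes the
dev->queues[] entries become RCU-protected pointers (and <linux/rcupdate.h>
is pulled in), and callers would have to call put_nvmeq() even when
get_nvmeq() returns NULL, since the read-side lock is taken unconditionally:

static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
{
        rcu_read_lock();
        return rcu_dereference(dev->queues[get_cpu() + 1]);
}

static void put_nvmeq(struct nvme_queue *nvmeq)
{
        put_cpu();
        rcu_read_unlock();
}

static void nvme_remove_queue(struct nvme_dev *dev, int qid)
{
        struct nvme_queue *nvmeq = dev->queues[qid];

        /* unpublish the queue, then wait for in-flight get_nvmeq() users */
        rcu_assign_pointer(dev->queues[qid], NULL);
        synchronize_rcu();
        /* now safe to free the irq, cancel outstanding IO and free memory */
        nvme_free_queue_mem(nvmeq);
}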

> @@ -1630,6 +1660,108 @@ static void nvme_release_instance(struct nvme_dev *dev)
>  	spin_unlock(&dev_list_lock);
>  }
>  
> +static int nvme_shutdown_controller(struct nvme_dev *dev)
> +{
> +	int i;
> +	unsigned long timeout;
> +
> +	spin_lock(&dev_list_lock);
> +	list_del(&dev->node);
> +	spin_unlock(&dev_list_lock);
> +
> +	spin_lock(&dev->dev_lock);
> +	for (i = dev->queue_count; i < num_possible_cpus(); i++)
> +		dev->queues[i] = NULL;
> +	spin_unlock(&dev->dev_lock);
> +	nvme_free_queues(dev);
> +
> +	dev->ctrl_config |= NVME_CC_SHN_NORMAL;
> +	writel(dev->ctrl_config, &dev->bar->cc);
> +	timeout = HZ + jiffies;
> +
> +	while (!(readl(&dev->bar->csts) & NVME_CSTS_SHST_CMPLT)) {
> +		msleep(5);
> +		if (fatal_signal_pending(current))
> +			break;
> +		if (time_after(jiffies, timeout)) {
> +			dev_err(&dev->pci_dev->dev,
> +				"Device still ready; aborting shutdown\n");
> +			break;
> +		}
> +	}
> +
> +	pci_disable_msix(dev->pci_dev);
> +	iounmap(dev->bar);
> +	pci_disable_device(dev->pci_dev);
> +	pci_release_regions(dev->pci_dev);
> +
> +	return 0;
> +}
> +
> +static int nvme_restart_controller(struct nvme_dev *dev)
> +{
> +	int bars, result = -ENOMEM;
> +
> +	if (pci_enable_device_mem(dev->pci_dev))
> +		return -ENOMEM;
> +
> +	pci_set_master(dev->pci_dev);
> +	bars = pci_select_bars(dev->pci_dev, IORESOURCE_MEM);
> +	if (pci_request_selected_regions(dev->pci_dev, bars, "nvme"))
> +		goto disable_pci;
> +
> +	dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64));
> +	dma_set_coherent_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64));
> +	dev->entry[0].vector = dev->pci_dev->irq;
> +	dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), 8192);
> +	if (!dev->bar)
> +		goto disable;
> +
> +	result = nvme_configure_admin_queue(dev);
> +	if (result)
> +		goto unmap;
> +	dev->queue_count++;
> +
> +	spin_lock(&dev_list_lock);
> +	list_add(&dev->node, &dev_list);
> +	spin_unlock(&dev_list_lock);
> +
> +	result = nvme_setup_io_queues(dev);
> +	if (result)
> +		goto remove;
> +
> +	return 0;
> +
> + remove:
> +	nvme_dev_remove(dev);
> + unmap:
> +	iounmap(dev->bar);
> + disable:
> +	pci_release_regions(dev->pci_dev);
> + disable_pci:
> +	pci_disable_device(dev->pci_dev);
> +	return result;
> +}
> +
> +static int nvme_reset_controller(struct nvme_dev *dev)
> +{
> +	int ret = nvme_shutdown_controller(dev);
> +	if (ret)
> +		return ret;
> +	ret = nvme_restart_controller(dev);
> +	return ret;
> +}
> +
> +static ssize_t reset_controller(struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	struct pci_dev  *pdev = container_of(dev, struct pci_dev, dev);
> +	struct nvme_dev *ndev = pci_get_drvdata(pdev);
> +	nvme_reset_controller(ndev);
> +	return count;
> +}
> +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, reset_controller);
> +
>  static int __devinit nvme_probe(struct pci_dev *pdev,
>  						const struct pci_device_id *id)
>  {

The problem with this implementation of reset controller is that it
requires the controller to be sufficiently alive to process admin
commands.  I think we need reset controller to be able to work if the
admin queue is broken for some reason.  So just whacking the config
register, waiting for it to complete, then reprogramming the registers,
and resending all the admin init commands seems like the right move to me.
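
Purely to illustrate that flow, a hypothetical sketch (nvme_hard_reset is a
made-up name, not from these patches) that touches only the controller
registers and assumes the admin queue memory at dev->queues[0] was never
freed, so no admin commands are needed to get back up:

static int nvme_hard_reset(struct nvme_dev *dev)
{
        struct nvme_queue *adminq = dev->queues[0];
        u32 aqa = (adminq->q_depth - 1) | ((adminq->q_depth - 1) << 16);

        /* clear CC.EN and wait for CSTS.RDY to drop; no admin commands */
        writel(dev->ctrl_config & ~NVME_CC_ENABLE, &dev->bar->cc);
        while (readl(&dev->bar->csts) & NVME_CSTS_RDY)
                msleep(100);    /* bound by CAP.TO and fatal signals in real code */

        /* reprogram the admin queue registers with the memory we still own */
        writel(aqa, &dev->bar->aqa);
        writeq(adminq->sq_dma_addr, &dev->bar->asq);
        writeq(adminq->cq_dma_addr, &dev->bar->acq);
        writel(dev->ctrl_config | NVME_CC_ENABLE, &dev->bar->cc);
        while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY))
                msleep(100);

        /* then resend the admin init commands: set features, create IO queues */
        return 0;
}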

* [PATCH 2/8] NVMe: Controller reset from user
  2013-02-26 14:04   ` Matthew Wilcox
@ 2013-03-11 17:02     ` Keith Busch
  0 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2013-03-11 17:02 UTC (permalink / raw)


On Tue, 26 Feb 2013, Matthew Wilcox wrote:
> 2. Avoid freeing the queue.  When resetting the controller, we don't
> actually have to free the memory and then reallocate it; we can reprogram
> the controller when it comes back up with the addresses that this memory
> already has.  So what we could do is put a single bit in nvme_queue
> ... call it something like q_suspended, and check it thusly:

This sounds like a pretty good idea. The only problem I see is that the
queues must have been created in the first place, and if a controller was
security locked, it would not have allowed that operation.

I was trying to reuse the existing queue initialization as much as possible
in case something changes after the reset, like more CPUs being brought
online to allow more queues, or the controller configuration changing
because new firmware alters the queue count or depth. I think this can all
be accounted for without freeing the queue memory, but not quite as easily.

> The problem with this implementation of reset controller is that it
> requires the controller to be sufficiently alive to process admin
> commands.  I think we need reset controller to be able to work if the
> admin queue is broken for some reason.  So just whacking the config
> register, waiting for it to complete, then reprogramming the registers,
> and resending all the admin init commands seems like the right move to me.

The patch was following the shutdown sequence from the NVMe spec (7.6.2 in
revision 1.1). Hitting CC.EN to reset the controller isn't recommended
for normal shutdowns, but you're right, we can't hold up a shutdown
if the controller isn't responding to admin commands either. I added
schedule_timeout for sync commands in this patch set for commands that
do not complete, but that would hold things up for an unacceptable time
if the controller is not responding. Maybe we should have separate
graceful and abrupt shutdown sequences?
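
As a rough illustration of what that split might look like (hypothetical
sketch reusing the register writes from the patches above; the function
names are made up, and real code would bound both polls with a timeout):

static void nvme_shutdown_graceful(struct nvme_dev *dev)
{
        dev->ctrl_config |= NVME_CC_SHN_NORMAL;
        writel(dev->ctrl_config, &dev->bar->cc);
        /* poll CSTS.SHST for NVME_CSTS_SHST_CMPLT */
}

static void nvme_shutdown_abrupt(struct nvme_dev *dev)
{
        dev->ctrl_config &= ~NVME_CC_ENABLE;
        writel(dev->ctrl_config, &dev->bar->cc);
        /* poll for CSTS.RDY to clear */
}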

Anyway, thanks for the info. I'll take another shot at this.
