* [PATCH 0/7] Fixes and improvements
From: Keith Busch @ 2014-01-24 23:50 UTC


Starts off the same as this one:

http://merlin.infradead.org/pipermail/linux-nvme/2014-January/000624.html

... then I kept adding more on top of it, reviving the per-cpu queue
optimization and the CPU hot plug handling, and I think it's now in good shape.

If you want to test CPU hot plug, note this linux-nvme tree is currently
broken for that: bad timing on the merge from upstream during 3.13-rc pulled
in a scheduler bug, so you will need to merge upstream again or at the very
least cherry-pick these:

2cce5600486 sched: Avoid NULL dereference on sd_busy
a53cb752016 sched: Assign correct scheduling domain to 'sd_llc'

Keith Busch (7):
  NVMe: Namespace use after free on surprise removal
  NVMe: RCU access to nvme_queue
  NVMe: Initialization clean-up
  NVMe: Clean-up character device bring-up
  NVMe: Per-cpu IO queues
  NVMe: CPU hot plug notification
  NVMe: Share interrupt vectors among IO queues

 drivers/block/nvme-core.c |  377 +++++++++++++++++++++++++++++++++------------
 include/linux/nvme.h      |   11 +-
 2 files changed, 289 insertions(+), 99 deletions(-)

-- 
1.7.10.4

* [PATCH 1/7] NVMe: Namespace use after free on surprise removal
From: Keith Busch @ 2014-01-24 23:50 UTC


An nvme block device may have open references when the device is
removed. New commands may still be sent on the removed device, so we
need to ref count the opens, return errors for new commands, and not
free the namespace and nvme_dev until all references are closed.
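
Concretely, with the changes below: nvme_open() takes a reference on the
nvme_dev, nvme_dev_remove() only deletes the gendisk and cleans up the
request queue (so new commands are rejected), and the namespaces and the
nvme_dev itself are freed only once the last nvme_release() drops the
final kref and nvme_free_dev() runs.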

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c |   55 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b59a93a..7163a2f 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1712,10 +1712,31 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
 #define nvme_compat_ioctl	NULL
 #endif
 
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+	struct nvme_dev *dev = ns->dev;
+
+	kref_get(&dev->kref);
+	return 0;
+}
+
+static void nvme_free_dev(struct kref *kref);
+
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+	struct nvme_ns *ns = disk->private_data;
+	struct nvme_dev *dev = ns->dev;
+
+	kref_put(&dev->kref, nvme_free_dev);
+}
+
 static const struct block_device_operations nvme_fops = {
 	.owner		= THIS_MODULE,
 	.ioctl		= nvme_ioctl,
 	.compat_ioctl	= nvme_compat_ioctl,
+	.open		= nvme_open,
+	.release	= nvme_release,
 };
 
 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
@@ -1844,13 +1865,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 	return NULL;
 }
 
-static void nvme_ns_free(struct nvme_ns *ns)
-{
-	put_disk(ns->disk);
-	blk_cleanup_queue(ns->queue);
-	kfree(ns);
-}
-
 static int set_queue_count(struct nvme_dev *dev, int count)
 {
 	int status;
@@ -2281,12 +2295,13 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 
 static void nvme_dev_remove(struct nvme_dev *dev)
 {
-	struct nvme_ns *ns, *next;
+	struct nvme_ns *ns;
 
-	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
-		list_del(&ns->list);
-		del_gendisk(ns->disk);
-		nvme_ns_free(ns);
+	list_for_each_entry(ns, &dev->namespaces, list) {
+		if (ns->disk->flags & GENHD_FL_UP)
+			del_gendisk(ns->disk);
+		if (!blk_queue_dying(ns->queue))
+			blk_cleanup_queue(ns->queue);
 	}
 }
 
@@ -2343,9 +2358,22 @@ static void nvme_release_instance(struct nvme_dev *dev)
 	spin_unlock(&dev_list_lock);
 }
 
+static void nvme_free_namespaces(struct nvme_dev *dev)
+{
+	struct nvme_ns *ns, *next;
+
+	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+		list_del(&ns->list);
+		put_disk(ns->disk);
+		kfree(ns);
+	}
+}
+
 static void nvme_free_dev(struct kref *kref)
 {
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
+
+	nvme_free_namespaces(dev);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2518,6 +2546,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto release_pools;
 	}
 
+	kref_init(&dev->kref);
 	result = nvme_dev_add(dev);
 	if (result)
 		goto shutdown;
@@ -2533,11 +2562,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto remove;
 
 	dev->initialized = 1;
-	kref_init(&dev->kref);
 	return 0;
 
  remove:
 	nvme_dev_remove(dev);
+	nvme_free_namespaces(dev); 
  shutdown:
 	nvme_dev_shutdown(dev);
  release_pools:
-- 
1.7.10.4

* [PATCH 2/7] NVMe: RCU access to nvme_queue
From: Keith Busch @ 2014-01-24 23:50 UTC


This adds RCU-protected access to nvme_queue to fix a potential race
between a surprise removal freeing the queue and a thread with an open
reference on an NVMe block device still using that queue.
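
In outline, the access pattern this introduces looks like the following
(a condensed sketch for illustration, not the literal driver code; qid
stands in for whichever queue index the caller uses):

	/* reader (IO submission path) */
	rcu_read_lock();
	nvmeq = rcu_dereference(dev->queues[qid]);
	if (!nvmeq) {
		rcu_read_unlock();
		return;				/* queue already torn down */
	}
	/* ... use nvmeq; it cannot be freed until rcu_read_unlock() ... */
	rcu_read_unlock();

	/* updater (removal path) */
	rcu_assign_pointer(dev->queues[qid], NULL);	/* unpublish */
	call_rcu(&nvmeq->r_head, nvme_free_queue);	/* free after readers drain */

	/* and in nvme_remove(), before the final teardown */
	rcu_barrier();				/* wait for pending free callbacks */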

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c |   53 ++++++++++++++++++++-------------------------
 include/linux/nvme.h      |    2 +-
 2 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 7163a2f..e27f8d7 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -74,6 +74,7 @@ struct async_cmd_info {
  * commands and one for I/O commands).
  */
 struct nvme_queue {
+	struct rcu_head r_head;
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
 	spinlock_t q_lock;
@@ -263,12 +264,16 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 
 struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
 {
-	return dev->queues[get_cpu() + 1];
+	int queue;
+	rcu_read_lock();
+	queue = get_cpu() + 1;
+	return rcu_dereference(dev->queues[queue]);
 }
 
 void put_nvmeq(struct nvme_queue *nvmeq)
 {
 	put_cpu();
+	rcu_read_unlock();
 }
 
 /**
@@ -818,9 +823,9 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio)
 	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
 	int result = -EBUSY;
 
-	if (!nvmeq) {
+	if (unlikely(!nvmeq)) {
 		put_nvmeq(NULL);
-		bio_endio(bio, -EIO);
+		bio_endio(bio, -ENXIO);
 		return;
 	}
 
@@ -1135,8 +1140,10 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
 	}
 }
 
-static void nvme_free_queue(struct nvme_queue *nvmeq)
+static void nvme_free_queue(struct rcu_head *r)
 {
+	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
+
 	spin_lock_irq(&nvmeq->q_lock);
 	while (bio_list_peek(&nvmeq->sq_cong)) {
 		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
@@ -1155,10 +1162,13 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
 	int i;
 
+	for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
+		rcu_assign_pointer(dev->queues[i], NULL);
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
-		nvme_free_queue(dev->queues[i]);
+		struct nvme_queue *nvmeq = dev->queues[i];
+		rcu_assign_pointer(dev->queues[i], NULL);
+		call_rcu(&nvmeq->r_head, nvme_free_queue);
 		dev->queue_count--;
-		dev->queues[i] = NULL;
 	}
 }
 
@@ -1778,8 +1788,11 @@ static int nvme_kthread(void *data)
 				queue_work(nvme_workq, &dev->reset_work);
 				continue;
 			}
+
+			rcu_read_lock();
 			for (i = 0; i < dev->queue_count; i++) {
-				struct nvme_queue *nvmeq = dev->queues[i];
+				struct nvme_queue *nvmeq =
+						rcu_dereference(dev->queues[i]);
 				if (!nvmeq)
 					continue;
 				spin_lock_irq(&nvmeq->q_lock);
@@ -1791,6 +1804,7 @@ static int nvme_kthread(void *data)
  unlock:
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
+			rcu_read_unlock();
 		}
 		spin_unlock(&dev_list_lock);
 		schedule_timeout(round_jiffies_relative(HZ));
@@ -1956,19 +1970,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	}
 
 	/* Free previously allocated queues that are no longer usable */
-	spin_lock(&dev_list_lock);
-	for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
-		struct nvme_queue *nvmeq = dev->queues[i];
-
-		spin_lock_irq(&nvmeq->q_lock);
-		nvme_cancel_ios(nvmeq, false);
-		spin_unlock_irq(&nvmeq->q_lock);
-
-		nvme_free_queue(nvmeq);
-		dev->queue_count--;
-		dev->queues[i] = NULL;
-	}
-	spin_unlock(&dev_list_lock);
+	nvme_free_queues(dev, nr_io_queues + 1);
 
 	cpu = cpumask_first(cpu_online_mask);
 	for (i = 0; i < nr_io_queues; i++) {
@@ -2459,18 +2461,10 @@ static int nvme_remove_dead_ctrl(void *arg)
 
 static void nvme_remove_disks(struct work_struct *ws)
 {
-	int i;
 	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
 
 	nvme_dev_remove(dev);
-	spin_lock(&dev_list_lock);
-	for (i = dev->queue_count - 1; i > 0; i--) {
-		BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended);
-		nvme_free_queue(dev->queues[i]);
-		dev->queue_count--;
-		dev->queues[i] = NULL;
-	}
-	spin_unlock(&dev_list_lock);
+	nvme_free_queues(dev, 1);
 }
 
 static int nvme_dev_resume(struct nvme_dev *dev)
@@ -2595,6 +2589,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_remove(dev);
 	nvme_dev_shutdown(dev);
 	nvme_free_queues(dev, 0);
+	rcu_barrier();
 	nvme_release_instance(dev);
 	nvme_release_prp_pools(dev);
 	kref_put(&dev->kref, nvme_free_dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 69ae03f..98d367b 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -73,7 +73,7 @@ enum {
  */
 struct nvme_dev {
 	struct list_head node;
-	struct nvme_queue **queues;
+	struct nvme_queue __rcu **queues;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
 	struct dma_pool *prp_page_pool;
-- 
1.7.10.4

* [PATCH 3/7] NVMe: Initialization clean-up
From: Keith Busch @ 2014-01-24 23:50 UTC


Just cleaning up harmless but unnecessary initialization code in the
queue allocation and queue init paths.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c |    4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e27f8d7..e9a4acc 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1245,12 +1245,9 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	nvmeq->q_dmadev = dmadev;
 	nvmeq->dev = dev;
 	spin_lock_init(&nvmeq->q_lock);
-	nvmeq->cq_head = 0;
-	nvmeq->cq_phase = 1;
 	init_waitqueue_head(&nvmeq->sq_full);
 	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
 	bio_list_init(&nvmeq->sq_cong);
-	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 	nvmeq->q_depth = depth;
 	nvmeq->cq_vector = vector;
 	nvmeq->qid = qid;
@@ -1289,7 +1286,6 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 	memset(nvmeq->cmdid_data, 0, extra);
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
-	nvme_cancel_ios(nvmeq, false);
 	nvmeq->q_suspended = 0;
 }
 
-- 
1.7.10.4

* [PATCH 4/7] NVMe: Clean-up character device bring-up
From: Keith Busch @ 2014-01-24 23:50 UTC


... because the way of getting an nvme character device up when the
controller is otherwise unable to create IO queues was a bit tacky.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c |   23 ++++++++++++-----------
 include/linux/nvme.h      |    1 +
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e9a4acc..076987e 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1193,6 +1193,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 	irq_set_affinity_hint(vector, NULL);
 	free_irq(vector, nvmeq);
 
+	nvmeq->dev->online_queues--;
 	return 0;
 }
 
@@ -1287,6 +1288,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	memset(nvmeq->cmdid_data, 0, extra);
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
 	nvmeq->q_suspended = 0;
+	nvmeq->dev->online_queues++;
 }
 
 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
@@ -1884,7 +1886,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
 								&result);
 	if (status)
-		return status < 0 ? -EIO : -EBUSY;
+		return status < 0 ? -EIO : 0;
 	return min(result & 0xffff, result >> 16) + 1;
 }
 
@@ -1900,7 +1902,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
 	nr_io_queues = num_online_cpus();
 	result = set_queue_count(dev, nr_io_queues);
-	if (result < 0)
+	if (result <= 0)
 		return result;
 	if (result < nr_io_queues)
 		nr_io_queues = result;
@@ -2429,7 +2431,7 @@ static int nvme_dev_start(struct nvme_dev *dev)
 	spin_unlock(&dev_list_lock);
 
 	result = nvme_setup_io_queues(dev);
-	if (result && result != -EBUSY)
+	if (result)
 		goto disable;
 
 	return result;
@@ -2468,9 +2470,9 @@ static int nvme_dev_resume(struct nvme_dev *dev)
 	int ret;
 
 	ret = nvme_dev_start(dev);
-	if (ret && ret != -EBUSY)
+	if (ret)
 		return ret;
-	if (ret == -EBUSY) {
+	if (dev->online_queues < 2) {
 		spin_lock(&dev_list_lock);
 		INIT_WORK(&dev->reset_work, nvme_remove_disks);
 		queue_work(nvme_workq, &dev->reset_work);
@@ -2530,18 +2532,17 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto release;
 
 	result = nvme_dev_start(dev);
-	if (result) {
-		if (result == -EBUSY)
-			goto create_cdev;
+	if (result)
 		goto release_pools;
-	}
 
 	kref_init(&dev->kref);
-	result = nvme_dev_add(dev);
+
+	/* Don't bother adding disks if we don't have online IO queues */
+	if (dev->online_queues > 1)
+		result = nvme_dev_add(dev);
 	if (result)
 		goto shutdown;
 
- create_cdev:
 	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
 	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
 	dev->miscdev.parent = &pdev->dev;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 98d367b..2fef3ce 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -80,6 +80,7 @@ struct nvme_dev {
 	struct dma_pool *prp_small_pool;
 	int instance;
 	int queue_count;
+	unsigned online_queues;
 	u32 db_stride;
 	u32 ctrl_config;
 	struct msix_entry *entry;
-- 
1.7.10.4

* [PATCH 5/7] NVMe: Per-cpu IO queues
From: Keith Busch @ 2014-01-24 23:50 UTC


NVMe IO queues are associated with CPUs, and Linux provides a handy
per-cpu implementation. This gives us a convenient way to optimally
assign queues to multiple cpus when the device supports fewer queues
than the host has cpus. The previous implementation did not share these
optimally and may have shared very poorly in some situations. This new
way shares queues among cpus that are "close" together and should
have the lowest penalty for lock contention.
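
As a worked example (hypothetical numbers), consider 32 online cpus and
a controller that exposes 6 usable IO queues: cpus_per_queue is
32 / 6 = 5, and the remainder handling gives the first four queues 5
cpus each and the last two queues 6 cpus each, covering all 32 cpus.
The cpus grouped onto a queue are chosen preferring thread siblings,
then the same core/socket, then the same or closest NUMA node, then
whatever else is left unassigned.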

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c |  213 ++++++++++++++++++++++++++++++++++++---------
 include/linux/nvme.h      |    5 +-
 2 files changed, 175 insertions(+), 43 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 076987e..c85b369 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -20,6 +20,7 @@
 #include <linux/bio.h>
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
+#include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -35,6 +36,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
+#include <linux/percpu.h>
 #include <linux/poison.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
@@ -95,6 +97,7 @@ struct nvme_queue {
 	u8 cq_phase;
 	u8 cqe_seen;
 	u8 q_suspended;
+	cpumask_t cpu_mask;
 	struct async_cmd_info cmdinfo;
 	unsigned long cmdid_data[];
 };
@@ -264,15 +267,13 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 
 struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
 {
-	int queue;
 	rcu_read_lock();
-	queue = get_cpu() + 1;
-	return rcu_dereference(dev->queues[queue]);
+	return rcu_dereference(*get_cpu_ptr(dev->io_queues));
 }
 
 void put_nvmeq(struct nvme_queue *nvmeq)
 {
-	put_cpu();
+	put_cpu_ptr(nvmeq->dev->io_queues);
 	rcu_read_unlock();
 }
 
@@ -1160,12 +1161,17 @@ static void nvme_free_queue(struct rcu_head *r)
 
 static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
-	int i;
+	int i, cpu;
 
-	for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
-		rcu_assign_pointer(dev->queues[i], NULL);
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
 		struct nvme_queue *nvmeq = dev->queues[i];
+
+		for_each_cpu(cpu, &nvmeq->cpu_mask) {
+			rcu_assign_pointer(
+				*per_cpu_ptr(dev->io_queues, cpu),
+				NULL);
+			cpumask_clear_cpu(cpu, &nvmeq->cpu_mask);
+		}
 		rcu_assign_pointer(dev->queues[i], NULL);
 		call_rcu(&nvmeq->r_head, nvme_free_queue);
 		dev->queue_count--;
@@ -1253,6 +1259,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	nvmeq->cq_vector = vector;
 	nvmeq->qid = qid;
 	nvmeq->q_suspended = 1;
+	cpumask_clear(&nvmeq->cpu_mask);
+	rcu_assign_pointer(dev->queues[qid], nvmeq);
 	dev->queue_count++;
 
 	return nvmeq;
@@ -1288,7 +1296,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	memset(nvmeq->cmdid_data, 0, extra);
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
 	nvmeq->q_suspended = 0;
-	nvmeq->dev->online_queues++;
+	dev->online_queues++;
 }
 
 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
@@ -1877,6 +1885,147 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 	return NULL;
 }
 
+static int nvme_find_closest_node(int node)
+{
+	int n, val, min_val = INT_MAX, best_node = node;
+
+	for_each_online_node(n) {
+		if (n == node)
+			continue;
+		val = node_distance(node, n);
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+	return best_node;
+}
+
+static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
+								int count)
+{
+	int cpu;
+	for_each_cpu(cpu, qmask) {
+		if (cpus_weight(nvmeq->cpu_mask) >= count)
+			break;
+		if (!cpumask_test_and_set_cpu(cpu, &nvmeq->cpu_mask))
+			rcu_assign_pointer(
+				*per_cpu_ptr(nvmeq->dev->io_queues, cpu),
+				nvmeq);
+	}
+}
+
+static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+	const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
+{
+	int next_cpu;
+	for_each_cpu(next_cpu, new_mask) {
+		cpumask_or(mask, mask, get_cpu_mask(next_cpu));
+		cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
+		cpumask_and(mask, mask, unassigned_cpus);
+		nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
+	}
+}
+
+static void nvme_create_io_queues(struct nvme_dev *dev)
+{
+	unsigned i, max;
+
+	max = min(dev->max_qid, num_online_cpus());
+	for (i = dev->queue_count; i <= max; i++)
+		if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
+			break;
+
+	max = min(dev->queue_count - 1, num_online_cpus());
+	for (i = dev->online_queues; i <= max; i++)
+		if (nvme_create_queue(dev->queues[i], i))
+			break;
+}
+
+/*
+ * If there are fewer queues than online cpus, this will try to optimally
+ * assign a queue to multiple cpus by grouping cpus that are "close" together:
+ * thread siblings, core, socket, closest node, then whatever else is
+ * available.
+ */
+static void nvme_assign_io_queues(struct nvme_dev *dev)
+{
+	unsigned cpu, cpus_per_queue, queues, remainder, i;
+	cpumask_t unassigned_cpus;
+
+	nvme_create_io_queues(dev);
+
+	queues = min(dev->online_queues - 1, num_online_cpus());
+	if (!queues)
+		return;
+
+	cpus_per_queue = num_online_cpus() / queues;
+	remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
+
+	unassigned_cpus = *cpu_online_mask;
+	cpu = cpumask_first(&unassigned_cpus);
+	for (i = 1; i <= queues; i++) {
+		struct nvme_queue *nvmeq = dev->queues[i];
+		cpumask_t mask;
+
+		cpumask_clear(&nvmeq->cpu_mask);
+		if (!cpus_weight(unassigned_cpus))
+			break;
+
+		mask = *get_cpu_mask(cpu);
+		nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				topology_thread_cpumask(cpu),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				topology_core_cpumask(cpu),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				cpumask_of_node(cpu_to_node(cpu)),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				cpumask_of_node(
+					nvme_find_closest_node(
+						cpu_to_node(cpu))),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, &unassigned_cpus,
+				&unassigned_cpus,
+				nvmeq, cpus_per_queue);
+
+		WARN(cpus_weight(nvmeq->cpu_mask) != cpus_per_queue,
+			"nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
+			dev->instance, i);
+
+		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
+							&nvmeq->cpu_mask);
+
+		cpumask_andnot(&unassigned_cpus, &unassigned_cpus,
+						&nvmeq->cpu_mask);
+
+		cpu = cpumask_next(cpu, &unassigned_cpus);
+		if (remainder && !--remainder)
+			cpus_per_queue++;
+	}
+	WARN(cpus_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
+								dev->instance);
+
+	/*
+	 * All possible cpus must point to a valid queue. We don't have thread
+	 * sibling info on offline cpus, so no sharing optimization on these
+	 * cpus.
+	 */
+	cpumask_andnot(&unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+	i = 0;
+	for_each_cpu(cpu, &unassigned_cpus)
+		rcu_assign_pointer(*per_cpu_ptr(dev->io_queues, cpu),
+					dev->queues[(i++ % queues) + 1]);
+}
+
 static int set_queue_count(struct nvme_dev *dev, int count)
 {
 	int status;
@@ -1898,9 +2047,9 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct pci_dev *pdev = dev->pci_dev;
-	int result, cpu, i, vecs, nr_io_queues, size, q_depth;
+	int result, i, vecs, nr_io_queues, size;
 
-	nr_io_queues = num_online_cpus();
+	nr_io_queues = num_possible_cpus();
 	result = set_queue_count(dev, nr_io_queues);
 	if (result <= 0)
 		return result;
@@ -1960,6 +2109,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * number of interrupts.
 	 */
 	nr_io_queues = vecs;
+	dev->max_qid = nr_io_queues;
 
 	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
 	if (result) {
@@ -1969,37 +2119,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
 	/* Free previously allocated queues that are no longer usable */
 	nvme_free_queues(dev, nr_io_queues + 1);
-
-	cpu = cpumask_first(cpu_online_mask);
-	for (i = 0; i < nr_io_queues; i++) {
-		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
-		cpu = cpumask_next(cpu, cpu_online_mask);
-	}
-
-	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
-								NVME_Q_DEPTH);
-	for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
-		dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i);
-		if (!dev->queues[i + 1]) {
-			result = -ENOMEM;
-			goto free_queues;
-		}
-	}
-
-	for (; i < num_possible_cpus(); i++) {
-		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
-		dev->queues[i + 1] = dev->queues[target + 1];
-	}
-
-	for (i = 1; i < dev->queue_count; i++) {
-		result = nvme_create_queue(dev->queues[i], i);
-		if (result) {
-			for (--i; i > 0; i--)
-				nvme_disable_queue(dev, i);
-			goto free_queues;
-		}
-	}
-
+	nvme_assign_io_queues(dev);
 	return 0;
 
  free_queues:
@@ -2077,6 +2197,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 
 static int nvme_dev_map(struct nvme_dev *dev)
 {
+	u64 cap;
 	int bars, result = -ENOMEM;
 	struct pci_dev *pdev = dev->pci_dev;
 
@@ -2100,7 +2221,9 @@ static int nvme_dev_map(struct nvme_dev *dev)
 		result = -ENODEV;
 		goto unmap;
 	}
-	dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap));
+	cap = readq(&dev->bar->cap);
+	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
+	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
 	dev->dbs = ((void __iomem *)dev->bar) + 4096;
 
 	return 0;
@@ -2374,6 +2497,7 @@ static void nvme_free_dev(struct kref *kref)
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
 
 	nvme_free_namespaces(dev);
+	free_percpu(dev->io_queues);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2519,6 +2643,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 								GFP_KERNEL);
 	if (!dev->queues)
 		goto free;
+	dev->io_queues = alloc_percpu(struct nvme_queue *);
+	if (!dev->io_queues)
+		goto free;
 
 	INIT_LIST_HEAD(&dev->namespaces);
 	dev->pci_dev = pdev;
@@ -2566,6 +2693,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  release:
 	nvme_release_instance(dev);
  free:
+	free_percpu(dev->io_queues);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2672,6 +2800,7 @@ static int __init nvme_init(void)
 	result = pci_register_driver(&nvme_driver);
 	if (result)
 		goto unregister_blkdev;
+
 	return 0;
 
  unregister_blkdev:
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 2fef3ce..58ffaea 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -74,13 +74,16 @@ enum {
 struct nvme_dev {
 	struct list_head node;
 	struct nvme_queue __rcu **queues;
+	struct nvme_queue __rcu __percpu **io_queues;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
 	struct dma_pool *prp_page_pool;
 	struct dma_pool *prp_small_pool;
 	int instance;
-	int queue_count;
+	unsigned queue_count;
 	unsigned online_queues;
+	unsigned max_qid;
+	int q_depth;
 	u32 db_stride;
 	u32 ctrl_config;
 	struct msix_entry *entry;
-- 
1.7.10.4

* [PATCH 6/7] NVMe: CPU hot plug notification
From: Keith Busch @ 2014-01-24 23:50 UTC


Register with hot cpu notification to rebalance, and potentially
allocate additional, IO queues among cpus when they come online or go
offline.
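
With this in place, a CPU_ONLINE or CPU_DEAD event re-runs
nvme_assign_io_queues(), which rebalances the queue-to-cpu mapping and,
via nvme_create_io_queues(), can bring up additional IO queues if the
controller still has unused queue ids for newly onlined cpus.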

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c |   23 ++++++++++++++++++++++-
 include/linux/nvme.h      |    1 +
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index c85b369..f59af3f 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -2017,7 +2017,8 @@ static void nvme_assign_io_queues(struct nvme_dev *dev)
 	/*
 	 * All possible cpus must point to a valid queue. We don't have thread
 	 * sibling info on offline cpus, so no sharing optimization on these
-	 * cpus.
+	 * cpus. These should automatically be rebalanced from hot plug
+	 * notification.
 	 */
 	cpumask_andnot(&unassigned_cpus, cpu_possible_mask, cpu_online_mask);
 	i = 0;
@@ -2044,6 +2045,19 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
 }
 
+static int nvme_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	struct nvme_dev *dev = container_of(self, struct nvme_dev, nb);
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		nvme_assign_io_queues(dev);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct pci_dev *pdev = dev->pci_dev;
@@ -2120,6 +2134,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	/* Free previously allocated queues that are no longer usable */
 	nvme_free_queues(dev, nr_io_queues + 1);
 	nvme_assign_io_queues(dev);
+
+	dev->nb.notifier_call = &nvme_cpu_notify;
+	result = register_hotcpu_notifier(&dev->nb);
+	if (result)
+		goto free_queues;
+
 	return 0;
 
  free_queues:
@@ -2397,6 +2417,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 	int i;
 
 	dev->initialized = 0;
+	unregister_hotcpu_notifier(&dev->nb);
 
 	spin_lock(&dev_list_lock);
 	list_del_init(&dev->node);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 58ffaea..beeddee 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -92,6 +92,7 @@ struct nvme_dev {
 	struct kref kref;
 	struct miscdevice miscdev;
 	struct work_struct reset_work;
+	struct notifier_block nb;
 	char name[12];
 	char serial[20];
 	char model[40];
-- 
1.7.10.4

* [PATCH 7/7] NVMe: Share interrupt vectors among IO queues
From: Keith Busch @ 2014-01-24 23:50 UTC


The driver code contained a comment proposing to investigate whether
there's a win from allocating more IO queues than interrupt vectors. The
answer is "yes", it's a win. Well, it's a win most of the time, but it's
never a loss, and the win was enough for me to post a patch proposal.

For the most extreme case, I used a 32 core machine, forced allocation of
a single MSI-X interrupt per nvme device, and compared a single NVMe IO
queue with 32 IO queues for various IO workloads. The 32-queue results
were always greater than or equal to the single-queue results, and I
measured a whopping 22% improvement in IOPS from sharing the vector among
multiple queues in the workload below.

For reference, this is the fio profile I used that showed the biggest win
(not listing all 32 jobs, but the pattern should be obvious):

; per-cpu 4k O_DIRECT async rand read
[global]
rw=randread
bs=4k
direct=1
filename=/dev/nvme0n1
ioengine=libaio
iodepth=64

[cpu0]
cpus_allowed=0

[cpu1]
cpus_allowed=1

...

[cpu30]
cpus_allowed=30

[cpu31]
cpus_allowed=31

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/block/nvme-core.c |   42 +++++++++++++++++++++++++++---------------
 include/linux/nvme.h      |    2 ++
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f59af3f..fb009a2 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1186,7 +1186,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
  */
 static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 {
-	int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
+	struct nvme_dev *dev = nvmeq->dev;
+	int vector = dev->entry[nvmeq->cq_vector].vector;
 
 	spin_lock_irq(&nvmeq->q_lock);
 	if (nvmeq->q_suspended) {
@@ -1199,7 +1200,13 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 	irq_set_affinity_hint(vector, NULL);
 	free_irq(vector, nvmeq);
 
-	nvmeq->dev->online_queues--;
+	cpumask_andnot(&dev->affinity_masks[vector],
+				&dev->affinity_masks[vector],
+				&nvmeq->cpu_mask);
+	if (cpus_weight(dev->affinity_masks[vector]))
+		irq_set_affinity_hint(vector, &dev->affinity_masks[vector]);
+
+	dev->online_queues--;
 	return 0;
 }
 
@@ -1927,13 +1934,14 @@ static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
 	}
 }
 
+/* XXX: How to share irqs among queues before we've assigned them to cpus? */
 static void nvme_create_io_queues(struct nvme_dev *dev)
 {
 	unsigned i, max;
 
 	max = min(dev->max_qid, num_online_cpus());
 	for (i = dev->queue_count; i <= max; i++)
-		if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
+		if (!nvme_alloc_queue(dev, i, dev->q_depth, i % dev->nr_vecs))
 			break;
 
 	max = min(dev->queue_count - 1, num_online_cpus());
@@ -1953,6 +1961,8 @@ static void nvme_assign_io_queues(struct nvme_dev *dev)
 	unsigned cpu, cpus_per_queue, queues, remainder, i;
 	cpumask_t unassigned_cpus;
 
+	for (i = 0; i < dev->nr_vecs; i++)
+		cpumask_clear(&dev->affinity_masks[i]);
 	nvme_create_io_queues(dev);
 
 	queues = min(dev->online_queues - 1, num_online_cpus());
@@ -2001,9 +2011,9 @@ static void nvme_assign_io_queues(struct nvme_dev *dev)
 			"nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
 			dev->instance, i);
 
-		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
-							&nvmeq->cpu_mask);
-
+		cpumask_or(&dev->affinity_masks[nvmeq->cq_vector],
+				&dev->affinity_masks[nvmeq->cq_vector],
+				&nvmeq->cpu_mask);
 		cpumask_andnot(&unassigned_cpus, &unassigned_cpus,
 						&nvmeq->cpu_mask);
 
@@ -2088,7 +2098,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	/* Deregister the admin queue's interrupt */
 	free_irq(dev->entry[0].vector, dev->queues[0]);
 
-	vecs = nr_io_queues;
+	dev->max_qid = vecs = nr_io_queues;
 	for (i = 0; i < vecs; i++)
 		dev->entry[i].entry = i;
 	for (;;) {
@@ -2116,14 +2126,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 		}
 	}
 
-	/*
-	 * Should investigate if there's a performance win from allocating
-	 * more queues than interrupt vectors; it might allow the submission
-	 * path to scale better, even if the receive path is limited by the
-	 * number of interrupts.
-	 */
-	nr_io_queues = vecs;
-	dev->max_qid = nr_io_queues;
+	dev->nr_vecs = vecs;
+	for (i = 0; i < vecs; i++)
+		irq_set_affinity_hint(dev->entry[i].vector,
+				&dev->affinity_masks[i]);
 
 	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
 	if (result) {
@@ -2519,6 +2525,7 @@ static void nvme_free_dev(struct kref *kref)
 
 	nvme_free_namespaces(dev);
 	free_percpu(dev->io_queues);
+	kfree(dev->affinity_masks);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2664,6 +2671,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 								GFP_KERNEL);
 	if (!dev->queues)
 		goto free;
+	dev->affinity_masks = kcalloc(num_possible_cpus() + 1,
+				sizeof(*dev->affinity_masks), GFP_KERNEL);
+	if (!dev->affinity_masks)
+		goto free;
 	dev->io_queues = alloc_percpu(struct nvme_queue *);
 	if (!dev->io_queues)
 		goto free;
@@ -2715,6 +2726,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_release_instance(dev);
  free:
 	free_percpu(dev->io_queues);
+	kfree(dev->affinity_masks);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index beeddee..b5f0352 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -83,10 +83,12 @@ struct nvme_dev {
 	unsigned queue_count;
 	unsigned online_queues;
 	unsigned max_qid;
+	unsigned nr_vecs;
 	int q_depth;
 	u32 db_stride;
 	u32 ctrl_config;
 	struct msix_entry *entry;
+	cpumask_t *affinity_masks;
 	struct nvme_bar __iomem *bar;
 	struct list_head namespaces;
 	struct kref kref;
-- 
1.7.10.4

* [PATCH 5/7] NVMe: Per-cpu IO queues
From: Matthew Wilcox @ 2014-01-31 17:47 UTC


On Fri, Jan 24, 2014 at 04:50:52PM -0700, Keith Busch wrote:
> NVMe IO queues are associated with CPUs, and linux provices a handy
> per-cpu implementation. This gives us a convienient way to optimally
> assign queues to multiple cpus when the device supports fewer queues
> than the host has cpus. The previous implementation did not share these
> optimally and may have shared very poorly in some situations. This new
> way will share queues among cpus that are "close" together and should
> have the lowest penalty for lock contention.

I got to thinking about this one after sparse flagged a couple of
problems.  Why not do it this way?

The advantage is that it only requires one percpu allocation.

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e302f55..0afa8ee 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -268,16 +268,15 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 
 struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
 {
-	struct nvme_queue __rcu **nvmeqp;
+	unsigned short i = get_cpu_var(*dev->io_queue);
 	rcu_read_lock();
-	nvmeqp = get_cpu_ptr(dev->io_queues);
-	return rcu_dereference(*nvmeqp);
+	return rcu_dereference(dev->queues[i]);
 }
 
 void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 {
-	put_cpu_ptr(nvmeq->dev->io_queues);
 	rcu_read_unlock();
+	put_cpu_var(nvmeq->dev->io_queue);
 }
 
 /**
@@ -1171,9 +1170,6 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 		struct nvme_queue *nvmeq = dev->queues[i];
 
 		for_each_cpu(cpu, &nvmeq->cpu_mask) {
-			rcu_assign_pointer(
-				*per_cpu_ptr(dev->io_queues, cpu),
-				NULL);
 			cpumask_clear_cpu(cpu, &nvmeq->cpu_mask);
 		}
 		rcu_assign_pointer(dev->queues[i], NULL);
@@ -1923,9 +1919,7 @@ static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
 		if (cpus_weight(nvmeq->cpu_mask) >= count)
 			break;
 		if (!cpumask_test_and_set_cpu(cpu, &nvmeq->cpu_mask))
-			rcu_assign_pointer(
-				*per_cpu_ptr(nvmeq->dev->io_queues, cpu),
-				nvmeq);
+			*per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
 	}
 }
 
@@ -2040,8 +2034,7 @@ static void nvme_assign_io_queues(struct nvme_dev *dev)
 	cpumask_andnot(&unassigned_cpus, cpu_possible_mask, cpu_online_mask);
 	i = 0;
 	for_each_cpu(cpu, &unassigned_cpus)
-		rcu_assign_pointer(*per_cpu_ptr(dev->io_queues, cpu),
-					dev->queues[(i++ % queues) + 1]);
+		*per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
 }
 
 static int set_queue_count(struct nvme_dev *dev, int count)
@@ -2532,7 +2525,7 @@ static void nvme_free_dev(struct kref *kref)
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
 
 	nvme_free_namespaces(dev);
-	free_percpu(dev->io_queues);
+	free_percpu(dev->io_queue);
 	kfree(dev->affinity_masks);
 	kfree(dev->queues);
 	kfree(dev->entry);
@@ -2683,8 +2676,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 				sizeof(*dev->affinity_masks), GFP_KERNEL);
 	if (!dev->affinity_masks)
 		goto free;
-	dev->io_queues = alloc_percpu(struct nvme_queue *);
-	if (!dev->io_queues)
+	dev->io_queue = alloc_percpu(unsigned short);
+	if (!dev->io_queue)
 		goto free;
 
 	INIT_LIST_HEAD(&dev->namespaces);
@@ -2734,7 +2727,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  release:
 	nvme_release_instance(dev);
  free:
-	free_percpu(dev->io_queues);
+	free_percpu(dev->io_queue);
 	kfree(dev->affinity_masks);
 	kfree(dev->queues);
 	kfree(dev->entry);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 89966c0..0c051e5 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -74,7 +74,7 @@ enum {
 struct nvme_dev {
 	struct list_head node;
 	struct nvme_queue __rcu **queues;
-	struct nvme_queue __rcu * __percpu *io_queues;
+	unsigned short __percpu *io_queue;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
 	struct dma_pool *prp_page_pool;
