* [RFC PATCH 1/3] nvme: split admin queue in pci
2021-09-24 21:08 [RFC PATCH 0/3] nvme sq associations Andrey Nikitin
@ 2021-09-24 21:08 ` Andrey Nikitin
2021-09-24 21:08 ` [RFC PATCH 2/3] nvme: add NVM set structures Andrey Nikitin
` (3 subsequent siblings)
4 siblings, 0 replies; 11+ messages in thread
From: Andrey Nikitin @ 2021-09-24 21:08 UTC (permalink / raw)
To: linux-nvme; +Cc: benh, davebuch, Andrey Nikitin
To determine the number of required IO queues it may be needed to
send admin commands. To facilitate that process admin queue is split
from the array of IO queues and allocation of IO queues is moved to
reset routine.
Signed-off-by: Andrey Nikitin <nikitina@amazon.com>
---
drivers/nvme/host/pci.c | 162 +++++++++++++++++++++++-----------------
1 file changed, 93 insertions(+), 69 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b82492cd7503..37de7ac79b3f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -108,12 +108,47 @@ struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
+/*
+ * An NVM Express queue. Each device has at least two (one for admin
+ * commands and one for I/O commands).
+ */
+struct nvme_queue {
+ struct nvme_dev *dev;
+ spinlock_t sq_lock;
+ void *sq_cmds;
+ /* only used for poll queues: */
+ spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
+ struct nvme_completion *cqes;
+ dma_addr_t sq_dma_addr;
+ dma_addr_t cq_dma_addr;
+ u32 __iomem *q_db;
+ u32 q_depth;
+ u16 cq_vector;
+ u16 sq_tail;
+ u16 last_sq_tail;
+ u16 cq_head;
+ u16 qid;
+ u8 cq_phase;
+ u8 sqes;
+ unsigned long flags;
+#define NVMEQ_ENABLED 0
+#define NVMEQ_SQ_CMB 1
+#define NVMEQ_DELETE_ERROR 2
+#define NVMEQ_POLLED 3
+ u32 *dbbuf_sq_db;
+ u32 *dbbuf_cq_db;
+ u32 *dbbuf_sq_ei;
+ u32 *dbbuf_cq_ei;
+ struct completion delete_done;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
struct nvme_dev {
struct nvme_queue *queues;
struct blk_mq_tag_set tagset;
+ struct nvme_queue admin_queue;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct device *dev;
@@ -181,40 +216,6 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
return container_of(ctrl, struct nvme_dev, ctrl);
}
-/*
- * An NVM Express queue. Each device has at least two (one for admin
- * commands and one for I/O commands).
- */
-struct nvme_queue {
- struct nvme_dev *dev;
- spinlock_t sq_lock;
- void *sq_cmds;
- /* only used for poll queues: */
- spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
- struct nvme_completion *cqes;
- dma_addr_t sq_dma_addr;
- dma_addr_t cq_dma_addr;
- u32 __iomem *q_db;
- u32 q_depth;
- u16 cq_vector;
- u16 sq_tail;
- u16 last_sq_tail;
- u16 cq_head;
- u16 qid;
- u8 cq_phase;
- u8 sqes;
- unsigned long flags;
-#define NVMEQ_ENABLED 0
-#define NVMEQ_SQ_CMB 1
-#define NVMEQ_DELETE_ERROR 2
-#define NVMEQ_POLLED 3
- u32 *dbbuf_sq_db;
- u32 *dbbuf_cq_db;
- u32 *dbbuf_sq_ei;
- u32 *dbbuf_cq_ei;
- struct completion delete_done;
-};
-
/*
* The nvme_iod describes the data in an I/O.
*
@@ -235,9 +236,14 @@ struct nvme_iod {
struct scatterlist *sg;
};
+static struct nvme_queue *nvme_get_queue(struct nvme_dev *dev, unsigned int qid)
+{
+ return qid == 0 ? &dev->admin_queue : &dev->queues[qid - 1];
+}
+
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
{
- return dev->nr_allocated_queues * 8 * dev->db_stride;
+ return dev->ctrl.queue_count * 8 * dev->db_stride;
}
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
@@ -322,7 +328,7 @@ static void nvme_dbbuf_set(struct nvme_dev *dev)
nvme_dbbuf_dma_free(dev);
for (i = 1; i <= dev->online_queues; i++)
- nvme_dbbuf_free(&dev->queues[i]);
+ nvme_dbbuf_free(nvme_get_queue(dev, i));
}
}
@@ -396,7 +402,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = &dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->admin_queue;
WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@ -409,19 +415,34 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
+ struct nvme_queue *nvmeq = &dev->queues[hctx_idx];
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
hctx->driver_data = nvmeq;
return 0;
}
+static int nvme_admin_init_request(struct blk_mq_tag_set *set, struct request *req,
+ unsigned int hctx_idx, unsigned int numa_node)
+{
+ struct nvme_dev *dev = set->driver_data;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = &dev->admin_queue;
+
+ BUG_ON(!nvmeq);
+ iod->nvmeq = nvmeq;
+
+ nvme_req(req)->ctrl = &dev->ctrl;
+ nvme_req(req)->cmd = &iod->cmd;
+ return 0;
+}
+
static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
unsigned int hctx_idx, unsigned int numa_node)
{
struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
+ int queue_idx = hctx_idx;
struct nvme_queue *nvmeq = &dev->queues[queue_idx];
BUG_ON(!nvmeq);
@@ -1109,7 +1130,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx)
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
- struct nvme_queue *nvmeq = &dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->admin_queue;
struct nvme_command c = { };
c.common.opcode = nvme_admin_async_event;
@@ -1377,7 +1398,7 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
dev->ctrl.queue_count--;
- nvme_free_queue(&dev->queues[i]);
+ nvme_free_queue(nvme_get_queue(dev, i));
}
}
@@ -1406,12 +1427,12 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev)
int i;
for (i = dev->ctrl.queue_count - 1; i > 0; i--)
- nvme_suspend_queue(&dev->queues[i]);
+ nvme_suspend_queue(nvme_get_queue(dev, i));
}
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
- struct nvme_queue *nvmeq = &dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->admin_queue;
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
@@ -1432,9 +1453,11 @@ static void nvme_reap_pending_cqes(struct nvme_dev *dev)
int i;
for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
- spin_lock(&dev->queues[i].cq_poll_lock);
- nvme_process_cq(&dev->queues[i]);
- spin_unlock(&dev->queues[i].cq_poll_lock);
+ struct nvme_queue *nvmeq = nvme_get_queue(dev, i);
+
+ spin_lock(&nvmeq->cq_poll_lock);
+ nvme_process_cq(nvmeq);
+ spin_unlock(&nvmeq->cq_poll_lock);
}
}
@@ -1491,7 +1514,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
- struct nvme_queue *nvmeq = &dev->queues[qid];
+ struct nvme_queue *nvmeq = nvme_get_queue(dev, qid);
if (dev->ctrl.queue_count > qid)
return 0;
@@ -1631,7 +1654,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
.init_hctx = nvme_admin_init_hctx,
- .init_request = nvme_init_request,
+ .init_request = nvme_admin_init_request,
.timeout = nvme_timeout,
};
@@ -1746,7 +1769,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
dev->ctrl.numa_node = dev_to_node(dev->dev);
- nvmeq = &dev->queues[0];
+ nvmeq = &dev->admin_queue;
aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16;
@@ -1793,7 +1816,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
for (i = dev->online_queues; i <= max; i++) {
bool polled = i > rw_queues;
- ret = nvme_create_queue(&dev->queues[i], i, polled);
+ ret = nvme_create_queue(nvme_get_queue(dev, i), i, polled);
if (ret)
break;
}
@@ -2245,7 +2268,7 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
- struct nvme_queue *adminq = &dev->queues[0];
+ struct nvme_queue *adminq = &dev->admin_queue;
struct pci_dev *pdev = to_pci_dev(dev->dev);
unsigned int nr_io_queues;
unsigned long size;
@@ -2258,7 +2281,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
- nr_io_queues = dev->nr_allocated_queues - 1;
+ nr_io_queues = nvme_max_io_queues(dev);
+ if (nr_io_queues != dev->nr_allocated_queues) {
+ dev->nr_allocated_queues = nr_io_queues;
+
+ kfree(dev->queues);
+ dev->queues = kcalloc_node(dev->nr_allocated_queues,
+ sizeof(struct nvme_queue), GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->queues)
+ return -ENOMEM;
+ }
+
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
@@ -2404,13 +2438,13 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
retry:
timeout = NVME_ADMIN_TIMEOUT;
while (nr_queues > 0) {
- if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
+ if (nvme_delete_queue(nvme_get_queue(dev, nr_queues), opcode))
break;
nr_queues--;
sent++;
}
while (sent) {
- struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
+ struct nvme_queue *nvmeq = nvme_get_queue(dev, nr_queues + sent);
timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
timeout);
@@ -2606,7 +2640,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_disable_admin_queue(dev, shutdown);
}
nvme_suspend_io_queues(dev);
- nvme_suspend_queue(&dev->queues[0]);
+ nvme_suspend_queue(&dev->admin_queue);
nvme_pci_disable(dev);
nvme_reap_pending_cqes(dev);
@@ -2779,6 +2813,10 @@ static void nvme_reset_work(struct work_struct *work)
dev->ctrl.opal_dev = NULL;
}
+ result = nvme_setup_io_queues(dev);
+ if (result)
+ goto out;
+
if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
result = nvme_dbbuf_dma_alloc(dev);
if (result)
@@ -2792,10 +2830,6 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
- result = nvme_setup_io_queues(dev);
- if (result)
- goto out;
-
/*
* Keep the controller around but remove all namespaces if we don't have
* any working I/O queue.
@@ -2970,14 +3004,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (!dev)
return -ENOMEM;
- dev->nr_write_queues = write_queues;
- dev->nr_poll_queues = poll_queues;
- dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
- dev->queues = kcalloc_node(dev->nr_allocated_queues,
- sizeof(struct nvme_queue), GFP_KERNEL, node);
- if (!dev->queues)
- goto free;
-
dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
@@ -3041,8 +3067,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
nvme_dev_unmap(dev);
put_pci:
put_device(dev->dev);
- free:
- kfree(dev->queues);
kfree(dev);
return result;
}
--
2.32.0
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [RFC PATCH 3/3] nvme: implement SQ associations
2021-09-24 21:08 [RFC PATCH 0/3] nvme sq associations Andrey Nikitin
2021-09-24 21:08 ` [RFC PATCH 1/3] nvme: split admin queue in pci Andrey Nikitin
2021-09-24 21:08 ` [RFC PATCH 2/3] nvme: add NVM set structures Andrey Nikitin
@ 2021-09-24 21:08 ` Andrey Nikitin
2021-09-25 3:02 ` [RFC PATCH 0/3] nvme sq associations Keith Busch
2021-09-29 6:07 ` Chaitanya Kulkarni
4 siblings, 0 replies; 11+ messages in thread
From: Andrey Nikitin @ 2021-09-24 21:08 UTC (permalink / raw)
To: linux-nvme; +Cc: benh, davebuch, Andrey Nikitin
This patch implements SQ association feature for PCI host.
If the controller indicates support for NVM sets and SQ associations
and has a sufficient number of queues supported then each NVM set
gets its own blk-mq tagset and queues are distributed uniformly
across NVM sets.
If the number of mxi-x vectors supported by a controller is less
than the total number of queues for all NVM sets the vectors will
be shared by queues associated with different NVM sets, they’ll
be assigned in a round-robin fashion.
Signed-off-by: Andrey Nikitin <nikitina@amazon.com>
---
drivers/nvme/host/core.c | 17 ++-
drivers/nvme/host/fc.c | 1 +
drivers/nvme/host/nvme.h | 2 +
drivers/nvme/host/pci.c | 215 ++++++++++++++++++++++++++++---------
drivers/nvme/host/rdma.c | 1 +
drivers/nvme/host/tcp.c | 1 +
drivers/nvme/target/loop.c | 1 +
7 files changed, 182 insertions(+), 56 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 63d6a2162e72..6aa6163ec407 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -412,10 +412,16 @@ EXPORT_SYMBOL_GPL(nvme_cancel_request);
void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
{
- if (ctrl->tagset) {
- blk_mq_tagset_busy_iter(ctrl->tagset,
+ int i;
+
+ if (!ctrl->tagset) {
+ return;
+ }
+
+ for (i = 0; i < ctrl->nr_tagsets; i++) {
+ blk_mq_tagset_busy_iter(&ctrl->tagset[i],
nvme_cancel_request, ctrl);
- blk_mq_tagset_wait_completed_request(ctrl->tagset);
+ blk_mq_tagset_wait_completed_request(&ctrl->tagset[i]);
}
}
EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
@@ -3716,6 +3722,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct gendisk *disk;
struct nvme_id_ns *id;
int node = ctrl->numa_node;
+ int tagset_idx;
if (nvme_identify_ns(ctrl, nsid, ids, &id))
return;
@@ -3724,7 +3731,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (!ns)
goto out_free_id;
- disk = blk_mq_alloc_disk(ctrl->tagset, ns);
+ tagset_idx = id->nvmsetid && id->nvmsetid <= ctrl->nr_tagsets ?
+ id->nvmsetid - 1 : 0;
+ disk = blk_mq_alloc_disk(&ctrl->tagset[tagset_idx], ns);
if (IS_ERR(disk))
goto out_free_ns;
disk->fops = &nvme_bdev_ops;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b08a61ca283f..e3eaee68d436 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2889,6 +2889,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
return ret;
ctrl->ctrl.tagset = &ctrl->tag_set;
+ ctrl->ctrl.nr_tagsets = 1;
ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
if (IS_ERR(ctrl->ctrl.connect_q)) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 03be6c913216..ceb3a2c4f99b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -235,6 +235,8 @@ struct nvme_ctrl {
struct device *dev;
int instance;
int numa_node;
+ int nr_tagsets;
+ int nr_tagset_queues;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 37de7ac79b3f..0d6f33399a26 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -114,6 +114,8 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
*/
struct nvme_queue {
struct nvme_dev *dev;
+ int tagset_idx;
+ int hctx_idx;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
@@ -147,7 +149,8 @@ struct nvme_queue {
*/
struct nvme_dev {
struct nvme_queue *queues;
- struct blk_mq_tag_set tagset;
+ struct blk_mq_tag_set *tagset;
+ unsigned int tagset_hw_queues;
struct nvme_queue admin_queue;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
@@ -368,6 +371,13 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
return true;
}
+static bool nvme_check_qassoc(struct nvme_dev *dev, unsigned int nr_io_queues)
+{
+ return nvme_is_qassoc_supported(&dev->ctrl) &&
+ !dev->nr_write_queues && !dev->nr_poll_queues &&
+ nr_io_queues >= dev->ctrl.nsetidmax;
+}
+
/*
* Will slightly overestimate the number of pages needed. This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
@@ -415,9 +425,23 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = &dev->queues[hctx_idx];
+ struct nvme_queue *nvmeq = NULL;
+ int i;
- WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++)
+ if (dev->tagset[i].tags[hctx_idx] == hctx->tags) {
+ nvmeq = &dev->queues[i * dev->tagset_hw_queues + hctx_idx];
+ break;
+ }
+
+ if (!nvmeq) {
+ dev_warn(dev->ctrl.device,
+ "Device queue not found for hctx %d\n", hctx_idx);
+ return -EINVAL;
+ }
+
+ WARN_ON(dev->tagset[nvmeq->tagset_idx].tags[hctx_idx] != hctx->tags);
+ nvmeq->hctx_idx = hctx_idx;
hctx->driver_data = nvmeq;
return 0;
}
@@ -442,8 +466,18 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
{
struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int queue_idx = hctx_idx;
- struct nvme_queue *nvmeq = &dev->queues[queue_idx];
+ int tagset_idx, queue_idx;
+ struct nvme_queue *nvmeq;
+
+ if (set < dev->tagset || set >= &dev->tagset[dev->ctrl.nr_tagsets]) {
+ dev_warn(dev->ctrl.device,
+ "Invalid tagset for request init %p\n", set);
+ return -EINVAL;
+ }
+
+ tagset_idx = set - dev->tagset;
+ queue_idx = tagset_idx * dev->tagset_hw_queues + hctx_idx;
+ nvmeq = &dev->queues[queue_idx];
BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
@@ -471,7 +505,8 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
for (i = 0, qoff = 0; i < set->nr_maps; i++) {
struct blk_mq_queue_map *map = &set->map[i];
- map->nr_queues = dev->io_queues[i];
+ map->nr_queues = dev->ctrl.nr_tagsets > 1 && i == 0 ?
+ dev->tagset_hw_queues : dev->io_queues[i];
if (!map->nr_queues) {
BUG_ON(i == HCTX_TYPE_DEFAULT);
continue;
@@ -1013,7 +1048,7 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
if (!nvmeq->qid)
return nvmeq->dev->admin_tagset.tags[0];
- return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
+ return nvmeq->dev->tagset[nvmeq->tagset_idx].tags[nvmeq->hctx_idx];
}
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
@@ -1172,7 +1207,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
}
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
- struct nvme_queue *nvmeq)
+ struct nvme_queue *nvmeq, u16 qassoc)
{
struct nvme_ctrl *ctrl = &dev->ctrl;
struct nvme_command c = { };
@@ -1196,6 +1231,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_sq.sq_flags = cpu_to_le16(flags);
c.create_sq.cqid = cpu_to_le16(qid);
+ c.create_sq.nvmsetid = cpu_to_le16(qassoc);
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
@@ -1598,7 +1634,8 @@ static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
return 0;
}
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid,
+ bool polled, int qassoc)
{
struct nvme_dev *dev = nvmeq->dev;
int result;
@@ -1607,11 +1644,12 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
/*
- * A queue's vector matches the queue identifier unless the controller
- * has only one vector available.
+ * A queue's vector matches the queue identifier (modulo number of
+ * vectors) unless the controller has only one vector available.
*/
if (!polled)
- vector = dev->num_vecs == 1 ? 0 : qid;
+ vector = dev->num_vecs == 1 ? 0 :
+ ((qid - 1) % (dev->num_vecs - 1)) + 1;
else
set_bit(NVMEQ_POLLED, &nvmeq->flags);
@@ -1619,13 +1657,14 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
if (result)
return result;
- result = adapter_alloc_sq(dev, qid, nvmeq);
+ result = adapter_alloc_sq(dev, qid, nvmeq, qassoc);
if (result < 0)
return result;
if (result)
goto release_cq;
nvmeq->cq_vector = vector;
+ nvmeq->tagset_idx = qassoc ? qassoc - 1 : 0;
result = nvme_setup_io_queues_trylock(dev);
if (result)
@@ -1796,7 +1835,8 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
static int nvme_create_io_queues(struct nvme_dev *dev)
{
unsigned i, max, rw_queues;
- int ret = 0;
+ int ret = 0i, queues_per_set;
+ bool qassoc_en;
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
if (nvme_alloc_queue(dev, i, dev->q_depth)) {
@@ -1813,10 +1853,14 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
rw_queues = max;
}
+ qassoc_en = nvme_check_qassoc(dev, max);
+ queues_per_set = qassoc_en ? max / dev->ctrl.nsetidmax : 0;
+
for (i = dev->online_queues; i <= max; i++) {
bool polled = i > rw_queues;
+ int qassoc = qassoc_en ? ((i - 1) / queues_per_set) + 1 : 0;
- ret = nvme_create_queue(nvme_get_queue(dev, i), i, polled);
+ ret = nvme_create_queue(nvme_get_queue(dev, i), i, polled, qassoc);
if (ret)
break;
}
@@ -2243,8 +2287,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
* vector.
*/
irq_queues = 1;
- if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
+ if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) {
irq_queues += (nr_io_queues - poll_queues);
+ }
+
return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
@@ -2255,6 +2301,11 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}
+static unsigned int nvme_default_io_queues(struct nvme_dev *dev)
+{
+ return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+}
+
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
/*
@@ -2263,7 +2314,13 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
return 1;
- return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+
+ if (nvme_check_qassoc(dev, NVME_MAX_QID)) {
+ return min(num_possible_cpus() * dev->ctrl.nsetidmax,
+ (unsigned int)NVME_MAX_QID);
+ }
+
+ return nvme_default_io_queues(dev);
}
static int nvme_setup_io_queues(struct nvme_dev *dev)
@@ -2282,6 +2339,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
dev->nr_poll_queues = poll_queues;
nr_io_queues = nvme_max_io_queues(dev);
+
+ result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
+ if (result < 0)
+ return result;
+
+ nr_io_queues = nvme_check_qassoc(dev, nr_io_queues) ?
+ rounddown(nr_io_queues, dev->ctrl.nsetidmax) :
+ min(nr_io_queues, nvme_default_io_queues(dev));
+
if (nr_io_queues != dev->nr_allocated_queues) {
dev->nr_allocated_queues = nr_io_queues;
@@ -2293,10 +2359,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
return -ENOMEM;
}
- result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
- if (result < 0)
- return result;
-
if (nr_io_queues == 0)
return 0;
@@ -2353,7 +2415,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
dev->num_vecs = result;
result = max(result - 1, 1);
- dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
+ dev->max_qid = nvme_check_qassoc(dev, nr_io_queues) ?
+ min((int)nr_io_queues, result * dev->ctrl.nsetidmax) :
+ result + dev->io_queues[HCTX_TYPE_POLL];
/*
* Should investigate if there's a performance win from allocating
@@ -2460,39 +2524,74 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
static void nvme_dev_add(struct nvme_dev *dev)
{
- int ret;
+ int ret, i;
- if (!dev->ctrl.tagset) {
- dev->tagset.ops = &nvme_mq_ops;
- dev->tagset.nr_hw_queues = dev->online_queues - 1;
- dev->tagset.nr_maps = 2; /* default + read */
- if (dev->io_queues[HCTX_TYPE_POLL])
- dev->tagset.nr_maps++;
- dev->tagset.timeout = NVME_IO_TIMEOUT;
- dev->tagset.numa_node = dev->ctrl.numa_node;
- dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
- BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = sizeof(struct nvme_iod);
- dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
- dev->tagset.driver_data = dev;
+ dev->tagset_hw_queues = dev->online_queues - 1;
- /*
- * Some Apple controllers requires tags to be unique
- * across admin and IO queue, so reserve the first 32
- * tags of the IO queue.
- */
- if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
- dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+ if (nvme_check_qassoc(dev, dev->tagset_hw_queues)) {
+ if (dev->ctrl.nsetidmax != dev->ctrl.nr_tagsets) {
+ kfree(dev->tagset);
+ dev->ctrl.tagset = NULL;
+ dev->ctrl.nr_tagsets = dev->ctrl.nsetidmax;
+ }
+
+ dev->tagset_hw_queues /= dev->ctrl.nsetidmax;
+ } else if (dev->ctrl.nr_tagsets != 1) {
+ kfree(dev->tagset);
+ dev->ctrl.tagset = NULL;
+ dev->ctrl.nr_tagsets = 1;
+ }
- ret = blk_mq_alloc_tag_set(&dev->tagset);
- if (ret) {
+ if (!dev->ctrl.tagset) {
+ dev->tagset = kcalloc_node(dev->ctrl.nr_tagsets,
+ sizeof(struct blk_mq_tag_set), GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->tagset) {
dev_warn(dev->ctrl.device,
- "IO queues tagset allocation failed %d\n", ret);
+ "Device %d tagsets allocation failed\n",
+ dev->ctrl.nr_tagsets);
+ dev->ctrl.nr_tagsets = 0;
return;
}
- dev->ctrl.tagset = &dev->tagset;
+
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++) {
+ struct blk_mq_tag_set *tagset = &dev->tagset[i];
+
+ tagset->ops = &nvme_mq_ops;
+ tagset->nr_hw_queues = dev->tagset_hw_queues;
+ tagset->nr_maps = 2; /* default + read */
+ if (dev->io_queues[HCTX_TYPE_POLL])
+ tagset->nr_maps++;
+ tagset->timeout = NVME_IO_TIMEOUT;
+ tagset->numa_node = dev->ctrl.numa_node;
+ tagset->queue_depth = min_t(unsigned int, dev->q_depth,
+ BLK_MQ_MAX_DEPTH) - 1;
+ tagset->cmd_size = sizeof(struct nvme_iod);
+ tagset->flags = BLK_MQ_F_SHOULD_MERGE;
+ tagset->driver_data = dev;
+
+ /*
+ * Some Apple controllers requires tags to be unique
+ * across admin and IO queue, so reserve the first 32
+ * tags of the IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ tagset->reserved_tags = NVME_AQ_DEPTH;
+
+ ret = blk_mq_alloc_tag_set(tagset);
+ if (ret) {
+ dev_warn(dev->ctrl.device,
+ "IO queues tagset allocation failed %d\n",
+ ret);
+ return;
+ }
+ }
+
+ dev->ctrl.tagset = dev->tagset;
} else {
- blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++)
+ blk_mq_update_nr_hw_queues(&dev->tagset[i],
+ dev->tagset_hw_queues);
/* Free previously allocated queues that are no longer usable */
nvme_free_queues(dev, dev->online_queues);
@@ -2612,6 +2711,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
bool dead = true, freeze = false;
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ int i;
mutex_lock(&dev->shutdown_lock);
if (pci_is_enabled(pdev)) {
@@ -2644,9 +2744,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_pci_disable(dev);
nvme_reap_pending_cqes(dev);
- blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++) {
+ blk_mq_tagset_busy_iter(&dev->tagset[i], nvme_cancel_request,
+ &dev->ctrl);
+ blk_mq_tagset_wait_completed_request(&dev->tagset[i]);
+ }
+
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
- blk_mq_tagset_wait_completed_request(&dev->tagset);
blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
/*
@@ -2696,9 +2800,15 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
static void nvme_free_tagset(struct nvme_dev *dev)
{
- if (dev->tagset.tags)
- blk_mq_free_tag_set(&dev->tagset);
+ int i;
+
+ for (i = 0; i < dev->ctrl.nr_tagsets; i++) {
+ if (dev->tagset[i].tags)
+ blk_mq_free_tag_set(&dev->tagset[i]);
+ }
+
dev->ctrl.tagset = NULL;
+ dev->ctrl.nr_tagsets = 0;
}
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
@@ -2713,6 +2823,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
mempool_destroy(dev->iod_mempool);
put_device(dev->dev);
kfree(dev->queues);
+ kfree(dev->tagset);
kfree(dev);
}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index a68704e39084..47d29512d96b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -977,6 +977,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
goto out_free_io_queues;
}
+ ctrl->ctrl.nr_tagsets = 1;
ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
if (IS_ERR(ctrl->ctrl.connect_q)) {
ret = PTR_ERR(ctrl->ctrl.connect_q);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 645025620154..979164bca237 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1803,6 +1803,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
goto out_free_io_queues;
}
+ ctrl->nr_tagsets = 1;
ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0285ccc7541f..2df8aa11f46a 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -536,6 +536,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1;
ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
ctrl->ctrl.tagset = &ctrl->tag_set;
+ ctrl->ctrl.nr_tagsets = 1;
ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
if (ret)
--
2.32.0
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme
^ permalink raw reply related [flat|nested] 11+ messages in thread