* [PATCH] nvme: Add weighted-round-robin arbitration support
       [not found] <CGME20180104150334epcas1p14579a6ab7bcf1b6ce326d0bce89c91e1@epcas1p1.samsung.com>
@ 2018-01-04 15:02 ` Kanchan Joshi
  2018-01-04 17:38   ` Keith Busch
  2018-01-14  9:56   ` Sagi Grimberg
  0 siblings, 2 replies; 7+ messages in thread
From: Kanchan Joshi @ 2018-01-04 15:02 UTC (permalink / raw)


This patch enables support for Weighted-Round-Robin (WRR) arbitration, so
that applications can make use of the prioritization capabilities natively
present in NVMe controllers.

- It links the existing ionice classes (real-time, best-effort, none, idle)
to NVMe priorities (urgent, high, medium, low).  This is done through the
'request->ioprio' field inside the 'queue_rq' function (a userspace usage
sketch follows this list).

- The current driver has a 1:1 mapping (1 SQ, 1 CQ) per cpu, encapsulated in
the 'nvmeq' structure.  This patch refactors the code so that an N:1 mapping
per cpu can be created; 'nvmeq' now contains a variable number of SQ-related
fields.  For WRR, 4 submission queues (one per queue priority) need to be
created on each cpu.

- When the 'enable_wrr' module param is passed, the driver creates a 4:1
mapping and enables the controller in WRR mode.  Otherwise, it retains the
1:1 mapping and the controller remains in RR mode.

- The NVMe device may have fewer queues than required for a 4:1 mapping per
cpu.  For example, when num_possible_cpus is 64, 256 submission queues are
required for the 4:1 mapping, while the device may support, say, only 128.
This case is handled by creating 32 queue groups (4 SQs and 1 CQ each) which
are shared among the 64 cpus.  Another way to handle this could have been
reducing to a 3:1 or 2:1 mapping (and remapping the 4 ionice classes
accordingly).

- The admin queue retains the 1:1 mapping irrespective of the mode (RR or
WRR) used.
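
For reference, an application opts into a priority class through ionice(1)
or the ioprio_set() syscall.  Below is a minimal, hypothetical userspace
sketch (not part of this patch); the constants mirror include/linux/ioprio.h,
since glibc does not wrap the syscall:

/*
 * Tag the calling process with the ionice realtime class, so that its I/O
 * is submitted on the SQ created with NVME_SQ_PRIO_URGENT by this patch.
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_RT		1
#define IOPRIO_WHO_PROCESS	1
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))

int main(void)
{
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0)) < 0) {
		perror("ioprio_set");
		return 1;
	}
	/* subsequent I/O from this process maps to the urgent SQ */
	return 0;
}

Running a whole command under a class works the same way, e.g.
'ionice -c 1 <cmd>' for realtime or 'ionice -c 3 <cmd>' for idle.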

I had earlier collected results on the 4.10 kernel, which show I/O being
distributed according to the applied weights.  Please refer to section 5 of
this paper -
http://www.usenix.org/system/files/conference/hotstorage17/hotstorage17-paper-joshi.pdf
I see similar results on the current kernel as well.

Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
---
 drivers/nvme/host/core.c |   4 +-
 drivers/nvme/host/pci.c  | 310 +++++++++++++++++++++++++++++++----------------
 include/linux/nvme.h     |   1 +
 3 files changed, 210 insertions(+), 105 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1e46e60..6920bdf 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1649,9 +1649,9 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
 
 	ctrl->page_size = 1 << page_shift;
 
-	ctrl->ctrl_config = NVME_CC_CSS_NVM;
+	ctrl->ctrl_config |= NVME_CC_CSS_NVM;
 	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
-	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
+	ctrl->ctrl_config |= NVME_CC_SHN_NONE;
 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
 	ctrl->ctrl_config |= NVME_CC_ENABLE;
 
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f5800c3..5f99ee5e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -37,6 +37,13 @@
 
 #define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
 
+#define SQ_PER_CORE_RR		1
+#define SQ_PER_CORE_WRR		4
+
+static bool enable_wrr = false;
+module_param(enable_wrr, bool, 0644);
+MODULE_PARM_DESC(enable_wrr, "enable wrr arbitration among I/O SQes");
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -111,6 +118,9 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+
+	/* 1 for RR, 4 for WRR */
+	u8 sq_per_core;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -147,24 +157,31 @@ struct nvme_queue {
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
 	spinlock_t q_lock;
-	struct nvme_command *sq_cmds;
-	struct nvme_command __iomem *sq_cmds_io;
 	volatile struct nvme_completion *cqes;
 	struct blk_mq_tags **tags;
-	dma_addr_t sq_dma_addr;
 	dma_addr_t cq_dma_addr;
-	u32 __iomem *q_db;
+	u32 __iomem *cq_db;
 	u16 q_depth;
 	s16 cq_vector;
-	u16 sq_tail;
 	u16 cq_head;
-	u16 qid;
+	u16 cq_id;
 	u8 cq_phase;
 	u8 cqe_seen;
-	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
-	u32 *dbbuf_sq_ei;
 	u32 *dbbuf_cq_ei;
+	/* sq related fields start here */
+	u8 nr_sq;
+	struct sq_data {
+		struct nvme_command *sq_cmds;
+		struct nvme_command __iomem *sq_cmds_io;
+		dma_addr_t sq_dma_addr;
+		u32 __iomem *sq_db;
+		u16 id;
+		u16 sq_tail;
+		u32 *dbbuf_sq_db;
+		u32 *dbbuf_sq_ei;
+	} sq[];
+
 };
 
 /*
@@ -181,6 +198,7 @@ struct nvme_iod {
 	int npages;		/* In the PRP list. 0 means small pool in use */
 	int nents;		/* Used in scatterlist */
 	int length;		/* Of data, in bytes */
+	int sq_indx;
 	dma_addr_t first_dma;
 	struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
 	struct scatterlist *sg;
@@ -207,14 +225,14 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
 }
 
-static inline unsigned int nvme_dbbuf_size(u32 stride)
+static inline unsigned int nvme_dbbuf_size(u32 stride, u8 sq_per_core)
 {
-	return ((num_possible_cpus() + 1) * 8 * stride);
+	return ((sq_per_core * num_possible_cpus() + 1) * 8 * stride);
 }
 
 static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
 {
-	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride, dev->sq_per_core);
 
 	if (dev->dbbuf_dbs)
 		return 0;
@@ -239,7 +257,7 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
 
 static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
 {
-	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride, dev->sq_per_core);
 
 	if (dev->dbbuf_dbs) {
 		dma_free_coherent(dev->dev, mem_size,
@@ -256,13 +274,17 @@ static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
 static void nvme_dbbuf_init(struct nvme_dev *dev,
 			    struct nvme_queue *nvmeq, int qid)
 {
+	int i;
 	if (!dev->dbbuf_dbs || !qid)
 		return;
-
-	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
-	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
-	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
-	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
+	for (i = 0; i < nvmeq->nr_sq; i++) {
+		nvmeq->sq[i].dbbuf_sq_db =
+			&dev->dbbuf_dbs[sq_idx(nvmeq->sq[i].id, dev->db_stride)];
+		nvmeq->sq[i].dbbuf_sq_ei =
+			&dev->dbbuf_eis[sq_idx(nvmeq->sq[i].id, dev->db_stride)];
+	}
+	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(nvmeq->cq_id, dev->db_stride)];
+	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(nvmeq->cq_id, dev->db_stride)];
 }
 
 static void nvme_dbbuf_set(struct nvme_dev *dev)
@@ -425,21 +447,22 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
  * Safe to use from interrupt context
  */
 static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
-						struct nvme_command *cmd)
+						struct nvme_command *cmd,
+						int idx)
 {
-	u16 tail = nvmeq->sq_tail;
+	u16 tail = nvmeq->sq[idx].sq_tail;
 
-	if (nvmeq->sq_cmds_io)
-		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
+	if (nvmeq->sq[idx].sq_cmds_io)
+		memcpy_toio(&nvmeq->sq[idx].sq_cmds_io[tail], cmd, sizeof(*cmd));
 	else
-		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+		memcpy(&nvmeq->sq[idx].sq_cmds[tail], cmd, sizeof(*cmd));
 
 	if (++tail == nvmeq->q_depth)
 		tail = 0;
-	if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
-					      nvmeq->dbbuf_sq_ei))
-		writel(tail, nvmeq->q_db);
-	nvmeq->sq_tail = tail;
+	if (nvme_dbbuf_update_and_check_event(tail, nvmeq->sq[idx].dbbuf_sq_db,
+					      nvmeq->sq[idx].dbbuf_sq_ei))
+		writel(tail, nvmeq->sq[idx].sq_db);
+	nvmeq->sq[idx].sq_tail = tail;
 }
 
 static void **nvme_pci_iod_list(struct request *req)
@@ -448,7 +471,8 @@ static void **nvme_pci_iod_list(struct request *req)
 	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
 }
 
-static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
+static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev,
+					int sq_indx)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
 	int nseg = blk_rq_nr_phys_segments(rq);
@@ -469,6 +493,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 	iod->npages = -1;
 	iod->nents = 0;
 	iod->length = size;
+	iod->sq_indx = sq_indx;
 
 	return BLK_STS_OK;
 }
@@ -780,7 +805,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
 
 	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
 		return false;
-	if (!iod->nvmeq->qid)
+	if (!iod->nvmeq->cq_id)
 		return false;
 	if (!sgl_threshold || avg_seg_size < sgl_threshold)
 		return false;
@@ -859,6 +884,12 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 	nvme_free_iod(dev, req);
 }
 
+static inline int ioprio_to_sqindx(struct nvme_queue *nvmeq, struct request *req)
+{
+	int ioprio_class;
+	ioprio_class = req->ioprio >> IOPRIO_CLASS_SHIFT;
+	return (ioprio_class % nvmeq->nr_sq);
+}
 /*
  * NOTE: ns is NULL when called on the admin queue.
  */
@@ -871,12 +902,18 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct nvme_command cmnd;
 	blk_status_t ret;
+	int sq_indx = 0;
+	/*
+	 * no need to check iopriority for admin queue, and when in RR mode
+	 */
+	if (nvmeq->nr_sq > SQ_PER_CORE_RR)
+		sq_indx = ioprio_to_sqindx(nvmeq, req);
 
 	ret = nvme_setup_cmd(ns, req, &cmnd);
 	if (ret)
 		return ret;
 
-	ret = nvme_init_iod(req, dev);
+	ret = nvme_init_iod(req, dev, sq_indx);
 	if (ret)
 		goto out_free_cmd;
 
@@ -894,7 +931,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		spin_unlock_irq(&nvmeq->q_lock);
 		goto out_cleanup_iod;
 	}
-	__nvme_submit_cmd(nvmeq, &cmnd);
+	__nvme_submit_cmd(nvmeq, &cmnd, sq_indx);
 	nvme_process_cq(nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
 	return BLK_STS_OK;
@@ -927,7 +964,7 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
 	if (likely(nvmeq->cq_vector >= 0)) {
 		if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
 						      nvmeq->dbbuf_cq_ei))
-			writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+			writel(head, nvmeq->cq_db);
 	}
 }
 
@@ -935,7 +972,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 		struct nvme_completion *cqe)
 {
 	struct request *req;
-
 	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
 		dev_warn(nvmeq->dev->ctrl.device,
 			"invalid id %d completed on queue %d\n",
@@ -949,7 +985,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	 * aborts.  We don't even bother to allocate a struct request
 	 * for them but rather special case them here.
 	 */
-	if (unlikely(nvmeq->qid == 0 &&
+	if (unlikely(nvmeq->cq_id == 0 &&
 			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
 		nvme_complete_async_event(&nvmeq->dev->ctrl,
 				cqe->status, &cqe->result);
@@ -1054,7 +1090,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
 
 	spin_lock_irq(&nvmeq->q_lock);
-	__nvme_submit_cmd(nvmeq, &c);
+	__nvme_submit_cmd(nvmeq, &c, 0);
 	spin_unlock_irq(&nvmeq->q_lock);
 }
 
@@ -1086,28 +1122,36 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 	c.create_cq.cq_flags = cpu_to_le16(flags);
 	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
-
 	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
 }
 
-static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
+static int adapter_alloc_sq(struct nvme_dev *dev, int sq_indx,
 						struct nvme_queue *nvmeq)
 {
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG;
 
+	if (enable_wrr) {
+		/*
+		 * Note: io-prio class to nvme priority mapping
+		 * none -> medium, realtime -> urgent, best-effort -> high,
+		 * idle->low
+		 */
+		int prio[] = {NVME_SQ_PRIO_MEDIUM, NVME_SQ_PRIO_URGENT,
+			NVME_SQ_PRIO_HIGH, NVME_SQ_PRIO_LOW};
+		flags |= prio[sq_indx];
+	}
 	/*
 	 * Note: we (ab)use the fact that the prp fields survive if no data
 	 * is attached to the request.
 	 */
 	memset(&c, 0, sizeof(c));
 	c.create_sq.opcode = nvme_admin_create_sq;
-	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
-	c.create_sq.sqid = cpu_to_le16(qid);
+	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq[sq_indx].sq_dma_addr);
+	c.create_sq.sqid = cpu_to_le16(nvmeq->sq[sq_indx].id);
 	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 	c.create_sq.sq_flags = cpu_to_le16(flags);
-	c.create_sq.cqid = cpu_to_le16(qid);
-
+	c.create_sq.cqid = cpu_to_le16(nvmeq->cq_id);
 	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
 }
 
@@ -1202,7 +1246,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	if (__nvme_poll(nvmeq, req->tag)) {
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, completion polled\n",
-			 req->tag, nvmeq->qid);
+			 req->tag, nvmeq->cq_id);
 		return BLK_EH_HANDLED;
 	}
 
@@ -1215,7 +1259,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	if (dev->ctrl.state == NVME_CTRL_RESETTING) {
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
-			 req->tag, nvmeq->qid);
+			 req->tag, nvmeq->cq_id);
 		nvme_dev_disable(dev, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		return BLK_EH_HANDLED;
@@ -1226,10 +1270,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
  	 * command was already aborted once before and still hasn't been
  	 * returned to the driver, or if this is the admin queue.
 	 */
-	if (!nvmeq->qid || iod->aborted) {
+	if (!nvmeq->cq_id || iod->aborted) {
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
-			 req->tag, nvmeq->qid);
+			 req->tag, nvmeq->cq_id);
 		nvme_dev_disable(dev, false);
 		nvme_reset_ctrl(&dev->ctrl);
 
@@ -1250,11 +1294,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.abort.opcode = nvme_admin_abort_cmd;
 	cmd.abort.cid = req->tag;
-	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
+	cmd.abort.sqid = cpu_to_le16(iod->sq_indx);
 
 	dev_warn(nvmeq->dev->ctrl.device,
 		"I/O %d QID %d timeout, aborting\n",
-		 req->tag, nvmeq->qid);
+		 req->tag, nvmeq->cq_id);
 
 	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
 			BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
@@ -1277,11 +1321,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 
 static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
+	unsigned idx = 0;
 	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
 				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
-	if (nvmeq->sq_cmds)
-		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
-					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	for (idx = 0; idx < nvmeq->nr_sq; idx++) {
+		if (nvmeq->sq[idx].sq_cmds)
+			dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+					nvmeq->sq[idx].sq_cmds,
+					nvmeq->sq[idx].sq_dma_addr);
+
+
+	}
 	kfree(nvmeq);
 }
 
@@ -1315,7 +1365,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 	nvmeq->cq_vector = -1;
 	spin_unlock_irq(&nvmeq->q_lock);
 
-	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
+	if (!nvmeq->cq_id && nvmeq->dev->ctrl.admin_q)
 		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
 
 	pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq);
@@ -1367,17 +1417,18 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
 }
 
 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
-				int qid, int depth)
+				int sq_indx, int depth)
 {
+	int qid = nvmeq->sq[sq_indx].id;
 	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
 		unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
 						      dev->ctrl.page_size);
-		nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
-		nvmeq->sq_cmds_io = dev->cmb + offset;
+		nvmeq->sq[sq_indx].sq_dma_addr = dev->cmb_bus_addr + offset;
+		nvmeq->sq[sq_indx].sq_cmds_io = dev->cmb + offset;
 	} else {
-		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
-					&nvmeq->sq_dma_addr, GFP_KERNEL);
-		if (!nvmeq->sq_cmds)
+		nvmeq->sq[sq_indx].sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+					&nvmeq->sq[sq_indx].sq_dma_addr, GFP_KERNEL);
+		if (!nvmeq->sq[sq_indx].sq_cmds)
 			return -ENOMEM;
 	}
 
@@ -1385,36 +1436,51 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 }
 
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
-							int depth, int node)
+							int depth, int node,
+							int nr_sq)
 {
-	struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
+	struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq) + \
+				(nr_sq * sizeof(struct sq_data)), GFP_KERNEL,
 							node);
+	int cq_id, i;
 	if (!nvmeq)
 		return NULL;
-
 	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
 					  &nvmeq->cq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->cqes)
 		goto free_nvmeq;
 
-	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
-		goto free_cqdma;
+	nvmeq->nr_sq = nr_sq;
+	cq_id = (qid * nr_sq) - nr_sq + 1;
+	nvmeq->cq_id = cq_id;
+	nvmeq->cq_db = &dev->dbs[cq_idx(nvmeq->cq_id, dev->db_stride)];
+	for (i = 0; i < nr_sq; i++) {
+		nvmeq->sq[i].id = cq_id++;
+		if (nvme_alloc_sq_cmds(dev, nvmeq, i, depth))
+			goto free_cqdma;
+
+		nvmeq->sq[i].sq_db = &dev->dbs[sq_idx(nvmeq->sq[i].id, dev->db_stride)];
+	}
 
 	nvmeq->q_dmadev = dev->dev;
 	nvmeq->dev = dev;
 	spin_lock_init(&nvmeq->q_lock);
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
-	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 	nvmeq->q_depth = depth;
-	nvmeq->qid = qid;
 	nvmeq->cq_vector = -1;
 	dev->queues[qid] = nvmeq;
 	dev->ctrl.queue_count++;
-
 	return nvmeq;
 
  free_cqdma:
+	for (i = 0; i < nr_sq; i++) {
+		if (nvmeq->sq[i].sq_cmds) {
+			dma_free_coherent(dev->dev, SQ_SIZE(depth),
+					nvmeq->sq[i].sq_cmds,
+					nvmeq->sq[i].sq_dma_addr);
+		}
+	}
 	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
 							nvmeq->cq_dma_addr);
  free_nvmeq:
@@ -1429,22 +1495,26 @@ static int queue_request_irq(struct nvme_queue *nvmeq)
 
 	if (use_threaded_interrupts) {
 		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
-				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
+				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->cq_id);
 	} else {
 		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
-				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
+				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->cq_id);
 	}
 }
 
 static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
-
+	int i;
 	spin_lock_irq(&nvmeq->q_lock);
-	nvmeq->sq_tail = 0;
+	for (i = 0; i < nvmeq->nr_sq; i++) {
+		nvmeq->sq[i].sq_tail = 0;
+		nvmeq->sq[i].sq_db = &dev->dbs[sq_idx(nvmeq->sq[i].id,
+					dev->db_stride)];
+	}
+	nvmeq->cq_db = &dev->dbs[cq_idx(nvmeq->cq_id, dev->db_stride)];
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
-	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
 	nvme_dbbuf_init(dev, nvmeq, qid);
 	dev->online_queues++;
@@ -1454,16 +1524,16 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
-	int result;
-
+	int result, i;
 	nvmeq->cq_vector = qid - 1;
-	result = adapter_alloc_cq(dev, qid, nvmeq);
+	result = adapter_alloc_cq(dev, nvmeq->cq_id, nvmeq);
 	if (result < 0)
 		return result;
-
-	result = adapter_alloc_sq(dev, qid, nvmeq);
-	if (result < 0)
-		goto release_cq;
+	for (i = 0; i < nvmeq->nr_sq; i++) {
+		result = adapter_alloc_sq(dev, i, nvmeq);
+		if (result < 0)
+			goto release_cq;
+	}
 
 	nvme_init_queue(nvmeq, qid);
 	result = queue_request_irq(nvmeq);
@@ -1473,9 +1543,12 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	return result;
 
  release_sq:
-	adapter_delete_sq(dev, qid);
+	while (i) {
+		adapter_delete_sq(dev, nvmeq->sq[i].id);
+		--i;
+	}
  release_cq:
-	adapter_delete_cq(dev, qid);
+	adapter_delete_cq(dev, nvmeq->cq_id);
 	return result;
 }
 
@@ -1595,7 +1668,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 	nvmeq = dev->queues[0];
 	if (!nvmeq) {
 		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
-					dev_to_node(dev->dev));
+					dev_to_node(dev->dev), 1);
 		if (!nvmeq)
 			return -ENOMEM;
 	}
@@ -1604,13 +1677,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 	aqa |= aqa << 16;
 
 	writel(aqa, dev->bar + NVME_REG_AQA);
-	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
+	lo_hi_writeq(nvmeq->sq[0].sq_dma_addr, dev->bar + NVME_REG_ASQ);
 	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
 
 	result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
 	if (result)
 		return result;
-
 	nvmeq->cq_vector = 0;
 	nvme_init_queue(nvmeq, 0);
 	result = queue_request_irq(nvmeq);
@@ -1626,11 +1698,11 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 {
 	unsigned i, max;
 	int ret = 0;
-
 	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
 		/* vector == qid - 1, match nvme_create_queue */
 		if (!nvme_alloc_queue(dev, i, dev->q_depth,
-		     pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
+		     pci_irq_get_node(to_pci_dev(dev->dev), i - 1),
+				dev->sq_per_core)) {
 			ret = -ENOMEM;
 			break;
 		}
@@ -1896,19 +1968,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = dev->queues[0];
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
-	int result, nr_io_queues;
+	int result, nr_io_sqes, nr_io_cqes;
 	unsigned long size;
 
-	nr_io_queues = num_present_cpus();
-	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
+	nr_io_sqes = num_present_cpus() * dev->sq_per_core;
+	result = nvme_set_queue_count(&dev->ctrl, &nr_io_sqes);
 	if (result < 0)
 		return result;
 
-	if (nr_io_queues == 0)
+	if (nr_io_sqes == 0)
 		return 0;
-
 	if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
-		result = nvme_cmb_qdepth(dev, nr_io_queues,
+		result = nvme_cmb_qdepth(dev, nr_io_sqes,
 				sizeof(struct nvme_command));
 		if (result > 0)
 			dev->q_depth = result;
@@ -1917,14 +1988,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	}
 
 	do {
-		size = db_bar_size(dev, nr_io_queues);
+		size = db_bar_size(dev, nr_io_sqes);
 		result = nvme_remap_bar(dev, size);
 		if (!result)
 			break;
-		if (!--nr_io_queues)
+		nr_io_sqes -= dev->sq_per_core;
+		if (!nr_io_sqes)
 			return -ENOMEM;
 	} while (1);
-	adminq->q_db = dev->dbs;
+	adminq->sq[0].sq_db = dev->dbs;
+	adminq->cq_db = &dev->dbs[dev->db_stride];
+
+	nr_io_cqes = nr_io_sqes / dev->sq_per_core;
 
 	/* Deregister the admin queue's interrupt */
 	pci_free_irq(pdev, 0, adminq);
@@ -1934,11 +2009,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * setting up the full range we need.
 	 */
 	pci_free_irq_vectors(pdev);
-	nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
+	nr_io_cqes = pci_alloc_irq_vectors(pdev, 1, nr_io_cqes,
 			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
-	if (nr_io_queues <= 0)
+	if (nr_io_cqes <= 0)
 		return -EIO;
-	dev->max_qid = nr_io_queues;
+	/*
+	 * Recalculate sqes, in case nr_io_cqes reduces due to above call
+	 */
+	nr_io_sqes = nr_io_cqes * dev->sq_per_core;
+	dev->max_qid = nr_io_cqes;
 
 	/*
 	 * Should investigate if there's a performance win from allocating
@@ -1984,7 +2063,7 @@ static void nvme_del_cq_end(struct request *req, blk_status_t error)
 	nvme_del_queue_end(req, error);
 }
 
-static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
+static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode, u16 qid)
 {
 	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
 	struct request *req;
@@ -1992,7 +2071,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.delete_queue.opcode = opcode;
-	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
+	cmd.delete_queue.qid = cpu_to_le16(qid);
 
 	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
 	if (IS_ERR(req))
@@ -2009,20 +2088,34 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 
 static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
 {
-	int pass;
+	int pass, err;
 	unsigned long timeout;
 	u8 opcode = nvme_admin_delete_sq;
 
 	for (pass = 0; pass < 2; pass++) {
-		int sent = 0, i = queues;
+		int sent = 0, i = queues, j;
 
 		reinit_completion(&dev->ioq_wait);
  retry:
 		timeout = ADMIN_TIMEOUT;
-		for (; i > 0; i--, sent++)
-			if (nvme_delete_queue(dev->queues[i], opcode))
-				break;
+		if (opcode == nvme_admin_delete_cq) {
+			for (; i > 0; i--, sent++)
+				if (nvme_delete_queue(dev->queues[i], opcode,
+						dev->queues[i]->cq_id))
+					break;
+		} else {
+			for (; i > 0; i--) {
+				for (j = 0; j < dev->sq_per_core; j++) {
+					err = nvme_delete_queue(dev->queues[i],
+							opcode,
+							dev->queues[i]->sq[j].id);
+					if (err)
+						break;
+					++sent;
+				}
+			}
 
+		}
 		while (sent--) {
 			timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
 			if (timeout == 0)
@@ -2106,7 +2199,6 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 				io_queue_depth);
 	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
 	dev->dbs = dev->bar + 4096;
-
 	/*
 	 * Temporary fix for the Apple controller found in the MacBook8,1 and
 	 * some MacBook7,1 to avoid controller resets and data loss.
@@ -2306,6 +2398,18 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
+	dev->sq_per_core = SQ_PER_CORE_RR;
+	if (enable_wrr) {
+		if (NVME_CAP_WRR(dev->ctrl.cap)) {
+			dev->sq_per_core = SQ_PER_CORE_WRR;
+			dev->ctrl.ctrl_config = NVME_CC_AMS_WRRU;
+			dev_info(dev->ctrl.device,
+					"enabling wrr, %u sq per core\n",
+					dev->sq_per_core);
+		} else
+			dev_warn(dev->ctrl.device, "does not support WRR\n");
+	}
+
 	result = nvme_pci_configure_admin_queue(dev);
 	if (result)
 		goto out;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index aea87f0d..7b33a47 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -116,6 +116,7 @@ enum {
 };
 
 #define NVME_CAP_MQES(cap)	((cap) & 0xffff)
+#define NVME_CAP_WRR(cap)	(((cap) >> 17) & 0x1)
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
 #define NVME_CAP_NSSRC(cap)	(((cap) >> 36) & 0x1)
-- 
2.7.4


* [PATCH] nvme: Add weighted-round-robin arbitration support
  2018-01-04 15:02 ` [PATCH] nvme: Add weighted-round-robin arbitration support Kanchan Joshi
@ 2018-01-04 17:38   ` Keith Busch
  2018-01-09 13:50     ` Kanchan Joshi
  2018-01-14  9:56   ` Sagi Grimberg
  1 sibling, 1 reply; 7+ messages in thread
From: Keith Busch @ 2018-01-04 17:38 UTC (permalink / raw)


On Thu, Jan 04, 2018 at 08:32:09PM +0530, Kanchan Joshi wrote:
> This patch enables support for Weighted-Round-Robin (WRR) arbitration, so
> that applications can make use of the prioritization capabilities natively
> present in NVMe controller.
> 
> - It links existing io-nice classes (real-time, best-effort, none, low)
> to NVMe priorities (urgent, high, medium, low).  This is done through
> 'request->ioprio' field inside 'queue_rq' function.
> 
> - Current driver has 1:1 mapping (1 SQ, 1 CQ) per cpu, encapsulated in
> 'nvmeq' structure.  This patch refactors the code so that N:1 mapping per
> cpu can be created; 'nvmeq' has been changed to contain variable number of SQ
> related fields.  For WRR, 4 submission-queues (corresponding to each queue
> priorities) need to be created on each cpu.

You have a single tagset per CQ for up to N outstanding commands, but
allocate enough submission entries for 4 * N. And since they're sharing
tags, a lower pri task can limit a high-pri one from getting a tag. I
think we could use a little more help from the block layer for WRR.


* [PATCH] nvme: Add weighted-round-robin arbitration support
  2018-01-04 17:38   ` Keith Busch
@ 2018-01-09 13:50     ` Kanchan Joshi
  2018-01-09 17:05       ` Keith Busch
  0 siblings, 1 reply; 7+ messages in thread
From: Kanchan Joshi @ 2018-01-09 13:50 UTC (permalink / raw)


Hi Keith,

It seems to me that some sort of differentiation (among tasks of various
classes) during tag allocation is required to handle this. Perhaps a
static division of the available tags into 4 priority classes needs to be
done in the block layer.
Or is there some other facility in the block layer that you hinted at?
I would appreciate your suggestions/concerns.
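
The kind of static split I have in mind is roughly the toy sketch below.
This is illustrative only; it does not use the real blk-mq/sbitmap
interfaces, and locking and freelist handling are omitted.  Each ionice
class gets a fixed quarter of the tag space, so a flood of best-effort I/O
cannot starve realtime I/O of tags:

#define NR_PRIO_CLASSES	4

struct prio_tagset {
	unsigned int nr_tags;			/* total tags of the hctx */
	unsigned int used[NR_PRIO_CLASSES];	/* tags in flight per class */
};

/* 0 on success, -1 when this class has exhausted its share */
static int prio_tag_reserve(struct prio_tagset *ts, int prio_class)
{
	unsigned int quota = ts->nr_tags / NR_PRIO_CLASSES;

	if (ts->used[prio_class] >= quota)
		return -1;	/* caller must wait for a completion */
	ts->used[prio_class]++;
	return 0;
}

static void prio_tag_release(struct prio_tagset *ts, int prio_class)
{
	ts->used[prio_class]--;
}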

Thanks
Kanchan

On Thursday 04 January 2018 11:08 PM, Keith Busch wrote:
On Thu, Jan 04, 2018 at 08:32:09PM +0530, Kanchan Joshi wrote:
>> This patch enables support for Weighted-Round-Robin (WRR) arbitration, so
>> that applications can make use of the prioritization capabilities natively
>> present in NVMe controller.
>>
>> - It links existing io-nice classes (real-time, best-effort, none, low)
>> to NVMe priorities (urgent, high, medium, low).  This is done through
>> 'request->ioprio' field inside 'queue_rq' function.
>>
>> - Current driver has 1:1 mapping (1 SQ, 1 CQ) per cpu, encapsulated in
>> 'nvmeq' structure.  This patch refactors the code so that N:1 mapping per
>> cpu can be created; 'nvmeq' has been changed to contain variable number of SQ
>> related fields.  For WRR, 4 submission-queues (corresponding to each queue
>> priorities) need to be created on each cpu.
> 
> You have a single tagset per CQ for up to N outstanding commands, but
> allocate enough submission entries for 4 * N. And since they're sharing
> tags, a lower pri task can limit a high-pri one from getting a tag. I
> think we could use a little more help from the block layer for WRR.
> 
> 
> 


* [PATCH] nvme: Add weighted-round-robin arbitration support
  2018-01-09 13:50     ` Kanchan Joshi
@ 2018-01-09 17:05       ` Keith Busch
  2018-01-10 15:27         ` Kanchan Joshi
  0 siblings, 1 reply; 7+ messages in thread
From: Keith Busch @ 2018-01-09 17:05 UTC (permalink / raw)


On Tue, Jan 09, 2018 at 07:20:58PM +0530, Kanchan Joshi wrote:
> It seems to me that some sort of differentiation (among tasks of various
> classes) during tag allocation is required to handle this. Perhaps, static
> division of available tags into 4 priority classes needs to be done in block
> layer.

Yes, I was thinking either some kind of priority-based tag reservation, or
possibly entirely different hctx's for each priority. The block layer
currently doesn't provide this kind of separation, but it was mentioned
the latter might be appropriate for hipri/polled IO, so maybe that idea
can extend to WRR.
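
Roughly, the hctx-per-priority idea would mean exposing one hardware
context per (cpu, priority class) pair and selecting it from the request's
ioprio class.  A hypothetical sketch of the selection, not an existing
blk-mq interface:

/*
 * Pick a hardware context index from the submitting cpu and the ioprio
 * class of the request; assumes nr_prio contexts are allocated per cpu.
 */
static inline unsigned int prio_hctx_index(unsigned int cpu,
					   unsigned int ioprio_class,
					   unsigned int nr_prio)
{
	return cpu * nr_prio + (ioprio_class % nr_prio);
}

The NVMe driver would then only need to create one SQ per such hctx with
the matching NVME_SQ_PRIO_* flag.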


* [PATCH] nvme: Add weighted-round-robin arbitration support
  2018-01-09 17:05       ` Keith Busch
@ 2018-01-10 15:27         ` Kanchan Joshi
  2018-01-10 23:57           ` Keith Busch
  0 siblings, 1 reply; 7+ messages in thread
From: Kanchan Joshi @ 2018-01-10 15:27 UTC (permalink / raw)


Different hctx's sound good, and can be used for both polled IO and WRR
(somewhat more resource-heavy for WRR, though). May I know if that work
is already in progress?
It seems the NVMe driver would not have to do much in that scheme of things,
while the block layer takes most of the load.

On Tuesday 09 January 2018 10:35 PM, Keith Busch wrote:
On Tue, Jan 09, 2018 at 07:20:58PM +0530, Kanchan Joshi wrote:
>> It seems to me that some sort of differentiation (among tasks of various
>> classes) during tag allocation is required to handle this. Perhaps, static
>> division of available tags into 4 priority classes needs to be done in block
>> layer.
> 
> Yes, I was thinking either some kind of priority-based tag reservation, or
> possibly entirely different hctx's for each priority. The block layer
> currently doesn't provide this kind of separation, but it was mentioned
> the latter might be appropriate for hipri/polled IO, so maybe that idea
> can extend to WRR.
> 
> 
> 


* [PATCH] nvme: Add weighted-round-robin arbitration support
  2018-01-10 15:27         ` Kanchan Joshi
@ 2018-01-10 23:57           ` Keith Busch
  0 siblings, 0 replies; 7+ messages in thread
From: Keith Busch @ 2018-01-10 23:57 UTC (permalink / raw)


On Wed, Jan 10, 2018 at 08:57:19PM +0530, Kanchan Joshi wrote:
> Different hctx sounds good, and can be used for both polled IO and WRR
> (somewhat more resource-heavy for WRR though). May I know if that work is in
> progress already.

As far as I know, no one has developed anything in this area yet, or at
least not to the point of sharing.

> It seems NVMe driver does not have to do much in that scheme-of-things,
> while block layer takes most of the load.

Yep, that's the idea. Just need to see if anyone comes up with
something. ;)


* [PATCH] nvme: Add weighted-round-robin arbitration support
  2018-01-04 15:02 ` [PATCH] nvme: Add weighted-round-robin arbitration support Kanchan Joshi
  2018-01-04 17:38   ` Keith Busch
@ 2018-01-14  9:56   ` Sagi Grimberg
  1 sibling, 0 replies; 7+ messages in thread
From: Sagi Grimberg @ 2018-01-14  9:56 UTC (permalink / raw)


Hi Joshi,

> This patch enables support for Weighted-Round-Robin (WRR) arbitration, so
> that applications can make use of the prioritization capabilities natively
> present in NVMe controller.
> 
> - It links existing io-nice classes (real-time, best-effort, none, low)
> to NVMe priorities (urgent, high, medium, low).  This is done through
> 'request->ioprio' field inside 'queue_rq' function.
> 
> - Current driver has 1:1 mapping (1 SQ, 1 CQ) per cpu, encapsulated in
> 'nvmeq' structure.  This patch refactors the code so that N:1 mapping per
> cpu can be created; 'nvmeq' has been changed to contain variable number of SQ
> related fields.  For WRR, 4 submission-queues (corresponding to each queue
> priorities) need to be created on each cpu.
> 
> - When 'enable_wrr' module param is passed, it creates 4:1 mapping and enables
> controller in WRR mode.  Otherwise, it continues to retain 1:1 mapping and
> controller remains in RR mode.
> 
> - NVMe device may have less number of queues than required for 4:1 mapping
> per cpu.  For example, when num_possible_cpus is 64, 256 submission-queues are
> required for 4:1 mapping while device may support, say, 128.
> This case is handled by creating 32 queue-pairs which are shared among 64 cpus.
> Another way to handle this could have been reducing to 3:1 or 2:1 mapping
> (and remapping 4 ionice classes as well).
> 
> -Admin queue, contains 1:1 mapping irrespective of the mode (RR or WRR) used.

Regardless of the discussion with Keith, this patch should be divided into
three or four preparatory patches and the WRR patch.

1. keeping nvmeq->cq_db
2. changing nvme_enable_ctrl not to set ctrl_config (needs verification
    that it doesn't break anything)
3. keeping multiple sqs per nvmeq and plumbing the sq_index
4. wire up wrr

This is true also if this moves to blk-mq.

