* [PATCH 0/4] nvme patches for 6.3
@ 2023-01-05 20:28 Keith Busch
  2023-01-05 20:28 ` [PATCH 1/4] nvme-pci: remove SGL segment descriptors Keith Busch
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Keith Busch @ 2023-01-05 20:28 UTC (permalink / raw)
  To: linux-nvme, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

Just a collection of patches developed earlier that didn't make it in
time for the 6.2 merge.

Keith Busch (4):
  nvme-pci: remove SGL segment descriptors
  nvme-pci: use mapped entries for sgl decision
  nvme-pci: place descriptor addresses in iod
  nvme: always initialize known command effects

 drivers/nvme/host/core.c |  81 ++++++++++++++++---------------
 drivers/nvme/host/pci.c  | 100 ++++++++++-----------------------------
 2 files changed, 68 insertions(+), 113 deletions(-)

-- 
2.30.2




* [PATCH 1/4] nvme-pci: remove SGL segment descriptors
  2023-01-05 20:28 [PATCH 0/4] nvme patches for 6.3 Keith Busch
@ 2023-01-05 20:28 ` Keith Busch
  2023-01-09  7:07   ` Chaitanya Kulkarni
  2023-02-13 10:15   ` Sagi Grimberg
  2023-01-05 20:28 ` [PATCH 2/4] nvme-pci: use mapped entries for sgl decision Keith Busch
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 16+ messages in thread
From: Keith Busch @ 2023-01-05 20:28 UTC (permalink / raw)
  To: linux-nvme, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

The max segments this driver can see is 127, well below the 256
threshold needed to add an nvme sgl segment descriptor. Remove all the
useless checks and dead code.
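
For reference, a sketch of the arithmetic behind the 256 threshold,
assuming the driver's usual 4 KiB NVME_CTRL_PAGE_SIZE (the macro is
the one from pci.c; the comments are illustrative):

	/* one SGL descriptor is 16 bytes: 8 addr + 4 length + 3 rsvd + 1 type */
	#define SGES_PER_PAGE	(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))

	/* 4096 / 16 == 256 descriptors per page; with NVME_MAX_SEGS == 127
	 * one page always holds the whole list, so the chained segment
	 * descriptor path can never be taken. */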

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 47 +++++------------------------------------
 1 file changed, 5 insertions(+), 42 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b13baccedb4a9..693c6e5495846 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -548,22 +548,6 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
 	}
 }
 
-static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
-{
-	const int last_sg = SGES_PER_PAGE - 1;
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	dma_addr_t dma_addr = iod->first_dma;
-	int i;
-
-	for (i = 0; i < iod->nr_allocations; i++) {
-		struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
-		dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
-
-		dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
-		dma_addr = next_dma_addr;
-	}
-}
-
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -582,7 +566,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
 			      iod->first_dma);
 	else if (iod->use_sgl)
-		nvme_free_sgls(dev, req);
+		dma_pool_free(dev->prp_page_pool, nvme_pci_iod_list(req)[0],
+			      iod->first_dma);
 	else
 		nvme_free_prps(dev, req);
 	mempool_free(iod->sgt.sgl, dev->iod_mempool);
@@ -705,13 +690,8 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
 		dma_addr_t dma_addr, int entries)
 {
 	sge->addr = cpu_to_le64(dma_addr);
-	if (entries < SGES_PER_PAGE) {
-		sge->length = cpu_to_le32(entries * sizeof(*sge));
-		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
-	} else {
-		sge->length = cpu_to_le32(NVME_CTRL_PAGE_SIZE);
-		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
-	}
+	sge->length = cpu_to_le32(entries * sizeof(*sge));
+	sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
 }
 
 static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
@@ -751,30 +731,12 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 	iod->first_dma = sgl_dma;
 
 	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
-
 	do {
-		if (i == SGES_PER_PAGE) {
-			struct nvme_sgl_desc *old_sg_desc = sg_list;
-			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
-
-			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
-			if (!sg_list)
-				goto free_sgls;
-
-			i = 0;
-			nvme_pci_iod_list(req)[iod->nr_allocations++] = sg_list;
-			sg_list[i++] = *link;
-			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
-		}
-
 		nvme_pci_sgl_set_data(&sg_list[i++], sg);
 		sg = sg_next(sg);
 	} while (--entries > 0);
 
 	return BLK_STS_OK;
-free_sgls:
-	nvme_free_sgls(dev, req);
-	return BLK_STS_RESOURCE;
 }
 
 static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
@@ -3525,6 +3487,7 @@ static int __init nvme_init(void)
 	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
 	BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) >
 		     S8_MAX);
+	BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
 
 	return pci_register_driver(&nvme_driver);
 }
-- 
2.30.2




* [PATCH 2/4] nvme-pci: use mapped entries for sgl decision
  2023-01-05 20:28 [PATCH 0/4] nvme patches for 6.3 Keith Busch
  2023-01-05 20:28 ` [PATCH 1/4] nvme-pci: remove SGL segment descriptors Keith Busch
@ 2023-01-05 20:28 ` Keith Busch
  2023-01-09  7:09   ` Chaitanya Kulkarni
  2023-01-05 20:28 ` [PATCH 3/4] nvme-pci: place descriptor addresses in iod Keith Busch
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 16+ messages in thread
From: Keith Busch @ 2023-01-05 20:28 UTC (permalink / raw)
  To: linux-nvme, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

The driver uses the dma entries for setting up its command's SGL/PRP
lists. The dma mapping might have fewer entries than the physical
segments, so check the dma mapped count to determine which nvme data
layout method is more efficient.
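
To illustrate why the mapped count matters, a sketch with hypothetical
numbers (the real decision is nvme_pci_use_sgls() below; sgl_threshold
defaults to 32 KiB):

	/*
	 * An IOMMU may coalesce adjacent segments, so a 128 KiB request
	 * built from 32 physical segments can map to a single DMA entry.
	 *
	 *	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
	 *
	 * nseg == 32: avg_seg_size ==   4 KiB, below sgl_threshold -> PRPs
	 * nseg ==  1: avg_seg_size == 128 KiB, above sgl_threshold -> SGL
	 */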

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/pci.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 693c6e5495846..08ae1d69bf8c4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -515,10 +515,10 @@ static void **nvme_pci_iod_list(struct request *req)
 	return (void **)(iod->sgt.sgl + blk_rq_nr_phys_segments(req));
 }
 
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
+				     int nseg)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	int nseg = blk_rq_nr_phys_segments(req);
 	unsigned int avg_seg_size;
 
 	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
@@ -818,7 +818,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		goto out_free_sg;
 	}
 
-	iod->use_sgl = nvme_pci_use_sgls(dev, req);
+	iod->use_sgl = nvme_pci_use_sgls(dev, req, iod->sgt.nents);
 	if (iod->use_sgl)
 		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
 	else
-- 
2.30.2




* [PATCH 3/4] nvme-pci: place descriptor addresses in iod
  2023-01-05 20:28 [PATCH 0/4] nvme patches for 6.3 Keith Busch
  2023-01-05 20:28 ` [PATCH 1/4] nvme-pci: remove SGL segment descriptors Keith Busch
  2023-01-05 20:28 ` [PATCH 2/4] nvme-pci: use mapped entries for sgl decision Keith Busch
@ 2023-01-05 20:28 ` Keith Busch
  2023-02-13 10:19   ` Sagi Grimberg
  2023-01-05 20:28 ` [PATCH 4/4] nvme: always initialize known command effects Keith Busch
  2023-01-08 18:22 ` [PATCH 0/4] nvme patches for 6.3 Christoph Hellwig
  4 siblings, 1 reply; 16+ messages in thread
From: Keith Busch @ 2023-01-05 20:28 UTC (permalink / raw)
  To: linux-nvme, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

The 'struct nvme_iod' space is appended at the end of the preallocated
'struct request', and padded to the cache line size. This leaves some
free memory (in most kernel configs) up for grabs.

Instead of appending the nvme data descriptor addresses after the
scatterlist, inline these for free within struct nvme_iod. There is now
enough space in the mempool for 128 possible segments.

And without increasing the size of the preallocated requests, we can
hold up to 5 PRP descriptor elements, allowing the driver to increase
its max transfer size to 8MB.
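
For reference, the arithmetic behind the new limits (a worked example
matching nvme_pci_npages_prp(), assuming a 4 KiB NVME_CTRL_PAGE_SIZE):

	max_bytes = 8 MiB + 4 KiB worst-case offset  = 8,392,704
	nprps     = DIV_ROUND_UP(max_bytes, 4096)    = 2,049
	npages    = DIV_ROUND_UP(8 * 2049, 4096 - 8) = 5

so five PRP descriptor pages (NVME_MAX_NR_ALLOCATIONS) cover the
deepest possible chain, as the new BUILD_BUG_ON below verifies.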

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 49 +++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 31 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 08ae1d69bf8c4..b294b41a149a7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -42,8 +42,9 @@
  * These can be higher, but we need to ensure that any command doesn't
  * require an sg allocation that needs more than a page of data.
  */
-#define NVME_MAX_KB_SZ	4096
-#define NVME_MAX_SEGS	127
+#define NVME_MAX_KB_SZ	8192
+#define NVME_MAX_SEGS	128
+#define NVME_MAX_NR_ALLOCATIONS	5
 
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0444);
@@ -215,6 +216,11 @@ struct nvme_queue {
 	struct completion delete_done;
 };
 
+union nvme_descriptor {
+	struct nvme_sgl_desc	*sg_list;
+	__le64			*prp_list;
+};
+
 /*
  * The nvme_iod describes the data in an I/O.
  *
@@ -232,6 +238,7 @@ struct nvme_iod {
 	dma_addr_t first_dma;
 	dma_addr_t meta_dma;
 	struct sg_table sgt;
+	union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
 };
 
 static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -386,16 +393,6 @@ static int nvme_pci_npages_prp(void)
 	return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
 }
 
-/*
- * Calculates the number of pages needed for the SGL segments. For example a 4k
- * page can accommodate 256 SGL descriptors.
- */
-static int nvme_pci_npages_sgl(void)
-{
-	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
-			NVME_CTRL_PAGE_SIZE);
-}
-
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -509,12 +506,6 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
 	spin_unlock(&nvmeq->sq_lock);
 }
 
-static void **nvme_pci_iod_list(struct request *req)
-{
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	return (void **)(iod->sgt.sgl + blk_rq_nr_phys_segments(req));
-}
-
 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
 				     int nseg)
 {
@@ -540,7 +531,7 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
 	int i;
 
 	for (i = 0; i < iod->nr_allocations; i++) {
-		__le64 *prp_list = nvme_pci_iod_list(req)[i];
+		__le64 *prp_list = iod->list[i].prp_list;
 		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
 
 		dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
@@ -563,10 +554,10 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 	dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
 
 	if (iod->nr_allocations == 0)
-		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+		dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
 			      iod->first_dma);
 	else if (iod->use_sgl)
-		dma_pool_free(dev->prp_page_pool, nvme_pci_iod_list(req)[0],
+		dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
 			      iod->first_dma);
 	else
 		nvme_free_prps(dev, req);
@@ -598,7 +589,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 	u64 dma_addr = sg_dma_address(sg);
 	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
 	__le64 *prp_list;
-	void **list = nvme_pci_iod_list(req);
 	dma_addr_t prp_dma;
 	int nprps, i;
 
@@ -636,7 +626,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 		iod->nr_allocations = -1;
 		return BLK_STS_RESOURCE;
 	}
-	list[0] = prp_list;
+	iod->list[0].prp_list = prp_list;
 	iod->first_dma = prp_dma;
 	i = 0;
 	for (;;) {
@@ -645,7 +635,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
 			if (!prp_list)
 				goto free_prps;
-			list[iod->nr_allocations++] = prp_list;
+			iod->list[iod->nr_allocations++].prp_list = prp_list;
 			prp_list[0] = old_prp_list[i - 1];
 			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
 			i = 1;
@@ -727,7 +717,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 		return BLK_STS_RESOURCE;
 	}
 
-	nvme_pci_iod_list(req)[0] = sg_list;
+	iod->list[0].sg_list = sg_list;
 	iod->first_dma = sgl_dma;
 
 	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
@@ -2661,11 +2651,8 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
 
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
-	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
-	size_t alloc_size = sizeof(__le64 *) * npages +
-			    sizeof(struct scatterlist) * NVME_MAX_SEGS;
+	size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
 
-	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
 	dev->iod_mempool = mempool_create_node(1,
 			mempool_kmalloc, mempool_kfree,
 			(void *)alloc_size, GFP_KERNEL,
@@ -3485,9 +3472,9 @@ static int __init nvme_init(void)
 	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
 	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
-	BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) >
-		     S8_MAX);
 	BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
+	BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
+	BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
 
 	return pci_register_driver(&nvme_driver);
 }
-- 
2.30.2




* [PATCH 4/4] nvme: always initialize known command effects
  2023-01-05 20:28 [PATCH 0/4] nvme patches for 6.3 Keith Busch
                   ` (2 preceding siblings ...)
  2023-01-05 20:28 ` [PATCH 3/4] nvme-pci: place descriptor addresses in iod Keith Busch
@ 2023-01-05 20:28 ` Keith Busch
  2023-01-08 18:21   ` Christoph Hellwig
  2023-01-08 18:22 ` [PATCH 0/4] nvme patches for 6.3 Christoph Hellwig
  4 siblings, 1 reply; 16+ messages in thread
From: Keith Busch @ 2023-01-05 20:28 UTC (permalink / raw)
  To: linux-nvme, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

Instead of appending command effects flags per IO, set the known effects
flags the driver needs to react to just once during initial setup.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 81 +++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 38 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7be562a4e1aa7..6b8d2b1a926d3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1060,41 +1060,12 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 }
 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
 
-static u32 nvme_known_admin_effects(u8 opcode)
-{
-	switch (opcode) {
-	case nvme_admin_format_nvm:
-		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
-			NVME_CMD_EFFECTS_CSE_MASK;
-	case nvme_admin_sanitize_nvm:
-		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
-	default:
-		break;
-	}
-	return 0;
-}
-
-static u32 nvme_known_nvm_effects(u8 opcode)
-{
-	switch (opcode) {
-	case nvme_cmd_write:
-	case nvme_cmd_write_zeroes:
-	case nvme_cmd_write_uncor:
-		 return NVME_CMD_EFFECTS_LBCC;
-	default:
-		return 0;
-	}
-}
-
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
 {
 	u32 effects = 0;
 
 	if (ns) {
-		if (ns->head->effects)
-			effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
-		if (ns->head->ids.csi == NVME_CAP_CSS_NVM)
-			effects |= nvme_known_nvm_effects(opcode);
+		effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
 		if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
 			dev_warn_once(ctrl->device,
 				"IO command:%02x has unusual effects:%08x\n",
@@ -1107,9 +1078,7 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
 		 */
 		effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
 	} else {
-		if (ctrl->effects)
-			effects = le32_to_cpu(ctrl->effects->acs[opcode]);
-		effects |= nvme_known_admin_effects(opcode);
+		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
 	}
 
 	return effects;
@@ -3122,6 +3091,44 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
 	return ret;
 }
 
+static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
+{
+	struct nvme_effects_log	*log = ctrl->effects;
+
+	log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
+						NVME_CMD_EFFECTS_NCC |
+						NVME_CMD_EFFECTS_CSE_MASK);
+	log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
+						NVME_CMD_EFFECTS_CSE_MASK);
+
+	log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
+	log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
+	log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
+}
+
+static int nvme_init_effects(struct nvme_ctrl *ctrl, bool read_log)
+{
+	int ret = 0;
+
+	if (ctrl->effects)
+		return 0;
+
+	if (read_log) {
+		ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (!ctrl->effects) {
+		ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
+		if (!ctrl->effects)
+			return -ENOMEM;
+	}
+
+	nvme_init_known_nvm_effects(ctrl);
+	return 0;
+}
+
 static int nvme_init_identify(struct nvme_ctrl *ctrl)
 {
 	struct nvme_id_ctrl *id;
@@ -3135,11 +3142,9 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 		return -EIO;
 	}
 
-	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
-		ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
-		if (ret < 0)
-			goto out_free;
-	}
+	ret = nvme_init_effects(ctrl, id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG);
+	if (ret)
+		return ret;
 
 	if (!(ctrl->ops->flags & NVME_F_FABRICS))
 		ctrl->cntlid = le16_to_cpu(id->cntlid);
-- 
2.30.2




* Re: [PATCH 4/4] nvme: always initialize known command effects
  2023-01-05 20:28 ` [PATCH 4/4] nvme: always initialize known command effects Keith Busch
@ 2023-01-08 18:21   ` Christoph Hellwig
  0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2023-01-08 18:21 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme, hch, sagi, Keith Busch


> +static int nvme_init_effects(struct nvme_ctrl *ctrl, bool read_log)

.. this read_log argument really confused me ...

> +	ret = nvme_init_effects(ctrl, id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG);

.. until I saw this.

Any reason to not pass the nvme_id_ctrl to nvme_init_effects and
check the actual flag there?  That avoids losing the context through
that bool parameter.

The rest of this looks good to me.



* Re: [PATCH 0/4] nvme patches for 6.3
  2023-01-05 20:28 [PATCH 0/4] nvme patches for 6.3 Keith Busch
                   ` (3 preceding siblings ...)
  2023-01-05 20:28 ` [PATCH 4/4] nvme: always initialize known command effects Keith Busch
@ 2023-01-08 18:22 ` Christoph Hellwig
  2023-02-10 14:34   ` Niklas Schnelle
  4 siblings, 1 reply; 16+ messages in thread
From: Christoph Hellwig @ 2023-01-08 18:22 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme, hch, sagi, Keith Busch

Thanks,

I've applied patches 1-3 to nvme-6.3, holding off on the last one for
a bit.



* Re: [PATCH 1/4] nvme-pci: remove SGL segment descriptors
  2023-01-05 20:28 ` [PATCH 1/4] nvme-pci: remove SGL segment descriptors Keith Busch
@ 2023-01-09  7:07   ` Chaitanya Kulkarni
  2023-02-13 10:15   ` Sagi Grimberg
  1 sibling, 0 replies; 16+ messages in thread
From: Chaitanya Kulkarni @ 2023-01-09  7:07 UTC (permalink / raw)
  To: Keith Busch, linux-nvme, hch, sagi; +Cc: Keith Busch

On 1/5/23 12:28, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The max segments this driver can see is 127, well below the 256
> threshold needed to add an nvme sgl segment descriptor. Remove all the
> useless checks and dead code.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>

Thanks for this cleanup, looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



* Re: [PATCH 2/4] nvme-pci: use mapped entries for sgl decision
  2023-01-05 20:28 ` [PATCH 2/4] nvme-pci: use mapped entries for sgl decision Keith Busch
@ 2023-01-09  7:09   ` Chaitanya Kulkarni
  0 siblings, 0 replies; 16+ messages in thread
From: Chaitanya Kulkarni @ 2023-01-09  7:09 UTC (permalink / raw)
  To: Keith Busch, linux-nvme, hch, sagi; +Cc: Keith Busch

On 1/5/23 12:28, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The driver uses the dma entries for setting up its command's SGL/PRP
> lists. The dma mapping might have fewer entries than the physical
> segments, so check the dma mapped count to determine which nvme data
> layout method is more efficient.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
> ---

I vaguely remember a similar bug in this area where this actually happened.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



* Re: [PATCH 0/4] nvme patches for 6.3
  2023-01-08 18:22 ` [PATCH 0/4] nvme patches for 6.3 Christoph Hellwig
@ 2023-02-10 14:34   ` Niklas Schnelle
  2023-02-10 15:24     ` Keith Busch
  0 siblings, 1 reply; 16+ messages in thread
From: Niklas Schnelle @ 2023-02-10 14:34 UTC (permalink / raw)
  To: hch; +Cc: kbusch, kbusch, linux-nvme, sagi

Hi Christoph, Hi Keith,

It looks like this series causes crashes on s390x.
With current linux-next-20230210 and a Samsung PM173X I get the
below[0] crash. Reverting patches 1-3 makes the NVMe work again. I
tried reverting just patch 3, and patches 2 and 3, but this results in
crashes as well, and as far as I can see patches 2 and 3 depend on
each other. Not entirely sure what's going on, but patch 3 mentions
padding to the cache line size and our 256-byte cache lines are
definitely unusual. I didn't see any obvious place where this would
break things though. I did debug that, in the crashing
nvme_unmap_data(), iod->nr_allocations is -1 and iod->use_sgl is
true, which is weird since it looks to me like iod->nr_allocations
should only be -1 if sg_list couldn't be allocated from the pool.

Best regards,
Niklas

[0]  
[   10.631468] nvme nvme0: Shutdown timeout set to 10 seconds
[   10.639143] nvme nvme0: 63/0/0 default/read/poll queues
[   10.646890]  nvme0n1: p1
[   10.652923] Unable to handle kernel pointer dereference in virtual kernel address space
[   10.652927] Failing address: 0000000000000000 TEID: 0000000000000483
[   10.652929] Fault in home space mode while using kernel ASCE.
[   10.652932] AS:00000001ae5c8007 R3:000000027fffc007 S:000000027fffb800 P:000000000000003d
[   10.652950] Oops: 0004 ilc:3 [#1] SMP
[   10.652954] Modules linked in: aes_s390(E+) des_s390(E) libdes(E) sha3_512_s390(E) sha3_256_s390(E) sha512_s390(E) sha256_s390(E) nvme(E) sha1_s390(E) sha_common(E) nvme_core(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) pkey(E) zcrypt(E) rng_core(E) dm_multipath(E) autofs4(E)
[   10.652968] CPU: 3 PID: 0 Comm: swapper/3 Tainted: G            E      6.2.0-20230209.rc7.git113.20f513df926f.300.fc37.s390x+next #1
[   10.652971] Hardware name: IBM 8561 T01 703 (LPAR)
[   10.652973] Krnl PSW : 0404c00180000000 00000001aca225c6 (dma_pool_free+0x4e/0xe8)
[   10.652984]            R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[   10.652987] Krnl GPRS: 00000000fffffff4 0000000100000000 000000008397c490 0000000000000004
[   10.652990]            0000000000000000 ffffffe000000000 0000000000000000 0000000095e80400
[   10.652992]            04000001ace74d6e 0000000000000000 0000000000000000 000000008397c480
[   10.652994]            00000000803aa100 0000000000000001 000003800081bb10 000003800081bac0
[   10.653003] Krnl Code: 00000001aca225b6: ba13b010		cs	%r1,%r3,16(%r11)
                          00000001aca225ba: ec160048007e	cij	%r1,0,6,00000001aca2264a
                         #00000001aca225c0: c00400000024	brcl	0,00000001aca22608
                         >00000001aca225c6: e390a0080024	stg	%r9,8(%r10)
                          00000001aca225cc: e310b0180004	lg	%r1,24(%r11)
                          00000001aca225d2: e310a0000024	stg	%r1,0(%r10)
                          00000001aca225d8: e3a0b0180024	stg	%r10,24(%r11)
                          00000001aca225de: ebffb028007a	agsi	40(%r11),-1
[   10.653039] Call Trace:
[   10.653042]  [<00000001aca225c6>] dma_pool_free+0x4e/0xe8
[   10.653049]  [<000003ff7fcd02e8>] nvme_unmap_data+0x98/0x168 [nvme]
[   10.653054]  [<000003ff7fcd043c>] nvme_pci_complete_batch+0x84/0x100 [nvme]
[   10.653058]  [<000003ff7fcd0b34>] nvme_irq+0x64/0x80 [nvme]
[   10.653061]  [<00000001ac80b1c6>] __handle_irq_event_percpu+0x5e/0x1b8
[   10.653065]  [<00000001ac80b346>] handle_irq_event_percpu+0x26/0x70
[   10.653068]  [<00000001ac8112f0>] handle_percpu_irq+0x68/0x90
[   10.653072]  [<00000001ac809c8e>] generic_handle_irq+0x3e/0x60
[   10.653074]  [<00000001ac77f794>] zpci_floating_irq_handler+0xdc/0x190
[   10.653079]  [<00000001ad2f212e>] do_airq_interrupt+0x8e/0xf0
[   10.653085]  [<00000001ac80b1c6>] __handle_irq_event_percpu+0x5e/0x1b8
[   10.653087]  [<00000001ac80b346>] handle_irq_event_percpu+0x26/0x70
[   10.653090]  [<00000001ac8112f0>] handle_percpu_irq+0x68/0x90
[   10.653092]  [<00000001ac809c8e>] generic_handle_irq+0x3e/0x60
[   10.653094]  [<00000001ac7451d8>] do_irq_async+0x50/0xb0
[   10.653100]  [<00000001ad3b5dca>] do_io_irq+0xba/0x168
[   10.653103]  [<00000001ad3c5736>] io_int_handler+0xd6/0x110
[   10.653107]  [<00000001ad3c57b6>] psw_idle_exit+0x0/0xa
[   10.653109] ([<00000001ad3b6aac>] default_idle_call+0x3c/0x108)
[   10.653112]  [<00000001ac7ea43c>] do_idle+0xd4/0x168
[   10.653115]  [<00000001ac7ea696>] cpu_startup_entry+0x36/0x40
[   10.653118]  [<00000001ac750ac4>] smp_start_secondary+0x12c/0x138
[   10.653121]  [<00000001ad3c5a5e>] restart_int_handler+0x6e/0x90
[   10.653123] Last Breaking-Event-Address:
[   10.653124]  [<000003ff7fccd2a4>] 0x3ff7fccd2a4
[   10.653130] Kernel panic - not syncing: Fatal exception in interrupt




* Re: [PATCH 0/4] nvme patches for 6.3
  2023-02-10 14:34   ` Niklas Schnelle
@ 2023-02-10 15:24     ` Keith Busch
  2023-02-10 15:37       ` Keith Busch
  0 siblings, 1 reply; 16+ messages in thread
From: Keith Busch @ 2023-02-10 15:24 UTC (permalink / raw)
  To: Niklas Schnelle; +Cc: hch, kbusch, linux-nvme, sagi

On Fri, Feb 10, 2023 at 03:34:09PM +0100, Niklas Schnelle wrote:
> Hi Christoph, Hi Keith,
> 
> It looks like this series causes crashes on s390x.
> With current linux-next-20230210 and a Samsung PM173X I get the
> below[0] crash. Reverting patches 1-3 makes the NVMe work again. I
> tried reverting just patch 3, and patches 2 and 3, but this results in
> crashes as well, and as far as I can see patches 2 and 3 depend on
> each other. Not entirely sure what's going on, but patch 3 mentions
> padding to the cache line size and our 256-byte cache lines are
> definitely unusual. I didn't see any obvious place where this would
> break things though. I did debug that, in the crashing
> nvme_unmap_data(), iod->nr_allocations is -1 and iod->use_sgl is
> true, which is weird since it looks to me like iod->nr_allocations
> should only be -1 if sg_list couldn't be allocated from the pool.

Thanks for the notice.

I think the driver must have received a request with multiple physical
segments, but the dma mapping collapsed it to 1. In that case, the driver won't
use the "simple" sgl, but we don't allocate a descriptor either, so we need to
account for that. I'll send a fix shortly.
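
Roughly, the failing path (a simplified sketch of nvme_pci_setup_sgls()
and nvme_unmap_data(), not the verbatim driver code):

	/* setup side: a single mapped entry takes the inline data SGL
	 * and returns without touching the dma pools */
	if (entries == 1) {
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;	/* iod->nr_allocations stays -1 */
	}

	/* completion side: nr_allocations == -1 skips the small-pool
	 * branch, but use_sgl is still true, so we dma_pool_free() a
	 * descriptor that was never allocated */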



* Re: [PATCH 0/4] nvme patches for 6.3
  2023-02-10 15:24     ` Keith Busch
@ 2023-02-10 15:37       ` Keith Busch
  2023-02-10 16:29         ` Niklas Schnelle
  0 siblings, 1 reply; 16+ messages in thread
From: Keith Busch @ 2023-02-10 15:37 UTC (permalink / raw)
  To: Niklas Schnelle; +Cc: hch, kbusch, linux-nvme, sagi

On Fri, Feb 10, 2023 at 08:24:46AM -0700, Keith Busch wrote:
> On Fri, Feb 10, 2023 at 03:34:09PM +0100, Niklas Schnelle wrote:
> > Hi Christoph, Hi Keith,
> > 
> > It looks like this series causes crashes on s390x.
> > With current linux-next-20230210 and a Samsung PM173X I get the
> > below[0] crash. Reverting patches 1-3 makes the NVMe work again. I
> > tried reverting just patch 3, and patches 2 and 3, but this results in
> > crashes as well, and as far as I can see patches 2 and 3 depend on
> > each other. Not entirely sure what's going on, but patch 3 mentions
> > padding to the cache line size and our 256-byte cache lines are
> > definitely unusual. I didn't see any obvious place where this would
> > break things though. I did debug that, in the crashing
> > nvme_unmap_data(), iod->nr_allocations is -1 and iod->use_sgl is
> > true, which is weird since it looks to me like iod->nr_allocations
> > should only be -1 if sg_list couldn't be allocated from the pool.
> 
> Thanks for the notice.
> 
> I think the driver must have received a request with multiple physical
> segments, but the dma mapping collapsed it to 1. In that case, the driver won't
> use the "simple" sgl, but we don't allocate a descriptor either, so we need to
> account for that. I'll send a fix shortly.

This should fix it:

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a331fbfa9a667..6e8fcbf9306d2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -553,14 +553,16 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 
 	dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
 
-	if (iod->nr_allocations == 0)
+	if (iod->nr_allocations == 0) {
 		dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
 			      iod->first_dma);
-	else if (iod->use_sgl)
-		dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
-			      iod->first_dma);
-	else
-		nvme_free_prps(dev, req);
+	} else if (iod->nr_allocations > 0) {
+		if (iod->use_sgl)
+			dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
+				      iod->first_dma);
+		else
+			nvme_free_prps(dev, req);
+	}
 	mempool_free(iod->sgt.sgl, dev->iod_mempool);
 }
 
--



* Re: [PATCH 0/4] nvme patches for 6.3
  2023-02-10 15:37       ` Keith Busch
@ 2023-02-10 16:29         ` Niklas Schnelle
  2023-02-10 17:14           ` Keith Busch
  0 siblings, 1 reply; 16+ messages in thread
From: Niklas Schnelle @ 2023-02-10 16:29 UTC (permalink / raw)
  To: Keith Busch; +Cc: hch, kbusch, linux-nvme, sagi

On Fri, 2023-02-10 at 08:37 -0700, Keith Busch wrote:
> On Fri, Feb 10, 2023 at 08:24:46AM -0700, Keith Busch wrote:
> > On Fri, Feb 10, 2023 at 03:34:09PM +0100, Niklas Schnelle wrote:
> > > Hi Christoph, Hi Keith,
> > > 
> > > It looks like this series causes crashes on s390x.
> > > With current linux-next-20230210 and a Samsung PM173X I get the
> > > below[0] crash. Reverting patches 1-3 makes the NVMe work again. I
> > > tried reverting just patch 3, and patches 2 and 3, but this results in
> > > crashes as well, and as far as I can see patches 2 and 3 depend on
> > > each other. Not entirely sure what's going on, but patch 3 mentions
> > > padding to the cache line size and our 256-byte cache lines are
> > > definitely unusual. I didn't see any obvious place where this would
> > > break things though. I did debug that, in the crashing
> > > nvme_unmap_data(), iod->nr_allocations is -1 and iod->use_sgl is
> > > true, which is weird since it looks to me like iod->nr_allocations
> > > should only be -1 if sg_list couldn't be allocated from the pool.
> > 
> > Thanks for the notice.
> > 
> > I think the driver must have received a request with multiple physical
> > segments, but the dma mapping collapsed it to 1. In that case, the driver won't
> > use the "simple" sgl, but we don't allocate a descriptor either, so we need to
> > account for that. I'll send a fix shortly.
> 
> This should fix it:
> 
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index a331fbfa9a667..6e8fcbf9306d2 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -553,14 +553,16 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
>  
>  	dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
>  
> -	if (iod->nr_allocations == 0)
> +	if (iod->nr_allocations == 0) {
>  		dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
>  			      iod->first_dma);
> -	else if (iod->use_sgl)
> -		dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
> -			      iod->first_dma);
> -	else
> -		nvme_free_prps(dev, req);
> +	} else if (iod->nr_allocations > 0) {
> +		if (iod->use_sgl)
> +			dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
> +				      iod->first_dma);
> +		else
> +			nvme_free_prps(dev, req);
> +	}
>  	mempool_free(iod->sgt.sgl, dev->iod_mempool);
>  }
>  
> --

Thanks, yes, that seems to have been it: with the above patch applied
on top of linux-next-20230210 the crash disappears, and my standard fio
script runs fine too.

So if I understand things correctly, possibly thanks to "nvme-pci: use
mapped entries for sgl decision", a collapsed DMA mapping lands us in
the entries == 1 case in nvme_pci_setup_sgls(), so no extra mapping is
needed and thus the dma_unmap_sgtable() is the only unmapping we need
in nvme_unmap_data()? Does that mean that previously this case might
have used prps and thus now needs fewer mapping operations?

Either way, feel free to add my:

Tested-by: Niklas Schnelle <schnelle@linux.ibm.com>



* Re: [PATCH 0/4] nvme patches for 6.3
  2023-02-10 16:29         ` Niklas Schnelle
@ 2023-02-10 17:14           ` Keith Busch
  0 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2023-02-10 17:14 UTC (permalink / raw)
  To: Niklas Schnelle; +Cc: hch, kbusch, linux-nvme, sagi

On Fri, Feb 10, 2023 at 05:29:31PM +0100, Niklas Schnelle wrote:
> So if I understand things correctly, possibly thanks to "nvme-pci: use
> mapped entries for sgl decision", a collapsed DMA mapping lands us in
> the entries == 1 case in nvme_pci_setup_sgls(), so no extra mapping is
> needed and thus the dma_unmap_sgtable() is the only unmapping we need
> in nvme_unmap_data()? Does that mean that previously this case might
> have used prps and thus now needs fewer mapping operations?

Yes, exactly. The older code could still have decided to use sgls, but it's
definitely more likely with the newer code. This does reduce the memory
allocations, so it should be more efficient.

The older code did account for this use_sgl nr_allocations=-1 correctly though,
and I messed that part up when changing it.

> Either way, feel free to add my:
> 
> Tested-by: Niklas Schnelle <schnelle@linux.ibm.com>

Thanks, I'll send a formal patch today.



* Re: [PATCH 1/4] nvme-pci: remove SGL segment descriptors
  2023-01-05 20:28 ` [PATCH 1/4] nvme-pci: remove SGL segment descriptors Keith Busch
  2023-01-09  7:07   ` Chaitanya Kulkarni
@ 2023-02-13 10:15   ` Sagi Grimberg
  1 sibling, 0 replies; 16+ messages in thread
From: Sagi Grimberg @ 2023-02-13 10:15 UTC (permalink / raw)
  To: Keith Busch, linux-nvme, hch; +Cc: Keith Busch

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>



* Re: [PATCH 3/4] nvme-pci: place descriptor addresses in iod
  2023-01-05 20:28 ` [PATCH 3/4] nvme-pci: place descriptor addresses in iod Keith Busch
@ 2023-02-13 10:19   ` Sagi Grimberg
  0 siblings, 0 replies; 16+ messages in thread
From: Sagi Grimberg @ 2023-02-13 10:19 UTC (permalink / raw)
  To: Keith Busch, linux-nvme, hch; +Cc: Keith Busch

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>


