From: Keith Busch <kbusch@meta.com>
To: <linux-nvme@lists.infradead.org>, <hch@lst.de>, <sagi@grimberg.me>
Cc: Keith Busch <kbusch@kernel.org>
Subject: [PATCH 3/4] nvme-pci: place descriptor addresses in iod
Date: Thu, 5 Jan 2023 12:28:32 -0800	[thread overview]
Message-ID: <20230105202833.3225645-4-kbusch@meta.com> (raw)
In-Reply-To: <20230105202833.3225645-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

The 'struct nvme_iod' space is appended to the end of the preallocated
'struct request' and padded out to the cache line size. This leaves some
free memory (in most kernel configs) up for grabs.
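
For reference, blk-mq sizes each preallocated request roughly as follows
(a sketch of the block layer's behavior, not verbatim source):

	/* the driver pdu is tacked on and rounded to a whole cache line */
	rq_size = round_up(sizeof(struct request) + cmd_size,
			   cache_line_size());

so any iod bytes short of the next cache line boundary are already paid
for.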

Instead of appending the nvme data descriptor addresses after the
scatterlist, inline them for free within struct nvme_iod. There is now
enough space in the mempool for 128 possible segments.
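
Roughly, the per-command memory moves around like this (a simplified
sketch; 'nseg'/'npages' stand for the old per-request maximums):

	before: request pdu = [ struct nvme_iod ]
	        mempool buf = [ scatterlist[nseg] | void *list[npages] ]
	after:  request pdu = [ struct nvme_iod | union nvme_descriptor list[5] ]
	        mempool buf = [ scatterlist[128] ]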

And without increasing the size of the preallocated requests, we can
hold up to 5 PRP descriptor elements, allowing the driver to increase
its max transfer size to 8MB.
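
The 5 falls out of nvme_pci_npages_prp() with a 4k NVME_CTRL_PAGE_SIZE,
worked out by hand here (assuming the helper's existing max_bytes/nprps
math):

	max_bytes = 8192 * 1024 + 4096 = 8392704   /* worst-case unaligned 8MB */
	nprps     = 8392704 / 4096     = 2049      /* PRP entries needed */
	npages    = DIV_ROUND_UP(8 * 2049, 4096 - 8)
	          = DIV_ROUND_UP(16392, 4088) = 5  /* 8B/entry, last slot chains */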

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 49 +++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 31 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 08ae1d69bf8c4..b294b41a149a7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -42,8 +42,9 @@
  * These can be higher, but we need to ensure that any command doesn't
  * require an sg allocation that needs more than a page of data.
  */
-#define NVME_MAX_KB_SZ	4096
-#define NVME_MAX_SEGS	127
+#define NVME_MAX_KB_SZ	8192
+#define NVME_MAX_SEGS	128
+#define NVME_MAX_NR_ALLOCATIONS	5
 
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0444);
@@ -215,6 +216,11 @@ struct nvme_queue {
 	struct completion delete_done;
 };
 
+union nvme_descriptor {
+	struct nvme_sgl_desc	*sg_list;
+	__le64			*prp_list;
+};
+
 /*
  * The nvme_iod describes the data in an I/O.
  *
@@ -232,6 +238,7 @@ struct nvme_iod {
 	dma_addr_t first_dma;
 	dma_addr_t meta_dma;
 	struct sg_table sgt;
+	union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
 };
 
 static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -386,16 +393,6 @@ static int nvme_pci_npages_prp(void)
 	return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
 }
 
-/*
- * Calculates the number of pages needed for the SGL segments. For example a 4k
- * page can accommodate 256 SGL descriptors.
- */
-static int nvme_pci_npages_sgl(void)
-{
-	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
-			NVME_CTRL_PAGE_SIZE);
-}
-
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -509,12 +506,6 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
 	spin_unlock(&nvmeq->sq_lock);
 }
 
-static void **nvme_pci_iod_list(struct request *req)
-{
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	return (void **)(iod->sgt.sgl + blk_rq_nr_phys_segments(req));
-}
-
 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
 				     int nseg)
 {
@@ -540,7 +531,7 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
 	int i;
 
 	for (i = 0; i < iod->nr_allocations; i++) {
-		__le64 *prp_list = nvme_pci_iod_list(req)[i];
+		__le64 *prp_list = iod->list[i].prp_list;
 		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
 
 		dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
@@ -563,10 +554,10 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 	dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
 
 	if (iod->nr_allocations == 0)
-		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+		dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
 			      iod->first_dma);
 	else if (iod->use_sgl)
-		dma_pool_free(dev->prp_page_pool, nvme_pci_iod_list(req)[0],
+		dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
 			      iod->first_dma);
 	else
 		nvme_free_prps(dev, req);
@@ -598,7 +589,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 	u64 dma_addr = sg_dma_address(sg);
 	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
 	__le64 *prp_list;
-	void **list = nvme_pci_iod_list(req);
 	dma_addr_t prp_dma;
 	int nprps, i;
 
@@ -636,7 +626,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 		iod->nr_allocations = -1;
 		return BLK_STS_RESOURCE;
 	}
-	list[0] = prp_list;
+	iod->list[0].prp_list = prp_list;
 	iod->first_dma = prp_dma;
 	i = 0;
 	for (;;) {
@@ -645,7 +635,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
 			if (!prp_list)
 				goto free_prps;
-			list[iod->nr_allocations++] = prp_list;
+			iod->list[iod->nr_allocations++].prp_list = prp_list;
 			prp_list[0] = old_prp_list[i - 1];
 			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
 			i = 1;
@@ -727,7 +717,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 		return BLK_STS_RESOURCE;
 	}
 
-	nvme_pci_iod_list(req)[0] = sg_list;
+	iod->list[0].sg_list = sg_list;
 	iod->first_dma = sgl_dma;
 
 	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
@@ -2661,11 +2651,8 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
 
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
-	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
-	size_t alloc_size = sizeof(__le64 *) * npages +
-			    sizeof(struct scatterlist) * NVME_MAX_SEGS;
+	size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
 
-	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
 	dev->iod_mempool = mempool_create_node(1,
 			mempool_kmalloc, mempool_kfree,
 			(void *)alloc_size, GFP_KERNEL,
@@ -3485,9 +3472,9 @@ static int __init nvme_init(void)
 	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
 	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
-	BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) >
-		     S8_MAX);
 	BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
+	BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
+	BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
 
 	return pci_register_driver(&nvme_driver);
 }
-- 
2.30.2


