* [PATCH] nvme-pci: fix prp list allocation
@ 2022-02-12 20:06 Matthew Waltz
  2022-02-12 20:48 ` Keith Busch
  0 siblings, 1 reply; 5+ messages in thread
From: Matthew Waltz @ 2022-02-12 20:06 UTC (permalink / raw)
  To: matthewwaltzis
  Cc: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	linux-nvme, linux-kernel

Fixes kernel block errors originating from the hard-coded 256-byte
alignment of dma_pool_create(). The NVMe specification requires a PRP
List PBAO offset field of 0h, i.e. a PRP List must be aligned to the
configured 4096-byte memory page size.
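
For reference, the two pools are currently created like this (trimmed
from nvme_setup_prp_pools()):

    dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
                                         NVME_CTRL_PAGE_SIZE,
                                         NVME_CTRL_PAGE_SIZE, 0);

    /* Optimisation for I/Os between 4k and 128k */
    dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
                                          256, 256, 0);

so blocks returned from the small pool are only guaranteed 256-byte
alignment rather than page alignment.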

Essentially reverts commit 99802a7aee2b ("NVMe: Optimise memory usage
for I/Os between 4k and 128k")

Resolved by using the default PRP page pool, which is properly aligned.

Signed-off-by: Matthew Waltz <matthewwaltzis@gmail.com>
---
 drivers/nvme/host/pci.c | 51 +++++++++++------------------------------
 1 file changed, 14 insertions(+), 37 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6a99ed680915..4fcb159c7df3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -119,7 +119,6 @@ struct nvme_dev {
 	u32 __iomem *dbs;
 	struct device *dev;
 	struct dma_pool *prp_page_pool;
-	struct dma_pool *prp_small_pool;
 	unsigned online_queues;
 	unsigned max_qid;
 	unsigned io_queues[HCTX_MAX_TYPES];
@@ -228,7 +227,7 @@ struct nvme_iod {
 	struct nvme_queue *nvmeq;
 	bool use_sgl;
 	int aborted;
-	int npages;		/* In the PRP list. 0 means small pool in use */
+	int npages;		/* In the PRP list. */
 	int nents;		/* Used in scatterlist */
 	dma_addr_t first_dma;
 	unsigned int dma_len;	/* length of single DMA segment mapping */
@@ -598,10 +597,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 	WARN_ON_ONCE(!iod->nents);
 
 	nvme_unmap_sg(dev, req);
-	if (iod->npages == 0)
-		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
-			      iod->first_dma);
-	else if (iod->use_sgl)
+	if (iod->use_sgl)
 		nvme_free_sgls(dev, req);
 	else
 		nvme_free_prps(dev, req);
@@ -635,7 +631,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 	__le64 *prp_list;
 	void **list = nvme_pci_iod_list(req);
 	dma_addr_t prp_dma;
-	int nprps, i;
+	int i;
 
 	length -= (NVME_CTRL_PAGE_SIZE - offset);
 	if (length <= 0) {
@@ -657,14 +653,8 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 		goto done;
 	}
 
-	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
-	if (nprps <= (256 / 8)) {
-		pool = dev->prp_small_pool;
-		iod->npages = 0;
-	} else {
-		pool = dev->prp_page_pool;
-		iod->npages = 1;
-	}
+	pool = dev->prp_page_pool;
+	iod->npages = 1;
 
 	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
 	if (!prp_list) {
@@ -753,13 +743,8 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 		return BLK_STS_OK;
 	}
 
-	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
-		pool = dev->prp_small_pool;
-		iod->npages = 0;
-	} else {
-		pool = dev->prp_page_pool;
-		iod->npages = 1;
-	}
+	pool = dev->prp_page_pool;
+	iod->npages = 1;
 
 	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
 	if (!sg_list) {
@@ -2727,7 +2712,7 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
 	return 0;
 }
 
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
+static int nvme_setup_prp_pool(struct nvme_dev *dev)
 {
 	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
 						NVME_CTRL_PAGE_SIZE,
@@ -2735,20 +2720,12 @@ static int nvme_setup_prp_pools(struct nvme_dev *dev)
 	if (!dev->prp_page_pool)
 		return -ENOMEM;
 
-	/* Optimisation for I/Os between 4k and 128k */
-	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
-						256, 256, 0);
-	if (!dev->prp_small_pool) {
-		dma_pool_destroy(dev->prp_page_pool);
-		return -ENOMEM;
-	}
 	return 0;
 }
 
-static void nvme_release_prp_pools(struct nvme_dev *dev)
+static void nvme_release_prp_pool(struct nvme_dev *dev)
 {
 	dma_pool_destroy(dev->prp_page_pool);
-	dma_pool_destroy(dev->prp_small_pool);
 }
 
 static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3080,7 +3057,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
 	mutex_init(&dev->shutdown_lock);
 
-	result = nvme_setup_prp_pools(dev);
+	result = nvme_setup_prp_pool(dev);
 	if (result)
 		goto unmap;
 
@@ -3109,7 +3086,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 						GFP_KERNEL, node);
 	if (!dev->iod_mempool) {
 		result = -ENOMEM;
-		goto release_pools;
+		goto release_pool;
 	}
 
 	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
@@ -3126,8 +3103,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
  release_mempool:
 	mempool_destroy(dev->iod_mempool);
- release_pools:
-	nvme_release_prp_pools(dev);
+ release_pool:
+	nvme_release_prp_pool(dev);
  unmap:
 	nvme_dev_unmap(dev);
  put_pci:
@@ -3198,7 +3175,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_free_host_mem(dev);
 	nvme_dev_remove_admin(dev);
 	nvme_free_queues(dev, 0);
-	nvme_release_prp_pools(dev);
+	nvme_release_prp_pool(dev);
 	nvme_dev_unmap(dev);
 	nvme_uninit_ctrl(&dev->ctrl);
 }
-- 
2.25.1


* Re: [PATCH] nvme-pci: fix prp list allocation
  2022-02-12 20:06 [PATCH] nvme-pci: fix prp list allocation Matthew Waltz
@ 2022-02-12 20:48 ` Keith Busch
  2022-02-12 20:53   ` Matt Waltz
  0 siblings, 1 reply; 5+ messages in thread
From: Keith Busch @ 2022-02-12 20:48 UTC (permalink / raw)
  To: Matthew Waltz
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme, linux-kernel

On Sat, Feb 12, 2022 at 01:06:49PM -0700, Matthew Waltz wrote:
> Fixes kernel block errors originating from the hard-coded 256-byte
> alignment of dma_pool_create(). The NVMe specification requires a PRP
> List PBAO offset field of 0h, i.e. a PRP List must be aligned to the
> configured 4096-byte memory page size.
 
That is not correct. The spec (2.0a section 4.1.1) says:

   The first PRP List entry [...] shall be qword aligned and may also
   have a non-zero offset within the memory page.
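
In driver terms that is only an 8-byte alignment requirement on the
address placed in PRP2, roughly (sketch, using the existing prp_dma
from nvme_pci_setup_prps()):

    /* the first PRP List entry need only be qword (8-byte) aligned */
    WARN_ON_ONCE(prp_dma & (sizeof(__le64) - 1));

and a 256-byte-aligned allocation from the small pool trivially
satisfies that.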

* Re: [PATCH] nvme-pci: fix prp list allocation
  2022-02-12 20:48 ` Keith Busch
@ 2022-02-12 20:53   ` Matt Waltz
  2022-02-12 21:02     ` Keith Busch
  0 siblings, 1 reply; 5+ messages in thread
From: Matt Waltz @ 2022-02-12 20:53 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme, linux-kernel

That is the wrong part of the specification, describing the PRP entries,
not the list pointer. The relevant portion of the spec (2.0a figure
109 page 129) says:

    If this entry is not the first PRP entry in the command or a
    PRP List pointer in a command, then the Offset portion of this
    field shall be cleared to 0h
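
Under that reading (just a sketch of what I mean), a block handed out
by the 256-byte pool may sit at a non-zero offset within the 4 KiB
controller page:

    /* e.g. prp_dma == ...f100: offset within the page is 0x100, not 0h */
    u32 offset = prp_dma & (NVME_CTRL_PAGE_SIZE - 1);

which is what I read the spec as disallowing for a PRP List.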


On Sat, Feb 12, 2022 at 1:48 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Sat, Feb 12, 2022 at 01:06:49PM -0700, Matthew Waltz wrote:
> > Fixes kernel block errors originating from the hard-coded 256-byte
> > alignment of dma_pool_create(). The NVMe specification requires a PRP
> > List PBAO offset field of 0h, i.e. a PRP List must be aligned to the
> > configured 4096-byte memory page size.
>
> That is not correct. The spec (2.0a section 4.1.1) says:
>
>    The first PRP List entry [...] shall be qword aligned and may also
>    have a non-zero offset within the memory page.


* Re: [PATCH] nvme-pci: fix prp list allocation
  2022-02-12 20:53   ` Matt Waltz
@ 2022-02-12 21:02     ` Keith Busch
  2022-02-12 22:16       ` Matt Waltz
  0 siblings, 1 reply; 5+ messages in thread
From: Keith Busch @ 2022-02-12 21:02 UTC (permalink / raw)
  To: Matt Waltz
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme, linux-kernel

On Sat, Feb 12, 2022 at 01:53:59PM -0700, Matt Waltz wrote:
> That is the wrong part of the specification, describing the PRP entries,
> not the list pointer. 

The text for the qword alignment rule specifically says "PRP List
entry", i.e. the entry associated with the command's PRP2 field.

> The relevant portion of the spec (2.0a figure
> 109 page 129) says:
> 
>     If this entry is not the first PRP entry in the command or a
>     PRP List pointer in a command, then the Offset portion of this
>     field shall be cleared to 0h

That only applies to chaining list elements. It does not apply to the
"PRP list pointer in a command". The driver allocates from the 256B dma
pool only for the command's PRP list pointer, so we're fine.
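
That is also why the small pool can never feed a chained list element;
the driver only picks it when the whole list fits in a single 256-byte
block (existing code, trimmed):

    nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
    if (nprps <= (256 / 8)) {           /* at most 32 entries, one 256B list */
        pool = dev->prp_small_pool;
        iod->npages = 0;
    } else {
        pool = dev->prp_page_pool;      /* chained lists only come from here */
        iod->npages = 1;
    }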


* Re: [PATCH] nvme-pci: fix prp list allocation
  2022-02-12 21:02     ` Keith Busch
@ 2022-02-12 22:16       ` Matt Waltz
  0 siblings, 0 replies; 5+ messages in thread
From: Matt Waltz @ 2022-02-12 22:16 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme, linux-kernel

On Sat, Feb 12, 2022 at 2:03 PM Keith Busch <kbusch@kernel.org> wrote:
> That only applies to chaining list elements. It does not apply to the
> "PRP list pointer in a command". The driver allocates from the 256B dma
> pool only for the command's PRP list pointer, so we're fine.

Ah yes, I was confused by the PRP2 definition, which says it can be a
"PRP list pointer"; now I feel extremely silly. I guess I'll need to
look elsewhere for a fix for the bug I'm encountering. Thank you for
the clarity!

