* [PATCH] NVMe: avoid kmalloc/kfree for smaller IO
@ 2015-01-22  4:38 Jens Axboe
  2015-01-22 17:26 ` Keith Busch
  0 siblings, 1 reply; 6+ messages in thread
From: Jens Axboe @ 2015-01-22  4:38 UTC (permalink / raw)



Currently we allocate an nvme_iod for each IO, which holds the
sg list, prps, and other IO related info. Set a threshold of
2 pages and/or 8KB of data, below which we can just embed this
in the per-command pdu in blk-mq. For any IO at or below
NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree.

For higher IOPS, this saves up to 1% of CPU time.

Signed-off-by: Jens Axboe <axboe at fb.com>

----

Starting to flush out a few of the patches that we're running at FB
on NVMe; this is one of them. We've been running it for the last ~6
months in various types of testing. It's pretty straightforward; the
ugliest part is storing the "embedded" flag in the bottom bit of
->private. The alternative is adding a separate flag for this instead,
but that'd eat up a bit of space. The patch does kill the ->node list
member, which isn't used for anything anyway.
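
To illustrate the trick in isolation, here's a minimal sketch of the
bottom-bit tagging, outside of the driver (the names are made up for
the example; kmalloc'd pointers are at least word aligned, so bit 0 is
always free to carry a flag):

	static void *tag_ptr(void *p, bool embedded)
	{
		/* bit 0 set: object is embedded, don't kfree it */
		return (void *) ((unsigned long) p | (embedded ? 0x1UL : 0));
	}

	static void *untag_ptr(void *p)
	{
		return (void *) ((unsigned long) p & ~0x1UL);
	}

	static bool is_embedded(void *p)
	{
		return (unsigned long) p & 0x1UL;
	}

In the patch, the tagged value lives in iod->private (hence the switch
from void * to unsigned long), with iod_get_private() and
iod_should_kfree() playing the role of untag_ptr() and is_embedded().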


diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d826bf3e62c8..f7a337272138 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
 	void *ctx;
 	int aborted;
 	struct nvme_queue *nvmeq;
+	struct nvme_iod iod[0];
 };
 
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES		2
+#define NVME_INT_BYTES		(NVME_INT_PAGES * PAGE_CACHE_SIZE)
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+	unsigned int ret = sizeof(struct nvme_cmd_info);
+
+	ret += sizeof(struct nvme_iod);
+	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES, dev);
+	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+	return ret;
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
 	blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
 }
 
+static void *iod_get_private(struct nvme_iod *iod)
+{
+	return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+	return (iod->private & 0x01) == 0;
+}
+
 /* Special values must be less than 0x1000 */
 #define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
 #define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
@@ -361,35 +403,52 @@ static __le64 **iod_list(struct nvme_iod *iod)
 	return ((void *)iod) + iod->offset;
 }
 
-/*
- * Will slightly overestimate the number of pages needed.  This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+			    unsigned nseg, unsigned long private)
 {
-	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
-	return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+	iod->private = private;
+	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+	iod->npages = -1;
+	iod->length = nbytes;
+	iod->nents = 0;
 }
 
 static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+		 unsigned long priv, gfp_t gfp)
 {
 	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-				sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+				sizeof(__le64 *) * nvme_npages(bytes, dev) +
 				sizeof(struct scatterlist) * nseg, gfp);
 
-	if (iod) {
-		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-		iod->npages = -1;
-		iod->length = nbytes;
-		iod->nents = 0;
-		iod->first_dma = 0ULL;
-	}
+	if (iod)
+		iod_init(iod, bytes, nseg, priv);
 
 	return iod;
 }
 
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+			               gfp_t gfp)
+{
+	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+                                                sizeof(struct nvme_dsm_range);
+	unsigned long mask = 0;
+	struct nvme_iod *iod;
+
+	if (rq->nr_phys_segments <= NVME_INT_PAGES && size <= NVME_INT_BYTES) {
+		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+		iod = cmd->iod;
+		mask = 0x01;
+		iod_init(iod, size, rq->nr_phys_segments,
+				(unsigned long) rq | 0x01);
+		return iod;
+	}
+
+	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+				(unsigned long) rq, gfp);
+}
+
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
@@ -405,7 +464,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
 		prp_dma = next_prp_dma;
 	}
-	kfree(iod);
+
+	if (iod_should_kfree(iod))
+		kfree(iod);
 }
 
 static int nvme_error_status(u16 status)
@@ -424,7 +485,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	struct nvme_iod *iod = ctx;
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -585,7 +646,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 							struct nvme_ns *ns)
 {
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_command *cmnd;
 	u16 control = 0;
 	u32 dsmgmt = 0;
@@ -626,17 +687,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
-	int psegs = req->nr_phys_segments;
 	enum dma_data_direction dma_dir;
-	unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
-						sizeof(struct nvme_dsm_range);
 
-	iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
-	iod->private = req;
-
 	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
 		/*
@@ -651,10 +707,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
-	} else if (psegs) {
+	} else if (req->nr_phys_segments) {
 		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
-		sg_init_table(iod->sg, psegs);
+		sg_init_table(iod->sg, req->nr_phys_segments);
 		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
 		if (!iod->nents)
 			goto error_cmd;
@@ -1408,7 +1464,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
-		dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 		dev->admin_tagset.driver_data = dev;
 
 		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1522,7 +1578,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	}
 
 	err = -ENOMEM;
-	iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
 	if (!iod)
 		goto put_pages;
 
@@ -2148,7 +2204,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-	dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->tagset.cmd_size = nvme_cmd_size(dev);
 	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 	dev->tagset.driver_data = dev;
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 258945fcabf1..19a5d4b23209 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -132,13 +132,12 @@ struct nvme_ns {
  * allocated to store the PRP list.
  */
 struct nvme_iod {
-	void *private;		/* For the use of the submitter of the I/O */
+	unsigned long private;	/* For the use of the submitter of the I/O */
 	int npages;		/* In the PRP list. 0 means small pool in use */
 	int offset;		/* Of PRP list */
 	int nents;		/* Used in scatterlist */
 	int length;		/* Of data, in bytes */
 	dma_addr_t first_dma;
-	struct list_head node;
 	struct scatterlist sg[0];
 };
 

-- 
Jens Axboe


* [PATCH] NVMe: avoid kmalloc/kfree for smaller IO
  2015-01-22  4:38 [PATCH] NVMe: avoid kmalloc/kfree for smaller IO Jens Axboe
@ 2015-01-22 17:26 ` Keith Busch
       [not found]   ` <CANvN+end4BQMAeBZKggbA8k+gjQzKj2xYx+=hovbTvmK_GH4Ug@mail.gmail.com>
  2015-01-22 18:59   ` Jens Axboe
  0 siblings, 2 replies; 6+ messages in thread
From: Keith Busch @ 2015-01-22 17:26 UTC (permalink / raw)


On Wed, 21 Jan 2015, Jens Axboe wrote:
> Currently we allocate an nvme_iod for each IO, which holds the
> sg list, prps, and other IO related info. Set a threshold of
> 2 pages and/or 8KB of data, below which we can just embed this
> in the per-command pdu in blk-mq. For any IO at or below
> NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree.
>
> For higher IOPS, this saves up to 1% of CPU time.
>
> Signed-off-by: Jens Axboe <axboe at fb.com>
>
> ----

> +/*
> + * Max size of iod being embedded in the request payload
> + */
> +#define NVME_INT_PAGES		2
> +#define NVME_INT_BYTES		(NVME_INT_PAGES * PAGE_CACHE_SIZE)

I think the above needs to use what the device considers a page size,
right? If there's a mismatch between the host and device page sizes,
nvme_setup_prps could end up accessing a non-existent prp list.

   #define NVME_INT_BYTES(dev) (NVME_INT_PAGES * dev->page_size)
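
For illustration, the caller side would then key off the device page
size as well; a sketch (helper name invented here, not a tested
change):

	static bool nvme_rq_fits_inline_iod(struct request *rq,
					    struct nvme_dev *dev, unsigned size)
	{
		return rq->nr_phys_segments <= NVME_INT_PAGES &&
		       size <= NVME_INT_BYTES(dev);
	}

with nvme_alloc_iod() using that instead of the open-coded check
against the compile-time NVME_INT_BYTES.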

Otherwise, looks good!


* [PATCH] NVMe: avoid kmalloc/kfree for smaller IO
       [not found]   ` <CANvN+end4BQMAeBZKggbA8k+gjQzKj2xYx+=hovbTvmK_GH4Ug@mail.gmail.com>
@ 2015-01-22 18:33     ` Keith Busch
       [not found]       ` <CANvN+emqnrm1YE2Ft68=Q+aMBgDUVEUhHUTfPwPM=yBX-EhtHg@mail.gmail.com>
  0 siblings, 1 reply; 6+ messages in thread
From: Keith Busch @ 2015-01-22 18:33 UTC (permalink / raw)


On Thu, 22 Jan 2015, Andrey Kuzmin wrote:
> On Jan 22, 2015 8:27 PM, "Keith Busch" <keith.busch@intel.com> wrote:
> > On Wed, 21 Jan 2015, Jens Axboe wrote:
> >>
> >> Currently we allocate an nvme_iod for each IO, which holds the
> >> sg list, prps, and other IO related info. Set a threshold of
> >> 2 pages and/or 8KB of data, below which we can just embed this
> >> in the per-command pdu in blk-mq. For any IO at or below
> >> NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree.
> >>
> >> For higher IOPS, this saves up to 1% of CPU time.
> >>
> >> Signed-off-by: Jens Axboe <axboe at fb.com>
> >>
> >> ----
> >
> >
> >> +/*
> >> + * Max size of iod being embedded in the request payload
> >> + */
> >> +#define NVME_INT_PAGES		2
> >> +#define NVME_INT_BYTES		(NVME_INT_PAGES * PAGE_CACHE_SIZE)
> >
> >
> > I think the above needs to use what the device considers a page size,
> > right? If there's a mismatch between the host and device page sizes,
> > nvme_setup_prps could end up accessing a non-existent prp list.
> >
> 
> AFAIR, per spec the NVMe device operates in terms of the system page size.

That's the ideal situation, but the device and system don't always have
the same capabilities. Pretty much every nvme controller supports 4k,
and for many, that's all they understand. Some archs use 8k pages, so
we have to split a system page into two logical pages when setting up
the PRPs. The alternative is to not work at all, but the sales people
didn't like that idea.
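
As a rough illustration of what that split looks like (4K device page,
8K host page; this is just the arithmetic, not the actual
nvme_setup_prps code, and sg/prp_list stand in for its state):

	unsigned host_page_size = 8192, dev_page_size = 4096;
	dma_addr_t dma_addr = sg_dma_address(sg);
	unsigned offset;
	int n = 0;

	/* one host page becomes host_page_size / dev_page_size PRP
	 * entries, each dev_page_size apart */
	for (offset = 0; offset < host_page_size; offset += dev_page_size)
		prp_list[n++] = cpu_to_le64(dma_addr + offset);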


* [PATCH] NVMe: avoid kmalloc/kfree for smaller IO
       [not found]       ` <CANvN+emqnrm1YE2Ft68=Q+aMBgDUVEUhHUTfPwPM=yBX-EhtHg@mail.gmail.com>
@ 2015-01-22 18:47         ` Keith Busch
       [not found]           ` <CANvN+enF8Btab2NbuALyzRfpmkfR0ibRVbfCt-ZxWdW9vt9AmA@mail.gmail.com>
  0 siblings, 1 reply; 6+ messages in thread
From: Keith Busch @ 2015-01-22 18:47 UTC (permalink / raw)


On Thu, 22 Jan 2015, Andrey Kuzmin wrote:
> On Jan 22, 2015 9:33 PM, "Keith Busch" <keith.busch@intel.com> wrote:
> > On Thu, 22 Jan 2015, Andrey Kuzmin wrote:
> >> AFAIR, per spec the NVMe device operates in terms of the system page size.
> >
> > That's the ideal situation, but the device and system don't always have
> > the same capabilities. Pretty much every nvme controller supports 4k,
> > and for many, that's all they understand. Some archs use 8k pages, so
> 
> Doesn't this get negotiated at the PCIe level?

It's set through the NVMe controller registers during initialization,
based on the controller's reported capabilities and the system page
size. This just informs the controller of the DMA alignment to use for
its PRP scatter-gather mechanism.
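
Roughly, the init-time selection looks like this (a simplified sketch;
function and variable names are illustrative rather than the exact
driver identifiers, the real code reads CAP and programs CC):

	static int nvme_pick_page_size(struct nvme_dev *dev, unsigned cap_mpsmin,
				       unsigned cap_mpsmax, unsigned *cc_mps)
	{
		unsigned page_shift = PAGE_SHIFT;	/* host page: 12 = 4K, 13 = 8K */
		unsigned dev_page_min = 12 + cap_mpsmin;	/* from CAP.MPSMIN */
		unsigned dev_page_max = 12 + cap_mpsmax;	/* from CAP.MPSMAX */

		if (page_shift < dev_page_min)
			return -ENODEV;		/* controller can't address pages this small */
		if (page_shift > dev_page_max)
			page_shift = dev_page_max;	/* split host pages when building PRPs */

		dev->page_size = 1 << page_shift;
		*cc_mps = page_shift - 12;	/* value to program into CC.MPS */
		return 0;
	}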


* [PATCH] NVMe: avoid kmalloc/kfree for smaller IO
  2015-01-22 17:26 ` Keith Busch
       [not found]   ` <CANvN+end4BQMAeBZKggbA8k+gjQzKj2xYx+=hovbTvmK_GH4Ug@mail.gmail.com>
@ 2015-01-22 18:59   ` Jens Axboe
  1 sibling, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2015-01-22 18:59 UTC (permalink / raw)


On 01/22/2015 10:26 AM, Keith Busch wrote:
> On Wed, 21 Jan 2015, Jens Axboe wrote:
>> Currently we allocate an nvme_iod for each IO, which holds the
>> sg list, prps, and other IO related info. Set a threshold of
>> 2 pages and/or 8KB of data, below which we can just embed this
>> in the per-command pdu in blk-mq. For any IO at or below
>> NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree.
>>
>> For higher IOPS, this saves up to 1% of CPU time.
>>
>> Signed-off-by: Jens Axboe <axboe at fb.com>
>>
>> ----
> 
>> +/*
>> + * Max size of iod being embedded in the request payload
>> + */
>> +#define NVME_INT_PAGES        2
>> +#define NVME_INT_BYTES        (NVME_INT_PAGES * PAGE_CACHE_SIZE)
> 
> I think the above needs to use what the device considers a page size,
> right? If there's a mismatch between the host and device page sizes,
> nvme_setup_prps could end up accessing a non-existent prp list.
> 
>   #define NVME_INT_BYTES(dev) (NVME_INT_PAGES * dev->page_size)

Good point, I missed that aspect of it. I'll make that change and repost.

-- 
Jens Axboe


* [PATCH] NVMe: avoid kmalloc/kfree for smaller IO
       [not found]           ` <CANvN+enF8Btab2NbuALyzRfpmkfR0ibRVbfCt-ZxWdW9vt9AmA@mail.gmail.com>
@ 2015-01-22 19:26             ` Keith Busch
  0 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2015-01-22 19:26 UTC (permalink / raw)


On Thu, 22 Jan 2015, Andrey Kuzmin wrote:
> According to 1.2 (that's what I've got at hand, see the quote below), the controller reports its min/max supported page size, and the host then sets the memory page size (via the CC.MPS register) within the reported limits. So that
> negotiation should be responsible for setting and storing the page size, and Jens's code can then just use the value the driver has programmed into CC.MPS.

Right, and that was the suggestion. :)

The spec doesn't call this out, but the host can set CC.MPS to any value
in the controller's capable range as long as that value is less than or
equal to the host's true page size.

... actually you could probably make it work the other way around (host
page size smaller than the device's), but that'd force a lot of IO
splitting, and no such device exists anyway.

> 3.1.5 Offset 14h: CC – Controller Configuration
> This register modifies settings for the controller. Host software shall
> set the Arbitration Mechanism (CC.AMS), the Memory Page Size (CC.MPS),
> and the Command Set (CC.CSS) to valid values prior to enabling the
> controller by setting CC.EN to '1'.
> 
> Memory Page Size (MPS): This field indicates the host memory page size.
> The memory page size is (2 ^ (12 + MPS)). Thus, the minimum host memory
> page size is 4KB and the maximum host memory page size is 128MB. The
> value set by host software shall be a supported value as indicated by
> the CAP.MPSMAX and CAP.MPSMIN fields. This field describes the value
> used for PRP entry size. This field shall only be modified when EN is
> cleared to '0'.
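
Plugging numbers into that formula: MPS = 0 gives 2^12 = 4KB, MPS = 1
gives 8KB, and MPS = 15 gives 2^27 = 128MB. A host with 8K pages
(PAGE_SHIFT = 13) would program MPS = 13 - 12 = 1, assuming that value
falls within the CAP.MPSMIN..CAP.MPSMAX range.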
