All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 3/3] nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks
  2019-07-04 10:03 [PATCH 0/3] Miscellaneous changes for nvme-tcp Mikhail Skorzhinskii
@ 2019-07-03 10:47 ` Mikhail Skorzhinskii
  2019-07-08  9:57   ` Sagi Grimberg
  2019-07-04  7:59 ` [PATCH 2/3] nvme-tcp: set the STABLE_WRITES flag when data digests are enabled Mikhail Skorzhinskii
  2019-07-04  8:01 ` [PATCH 1/3] nvmet: print a hint while rejecting NSID 0 or 0xffffffff Mikhail Skorzhinskii
  2 siblings, 1 reply; 8+ messages in thread
From: Mikhail Skorzhinskii @ 2019-07-03 10:47 UTC (permalink / raw)


According to commit a10674bf2406 ("tcp: detecting the misuse of .sendpage
for Slab objects") and previous discussion[1][2], tcp_sendpage should not
be used for pages that are managed by SLAB, as SLAB is not taking page
reference counters into consideration.

This change prevents sendpage calls for payload sending too, although this
is true only for admin commands, so actual data transfer performance
should be untouched.

[1] https://www.spinics.net/lists/netdev/msg553616.html
[2] https://www.spinics.net/lists/netdev/msg553285.html

Signed-off-by: Mikhail Skorzhinskii <mskorzhinskiy at solarflare.com>
---
 drivers/nvme/host/tcp.c | 40 ++++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 08a2501b9357..8cca9967d909 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -844,6 +844,24 @@ static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 	nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR);
 }
 
+static int nvme_tcp_sendpage(struct nvme_tcp_request *req, struct page *page,
+			      size_t offset, size_t len, int flags)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+
+	if (PageSlab(page)) {
+		struct msghdr msg = { .msg_flags = flags };
+		struct kvec iov =  {
+				    .iov_base = page_to_virt(page) + offset,
+				    .iov_len = len
+		};
+
+		return kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	} else {
+		return kernel_sendpage(queue->sock, page, offset, len, flags);
+	}
+}
+
 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
@@ -860,7 +878,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 		else
 			flags |= MSG_MORE;
 
-		ret = kernel_sendpage(queue->sock, page, offset, len, flags);
+		ret = nvme_tcp_sendpage(req, page, offset, len, flags);
 		if (ret <= 0)
 			return ret;
 
@@ -885,6 +903,19 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 	return -EAGAIN;
 }
 
+static inline int nvme_tcp_try_send_pdu(struct nvme_tcp_request *req,
+					int len, int flags)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+	struct msghdr msg = { .msg_flags = flags };
+	struct kvec iov = {
+			   .iov_base = req->pdu + req->offset,
+			   .iov_len = len,
+	};
+
+	return kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+}
+
 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
@@ -898,8 +929,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
-	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-			offset_in_page(pdu) + req->offset, len,  flags);
+	ret = nvme_tcp_try_send_pdu(req, len, flags);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -931,9 +961,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
-	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-			offset_in_page(pdu) + req->offset, len,
-			MSG_DONTWAIT | MSG_MORE);
+	ret = nvme_tcp_try_send_pdu(req, len, MSG_DONTWAIT | MSG_MORE);
 	if (unlikely(ret <= 0))
 		return ret;
 
-- 
2.16.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/3] nvme-tcp: set the STABLE_WRITES flag when data digests are enabled
  2019-07-04 10:03 [PATCH 0/3] Miscellaneous changes for nvme-tcp Mikhail Skorzhinskii
  2019-07-03 10:47 ` [PATCH 3/3] nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks Mikhail Skorzhinskii
@ 2019-07-04  7:59 ` Mikhail Skorzhinskii
  2019-07-08 10:03   ` Sagi Grimberg
  2019-07-04  8:01 ` [PATCH 1/3] nvmet: print a hint while rejecting NSID 0 or 0xffffffff Mikhail Skorzhinskii
  2 siblings, 1 reply; 8+ messages in thread
From: Mikhail Skorzhinskii @ 2019-07-04  7:59 UTC (permalink / raw)


There were a few false alarms sighted on the target side about wrong data digests
while performing high throughput load to XFS filesystem shared through
NVMoF TCP.

This flag tells[1] the rest of the kernel to ensure that the data buffer
does not change while the write is in flight. It incurs a performance
penalty, so only enable it when it is actually needed, i.e. when we are
calculating data digests.

Although even with this change in place[2], ext2 users can still experience
false positives, as ext2 does not respect this flag. This may apply to
vfat as well.

References:

[1] https://lwn.net/Articles/528031/
[2] https://www.redhat.com/archives/dm-devel/2015-May/msg00158.html

Signed-off-by: Mikhail Skorzhinskii <mskorzhinskiy at solarflare.com>
Signed-off-by: Mike Playle <mplayle at solarflare.com>
---
 drivers/nvme/host/core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 5417110cbf1b..f4340dc1d399 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -11,6 +11,7 @@
 #include <linux/hdreg.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/backing-dev.h>
 #include <linux/list_sort.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -3304,6 +3305,10 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		goto out_free_ns;
 	}
 
+	if (ctrl->opts->data_digest)
+		ns->queue->backing_dev_info->capabilities
+			|= BDI_CAP_STABLE_WRITES;
+
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
 	if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
 		blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
-- 
2.16.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 1/3] nvmet: print a hint while rejecting NSID 0 or 0xffffffff
  2019-07-04 10:03 [PATCH 0/3] Miscellaneous changes for nvme-tcp Mikhail Skorzhinskii
  2019-07-03 10:47 ` [PATCH 3/3] nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks Mikhail Skorzhinskii
  2019-07-04  7:59 ` [PATCH 2/3] nvme-tcp: set the STABLE_WRITES flag when data digests are enabled Mikhail Skorzhinskii
@ 2019-07-04  8:01 ` Mikhail Skorzhinskii
  2019-07-08  9:59   ` Sagi Grimberg
  2 siblings, 1 reply; 8+ messages in thread
From: Mikhail Skorzhinskii @ 2019-07-04  8:01 UTC (permalink / raw)


Adding this hint for the sake of convenience.

It was spotted a few times that people spent some time before understanding
what exactly is wrong in the configuration process. This should save some time
in such situations, especially for people who are not very confident with
NVMe requirements.

Signed-off-by: Mikhail Skorzhinskii <mskorzhinskiy at solarflare.com>
---
 drivers/nvme/target/configfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 08dd5af357f7..c91cad4f7927 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -588,8 +588,10 @@ static struct config_group *nvmet_ns_make(struct config_group *group,
 		goto out;
 
 	ret = -EINVAL;
-	if (nsid == 0 || nsid == NVME_NSID_ALL)
+	if (nsid == 0 || nsid == NVME_NSID_ALL) {
+		pr_err("NSIDs 0 and 0xffffffff are invalid NSIDs\n");
 		goto out;
+	}
 
 	ret = -ENOMEM;
 	ns = nvmet_ns_alloc(subsys, nsid);
-- 
2.16.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 0/3] Miscellaneous changes for nvme-tcp
@ 2019-07-04 10:03 Mikhail Skorzhinskii
  2019-07-03 10:47 ` [PATCH 3/3] nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks Mikhail Skorzhinskii
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Mikhail Skorzhinskii @ 2019-07-04 10:03 UTC (permalink / raw)


A few unrelated changes for the NVMoF subsystem that were found during testing
of NVMoF TCP.

I didn't add any special comments inside the code regarding the reasons why
this was done, as I assume people who are interested in this will take a look
at the git blame/git log. Of course, I can add these comments into the
code and resend, if that is preferable.

patch 1: just a hint message
patch 2: found during extensive testing on various filesystems
patch 3: found during testing on debug kernels with VM_DEBUG enabled

Mike Playle (1):
  nvme-tcp: set the STABLE_WRITES flag when data digests are enabled

Mikhail Skorzhinskii (2):
  nvmet: print a hint while rejecting NSID 0 or 0xffffffff
  nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks

 drivers/nvme/host/core.c       |  5 +++++
 drivers/nvme/host/tcp.c        | 40 ++++++++++++++++++++++++++++++++++------
 drivers/nvme/target/configfs.c |  4 +++-
 3 files changed, 42 insertions(+), 7 deletions(-)

-- 
2.16.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 3/3] nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks
  2019-07-03 10:47 ` [PATCH 3/3] nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks Mikhail Skorzhinskii
@ 2019-07-08  9:57   ` Sagi Grimberg
  2019-07-08 10:15     ` Mikhail Skorzhinskii
  0 siblings, 1 reply; 8+ messages in thread
From: Sagi Grimberg @ 2019-07-08  9:57 UTC (permalink / raw)



> According to commit a10674bf2406 ("tcp: detecting the misuse of .sendpage
> for Slab objects") and previous discussion[1][2], tcp_sendpage should not
> be used for pages that are managed by SLAB, as SLAB is not taking page
> reference counters into consideration.
> 
> This change prevents sendpage calls for payload sending too, although this
> is true only for admin commands, so actual data transfer performance
> should be untouched.
> 
> [1] https://www.spinics.net/lists/netdev/msg553616.html
> [2] https://www.spinics.net/lists/netdev/msg553285.html

Well, the lifetime here is guaranteed to be the same for the buffers
in question.

Was any issue seen in action?

> @@ -860,7 +878,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>   		else
>   			flags |= MSG_MORE;
>   
> -		ret = kernel_sendpage(queue->sock, page, offset, len, flags);
> +		ret = nvme_tcp_sendpage(req, page, offset, len, flags);

Please just do this instead:
--
	/* can't zcopy slab pages */
	if (unlikely(PageSlab(page))
		sock_no_sendpage()
	else
		kernel_sendpage()
--

> +static inline int nvme_tcp_try_send_pdu(struct nvme_tcp_request *req,
> +					int len, int flags)
> +{
> +	struct nvme_tcp_queue *queue = req->queue;
> +	struct msghdr msg = { .msg_flags = flags };
> +	struct kvec iov = {
> +			   .iov_base = req->pdu + req->offset,
> +			   .iov_len = len,
> +	};
> +
> +	return kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
> +}
> +
>   static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>   {
>   	struct nvme_tcp_queue *queue = req->queue;
> @@ -898,8 +929,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>   	if (queue->hdr_digest && !req->offset)
>   		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>   
> -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -			offset_in_page(pdu) + req->offset, len,  flags);
> +	ret = nvme_tcp_try_send_pdu(req, len, flags);

This is unneeded given that pdus are page fragments, NOT slab pages.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/3] nvmet: print a hint while rejecting NSID 0 or 0xffffffff
  2019-07-04  8:01 ` [PATCH 1/3] nvmet: print a hint while rejecting NSID 0 or 0xffffffff Mikhail Skorzhinskii
@ 2019-07-08  9:59   ` Sagi Grimberg
  0 siblings, 0 replies; 8+ messages in thread
From: Sagi Grimberg @ 2019-07-08  9:59 UTC (permalink / raw)


Please send it as a separate patch not related to the other two...

> Adding this hint for the sake of convenience.
> 
> It was spotted a few times that people spent some time before understanding
> what exactly is wrong in the configuration process. This should save some time
> in such situations, especially for people who are not very confident with
> NVMe requirements.
> 
> Signed-off-by: Mikhail Skorzhinskii <mskorzhinskiy at solarflare.com>
> ---
>   drivers/nvme/target/configfs.c | 4 +++-
>   1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
> index 08dd5af357f7..c91cad4f7927 100644
> --- a/drivers/nvme/target/configfs.c
> +++ b/drivers/nvme/target/configfs.c
> @@ -588,8 +588,10 @@ static struct config_group *nvmet_ns_make(struct config_group *group,
>   		goto out;
>   
>   	ret = -EINVAL;
> -	if (nsid == 0 || nsid == NVME_NSID_ALL)
> +	if (nsid == 0 || nsid == NVME_NSID_ALL) {
> +		pr_err("NSIDs 0 and 0xffffffff are invalid NSIDs\n");

Why not?
		pr_err("invalid nsid %#x", nsid);

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 2/3] nvme-tcp: set the STABLE_WRITES flag when data digests are enabled
  2019-07-04  7:59 ` [PATCH 2/3] nvme-tcp: set the STABLE_WRITES flag when data digests are enabled Mikhail Skorzhinskii
@ 2019-07-08 10:03   ` Sagi Grimberg
  0 siblings, 0 replies; 8+ messages in thread
From: Sagi Grimberg @ 2019-07-08 10:03 UTC (permalink / raw)



> There were a few false alarms sighted on the target side about wrong data digests
> while performing high throughput load to XFS filesystem shared through
> NVMoF TCP.
> 
> This flag tells[1] the rest of the kernel to ensure that the data buffer
> does not change while the write is in flight. It incurs a performance
> penalty, so only enable it when it is actually needed, i.e. when we are
> calculating data digests.
> 
> Although even with this change in place[2], ext2 users can still experience
> false positives, as ext2 does not respect this flag. This may apply to
> vfat as well.
> 
> References:
> 
> [1] https://lwn.net/Articles/528031/
> [2] https://www.redhat.com/archives/dm-devel/2015-May/msg00158.html

Better not to reference a url in the commit message. You can move
this to under the '---' separator (and don't reference to it, summarize
the explanation instead).

> 
> Signed-off-by: Mikhail Skorzhinskii <mskorzhinskiy at solarflare.com>
> Signed-off-by: Mike Playle <mplayle at solarflare.com>
> ---
>   drivers/nvme/host/core.c | 5 +++++
>   1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 5417110cbf1b..f4340dc1d399 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -11,6 +11,7 @@
>   #include <linux/hdreg.h>
>   #include <linux/kernel.h>
>   #include <linux/module.h>
> +#include <linux/backing-dev.h>
>   #include <linux/list_sort.h>
>   #include <linux/slab.h>
>   #include <linux/types.h>
> @@ -3304,6 +3305,10 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>   		goto out_free_ns;
>   	}
>   
> +	if (ctrl->opts->data_digest)
> +		ns->queue->backing_dev_info->capabilities
> +			|= BDI_CAP_STABLE_WRITES;
> +

This looks correct, other than the change log.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 3/3] nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks
  2019-07-08  9:57   ` Sagi Grimberg
@ 2019-07-08 10:15     ` Mikhail Skorzhinskii
  0 siblings, 0 replies; 8+ messages in thread
From: Mikhail Skorzhinskii @ 2019-07-08 10:15 UTC (permalink / raw)



Sagi Grimberg <sagi at grimberg.me> writes:
 > > According to commit a10674bf2406 ("tcp: detecting the misuse of .sendpage
 > > for Slab objects") and previous discussion[1][2], tcp_sendpage should not
 > > be used for pages that are managed by SLAB, as SLAB is not taking page
 > > reference counters into consideration.
 > >
 > > This change prevents sendpage calls for payload sending too, although this
 > > is true only for admin commands, so actual data transfer performance
 > > should be untouched.
 > >
 > > [1] https://www.spinics.net/lists/netdev/msg553616.html
 > > [2] https://www.spinics.net/lists/netdev/msg553285.html
 >
 > Well, the lifetime here is guaranteed to be the same for the buffers
 > in question.
 >
 > Was any issue seen in action?

No.

And if you go through the threads related to this warning, original bug
only observed when sender and receiver happens to be one physical
machine. Making this patch a bit overreaction to the warning, of course.

I was thinking about disabling this check inside tcp code for our caller
then, but I didn't come up with anything clever, so instead I patch
NVMoF TCP.

 > > @@ -860,7 +878,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 > >   		else
 > >   			flags |= MSG_MORE;
 > >   -		ret = kernel_sendpage(queue->sock, page, offset, len,
 > > flags);
 > > +		ret = nvme_tcp_sendpage(req, page, offset, len, flags);
 >
 > Please just do this instead:
 > --
 > 	/* can't zcopy slab pages */
 > 	if (unlikely(PageSlab(page))
 > 		sock_no_sendpage()
 > 	else
 > 		kernel_sendpage()
 > --

Will fix.

 > > +static inline int nvme_tcp_try_send_pdu(struct nvme_tcp_request *req,
 > > +					int len, int flags)
 > > +{
 > > +	struct nvme_tcp_queue *queue = req->queue;
 > > +	struct msghdr msg = { .msg_flags = flags };
 > > +	struct kvec iov = {
 > > +			   .iov_base = req->pdu + req->offset,
 > > +			   .iov_len = len,
 > > +	};
 > > +
 > > +	return kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
 > > +}
 > > +
 > >   static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 > >   {
 > >   	struct nvme_tcp_queue *queue = req->queue;
 > > @@ -898,8 +929,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 > >   	if (queue->hdr_digest && !req->offset)
 > >   		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 > >   -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
 > > -			offset_in_page(pdu) + req->offset, len,  flags);
 > > +	ret = nvme_tcp_try_send_pdu(req, len, flags);
 >
 > This is unneeded given that pdus are page fragments, NOT slab pages.

Will fix.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-07-08 10:15 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-04 10:03 [PATCH 0/3] Miscellaneous changes for nvme-tcp Mikhail Skorzhinskii
2019-07-03 10:47 ` [PATCH 3/3] nvme-tcp: replace sendpage calls with sendmsg calls for SLAB chunks Mikhail Skorzhinskii
2019-07-08  9:57   ` Sagi Grimberg
2019-07-08 10:15     ` Mikhail Skorzhinskii
2019-07-04  7:59 ` [PATCH 2/3] nvme-tcp: set the STABLE_WRITES flag when data digests are enabled Mikhail Skorzhinskii
2019-07-08 10:03   ` Sagi Grimberg
2019-07-04  8:01 ` [PATCH 1/3] nvmet: print a hint while rejecting NSID 0 or 0xffffffff Mikhail Skorzhinskii
2019-07-08  9:59   ` Sagi Grimberg

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.