All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] NFSv4: Fix the alignment of page data in the getdeviceinfo reply
@ 2020-11-18 22:19 trondmy
  2020-11-18 22:19 ` [PATCH 2/3] NFS: Avoid copy of xdr padding in readlink, layoutget and getxattr trondmy
  0 siblings, 1 reply; 12+ messages in thread
From: trondmy @ 2020-11-18 22:19 UTC (permalink / raw)
  To: linux-nfs

From: Trond Myklebust <trond.myklebust@hammerspace.com>

We can fit the device_addr4 opaque data padding in the pages.

Fixes: cf500bac8fd4 ("SUNRPC: Introduce rpc_prepare_reply_pages()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4xdr.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c6dbfcae7517..c8714381d511 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3009,15 +3009,19 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
+	uint32_t replen;
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
+
+	replen = hdr.replen + op_decode_hdr_maxsz;
+
 	encode_getdeviceinfo(xdr, args, &hdr);
 
-	/* set up reply kvec. Subtract notification bitmap max size (2)
-	 * so that notification bitmap is put in xdr_buf tail */
+	/* set up reply kvec. device_addr4 opaque data is read into the
+	 * pages */
 	rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
-				args->pdev->pglen, hdr.replen - 2);
+				args->pdev->pglen, replen + 2);
 	encode_nops(&hdr);
 }
 
@@ -5848,7 +5852,9 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 	 * and places the remaining xdr data in xdr_buf->tail
 	 */
 	pdev->mincount = be32_to_cpup(p);
-	if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
+	/* Calculate padding */
+	len = xdr_align_size(pdev->mincount);
+	if (xdr_read_pages(xdr, len) != len)
 		return -EIO;
 
 	/* Parse notification bitmap, verifying that it is zero. */
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/3] NFS: Avoid copy of xdr padding in readlink, layoutget and getxattr
  2020-11-18 22:19 [PATCH 1/3] NFSv4: Fix the alignment of page data in the getdeviceinfo reply trondmy
@ 2020-11-18 22:19 ` trondmy
  2020-11-18 22:19   ` [PATCH 3/3] NFS: Avoid copy of xdr padding in read() trondmy
  0 siblings, 1 reply; 12+ messages in thread
From: trondmy @ 2020-11-18 22:19 UTC (permalink / raw)
  To: linux-nfs

From: Trond Myklebust <trond.myklebust@hammerspace.com>

When doing a readlink, layoutget or getxattr call into an array of
pages, we don't care if the XDR padding ends up in the last page
when the data length is not 32-bit aligned.
Avoid an unnecessary copy by just leaving that padding in place.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs2xdr.c  | 2 +-
 fs/nfs/nfs3xdr.c  | 2 +-
 fs/nfs/nfs42xdr.c | 2 +-
 fs/nfs/nfs4xdr.c  | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f6676af37d5d..db9c265ad9e1 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -437,7 +437,7 @@ static int decode_path(struct xdr_stream *xdr)
 	length = be32_to_cpup(p);
 	if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
 		goto out_size;
-	recvd = xdr_read_pages(xdr, length);
+	recvd = xdr_read_pages(xdr, xdr_align_size(length));
 	if (unlikely(length > recvd))
 		goto out_cheating;
 	xdr_terminate_string(xdr->buf, length);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 69971f6c840d..d3e1726d538b 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -234,7 +234,7 @@ static int decode_nfspath3(struct xdr_stream *xdr)
 	count = be32_to_cpup(p);
 	if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
 		goto out_nametoolong;
-	recvd = xdr_read_pages(xdr, count);
+	recvd = xdr_read_pages(xdr, xdr_align_size(count));
 	if (unlikely(count > recvd))
 		goto out_cheating;
 	xdr_terminate_string(xdr->buf, count);
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6e060a88f98c..e58bfe208ae8 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -495,7 +495,7 @@ static int decode_getxattr(struct xdr_stream *xdr,
 	res->xattr_len = len;
 
 	if (len > 0) {
-		rdlen = xdr_read_pages(xdr, len);
+		rdlen = xdr_read_pages(xdr, xdr_align_size(len));
 		if (rdlen < len)
 			return -EIO;
 	}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c8714381d511..755b556e85c3 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5250,7 +5250,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
-	recvd = xdr_read_pages(xdr, len);
+	recvd = xdr_read_pages(xdr, xdr_align_size(len));
 	if (recvd < len) {
 		dprintk("NFS: server cheating in readlink reply: "
 				"count %u > recvd %u\n", len, recvd);
@@ -5925,7 +5925,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 		res->type,
 		res->layoutp->len);
 
-	recvd = xdr_read_pages(xdr, res->layoutp->len);
+	recvd = xdr_read_pages(xdr, xdr_align_size(res->layoutp->len));
 	if (res->layoutp->len > recvd) {
 		dprintk("NFS: server cheating in layoutget reply: "
 				"layout len %u > recvd %u\n",
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-18 22:19 ` [PATCH 2/3] NFS: Avoid copy of xdr padding in readlink, layoutget and getxattr trondmy
@ 2020-11-18 22:19   ` trondmy
  2020-11-19 14:17     ` Chuck Lever
  0 siblings, 1 reply; 12+ messages in thread
From: trondmy @ 2020-11-18 22:19 UTC (permalink / raw)
  To: linux-nfs

From: Trond Myklebust <trond.myklebust@hammerspace.com>

When doing a read() into a page, we also don't care if the nul padding
stays in that last page when the data length is not 32-bit aligned.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs2xdr.c | 2 +-
 fs/nfs/nfs3xdr.c | 2 +-
 fs/nfs/nfs4xdr.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index db9c265ad9e1..468bfbfe44d7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -102,7 +102,7 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
 	if (unlikely(!p))
 		return -EIO;
 	count = be32_to_cpup(p);
-	recvd = xdr_read_pages(xdr, count);
+	recvd = xdr_read_pages(xdr, xdr_align_size(count));
 	if (unlikely(count > recvd))
 		goto out_cheating;
 out:
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d3e1726d538b..8ef7c961d3e2 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1611,7 +1611,7 @@ static int decode_read3resok(struct xdr_stream *xdr,
 	ocount = be32_to_cpup(p++);
 	if (unlikely(ocount != count))
 		goto out_mismatch;
-	recvd = xdr_read_pages(xdr, count);
+	recvd = xdr_read_pages(xdr, xdr_align_size(count));
 	if (unlikely(count > recvd))
 		goto out_cheating;
 out:
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 755b556e85c3..5baa767106dc 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5202,7 +5202,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
 		return -EIO;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p);
-	recvd = xdr_read_pages(xdr, count);
+	recvd = xdr_read_pages(xdr, xdr_align_size(count));
 	if (count > recvd) {
 		dprintk("NFS: server cheating in read reply: "
 				"count %u > recvd %u\n", count, recvd);
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-18 22:19   ` [PATCH 3/3] NFS: Avoid copy of xdr padding in read() trondmy
@ 2020-11-19 14:17     ` Chuck Lever
  2020-11-19 14:30       ` Trond Myklebust
  0 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2020-11-19 14:17 UTC (permalink / raw)
  To: trondmy; +Cc: Linux NFS Mailing List



> On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
> 
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> When doing a read() into a page, we also don't care if the nul padding
> stays in that last page when the data length is not 32-bit aligned.

What if the READ payload lands in the middle of a file? The
pad on the end will overwrite file content just past where
the READ payload lands.


> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> ---
> fs/nfs/nfs2xdr.c | 2 +-
> fs/nfs/nfs3xdr.c | 2 +-
> fs/nfs/nfs4xdr.c | 2 +-
> 3 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
> index db9c265ad9e1..468bfbfe44d7 100644
> --- a/fs/nfs/nfs2xdr.c
> +++ b/fs/nfs/nfs2xdr.c
> @@ -102,7 +102,7 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
> 	if (unlikely(!p))
> 		return -EIO;
> 	count = be32_to_cpup(p);
> -	recvd = xdr_read_pages(xdr, count);
> +	recvd = xdr_read_pages(xdr, xdr_align_size(count));
> 	if (unlikely(count > recvd))
> 		goto out_cheating;
> out:
> diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
> index d3e1726d538b..8ef7c961d3e2 100644
> --- a/fs/nfs/nfs3xdr.c
> +++ b/fs/nfs/nfs3xdr.c
> @@ -1611,7 +1611,7 @@ static int decode_read3resok(struct xdr_stream *xdr,
> 	ocount = be32_to_cpup(p++);
> 	if (unlikely(ocount != count))
> 		goto out_mismatch;
> -	recvd = xdr_read_pages(xdr, count);
> +	recvd = xdr_read_pages(xdr, xdr_align_size(count));
> 	if (unlikely(count > recvd))
> 		goto out_cheating;
> out:
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 755b556e85c3..5baa767106dc 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -5202,7 +5202,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
> 		return -EIO;
> 	eof = be32_to_cpup(p++);
> 	count = be32_to_cpup(p);
> -	recvd = xdr_read_pages(xdr, count);
> +	recvd = xdr_read_pages(xdr, xdr_align_size(count));
> 	if (count > recvd) {
> 		dprintk("NFS: server cheating in read reply: "
> 				"count %u > recvd %u\n", count, recvd);
> -- 
> 2.28.0
> 

--
Chuck Lever




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-19 14:17     ` Chuck Lever
@ 2020-11-19 14:30       ` Trond Myklebust
  2020-11-19 14:31         ` Chuck Lever
  0 siblings, 1 reply; 12+ messages in thread
From: Trond Myklebust @ 2020-11-19 14:30 UTC (permalink / raw)
  To: chuck.lever, trondmy; +Cc: linux-nfs

On Thu, 2020-11-19 at 09:17 -0500, Chuck Lever wrote:
> 
> 
> > On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
> > 
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > When doing a read() into a page, we also don't care if the nul
> > padding
> > stays in that last page when the data length is not 32-bit aligned.
> 
> What if the READ payload lands in the middle of a file? The
> pad on the end will overwrite file content just past where
> the READ payload lands.

If the size > buf->page_len, then it gets truncated in
xdr_align_pages() afaik.

> 
> 
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > ---
> > fs/nfs/nfs2xdr.c | 2 +-
> > fs/nfs/nfs3xdr.c | 2 +-
> > fs/nfs/nfs4xdr.c | 2 +-
> > 3 files changed, 3 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
> > index db9c265ad9e1..468bfbfe44d7 100644
> > --- a/fs/nfs/nfs2xdr.c
> > +++ b/fs/nfs/nfs2xdr.c
> > @@ -102,7 +102,7 @@ static int decode_nfsdata(struct xdr_stream
> > *xdr, struct nfs_pgio_res *result)
> >         if (unlikely(!p))
> >                 return -EIO;
> >         count = be32_to_cpup(p);
> > -       recvd = xdr_read_pages(xdr, count);
> > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> >         if (unlikely(count > recvd))
> >                 goto out_cheating;
> > out:
> > diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
> > index d3e1726d538b..8ef7c961d3e2 100644
> > --- a/fs/nfs/nfs3xdr.c
> > +++ b/fs/nfs/nfs3xdr.c
> > @@ -1611,7 +1611,7 @@ static int decode_read3resok(struct
> > xdr_stream *xdr,
> >         ocount = be32_to_cpup(p++);
> >         if (unlikely(ocount != count))
> >                 goto out_mismatch;
> > -       recvd = xdr_read_pages(xdr, count);
> > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> >         if (unlikely(count > recvd))
> >                 goto out_cheating;
> > out:
> > diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> > index 755b556e85c3..5baa767106dc 100644
> > --- a/fs/nfs/nfs4xdr.c
> > +++ b/fs/nfs/nfs4xdr.c
> > @@ -5202,7 +5202,7 @@ static int decode_read(struct xdr_stream
> > *xdr, struct rpc_rqst *req,
> >                 return -EIO;
> >         eof = be32_to_cpup(p++);
> >         count = be32_to_cpup(p);
> > -       recvd = xdr_read_pages(xdr, count);
> > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> >         if (count > recvd) {
> >                 dprintk("NFS: server cheating in read reply: "
> >                                 "count %u > recvd %u\n", count,
> > recvd);
> > -- 
> > 2.28.0
> > 
> 
> --
> Chuck Lever
> 
> 
> 

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-19 14:30       ` Trond Myklebust
@ 2020-11-19 14:31         ` Chuck Lever
  2020-11-19 14:34           ` Trond Myklebust
  0 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2020-11-19 14:31 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: trondmy, Linux NFS Mailing List



> On Nov 19, 2020, at 9:30 AM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Thu, 2020-11-19 at 09:17 -0500, Chuck Lever wrote:
>> 
>> 
>>> On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
>>> 
>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>> 
>>> When doing a read() into a page, we also don't care if the nul
>>> padding
>>> stays in that last page when the data length is not 32-bit aligned.
>> 
>> What if the READ payload lands in the middle of a file? The
>> pad on the end will overwrite file content just past where
>> the READ payload lands.
> 
> If the size > buf->page_len, then it gets truncated in
> xdr_align_pages() afaik.

I will need to check how RPC/RDMA behaves. It might build a
chunk that includes the pad in this case, which would break
things.



>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
>>> ---
>>> fs/nfs/nfs2xdr.c | 2 +-
>>> fs/nfs/nfs3xdr.c | 2 +-
>>> fs/nfs/nfs4xdr.c | 2 +-
>>> 3 files changed, 3 insertions(+), 3 deletions(-)
>>> 
>>> diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
>>> index db9c265ad9e1..468bfbfe44d7 100644
>>> --- a/fs/nfs/nfs2xdr.c
>>> +++ b/fs/nfs/nfs2xdr.c
>>> @@ -102,7 +102,7 @@ static int decode_nfsdata(struct xdr_stream
>>> *xdr, struct nfs_pgio_res *result)
>>>         if (unlikely(!p))
>>>                 return -EIO;
>>>         count = be32_to_cpup(p);
>>> -       recvd = xdr_read_pages(xdr, count);
>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>         if (unlikely(count > recvd))
>>>                 goto out_cheating;
>>> out:
>>> diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
>>> index d3e1726d538b..8ef7c961d3e2 100644
>>> --- a/fs/nfs/nfs3xdr.c
>>> +++ b/fs/nfs/nfs3xdr.c
>>> @@ -1611,7 +1611,7 @@ static int decode_read3resok(struct
>>> xdr_stream *xdr,
>>>         ocount = be32_to_cpup(p++);
>>>         if (unlikely(ocount != count))
>>>                 goto out_mismatch;
>>> -       recvd = xdr_read_pages(xdr, count);
>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>         if (unlikely(count > recvd))
>>>                 goto out_cheating;
>>> out:
>>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>>> index 755b556e85c3..5baa767106dc 100644
>>> --- a/fs/nfs/nfs4xdr.c
>>> +++ b/fs/nfs/nfs4xdr.c
>>> @@ -5202,7 +5202,7 @@ static int decode_read(struct xdr_stream
>>> *xdr, struct rpc_rqst *req,
>>>                 return -EIO;
>>>         eof = be32_to_cpup(p++);
>>>         count = be32_to_cpup(p);
>>> -       recvd = xdr_read_pages(xdr, count);
>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>         if (count > recvd) {
>>>                 dprintk("NFS: server cheating in read reply: "
>>>                                 "count %u > recvd %u\n", count,
>>> recvd);
>>> -- 
>>> 2.28.0
>>> 
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com

--
Chuck Lever




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-19 14:31         ` Chuck Lever
@ 2020-11-19 14:34           ` Trond Myklebust
  2020-11-19 22:58             ` Chuck Lever
  0 siblings, 1 reply; 12+ messages in thread
From: Trond Myklebust @ 2020-11-19 14:34 UTC (permalink / raw)
  To: chuck.lever; +Cc: linux-nfs

On Thu, 2020-11-19 at 09:31 -0500, Chuck Lever wrote:
> 
> 
> > On Nov 19, 2020, at 9:30 AM, Trond Myklebust <
> > trondmy@hammerspace.com> wrote:
> > 
> > On Thu, 2020-11-19 at 09:17 -0500, Chuck Lever wrote:
> > > 
> > > 
> > > > On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
> > > > 
> > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > 
> > > > When doing a read() into a page, we also don't care if the nul
> > > > padding
> > > > stays in that last page when the data length is not 32-bit
> > > > aligned.
> > > 
> > > What if the READ payload lands in the middle of a file? The
> > > pad on the end will overwrite file content just past where
> > > the READ payload lands.
> > 
> > If the size > buf->page_len, then it gets truncated in
> > xdr_align_pages() afaik.
> 
> I will need to check how RPC/RDMA behaves. It might build a
> chunk that includes the pad in this case, which would break
> things.

That would be a bug in the existing code too, then. It shouldn't be
writing beyond the buffer size we set in the NFS layer.

> > > > Signed-off-by: Trond Myklebust
> > > > <trond.myklebust@hammerspace.com>
> > > > ---
> > > > fs/nfs/nfs2xdr.c | 2 +-
> > > > fs/nfs/nfs3xdr.c | 2 +-
> > > > fs/nfs/nfs4xdr.c | 2 +-
> > > > 3 files changed, 3 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
> > > > index db9c265ad9e1..468bfbfe44d7 100644
> > > > --- a/fs/nfs/nfs2xdr.c
> > > > +++ b/fs/nfs/nfs2xdr.c
> > > > @@ -102,7 +102,7 @@ static int decode_nfsdata(struct xdr_stream
> > > > *xdr, struct nfs_pgio_res *result)
> > > >         if (unlikely(!p))
> > > >                 return -EIO;
> > > >         count = be32_to_cpup(p);
> > > > -       recvd = xdr_read_pages(xdr, count);
> > > > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> > > >         if (unlikely(count > recvd))
> > > >                 goto out_cheating;
> > > > out:
> > > > diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
> > > > index d3e1726d538b..8ef7c961d3e2 100644
> > > > --- a/fs/nfs/nfs3xdr.c
> > > > +++ b/fs/nfs/nfs3xdr.c
> > > > @@ -1611,7 +1611,7 @@ static int decode_read3resok(struct
> > > > xdr_stream *xdr,
> > > >         ocount = be32_to_cpup(p++);
> > > >         if (unlikely(ocount != count))
> > > >                 goto out_mismatch;
> > > > -       recvd = xdr_read_pages(xdr, count);
> > > > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> > > >         if (unlikely(count > recvd))
> > > >                 goto out_cheating;
> > > > out:
> > > > diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> > > > index 755b556e85c3..5baa767106dc 100644
> > > > --- a/fs/nfs/nfs4xdr.c
> > > > +++ b/fs/nfs/nfs4xdr.c
> > > > @@ -5202,7 +5202,7 @@ static int decode_read(struct xdr_stream
> > > > *xdr, struct rpc_rqst *req,
> > > >                 return -EIO;
> > > >         eof = be32_to_cpup(p++);
> > > >         count = be32_to_cpup(p);
> > > > -       recvd = xdr_read_pages(xdr, count);
> > > > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> > > >         if (count > recvd) {
> > > >                 dprintk("NFS: server cheating in read reply: "
> > > >                                 "count %u > recvd %u\n", count,
> > > > recvd);
> > > > -- 
> > > > 2.28.0

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-19 14:34           ` Trond Myklebust
@ 2020-11-19 22:58             ` Chuck Lever
  2020-11-20  0:58               ` Trond Myklebust
  0 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2020-11-19 22:58 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Linux NFS Mailing List



> On Nov 19, 2020, at 9:34 AM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Thu, 2020-11-19 at 09:31 -0500, Chuck Lever wrote:
>> 
>> 
>>> On Nov 19, 2020, at 9:30 AM, Trond Myklebust <
>>> trondmy@hammerspace.com> wrote:
>>> 
>>> On Thu, 2020-11-19 at 09:17 -0500, Chuck Lever wrote:
>>>> 
>>>> 
>>>>> On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
>>>>> 
>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>>> 
>>>>> When doing a read() into a page, we also don't care if the nul
>>>>> padding
>>>>> stays in that last page when the data length is not 32-bit
>>>>> aligned.
>>>> 
>>>> What if the READ payload lands in the middle of a file? The
>>>> pad on the end will overwrite file content just past where
>>>> the READ payload lands.
>>> 
>>> If the size > buf->page_len, then it gets truncated in
>>> xdr_align_pages() afaik.
>> 
>> I will need to check how RPC/RDMA behaves. It might build a
>> chunk that includes the pad in this case, which would break
>> things.
> 
> That would be a bug in the existing code too, then. It shouldn't be
> writing beyond the buffer size we set in the NFS layer.

Testing now with xfstests, which should include fsx with direct
I/O of odd sizes. So far I haven't seen any unexpected behavior.

But I'm not sure what copy you're trying to avoid. This one in
xdr_align_pages() ?

1189         else if (nwords < xdr->nwords) {
1190                 /* Truncate page data and move it into the tail */
1191                 offset = buf->page_len - len;
1192                 copied = xdr_shrink_pagelen(buf, offset);
1193                 trace_rpc_xdr_alignment(xdr, offset, copied);
1194                 xdr->nwords = XDR_QUADLEN(buf->len - cur);
1195         }

We set up the receive buffer already to avoid this copy. It should
rarely, if ever, happen. That's the point of rpc_prepare_reply_pages().


>>>>> Signed-off-by: Trond Myklebust
>>>>> <trond.myklebust@hammerspace.com>
>>>>> ---
>>>>> fs/nfs/nfs2xdr.c | 2 +-
>>>>> fs/nfs/nfs3xdr.c | 2 +-
>>>>> fs/nfs/nfs4xdr.c | 2 +-
>>>>> 3 files changed, 3 insertions(+), 3 deletions(-)
>>>>> 
>>>>> diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
>>>>> index db9c265ad9e1..468bfbfe44d7 100644
>>>>> --- a/fs/nfs/nfs2xdr.c
>>>>> +++ b/fs/nfs/nfs2xdr.c
>>>>> @@ -102,7 +102,7 @@ static int decode_nfsdata(struct xdr_stream
>>>>> *xdr, struct nfs_pgio_res *result)
>>>>>         if (unlikely(!p))
>>>>>                 return -EIO;
>>>>>         count = be32_to_cpup(p);
>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>>>         if (unlikely(count > recvd))
>>>>>                 goto out_cheating;
>>>>> out:
>>>>> diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
>>>>> index d3e1726d538b..8ef7c961d3e2 100644
>>>>> --- a/fs/nfs/nfs3xdr.c
>>>>> +++ b/fs/nfs/nfs3xdr.c
>>>>> @@ -1611,7 +1611,7 @@ static int decode_read3resok(struct
>>>>> xdr_stream *xdr,
>>>>>         ocount = be32_to_cpup(p++);
>>>>>         if (unlikely(ocount != count))
>>>>>                 goto out_mismatch;
>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>>>         if (unlikely(count > recvd))
>>>>>                 goto out_cheating;
>>>>> out:
>>>>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>>>>> index 755b556e85c3..5baa767106dc 100644
>>>>> --- a/fs/nfs/nfs4xdr.c
>>>>> +++ b/fs/nfs/nfs4xdr.c
>>>>> @@ -5202,7 +5202,7 @@ static int decode_read(struct xdr_stream
>>>>> *xdr, struct rpc_rqst *req,
>>>>>                 return -EIO;
>>>>>         eof = be32_to_cpup(p++);
>>>>>         count = be32_to_cpup(p);
>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>>>         if (count > recvd) {
>>>>>                 dprintk("NFS: server cheating in read reply: "
>>>>>                                 "count %u > recvd %u\n", count,
>>>>> recvd);
>>>>> -- 
>>>>> 2.28.0
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
> 
> 

--
Chuck Lever




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-19 22:58             ` Chuck Lever
@ 2020-11-20  0:58               ` Trond Myklebust
  2020-11-20 14:52                 ` Chuck Lever
  0 siblings, 1 reply; 12+ messages in thread
From: Trond Myklebust @ 2020-11-20  0:58 UTC (permalink / raw)
  To: chuck.lever; +Cc: linux-nfs

On Thu, 2020-11-19 at 17:58 -0500, Chuck Lever wrote:
> 
> 
> > On Nov 19, 2020, at 9:34 AM, Trond Myklebust <
> > trondmy@hammerspace.com> wrote:
> > 
> > On Thu, 2020-11-19 at 09:31 -0500, Chuck Lever wrote:
> > > 
> > > 
> > > > On Nov 19, 2020, at 9:30 AM, Trond Myklebust <
> > > > trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Thu, 2020-11-19 at 09:17 -0500, Chuck Lever wrote:
> > > > > 
> > > > > 
> > > > > > On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
> > > > > > 
> > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > 
> > > > > > When doing a read() into a page, we also don't care if the
> > > > > > nul
> > > > > > padding
> > > > > > stays in that last page when the data length is not 32-bit
> > > > > > aligned.
> > > > > 
> > > > > What if the READ payload lands in the middle of a file? The
> > > > > pad on the end will overwrite file content just past where
> > > > > the READ payload lands.
> > > > 
> > > > If the size > buf->page_len, then it gets truncated in
> > > > xdr_align_pages() afaik.
> > > 
> > > I will need to check how RPC/RDMA behaves. It might build a
> > > chunk that includes the pad in this case, which would break
> > > things.
> > 
> > That would be a bug in the existing code too, then. It shouldn't be
> > writing beyond the buffer size we set in the NFS layer.
> 
> Testing now with xfstests, which should include fsx with direct
> I/O of odd sizes. So far I haven't seen any unexpected behavior.
> 
> But I'm not sure what copy you're trying to avoid. This one in
> xdr_align_pages() ?
> 
> 1189         else if (nwords < xdr->nwords) {
> 1190                 /* Truncate page data and move it into the tail
> */
> 1191                 offset = buf->page_len - len;
> 1192                 copied = xdr_shrink_pagelen(buf, offset);
> 1193                 trace_rpc_xdr_alignment(xdr, offset, copied);
> 1194                 xdr->nwords = XDR_QUADLEN(buf->len - cur);
> 1195         }
> 
> We set up the receive buffer already to avoid this copy. It should
> rarely, if ever, happen. That's the point of
> rpc_prepare_reply_pages().


...and the point of padding here is to avoid unaligned access to
memory. That is completely broken by this whole mechanism, which causes
us to place the real data in the tail in an unaligned buffer that
follows this padding.

Furthermore, rpc_prepare_reply_pages() only ever places the padding in
the tail _if_ our buffer size is already not 32-bit aligned. Otherwise,
we're engaging in this pointless exercise of making the tail buffer
data unaligned after the fact.

> 
> > > > > > Signed-off-by: Trond Myklebust
> > > > > > <trond.myklebust@hammerspace.com>
> > > > > > ---
> > > > > > fs/nfs/nfs2xdr.c | 2 +-
> > > > > > fs/nfs/nfs3xdr.c | 2 +-
> > > > > > fs/nfs/nfs4xdr.c | 2 +-
> > > > > > 3 files changed, 3 insertions(+), 3 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
> > > > > > index db9c265ad9e1..468bfbfe44d7 100644
> > > > > > --- a/fs/nfs/nfs2xdr.c
> > > > > > +++ b/fs/nfs/nfs2xdr.c
> > > > > > @@ -102,7 +102,7 @@ static int decode_nfsdata(struct
> > > > > > xdr_stream
> > > > > > *xdr, struct nfs_pgio_res *result)
> > > > > >         if (unlikely(!p))
> > > > > >                 return -EIO;
> > > > > >         count = be32_to_cpup(p);
> > > > > > -       recvd = xdr_read_pages(xdr, count);
> > > > > > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> > > > > >         if (unlikely(count > recvd))
> > > > > >                 goto out_cheating;
> > > > > > out:
> > > > > > diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
> > > > > > index d3e1726d538b..8ef7c961d3e2 100644
> > > > > > --- a/fs/nfs/nfs3xdr.c
> > > > > > +++ b/fs/nfs/nfs3xdr.c
> > > > > > @@ -1611,7 +1611,7 @@ static int decode_read3resok(struct
> > > > > > xdr_stream *xdr,
> > > > > >         ocount = be32_to_cpup(p++);
> > > > > >         if (unlikely(ocount != count))
> > > > > >                 goto out_mismatch;
> > > > > > -       recvd = xdr_read_pages(xdr, count);
> > > > > > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> > > > > >         if (unlikely(count > recvd))
> > > > > >                 goto out_cheating;
> > > > > > out:
> > > > > > diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> > > > > > index 755b556e85c3..5baa767106dc 100644
> > > > > > --- a/fs/nfs/nfs4xdr.c
> > > > > > +++ b/fs/nfs/nfs4xdr.c
> > > > > > @@ -5202,7 +5202,7 @@ static int decode_read(struct
> > > > > > xdr_stream
> > > > > > *xdr, struct rpc_rqst *req,
> > > > > >                 return -EIO;
> > > > > >         eof = be32_to_cpup(p++);
> > > > > >         count = be32_to_cpup(p);
> > > > > > -       recvd = xdr_read_pages(xdr, count);
> > > > > > +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
> > > > > >         if (count > recvd) {
> > > > > >                 dprintk("NFS: server cheating in read
> > > > > > reply: "
> > > > > >                                 "count %u > recvd %u\n",
> > > > > > count,
> > > > > > recvd);
> > > > > > -- 
> > > > > > 2.28.0
> > 
> > -- 
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> > 
> > 
> 
> --
> Chuck Lever
> 
> 
> 

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-20  0:58               ` Trond Myklebust
@ 2020-11-20 14:52                 ` Chuck Lever
  2020-11-20 17:14                   ` Trond Myklebust
  0 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2020-11-20 14:52 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Linux NFS Mailing List



> On Nov 19, 2020, at 7:58 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Thu, 2020-11-19 at 17:58 -0500, Chuck Lever wrote:
>> 
>> 
>>> On Nov 19, 2020, at 9:34 AM, Trond Myklebust <
>>> trondmy@hammerspace.com> wrote:
>>> 
>>> On Thu, 2020-11-19 at 09:31 -0500, Chuck Lever wrote:
>>>> 
>>>> 
>>>>> On Nov 19, 2020, at 9:30 AM, Trond Myklebust <
>>>>> trondmy@hammerspace.com> wrote:
>>>>> 
>>>>> On Thu, 2020-11-19 at 09:17 -0500, Chuck Lever wrote:
>>>>>> 
>>>>>> 
>>>>>>> On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
>>>>>>> 
>>>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>>>>> 
>>>>>>> When doing a read() into a page, we also don't care if the
>>>>>>> nul
>>>>>>> padding
>>>>>>> stays in that last page when the data length is not 32-bit
>>>>>>> aligned.
>>>>>> 
>>>>>> What if the READ payload lands in the middle of a file? The
>>>>>> pad on the end will overwrite file content just past where
>>>>>> the READ payload lands.
>>>>> 
>>>>> If the size > buf->page_len, then it gets truncated in
>>>>> xdr_align_pages() afaik.
>>>> 
>>>> I will need to check how RPC/RDMA behaves. It might build a
>>>> chunk that includes the pad in this case, which would break
>>>> things.
>>> 
>>> That would be a bug in the existing code too, then. It shouldn't be
>>> writing beyond the buffer size we set in the NFS layer.
>> 
>> Testing now with xfstests, which should include fsx with direct
>> I/O of odd sizes. So far I haven't seen any unexpected behavior.
>> 
>> But I'm not sure what copy you're trying to avoid. This one in
>> xdr_align_pages() ?
>> 
>> 1189         else if (nwords < xdr->nwords) {
>> 1190                 /* Truncate page data and move it into the tail
>> */
>> 1191                 offset = buf->page_len - len;
>> 1192                 copied = xdr_shrink_pagelen(buf, offset);
>> 1193                 trace_rpc_xdr_alignment(xdr, offset, copied);
>> 1194                 xdr->nwords = XDR_QUADLEN(buf->len - cur);
>> 1195         }
>> 
>> We set up the receive buffer already to avoid this copy. It should
>> rarely, if ever, happen. That's the point of
>> rpc_prepare_reply_pages().
> 
> 
> ...and the point of padding here is to avoid unaligned access to
> memory. That is completely broken by this whole mechanism, which causes
> us to place the real data in the tail in an unaligned buffer that
> follows this padding.

I've never seen run-time complaints about unaligned accesses of
the tail data. Do you have a reproducer? (And obviously this
applies only to NFSv4 COMPOUND results, right?)


> Furthermore, rpc_prepare_reply_pages() only ever places the padding in
> the tail _if_ our buffer size is already not 32-bit aligned. Otherwise,
> we're engaging in this pointless exercise of making the tail buffer
> data unaligned after the fact.

Architecturally, I agree that it would be best if the tail buffer
presented the XDR data items aligned on 4 bytes.

But I do not agree that a pad goes in the pages. Some transports do
not send an XDR pad for unaligned data payloads. Transports have to
have a way of avoiding that pad. If it no longer goes in the tail,
then how will they do that?

We could perhaps have a new flag, XDRBUF_IMPLICIT_PAD, that means
the transport has to add a pad to xdr->pages on send, or did not add
a pad on receive.


>>>>>>> Signed-off-by: Trond Myklebust
>>>>>>> <trond.myklebust@hammerspace.com>
>>>>>>> ---
>>>>>>> fs/nfs/nfs2xdr.c | 2 +-
>>>>>>> fs/nfs/nfs3xdr.c | 2 +-
>>>>>>> fs/nfs/nfs4xdr.c | 2 +-
>>>>>>> 3 files changed, 3 insertions(+), 3 deletions(-)
>>>>>>> 
>>>>>>> diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
>>>>>>> index db9c265ad9e1..468bfbfe44d7 100644
>>>>>>> --- a/fs/nfs/nfs2xdr.c
>>>>>>> +++ b/fs/nfs/nfs2xdr.c
>>>>>>> @@ -102,7 +102,7 @@ static int decode_nfsdata(struct
>>>>>>> xdr_stream
>>>>>>> *xdr, struct nfs_pgio_res *result)
>>>>>>>         if (unlikely(!p))
>>>>>>>                 return -EIO;
>>>>>>>         count = be32_to_cpup(p);
>>>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>>>>>         if (unlikely(count > recvd))
>>>>>>>                 goto out_cheating;
>>>>>>> out:
>>>>>>> diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
>>>>>>> index d3e1726d538b..8ef7c961d3e2 100644
>>>>>>> --- a/fs/nfs/nfs3xdr.c
>>>>>>> +++ b/fs/nfs/nfs3xdr.c
>>>>>>> @@ -1611,7 +1611,7 @@ static int decode_read3resok(struct
>>>>>>> xdr_stream *xdr,
>>>>>>>         ocount = be32_to_cpup(p++);
>>>>>>>         if (unlikely(ocount != count))
>>>>>>>                 goto out_mismatch;
>>>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>>>>>         if (unlikely(count > recvd))
>>>>>>>                 goto out_cheating;
>>>>>>> out:
>>>>>>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>>>>>>> index 755b556e85c3..5baa767106dc 100644
>>>>>>> --- a/fs/nfs/nfs4xdr.c
>>>>>>> +++ b/fs/nfs/nfs4xdr.c
>>>>>>> @@ -5202,7 +5202,7 @@ static int decode_read(struct
>>>>>>> xdr_stream
>>>>>>> *xdr, struct rpc_rqst *req,
>>>>>>>                 return -EIO;
>>>>>>>         eof = be32_to_cpup(p++);
>>>>>>>         count = be32_to_cpup(p);
>>>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>>>> +       recvd = xdr_read_pages(xdr, xdr_align_size(count));
>>>>>>>         if (count > recvd) {
>>>>>>>                 dprintk("NFS: server cheating in read
>>>>>>> reply: "
>>>>>>>                                 "count %u > recvd %u\n",
>>>>>>> count,
>>>>>>> recvd);
>>>>>>> -- 
>>>>>>> 2.28.0
>>> 
>>> -- 
>>> Trond Myklebust
>>> Linux NFS client maintainer, Hammerspace
>>> trond.myklebust@hammerspace.com
>>> 
>>> 
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com

--
Chuck Lever




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-20 14:52                 ` Chuck Lever
@ 2020-11-20 17:14                   ` Trond Myklebust
  2020-11-20 17:51                     ` Chuck Lever
  0 siblings, 1 reply; 12+ messages in thread
From: Trond Myklebust @ 2020-11-20 17:14 UTC (permalink / raw)
  To: chuck.lever; +Cc: linux-nfs

On Fri, 2020-11-20 at 09:52 -0500, Chuck Lever wrote:
> 
> 
> > On Nov 19, 2020, at 7:58 PM, Trond Myklebust <
> > trondmy@hammerspace.com> wrote:
> > 
> > On Thu, 2020-11-19 at 17:58 -0500, Chuck Lever wrote:
> > > 
> > > 
> > > > On Nov 19, 2020, at 9:34 AM, Trond Myklebust <
> > > > trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Thu, 2020-11-19 at 09:31 -0500, Chuck Lever wrote:
> > > > > 
> > > > > 
> > > > > > On Nov 19, 2020, at 9:30 AM, Trond Myklebust <
> > > > > > trondmy@hammerspace.com> wrote:
> > > > > > 
> > > > > > On Thu, 2020-11-19 at 09:17 -0500, Chuck Lever wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > > On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
> > > > > > > > 
> > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > > > 
> > > > > > > > When doing a read() into a page, we also don't care if
> > > > > > > > the
> > > > > > > > nul
> > > > > > > > padding
> > > > > > > > stays in that last page when the data length is not 32-
> > > > > > > > bit
> > > > > > > > aligned.
> > > > > > > 
> > > > > > > What if the READ payload lands in the middle of a file?
> > > > > > > The
> > > > > > > pad on the end will overwrite file content just past
> > > > > > > where
> > > > > > > the READ payload lands.
> > > > > > 
> > > > > > If the size > buf->page_len, then it gets truncated in
> > > > > > xdr_align_pages() afaik.
> > > > > 
> > > > > I will need to check how RPC/RDMA behaves. It might build a
> > > > > chunk that includes the pad in this case, which would break
> > > > > things.
> > > > 
> > > > That would be a bug in the existing code too, then. It
> > > > shouldn't be
> > > > writing beyond the buffer size we set in the NFS layer.
> > > 
> > > Testing now with xfstests, which should include fsx with direct
> > > I/O of odd sizes. So far I haven't seen any unexpected behavior.
> > > 
> > > But I'm not sure what copy you're trying to avoid. This one in
> > > xdr_align_pages() ?
> > > 
> > > 1189         else if (nwords < xdr->nwords) {
> > > 1190                 /* Truncate page data and move it into the
> > > tail
> > > */
> > > 1191                 offset = buf->page_len - len;
> > > 1192                 copied = xdr_shrink_pagelen(buf, offset);
> > > 1193                 trace_rpc_xdr_alignment(xdr, offset,
> > > copied);
> > > 1194                 xdr->nwords = XDR_QUADLEN(buf->len - cur);
> > > 1195         }
> > > 
> > > We set up the receive buffer already to avoid this copy. It
> > > should
> > > rarely, if ever, happen. That's the point of
> > > rpc_prepare_reply_pages().
> > 
> > 
> > ...and the point of padding here is to avoid unaligned access to
> > memory. That is completely broken by this whole mechanism, which
> > causes
> > us to place the real data in the tail in an unaligned buffer that
> > follows this padding.
> 
> I've never seen run-time complaints about unaligned accesses of
> the tail data. Do you have a reproducer? (And obviously this
> applies only to NFSv4 COMPOUND results, right?)

I don't think we can trigger it anywhere right now mainly because we're
careful never to put any further ops after a read, readdir or readlink,
even though we probably should append a VERIFY to some of those so as
to protect data caching (particularly for readlink).

> 
> > Furthermore, rpc_prepare_reply_pages() only ever places the padding
> > in
> > the tail _if_ our buffer size is already not 32-bit aligned.
> > Otherwise,
> > we're engaging in this pointless exercise of making the tail buffer
> > data unaligned after the fact.
> 
> Architecturally, I agree that it would be best if the tail buffer
> presented the XDR data items aligned on 4 bytes.
> 
> But I do not agree that a pad goes in the pages. Some transports do
> not send an XDR pad for unaligned data payloads. Transports have to
> have a way of avoiding that pad. If it no longer goes in the tail,
> then how will they do that?
> 
> We could perhaps have a new flag, XDRBUF_IMPLICIT_PAD, that means
> the transport has to add a pad to xdr->pages on send, or did not add
> a pad on receive.
> 

I don't see why we would need any of this. If the pad is being directly
placed in the tail by the RPC transport layer, then there is nothing in
xdr_read_pages() that will move it or change the alignment of the
existing code in the tail. The call just notes that xdr_align_pages()
returned a length that was shorter than the one we supplied, and
continues to skip the padding in the tail.

The one thing that perhaps is broken here is that if we supply a length
to xdr_read_pages() that is in fact larger than xdr_align_size(xdr-
>buf->page_len), then we should skip additional data that was placed in
the tail, but that is a bug with the existing xdr_read_pages() and is
unaffected by these patches.

> 
> > > > > > > > Signed-off-by: Trond Myklebust
> > > > > > > > <trond.myklebust@hammerspace.com>
> > > > > > > > ---
> > > > > > > > fs/nfs/nfs2xdr.c | 2 +-
> > > > > > > > fs/nfs/nfs3xdr.c | 2 +-
> > > > > > > > fs/nfs/nfs4xdr.c | 2 +-
> > > > > > > > 3 files changed, 3 insertions(+), 3 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
> > > > > > > > index db9c265ad9e1..468bfbfe44d7 100644
> > > > > > > > --- a/fs/nfs/nfs2xdr.c
> > > > > > > > +++ b/fs/nfs/nfs2xdr.c
> > > > > > > > @@ -102,7 +102,7 @@ static int decode_nfsdata(struct
> > > > > > > > xdr_stream
> > > > > > > > *xdr, struct nfs_pgio_res *result)
> > > > > > > >         if (unlikely(!p))
> > > > > > > >                 return -EIO;
> > > > > > > >         count = be32_to_cpup(p);
> > > > > > > > -       recvd = xdr_read_pages(xdr, count);
> > > > > > > > +       recvd = xdr_read_pages(xdr,
> > > > > > > > xdr_align_size(count));
> > > > > > > >         if (unlikely(count > recvd))
> > > > > > > >                 goto out_cheating;
> > > > > > > > out:
> > > > > > > > diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
> > > > > > > > index d3e1726d538b..8ef7c961d3e2 100644
> > > > > > > > --- a/fs/nfs/nfs3xdr.c
> > > > > > > > +++ b/fs/nfs/nfs3xdr.c
> > > > > > > > @@ -1611,7 +1611,7 @@ static int
> > > > > > > > decode_read3resok(struct
> > > > > > > > xdr_stream *xdr,
> > > > > > > >         ocount = be32_to_cpup(p++);
> > > > > > > >         if (unlikely(ocount != count))
> > > > > > > >                 goto out_mismatch;
> > > > > > > > -       recvd = xdr_read_pages(xdr, count);
> > > > > > > > +       recvd = xdr_read_pages(xdr,
> > > > > > > > xdr_align_size(count));
> > > > > > > >         if (unlikely(count > recvd))
> > > > > > > >                 goto out_cheating;
> > > > > > > > out:
> > > > > > > > diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> > > > > > > > index 755b556e85c3..5baa767106dc 100644
> > > > > > > > --- a/fs/nfs/nfs4xdr.c
> > > > > > > > +++ b/fs/nfs/nfs4xdr.c
> > > > > > > > @@ -5202,7 +5202,7 @@ static int decode_read(struct
> > > > > > > > xdr_stream
> > > > > > > > *xdr, struct rpc_rqst *req,
> > > > > > > >                 return -EIO;
> > > > > > > >         eof = be32_to_cpup(p++);
> > > > > > > >         count = be32_to_cpup(p);
> > > > > > > > -       recvd = xdr_read_pages(xdr, count);
> > > > > > > > +       recvd = xdr_read_pages(xdr,
> > > > > > > > xdr_align_size(count));
> > > > > > > >         if (count > recvd) {
> > > > > > > >                 dprintk("NFS: server cheating in read
> > > > > > > > reply: "
> > > > > > > >                                 "count %u > recvd
> > > > > > > > %u\n",
> > > > > > > > count,
> > > > > > > > recvd);
> > > > > > > > -- 
> > > > > > > > 2.28.0
> > > > 
> > > > -- 
> > > > Trond Myklebust
> > > > Linux NFS client maintainer, Hammerspace
> > > > trond.myklebust@hammerspace.com
> > > > 
> > > > 
> > > 
> > > --
> > > Chuck Lever
> > > 
> > > 
> > > 
> > 
> > -- 
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> 
> --
> Chuck Lever
> 
> 
> 

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] NFS: Avoid copy of xdr padding in read()
  2020-11-20 17:14                   ` Trond Myklebust
@ 2020-11-20 17:51                     ` Chuck Lever
  0 siblings, 0 replies; 12+ messages in thread
From: Chuck Lever @ 2020-11-20 17:51 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Linux NFS Mailing List



> On Nov 20, 2020, at 12:14 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Fri, 2020-11-20 at 09:52 -0500, Chuck Lever wrote:
>> 
>> 
>>> On Nov 19, 2020, at 7:58 PM, Trond Myklebust <
>>> trondmy@hammerspace.com> wrote:
>>> 
>>> On Thu, 2020-11-19 at 17:58 -0500, Chuck Lever wrote:
>>>> 
>>>> 
>>>>> On Nov 19, 2020, at 9:34 AM, Trond Myklebust <
>>>>> trondmy@hammerspace.com> wrote:
>>>>> 
>>>>> On Thu, 2020-11-19 at 09:31 -0500, Chuck Lever wrote:
>>>>>> 
>>>>>> 
>>>>>>> On Nov 19, 2020, at 9:30 AM, Trond Myklebust <
>>>>>>> trondmy@hammerspace.com> wrote:
>>>>>>> 
>>>>>>> On Thu, 2020-11-19 at 09:17 -0500, Chuck Lever wrote:
>>>>>>>> 
>>>>>>>> 
>>>>>>>>> On Nov 18, 2020, at 5:19 PM, trondmy@kernel.org wrote:
>>>>>>>>> 
>>>>>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>>>>>>> 
>>>>>>>>> When doing a read() into a page, we also don't care if
>>>>>>>>> the
>>>>>>>>> nul
>>>>>>>>> padding
>>>>>>>>> stays in that last page when the data length is not 32-
>>>>>>>>> bit
>>>>>>>>> aligned.
>>>>>>>> 
>>>>>>>> What if the READ payload lands in the middle of a file?
>>>>>>>> The
>>>>>>>> pad on the end will overwrite file content just past
>>>>>>>> where
>>>>>>>> the READ payload lands.
>>>>>>> 
>>>>>>> If the size > buf->page_len, then it gets truncated in
>>>>>>> xdr_align_pages() afaik.
>>>>>> 
>>>>>> I will need to check how RPC/RDMA behaves. It might build a
>>>>>> chunk that includes the pad in this case, which would break
>>>>>> things.
>>>>> 
>>>>> That would be a bug in the existing code too, then. It
>>>>> shouldn't be
>>>>> writing beyond the buffer size we set in the NFS layer.
>>>> 
>>>> Testing now with xfstests, which should include fsx with direct
>>>> I/O of odd sizes. So far I haven't seen any unexpected behavior.
>>>> 
>>>> But I'm not sure what copy you're trying to avoid. This one in
>>>> xdr_align_pages() ?
>>>> 
>>>> 1189         else if (nwords < xdr->nwords) {
>>>> 1190                 /* Truncate page data and move it into the
>>>> tail
>>>> */
>>>> 1191                 offset = buf->page_len - len;
>>>> 1192                 copied = xdr_shrink_pagelen(buf, offset);
>>>> 1193                 trace_rpc_xdr_alignment(xdr, offset,
>>>> copied);
>>>> 1194                 xdr->nwords = XDR_QUADLEN(buf->len - cur);
>>>> 1195         }
>>>> 
>>>> We set up the receive buffer already to avoid this copy. It
>>>> should
>>>> rarely, if ever, happen. That's the point of
>>>> rpc_prepare_reply_pages().
>>> 
>>> 
>>> ...and the point of padding here is to avoid unaligned access to
>>> memory. That is completely broken by this whole mechanism, which
>>> causes
>>> us to place the real data in the tail in an unaligned buffer that
>>> follows this padding.
>> 
>> I've never seen run-time complaints about unaligned accesses of
>> the tail data. Do you have a reproducer? (And obviously this
>> applies only to NFSv4 COMPOUND results, right?)
> 
> I don't think we can trigger it anywhere right now mainly because we're
> careful never to put any further ops after a read, readdir or readlink,
> even though we probably should append a VERIFY to some of those so as
> to protect data caching (particularly for readlink).
> 
>> 
>>> Furthermore, rpc_prepare_reply_pages() only ever places the padding
>>> in
>>> the tail _if_ our buffer size is already not 32-bit aligned.
>>> Otherwise,
>>> we're engaging in this pointless exercise of making the tail buffer
>>> data unaligned after the fact.
>> 
>> Architecturally, I agree that it would be best if the tail buffer
>> presented the XDR data items aligned on 4 bytes.
>> 
>> But I do not agree that a pad goes in the pages. Some transports do
>> not send an XDR pad for unaligned data payloads. Transports have to
>> have a way of avoiding that pad. If it no longer goes in the tail,
>> then how will they do that?
>> 
>> We could perhaps have a new flag, XDRBUF_IMPLICIT_PAD, that means
>> the transport has to add a pad to xdr->pages on send, or did not add
>> a pad on receive.
>> 
> 
> I don't see why we would need any of this.

Right. I was conflating the send and receive side. You're talking
about only receive. This would be a problem when the structure of
the reply doesn't allow the client to align the pages and tail[]
in advance.

So, READ/READLINK/READDIR/LISTXATTRS, on either TCP, or with RDMA
and krb5i and krb5p where XDR pads have to stay inline. I'm still
a little troubled by what happens to the pad during a short READ.

The common case for READ requests is page-aligned payloads, so I
don't think tail content alignment is a performance issue. Do you
concur?


> If the pad is being directly
> placed in the tail by the RPC transport layer, then there is nothing in
> xdr_read_pages() that will move it or change the alignment of the
> existing code in the tail. The call just notes that xdr_align_pages()
> returned a length that was shorter than the one we supplied, and
> continues to skip the padding in the tail.

That's what I observe today. I'm trying to understand the usage
scenario where the tail content is not word-aligned. Probably can
happen on server-side receives too.


> The one thing that perhaps is broken here is that if we supply a length
> to xdr_read_pages() that is in fact larger than xdr_align_size(xdr-
>> buf->page_len), then we should skip additional data that was placed in
> the tail, but that is a bug with the existing xdr_read_pages() and is
> unaffected by these patches.

I think I understand what these patches do well enough to drop my
objection, pending further exercise with a testing barrage.


>>>>>>>>> Signed-off-by: Trond Myklebust
>>>>>>>>> <trond.myklebust@hammerspace.com>
>>>>>>>>> ---
>>>>>>>>> fs/nfs/nfs2xdr.c | 2 +-
>>>>>>>>> fs/nfs/nfs3xdr.c | 2 +-
>>>>>>>>> fs/nfs/nfs4xdr.c | 2 +-
>>>>>>>>> 3 files changed, 3 insertions(+), 3 deletions(-)
>>>>>>>>> 
>>>>>>>>> diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
>>>>>>>>> index db9c265ad9e1..468bfbfe44d7 100644
>>>>>>>>> --- a/fs/nfs/nfs2xdr.c
>>>>>>>>> +++ b/fs/nfs/nfs2xdr.c
>>>>>>>>> @@ -102,7 +102,7 @@ static int decode_nfsdata(struct
>>>>>>>>> xdr_stream
>>>>>>>>> *xdr, struct nfs_pgio_res *result)
>>>>>>>>>         if (unlikely(!p))
>>>>>>>>>                 return -EIO;
>>>>>>>>>         count = be32_to_cpup(p);
>>>>>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>>>>>> +       recvd = xdr_read_pages(xdr,
>>>>>>>>> xdr_align_size(count));
>>>>>>>>>         if (unlikely(count > recvd))
>>>>>>>>>                 goto out_cheating;
>>>>>>>>> out:
>>>>>>>>> diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
>>>>>>>>> index d3e1726d538b..8ef7c961d3e2 100644
>>>>>>>>> --- a/fs/nfs/nfs3xdr.c
>>>>>>>>> +++ b/fs/nfs/nfs3xdr.c
>>>>>>>>> @@ -1611,7 +1611,7 @@ static int
>>>>>>>>> decode_read3resok(struct
>>>>>>>>> xdr_stream *xdr,
>>>>>>>>>         ocount = be32_to_cpup(p++);
>>>>>>>>>         if (unlikely(ocount != count))
>>>>>>>>>                 goto out_mismatch;
>>>>>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>>>>>> +       recvd = xdr_read_pages(xdr,
>>>>>>>>> xdr_align_size(count));
>>>>>>>>>         if (unlikely(count > recvd))
>>>>>>>>>                 goto out_cheating;
>>>>>>>>> out:
>>>>>>>>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>>>>>>>>> index 755b556e85c3..5baa767106dc 100644
>>>>>>>>> --- a/fs/nfs/nfs4xdr.c
>>>>>>>>> +++ b/fs/nfs/nfs4xdr.c
>>>>>>>>> @@ -5202,7 +5202,7 @@ static int decode_read(struct
>>>>>>>>> xdr_stream
>>>>>>>>> *xdr, struct rpc_rqst *req,
>>>>>>>>>                 return -EIO;
>>>>>>>>>         eof = be32_to_cpup(p++);
>>>>>>>>>         count = be32_to_cpup(p);
>>>>>>>>> -       recvd = xdr_read_pages(xdr, count);
>>>>>>>>> +       recvd = xdr_read_pages(xdr,
>>>>>>>>> xdr_align_size(count));
>>>>>>>>>         if (count > recvd) {
>>>>>>>>>                 dprintk("NFS: server cheating in read
>>>>>>>>> reply: "
>>>>>>>>>                                 "count %u > recvd
>>>>>>>>> %u\n",
>>>>>>>>> count,
>>>>>>>>> recvd);
>>>>>>>>> -- 
>>>>>>>>> 2.28.0
>>>>> 
>>>>> -- 
>>>>> Trond Myklebust
>>>>> Linux NFS client maintainer, Hammerspace
>>>>> trond.myklebust@hammerspace.com
>>>>> 
>>>>> 
>>>> 
>>>> --
>>>> Chuck Lever
>>>> 
>>>> 
>>>> 
>>> 
>>> -- 
>>> Trond Myklebust
>>> Linux NFS client maintainer, Hammerspace
>>> trond.myklebust@hammerspace.com
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com

--
Chuck Lever




^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2020-11-20 17:52 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-18 22:19 [PATCH 1/3] NFSv4: Fix the alignment of page data in the getdeviceinfo reply trondmy
2020-11-18 22:19 ` [PATCH 2/3] NFS: Avoid copy of xdr padding in readlink, layoutget and getxattr trondmy
2020-11-18 22:19   ` [PATCH 3/3] NFS: Avoid copy of xdr padding in read() trondmy
2020-11-19 14:17     ` Chuck Lever
2020-11-19 14:30       ` Trond Myklebust
2020-11-19 14:31         ` Chuck Lever
2020-11-19 14:34           ` Trond Myklebust
2020-11-19 22:58             ` Chuck Lever
2020-11-20  0:58               ` Trond Myklebust
2020-11-20 14:52                 ` Chuck Lever
2020-11-20 17:14                   ` Trond Myklebust
2020-11-20 17:51                     ` Chuck Lever

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.