* server tcp performance patches
From: J. Bruce Fields @ 2011-04-06 23:06 UTC
  To: linux-nfs

We previously attempted to turn on autotuning of nfsd's tcp receive
buffers, to enable better performance on networks with large
bandwidth-delay products, but ran into regressions on local gigabit
networks.

At the time, Trond proposed modifying the server to receive partial rpc
calls as they arrive instead of waiting for the entire request, reasoning
that this would a) free up receive buffer space sooner, allowing the server
to advertise a larger window sooner, and b) avoid a theoretical deadlock
that is possible if the receive buffer ever falls below the minimum
required to hold a complete request.
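
To make the deadlock concern concrete, this is roughly the old
wait-for-the-whole-record test (simplified from the code patch 6 removes):
if the receive buffer is smaller than sk_reclen, the test can never succeed
once the client has filled the window, and neither side makes progress.

        len = svc_recv_available(svsk);   /* bytes queued in the socket  */
        if (len < svsk->sk_reclen)        /* whole record not here yet   */
                return -EAGAIN;           /* ...so don't read anything   */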

Trond's approach seemed to solve the observed regression, but we didn't
completely understand why, so I put off applying the patches.

I still don't completely understand the cause of the original regression,
but the change seems like a reasonable thing to do anyway and solves an
immediate problem, so I've updated Trond's original patch (and split it up
slightly); the main changes required were adapting it to the 4.1
backchannel reply receive code.

I'm considering queueing this up for 2.6.40.

--b.


* [PATCH 1/7] SUNRPC: requeue tcp socket less frequently
From: J. Bruce Fields @ 2011-04-06 23:06 UTC
  To: linux-nfs; +Cc: Trond Myklebust, J. Bruce Fields

From: Trond Myklebust <Trond.Myklebust@netapp.com>

Don't requeue the socket in some cases where we know it's unnecessary.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
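For reference, rpc_fraghdr is the 4-byte record-marking header that precedes
every RPC-over-TCP fragment, so the test added at the end of the receive path
(shown simplified below) only re-queues the socket when at least part of
another record is already waiting:

        if (svc_recv_available(svsk) > sizeof(rpc_fraghdr))
                set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
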
 net/sunrpc/svcsock.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d802e94..29b31df 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -959,7 +959,6 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 		goto err_again;	/* record not complete */
 	}
 	len = svsk->sk_reclen;
-	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
 	return len;
  error:
@@ -1109,6 +1108,10 @@ out:
 	/* Reset TCP read info */
 	svsk->sk_reclen = 0;
 	svsk->sk_tcplen = 0;
+	/* If we have more data, signal svc_xprt_enqueue() to try again */
+	if (svc_recv_available(svsk) > sizeof(rpc_fraghdr))
+		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+
 
 	svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
 	if (serv->sv_stats)
-- 
1.7.1


* [PATCH 2/7] SUNRPC: svc_tcp_recvfrom cleanup
From: J. Bruce Fields @ 2011-04-06 23:06 UTC
  To: linux-nfs; +Cc: Trond Myklebust, J. Bruce Fields

From: Trond Myklebust <Trond.Myklebust@netapp.com>

Minor cleanup in preparation for later patches.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
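For reference, these are the byte counts the arithmetic below relies on (the
4-byte record marker has already been stripped into sk_reclen by this point);
the struct is only an illustration of the layout, not a real type:

        struct rpc_tcp_record_head {    /* illustrative only, not a real struct */
                __be32 xid;             /* transaction id                        */
                __be32 calldir;         /* 0 = call, 1 = reply                   */
                /* followed by sk_reclen - 8 bytes of call or reply body;
                 * the first 8 bytes are read by svc_process_calldir() */
        };
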
 net/sunrpc/svcsock.c |   34 +++++++++++++++-------------------
 1 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 29b31df..828fff7 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -887,6 +887,7 @@ failed:
 static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 {
 	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
+	unsigned int want;
 	int len;
 
 	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
@@ -909,9 +910,9 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
 	if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
-		int		want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
 		struct kvec	iov;
 
+		want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
 		iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
 		iov.iov_len  = want;
 		if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
@@ -1034,8 +1035,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
 	int		len;
 	struct kvec *vec;
-	int pnum, vlen;
 	struct rpc_rqst *req = NULL;
+	unsigned int vlen;
+	int pnum;
 
 	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
 		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
@@ -1066,7 +1068,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	}
 
 	pnum = 1;
-	while (vlen < len) {
+	while (vlen < svsk->sk_reclen - 8) {
 		vec[pnum].iov_base = (req) ?
 			page_address(req->rq_private_buf.pages[pnum - 1]) :
 			page_address(rqstp->rq_pages[pnum]);
@@ -1077,29 +1079,23 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_respages = &rqstp->rq_pages[pnum];
 
 	/* Now receive data */
-	len = svc_recvfrom(rqstp, vec, pnum, len);
+	len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen - 8);
 	if (len < 0)
 		goto err_again;
 
-	/*
-	 * Account for the 8 bytes we read earlier
-	 */
-	len += 8;
-
 	if (req) {
-		xprt_complete_rqst(req->rq_task, len);
-		len = 0;
+		xprt_complete_rqst(req->rq_task, svsk->sk_reclen);
+		rqstp->rq_arg.len = 0;
 		goto out;
 	}
-	dprintk("svc: TCP complete record (%d bytes)\n", len);
-	rqstp->rq_arg.len = len;
+	dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen);
+	rqstp->rq_arg.len = svsk->sk_reclen;
 	rqstp->rq_arg.page_base = 0;
-	if (len <= rqstp->rq_arg.head[0].iov_len) {
-		rqstp->rq_arg.head[0].iov_len = len;
+	if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
+		rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
 		rqstp->rq_arg.page_len = 0;
-	} else {
-		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
-	}
+	} else
+		rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
 
 	rqstp->rq_xprt_ctxt   = NULL;
 	rqstp->rq_prot	      = IPPROTO_TCP;
@@ -1117,7 +1113,7 @@ out:
 	if (serv->sv_stats)
 		serv->sv_stats->nettcpcnt++;
 
-	return len;
+	return rqstp->rq_arg.len;
 
 err_again:
 	if (len == -EAGAIN) {
-- 
1.7.1


* [PATCH 3/7] svcrpc: note network-order types in svc_process_calldir
From: J. Bruce Fields @ 2011-04-06 23:06 UTC
  To: linux-nfs; +Cc: J. Bruce Fields

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
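For reference, these values stay in network byte order from the wire onwards;
the __be32 annotation lets sparse (with endian checking enabled) flag any
accidental host-order use, e.g.:

        __be32 xid = *p++;              /* raw network-order value          */
        u32 host_xid = ntohl(xid);      /* explicit conversion where needed */
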
 net/sunrpc/svcsock.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 828fff7..2053259 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -976,9 +976,9 @@ static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp,
 			       struct rpc_rqst **reqpp, struct kvec *vec)
 {
 	struct rpc_rqst *req = NULL;
-	u32 *p;
-	u32 xid;
-	u32 calldir;
+	__be32 *p;
+	__be32 xid;
+	__be32 calldir;
 	int len;
 
 	len = svc_recvfrom(rqstp, vec, 1, 8);
-- 
1.7.1


* [PATCH 4/7] svcrpc: close connection if client sends short packet
From: J. Bruce Fields @ 2011-04-06 23:06 UTC
  To: linux-nfs; +Cc: J. Bruce Fields

If the client sends a record too short to contain even the beginning of
the rpc header, then just close the connection.

The current code drops the record data and continues, but I don't see the
point: it's a hopeless situation, and it's simpler just to cut off the
connection completely.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
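For reference, 8 bytes is the smallest record that can carry anything
parsable at all (a 4-byte xid plus a 4-byte message direction), and
err_delete just marks the transport for closing; roughly:

        if (svsk->sk_reclen < 8)        /* can't even hold xid + direction */
                goto err_delete;
        ...
 err_delete:
        set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);   /* svc_recv() closes it */
        return -EAGAIN;
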
 net/sunrpc/svcsock.c |   21 +++++++--------------
 1 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 2053259..9d72a79 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -949,6 +949,9 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 		}
 	}
 
+	if (svsk->sk_reclen < 8)
+		goto err_delete; /* client is nuts. */
+
 	/* Check whether enough data is available */
 	len = svc_recv_available(svsk);
 	if (len < 0)
@@ -1052,20 +1055,10 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	vec[0] = rqstp->rq_arg.head[0];
 	vlen = PAGE_SIZE;
 
-	/*
-	 * We have enough data for the whole tcp record. Let's try and read the
-	 * first 8 bytes to get the xid and the call direction. We can use this
-	 * to figure out if this is a call or a reply to a callback. If
-	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
-	 * In that case, don't bother with the calldir and just read the data.
-	 * It will be rejected in svc_process.
-	 */
-	if (len >= 8) {
-		len = svc_process_calldir(svsk, rqstp, &req, vec);
-		if (len < 0)
-			goto err_again;
-		vlen -= 8;
-	}
+	len = svc_process_calldir(svsk, rqstp, &req, vec);
+	if (len < 0)
+		goto err_again;
+	vlen -= 8;
 
 	pnum = 1;
 	while (vlen < svsk->sk_reclen - 8) {
-- 
1.7.1


* [PATCH 5/7] svcrpc: copy cb reply instead of pages
From: J. Bruce Fields @ 2011-04-06 23:06 UTC
  To: linux-nfs; +Cc: Trond Myklebust, J. Bruce Fields

From: Trond Myklebust <Trond.Myklebust@netapp.com>

It's much simpler just to copy the cb reply data than to play tricks
with pages.  Callback replies will typically be very small (at least
until we implement cb_getattr, in which case files with very long ACLs
could pose a problem), so there's no loss in efficiency.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
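For reference, the backchannel calls used below are existing SUNRPC
interfaces; the reply handling roughly reduces to looking up the waiting
request by xid and copying the received head into its preallocated buffer:

        req = xprt_lookup_rqst(bc_xprt, xid);   /* rpc_rqst queued by the 4.1 client code */
        if (req) {
                /* head only: callback replies are small enough to fit there */
                memcpy(req->rq_private_buf.head[0].iov_base,
                       rqstp->rq_arg.head[0].iov_base,
                       rqstp->rq_arg.head[0].iov_len);
                xprt_complete_rqst(req->rq_task, svsk->sk_reclen);  /* wake the waiter */
        }
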
 net/sunrpc/svcsock.c |  122 +++++++++++++++++++++++---------------------------
 1 files changed, 56 insertions(+), 66 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 9d72a79..f248bd4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -975,57 +975,58 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	return -EAGAIN;
 }
 
-static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp,
-			       struct rpc_rqst **reqpp, struct kvec *vec)
+static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
 {
+	struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
 	struct rpc_rqst *req = NULL;
-	__be32 *p;
+	struct kvec *src, *dst;
+	__be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
 	__be32 xid;
 	__be32 calldir;
-	int len;
-
-	len = svc_recvfrom(rqstp, vec, 1, 8);
-	if (len < 0)
-		goto error;
 
-	p = (u32 *)rqstp->rq_arg.head[0].iov_base;
 	xid = *p++;
 	calldir = *p;
 
-	if (calldir == 0) {
-		/* REQUEST is the most common case */
-		vec[0] = rqstp->rq_arg.head[0];
-	} else {
-		/* REPLY */
-		struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
-
-		if (bc_xprt)
-			req = xprt_lookup_rqst(bc_xprt, xid);
-
-		if (!req) {
-			printk(KERN_NOTICE
-				"%s: Got unrecognized reply: "
-				"calldir 0x%x xpt_bc_xprt %p xid %08x\n",
-				__func__, ntohl(calldir),
-				bc_xprt, xid);
-			vec[0] = rqstp->rq_arg.head[0];
-			goto out;
-		}
+	if (bc_xprt)
+		req = xprt_lookup_rqst(bc_xprt, xid);
 
-		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
-		       sizeof(struct xdr_buf));
-		/* copy the xid and call direction */
-		memcpy(req->rq_private_buf.head[0].iov_base,
-		       rqstp->rq_arg.head[0].iov_base, 8);
-		vec[0] = req->rq_private_buf.head[0];
+	if (!req) {
+		printk(KERN_NOTICE
+			"%s: Got unrecognized reply: "
+			"calldir 0x%x xpt_bc_xprt %p xid %08x\n",
+			__func__, ntohl(calldir),
+			bc_xprt, xid);
+		return -EAGAIN;
 	}
- out:
-	vec[0].iov_base += 8;
-	vec[0].iov_len -= 8;
-	len = svsk->sk_reclen - 8;
- error:
-	*reqpp = req;
-	return len;
+
+	memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+	/*
+	 * XXX!: cheating for now!  Only copying HEAD.
+	 * But we know this is good enough for now (in fact, for any
+	 * callback reply in the forseeable future).
+	 */
+	dst = &req->rq_private_buf.head[0];
+	src = &rqstp->rq_arg.head[0];
+	if (dst->iov_len < src->iov_len)
+		return -EAGAIN; /* whatever; just giving up. */
+	memcpy(dst->iov_base, src->iov_base, src->iov_len);
+	xprt_complete_rqst(req->rq_task, svsk->sk_reclen);
+	rqstp->rq_arg.len = 0;
+	return 0;
+}
+
+static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
+{
+	int i = 0;
+	int t = 0;
+
+	while (t < len) {
+		vec[i].iov_base = page_address(pages[i]);
+		vec[i].iov_len = PAGE_SIZE;
+		i++;
+		t += PAGE_SIZE;
+	}
+	return i;
 }
 
 /*
@@ -1038,8 +1039,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
 	int		len;
 	struct kvec *vec;
-	struct rpc_rqst *req = NULL;
-	unsigned int vlen;
+	__be32 *p;
+	__be32 calldir;
 	int pnum;
 
 	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
@@ -1052,35 +1053,17 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 		goto error;
 
 	vec = rqstp->rq_vec;
-	vec[0] = rqstp->rq_arg.head[0];
-	vlen = PAGE_SIZE;
 
-	len = svc_process_calldir(svsk, rqstp, &req, vec);
-	if (len < 0)
-		goto err_again;
-	vlen -= 8;
-
-	pnum = 1;
-	while (vlen < svsk->sk_reclen - 8) {
-		vec[pnum].iov_base = (req) ?
-			page_address(req->rq_private_buf.pages[pnum - 1]) :
-			page_address(rqstp->rq_pages[pnum]);
-		vec[pnum].iov_len = PAGE_SIZE;
-		pnum++;
-		vlen += PAGE_SIZE;
-	}
+	pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
+						svsk->sk_reclen);
+
 	rqstp->rq_respages = &rqstp->rq_pages[pnum];
 
 	/* Now receive data */
-	len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen - 8);
+	len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen);
 	if (len < 0)
 		goto err_again;
 
-	if (req) {
-		xprt_complete_rqst(req->rq_task, svsk->sk_reclen);
-		rqstp->rq_arg.len = 0;
-		goto out;
-	}
 	dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen);
 	rqstp->rq_arg.len = svsk->sk_reclen;
 	rqstp->rq_arg.page_base = 0;
@@ -1093,7 +1076,14 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_xprt_ctxt   = NULL;
 	rqstp->rq_prot	      = IPPROTO_TCP;
 
-out:
+	p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
+	calldir = p[1];
+	if (calldir) {
+		len = receive_cb_reply(svsk, rqstp);
+		if (len < 0)
+			goto err_again;
+	}
+
 	/* Reset TCP read info */
 	svsk->sk_reclen = 0;
 	svsk->sk_tcplen = 0;
-- 
1.7.1


* [PATCH 6/7] SUNRPC: Don't wait for full record to receive tcp data
From: J. Bruce Fields @ 2011-04-06 23:06 UTC
  To: linux-nfs; +Cc: J. Bruce Fields, Trond Myklebust

Ensure that we immediately read and buffer data from the incoming TCP
stream so that we grow the receive window quickly, and don't deadlock on
large READ or WRITE requests.

Also do some minor exit cleanup.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
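A rough sketch of the flow this introduces (the authoritative version is in
the diff below): sk_tcplen tracks how much of the current record has arrived,
and partially filled pages are parked in svsk->sk_pages between wakeups.

        /* simplified shape of the new svc_tcp_recvfrom() receive path */
        base = svc_tcp_restore_pages(svsk, rqstp);  /* bytes already received */
        want = svsk->sk_reclen - base;              /* bytes still missing    */
        len  = svc_partial_recvfrom(rqstp, vec, pnum, want, base);
        if (len != want) {                          /* short read             */
                svc_tcp_save_pages(svsk, rqstp);    /* park pages, try later  */
                return -EAGAIN;
        }
        /* complete record: hand rq_arg to the dispatcher as before */
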
 include/linux/sunrpc/svcsock.h |    1 +
 net/sunrpc/svcsock.c           |  144 +++++++++++++++++++++++++++++++---------
 2 files changed, 113 insertions(+), 32 deletions(-)

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 04dba23..85c50b4 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -28,6 +28,7 @@ struct svc_sock {
 	/* private TCP part */
 	u32			sk_reclen;	/* length of record */
 	u32			sk_tcplen;	/* current read length */
+	struct page *		sk_pages[RPCSVC_MAXPAGES];	/* received data */
 };
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f248bd4..fdb8ea5 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -387,6 +387,33 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
 	return len;
 }
 
+static int svc_partial_recvfrom(struct svc_rqst *rqstp,
+				struct kvec *iov, int nr,
+				int buflen, unsigned int base)
+{
+	size_t save_iovlen;
+	void __user *save_iovbase;
+	unsigned int i;
+	int ret;
+
+	if (base == 0)
+		return svc_recvfrom(rqstp, iov, nr, buflen);
+
+	for (i = 0; i < nr; i++) {
+		if (iov[i].iov_len > base)
+			break;
+		base -= iov[i].iov_len;
+	}
+	save_iovlen = iov[i].iov_len;
+	save_iovbase = iov[i].iov_base;
+	iov[i].iov_len -= base;
+	iov[i].iov_base += base;
+	ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen);
+	iov[i].iov_len = save_iovlen;
+	iov[i].iov_base = save_iovbase;
+	return ret;
+}
+
 /*
  * Set socket snd and rcv buffer lengths
  */
@@ -878,6 +905,56 @@ failed:
 	return NULL;
 }
 
+static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+	unsigned int i, len, npages;
+
+	if (svsk->sk_tcplen <= sizeof(rpc_fraghdr))
+		return 0;
+	len = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		if (rqstp->rq_pages[i] != NULL)
+			put_page(rqstp->rq_pages[i]);
+		BUG_ON(svsk->sk_pages[i] == NULL);
+		rqstp->rq_pages[i] = svsk->sk_pages[i];
+		svsk->sk_pages[i] = NULL;
+	}
+	rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]);
+	return len;
+}
+
+static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+	unsigned int i, len, npages;
+
+	if (svsk->sk_tcplen <= sizeof(rpc_fraghdr))
+		return;
+	len = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		svsk->sk_pages[i] = rqstp->rq_pages[i];
+		rqstp->rq_pages[i] = NULL;
+	}
+}
+
+static void svc_tcp_clear_pages(struct svc_sock *svsk)
+{
+	unsigned int i, len, npages;
+
+	if (svsk->sk_tcplen <= sizeof(rpc_fraghdr))
+		goto out;
+	len = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		BUG_ON(svsk->sk_pages[i] == NULL);
+		put_page(svsk->sk_pages[i]);
+		svsk->sk_pages[i] = NULL;
+	}
+out:
+	svsk->sk_tcplen = 0;
+}
+
 /*
  * Receive data.
  * If we haven't gotten the record length yet, get the next four bytes.
@@ -922,7 +999,7 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 		if (len < want) {
 			dprintk("svc: short recvfrom while reading record "
 				"length (%d of %d)\n", len, want);
-			goto err_again; /* record header not complete */
+			return -EAGAIN;
 		}
 
 		svsk->sk_reclen = ntohl(svsk->sk_reclen);
@@ -952,26 +1029,14 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	if (svsk->sk_reclen < 8)
 		goto err_delete; /* client is nuts. */
 
-	/* Check whether enough data is available */
-	len = svc_recv_available(svsk);
-	if (len < 0)
-		goto error;
-
-	if (len < svsk->sk_reclen) {
-		dprintk("svc: incomplete TCP record (%d of %d)\n",
-			len, svsk->sk_reclen);
-		goto err_again;	/* record not complete */
-	}
 	len = svsk->sk_reclen;
 
 	return len;
- error:
-	if (len == -EAGAIN)
-		dprintk("RPC: TCP recv_record got EAGAIN\n");
+error:
+	dprintk("RPC: TCP recv_record got %d\n", len);
 	return len;
- err_delete:
+err_delete:
 	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
- err_again:
 	return -EAGAIN;
 }
 
@@ -1029,6 +1094,7 @@ static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
 	return i;
 }
 
+
 /*
  * Receive data from a TCP socket.
  */
@@ -1039,6 +1105,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
 	int		len;
 	struct kvec *vec;
+	unsigned int want, base;
 	__be32 *p;
 	__be32 calldir;
 	int pnum;
@@ -1052,6 +1119,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (len < 0)
 		goto error;
 
+	base = svc_tcp_restore_pages(svsk, rqstp);
+	want = svsk->sk_reclen - base;
+
 	vec = rqstp->rq_vec;
 
 	pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
@@ -1060,11 +1130,18 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_respages = &rqstp->rq_pages[pnum];
 
 	/* Now receive data */
-	len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen);
-	if (len < 0)
-		goto err_again;
+	len = svc_partial_recvfrom(rqstp, vec, pnum, want, base);
+	if (len >= 0)
+		svsk->sk_tcplen += len;
+	if (len != want) {
+		if (len < 0 && len != -EAGAIN)
+			goto err_other;
+		svc_tcp_save_pages(svsk, rqstp);
+		dprintk("svc: incomplete TCP record (%d of %d)\n",
+			svsk->sk_tcplen, svsk->sk_reclen);
+		goto err_noclose;
+	}
 
-	dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen);
 	rqstp->rq_arg.len = svsk->sk_reclen;
 	rqstp->rq_arg.page_base = 0;
 	if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
@@ -1081,7 +1158,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (calldir) {
 		len = receive_cb_reply(svsk, rqstp);
 		if (len < 0)
-			goto err_again;
+			goto error;
 	}
 
 	/* Reset TCP read info */
@@ -1096,20 +1173,20 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (serv->sv_stats)
 		serv->sv_stats->nettcpcnt++;
 
+	dprintk("svc: TCP complete record (%d bytes)\n", rqstp->rq_arg.len);
 	return rqstp->rq_arg.len;
 
-err_again:
-	if (len == -EAGAIN) {
-		dprintk("RPC: TCP recvfrom got EAGAIN\n");
-		return len;
-	}
 error:
-	if (len != -EAGAIN) {
-		printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
-		       svsk->sk_xprt.xpt_server->sv_name, -len);
-		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
-	}
+	if (len != -EAGAIN)
+		goto err_other;
+	dprintk("RPC: TCP recvfrom got EAGAIN\n");
 	return -EAGAIN;
+err_other:
+	printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
+	       svsk->sk_xprt.xpt_server->sv_name, -len);
+	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+err_noclose:
+	return -EAGAIN;	/* record not complete */
 }
 
 /*
@@ -1280,6 +1357,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 
 		svsk->sk_reclen = 0;
 		svsk->sk_tcplen = 0;
+		memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
 
 		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
@@ -1536,8 +1614,10 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 
 	svc_sock_detach(xprt);
 
-	if (!test_bit(XPT_LISTENER, &xprt->xpt_flags))
+	if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
+		svc_tcp_clear_pages(svsk);
 		kernel_sock_shutdown(svsk->sk_sock, SHUT_RDWR);
+	}
 }
 
 /*
-- 
1.7.1


* [PATCH 7/7] svcrpc: take advantage of tcp autotuning
From: J. Bruce Fields @ 2011-04-06 23:06 UTC
  To: linux-nfs; +Cc: Olga Kornievskaia, Jim Rees, J. Bruce Fields

From: Olga Kornievskaia <aglo@citi.umich.edu>

Allow the NFSv4 server to make use of TCP autotuning behaviour, which
was previously disabled by setting the sk_userlocks variable.

Set the receive buffers to be big enough to receive the whole RPC
request, and set this on the listening socket, not the accepted socket.

Remove the code that readjusts the receive/send buffer sizes for the
accepted socket. Previously this code was used to influence the TCP
window management behaviour, which is no longer needed when autotuning
is enabled.

This can improve IO bandwidth on networks with high bandwidth-delay
products, where a large tcp window is required.  It also simplifies
performance tuning, since getting adequate tcp buffers previously
required increasing the number of nfsd threads.

Signed-off-by: Olga Kornievskaia <aglo@citi.umich.edu>
Cc: Jim Rees <rees@umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
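For reference, with sk_userlocks left clear the kernel keeps adjusting
sk_rcvbuf on its own, so the only fixed sizing that remains is the one
applied to the listening socket below (accepted sockets start from the
listener's values and are then autotuned):

        /* listener only; big enough for one worst-case request up front */
        svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
                                           4 * serv->sv_max_mesg);
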
 net/sunrpc/svcsock.c |   35 +++++++----------------------------
 1 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index fdb8ea5..8485698 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -436,7 +436,6 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
 	lock_sock(sock->sk);
 	sock->sk->sk_sndbuf = snd * 2;
 	sock->sk->sk_rcvbuf = rcv * 2;
-	sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
 	sock->sk->sk_write_space(sock->sk);
 	release_sock(sock->sk);
 #endif
@@ -967,23 +966,6 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	unsigned int want;
 	int len;
 
-	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
-		/* sndbuf needs to have room for one request
-		 * per thread, otherwise we can stall even when the
-		 * network isn't a bottleneck.
-		 *
-		 * We count all threads rather than threads in a
-		 * particular pool, which provides an upper bound
-		 * on the number of threads which will access the socket.
-		 *
-		 * rcvbuf just needs to be able to hold a few requests.
-		 * Normally they will be removed from the queue
-		 * as soon a a complete request arrives.
-		 */
-		svc_sock_setbufsize(svsk->sk_sock,
-				    (serv->sv_nrthreads+3) * serv->sv_max_mesg,
-				    3 * serv->sv_max_mesg);
-
 	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
 	if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
@@ -1361,15 +1343,6 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 
 		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
-		/* initialise setting must have enough space to
-		 * receive and respond to one request.
-		 * svc_tcp_recvfrom will re-adjust if necessary
-		 */
-		svc_sock_setbufsize(svsk->sk_sock,
-				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
-				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
-
-		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		if (sk->sk_state != TCP_ESTABLISHED)
 			set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
@@ -1433,8 +1406,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	/* Initialize the socket */
 	if (sock->type == SOCK_DGRAM)
 		svc_udp_init(svsk, serv);
-	else
+	else {
+		/* initialise setting must have enough space to
+		 * receive and respond to one request.
+		 */
+		svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
+					4 * serv->sv_max_mesg);
 		svc_tcp_init(svsk, serv);
+	}
 
 	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
 				svsk, svsk->sk_sk);
-- 
1.7.1

