* [RFC 0/10] nfsd41 server backchannel for 2.6.31
@ 2009-04-30 23:00 Benny Halevy
  2009-04-30 23:05 ` [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines Benny Halevy
                   ` (11 more replies)
  0 siblings, 12 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:00 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: pNFS Mailing List, NFS list, Ricardo Labiaga

Bruce,

After squashing and merging Ricardo's latest patchset,
please review the following patchset and consider it for 2.6.31.

Thanks,

Benny

[RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines
[RFC 02/10] nfsd: minorversion support for the back channel
[RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
[RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks
[RFC 05/10] nfsd41: callback infrastructure
[RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC arguments
[RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue
[RFC 08/10] nfsd41: Backchannel: Setup sequence information
[RFC 09/10] nfsd41: cb_sequence callback
[RFC 10/10] nfsd41: cb_recall callback


* [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
@ 2009-04-30 23:05 ` Benny Halevy
  2009-04-30 23:05 ` [RFC 02/10] nfsd: minorversion support for the back channel Benny Halevy
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:05 UTC (permalink / raw)
  To:  J. Bruce Fields
  Cc: Ricardo Labiaga, pnfs, linux-nfs, Andy Adamson, Benny Halevy

From: Andy Adamson <andros@netapp.com>

Mimic the client and prepare to share the back-channel XDR code with NFSv4.1.
Each encode routine now bumps the operation count, and the final count is
backfilled into the compound header once encoding is complete.
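
For illustration, the pattern boils down to something like the following
standalone sketch (plain C outside the kernel XDR macros; the helper names
are invented for the example, the nops/nops_p fields mirror the patch):

#include <arpa/inet.h>	/* htonl */
#include <stdint.h>

struct cb_hdr {
	uint32_t	nops;		/* running count of encoded operations */
	uint32_t	*nops_p;	/* where the count sits on the wire */
};

static void encode_hdr(uint32_t **p, struct cb_hdr *hdr)
{
	hdr->nops_p = *p;		/* remember the slot ... */
	*(*p)++ = htonl(0);		/* ... and write a placeholder */
}

static void encode_one_op(uint32_t **p, struct cb_hdr *hdr, uint32_t op)
{
	*(*p)++ = htonl(op);		/* encode the operation itself */
	hdr->nops++;			/* each encoder bumps the count */
}

static void encode_nops(struct cb_hdr *hdr)
{
	*hdr->nops_p = htonl(hdr->nops);	/* backfill the real count */
}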

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4callback.c |   24 ++++++++++++++++--------
 1 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 290289b..7129b0c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -140,8 +140,9 @@ struct nfs4_cb_compound_hdr {
 	int		status;
 	u32		ident;
 	u32		nops;
+	__be32		*nops_p;
 	u32		taglen;
-	char *		tag;
+	char		*tag;
 };
 
 static struct {
@@ -201,7 +202,7 @@ nfs_cb_stat_to_errno(int stat)
  * XDR encode
  */
 
-static int
+static void
 encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 {
 	__be32 * p;
@@ -210,12 +211,18 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 	WRITE32(0);            /* tag length is always 0 */
 	WRITE32(NFS4_MINOR_VERSION);
 	WRITE32(hdr->ident);
+	hdr->nops_p = p;
 	WRITE32(hdr->nops);
-	return 0;
 }
 
-static int
-encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
+static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
+{
+	*hdr->nops_p = htonl(hdr->nops);
+}
+
+static void
+encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec,
+		 struct nfs4_cb_compound_hdr *hdr)
 {
 	__be32 *p;
 	int len = cb_rec->cbr_fh.fh_size;
@@ -227,7 +234,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 	WRITE32(cb_rec->cbr_trunc);
 	WRITE32(len);
 	WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
-	return 0;
+	hdr->nops++;
 }
 
 static int
@@ -246,12 +253,13 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *a
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = args->cbr_ident,
-		.nops   = 1,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_cb_compound_hdr(&xdr, &hdr);
-	return (encode_cb_recall(&xdr, args));
+	encode_cb_recall(&xdr, args, &hdr);
+	encode_cb_nops(&hdr);
+	return 0;
 }
 
 
-- 
1.6.2.1



* [RFC 02/10] nfsd: minorversion support for the back channel
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
  2009-04-30 23:05 ` [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines Benny Halevy
@ 2009-04-30 23:05 ` Benny Halevy
  2009-04-30 23:05 ` [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Benny Halevy
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:05 UTC (permalink / raw)
  To:  J. Bruce Fields
  Cc: Ricardo Labiaga, pnfs, linux-nfs, Andy Adamson, Benny Halevy

From: Andy Adamson <andros@netapp.com>

Prepare to share backchannel code with NFSv4.1.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: use nfsd4_cb_sequence for callback minorversion]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4callback.c     |    3 ++-
 fs/nfsd/nfs4state.c        |    1 +
 include/linux/nfsd/state.h |    3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7129b0c..5823b9a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -141,6 +141,7 @@ struct nfs4_cb_compound_hdr {
 	u32		ident;
 	u32		nops;
 	__be32		*nops_p;
+	u32		minorversion;
 	u32		taglen;
 	char		*tag;
 };
@@ -209,7 +210,7 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 
 	RESERVE_SPACE(16);
 	WRITE32(0);            /* tag length is always 0 */
-	WRITE32(NFS4_MINOR_VERSION);
+	WRITE32(hdr->minorversion);
 	WRITE32(hdr->ident);
 	hdr->nops_p = p;
 	WRITE32(hdr->nops);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 60ae426..cc9705b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -978,6 +978,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 	if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
 	                 &cb->cb_addr, &cb->cb_port)))
 		goto out_err;
+	cb->cb_minorversion = 0;
 	cb->cb_prog = se->se_callback_prog;
 	cb->cb_ident = se->se_callback_ident;
 	return;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 26a006a..8762843 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -93,7 +93,8 @@ struct nfs4_callback {
 	u32                     cb_addr;
 	unsigned short          cb_port;
 	u32                     cb_prog;
-	u32                     cb_ident;
+	u32			cb_minorversion;
+	u32                     cb_ident;	/* minorversion 0 only */
 	/* RPC client info */
 	atomic_t		cb_set;     /* successful CB_NULL call */
 	struct rpc_clnt *       cb_client;
-- 
1.6.2.1



* [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
  2009-04-30 23:05 ` [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines Benny Halevy
  2009-04-30 23:05 ` [RFC 02/10] nfsd: minorversion support for the back channel Benny Halevy
@ 2009-04-30 23:05 ` Benny Halevy
  2009-05-01  0:05   ` [pnfs] " Trond Myklebust
  2009-05-03 20:36   ` J. Bruce Fields
  2009-04-30 23:06 ` [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks Benny Halevy
                   ` (8 subsequent siblings)
  11 siblings, 2 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:05 UTC (permalink / raw)
  To:  J. Bruce Fields
  Cc: Ricardo Labiaga, pnfs, linux-nfs, Rahul Iyer, Mike Sager,
	Marc Eshel, Benny Halevy, Andy Adamson

From: Rahul Iyer <iyer@netapp.com>

FIXME: bhalevy: write up commit message

Signed-off-by: Rahul Iyer <iyer@netapp.com>
Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>

When the call direction is a reply, copy the xid and call direction into the
req->rq_private_buf.head[0].iov_base; otherwise rpc_verify_header returns
rpc_garbage.
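
As a rough standalone sketch of the check this adds on the server's TCP
receive path (the real code is in svc_tcp_recvfrom(); the helper below is
hypothetical): the first two 32-bit words of an RPC record are the xid and
the call direction, and a non-zero direction marks the record as a REPLY
belonging to the backchannel rather than a new CALL.

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>

/* rec points at the first 8 bytes of the record, still in wire order */
static bool is_callback_reply(const uint32_t *rec, uint32_t *xid)
{
	*xid = rec[0];			/* xid, kept in wire order */
	return ntohl(rec[1]) == 1;	/* msg_type: 0 = CALL, 1 = REPLY */
}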

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[get rid of CONFIG_NFSD_V4_1]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/sunrpc/clnt.h    |    1 +
 include/linux/sunrpc/svcsock.h |    1 +
 include/linux/sunrpc/xprt.h    |    2 +
 net/sunrpc/clnt.c              |    1 +
 net/sunrpc/svcsock.c           |   68 ++++++++++-
 net/sunrpc/xprt.c              |   41 ++++++-
 net/sunrpc/xprtsock.c          |  278 +++++++++++++++++++++++++++++++++++++++-
 7 files changed, 381 insertions(+), 11 deletions(-)

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index c39a210..cf9a8ec 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -110,6 +110,7 @@ struct rpc_create_args {
 	rpc_authflavor_t	authflavor;
 	unsigned long		flags;
 	char			*client_name;
+	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
 };
 
 /* Values for "flags" field */
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 8271631..19228f4 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -28,6 +28,7 @@ struct svc_sock {
 	/* private TCP part */
 	u32			sk_reclen;	/* length of record */
 	u32			sk_tcplen;	/* current read length */
+	struct rpc_xprt	       *sk_bc_xprt;	/* NFSv4.1 backchannel xprt */
 };
 
 /*
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 1758d9f..063a6a7 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -174,6 +174,7 @@ struct rpc_xprt {
 	spinlock_t		reserve_lock;	/* lock slot table */
 	u32			xid;		/* Next XID value to use */
 	struct rpc_task *	snd_task;	/* Task blocked in send */
+	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
 	struct list_head	recv;
 
 	struct {
@@ -197,6 +198,7 @@ struct xprt_create {
 	struct sockaddr *	srcaddr;	/* optional local address */
 	struct sockaddr *	dstaddr;	/* remote peer address */
 	size_t			addrlen;
+	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
 };
 
 struct xprt_class {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 5abab09..3dc847f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -266,6 +266,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 		.srcaddr = args->saddress,
 		.dstaddr = args->address,
 		.addrlen = args->addrsize,
+		.bc_sock = args->bc_sock,
 	};
 	char servername[48];
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 4e6d406..619764e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -49,6 +49,7 @@
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/xprt.h>
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
@@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	int		len;
 	struct kvec *vec;
 	int pnum, vlen;
+	struct rpc_rqst *req = NULL;
 
 	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
 		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
@@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	len = svsk->sk_reclen;
 	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
+	/*
+	 * We have enough data for the whole tcp record. Let's try and read the
+	 * first 8 bytes to get the xid and the call direction. We can use this
+	 * to figure out if this is a call or a reply to a callback. If
+	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
+	 * In that case, don't bother with the calldir and just read the data.
+	 * It will be rejected in svc_process.
+	 */
+
 	vec = rqstp->rq_vec;
 	vec[0] = rqstp->rq_arg.head[0];
 	vlen = PAGE_SIZE;
+
+	if (len >= 8) {
+		u32 *p;
+		u32 xid;
+		u32 calldir;
+
+		len = svc_recvfrom(rqstp, vec, 1, 8);
+		if (len < 0)
+			goto error;
+
+		p = (u32 *)rqstp->rq_arg.head[0].iov_base;
+		xid = *p++;
+		calldir = *p;
+
+		if (calldir) {
+			/* REPLY */
+			if (svsk->sk_bc_xprt)
+				req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid);
+			if (req) {
+				memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+					sizeof(struct xdr_buf));
+				/* copy the xid and call direction */
+				memcpy(req->rq_private_buf.head[0].iov_base,
+					rqstp->rq_arg.head[0].iov_base, 8);
+				vec[0] = req->rq_private_buf.head[0];
+			} else
+				printk(KERN_NOTICE
+					"%s: Got unrecognized reply: "
+					"calldir 0x%x sk_bc_xprt %p xid %08x\n",
+					__func__, ntohl(calldir),
+					svsk->sk_bc_xprt, xid);
+		}
+
+		if (!calldir || !req)
+			vec[0] = rqstp->rq_arg.head[0];
+
+		vec[0].iov_base += 8;
+		vec[0].iov_len -= 8;
+		len = svsk->sk_reclen - 8;
+		vlen -= 8;
+	}
+
 	pnum = 1;
 	while (vlen < len) {
-		vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]);
+		vec[pnum].iov_base = (req) ?
+			page_address(req->rq_private_buf.pages[pnum - 1]) :
+			page_address(rqstp->rq_pages[pnum]);
 		vec[pnum].iov_len = PAGE_SIZE;
 		pnum++;
 		vlen += PAGE_SIZE;
@@ -908,6 +963,16 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (len < 0)
 		goto error;
 
+	/*
+	 * Account for the 8 bytes we read earlier
+	 */
+	len += 8;
+
+	if (req) {
+		xprt_complete_rqst(req->rq_task, len);
+		len = 0;
+		goto out;
+	}
 	dprintk("svc: TCP complete record (%d bytes)\n", len);
 	rqstp->rq_arg.len = len;
 	rqstp->rq_arg.page_base = 0;
@@ -921,6 +986,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_xprt_ctxt   = NULL;
 	rqstp->rq_prot	      = IPPROTO_TCP;
 
+out:
 	/* Reset TCP read info */
 	svsk->sk_reclen = 0;
 	svsk->sk_tcplen = 0;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a0bfe53..03f175e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1015,6 +1015,27 @@ void xprt_release(struct rpc_task *task)
 	spin_unlock(&xprt->reserve_lock);
 }
 
+/*
+ * The autoclose function for the back channel
+ *
+ * The callback channel should never close the channel,
+ * let the forechannel do that.
+ */
+static void bc_autoclose(struct work_struct *work)
+{
+	return;
+}
+
+
+/*
+ * The autodisconnect routine for the back channel. We never disconnect
+ */
+static void
+bc_init_autodisconnect(unsigned long data)
+{
+	return;
+}
+
 /**
  * xprt_create_transport - create an RPC transport
  * @args: rpc transport creation arguments
@@ -1051,9 +1072,16 @@ found:
 
 	INIT_LIST_HEAD(&xprt->free);
 	INIT_LIST_HEAD(&xprt->recv);
-	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
-	setup_timer(&xprt->timer, xprt_init_autodisconnect,
-			(unsigned long)xprt);
+	if (args->bc_sock) {
+		INIT_WORK(&xprt->task_cleanup, bc_autoclose);
+		setup_timer(&xprt->timer, bc_init_autodisconnect,
+			    (unsigned long)xprt);
+	} else {
+		INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
+		setup_timer(&xprt->timer, xprt_init_autodisconnect,
+			    (unsigned long)xprt);
+	}
+
 	xprt->last_used = jiffies;
 	xprt->cwnd = RPC_INITCWND;
 	xprt->bind_index = 0;
@@ -1073,6 +1101,13 @@ found:
 	dprintk("RPC:       created transport %p with %u slots\n", xprt,
 			xprt->max_reqs);
 
+	/*
+	 * Since we don't want connections for the backchannel, we set
+	 * the xprt status to connected
+	 */
+	if (args->bc_sock)
+		xprt_set_connected(xprt);
+
 	return xprt;
 }
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index d40ff50..067d205 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -32,6 +32,7 @@
 #include <linux/tcp.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/xprtsock.h>
 #include <linux/file.h>
 
@@ -1966,6 +1967,219 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 			xprt->stat.bklog_u);
 }
 
+/*
+ * The connect worker for the backchannel
+ * This should never be called as we should never need to connect
+ */
+static void bc_connect_worker(struct work_struct *work)
+{
+	BUG();
+}
+
+/*
+ * The set_port routine of the rpc_xprt_ops. This is related to the portmapper
+ * and should never be called
+ */
+
+static void bc_set_port(struct rpc_xprt *xprt, unsigned short port)
+{
+	BUG();
+}
+
+/*
+ * The connect routine for the backchannel rpc_xprt ops
+ * Again, should never be called!
+ */
+
+static void bc_connect(struct rpc_task *task)
+{
+	BUG();
+}
+
+struct rpc_buffer {
+	size_t	len;
+	char	data[];
+};
+/*
+ * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason
+ * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
+ * to use the server side send routines.
+ */
+void *bc_malloc(struct rpc_task *task, size_t size)
+{
+	struct page *page;
+	struct rpc_buffer *buf;
+
+	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
+	page = alloc_page(GFP_KERNEL);
+
+	if (!page)
+		return NULL;
+
+	buf = page_address(page);
+	buf->len = PAGE_SIZE;
+
+	return buf->data;
+}
+
+/*
+ * Free the space allocated in the bc_alloc routine
+ */
+void bc_free(void *buffer)
+{
+	struct rpc_buffer *buf;
+
+	if (!buffer)
+		return;
+
+	buf = container_of(buffer, struct rpc_buffer, data);
+	free_pages((unsigned long)buf, get_order(buf->len));
+}
+
+/*
+ * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
+ * held. Borrows heavily from svc_tcp_sendto and xs_tcp_semd_request.
+ */
+static int bc_sendto(struct rpc_rqst *req)
+{
+	int total_len;
+	int len;
+	int size;
+	int result;
+	struct xdr_buf *xbufp = &req->rq_snd_buf;
+	struct page **pages = xbufp->pages;
+	unsigned int flags = MSG_MORE;
+	unsigned int pglen = xbufp->page_len;
+	size_t base = xbufp->page_base;
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct sock_xprt *transport =
+				container_of(xprt, struct sock_xprt, xprt);
+	struct socket *sock = transport->sock;
+
+	total_len = xbufp->len;
+
+	/*
+	 * Set up the rpc header and record marker stuff
+	 */
+	xs_encode_tcp_record_marker(xbufp);
+
+	/*
+	 * The RPC message is divided into 3 pieces:
+	 * - The header: This is what most of the smaller RPC messages consist
+	 *   of. Often the whole message is in this.
+	 *
+	 *   - xdr->pages: This is a list of pages that contain data, for
+	 *   example in a write request or while using rpcsec gss
+	 *
+	 *   - The tail: This is the rest of the rpc message
+	 *
+	 *  First we send the header, then the pages and then finally the tail.
+	 *  The code borrows heavily from svc_sendto.
+	 */
+
+	/*
+	 * Send the head
+	 */
+	if (total_len == xbufp->head[0].iov_len)
+		flags = 0;
+
+	len = sock->ops->sendpage(sock, virt_to_page(xbufp->head[0].iov_base),
+			(unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK,
+			xbufp->head[0].iov_len, flags);
+
+	if (len != xbufp->head[0].iov_len)
+		goto out;
+
+	/*
+	 * send page data
+	 *
+	 * Check the amount of data to be sent. If it is less than the
+	 * remaining page, then send it else send the current page
+	 */
+
+	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
+	while (pglen > 0) {
+		if (total_len == size)
+			flags = 0;
+		result = sock->ops->sendpage(sock, *pages, base, size, flags);
+		if (result > 0)
+			len += result;
+		if (result != size)
+			goto out;
+		total_len -= size;
+		pglen -= size;
+		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
+		base = 0;
+		pages++;
+	}
+	/*
+	 * send tail
+	 */
+	if (xbufp->tail[0].iov_len) {
+		result = sock->ops->sendpage(sock,
+			xbufp->tail[0].iov_base,
+			(unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK,
+			xbufp->tail[0].iov_len,
+			0);
+
+		if (result > 0)
+			len += result;
+	}
+out:
+	if (len != xbufp->len)
+		printk(KERN_NOTICE "Error sending entire callback!\n");
+
+	return len;
+}
+
+/*
+ * The send routine. Borrows from svc_send
+ */
+static int bc_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *bc_xprt = req->rq_xprt;
+	struct svc_xprt	*xprt;
+	struct svc_sock         *svsk;
+	u32                     len;
+
+	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
+	/*
+	 * Get the server socket associated with this callback xprt
+	 */
+	svsk = bc_xprt->bc_sock;
+	xprt = &svsk->sk_xprt;
+
+	mutex_lock(&xprt->xpt_mutex);
+	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
+		len = -ENOTCONN;
+	else
+		len = bc_sendto(req);
+	mutex_unlock(&xprt->xpt_mutex);
+
+	return 0;
+
+}
+
+/*
+ * The close routine. Since this is client initiated, we do nothing
+ */
+
+static void bc_close(struct rpc_xprt *xprt)
+{
+	return;
+}
+
+/*
+ * The xprt destroy routine. Again, because this connection is client
+ * initiated, we do nothing
+ */
+
+static void bc_destroy(struct rpc_xprt *xprt)
+{
+	return;
+}
+
 static struct rpc_xprt_ops xs_udp_ops = {
 	.set_buffer_size	= xs_udp_set_buffer_size,
 	.reserve_xprt		= xprt_reserve_xprt_cong,
@@ -1999,6 +2213,24 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 };
 
+/*
+ * The rpc_xprt_ops for the server backchannel
+ */
+
+static struct rpc_xprt_ops bc_tcp_ops = {
+	.reserve_xprt		= xprt_reserve_xprt,
+	.release_xprt		= xprt_release_xprt,
+	.set_port		= bc_set_port,
+	.connect		= bc_connect,
+	.buf_alloc		= bc_malloc,
+	.buf_free		= bc_free,
+	.send_request		= bc_send_request,
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
+	.close			= bc_close,
+	.destroy		= bc_destroy,
+	.print_stats		= xs_tcp_print_stats,
+};
+
 static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
 				      unsigned int slot_table_size)
 {
@@ -2131,13 +2363,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
-	xprt->bind_timeout = XS_BIND_TO;
-	xprt->connect_timeout = XS_TCP_CONN_TO;
-	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
-	xprt->idle_timeout = XS_IDLE_DISC_TO;
+	if (args->bc_sock) {
+		/* backchannel */
+		xprt_set_bound(xprt);
+		INIT_DELAYED_WORK(&transport->connect_worker,
+				  bc_connect_worker);
+		xprt->bind_timeout = 0;
+		xprt->connect_timeout = 0;
+		xprt->reestablish_timeout = 0;
+		xprt->idle_timeout = (~0);
 
-	xprt->ops = &xs_tcp_ops;
-	xprt->timeout = &xs_tcp_default_timeout;
+		/*
+		 * The backchannel uses the same socket connection as the
+		 * forechannel
+		 */
+		xprt->bc_sock = args->bc_sock;
+		xprt->bc_sock->sk_bc_xprt = xprt;
+		transport->sock = xprt->bc_sock->sk_sock;
+		transport->inet = xprt->bc_sock->sk_sk;
+
+		xprt->ops = &bc_tcp_ops;
+
+		goto next;
+	}
 
 	switch (addr->sa_family) {
 	case AF_INET:
@@ -2145,13 +2393,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 			xprt_set_bound(xprt);
 
 		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
-		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
 		break;
 	case AF_INET6:
 		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
 			xprt_set_bound(xprt);
 
 		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
+		break;
+	}
+	xprt->bind_timeout = XS_BIND_TO;
+	xprt->connect_timeout = XS_TCP_CONN_TO;
+	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+	xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+	xprt->ops = &xs_tcp_ops;
+
+next:
+	xprt->timeout = &xs_tcp_default_timeout;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
+		break;
+	case AF_INET6:
 		xs_format_ipv6_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
 		break;
 	default:
-- 
1.6.2.1



* [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (2 preceding siblings ...)
  2009-04-30 23:05 ` [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Benny Halevy
@ 2009-04-30 23:06 ` Benny Halevy
  2009-05-03 20:42   ` J. Bruce Fields
  2009-04-30 23:06 ` [RFC 05/10] nfsd41: callback infrastructure Benny Halevy
                   ` (7 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:06 UTC (permalink / raw)
  To:  J. Bruce Fields; +Cc: Ricardo Labiaga, pnfs, linux-nfs, Benny Halevy

From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>

The callbacks will be sent using the same authentication flavor that
was used during session creation.  We'll add code to remember the
principal in the case of RPCSEC_GSS in a separate patch.
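
The flavor recorded here is expected to be handed to the callback client
when it is created; a standalone mock-up of that intent (the kernel types
are richer than this, and the actual hookup is not part of this patch):

typedef unsigned int rpc_authflavor_t;	/* stand-in for the kernel typedef */

struct nfs4_client	{ rpc_authflavor_t cl_flavor; };
struct svc_rqst		{ rpc_authflavor_t rq_flavor; };
struct rpc_create_args	{ rpc_authflavor_t authflavor; };

/* what this patch records ... */
static void remember_cb_flavor(struct nfs4_client *clp,
			       const struct svc_rqst *rqstp)
{
	clp->cl_flavor = rqstp->rq_flavor;
}

/* ... and how a later change is expected to use it when creating the
 * callback RPC client */
static void fill_cb_create_args(struct rpc_create_args *args,
				const struct nfs4_client *clp)
{
	args->authflavor = clp->cl_flavor;
}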

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4state.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cc9705b..ad30039 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1284,6 +1284,7 @@ out_new:
 	copy_verf(new, &verf);
 	copy_cred(&new->cl_cred, &rqstp->rq_cred);
 	new->cl_addr = ip_addr;
+	new->cl_flavor = rqstp->rq_flavor;
 	gen_clid(new);
 	gen_confirm(new);
 	add_to_unconfirmed(new, strhashval);
-- 
1.6.2.1



* [RFC 05/10] nfsd41: callback infrastructure
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (3 preceding siblings ...)
  2009-04-30 23:06 ` [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks Benny Halevy
@ 2009-04-30 23:06 ` Benny Halevy
  2009-05-03 20:49   ` J. Bruce Fields
  2009-04-30 23:06 ` [RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC arguments Benny Halevy
                   ` (6 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:06 UTC (permalink / raw)
  To:  J. Bruce Fields
  Cc: Ricardo Labiaga, pnfs, linux-nfs, Andy Adamson, Benny Halevy

From: Andy Adamson <andros@netapp.com>

Keep the xprt used for create_session in cl_cb_xprt.
Mark cl_callback.cb_minorversion = 1 and remember the
client-provided cl_callback.cb_prog RPC program number.
Use it to probe the callback path.

Define the XDR sizes and encode the nfs4_cb_compound header so that a
null callback RPC can be sent.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[get callback minorversion from fore channel's]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4callback.c     |   33 ++++++++++++++++++++++++++++++---
 fs/nfsd/nfs4state.c        |   10 ++++++++++
 include/linux/nfsd/state.h |    3 +++
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 5823b9a..6f1ca49 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -43,6 +43,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svcsock.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/state.h>
 #include <linux/sunrpc/sched.h>
@@ -52,16 +53,19 @@
 
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
+#define NFS4_STATEID_SIZE 16
 
 /* Index of predefined Linux callback client operations */
 
 enum {
-        NFSPROC4_CLNT_CB_NULL = 0,
+	NFSPROC4_CLNT_CB_NULL = 0,
 	NFSPROC4_CLNT_CB_RECALL,
+	NFSPROC4_CLNT_CB_SEQUENCE,
 };
 
 enum nfs_cb_opnum4 {
 	OP_CB_RECALL            = 4,
+	OP_CB_SEQUENCE          = 11,
 };
 
 #define NFS4_MAXTAGLEN		20
@@ -70,15 +74,22 @@ enum nfs_cb_opnum4 {
 #define NFS4_dec_cb_null_sz		0
 #define cb_compound_enc_hdr_sz		4
 #define cb_compound_dec_hdr_sz		(3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz			(NFS4_MAX_SESSIONID_LEN >> 2)
+#define cb_sequence_enc_sz		(sessionid_sz + 4 +             \
+					1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz		(op_dec_sz + sessionid_sz + 4)
+
 #define op_enc_sz			1
 #define op_dec_sz			2
 #define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
 #define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
 #define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
 					1 + enc_stateid_sz +            \
 					enc_nfs4_fh_sz)
 
 #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
 					op_dec_sz)
 
 /*
@@ -135,13 +146,19 @@ xdr_error:                                      \
 		return -EIO; \
 	} \
 } while (0)
+#define COPYMEM(x, nbytes) do {                \
+	memcpy((x), p, nbytes);                \
+	p += XDR_QUADLEN(nbytes);              \
+} while (0)
 
 struct nfs4_cb_compound_hdr {
-	int		status;
-	u32		ident;
+	/* args */
+	u32		ident;	/* minorversion 0 only */
 	u32		nops;
 	__be32		*nops_p;
 	u32		minorversion;
+	/* res */
+	int		status;
 	u32		taglen;
 	char		*tag;
 };
@@ -402,6 +419,15 @@ static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
 	addr.sin_family = AF_INET;
 	addr.sin_port = htons(cb->cb_port);
 	addr.sin_addr.s_addr = htonl(cb->cb_addr);
+	if (cb->cb_minorversion) {
+		BUG_ON(cb->cb_minorversion != 1);
+		args.bc_sock = container_of(clp->cl_cb_xprt, struct svc_sock,
+					    sk_xprt);
+	}
+
+	dprintk("%s: program %s 0x%x nrvers %u version %u minorversion %u\n",
+		__func__, args.program->name, args.prognumber,
+		args.program->nrvers, args.version, cb->cb_minorversion);
 
 	/* Create RPC client */
 	client = rpc_create(&args);
@@ -441,6 +467,7 @@ static int do_probe_callback(void *data)
 	put_nfs4_client(clp);
 	return 0;
 out_release_client:
+	dprintk("NFSD: synchronous CB_NULL failed. status=%d\n", status);
 	rpc_shutdown_client(client);
 out_err:
 	dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ad30039..61d5c66 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -651,6 +651,8 @@ static inline void
 free_client(struct nfs4_client *clp)
 {
 	shutdown_callback_client(clp);
+	if (clp->cl_cb_xprt)
+		svc_xprt_put(clp->cl_cb_xprt);
 	nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
 			     clp->cl_slot.sl_cache_entry.ce_resused);
 	if (clp->cl_cred.cr_group_info)
@@ -1391,6 +1393,14 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cr_ses->flags &= ~SESSION4_PERSIST;
 		cr_ses->flags &= ~SESSION4_RDMA;
 
+		if (cr_ses->flags & SESSION4_BACK_CHAN) {
+			unconf->cl_cb_xprt = rqstp->rq_xprt;
+			svc_xprt_get(unconf->cl_cb_xprt);
+			unconf->cl_callback.cb_minorversion =
+				cstate->minorversion;
+			unconf->cl_callback.cb_prog = cr_ses->callback_prog;
+			nfsd4_probe_callback(unconf);
+		}
 		conf = unconf;
 	} else {
 		status = nfserr_stale_clientid;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 8762843..6bdf0d5 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -203,6 +203,9 @@ struct nfs4_client {
 	struct nfsd4_slot	cl_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
 	struct nfs4_sessionid	cl_sessionid;
+
+	/* for nfs41 callbacks */
+	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
 };
 
 /* struct nfs4_client_reset
-- 
1.6.2.1



* [RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC arguments
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (4 preceding siblings ...)
  2009-04-30 23:06 ` [RFC 05/10] nfsd41: callback infrastructure Benny Halevy
@ 2009-04-30 23:06 ` Benny Halevy
  2009-04-30 23:06 ` [RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue Benny Halevy
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:06 UTC (permalink / raw)
  To:  J. Bruce Fields; +Cc: Ricardo Labiaga, pnfs, linux-nfs, Benny Halevy

From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>

Follow the model we use in the client. Make the sequence arguments
part of the regular RPC arguments.  The results point to them.  Adjust
references to the sequence arguments.
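
Roughly, as a standalone mock-up (the struct layout follows the diff below;
the actual rpc_message wiring happens in the cb_recall patch later in this
series):

#include <stdint.h>

struct nfsd4_cb_sequence { uint32_t cbs_minorversion; void *cbs_clp; };
struct nfs4_rpc_args { void *args_op; struct nfsd4_cb_sequence args_seq; };
struct nfs4_rpc_res  { struct nfsd4_cb_sequence *res_seq; };

static void wire_up(struct nfs4_rpc_args *args, struct nfs4_rpc_res *res,
		    void *op_args)
{
	args->args_op = op_args;	 /* per-operation arguments, e.g. cb_recall */
	res->res_seq  = &args->args_seq; /* results point back at the sequence
					  * args embedded in the argument struct */
}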

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
[define struct nfsd4_cb_sequence here]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4callback.c     |    9 +++++++++
 include/linux/nfsd/state.h |    5 +++++
 2 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6f1ca49..14535b2 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -92,6 +92,15 @@ enum nfs_cb_opnum4 {
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
 
+struct nfs4_rpc_args {
+	void				*args_op;
+	struct nfsd4_cb_sequence	args_seq;
+};
+
+struct nfs4_rpc_res {
+	struct nfsd4_cb_sequence	*res_seq;
+};
+
 /*
 * Generic encode routines from fs/nfs/nfs4xdr.c
 */
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 6bdf0d5..aeb9c40 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -61,6 +61,11 @@ typedef struct {
 #define si_stateownerid   si_opaque.so_stateownerid
 #define si_fileid         si_opaque.so_fileid
 
+struct nfsd4_cb_sequence {
+	/* args/res */
+	u32			cbs_minorversion;
+	struct nfs4_client	*cbs_clp;
+};
 
 struct nfs4_cb_recall {
 	u32			cbr_ident;
-- 
1.6.2.1



* [RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (5 preceding siblings ...)
  2009-04-30 23:06 ` [RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC arguments Benny Halevy
@ 2009-04-30 23:06 ` Benny Halevy
  2009-04-30 23:06 ` [RFC 08/10] nfsd41: Backchannel: Setup sequence information Benny Halevy
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:06 UTC (permalink / raw)
  To:  J. Bruce Fields; +Cc: Ricardo Labiaga, pnfs, linux-nfs, Benny Halevy

From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>

RPC callback requests will wait on this wait queue if the backchannel
is out of slots.
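
The intended discipline, sketched standalone (the next patch implements it
with test_and_set_bit() plus rpc_sleep_on()/rpc_wake_up_next(); the toy
primitives below only illustrate the single-slot idea):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_flag cb_slot_busy = ATOMIC_FLAG_INIT;

static bool cb_try_take_slot(void)
{
	/* true if we now own the single backchannel slot */
	return !atomic_flag_test_and_set(&cb_slot_busy);
}

static void cb_release_slot(void)
{
	atomic_flag_clear(&cb_slot_busy);
	/* kernel: rpc_wake_up_next(&clp->cl_cb_waitq) wakes a waiter */
}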

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4state.c        |    2 ++
 include/linux/nfsd/state.h |    4 ++++
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 61d5c66..d7b4028 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -726,6 +726,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	clear_bit(0, &clp->cl_cb_slot_busy);
+	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	return clp;
 }
 
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index aeb9c40..f204ca8 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -210,7 +210,11 @@ struct nfs4_client {
 	struct nfs4_sessionid	cl_sessionid;
 
 	/* for nfs41 callbacks */
+	/* We currently support a single back channel with a single slot */
+	unsigned long		cl_cb_slot_busy;
 	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
+	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
+						/* wait here for slots */
 };
 
 /* struct nfs4_client_reset
-- 
1.6.2.1



* [RFC 08/10] nfsd41: Backchannel: Setup sequence information
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (6 preceding siblings ...)
  2009-04-30 23:06 ` [RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue Benny Halevy
@ 2009-04-30 23:06 ` Benny Halevy
  2009-04-30 23:06 ` [RFC 09/10] nfsd41: cb_sequence callback Benny Halevy
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:06 UTC (permalink / raw)
  To:  J. Bruce Fields; +Cc: Ricardo Labiaga, pnfs, linux-nfs, Benny Halevy

From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>

Follows the model used by the NFS client.  Set up the RPC prepare and done
function pointers so that we can populate the sequence information if
minorversion == 1.  rpc_run_task() is then invoked directly, just like
existing NFS client operations do.

nfsd4_cb_prepare() determines if the sequence information needs to be set up.
If the slot is in use, the task adds itself to the wait queue.

nfsd4_cb_done() wakes anyone sleeping on the callback channel wait queue
after our RPC reply has been received.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
[define cl_cb_seq_nr here]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4callback.c     |  109 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfsd/state.h |    1 +
 2 files changed, 110 insertions(+), 0 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 14535b2..2bf2cd4 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -507,6 +507,115 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 }
 
 /*
+ * There's currently a single callback channel slot.
+ * If the slot is available, then mark it busy.  Otherwise, set the
+ * thread for sleeping on the callback RPC wait queue.
+ */
+static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
+		struct rpc_task *task)
+{
+	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+	struct nfs4_rpc_res *res = task->tk_msg.rpc_resp;
+	u32 *ptr = (u32 *)clp->cl_sessionid.data;
+	int status = 0;
+
+	dprintk("%s: %u:%u:%u:%u\n", __func__,
+		ptr[0], ptr[1], ptr[2], ptr[3]);
+
+	if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
+		dprintk("%s slot is busy\n", __func__);
+		status = -EAGAIN;
+		goto out;
+	}
+
+	/* We'll need the clp during XDR encoding and decoding */
+	args->args_seq.cbs_clp = clp;
+	res->res_seq = &args->args_seq;
+
+out:
+	dprintk("%s status=%d\n", __func__, status);
+	return status;
+}
+
+struct nfsd4_cb_data {
+	struct nfs4_client *clp;
+};
+
+/*
+ * FIXME: cb_sequence should support referring call lists, cachethis, multiple
+ * slots, and mark callback channel down on communication errors.
+ */
+static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_client *clp = ((struct nfsd4_cb_data *)calldata)->clp;
+	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+	u32 minorversion = clp->cl_callback.cb_minorversion;
+	int status = 0;
+
+	args->args_seq.cbs_minorversion = minorversion;
+	if (minorversion) {
+		status = nfsd41_cb_setup_sequence(clp, task);
+		if (status) {
+			if (status != -EAGAIN) {
+				/* terminate rpc task */
+				task->tk_status = status;
+				task->tk_action = NULL;
+			}
+			return;
+		}
+	}
+	rpc_call_start(task);
+}
+
+static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_client *clp = ((struct nfsd4_cb_data *)calldata)->clp;
+
+	dprintk("%s: minorversion=%d\n", __func__,
+		clp->cl_callback.cb_minorversion);
+
+	if (clp->cl_callback.cb_minorversion) {
+		/* No need for lock, access serialized in nfsd4_cb_prepare */
+		++clp->cl_cb_seq_nr;
+		clear_bit(0, &clp->cl_cb_slot_busy);
+		rpc_wake_up_next(&clp->cl_cb_waitq);
+		dprintk("%s: freed slot, new seqid=%d\n", __func__,
+			clp->cl_cb_seq_nr);
+	}
+}
+
+struct rpc_call_ops nfsd4_cb_ops = {
+	.rpc_call_prepare = nfsd4_cb_prepare,
+	.rpc_call_done = nfsd4_cb_done
+};
+
+static int nfsd4_cb_sync(struct nfs4_client *clp, struct rpc_message *msg,
+			 int flags)
+{
+	int status;
+	struct rpc_task *task;
+	struct nfsd4_cb_data data = {
+		.clp = clp
+	};
+
+	struct rpc_task_setup task_setup = {
+		.rpc_client = clp->cl_callback.cb_client,
+		.rpc_message = msg,
+		.callback_ops = &nfsd4_cb_ops,
+		.callback_data = &data,
+		.flags = flags
+	};
+
+	task = rpc_run_task(&task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	status = task->tk_status;
+	rpc_put_task(task);
+	return status;
+}
+
+/*
  * called with dp->dl_count inc'ed.
  */
 void
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index f204ca8..432b5d1 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -212,6 +212,7 @@ struct nfs4_client {
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
+	u32			cl_cb_seq_nr;
 	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
-- 
1.6.2.1



* [RFC 09/10] nfsd41: cb_sequence callback
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (7 preceding siblings ...)
  2009-04-30 23:06 ` [RFC 08/10] nfsd41: Backchannel: Setup sequence information Benny Halevy
@ 2009-04-30 23:06 ` Benny Halevy
  2009-04-30 23:52   ` [pnfs] " Trond Myklebust
  2009-04-30 23:07 ` [RFC 10/10] nfsd41: cb_recall callback Benny Halevy
                   ` (2 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:06 UTC (permalink / raw)
  To:  J. Bruce Fields
  Cc: Ricardo Labiaga, pnfs, linux-nfs, Andy Adamson, Benny Halevy

From: Andy Adamson <andros@netapp.com>

Implement the cb_sequence callback conforming to draft-ietf-nfsv4-minorversion1

Note: the highest slot id and target highest slot id do not have to be 0
as was previously implemented.  They can be greater than what the
NFS server sent if the client supports a larger slot table on the
backchannel.  At this point we just ignore that.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Rework the back channel xdr using the shared v4.0 and v4.1 framework.]
Signed-off-by: Andy Adamson <andros@netapp.com>
[fixed indentation]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: use nfsd4_cb_sequence for callback minorversion]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: fix verification of CB_SEQUENCE highest slot id]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: Backchannel: Remove old backchannel serialization]
[nfsd41: Backchannel: First callback sequence ID should be 1]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4callback.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4state.c    |    1 +
 2 files changed, 73 insertions(+), 0 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 2bf2cd4..78f4dd2 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -264,6 +264,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec,
 	hdr->nops++;
 }
 
+static void
+encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
+		   struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p;
+
+	if (hdr->minorversion == 0)
+		return;
+
+	RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
+
+	WRITE32(OP_CB_SEQUENCE);
+	WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITE32(args->cbs_clp->cl_cb_seq_nr);
+	WRITE32(0);		/* slotid, always 0 */
+	WRITE32(0);		/* highest slotid always 0 */
+	WRITE32(0);		/* cachethis always 0 */
+	WRITE32(0); /* FIXME: support referring_call_lists */
+	hdr->nops++;
+}
+
 static int
 nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 {
@@ -325,6 +346,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	return 0;
 }
 
+/*
+ * Our current back channel implmentation supports a single backchannel
+ * with a single slot.
+ */
+static int
+decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
+		   struct rpc_rqst *rqstp)
+{
+	struct nfs4_sessionid id;
+	int status;
+	u32 dummy;
+	__be32 *p;
+
+	if (res->cbs_minorversion == 0)
+		return 0;
+
+	status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
+	if (status)
+		return status;
+
+	/*
+	 * If the server returns different values for sessionID, slotID or
+	 * sequence number, the server is looney tunes.
+	 */
+	status = -ESERVERFAULT;
+
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
+	if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
+		   NFS4_MAX_SESSIONID_LEN)) {
+		dprintk("%s Invalid session id\n", __func__);
+		goto out;
+	}
+	READ32(dummy);
+	if (dummy != res->cbs_clp->cl_cb_seq_nr) {
+		dprintk("%s Invalid sequence number\n", __func__);
+		goto out;
+	}
+	READ32(dummy); 	/* slotid must be 0 */
+	if (dummy != 0) {
+		dprintk("%s Invalid slotid\n", __func__);
+		goto out;
+	}
+	READ32(dummy); 	/* highest slotid */
+	READ32(dummy); 	/* target highest slotid */
+	status = 0;
+out:
+	return status;
+}
+
+
 static int
 nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d7b4028..6f3cf47 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1400,6 +1400,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			svc_xprt_get(unconf->cl_cb_xprt);
 			unconf->cl_callback.cb_minorversion =
 				cstate->minorversion;
+			unconf->cl_cb_seq_nr = 1;
 			unconf->cl_callback.cb_prog = cr_ses->callback_prog;
 			nfsd4_probe_callback(unconf);
 		}
-- 
1.6.2.1



* [RFC 10/10] nfsd41: cb_recall callback
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (8 preceding siblings ...)
  2009-04-30 23:06 ` [RFC 09/10] nfsd41: cb_sequence callback Benny Halevy
@ 2009-04-30 23:07 ` Benny Halevy
  2009-04-30 23:12 ` [pnfs] [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
  2009-05-03 20:53 ` J. Bruce Fields
  11 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:07 UTC (permalink / raw)
  To:  J. Bruce Fields
  Cc: Ricardo Labiaga, pnfs, linux-nfs, Andy Adamson, Ricardo Labiaga,
	Benny Halevy

From: Andy Adamson <andros@netapp.com>

Implement the cb_recall callback conforming to
http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-26

Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Share v4.0 and v4.1 back channel xdr]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: use nfsd4_cb_sequence for callback minorversion]
[nfsd41: conditionally decode_sequence in nfs4_xdr_dec_cb_recall]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: Backchannel: Add sequence arguments to callback RPC arguments]
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/nfs4callback.c |   30 ++++++++++++++++++++++++------
 1 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 78f4dd2..136dbda 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -296,15 +296,19 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 }
 
 static int
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args)
+nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
+		       struct nfs4_rpc_args *rpc_args)
 {
 	struct xdr_stream xdr;
+	struct nfs4_cb_recall *args = rpc_args->args_op;
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = args->cbr_ident,
+		.minorversion = rpc_args->args_seq.cbs_minorversion,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_cb_compound_hdr(&xdr, &hdr);
+	encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
 	encode_cb_recall(&xdr, args, &hdr);
 	encode_cb_nops(&hdr);
 	return 0;
@@ -404,7 +408,8 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 }
 
 static int
-nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
+nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
+		       struct nfs4_rpc_res *rpc_res)
 {
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr;
@@ -414,6 +419,11 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
 	status = decode_cb_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	if (rpc_res && rpc_res->res_seq) {
+		status = decode_cb_sequence(&xdr, rpc_res->res_seq, rqstp);
+		if (status)
+			goto out;
+	}
 	status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
 out:
 	return status;
@@ -694,19 +704,25 @@ void
 nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfs4_client *clp = dp->dl_client;
-	struct rpc_clnt *clnt = clp->cl_callback.cb_client;
 	struct nfs4_cb_recall *cbr = &dp->dl_recall;
+	struct nfs4_rpc_args args = {
+		.args_op = cbr,
+	};
+	struct nfs4_rpc_res res;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
-		.rpc_argp = cbr,
+		.rpc_argp = &args,
+		.rpc_resp = &res,
 	};
 	int retries = 1;
 	int status = 0;
 
+	dprintk("%s: dp %p\n", __func__, dp);
+
 	cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
 	cbr->cbr_dp = dp;
 
-	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
+	status = nfsd4_cb_sync(clp, &msg, RPC_TASK_SOFT);
 	while (retries--) {
 		switch (status) {
 			case -EIO:
@@ -721,13 +737,15 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 				goto out_put_cred;
 		}
 		ssleep(2);
-		status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
+		status = nfsd4_cb_sync(clp, &msg, RPC_TASK_SOFT);
 	}
 out_put_cred:
 	/*
 	 * Success or failure, now we're either waiting for lease expiration
 	 * or deleg_return.
 	 */
+	dprintk("%s: dp %p dl_flock %p dl_count %d\n",
+		__func__, dp, dp->dl_flock, atomic_read(&dp->dl_count));
 	put_nfs4_client(clp);
 	nfs4_put_delegation(dp);
 	return;
-- 
1.6.2.1



* Re: [pnfs] [RFC 0/10] nfsd41 server backchannel for 2.6.31
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (9 preceding siblings ...)
  2009-04-30 23:07 ` [RFC 10/10] nfsd41: cb_recall callback Benny Halevy
@ 2009-04-30 23:12 ` Benny Halevy
  2009-05-03 20:53 ` J. Bruce Fields
  11 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-04-30 23:12 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: NFS list, pNFS Mailing List, Ricardo Labiaga

On May. 01, 2009, 2:00 +0300, Benny Halevy <bhalevy@panasas.com> wrote:
> Bruce,
> 
> After squashing and merging Ricardo's latest patchset
> please review the following patchset and consider for 2.6.31.

I forgot to mention that this patchset is also available at
git://linux-nfs.org/~bhalevy/linux-pnfs.git nfsd41-for-2.6.31
which is based on
git://git.linux-nfs.org/~bfields/linux.git for-2.6.31

Benny

> 
> Thanks,
> 
> Benny
> 
> [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines
> [RFC 02/10] nfsd: minorversion support for the back channel
> [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
> [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks
> [RFC 05/10] nfsd41: callback infrastructure
> [RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC arguments
> [RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue
> [RFC 08/10] nfsd41: Backchannel: Setup sequence information
> [RFC 09/10] nfsd41: cb_sequence callback
> [RFC 10/10] nfsd41: cb_recall callback
> _______________________________________________
> pNFS mailing list
> pNFS@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs


* Re: [pnfs] [RFC 09/10] nfsd41: cb_sequence callback
  2009-04-30 23:06 ` [RFC 09/10] nfsd41: cb_sequence callback Benny Halevy
@ 2009-04-30 23:52   ` Trond Myklebust
       [not found]     ` <1241135565.15476.111.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org>
  0 siblings, 1 reply; 29+ messages in thread
From: Trond Myklebust @ 2009-04-30 23:52 UTC (permalink / raw)
  To: Benny Halevy; +Cc: J. Bruce Fields, Andy Adamson, linux-nfs, pnfs

On Fri, 2009-05-01 at 02:06 +0300, Benny Halevy wrote:
> From: Andy Adamson <andros@netapp.com>
> 
> Implement the cb_sequence callback conforming to draft-ietf-nfsv4-minorversion1
> 
> Note: highest slot id and target highest slot id do not have to be 0
> as was previously implemented.  They can be greater than what the
> nfs server sent if the client supports a larger slot table on the
> backchannel.  At this point we just ignore that.
> 
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> [Rework the back channel xdr using the shared v4.0 and v4.1 framework.]
> Signed-off-by: Andy Adamson <andros@netapp.com>
> [fixed indentation]
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> [nfsd41: use nfsd4_cb_sequence for callback minorversion]
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> [nfsd41: fix verification of CB_SEQUENCE highest slot id[
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> [nfsd41: Backchannel: Remove old backchannel serialization]
> [nfsd41: Backchannel: First callback sequence ID should be 1]
> Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> ---
>  fs/nfsd/nfs4callback.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs4state.c    |    1 +
>  2 files changed, 73 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index 2bf2cd4..78f4dd2 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -264,6 +264,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec,
>  	hdr->nops++;
>  }
>  
> +static void
> +encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
> +		   struct nfs4_cb_compound_hdr *hdr)
> +{
> +	__be32 *p;
> +
> +	if (hdr->minorversion == 0)
> +		return;
> +
> +	RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
> +
> +	WRITE32(OP_CB_SEQUENCE);
> +	WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
> +	WRITE32(args->cbs_clp->cl_cb_seq_nr);
> +	WRITE32(0);		/* slotid, always 0 */
> +	WRITE32(0);		/* highest slotid always 0 */
> +	WRITE32(0);		/* cachethis always 0 */
> +	WRITE32(0); /* FIXME: support referring_call_lists */
> +	hdr->nops++;
> +}
> +
>  static int
>  nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
>  {
> @@ -325,6 +346,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
>  	return 0;
>  }
>  
> +/*
> + * Our current back channel implmentation supports a single backchannel
> + * with a single slot.
> + */
> +static int
> +decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
> +		   struct rpc_rqst *rqstp)
> +{
> +	struct nfs4_sessionid id;
> +	int status;
> +	u32 dummy;
> +	__be32 *p;
> +
> +	if (res->cbs_minorversion == 0)
> +		return 0;
> +
> +	status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
> +	if (status)
> +		return status;
> +
> +	/*
> +	 * If the server returns different values for sessionID, slotID or
> +	 * sequence number, the server is looney tunes.
> +	 */
> +	status = -ESERVERFAULT;
> +
> +	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
> +	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
> +	if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
> +		   NFS4_MAX_SESSIONID_LEN)) {
> +		dprintk("%s Invalid session id\n", __func__);
> +		goto out;
> +	}
> +	READ32(dummy);
> +	if (dummy != res->cbs_clp->cl_cb_seq_nr) {
> +		dprintk("%s Invalid sequence number\n", __func__);
> +		goto out;
> +	}
> +	READ32(dummy); 	/* slotid must be 0 */
> +	if (dummy != 0) {
> +		dprintk("%s Invalid slotid\n", __func__);
> +		goto out;
> +	}
> +	READ32(dummy); 	/* highest slotid */
          ^^^^^^^^^^^^^

> +	READ32(dummy); 	/* target highest slotid */
          ^^^^^^^^^^^^^^

Why do you need those?

> +	status = 0;
> +out:
> +	return status;
> +}
> +
> +
>  static int
>  nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
>  {
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index d7b4028..6f3cf47 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -1400,6 +1400,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
>  			svc_xprt_get(unconf->cl_cb_xprt);
>  			unconf->cl_callback.cb_minorversion =
>  				cstate->minorversion;
> +			unconf->cl_cb_seq_nr = 1;
>  			unconf->cl_callback.cb_prog = cr_ses->callback_prog;
>  			nfsd4_probe_callback(unconf);
>  		}
> -- 
> 1.6.2.1
> 
> _______________________________________________
> pNFS mailing list
> pNFS@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs




* Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
  2009-04-30 23:05 ` [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Benny Halevy
@ 2009-05-01  0:05   ` Trond Myklebust
       [not found]     ` <1241136328.15476.124.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org>
  2009-05-03 20:36   ` J. Bruce Fields
  1 sibling, 1 reply; 29+ messages in thread
From: Trond Myklebust @ 2009-05-01  0:05 UTC (permalink / raw)
  To: Benny Halevy; +Cc: J. Bruce Fields, linux-nfs, pnfs, Andy Adamson

On Fri, 2009-05-01 at 02:05 +0300, Benny Halevy wrote:
> From: Rahul Iyer <iyer@netapp.com>
> 
> FIXME: bhalevy: write up commit message
> 
> Signed-off-by: Rahul Iyer <iyer@netapp.com>
> Signed-off-by: Mike Sager <sager@netapp.com>
> Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> 
> When the call direction is a reply, copy the xid and call direction into the
> req->rq_private_buf.head[0].iov_base otherwise rpc_verify_header returns
> rpc_garbage.
> 
> Signed-off-by: Andy Adamson <andros@netapp.com>
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> [get rid of CONFIG_NFSD_V4_1]
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> ---
>  include/linux/sunrpc/clnt.h    |    1 +
>  include/linux/sunrpc/svcsock.h |    1 +
>  include/linux/sunrpc/xprt.h    |    2 +
>  net/sunrpc/clnt.c              |    1 +
>  net/sunrpc/svcsock.c           |   68 ++++++++++-
>  net/sunrpc/xprt.c              |   41 ++++++-
>  net/sunrpc/xprtsock.c          |  278 +++++++++++++++++++++++++++++++++++++++-
>  7 files changed, 381 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
> index c39a210..cf9a8ec 100644
> --- a/include/linux/sunrpc/clnt.h
> +++ b/include/linux/sunrpc/clnt.h
> @@ -110,6 +110,7 @@ struct rpc_create_args {
>  	rpc_authflavor_t	authflavor;
>  	unsigned long		flags;
>  	char			*client_name;
> +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
>  };
>  
>  /* Values for "flags" field */
> diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
> index 8271631..19228f4 100644
> --- a/include/linux/sunrpc/svcsock.h
> +++ b/include/linux/sunrpc/svcsock.h
> @@ -28,6 +28,7 @@ struct svc_sock {
>  	/* private TCP part */
>  	u32			sk_reclen;	/* length of record */
>  	u32			sk_tcplen;	/* current read length */
> +	struct rpc_xprt	       *sk_bc_xprt;	/* NFSv4.1 backchannel xprt */
>  };
>  
>  /*
> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> index 1758d9f..063a6a7 100644
> --- a/include/linux/sunrpc/xprt.h
> +++ b/include/linux/sunrpc/xprt.h
> @@ -174,6 +174,7 @@ struct rpc_xprt {
>  	spinlock_t		reserve_lock;	/* lock slot table */
>  	u32			xid;		/* Next XID value to use */
>  	struct rpc_task *	snd_task;	/* Task blocked in send */
> +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
>  	struct list_head	recv;
>  
>  	struct {
> @@ -197,6 +198,7 @@ struct xprt_create {
>  	struct sockaddr *	srcaddr;	/* optional local address */
>  	struct sockaddr *	dstaddr;	/* remote peer address */
>  	size_t			addrlen;
> +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel */
>  };
>  
>  struct xprt_class {
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 5abab09..3dc847f 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -266,6 +266,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
>  		.srcaddr = args->saddress,
>  		.dstaddr = args->address,
>  		.addrlen = args->addrsize,
> +		.bc_sock = args->bc_sock,
>  	};
>  	char servername[48];
>  
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 4e6d406..619764e 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -49,6 +49,7 @@
>  #include <linux/sunrpc/msg_prot.h>
>  #include <linux/sunrpc/svcsock.h>
>  #include <linux/sunrpc/stats.h>
> +#include <linux/sunrpc/xprt.h>
>  
>  #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
>  
> @@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>  	int		len;
>  	struct kvec *vec;
>  	int pnum, vlen;
> +	struct rpc_rqst *req = NULL;
>  
>  	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
>  		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
> @@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>  	len = svsk->sk_reclen;
>  	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
>  
> +	/*
> +	 * We have enough data for the whole tcp record. Let's try and read the
> +	 * first 8 bytes to get the xid and the call direction. We can use this
> +	 * to figure out if this is a call or a reply to a callback. If
> +	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
> +	 * In that case, don't bother with the calldir and just read the data.
> +	 * It will be rejected in svc_process.
> +	 */
> +
>  	vec = rqstp->rq_vec;
>  	vec[0] = rqstp->rq_arg.head[0];
>  	vlen = PAGE_SIZE;
> +
> +	if (len >= 8) {
> +		u32 *p;
> +		u32 xid;
> +		u32 calldir;
> +
> +		len = svc_recvfrom(rqstp, vec, 1, 8);
> +		if (len < 0)
> +			goto error;
> +
> +		p = (u32 *)rqstp->rq_arg.head[0].iov_base;
> +		xid = *p++;
> +		calldir = *p;
> +
> +		if (calldir) {
> +			/* REPLY */
> +			if (svsk->sk_bc_xprt)
> +				req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid);
> +			if (req) {
> +				memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
> +					sizeof(struct xdr_buf));
> +				/* copy the xid and call direction */
> +				memcpy(req->rq_private_buf.head[0].iov_base,
> +					rqstp->rq_arg.head[0].iov_base, 8);
> +				vec[0] = req->rq_private_buf.head[0];
> +			} else
> +				printk(KERN_NOTICE
> +					"%s: Got unrecognized reply: "
> +					"calldir 0x%x sk_bc_xprt %p xid %08x\n",
> +					__func__, ntohl(calldir),
> +					svsk->sk_bc_xprt, xid);
> +		}
> +
> +		if (!calldir || !req)
> +			vec[0] = rqstp->rq_arg.head[0];
> +
> +		vec[0].iov_base += 8;
> +		vec[0].iov_len -= 8;
> +		len = svsk->sk_reclen - 8;
> +		vlen -= 8;
> +	}
> +
>  	pnum = 1;
>  	while (vlen < len) {
> -		vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]);
> +		vec[pnum].iov_base = (req) ?
> +			page_address(req->rq_private_buf.pages[pnum - 1]) :
> +			page_address(rqstp->rq_pages[pnum]);
>  		vec[pnum].iov_len = PAGE_SIZE;
>  		pnum++;
>  		vlen += PAGE_SIZE;
> @@ -908,6 +963,16 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>  	if (len < 0)
>  		goto error;
>  
> +	/*
> +	 * Account for the 8 bytes we read earlier
> +	 */
> +	len += 8;
> +
> +	if (req) {
> +		xprt_complete_rqst(req->rq_task, len);
> +		len = 0;
> +		goto out;
> +	}
>  	dprintk("svc: TCP complete record (%d bytes)\n", len);
>  	rqstp->rq_arg.len = len;
>  	rqstp->rq_arg.page_base = 0;
> @@ -921,6 +986,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>  	rqstp->rq_xprt_ctxt   = NULL;
>  	rqstp->rq_prot	      = IPPROTO_TCP;
>  
> +out:
>  	/* Reset TCP read info */
>  	svsk->sk_reclen = 0;
>  	svsk->sk_tcplen = 0;
> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> index a0bfe53..03f175e 100644
> --- a/net/sunrpc/xprt.c
> +++ b/net/sunrpc/xprt.c
> @@ -1015,6 +1015,27 @@ void xprt_release(struct rpc_task *task)
>  	spin_unlock(&xprt->reserve_lock);
>  }
>  
> +/*
> + * The autoclose function for the back channel
> + *
> + * The callback channel should never close the channel,
> + * let the forechannel do that.
> + */
> +static void bc_autoclose(struct work_struct *work)
> +{
> +	return;
> +}
> +
> +
> +/*
> + * The autodisconnect routine for the back channel. We never disconnect
> + */
> +static void
> +bc_init_autodisconnect(unsigned long data)
> +{
> +	return;
> +}
> +
>  /**
>   * xprt_create_transport - create an RPC transport
>   * @args: rpc transport creation arguments
> @@ -1051,9 +1072,16 @@ found:
>  
>  	INIT_LIST_HEAD(&xprt->free);
>  	INIT_LIST_HEAD(&xprt->recv);
> -	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> -	setup_timer(&xprt->timer, xprt_init_autodisconnect,
> -			(unsigned long)xprt);
> +	if (args->bc_sock) {
> +		INIT_WORK(&xprt->task_cleanup, bc_autoclose);
> +		setup_timer(&xprt->timer, bc_init_autodisconnect,
> +			    (unsigned long)xprt);

Hrmph... Why do you need dummy routines here?

> +	} else {
> +		INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> +		setup_timer(&xprt->timer, xprt_init_autodisconnect,
> +			    (unsigned long)xprt);
> +	}
> +
>  	xprt->last_used = jiffies;
>  	xprt->cwnd = RPC_INITCWND;
>  	xprt->bind_index = 0;
> @@ -1073,6 +1101,13 @@ found:
>  	dprintk("RPC:       created transport %p with %u slots\n", xprt,
>  			xprt->max_reqs);
>  
> +	/*
> +	 * Since we don't want connections for the backchannel, we set
> +	 * the xprt status to connected
> +	 */
> +	if (args->bc_sock)
> +		xprt_set_connected(xprt);
> +
>  	return xprt;
>  }
>  
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index d40ff50..067d205 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -32,6 +32,7 @@
>  #include <linux/tcp.h>
>  #include <linux/sunrpc/clnt.h>
>  #include <linux/sunrpc/sched.h>
> +#include <linux/sunrpc/svcsock.h>
>  #include <linux/sunrpc/xprtsock.h>
>  #include <linux/file.h>
>  
> @@ -1966,6 +1967,219 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
>  			xprt->stat.bklog_u);
>  }
>  
> +/*
> + * The connect worker for the backchannel
> + * This should never be called as we should never need to connect
> + */
> +static void bc_connect_worker(struct work_struct *work)
> +{
> +	BUG();
> +}
> +
> +/*
> + * The set_port routine of the rpc_xprt_ops. This is related to the portmapper
> + * and should never be called
> + */
> +
> +static void bc_set_port(struct rpc_xprt *xprt, unsigned short port)
> +{
> +	BUG();
> +}
> +
> +/*
> + * The connect routine for the backchannel rpc_xprt ops
> + * Again, should never be called!
> + */
> +
> +static void bc_connect(struct rpc_task *task)
> +{
> +	BUG();
> +}
> +
> +struct rpc_buffer {
> +	size_t	len;
> +	char	data[];
> +};
> +/*
> + * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason
> + * we allocate pages instead of doing a kmalloc like rpc_malloc is because we want
> + * to use the server side send routines.
> + */
> +void *bc_malloc(struct rpc_task *task, size_t size)
> +{
> +	struct page *page;
> +	struct rpc_buffer *buf;
> +
> +	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
> +	page = alloc_page(GFP_KERNEL);
> +
> +	if (!page)
> +		return NULL;
> +
> +	buf = page_address(page);
> +	buf->len = PAGE_SIZE;
> +
> +	return buf->data;
> +}
> +

__get_free_page()? Why can't you kmalloc() here?

> +/*
> + * Free the space allocated in the bc_alloc routine
> + */
> +void bc_free(void *buffer)
> +{
> +	struct rpc_buffer *buf;
> +
> +	if (!buffer)
> +		return;
> +
> +	buf = container_of(buffer, struct rpc_buffer, data);
> +	free_pages((unsigned long)buf, get_order(buf->len));

This looks funky... Why can't you just call free_page()? You already
know from bc_malloc() that this is an order 0 page allocation.
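IOW, something like

	free_page((unsigned long)buf);

should be all that's needed here.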

> +}
> +
> +/*
> + * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
> + * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
> + */
> +static int bc_sendto(struct rpc_rqst *req)
> +{
> +	int total_len;
> +	int len;
> +	int size;
> +	int result;
> +	struct xdr_buf *xbufp = &req->rq_snd_buf;
> +	struct page **pages = xbufp->pages;
> +	unsigned int flags = MSG_MORE;
> +	unsigned int pglen = xbufp->page_len;
> +	size_t base = xbufp->page_base;
> +	struct rpc_xprt *xprt = req->rq_xprt;
> +	struct sock_xprt *transport =
> +				container_of(xprt, struct sock_xprt, xprt);
> +	struct socket *sock = transport->sock;
> +
> +	total_len = xbufp->len;
> +
> +	/*
> +	 * Set up the rpc header and record marker stuff
> +	 */
> +	xs_encode_tcp_record_marker(xbufp);
> +
> +	/*
> +	 * The RPC message is divided into 3 pieces:
> +	 * - The header: This is what most of the smaller RPC messages consist
> +	 *   of. Often the whole message is in this.
> +	 *
> +	 *   - xdr->pages: This is a list of pages that contain data, for
> +	 *   example in a write request or while using rpcsec gss
> +	 *
> +	 *   - The tail: This is the rest of the rpc message
> +	 *
> +	 *  First we send the header, then the pages and then finally the tail.
> +	 *  The code borrows heavily from svc_sendto.
> +	 */
> +
> +	/*
> +	 * Send the head
> +	 */
> +	if (total_len == xbufp->head[0].iov_len)
> +		flags = 0;
> +
> +	len = sock->ops->sendpage(sock, virt_to_page(xbufp->head[0].iov_base),
> +			(unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK,
> +			xbufp->head[0].iov_len, flags);

Why do you need to do this? The head iovec is supposed to be reserved
for kmalloc()ed memory, which cannot be used together with sendpage().
Somebody, some day is going to mess up and try to put a kmalloced buffer
in here, and will wonder why the above doesn't work.

If you are sending pages, then please put them in the page list part of
the xdr_buf. There is no rule that the RPC call _must_ have a non-zero
head.

> +
> +	if (len != xbufp->head[0].iov_len)
> +		goto out;
> +
> +	/*
> +	 * send page data
> +	 *
> +	 * Check the amount of data to be sent. If it is less than the
> +	 * remaining page, then send it else send the current page
> +	 */
> +
> +	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
> +	while (pglen > 0) {
> +		if (total_len == size)
> +			flags = 0;
> +		result = sock->ops->sendpage(sock, *pages, base, size, flags);
> +		if (result > 0)
> +			len += result;
> +		if (result != size)
> +			goto out;
> +		total_len -= size;
> +		pglen -= size;
> +		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
> +		base = 0;
> +		pages++;
> +	}
> +	/*
> +	 * send tail
> +	 */
> +	if (xbufp->tail[0].iov_len) {
> +		result = sock->ops->sendpage(sock,
> +			xbufp->tail[0].iov_base,
> +			(unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK,
> +			xbufp->tail[0].iov_len,
> +			0);

Ditto.

> +
> +		if (result > 0)
> +			len += result;
> +	}
> +out:
> +	if (len != xbufp->len)
> +		printk(KERN_NOTICE "Error sending entire callback!\n");
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Then what? Shouldn't you be closing the connection here?

> +
> +	return len;
> +}
> +
> +/*
> + * The send routine. Borrows from svc_send
> + */
> +static int bc_send_request(struct rpc_task *task)
> +{
> +	struct rpc_rqst *req = task->tk_rqstp;
> +	struct rpc_xprt *bc_xprt = req->rq_xprt;
> +	struct svc_xprt	*xprt;
> +	struct svc_sock         *svsk;
> +	u32                     len;
> +
> +	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
> +	/*
> +	 * Get the server socket associated with this callback xprt
> +	 */
> +	svsk = bc_xprt->bc_sock;
> +	xprt = &svsk->sk_xprt;
> +
> +	mutex_lock(&xprt->xpt_mutex);
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Eh? What's this, in which patch is it defined, and why is it at all
needed?

> +	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^
Where is this defined, and why is it needed? The xprt already has a
connected/unconnected flag.

> +		len = -ENOTCONN;
> +	else
> +		len = bc_sendto(req);
> +	mutex_unlock(&xprt->xpt_mutex);
> +
> +	return 0;
> +
> +}
> +
> +/*
> + * The close routine. Since this is client initiated, we do nothing
> + */
> +
> +static void bc_close(struct rpc_xprt *xprt)
> +{
> +	return;
> +}
> +
> +/*
> + * The xprt destroy routine. Again, because this connection is client
> + * initiated, we do nothing
> + */
> +
> +static void bc_destroy(struct rpc_xprt *xprt)
> +{
> +	return;
> +}
> +
>  static struct rpc_xprt_ops xs_udp_ops = {
>  	.set_buffer_size	= xs_udp_set_buffer_size,
>  	.reserve_xprt		= xprt_reserve_xprt_cong,
> @@ -1999,6 +2213,24 @@ static struct rpc_xprt_ops xs_tcp_ops = {
>  	.print_stats		= xs_tcp_print_stats,
>  };
>  
> +/*
> + * The rpc_xprt_ops for the server backchannel
> + */
> +
> +static struct rpc_xprt_ops bc_tcp_ops = {
> +	.reserve_xprt		= xprt_reserve_xprt,
> +	.release_xprt		= xprt_release_xprt,
> +	.set_port		= bc_set_port,
> +	.connect		= bc_connect,
> +	.buf_alloc		= bc_malloc,
> +	.buf_free		= bc_free,
> +	.send_request		= bc_send_request,
> +	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
> +	.close			= bc_close,
> +	.destroy		= bc_destroy,
> +	.print_stats		= xs_tcp_print_stats,
> +};
> +
>  static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
>  				      unsigned int slot_table_size)
>  {
> @@ -2131,13 +2363,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
>  	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
>  	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
>  
> -	xprt->bind_timeout = XS_BIND_TO;
> -	xprt->connect_timeout = XS_TCP_CONN_TO;
> -	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> -	xprt->idle_timeout = XS_IDLE_DISC_TO;
> +	if (args->bc_sock) {
> +		/* backchannel */
> +		xprt_set_bound(xprt);
> +		INIT_DELAYED_WORK(&transport->connect_worker,
> +				  bc_connect_worker);

Errm.... Is it really such a good idea to tell the RPC layer that it can
reconnect at any time using a routine that will BUG()?

> +		xprt->bind_timeout = 0;
> +		xprt->connect_timeout = 0;
> +		xprt->reestablish_timeout = 0;
> +		xprt->idle_timeout = (~0);
>  
> -	xprt->ops = &xs_tcp_ops;
> -	xprt->timeout = &xs_tcp_default_timeout;
> +		/*
> +		 * The backchannel uses the same socket connection as the
> +		 * forechannel
> +		 */
> +		xprt->bc_sock = args->bc_sock;
> +		xprt->bc_sock->sk_bc_xprt = xprt;
> +		transport->sock = xprt->bc_sock->sk_sock;
> +		transport->inet = xprt->bc_sock->sk_sk;
> +
> +		xprt->ops = &bc_tcp_ops;
> +
> +		goto next;
> +	}
>  
>  	switch (addr->sa_family) {
>  	case AF_INET:
> @@ -2145,13 +2393,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
>  			xprt_set_bound(xprt);
>  
>  		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
> -		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
>  		break;
>  	case AF_INET6:
>  		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
>  			xprt_set_bound(xprt);
>  
>  		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
> +		break;
> +	}
> +	xprt->bind_timeout = XS_BIND_TO;
> +	xprt->connect_timeout = XS_TCP_CONN_TO;
> +	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> +	xprt->idle_timeout = XS_IDLE_DISC_TO;
> +
> +	xprt->ops = &xs_tcp_ops;
> +
> +next:
> +	xprt->timeout = &xs_tcp_default_timeout;
> +
> +	switch (addr->sa_family) {

Why do we suddenly need 2 switch statements here?

> +	case AF_INET:
> +		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
> +		break;
> +	case AF_INET6:
>  		xs_format_ipv6_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
>  		break;
>  	default:



^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
       [not found]     ` <1241136328.15476.124.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org>
@ 2009-05-01  0:13       ` Labiaga, Ricardo
  2009-06-02  0:33       ` Labiaga, Ricardo
  1 sibling, 0 replies; 29+ messages in thread
From: Labiaga, Ricardo @ 2009-05-01  0:13 UTC (permalink / raw)
  To: Trond Myklebust, Benny Halevy, Iyer, Rahul
  Cc: Adamson, Andy, J. Bruce Fields, pnfs, linux-nfs

Adding Rahul (the original author)...

- ricardo

> -----Original Message-----
> From: Trond Myklebust [mailto:trond.myklebust@fys.uio.no]
> Sent: Thursday, April 30, 2009 5:05 PM
> To: Benny Halevy
> Cc: Adamson, Andy; J. Bruce Fields; pnfs@linux-nfs.org; linux-
> nfs@vger.kernel.org
> Subject: Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side
> backchannel handling

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [pnfs] [RFC 09/10] nfsd41: cb_sequence callback
       [not found]     ` <1241135565.15476.111.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org>
@ 2009-05-01  8:33       ` Benny Halevy
  0 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-05-01  8:33 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: J. Bruce Fields, Andy Adamson, linux-nfs, pnfs

On May. 01, 2009, 2:52 +0300, Trond Myklebust <trond.myklebust@fys.uio.no> wrote:
> On Fri, 2009-05-01 at 02:06 +0300, Benny Halevy wrote:
>> From: Andy Adamson <andros@netapp.com>
>>
>> Implement the cb_sequence callback conforming to draft-ietf-nfsv4-minorversion1
>>
>> Note: highest slot id and target highest slot id do not have to be 0
>> as was previously implemented.  They can be greater than what the
>> nfs server sent if the client supports a larger slot table on the
>> backchannel.  At this point we just ignore that.
>>
>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>> [Rework the back channel xdr using the shared v4.0 and v4.1 framework.]
>> Signed-off-by: Andy Adamson <andros@netapp.com>
>> [fixed indentation]
>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>> [nfsd41: use nfsd4_cb_sequence for callback minorversion]
>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>> [nfsd41: fix verification of CB_SEQUENCE highest slot id[
>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>> [nfsd41: Backchannel: Remove old backchannel serialization]
>> [nfsd41: Backchannel: First callback sequence ID should be 1]
>> Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>> ---
>>  fs/nfsd/nfs4callback.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/nfsd/nfs4state.c    |    1 +
>>  2 files changed, 73 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
>> index 2bf2cd4..78f4dd2 100644
>> --- a/fs/nfsd/nfs4callback.c
>> +++ b/fs/nfsd/nfs4callback.c
>> @@ -264,6 +264,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec,
>>  	hdr->nops++;
>>  }
>>  
>> +static void
>> +encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
>> +		   struct nfs4_cb_compound_hdr *hdr)
>> +{
>> +	__be32 *p;
>> +
>> +	if (hdr->minorversion == 0)
>> +		return;
>> +
>> +	RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
>> +
>> +	WRITE32(OP_CB_SEQUENCE);
>> +	WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
>> +	WRITE32(args->cbs_clp->cl_cb_seq_nr);
>> +	WRITE32(0);		/* slotid, always 0 */
>> +	WRITE32(0);		/* highest slotid always 0 */
>> +	WRITE32(0);		/* cachethis always 0 */
>> +	WRITE32(0); /* FIXME: support referring_call_lists */
>> +	hdr->nops++;
>> +}
>> +
>>  static int
>>  nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
>>  {
>> @@ -325,6 +346,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
>>  	return 0;
>>  }
>>  
>> +/*
>> + * Our current back channel implementation supports a single backchannel
>> + * with a single slot.
>> + */
>> +static int
>> +decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
>> +		   struct rpc_rqst *rqstp)
>> +{
>> +	struct nfs4_sessionid id;
>> +	int status;
>> +	u32 dummy;
>> +	__be32 *p;
>> +
>> +	if (res->cbs_minorversion == 0)
>> +		return 0;
>> +
>> +	status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
>> +	if (status)
>> +		return status;
>> +
>> +	/*
>> +	 * If the server returns different values for sessionID, slotID or
>> +	 * sequence number, the server is looney tunes.
>> +	 */
>> +	status = -ESERVERFAULT;
>> +
>> +	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
>> +	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
>> +	if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
>> +		   NFS4_MAX_SESSIONID_LEN)) {
>> +		dprintk("%s Invalid session id\n", __func__);
>> +		goto out;
>> +	}
>> +	READ32(dummy);
>> +	if (dummy != res->cbs_clp->cl_cb_seq_nr) {
>> +		dprintk("%s Invalid sequence number\n", __func__);
>> +		goto out;
>> +	}
>> +	READ32(dummy); 	/* slotid must be 0 */
>> +	if (dummy != 0) {
>> +		dprintk("%s Invalid slotid\n", __func__);
>> +		goto out;
>> +	}
>> +	READ32(dummy); 	/* highest slotid */
>           ^^^^^^^^^^^^^
> 
>> +	READ32(dummy); 	/* target highest slotid */
>           ^^^^^^^^^^^^^^
> 
> Why do you need those?

Good catch :)
Since READ_BUF (xdr_inline_decode) already takes care of
adjusting the xdr_stream pointer, we can take them out.
A FIXME comment about the need to process these channel
attributes would be clearer.
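
Something like this (untested) is what I have in mind:

	READ32(dummy);	/* slotid must be 0 */
	if (dummy != 0) {
		dprintk("%s Invalid slotid\n", __func__);
		goto out;
	}
	/*
	 * FIXME: the highest slotid and target highest slotid channel
	 * attributes are ignored for now; READ_BUF above has already
	 * advanced the stream past them.
	 */
	status = 0;
out:
	return status;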

Benny

> 
>> +	status = 0;
>> +out:
>> +	return status;
>> +}
>> +
>> +
>>  static int
>>  nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
>>  {
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index d7b4028..6f3cf47 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -1400,6 +1400,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
>>  			svc_xprt_get(unconf->cl_cb_xprt);
>>  			unconf->cl_callback.cb_minorversion =
>>  				cstate->minorversion;
>> +			unconf->cl_cb_seq_nr = 1;
>>  			unconf->cl_callback.cb_prog = cr_ses->callback_prog;
>>  			nfsd4_probe_callback(unconf);
>>  		}
>> -- 
>> 1.6.2.1
>>
>> _______________________________________________
>> pNFS mailing list
>> pNFS@linux-nfs.org
>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
> 
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
  2009-04-30 23:05 ` [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Benny Halevy
  2009-05-01  0:05   ` [pnfs] " Trond Myklebust
@ 2009-05-03 20:36   ` J. Bruce Fields
  1 sibling, 0 replies; 29+ messages in thread
From: J. Bruce Fields @ 2009-05-03 20:36 UTC (permalink / raw)
  To: Benny Halevy
  Cc: Ricardo Labiaga, pnfs, linux-nfs, Rahul Iyer, Mike Sager,
	Marc Eshel, Andy Adamson

On Fri, May 01, 2009 at 02:05:50AM +0300, Benny Halevy wrote:
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 4e6d406..619764e 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -49,6 +49,7 @@
>  #include <linux/sunrpc/msg_prot.h>
>  #include <linux/sunrpc/svcsock.h>
>  #include <linux/sunrpc/stats.h>
> +#include <linux/sunrpc/xprt.h>
>  
>  #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
>  
> @@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>  	int		len;
>  	struct kvec *vec;
>  	int pnum, vlen;
> +	struct rpc_rqst *req = NULL;
>  
>  	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
>  		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
> @@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>  	len = svsk->sk_reclen;
>  	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
>  
> +	/*
> +	 * We have enough data for the whole tcp record. Let's try and read the
> +	 * first 8 bytes to get the xid and the call direction. We can use this
> +	 * to figure out if this is a call or a reply to a callback. If
> +	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
> +	 * In that case, don't bother with the calldir and just read the data.
> +	 * It will be rejected in svc_process.
> +	 */
> +
>  	vec = rqstp->rq_vec;
>  	vec[0] = rqstp->rq_arg.head[0];
>  	vlen = PAGE_SIZE;
> +
> +	if (len >= 8) {
> +		u32 *p;
> +		u32 xid;
> +		u32 calldir;

Style complaint: "Functions should be short and sweet, and do just one
thing."  The code in this "if" clause looks like something that could
easily be encapsulated in a helper function.
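Something along these lines, perhaps (completely untested; the helper name
and calling convention are just placeholders):

/*
 * Read the xid and call direction from the front of the record and, if
 * this is a reply to a backchannel request, look up the matching
 * rpc_rqst.  Returns the number of bytes read or a negative error.
 */
static int
svc_tcp_read_calldir(struct svc_rqst *rqstp, struct svc_sock *svsk,
		     struct kvec *vec, struct rpc_rqst **reqpp)
{
	struct rpc_rqst *req = NULL;
	u32 *p;
	u32 xid;
	u32 calldir;
	int len;

	len = svc_recvfrom(rqstp, vec, 1, 8);
	if (len < 0)
		return len;

	p = (u32 *)rqstp->rq_arg.head[0].iov_base;
	xid = *p++;
	calldir = *p;

	if (calldir && svsk->sk_bc_xprt)
		req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid);
	if (req) {
		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
			sizeof(struct xdr_buf));
		/* copy the xid and call direction */
		memcpy(req->rq_private_buf.head[0].iov_base,
			rqstp->rq_arg.head[0].iov_base, 8);
		vec[0] = req->rq_private_buf.head[0];
	} else if (calldir)
		printk(KERN_NOTICE
			"%s: Got unrecognized reply: "
			"calldir 0x%x sk_bc_xprt %p xid %08x\n",
			__func__, ntohl(calldir),
			svsk->sk_bc_xprt, xid);

	*reqpp = req;
	return len;
}

with the caller left to do just the vec[0]/len adjustments afterwards.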

> +
> +		len = svc_recvfrom(rqstp, vec, 1, 8);
> +		if (len < 0)
> +			goto error;
> +
> +		p = (u32 *)rqstp->rq_arg.head[0].iov_base;
> +		xid = *p++;
> +		calldir = *p;
> +
> +		if (calldir) {
> +			/* REPLY */
> +			if (svsk->sk_bc_xprt)
> +				req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid);
> +			if (req) {
> +				memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
> +					sizeof(struct xdr_buf));

This worries me.  Has anyone tested this with krb5p?  If not, please do.

> +				/* copy the xid and call direction */
> +				memcpy(req->rq_private_buf.head[0].iov_base,
> +					rqstp->rq_arg.head[0].iov_base, 8);
> +				vec[0] = req->rq_private_buf.head[0];
> +			} else
> +				printk(KERN_NOTICE
> +					"%s: Got unrecognized reply: "
> +					"calldir 0x%x sk_bc_xprt %p xid %08x\n",
> +					__func__, ntohl(calldir),
> +					svsk->sk_bc_xprt, xid);
> +		}

And another style nit:  other things being equal, I'd generally prefer

	if (exceptional case)
		handle it, goto/return
	if (another exceptional case)
		handle it, goto/return
	...
	normal case
	...

to

	if (normal case) {
		...
		normal case
		...
	} else
		handle exceptional case

--b.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks
  2009-04-30 23:06 ` [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks Benny Halevy
@ 2009-05-03 20:42   ` J. Bruce Fields
  2009-05-05  2:51     ` [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks Labiaga, Ricardo
  0 siblings, 1 reply; 29+ messages in thread
From: J. Bruce Fields @ 2009-05-03 20:42 UTC (permalink / raw)
  To: Benny Halevy; +Cc: Ricardo Labiaga, pnfs, linux-nfs

On Fri, May 01, 2009 at 02:06:01AM +0300, Benny Halevy wrote:
> From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
> 
> The callbacks will be sent using the same authentication flavor that
> was used during session creation.  We'll add code to remember the
> principal in the case RPCSEC_GSS in a separate patch.
> 
> Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> ---
>  fs/nfsd/nfs4state.c |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index cc9705b..ad30039 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -1284,6 +1284,7 @@ out_new:
>  	copy_verf(new, &verf);
>  	copy_cred(&new->cl_cred, &rqstp->rq_cred);
>  	new->cl_addr = ip_addr;
> +	new->cl_flavor = rqstp->rq_flavor;
>  	gen_clid(new);
>  	gen_confirm(new);
>  	add_to_unconfirmed(new, strhashval);

Most of the code here is duplicated between the 4.1 and 4.0 cases, and
this is just adding a line that was already there in the 4.0 case.  (The
same will be true for remembering the principal in the RPCSEC_GSS case.)
Could we move the rest of this initialization into create_client (giving
it some more arguments if necessary), and eliminate some code
duplication?
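
E.g. roughly (untested, and the argument list is just for illustration):

static struct nfs4_client *create_client(struct xdr_netobj name,
					 struct svc_rqst *rqstp,
					 nfs4_verifier *verf,
					 u32 ip_addr)
{
	struct nfs4_client *clp;

	/* alloc_client() as used by the existing code */
	clp = alloc_client(name);
	if (clp == NULL)
		return NULL;
	copy_verf(clp, verf);
	copy_cred(&clp->cl_cred, &rqstp->rq_cred);
	clp->cl_addr = ip_addr;
	clp->cl_flavor = rqstp->rq_flavor;
	/* ...and remember the RPCSEC_GSS principal here as well */
	return clp;
}

leaving the 4.0 and 4.1 paths to set up only what really differs between them.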

--b.

> -- 
> 1.6.2.1
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC 05/10] nfsd41: callback infrastructure
  2009-04-30 23:06 ` [RFC 05/10] nfsd41: callback infrastructure Benny Halevy
@ 2009-05-03 20:49   ` J. Bruce Fields
  0 siblings, 0 replies; 29+ messages in thread
From: J. Bruce Fields @ 2009-05-03 20:49 UTC (permalink / raw)
  To: Benny Halevy; +Cc: Ricardo Labiaga, pnfs, linux-nfs, Andy Adamson

On Fri, May 01, 2009 at 02:06:08AM +0300, Benny Halevy wrote:
> From: Andy Adamson <andros@netapp.com>
> 
> Keep the xprt used for create_session in cl_cb_xprt.
> Mark cl_callback.cb_minorversion = 1 and remember
> the client provided cl_callback.cb_prog rpc program number.
> Use it to probe the callback path.
> 
> Define xdr sizes and code nfs4_cb_compound header to be able
> to send a null callback rpc.
> 
> Signed-off-by: Andy Adamson<andros@netapp.com>
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> [get callback minorversion from fore channel's]
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> ---
>  fs/nfsd/nfs4callback.c     |   33 ++++++++++++++++++++++++++++++---
>  fs/nfsd/nfs4state.c        |   10 ++++++++++
>  include/linux/nfsd/state.h |    3 +++
>  3 files changed, 43 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index 5823b9a..6f1ca49 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -43,6 +43,7 @@
>  #include <linux/sunrpc/xdr.h>
>  #include <linux/sunrpc/svc.h>
>  #include <linux/sunrpc/clnt.h>
> +#include <linux/sunrpc/svcsock.h>
>  #include <linux/nfsd/nfsd.h>
>  #include <linux/nfsd/state.h>
>  #include <linux/sunrpc/sched.h>
> @@ -52,16 +53,19 @@
>  
>  #define NFSPROC4_CB_NULL 0
>  #define NFSPROC4_CB_COMPOUND 1
> +#define NFS4_STATEID_SIZE 16
>  
>  /* Index of predefined Linux callback client operations */
>  
>  enum {
> -        NFSPROC4_CLNT_CB_NULL = 0,
> +	NFSPROC4_CLNT_CB_NULL = 0,
>  	NFSPROC4_CLNT_CB_RECALL,
> +	NFSPROC4_CLNT_CB_SEQUENCE,
>  };
>  
>  enum nfs_cb_opnum4 {
>  	OP_CB_RECALL            = 4,
> +	OP_CB_SEQUENCE          = 11,
>  };
>  
>  #define NFS4_MAXTAGLEN		20
> @@ -70,15 +74,22 @@ enum nfs_cb_opnum4 {
>  #define NFS4_dec_cb_null_sz		0
>  #define cb_compound_enc_hdr_sz		4
>  #define cb_compound_dec_hdr_sz		(3 + (NFS4_MAXTAGLEN >> 2))
> +#define sessionid_sz			(NFS4_MAX_SESSIONID_LEN >> 2)
> +#define cb_sequence_enc_sz		(sessionid_sz + 4 +             \
> +					1 /* no referring calls list yet */)
> +#define cb_sequence_dec_sz		(op_dec_sz + sessionid_sz + 4)
> +
>  #define op_enc_sz			1
>  #define op_dec_sz			2
>  #define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
>  #define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
>  #define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz +       \
> +					cb_sequence_enc_sz +            \
>  					1 + enc_stateid_sz +            \
>  					enc_nfs4_fh_sz)
>  
>  #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
> +					cb_sequence_dec_sz +            \
>  					op_dec_sz)
>  
>  /*
> @@ -135,13 +146,19 @@ xdr_error:                                      \
>  		return -EIO; \
>  	} \
>  } while (0)
> +#define COPYMEM(x, nbytes) do {                \
> +	memcpy((x), p, nbytes);                \
> +	p += XDR_QUADLEN(nbytes);              \
> +} while (0)
>  
>  struct nfs4_cb_compound_hdr {
> -	int		status;
> -	u32		ident;
> +	/* args */
> +	u32		ident;	/* minorversion 0 only */
>  	u32		nops;
>  	__be32		*nops_p;
>  	u32		minorversion;
> +	/* res */
> +	int		status;
>  	u32		taglen;
>  	char		*tag;
>  };
> @@ -402,6 +419,15 @@ static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
>  	addr.sin_family = AF_INET;
>  	addr.sin_port = htons(cb->cb_port);
>  	addr.sin_addr.s_addr = htonl(cb->cb_addr);
> +	if (cb->cb_minorversion) {
> +		BUG_ON(cb->cb_minorversion != 1);

Is this really a likely mistake?  Let's just drop this BUG_ON().

--b.

> +		args.bc_sock = container_of(clp->cl_cb_xprt, struct svc_sock,
> +					    sk_xprt);
> +	}
> +
> +	dprintk("%s: program %s 0x%x nrvers %u version %u minorversion %u\n",
> +		__func__, args.program->name, args.prognumber,
> +		args.program->nrvers, args.version, cb->cb_minorversion);
>  
>  	/* Create RPC client */
>  	client = rpc_create(&args);
> @@ -441,6 +467,7 @@ static int do_probe_callback(void *data)
>  	put_nfs4_client(clp);
>  	return 0;
>  out_release_client:
> +	dprintk("NFSD: synchronous CB_NULL failed. status=%d\n", status);
>  	rpc_shutdown_client(client);
>  out_err:
>  	dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index ad30039..61d5c66 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -651,6 +651,8 @@ static inline void
>  free_client(struct nfs4_client *clp)
>  {
>  	shutdown_callback_client(clp);
> +	if (clp->cl_cb_xprt)
> +		svc_xprt_put(clp->cl_cb_xprt);
>  	nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
>  			     clp->cl_slot.sl_cache_entry.ce_resused);
>  	if (clp->cl_cred.cr_group_info)
> @@ -1391,6 +1393,14 @@ nfsd4_create_session(struct svc_rqst *rqstp,
>  		cr_ses->flags &= ~SESSION4_PERSIST;
>  		cr_ses->flags &= ~SESSION4_RDMA;
>  
> +		if (cr_ses->flags & SESSION4_BACK_CHAN) {
> +			unconf->cl_cb_xprt = rqstp->rq_xprt;
> +			svc_xprt_get(unconf->cl_cb_xprt);
> +			unconf->cl_callback.cb_minorversion =
> +				cstate->minorversion;
> +			unconf->cl_callback.cb_prog = cr_ses->callback_prog;
> +			nfsd4_probe_callback(unconf);
> +		}
>  		conf = unconf;
>  	} else {
>  		status = nfserr_stale_clientid;
> diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
> index 8762843..6bdf0d5 100644
> --- a/include/linux/nfsd/state.h
> +++ b/include/linux/nfsd/state.h
> @@ -203,6 +203,9 @@ struct nfs4_client {
>  	struct nfsd4_slot	cl_slot;	/* create_session slot */
>  	u32			cl_exchange_flags;
>  	struct nfs4_sessionid	cl_sessionid;
> +
> +	/* for nfs41 callbacks */
> +	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
>  };
>  
>  /* struct nfs4_client_reset
> -- 
> 1.6.2.1
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC 0/10] nfsd41 server backchannel for 2.6.31
  2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
                   ` (10 preceding siblings ...)
  2009-04-30 23:12 ` [pnfs] [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
@ 2009-05-03 20:53 ` J. Bruce Fields
  2009-05-06  4:11   ` Labiaga, Ricardo
  11 siblings, 1 reply; 29+ messages in thread
From: J. Bruce Fields @ 2009-05-03 20:53 UTC (permalink / raw)
  To: Benny Halevy; +Cc: pNFS Mailing List, NFS list, Ricardo Labiaga

On Fri, May 01, 2009 at 02:00:22AM +0300, Benny Halevy wrote:
> Bruce,
> 
> After squashing and merging Ricardo's latest patchset
> please review the following patchset and consider for 2.6.31.

Note this will conflict with the pending changes I have to make the
current callbacks use asynchronous rpc tasks instead of
kthreads--apologies.  I'll try to get those into for-2.6.31 tomorrow,
and then I hope you won't mind working on top of that?

--b.

> 
> Thanks,
> 
> Benny
> 
> [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines
> [RFC 02/10] nfsd: minorversion support for the back channel
> [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
> [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks
> [RFC 05/10] nfsd41: callback infrastructure
> [RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC arguments
> [RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue
> [RFC 08/10] nfsd41: Backchannel: Setup sequence information
> [RFC 09/10] nfsd41: cb_sequence callback
> [RFC 10/10] nfsd41: cb_recall callback

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks
  2009-05-03 20:42   ` J. Bruce Fields
@ 2009-05-05  2:51     ` Labiaga, Ricardo
  0 siblings, 0 replies; 29+ messages in thread
From: Labiaga, Ricardo @ 2009-05-05  2:51 UTC (permalink / raw)
  To: J. Bruce Fields, Benny Halevy; +Cc: pnfs, linux-nfs

> -----Original Message-----
> From: J. Bruce Fields [mailto:bfields@fieldses.org]
> Sent: Sunday, May 03, 2009 1:43 PM
> To: Benny Halevy
> Cc: Labiaga, Ricardo; pnfs@linux-nfs.org; linux-nfs@vger.kernel.org
> Subject: Re: [RFC 04/10] nfsd41: Remember the auth flavor to use
> forcallbacks
> 
> On Fri, May 01, 2009 at 02:06:01AM +0300, Benny Halevy wrote:
> > From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
> >
> > The callbacks will be sent using the same authentication flavor that
> > was used during session creation.  We'll add code to remember the
> > principal in the case RPCSEC_GSS in a separate patch.
> >
> > Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > ---
> >  fs/nfsd/nfs4state.c |    1 +
> >  1 files changed, 1 insertions(+), 0 deletions(-)
> >
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index cc9705b..ad30039 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -1284,6 +1284,7 @@ out_new:
> >  	copy_verf(new, &verf);
> >  	copy_cred(&new->cl_cred, &rqstp->rq_cred);
> >  	new->cl_addr = ip_addr;
> > +	new->cl_flavor = rqstp->rq_flavor;
> >  	gen_clid(new);
> >  	gen_confirm(new);
> >  	add_to_unconfirmed(new, strhashval);
> 
> Most of the code here is duplicated between the 4.1 and 4.0 cases, and
> this is just adding a line that was already there in the 4.0 case.  (The
> same will be true for remembering the principal in the RPCSEC_GSS case.)
> Could we move the rest of this initialization into create_client (giving
> it some more arguments if necessary), and eliminate some code
> duplication?

Sounds good.  I'll take care of that.

- ricardo

> --b.
> 
> > --
> > 1.6.2.1
> >

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC 0/10] nfsd41 server backchannel for 2.6.31
  2009-05-03 20:53 ` J. Bruce Fields
@ 2009-05-06  4:11   ` Labiaga, Ricardo
  2009-05-06 21:24     ` J. Bruce Fields
  0 siblings, 1 reply; 29+ messages in thread
From: Labiaga, Ricardo @ 2009-05-06  4:11 UTC (permalink / raw)
  To: J. Bruce Fields, Benny Halevy; +Cc: pNFS Mailing List, NFS list

It shouldn't be too bad.  I'll go ahead and rework the patches on top of
your asynchronous RPC callback changes.

- ricardo


On 5/3/09 1:53 PM, "J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Fri, May 01, 2009 at 02:00:22AM +0300, Benny Halevy wrote:
>> Bruce,
>> 
>> After squashing and merging Ricardo's latest patchset
>> please review the following patchset and consider for 2.6.31.
> 
> Note this will conflict with the pending changes I have to make the
> current callbacks use asynchronous rpc tasks instead of
> kthreads--apologies.  I'll try to get those into for-2.6.31 tomorrow,
> and then I hope you won't mind working on top of that?
> 
> --b.
> 
>> 
>> Thanks,
>> 
>> Benny
>> 
>> [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines
>> [RFC 02/10] nfsd: minorversion support for the back channel
>> [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
>> [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks
>> [RFC 05/10] nfsd41: callback infrastructure
>> [RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC
>> arguments
>> [RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue
>> [RFC 08/10] nfsd41: Backchannel: Setup sequence information
>> [RFC 09/10] nfsd41: cb_sequence callback
>> [RFC 10/10] nfsd41: cb_recall callback


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC 0/10] nfsd41 server backchannel for 2.6.31
  2009-05-06  4:11   ` Labiaga, Ricardo
@ 2009-05-06 21:24     ` J. Bruce Fields
  0 siblings, 0 replies; 29+ messages in thread
From: J. Bruce Fields @ 2009-05-06 21:24 UTC (permalink / raw)
  To: Labiaga, Ricardo; +Cc: Benny Halevy, pNFS Mailing List, NFS list

On Tue, May 05, 2009 at 09:11:28PM -0700, Labiaga, Ricardo wrote:
> It shouldn't be too bad.  I'll go ahead and rework the patches on top of
> your asynchronous RPC callback changes.

OK, thanks.  I've added them to for-2.6.31:

	git://linux-nfs.org/~bfields/linux.git for-2.6.31

So easiest for me would be if the patches are against that.

--b.

> 
> - ricardo
> 
> 
> On 5/3/09 1:53 PM, "J. Bruce Fields" <bfields@fieldses.org> wrote:
> 
> > On Fri, May 01, 2009 at 02:00:22AM +0300, Benny Halevy wrote:
> >> Bruce,
> >> 
> >> After squashing and merging Ricardo's latest patchset
> >> please review the following patchset and consider for 2.6.31.
> > 
> > Note this will conflict with the pending changes I have to make the
> > current callbacks use asynchronous rpc tasks instead of
> > kthreads--apologies.  I'll try to get those into for-2.6.31 tomorrow,
> > and then I hope you won't mind working on top of that?
> > 
> > --b.
> > 
> >> 
> >> Thanks,
> >> 
> >> Benny
> >> 
> >> [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines
> >> [RFC 02/10] nfsd: minorversion support for the back channel
> >> [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
> >> [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks
> >> [RFC 05/10] nfsd41: callback infrastructure
> >> [RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC
> >> arguments
> >> [RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue
> >> [RFC 08/10] nfsd41: Backchannel: Setup sequence information
> >> [RFC 09/10] nfsd41: cb_sequence callback
> >> [RFC 10/10] nfsd41: cb_recall callback
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
       [not found]     ` <1241136328.15476.124.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org>
  2009-05-01  0:13       ` Labiaga, Ricardo
@ 2009-06-02  0:33       ` Labiaga, Ricardo
       [not found]         ` <273FE88A07F5D445824060902F70034405FE3129-hX7t0kiaRRpT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
  1 sibling, 1 reply; 29+ messages in thread
From: Labiaga, Ricardo @ 2009-06-02  0:33 UTC (permalink / raw)
  To: Trond Myklebust, Benny Halevy
  Cc: Adamson, Andy, J. Bruce Fields, pnfs, linux-nfs

Trond, Bruce,

Alexandros has coded a number of patches that address the issues raised
here by Trond.  Do you want the fixes squashed into the original patch
or do you want them submitted separately?

Thanks,

- ricardo

> -----Original Message-----
> From: Trond Myklebust [mailto:trond.myklebust@fys.uio.no]
> Sent: Thursday, April 30, 2009 5:05 PM
> To: Benny Halevy
> Cc: Adamson, Andy; J. Bruce Fields; pnfs@linux-nfs.org; linux-
> nfs@vger.kernel.org
> Subject: Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side
> backchannel handling
> 
> On Fri, 2009-05-01 at 02:05 +0300, Benny Halevy wrote:
> > From: Rahul Iyer <iyer@netapp.com>
> >
> > FIXME: bhalevy: write up commit message
> >
> > Signed-off-by: Rahul Iyer <iyer@netapp.com>
> > Signed-off-by: Mike Sager <sager@netapp.com>
> > Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> >
> > When the call direction is a reply, copy the xid and call direction into the
> > req->rq_private_buf.head[0].iov_base otherwise rpc_verify_header returns
> > rpc_garbage.
> >
> > Signed-off-by: Andy Adamson <andros@netapp.com>
> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > [get rid of CONFIG_NFSD_V4_1]
> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > ---
> >  include/linux/sunrpc/clnt.h    |    1 +
> >  include/linux/sunrpc/svcsock.h |    1 +
> >  include/linux/sunrpc/xprt.h    |    2 +
> >  net/sunrpc/clnt.c              |    1 +
> >  net/sunrpc/svcsock.c           |   68 ++++++++++-
> >  net/sunrpc/xprt.c              |   41 ++++++-
> >  net/sunrpc/xprtsock.c          |  278
> +++++++++++++++++++++++++++++++++++++++-
> >  7 files changed, 381 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/linux/sunrpc/clnt.h
b/include/linux/sunrpc/clnt.h
> > index c39a210..cf9a8ec 100644
> > --- a/include/linux/sunrpc/clnt.h
> > +++ b/include/linux/sunrpc/clnt.h
> > @@ -110,6 +110,7 @@ struct rpc_create_args {
> >  	rpc_authflavor_t	authflavor;
> >  	unsigned long		flags;
> >  	char			*client_name;
> > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
*/
> >  };
> >
> >  /* Values for "flags" field */
> > diff --git a/include/linux/sunrpc/svcsock.h
> b/include/linux/sunrpc/svcsock.h
> > index 8271631..19228f4 100644
> > --- a/include/linux/sunrpc/svcsock.h
> > +++ b/include/linux/sunrpc/svcsock.h
> > @@ -28,6 +28,7 @@ struct svc_sock {
> >  	/* private TCP part */
> >  	u32			sk_reclen;	/* length of record */
> >  	u32			sk_tcplen;	/* current read length
*/
> > +	struct rpc_xprt	       *sk_bc_xprt;	/* NFSv4.1 backchannel
xprt
> */
> >  };
> >
> >  /*
> > diff --git a/include/linux/sunrpc/xprt.h
b/include/linux/sunrpc/xprt.h
> > index 1758d9f..063a6a7 100644
> > --- a/include/linux/sunrpc/xprt.h
> > +++ b/include/linux/sunrpc/xprt.h
> > @@ -174,6 +174,7 @@ struct rpc_xprt {
> >  	spinlock_t		reserve_lock;	/* lock slot table */
> >  	u32			xid;		/* Next XID value to use
*/
> >  	struct rpc_task *	snd_task;	/* Task blocked in send
*/
> > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
*/
> >  	struct list_head	recv;
> >
> >  	struct {
> > @@ -197,6 +198,7 @@ struct xprt_create {
> >  	struct sockaddr *	srcaddr;	/* optional local
address */
> >  	struct sockaddr *	dstaddr;	/* remote peer address
*/
> >  	size_t			addrlen;
> > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
*/
> >  };
> >
> >  struct xprt_class {
> > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> > index 5abab09..3dc847f 100644
> > --- a/net/sunrpc/clnt.c
> > +++ b/net/sunrpc/clnt.c
> > @@ -266,6 +266,7 @@ struct rpc_clnt *rpc_create(struct
rpc_create_args
> *args)
> >  		.srcaddr = args->saddress,
> >  		.dstaddr = args->address,
> >  		.addrlen = args->addrsize,
> > +		.bc_sock = args->bc_sock,
> >  	};
> >  	char servername[48];
> >
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index 4e6d406..619764e 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -49,6 +49,7 @@
> >  #include <linux/sunrpc/msg_prot.h>
> >  #include <linux/sunrpc/svcsock.h>
> >  #include <linux/sunrpc/stats.h>
> > +#include <linux/sunrpc/xprt.h>
> >
> >  #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
> >
> > @@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
*rqstp)
> >  	int		len;
> >  	struct kvec *vec;
> >  	int pnum, vlen;
> > +	struct rpc_rqst *req = NULL;
> >
> >  	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
> >  		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
> > @@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct svc_rqst
> *rqstp)
> >  	len = svsk->sk_reclen;
> >  	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> >
> > +	/*
> > +	 * We have enough data for the whole tcp record. Let's try and read the
> > +	 * first 8 bytes to get the xid and the call direction. We can use this
> > +	 * to figure out if this is a call or a reply to a callback. If
> > +	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
> > +	 * In that case, don't bother with the calldir and just read the data.
> > +	 * It will be rejected in svc_process.
> > +	 */
> > +
> >  	vec = rqstp->rq_vec;
> >  	vec[0] = rqstp->rq_arg.head[0];
> >  	vlen = PAGE_SIZE;
> > +
> > +	if (len >= 8) {
> > +		u32 *p;
> > +		u32 xid;
> > +		u32 calldir;
> > +
> > +		len = svc_recvfrom(rqstp, vec, 1, 8);
> > +		if (len < 0)
> > +			goto error;
> > +
> > +		p = (u32 *)rqstp->rq_arg.head[0].iov_base;
> > +		xid = *p++;
> > +		calldir = *p;
> > +
> > +		if (calldir) {
> > +			/* REPLY */
> > +			if (svsk->sk_bc_xprt)
> > +				req = xprt_lookup_rqst(svsk->sk_bc_xprt,
xid);
> > +			if (req) {
> > +				memcpy(&req->rq_private_buf,
&req->rq_rcv_buf,
> > +					sizeof(struct xdr_buf));
> > +				/* copy the xid and call direction */
> > +
memcpy(req->rq_private_buf.head[0].iov_base,
> > +					rqstp->rq_arg.head[0].iov_base,
8);
> > +				vec[0] = req->rq_private_buf.head[0];
> > +			} else
> > +				printk(KERN_NOTICE
> > +					"%s: Got unrecognized reply: "
> > +					"calldir 0x%x sk_bc_xprt %p xid
%08x\n",
> > +					__func__, ntohl(calldir),
> > +					svsk->sk_bc_xprt, xid);
> > +		}
> > +
> > +		if (!calldir || !req)
> > +			vec[0] = rqstp->rq_arg.head[0];
> > +
> > +		vec[0].iov_base += 8;
> > +		vec[0].iov_len -= 8;
> > +		len = svsk->sk_reclen - 8;
> > +		vlen -= 8;
> > +	}
> > +
> >  	pnum = 1;
> >  	while (vlen < len) {
> > -		vec[pnum].iov_base =
page_address(rqstp->rq_pages[pnum]);
> > +		vec[pnum].iov_base = (req) ?
> > +			page_address(req->rq_private_buf.pages[pnum -
1]) :
> > +			page_address(rqstp->rq_pages[pnum]);
> >  		vec[pnum].iov_len = PAGE_SIZE;
> >  		pnum++;
> >  		vlen += PAGE_SIZE;
> > @@ -908,6 +963,16 @@ static int svc_tcp_recvfrom(struct svc_rqst
*rqstp)
> >  	if (len < 0)
> >  		goto error;
> >
> > +	/*
> > +	 * Account for the 8 bytes we read earlier
> > +	 */
> > +	len += 8;
> > +
> > +	if (req) {
> > +		xprt_complete_rqst(req->rq_task, len);
> > +		len = 0;
> > +		goto out;
> > +	}
> >  	dprintk("svc: TCP complete record (%d bytes)\n", len);
> >  	rqstp->rq_arg.len = len;
> >  	rqstp->rq_arg.page_base = 0;
> > @@ -921,6 +986,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
*rqstp)
> >  	rqstp->rq_xprt_ctxt   = NULL;
> >  	rqstp->rq_prot	      = IPPROTO_TCP;
> >
> > +out:
> >  	/* Reset TCP read info */
> >  	svsk->sk_reclen = 0;
> >  	svsk->sk_tcplen = 0;
> > diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> > index a0bfe53..03f175e 100644
> > --- a/net/sunrpc/xprt.c
> > +++ b/net/sunrpc/xprt.c
> > @@ -1015,6 +1015,27 @@ void xprt_release(struct rpc_task *task)
> >  	spin_unlock(&xprt->reserve_lock);
> >  }
> >
> > +/*
> > + * The autoclose function for the back channel
> > + *
> > + * The callback channel should never close the channel,
> > + * let the forechannel do that.
> > + */
> > +static void bc_autoclose(struct work_struct *work)
> > +{
> > +	return;
> > +}
> > +
> > +
> > +/*
> > + * The autodisconnect routine for the back channel. We never
disconnect
> > + */
> > +static void
> > +bc_init_autodisconnect(unsigned long data)
> > +{
> > +	return;
> > +}
> > +
> >  /**
> >   * xprt_create_transport - create an RPC transport
> >   * @args: rpc transport creation arguments
> > @@ -1051,9 +1072,16 @@ found:
> >
> >  	INIT_LIST_HEAD(&xprt->free);
> >  	INIT_LIST_HEAD(&xprt->recv);
> > -	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> > -	setup_timer(&xprt->timer, xprt_init_autodisconnect,
> > -			(unsigned long)xprt);
> > +	if (args->bc_sock) {
> > +		INIT_WORK(&xprt->task_cleanup, bc_autoclose);
> > +		setup_timer(&xprt->timer, bc_init_autodisconnect,
> > +			    (unsigned long)xprt);
> 
> Hrmph... Why do you need dummy routines here?
> 
> > +	} else {
> > +		INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> > +		setup_timer(&xprt->timer, xprt_init_autodisconnect,
> > +			    (unsigned long)xprt);
> > +	}
> > +
> >  	xprt->last_used = jiffies;
> >  	xprt->cwnd = RPC_INITCWND;
> >  	xprt->bind_index = 0;
> > @@ -1073,6 +1101,13 @@ found:
> >  	dprintk("RPC:       created transport %p with %u slots\n", xprt,
> >  			xprt->max_reqs);
> >
> > +	/*
> > +	 * Since we don't want connections for the backchannel, we set
> > +	 * the xprt status to connected
> > +	 */
> > +	if (args->bc_sock)
> > +		xprt_set_connected(xprt);
> > +
> >  	return xprt;
> >  }
> >
> > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > index d40ff50..067d205 100644
> > --- a/net/sunrpc/xprtsock.c
> > +++ b/net/sunrpc/xprtsock.c
> > @@ -32,6 +32,7 @@
> >  #include <linux/tcp.h>
> >  #include <linux/sunrpc/clnt.h>
> >  #include <linux/sunrpc/sched.h>
> > +#include <linux/sunrpc/svcsock.h>
> >  #include <linux/sunrpc/xprtsock.h>
> >  #include <linux/file.h>
> >
> > @@ -1966,6 +1967,219 @@ static void xs_tcp_print_stats(struct
rpc_xprt
> *xprt, struct seq_file *seq)
> >  			xprt->stat.bklog_u);
> >  }
> >
> > +/*
> > + * The connect worker for the backchannel
> > + * This should never be called as we should never need to connect
> > + */
> > +static void bc_connect_worker(struct work_struct *work)
> > +{
> > +	BUG();
> > +}
> > +
> > +/*
> > + * The set_port routine of the rpc_xprt_ops. This is related to the
> portmapper
> > + * and should never be called
> > + */
> > +
> > +static void bc_set_port(struct rpc_xprt *xprt, unsigned short port)
> > +{
> > +	BUG();
> > +}
> > +
> > +/*
> > + * The connect routine for the backchannel rpc_xprt ops
> > + * Again, should never be called!
> > + */
> > +
> > +static void bc_connect(struct rpc_task *task)
> > +{
> > +	BUG();
> > +}
> > +
> > +struct rpc_buffer {
> > +	size_t	len;
> > +	char	data[];
> > +};
> > +/*
> > + * Allocate a bunch of pages for a scratch buffer for the rpc code. The
> > + * reason we allocate pages instead of doing a kmalloc like rpc_malloc is
> > + * because we want to use the server side send routines.
> > + */
> > +void *bc_malloc(struct rpc_task *task, size_t size)
> > +{
> > +	struct page *page;
> > +	struct rpc_buffer *buf;
> > +
> > +	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
> > +	page = alloc_page(GFP_KERNEL);
> > +
> > +	if (!page)
> > +		return NULL;
> > +
> > +	buf = page_address(page);
> > +	buf->len = PAGE_SIZE;
> > +
> > +	return buf->data;
> > +}
> > +
> 
> __get_free_page()? Why can't you kmalloc() here?
> 
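A minimal sketch of the kmalloc()-based variant being hinted at here, mirroring rpc_malloc (hypothetical and untested; it only helps if the send path below stops feeding the head to sendpage()):

	static void *bc_malloc(struct rpc_task *task, size_t size)
	{
		struct rpc_buffer *buf;

		/* kmalloc header plus payload in one go, like rpc_malloc does */
		buf = kmalloc(sizeof(*buf) + size, GFP_KERNEL);
		if (!buf)
			return NULL;
		buf->len = sizeof(*buf) + size;
		return buf->data;
	}

(the matching buf_free would then just kfree(container_of(buffer, struct rpc_buffer, data)).)
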
> > +/*
> > + * Free the space allocated in the bc_alloc routine
> > + */
> > +void bc_free(void *buffer)
> > +{
> > +	struct rpc_buffer *buf;
> > +
> > +	if (!buffer)
> > +		return;
> > +
> > +	buf = container_of(buffer, struct rpc_buffer, data);
> > +	free_pages((unsigned long)buf, get_order(buf->len));
> 
> This looks funky... Why can't you just call free_page()? You already
> know from bc_malloc() that this is an order 0 page allocation.
> 
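A sketch of the simplification being suggested: since bc_malloc() only ever hands out a single order-0 page, the free side could simply be (untested):

	void bc_free(void *buffer)
	{
		if (!buffer)
			return;
		/* the rpc_buffer header sits at the start of the page that
		 * bc_malloc() allocated, so its address is the page address */
		free_page((unsigned long)container_of(buffer, struct rpc_buffer, data));
	}
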
> > +}
> > +
> > +/*
> > + * Use the svc_sock to send the callback. Must be called with svsk-
> >sk_mutex
> > + * held. Borrows heavily from svc_tcp_sendto and
xs_tcp_semd_request.
> > + */
> > +static int bc_sendto(struct rpc_rqst *req)
> > +{
> > +	int total_len;
> > +	int len;
> > +	int size;
> > +	int result;
> > +	struct xdr_buf *xbufp = &req->rq_snd_buf;
> > +	struct page **pages = xbufp->pages;
> > +	unsigned int flags = MSG_MORE;
> > +	unsigned int pglen = xbufp->page_len;
> > +	size_t base = xbufp->page_base;
> > +	struct rpc_xprt *xprt = req->rq_xprt;
> > +	struct sock_xprt *transport =
> > +				container_of(xprt, struct sock_xprt,
xprt);
> > +	struct socket *sock = transport->sock;
> > +
> > +	total_len = xbufp->len;
> > +
> > +	/*
> > +	 * Set up the rpc header and record marker stuff
> > +	 */
> > +	xs_encode_tcp_record_marker(xbufp);
> > +
> > +	/*
> > +	 * The RPC message is divided into 3 pieces:
> > +	 * - The header: This is what most of the smaller RPC messages consist
> > +	 *   of. Often the whole message is in this.
> > +	 *
> > +	 * - xdr->pages: This is a list of pages that contain data, for
> > +	 *   example in a write request or while using rpcsec gss
> > +	 *
> > +	 * - The tail: This is the rest of the rpc message
> > +	 *
> > +	 * First we send the header, then the pages and then finally the tail.
> > +	 * The code borrows heavily from svc_sendto.
> > +	 */
> > +
> > +	/*
> > +	 * Send the head
> > +	 */
> > +	if (total_len == xbufp->head[0].iov_len)
> > +		flags = 0;
> > +
> > +	len = sock->ops->sendpage(sock, virt_to_page(xbufp-
> >head[0].iov_base),
> > +			(unsigned long)xbufp->head[0].iov_base &
~PAGE_MASK,
> > +			xbufp->head[0].iov_len, flags);
> 
> Why do you need to do this? The head iovec is supposed to be reserved
> for kmalloc()ed memory, which cannot be used together with sendpage().
> Somebody, some day is going to mess up and try to put a kmalloced buffer
> in here, and will wonder why the above doesn't work.
> 
> If you are sending pages, then please put them in the page list part of
> the xdr_buf. There is no rule that the RPC call _must_ have a non-zero
> head.
> 
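One way to honor that split, sketched with a hypothetical helper (untested): copy the possibly kmalloc()ed head with kernel_sendmsg() and keep sendpage() for the page list only.

	static int bc_send_head(struct socket *sock, struct kvec *head,
				unsigned int flags)
	{
		struct msghdr msg = { .msg_flags = flags };

		/* the head may point at kmalloc()ed memory, so copy it via
		 * kernel_sendmsg() rather than mapping it with sendpage() */
		return kernel_sendmsg(sock, &msg, head, 1, head->iov_len);
	}
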
> > +
> > +	if (len != xbufp->head[0].iov_len)
> > +		goto out;
> > +
> > +	/*
> > +	 * send page data
> > +	 *
> > +	 * Check the amount of data to be sent. If it is less than the
> > +	 * remaining page, then send it else send the current page
> > +	 */
> > +
> > +	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
> > +	while (pglen > 0) {
> > +		if (total_len == size)
> > +			flags = 0;
> > +		result = sock->ops->sendpage(sock, *pages, base, size,
flags);
> > +		if (result > 0)
> > +			len += result;
> > +		if (result != size)
> > +			goto out;
> > +		total_len -= size;
> > +		pglen -= size;
> > +		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
> > +		base = 0;
> > +		pages++;
> > +	}
> > +	/*
> > +	 * send tail
> > +	 */
> > +	if (xbufp->tail[0].iov_len) {
> > +		result = sock->ops->sendpage(sock,
> > +			xbufp->tail[0].iov_base,
> > +			(unsigned long)xbufp->tail[0].iov_base &
~PAGE_MASK,
> > +			xbufp->tail[0].iov_len,
> > +			0);
> 
> Ditto.
> 
> > +
> > +		if (result > 0)
> > +			len += result;
> > +	}
> > +out:
> > +	if (len != xbufp->len)
> > +		printk(KERN_NOTICE "Error sending entire callback!\n");
>              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Then what? Shouldn't you be closing the connection here?
> 
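If the connection really should be torn down on a short send, one hypothetical shape for that (untested, assuming the backchannel's svc_sock is reachable from the caller, as it is in bc_send_request below):

	static void bc_close_fore_channel(struct svc_sock *svsk)
	{
		/* mark the shared fore-channel transport for closing and let
		 * the server threads tear it down */
		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
		svc_xprt_enqueue(&svsk->sk_xprt);
	}
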
> > +
> > +	return len;
> > +}
> > +
> > +/*
> > + * The send routine. Borrows from svc_send
> > + */
> > +static int bc_send_request(struct rpc_task *task)
> > +{
> > +	struct rpc_rqst *req = task->tk_rqstp;
> > +	struct rpc_xprt *bc_xprt = req->rq_xprt;
> > +	struct svc_xprt	*xprt;
> > +	struct svc_sock         *svsk;
> > +	u32                     len;
> > +
> > +	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
> > +	/*
> > +	 * Get the server socket associated with this callback xprt
> > +	 */
> > +	svsk = bc_xprt->bc_sock;
> > +	xprt = &svsk->sk_xprt;
> > +
> > +	mutex_lock(&xprt->xpt_mutex);
>           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> Eh? What's this, in which patch is it defined, and why is it at all
> needed?
> 
> > +	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
>                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
> Where is this defined, and why is it needed? The xprt already has a
> connected/unconnected flag.
> 
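For comparison, a sketch of what testing the rpc_xprt's own connection state (rather than the svc_xprt flags) might look like inside bc_send_request; untested:

	if (!xprt_connected(bc_xprt))
		len = -ENOTCONN;
	else
		len = bc_sendto(req);
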
> > +		len = -ENOTCONN;
> > +	else
> > +		len = bc_sendto(req);
> > +	mutex_unlock(&xprt->xpt_mutex);
> > +
> > +	return 0;
> > +
> > +}
> > +
> > +/*
> > + * The close routine. Since this is client initiated, we do nothing
> > + */
> > +
> > +static void bc_close(struct rpc_xprt *xprt)
> > +{
> > +	return;
> > +}
> > +
> > +/*
> > + * The xprt destroy routine. Again, because this connection is
client
> > + * initiated, we do nothing
> > + */
> > +
> > +static void bc_destroy(struct rpc_xprt *xprt)
> > +{
> > +	return;
> > +}
> > +
> >  static struct rpc_xprt_ops xs_udp_ops = {
> >  	.set_buffer_size	= xs_udp_set_buffer_size,
> >  	.reserve_xprt		= xprt_reserve_xprt_cong,
> > @@ -1999,6 +2213,24 @@ static struct rpc_xprt_ops xs_tcp_ops = {
> >  	.print_stats		= xs_tcp_print_stats,
> >  };
> >
> > +/*
> > + * The rpc_xprt_ops for the server backchannel
> > + */
> > +
> > +static struct rpc_xprt_ops bc_tcp_ops = {
> > +	.reserve_xprt		= xprt_reserve_xprt,
> > +	.release_xprt		= xprt_release_xprt,
> > +	.set_port		= bc_set_port,
> > +	.connect		= bc_connect,
> > +	.buf_alloc		= bc_malloc,
> > +	.buf_free		= bc_free,
> > +	.send_request		= bc_send_request,
> > +	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
> > +	.close			= bc_close,
> > +	.destroy		= bc_destroy,
> > +	.print_stats		= xs_tcp_print_stats,
> > +};
> > +
> >  static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
> >  				      unsigned int slot_table_size)
> >  {
> > @@ -2131,13 +2363,29 @@ static struct rpc_xprt *xs_setup_tcp(struct
> xprt_create *args)
> >  	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
> >  	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
> >
> > -	xprt->bind_timeout = XS_BIND_TO;
> > -	xprt->connect_timeout = XS_TCP_CONN_TO;
> > -	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> > -	xprt->idle_timeout = XS_IDLE_DISC_TO;
> > +	if (args->bc_sock) {
> > +		/* backchannel */
> > +		xprt_set_bound(xprt);
> > +		INIT_DELAYED_WORK(&transport->connect_worker,
> > +				  bc_connect_worker);
> 
> Errm.... Is it really such a good idea to tell the RPC layer that it can
> reconnect at any time using a routine that will BUG()?
> 
> > +		xprt->bind_timeout = 0;
> > +		xprt->connect_timeout = 0;
> > +		xprt->reestablish_timeout = 0;
> > +		xprt->idle_timeout = (~0);
> >
> > -	xprt->ops = &xs_tcp_ops;
> > -	xprt->timeout = &xs_tcp_default_timeout;
> > +		/*
> > +		 * The backchannel uses the same socket connection as
the
> > +		 * forechannel
> > +		 */
> > +		xprt->bc_sock = args->bc_sock;
> > +		xprt->bc_sock->sk_bc_xprt = xprt;
> > +		transport->sock = xprt->bc_sock->sk_sock;
> > +		transport->inet = xprt->bc_sock->sk_sk;
> > +
> > +		xprt->ops = &bc_tcp_ops;
> > +
> > +		goto next;
> > +	}
> >
> >  	switch (addr->sa_family) {
> >  	case AF_INET:
> > @@ -2145,13 +2393,29 @@ static struct rpc_xprt *xs_setup_tcp(struct
> xprt_create *args)
> >  			xprt_set_bound(xprt);
> >
> >  		INIT_DELAYED_WORK(&transport->connect_worker,
> xs_tcp_connect_worker4);
> > -		xs_format_ipv4_peer_addresses(xprt, "tcp",
RPCBIND_NETID_TCP);
> >  		break;
> >  	case AF_INET6:
> >  		if (((struct sockaddr_in6 *)addr)->sin6_port !=
htons(0))
> >  			xprt_set_bound(xprt);
> >
> >  		INIT_DELAYED_WORK(&transport->connect_worker,
> xs_tcp_connect_worker6);
> > +		break;
> > +	}
> > +	xprt->bind_timeout = XS_BIND_TO;
> > +	xprt->connect_timeout = XS_TCP_CONN_TO;
> > +	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> > +	xprt->idle_timeout = XS_IDLE_DISC_TO;
> > +
> > +	xprt->ops = &xs_tcp_ops;
> > +
> > +next:
> > +	xprt->timeout = &xs_tcp_default_timeout;
> > +
> > +	switch (addr->sa_family) {
> 
> Why do we suddenly need 2 switch statements here?
> 
> > +	case AF_INET:
> > +		xs_format_ipv4_peer_addresses(xprt, "tcp",
RPCBIND_NETID_TCP);
> > +		break;
> > +	case AF_INET6:
> >  		xs_format_ipv6_peer_addresses(xprt, "tcp",
RPCBIND_NETID_TCP6);
> >  		break;
> >  	default:
> 
> 
> _______________________________________________
> pNFS mailing list
> pNFS@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
       [not found]         ` <273FE88A07F5D445824060902F70034405FE3129-hX7t0kiaRRpT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
@ 2009-06-02  0:52           ` J. Bruce Fields
  2009-06-02  1:24             ` [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Labiaga, Ricardo
  2009-06-02  4:51           ` [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Benny Halevy
  1 sibling, 1 reply; 29+ messages in thread
From: J. Bruce Fields @ 2009-06-02  0:52 UTC (permalink / raw)
  To: Labiaga, Ricardo
  Cc: Trond Myklebust, Benny Halevy, Adamson, Andy, linux-nfs, pnfs

On Mon, Jun 01, 2009 at 05:33:34PM -0700, Labiaga, Ricardo wrote:
> Trond, Bruce,
> 
> Alexandros has coded a number of patches that address the issues raised
> here by Trond.  Do you want the fixes squashed into the original patch
> or do you want them submitted separately.

I'd prefer the former.

--b.

> 
> Thanks,
> 
> - ricardo
> 
> > -----Original Message-----
> > From: Trond Myklebust [mailto:trond.myklebust@fys.uio.no]
> > Sent: Thursday, April 30, 2009 5:05 PM
> > To: Benny Halevy
> > Cc: Adamson, Andy; J. Bruce Fields; pnfs@linux-nfs.org; linux-
> > nfs@vger.kernel.org
> > Subject: Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side
> > backchannel handling
> > 
> > On Fri, 2009-05-01 at 02:05 +0300, Benny Halevy wrote:
> > > From: Rahul Iyer <iyer@netapp.com>
> > >
> > > FIXME: bhalevy: write up commit message
> > >
> > > Signed-off-by: Rahul Iyer <iyer@netapp.com>
> > > Signed-off-by: Mike Sager <sager@netapp.com>
> > > Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
> > > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > >
> > > When the call direction is a reply, copy the xid and call direction
> into
> > the
> > > req->rq_private_buf.head[0].iov_base otherwise rpc_verify_header
> returns
> > > rpc_garbage.
> > >
> > > Signed-off-by: Andy Adamson <andros@netapp.com>
> > > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > > [get rid of CONFIG_NFSD_V4_1]
> > > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > > ---
> > >  include/linux/sunrpc/clnt.h    |    1 +
> > >  include/linux/sunrpc/svcsock.h |    1 +
> > >  include/linux/sunrpc/xprt.h    |    2 +
> > >  net/sunrpc/clnt.c              |    1 +
> > >  net/sunrpc/svcsock.c           |   68 ++++++++++-
> > >  net/sunrpc/xprt.c              |   41 ++++++-
> > >  net/sunrpc/xprtsock.c          |  278
> > +++++++++++++++++++++++++++++++++++++++-
> > >  7 files changed, 381 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/include/linux/sunrpc/clnt.h
> b/include/linux/sunrpc/clnt.h
> > > index c39a210..cf9a8ec 100644
> > > --- a/include/linux/sunrpc/clnt.h
> > > +++ b/include/linux/sunrpc/clnt.h
> > > @@ -110,6 +110,7 @@ struct rpc_create_args {
> > >  	rpc_authflavor_t	authflavor;
> > >  	unsigned long		flags;
> > >  	char			*client_name;
> > > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
> */
> > >  };
> > >
> > >  /* Values for "flags" field */
> > > diff --git a/include/linux/sunrpc/svcsock.h
> > b/include/linux/sunrpc/svcsock.h
> > > index 8271631..19228f4 100644
> > > --- a/include/linux/sunrpc/svcsock.h
> > > +++ b/include/linux/sunrpc/svcsock.h
> > > @@ -28,6 +28,7 @@ struct svc_sock {
> > >  	/* private TCP part */
> > >  	u32			sk_reclen;	/* length of record */
> > >  	u32			sk_tcplen;	/* current read length
> */
> > > +	struct rpc_xprt	       *sk_bc_xprt;	/* NFSv4.1 backchannel
> xprt
> > */
> > >  };
> > >
> > >  /*
> > > diff --git a/include/linux/sunrpc/xprt.h
> b/include/linux/sunrpc/xprt.h
> > > index 1758d9f..063a6a7 100644
> > > --- a/include/linux/sunrpc/xprt.h
> > > +++ b/include/linux/sunrpc/xprt.h
> > > @@ -174,6 +174,7 @@ struct rpc_xprt {
> > >  	spinlock_t		reserve_lock;	/* lock slot table */
> > >  	u32			xid;		/* Next XID value to use
> */
> > >  	struct rpc_task *	snd_task;	/* Task blocked in send
> */
> > > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
> */
> > >  	struct list_head	recv;
> > >
> > >  	struct {
> > > @@ -197,6 +198,7 @@ struct xprt_create {
> > >  	struct sockaddr *	srcaddr;	/* optional local
> address */
> > >  	struct sockaddr *	dstaddr;	/* remote peer address
> */
> > >  	size_t			addrlen;
> > > +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
> */
> > >  };
> > >
> > >  struct xprt_class {
> > > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> > > index 5abab09..3dc847f 100644
> > > --- a/net/sunrpc/clnt.c
> > > +++ b/net/sunrpc/clnt.c
> > > @@ -266,6 +266,7 @@ struct rpc_clnt *rpc_create(struct
> rpc_create_args
> > *args)
> > >  		.srcaddr = args->saddress,
> > >  		.dstaddr = args->address,
> > >  		.addrlen = args->addrsize,
> > > +		.bc_sock = args->bc_sock,
> > >  	};
> > >  	char servername[48];
> > >
> > > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > > index 4e6d406..619764e 100644
> > > --- a/net/sunrpc/svcsock.c
> > > +++ b/net/sunrpc/svcsock.c
> > > @@ -49,6 +49,7 @@
> > >  #include <linux/sunrpc/msg_prot.h>
> > >  #include <linux/sunrpc/svcsock.h>
> > >  #include <linux/sunrpc/stats.h>
> > > +#include <linux/sunrpc/xprt.h>
> > >
> > >  #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
> > >
> > > @@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
> *rqstp)
> > >  	int		len;
> > >  	struct kvec *vec;
> > >  	int pnum, vlen;
> > > +	struct rpc_rqst *req = NULL;
> > >
> > >  	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
> > >  		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
> > > @@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct svc_rqst
> > *rqstp)
> > >  	len = svsk->sk_reclen;
> > >  	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> > >
> > > +	/*
> > > +	 * We have enough data for the whole tcp record. Let's try and
> read
> > the
> > > +	 * first 8 bytes to get the xid and the call direction. We can
> use
> > this
> > > +	 * to figure out if this is a call or a reply to a callback. If
> > > +	 * sk_reclen is < 8 (xid and calldir), then this is a malformed
> > packet.
> > > +	 * In that case, don't bother with the calldir and just read the
> > data.
> > > +	 * It will be rejected in svc_process.
> > > +	 */
> > > +
> > >  	vec = rqstp->rq_vec;
> > >  	vec[0] = rqstp->rq_arg.head[0];
> > >  	vlen = PAGE_SIZE;
> > > +
> > > +	if (len >= 8) {
> > > +		u32 *p;
> > > +		u32 xid;
> > > +		u32 calldir;
> > > +
> > > +		len = svc_recvfrom(rqstp, vec, 1, 8);
> > > +		if (len < 0)
> > > +			goto error;
> > > +
> > > +		p = (u32 *)rqstp->rq_arg.head[0].iov_base;
> > > +		xid = *p++;
> > > +		calldir = *p;
> > > +
> > > +		if (calldir) {
> > > +			/* REPLY */
> > > +			if (svsk->sk_bc_xprt)
> > > +				req = xprt_lookup_rqst(svsk->sk_bc_xprt,
> xid);
> > > +			if (req) {
> > > +				memcpy(&req->rq_private_buf,
> &req->rq_rcv_buf,
> > > +					sizeof(struct xdr_buf));
> > > +				/* copy the xid and call direction */
> > > +
> memcpy(req->rq_private_buf.head[0].iov_base,
> > > +					rqstp->rq_arg.head[0].iov_base,
> 8);
> > > +				vec[0] = req->rq_private_buf.head[0];
> > > +			} else
> > > +				printk(KERN_NOTICE
> > > +					"%s: Got unrecognized reply: "
> > > +					"calldir 0x%x sk_bc_xprt %p xid
> %08x\n",
> > > +					__func__, ntohl(calldir),
> > > +					svsk->sk_bc_xprt, xid);
> > > +		}
> > > +
> > > +		if (!calldir || !req)
> > > +			vec[0] = rqstp->rq_arg.head[0];
> > > +
> > > +		vec[0].iov_base += 8;
> > > +		vec[0].iov_len -= 8;
> > > +		len = svsk->sk_reclen - 8;
> > > +		vlen -= 8;
> > > +	}
> > > +
> > >  	pnum = 1;
> > >  	while (vlen < len) {
> > > -		vec[pnum].iov_base =
> page_address(rqstp->rq_pages[pnum]);
> > > +		vec[pnum].iov_base = (req) ?
> > > +			page_address(req->rq_private_buf.pages[pnum -
> 1]) :
> > > +			page_address(rqstp->rq_pages[pnum]);
> > >  		vec[pnum].iov_len = PAGE_SIZE;
> > >  		pnum++;
> > >  		vlen += PAGE_SIZE;
> > > @@ -908,6 +963,16 @@ static int svc_tcp_recvfrom(struct svc_rqst
> *rqstp)
> > >  	if (len < 0)
> > >  		goto error;
> > >
> > > +	/*
> > > +	 * Account for the 8 bytes we read earlier
> > > +	 */
> > > +	len += 8;
> > > +
> > > +	if (req) {
> > > +		xprt_complete_rqst(req->rq_task, len);
> > > +		len = 0;
> > > +		goto out;
> > > +	}
> > >  	dprintk("svc: TCP complete record (%d bytes)\n", len);
> > >  	rqstp->rq_arg.len = len;
> > >  	rqstp->rq_arg.page_base = 0;
> > > @@ -921,6 +986,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
> *rqstp)
> > >  	rqstp->rq_xprt_ctxt   = NULL;
> > >  	rqstp->rq_prot	      = IPPROTO_TCP;
> > >
> > > +out:
> > >  	/* Reset TCP read info */
> > >  	svsk->sk_reclen = 0;
> > >  	svsk->sk_tcplen = 0;
> > > diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> > > index a0bfe53..03f175e 100644
> > > --- a/net/sunrpc/xprt.c
> > > +++ b/net/sunrpc/xprt.c
> > > @@ -1015,6 +1015,27 @@ void xprt_release(struct rpc_task *task)
> > >  	spin_unlock(&xprt->reserve_lock);
> > >  }
> > >
> > > +/*
> > > + * The autoclose function for the back channel
> > > + *
> > > + * The callback channel should never close the channel,
> > > + * let the forechannel do that.
> > > + */
> > > +static void bc_autoclose(struct work_struct *work)
> > > +{
> > > +	return;
> > > +}
> > > +
> > > +
> > > +/*
> > > + * The autodisconnect routine for the back channel. We never
> disconnect
> > > + */
> > > +static void
> > > +bc_init_autodisconnect(unsigned long data)
> > > +{
> > > +	return;
> > > +}
> > > +
> > >  /**
> > >   * xprt_create_transport - create an RPC transport
> > >   * @args: rpc transport creation arguments
> > > @@ -1051,9 +1072,16 @@ found:
> > >
> > >  	INIT_LIST_HEAD(&xprt->free);
> > >  	INIT_LIST_HEAD(&xprt->recv);
> > > -	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> > > -	setup_timer(&xprt->timer, xprt_init_autodisconnect,
> > > -			(unsigned long)xprt);
> > > +	if (args->bc_sock) {
> > > +		INIT_WORK(&xprt->task_cleanup, bc_autoclose);
> > > +		setup_timer(&xprt->timer, bc_init_autodisconnect,
> > > +			    (unsigned long)xprt);
> > 
> > Hrmph... Why do you need dummy routines here?
> > 
> > > +	} else {
> > > +		INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> > > +		setup_timer(&xprt->timer, xprt_init_autodisconnect,
> > > +			    (unsigned long)xprt);
> > > +	}
> > > +
> > >  	xprt->last_used = jiffies;
> > >  	xprt->cwnd = RPC_INITCWND;
> > >  	xprt->bind_index = 0;
> > > @@ -1073,6 +1101,13 @@ found:
> > >  	dprintk("RPC:       created transport %p with %u slots\n", xprt,
> > >  			xprt->max_reqs);
> > >
> > > +	/*
> > > +	 * Since we don't want connections for the backchannel, we set
> > > +	 * the xprt status to connected
> > > +	 */
> > > +	if (args->bc_sock)
> > > +		xprt_set_connected(xprt);
> > > +
> > >  	return xprt;
> > >  }
> > >
> > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > index d40ff50..067d205 100644
> > > --- a/net/sunrpc/xprtsock.c
> > > +++ b/net/sunrpc/xprtsock.c
> > > @@ -32,6 +32,7 @@
> > >  #include <linux/tcp.h>
> > >  #include <linux/sunrpc/clnt.h>
> > >  #include <linux/sunrpc/sched.h>
> > > +#include <linux/sunrpc/svcsock.h>
> > >  #include <linux/sunrpc/xprtsock.h>
> > >  #include <linux/file.h>
> > >
> > > @@ -1966,6 +1967,219 @@ static void xs_tcp_print_stats(struct
> rpc_xprt
> > *xprt, struct seq_file *seq)
> > >  			xprt->stat.bklog_u);
> > >  }
> > >
> > > +/*
> > > + * The connect worker for the backchannel
> > > + * This should never be called as we should never need to connect
> > > + */
> > > +static void bc_connect_worker(struct work_struct *work)
> > > +{
> > > +	BUG();
> > > +}
> > > +
> > > +/*
> > > + * The set_port routine of the rpc_xprt_ops. This is related to the
> > portmapper
> > > + * and should never be called
> > > + */
> > > +
> > > +static void bc_set_port(struct rpc_xprt *xprt, unsigned short port)
> > > +{
> > > +	BUG();
> > > +}
> > > +
> > > +/*
> > > + * The connect routine for the backchannel rpc_xprt ops
> > > + * Again, should never be called!
> > > + */
> > > +
> > > +static void bc_connect(struct rpc_task *task)
> > > +{
> > > +	BUG();
> > > +}
> > > +
> > > +struct rpc_buffer {
> > > +	size_t	len;
> > > +	char	data[];
> > > +};
> > > +/*
> > > + * Allocate a bunch of pages for a scratch buffer for the rpc code.
> The
> > reason
> > > + * we allocate pages instead doing a kmalloc like rpc_malloc is
> because
> > we want
> > > + * to use the server side send routines.
> > > + */
> > > +void *bc_malloc(struct rpc_task *task, size_t size)
> > > +{
> > > +	struct page *page;
> > > +	struct rpc_buffer *buf;
> > > +
> > > +	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
> > > +	page = alloc_page(GFP_KERNEL);
> > > +
> > > +	if (!page)
> > > +		return NULL;
> > > +
> > > +	buf = page_address(page);
> > > +	buf->len = PAGE_SIZE;
> > > +
> > > +	return buf->data;
> > > +}
> > > +
> > 
> > __get_free_page()? Why can't you kmalloc() here?
> > 
> > > +/*
> > > + * Free the space allocated in the bc_alloc routine
> > > + */
> > > +void bc_free(void *buffer)
> > > +{
> > > +	struct rpc_buffer *buf;
> > > +
> > > +	if (!buffer)
> > > +		return;
> > > +
> > > +	buf = container_of(buffer, struct rpc_buffer, data);
> > > +	free_pages((unsigned long)buf, get_order(buf->len));
> > 
> > This looks funky... Why can't you just call free_page()? You already
> > know from bc_malloc() that this is an order 0 page allocation.
> > 
> > > +}
> > > +
> > > +/*
> > > + * Use the svc_sock to send the callback. Must be called with svsk-
> > >sk_mutex
> > > + * held. Borrows heavily from svc_tcp_sendto and
> xs_tcp_semd_request.
> > > + */
> > > +static int bc_sendto(struct rpc_rqst *req)
> > > +{
> > > +	int total_len;
> > > +	int len;
> > > +	int size;
> > > +	int result;
> > > +	struct xdr_buf *xbufp = &req->rq_snd_buf;
> > > +	struct page **pages = xbufp->pages;
> > > +	unsigned int flags = MSG_MORE;
> > > +	unsigned int pglen = xbufp->page_len;
> > > +	size_t base = xbufp->page_base;
> > > +	struct rpc_xprt *xprt = req->rq_xprt;
> > > +	struct sock_xprt *transport =
> > > +				container_of(xprt, struct sock_xprt,
> xprt);
> > > +	struct socket *sock = transport->sock;
> > > +
> > > +	total_len = xbufp->len;
> > > +
> > > +	/*
> > > +	 * Set up the rpc header and record marker stuff
> > > +	 */
> > > +	xs_encode_tcp_record_marker(xbufp);
> > > +
> > > +	/*
> > > +	 * The RPC message is divided into 3 pieces:
> > > +	 * - The header: This is what most of the smaller RPC messages
> > consist
> > > +	 *   of. Often the whole message is in this.
> > > +	 *
> > > +	 *   - xdr->pages: This is a list of pages that contain data,
> for
> > > +	 *   example in a write request or while using rpcsec gss
> > > +	 *
> > > +	 *   - The tail: This is the rest of the rpc message
> > > +	 *
> > > +	 *  First we send the header, then the pages and then finally
> the
> > tail.
> > > +	 *  The code borrows heavily from svc_sendto.
> > > +	 */
> > > +
> > > +	/*
> > > +	 * Send the head
> > > +	 */
> > > +	if (total_len == xbufp->head[0].iov_len)
> > > +		flags = 0;
> > > +
> > > +	len = sock->ops->sendpage(sock, virt_to_page(xbufp-
> > >head[0].iov_base),
> > > +			(unsigned long)xbufp->head[0].iov_base &
> ~PAGE_MASK,
> > > +			xbufp->head[0].iov_len, flags);
> > 
> > Why do you need to do this? The head iovec is supposed to be reserved
> > for kmalloc()ed memory, which cannot be used together with sendpage().
> > Somebody, some day is going to mess up and try to put a kmalloced
> buffer
> > in here, and will wonder why the above doesn't work.
> > 
> > If you are sending pages, then please put them in the page list part
> of
> > the xdr_buf. There is no rule that the RPC call _must_ have a non-zero
> > head.
> > 
> > > +
> > > +	if (len != xbufp->head[0].iov_len)
> > > +		goto out;
> > > +
> > > +	/*
> > > +	 * send page data
> > > +	 *
> > > +	 * Check the amount of data to be sent. If it is less than the
> > > +	 * remaining page, then send it else send the current page
> > > +	 */
> > > +
> > > +	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
> > > +	while (pglen > 0) {
> > > +		if (total_len == size)
> > > +			flags = 0;
> > > +		result = sock->ops->sendpage(sock, *pages, base, size,
> flags);
> > > +		if (result > 0)
> > > +			len += result;
> > > +		if (result != size)
> > > +			goto out;
> > > +		total_len -= size;
> > > +		pglen -= size;
> > > +		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
> > > +		base = 0;
> > > +		pages++;
> > > +	}
> > > +	/*
> > > +	 * send tail
> > > +	 */
> > > +	if (xbufp->tail[0].iov_len) {
> > > +		result = sock->ops->sendpage(sock,
> > > +			xbufp->tail[0].iov_base,
> > > +			(unsigned long)xbufp->tail[0].iov_base &
> ~PAGE_MASK,
> > > +			xbufp->tail[0].iov_len,
> > > +			0);
> > 
> > Ditto.
> > 
> > > +
> > > +		if (result > 0)
> > > +			len += result;
> > > +	}
> > > +out:
> > > +	if (len != xbufp->len)
> > > +		printk(KERN_NOTICE "Error sending entire callback!\n");
> >              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > Then what? Shouldn't you be closing the connection here?
> > 
> > > +
> > > +	return len;
> > > +}
> > > +
> > > +/*
> > > + * The send routine. Borrows from svc_send
> > > + */
> > > +static int bc_send_request(struct rpc_task *task)
> > > +{
> > > +	struct rpc_rqst *req = task->tk_rqstp;
> > > +	struct rpc_xprt *bc_xprt = req->rq_xprt;
> > > +	struct svc_xprt	*xprt;
> > > +	struct svc_sock         *svsk;
> > > +	u32                     len;
> > > +
> > > +	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
> > > +	/*
> > > +	 * Get the server socket associated with this callback xprt
> > > +	 */
> > > +	svsk = bc_xprt->bc_sock;
> > > +	xprt = &svsk->sk_xprt;
> > > +
> > > +	mutex_lock(&xprt->xpt_mutex);
> >           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > 
> > Eh? What's this, in which patch is it defined, and why is it at all
> > needed?
> > 
> > > +	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
> >                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
> > Where is this defined, and why is it needed? The xprt already has a
> > connected/unconnected flag.
> > 
> > > +		len = -ENOTCONN;
> > > +	else
> > > +		len = bc_sendto(req);
> > > +	mutex_unlock(&xprt->xpt_mutex);
> > > +
> > > +	return 0;
> > > +
> > > +}
> > > +
> > > +/*
> > > + * The close routine. Since this is client initiated, we do nothing
> > > + */
> > > +
> > > +static void bc_close(struct rpc_xprt *xprt)
> > > +{
> > > +	return;
> > > +}
> > > +
> > > +/*
> > > + * The xprt destroy routine. Again, because this connection is
> client
> > > + * initiated, we do nothing
> > > + */
> > > +
> > > +static void bc_destroy(struct rpc_xprt *xprt)
> > > +{
> > > +	return;
> > > +}
> > > +
> > >  static struct rpc_xprt_ops xs_udp_ops = {
> > >  	.set_buffer_size	= xs_udp_set_buffer_size,
> > >  	.reserve_xprt		= xprt_reserve_xprt_cong,
> > > @@ -1999,6 +2213,24 @@ static struct rpc_xprt_ops xs_tcp_ops = {
> > >  	.print_stats		= xs_tcp_print_stats,
> > >  };
> > >
> > > +/*
> > > + * The rpc_xprt_ops for the server backchannel
> > > + */
> > > +
> > > +static struct rpc_xprt_ops bc_tcp_ops = {
> > > +	.reserve_xprt		= xprt_reserve_xprt,
> > > +	.release_xprt		= xprt_release_xprt,
> > > +	.set_port		= bc_set_port,
> > > +	.connect		= bc_connect,
> > > +	.buf_alloc		= bc_malloc,
> > > +	.buf_free		= bc_free,
> > > +	.send_request		= bc_send_request,
> > > +	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
> > > +	.close			= bc_close,
> > > +	.destroy		= bc_destroy,
> > > +	.print_stats		= xs_tcp_print_stats,
> > > +};
> > > +
> > >  static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
> > >  				      unsigned int slot_table_size)
> > >  {
> > > @@ -2131,13 +2363,29 @@ static struct rpc_xprt *xs_setup_tcp(struct
> > xprt_create *args)
> > >  	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
> > >  	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
> > >
> > > -	xprt->bind_timeout = XS_BIND_TO;
> > > -	xprt->connect_timeout = XS_TCP_CONN_TO;
> > > -	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> > > -	xprt->idle_timeout = XS_IDLE_DISC_TO;
> > > +	if (args->bc_sock) {
> > > +		/* backchannel */
> > > +		xprt_set_bound(xprt);
> > > +		INIT_DELAYED_WORK(&transport->connect_worker,
> > > +				  bc_connect_worker);
> > 
> > Errm.... Is it really such a good idea to tell the RPC layer that it
> can
> > reconnect at any time using a routine that will BUG()?
> > 
> > > +		xprt->bind_timeout = 0;
> > > +		xprt->connect_timeout = 0;
> > > +		xprt->reestablish_timeout = 0;
> > > +		xprt->idle_timeout = (~0);
> > >
> > > -	xprt->ops = &xs_tcp_ops;
> > > -	xprt->timeout = &xs_tcp_default_timeout;
> > > +		/*
> > > +		 * The backchannel uses the same socket connection as
> the
> > > +		 * forechannel
> > > +		 */
> > > +		xprt->bc_sock = args->bc_sock;
> > > +		xprt->bc_sock->sk_bc_xprt = xprt;
> > > +		transport->sock = xprt->bc_sock->sk_sock;
> > > +		transport->inet = xprt->bc_sock->sk_sk;
> > > +
> > > +		xprt->ops = &bc_tcp_ops;
> > > +
> > > +		goto next;
> > > +	}
> > >
> > >  	switch (addr->sa_family) {
> > >  	case AF_INET:
> > > @@ -2145,13 +2393,29 @@ static struct rpc_xprt *xs_setup_tcp(struct
> > xprt_create *args)
> > >  			xprt_set_bound(xprt);
> > >
> > >  		INIT_DELAYED_WORK(&transport->connect_worker,
> > xs_tcp_connect_worker4);
> > > -		xs_format_ipv4_peer_addresses(xprt, "tcp",
> RPCBIND_NETID_TCP);
> > >  		break;
> > >  	case AF_INET6:
> > >  		if (((struct sockaddr_in6 *)addr)->sin6_port !=
> htons(0))
> > >  			xprt_set_bound(xprt);
> > >
> > >  		INIT_DELAYED_WORK(&transport->connect_worker,
> > xs_tcp_connect_worker6);
> > > +		break;
> > > +	}
> > > +	xprt->bind_timeout = XS_BIND_TO;
> > > +	xprt->connect_timeout = XS_TCP_CONN_TO;
> > > +	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> > > +	xprt->idle_timeout = XS_IDLE_DISC_TO;
> > > +
> > > +	xprt->ops = &xs_tcp_ops;
> > > +
> > > +next:
> > > +	xprt->timeout = &xs_tcp_default_timeout;
> > > +
> > > +	switch (addr->sa_family) {
> > 
> > Why do we suddenly need 2 switch statements here?
> > 
> > > +	case AF_INET:
> > > +		xs_format_ipv4_peer_addresses(xprt, "tcp",
> RPCBIND_NETID_TCP);
> > > +		break;
> > > +	case AF_INET6:
> > >  		xs_format_ipv6_peer_addresses(xprt, "tcp",
> RPCBIND_NETID_TCP6);
> > >  		break;
> > >  	default:
> > 
> > 
> > _______________________________________________
> > pNFS mailing list
> > pNFS@linux-nfs.org
> > http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
> _______________________________________________
> pNFS mailing list
> pNFS@linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
  2009-06-02  0:52           ` J. Bruce Fields
@ 2009-06-02  1:24             ` Labiaga, Ricardo
  0 siblings, 0 replies; 29+ messages in thread
From: Labiaga, Ricardo @ 2009-06-02  1:24 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Trond Myklebust, Benny Halevy, Adamson, Andy, linux-nfs, pnfs

> From: J. Bruce Fields [mailto:bfields@fieldses.org]
> Sent: Monday, June 01, 2009 5:53 PM
> 
> On Mon, Jun 01, 2009 at 05:33:34PM -0700, Labiaga, Ricardo wrote:
> > Trond, Bruce,
> >
> > Alexandros has coded a number of patches that address the issues
raised
> > here by Trond.  Do you want the fixes squashed into the original
patch
> > or do you want them submitted separately.
> 
> I'd prefer the former.
> 

OK, we'll squash them into the original and provide a list of
modifications in the introduction to the patch set.

- ricardo

> --b.
> 
> >
> > Thanks,
> >
> > - ricardo
> >
> > > -----Original Message-----
> > > From: Trond Myklebust [mailto:trond.myklebust@fys.uio.no]
> > > Sent: Thursday, April 30, 2009 5:05 PM
> > > To: Benny Halevy
> > > Cc: Adamson, Andy; J. Bruce Fields; pnfs@linux-nfs.org; linux-
> > > nfs@vger.kernel.org
> > > Subject: Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc
server-side
> > > backchannel handling
> > >
> > > On Fri, 2009-05-01 at 02:05 +0300, Benny Halevy wrote:
> > > > From: Rahul Iyer <iyer@netapp.com>
> > > >
> > > > FIXME: bhalevy: write up commit message
> > > >
> > > > Signed-off-by: Rahul Iyer <iyer@netapp.com>
> > > > Signed-off-by: Mike Sager <sager@netapp.com>
> > > > Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
> > > > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > > >
> > > > When the call direction is a reply, copy the xid and call
direction
> > into
> > > the
> > > > req->rq_private_buf.head[0].iov_base otherwise rpc_verify_header
> > returns
> > > > rpc_garbage.
> > > >
> > > > Signed-off-by: Andy Adamson <andros@netapp.com>
> > > > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > > > [get rid of CONFIG_NFSD_V4_1]
> > > > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > > > ---
> > > >  include/linux/sunrpc/clnt.h    |    1 +
> > > >  include/linux/sunrpc/svcsock.h |    1 +
> > > >  include/linux/sunrpc/xprt.h    |    2 +
> > > >  net/sunrpc/clnt.c              |    1 +
> > > >  net/sunrpc/svcsock.c           |   68 ++++++++++-
> > > >  net/sunrpc/xprt.c              |   41 ++++++-
> > > >  net/sunrpc/xprtsock.c          |  278
> > > +++++++++++++++++++++++++++++++++++++++-
> > > >  7 files changed, 381 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/include/linux/sunrpc/clnt.h
> > b/include/linux/sunrpc/clnt.h
> > > > index c39a210..cf9a8ec 100644
> > > > --- a/include/linux/sunrpc/clnt.h
> > > > +++ b/include/linux/sunrpc/clnt.h
> > > > @@ -110,6 +110,7 @@ struct rpc_create_args {
> > > >  	rpc_authflavor_t	authflavor;
> > > >  	unsigned long		flags;
> > > >  	char			*client_name;
> > > > +	struct svc_sock		*bc_sock;	/* NFSv4.1
backchannel
> > */
> > > >  };
> > > >
> > > >  /* Values for "flags" field */
> > > > diff --git a/include/linux/sunrpc/svcsock.h
> > > b/include/linux/sunrpc/svcsock.h
> > > > index 8271631..19228f4 100644
> > > > --- a/include/linux/sunrpc/svcsock.h
> > > > +++ b/include/linux/sunrpc/svcsock.h
> > > > @@ -28,6 +28,7 @@ struct svc_sock {
> > > >  	/* private TCP part */
> > > >  	u32			sk_reclen;	/* length of
record */
> > > >  	u32			sk_tcplen;	/* current read
length
> > */
> > > > +	struct rpc_xprt	       *sk_bc_xprt;	/* NFSv4.1
backchannel
> > xprt
> > > */
> > > >  };
> > > >
> > > >  /*
> > > > diff --git a/include/linux/sunrpc/xprt.h
> > b/include/linux/sunrpc/xprt.h
> > > > index 1758d9f..063a6a7 100644
> > > > --- a/include/linux/sunrpc/xprt.h
> > > > +++ b/include/linux/sunrpc/xprt.h
> > > > @@ -174,6 +174,7 @@ struct rpc_xprt {
> > > >  	spinlock_t		reserve_lock;	/* lock slot
table */
> > > >  	u32			xid;		/* Next XID
value to use
> > */
> > > >  	struct rpc_task *	snd_task;	/* Task blocked
in send
> > */
> > > > +	struct svc_sock		*bc_sock;	/* NFSv4.1
backchannel
> > */
> > > >  	struct list_head	recv;
> > > >
> > > >  	struct {
> > > > @@ -197,6 +198,7 @@ struct xprt_create {
> > > >  	struct sockaddr *	srcaddr;	/* optional
local
> > address */
> > > >  	struct sockaddr *	dstaddr;	/* remote peer
address
> > */
> > > >  	size_t			addrlen;
> > > > +	struct svc_sock		*bc_sock;	/* NFSv4.1
backchannel
> > */
> > > >  };
> > > >
> > > >  struct xprt_class {
> > > > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> > > > index 5abab09..3dc847f 100644
> > > > --- a/net/sunrpc/clnt.c
> > > > +++ b/net/sunrpc/clnt.c
> > > > @@ -266,6 +266,7 @@ struct rpc_clnt *rpc_create(struct
> > rpc_create_args
> > > *args)
> > > >  		.srcaddr = args->saddress,
> > > >  		.dstaddr = args->address,
> > > >  		.addrlen = args->addrsize,
> > > > +		.bc_sock = args->bc_sock,
> > > >  	};
> > > >  	char servername[48];
> > > >
> > > > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > > > index 4e6d406..619764e 100644
> > > > --- a/net/sunrpc/svcsock.c
> > > > +++ b/net/sunrpc/svcsock.c
> > > > @@ -49,6 +49,7 @@
> > > >  #include <linux/sunrpc/msg_prot.h>
> > > >  #include <linux/sunrpc/svcsock.h>
> > > >  #include <linux/sunrpc/stats.h>
> > > > +#include <linux/sunrpc/xprt.h>
> > > >
> > > >  #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
> > > >
> > > > @@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
> > *rqstp)
> > > >  	int		len;
> > > >  	struct kvec *vec;
> > > >  	int pnum, vlen;
> > > > +	struct rpc_rqst *req = NULL;
> > > >
> > > >  	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
> > > >  		svsk, test_bit(XPT_DATA,
&svsk->sk_xprt.xpt_flags),
> > > > @@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct
svc_rqst
> > > *rqstp)
> > > >  	len = svsk->sk_reclen;
> > > >  	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
> > > >
> > > > +	/*
> > > > +	 * We have enough data for the whole tcp record. Let's
try and
> > read
> > > the
> > > > +	 * first 8 bytes to get the xid and the call direction.
We can
> > use
> > > this
> > > > +	 * to figure out if this is a call or a reply to a
callback. If
> > > > +	 * sk_reclen is < 8 (xid and calldir), then this is a
malformed
> > > packet.
> > > > +	 * In that case, don't bother with the calldir and just
read the
> > > data.
> > > > +	 * It will be rejected in svc_process.
> > > > +	 */
> > > > +
> > > >  	vec = rqstp->rq_vec;
> > > >  	vec[0] = rqstp->rq_arg.head[0];
> > > >  	vlen = PAGE_SIZE;
> > > > +
> > > > +	if (len >= 8) {
> > > > +		u32 *p;
> > > > +		u32 xid;
> > > > +		u32 calldir;
> > > > +
> > > > +		len = svc_recvfrom(rqstp, vec, 1, 8);
> > > > +		if (len < 0)
> > > > +			goto error;
> > > > +
> > > > +		p = (u32 *)rqstp->rq_arg.head[0].iov_base;
> > > > +		xid = *p++;
> > > > +		calldir = *p;
> > > > +
> > > > +		if (calldir) {
> > > > +			/* REPLY */
> > > > +			if (svsk->sk_bc_xprt)
> > > > +				req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid);
> > > > +			if (req) {
> > > > +				memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
> > > > +					sizeof(struct xdr_buf));
> > > > +				/* copy the xid and call direction */
> > > > +				memcpy(req->rq_private_buf.head[0].iov_base,
> > > > +					rqstp->rq_arg.head[0].iov_base, 8);
> > > > +				vec[0] = req->rq_private_buf.head[0];
> > > > +			} else
> > > > +				printk(KERN_NOTICE
> > > > +					"%s: Got unrecognized reply: "
> > > > +					"calldir 0x%x sk_bc_xprt %p xid %08x\n",
> > > > +					__func__, ntohl(calldir),
> > > > +					svsk->sk_bc_xprt, xid);
> > > > +		}
> > > > +
> > > > +		if (!calldir || !req)
> > > > +			vec[0] = rqstp->rq_arg.head[0];
> > > > +
> > > > +		vec[0].iov_base += 8;
> > > > +		vec[0].iov_len -= 8;
> > > > +		len = svsk->sk_reclen - 8;
> > > > +		vlen -= 8;
> > > > +	}
> > > > +
> > > >  	pnum = 1;
> > > >  	while (vlen < len) {
> > > > -		vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]);
> > > > +		vec[pnum].iov_base = (req) ?
> > > > +			page_address(req->rq_private_buf.pages[pnum - 1]) :
> > > > +			page_address(rqstp->rq_pages[pnum]);
> > > >  		vec[pnum].iov_len = PAGE_SIZE;
> > > >  		pnum++;
> > > >  		vlen += PAGE_SIZE;
> > > > @@ -908,6 +963,16 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
> > > >  	if (len < 0)
> > > >  		goto error;
> > > >
> > > > +	/*
> > > > +	 * Account for the 8 bytes we read earlier
> > > > +	 */
> > > > +	len += 8;
> > > > +
> > > > +	if (req) {
> > > > +		xprt_complete_rqst(req->rq_task, len);
> > > > +		len = 0;
> > > > +		goto out;
> > > > +	}
> > > >  	dprintk("svc: TCP complete record (%d bytes)\n", len);
> > > >  	rqstp->rq_arg.len = len;
> > > >  	rqstp->rq_arg.page_base = 0;
> > > > @@ -921,6 +986,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
> > > >  	rqstp->rq_xprt_ctxt   = NULL;
> > > >  	rqstp->rq_prot	      = IPPROTO_TCP;
> > > >
> > > > +out:
> > > >  	/* Reset TCP read info */
> > > >  	svsk->sk_reclen = 0;
> > > >  	svsk->sk_tcplen = 0;
> > > > diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> > > > index a0bfe53..03f175e 100644
> > > > --- a/net/sunrpc/xprt.c
> > > > +++ b/net/sunrpc/xprt.c
> > > > @@ -1015,6 +1015,27 @@ void xprt_release(struct rpc_task *task)
> > > >  	spin_unlock(&xprt->reserve_lock);
> > > >  }
> > > >
> > > > +/*
> > > > + * The autoclose function for the back channel
> > > > + *
> > > > + * The callback channel should never close the channel,
> > > > + * let the forechannel do that.
> > > > + */
> > > > +static void bc_autoclose(struct work_struct *work)
> > > > +{
> > > > +	return;
> > > > +}
> > > > +
> > > > +
> > > > +/*
> > > > + * The autodisconnect routine for the back channel. We never disconnect
> > > > + */
> > > > +static void
> > > > +bc_init_autodisconnect(unsigned long data)
> > > > +{
> > > > +	return;
> > > > +}
> > > > +
> > > >  /**
> > > >   * xprt_create_transport - create an RPC transport
> > > >   * @args: rpc transport creation arguments
> > > > @@ -1051,9 +1072,16 @@ found:
> > > >
> > > >  	INIT_LIST_HEAD(&xprt->free);
> > > >  	INIT_LIST_HEAD(&xprt->recv);
> > > > -	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> > > > -	setup_timer(&xprt->timer, xprt_init_autodisconnect,
> > > > -			(unsigned long)xprt);
> > > > +	if (args->bc_sock) {
> > > > +		INIT_WORK(&xprt->task_cleanup, bc_autoclose);
> > > > +		setup_timer(&xprt->timer, bc_init_autodisconnect,
> > > > +			    (unsigned long)xprt);
> > >
> > > Hrmph... Why do you need dummy routines here?
> > >
> > > > +	} else {
> > > > +		INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
> > > > +		setup_timer(&xprt->timer, xprt_init_autodisconnect,
> > > > +			    (unsigned long)xprt);
> > > > +	}
> > > > +
> > > >  	xprt->last_used = jiffies;
> > > >  	xprt->cwnd = RPC_INITCWND;
> > > >  	xprt->bind_index = 0;
> > > > @@ -1073,6 +1101,13 @@ found:
> > > >  	dprintk("RPC:       created transport %p with %u slots\n", xprt,
> > > >  			xprt->max_reqs);
> > > >
> > > > +	/*
> > > > +	 * Since we don't want connections for the backchannel, we set
> > > > +	 * the xprt status to connected
> > > > +	 */
> > > > +	if (args->bc_sock)
> > > > +		xprt_set_connected(xprt);
> > > > +
> > > >  	return xprt;
> > > >  }
> > > >
> > > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > > index d40ff50..067d205 100644
> > > > --- a/net/sunrpc/xprtsock.c
> > > > +++ b/net/sunrpc/xprtsock.c
> > > > @@ -32,6 +32,7 @@
> > > >  #include <linux/tcp.h>
> > > >  #include <linux/sunrpc/clnt.h>
> > > >  #include <linux/sunrpc/sched.h>
> > > > +#include <linux/sunrpc/svcsock.h>
> > > >  #include <linux/sunrpc/xprtsock.h>
> > > >  #include <linux/file.h>
> > > >
> > > > @@ -1966,6 +1967,219 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
> > > >  			xprt->stat.bklog_u);
> > > >  }
> > > >
> > > > +/*
> > > > + * The connect worker for the backchannel
> > > > + * This should never be called as we should never need to connect
> > > > + */
> > > > +static void bc_connect_worker(struct work_struct *work)
> > > > +{
> > > > +	BUG();
> > > > +}
> > > > +
> > > > +/*
> > > > + * The set_port routine of the rpc_xprt_ops. This is related to the portmapper
> > > > + * and should never be called
> > > > + */
> > > > +
> > > > +static void bc_set_port(struct rpc_xprt *xprt, unsigned short port)
> > > > +{
> > > > +	BUG();
> > > > +}
> > > > +
> > > > +/*
> > > > + * The connect routine for the backchannel rpc_xprt ops
> > > > + * Again, should never be called!
> > > > + */
> > > > +
> > > > +static void bc_connect(struct rpc_task *task)
> > > > +{
> > > > +	BUG();
> > > > +}
> > > > +
> > > > +struct rpc_buffer {
> > > > +	size_t	len;
> > > > +	char	data[];
> > > > +};
> > > > +/*
> > > > + * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason
> > > > + * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
> > > > + * to use the server side send routines.
> > > > + */
> > > > +void *bc_malloc(struct rpc_task *task, size_t size)
> > > > +{
> > > > +	struct page *page;
> > > > +	struct rpc_buffer *buf;
> > > > +
> > > > +	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
> > > > +	page = alloc_page(GFP_KERNEL);
> > > > +
> > > > +	if (!page)
> > > > +		return NULL;
> > > > +
> > > > +	buf = page_address(page);
> > > > +	buf->len = PAGE_SIZE;
> > > > +
> > > > +	return buf->data;
> > > > +}
> > > > +
> > >
> > > __get_free_page()? Why can't you kmalloc() here?
> > >
> > > > +/*
> > > > + * Free the space allocated in the bc_alloc routine
> > > > + */
> > > > +void bc_free(void *buffer)
> > > > +{
> > > > +	struct rpc_buffer *buf;
> > > > +
> > > > +	if (!buffer)
> > > > +		return;
> > > > +
> > > > +	buf = container_of(buffer, struct rpc_buffer, data);
> > > > +	free_pages((unsigned long)buf, get_order(buf->len));
> > >
> > > This looks funky... Why can't you just call free_page()? You already
> > > know from bc_malloc() that this is an order 0 page allocation.
> > >
> > > > +}
> > > > +
> > > > +/*
> > > > + * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
> > > > + * held. Borrows heavily from svc_tcp_sendto and xs_tcp_semd_request.
> > > > + */
> > > > +static int bc_sendto(struct rpc_rqst *req)
> > > > +{
> > > > +	int total_len;
> > > > +	int len;
> > > > +	int size;
> > > > +	int result;
> > > > +	struct xdr_buf *xbufp = &req->rq_snd_buf;
> > > > +	struct page **pages = xbufp->pages;
> > > > +	unsigned int flags = MSG_MORE;
> > > > +	unsigned int pglen = xbufp->page_len;
> > > > +	size_t base = xbufp->page_base;
> > > > +	struct rpc_xprt *xprt = req->rq_xprt;
> > > > +	struct sock_xprt *transport =
> > > > +				container_of(xprt, struct sock_xprt, xprt);
> > > > +	struct socket *sock = transport->sock;
> > > > +
> > > > +	total_len = xbufp->len;
> > > > +
> > > > +	/*
> > > > +	 * Set up the rpc header and record marker stuff
> > > > +	 */
> > > > +	xs_encode_tcp_record_marker(xbufp);
> > > > +
> > > > +	/*
> > > > +	 * The RPC message is divided into 3 pieces:
> > > > +	 * - The header: This is what most of the smaller RPC messages consist
> > > > +	 *   of. Often the whole message is in this.
> > > > +	 *
> > > > +	 *   - xdr->pages: This is a list of pages that contain data, for
> > > > +	 *   example in a write request or while using rpcsec gss
> > > > +	 *
> > > > +	 *   - The tail: This is the rest of the rpc message
> > > > +	 *
> > > > +	 *  First we send the header, then the pages and then finally the tail.
> > > > +	 *  The code borrows heavily from svc_sendto.
> > > > +	 */
> > > > +
> > > > +	/*
> > > > +	 * Send the head
> > > > +	 */
> > > > +	if (total_len == xbufp->head[0].iov_len)
> > > > +		flags = 0;
> > > > +
> > > > +	len = sock->ops->sendpage(sock, virt_to_page(xbufp->head[0].iov_base),
> > > > +			(unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK,
> > > > +			xbufp->head[0].iov_len, flags);
> > >
> > > Why do you need to do this? The head iovec is supposed to be reserved
> > > for kmalloc()ed memory, which cannot be used together with sendpage().
> > > Somebody, some day is going to mess up and try to put a kmalloced buffer
> > > in here, and will wonder why the above doesn't work.
> > >
> > > If you are sending pages, then please put them in the page list part of
> > > the xdr_buf. There is no rule that the RPC call _must_ have a non-zero
> > > head.
> > >
> > > > +
> > > > +	if (len != xbufp->head[0].iov_len)
> > > > +		goto out;
> > > > +
> > > > +	/*
> > > > +	 * send page data
> > > > +	 *
> > > > +	 * Check the amount of data to be sent. If it is less than the
> > > > +	 * remaining page, then send it else send the current page
> > > > +	 */
> > > > +
> > > > +	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
> > > > +	while (pglen > 0) {
> > > > +		if (total_len == size)
> > > > +			flags = 0;
> > > > +		result = sock->ops->sendpage(sock, *pages, base, size, flags);
> > > > +		if (result > 0)
> > > > +			len += result;
> > > > +		if (result != size)
> > > > +			goto out;
> > > > +		total_len -= size;
> > > > +		pglen -= size;
> > > > +		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
> > > > +		base = 0;
> > > > +		pages++;
> > > > +	}
> > > > +	/*
> > > > +	 * send tail
> > > > +	 */
> > > > +	if (xbufp->tail[0].iov_len) {
> > > > +		result = sock->ops->sendpage(sock,
> > > > +			xbufp->tail[0].iov_base,
> > > > +			(unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK,
> > > > +			xbufp->tail[0].iov_len,
> > > > +			0);
> > >
> > > Ditto.
> > >
> > > > +
> > > > +		if (result > 0)
> > > > +			len += result;
> > > > +	}
> > > > +out:
> > > > +	if (len != xbufp->len)
> > > > +		printk(KERN_NOTICE "Error sending entire callback!\n");
> > >              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > > Then what? Shouldn't you be closing the connection here?
> > >
> > > > +
> > > > +	return len;
> > > > +}
> > > > +
> > > > +/*
> > > > + * The send routine. Borrows from svc_send
> > > > + */
> > > > +static int bc_send_request(struct rpc_task *task)
> > > > +{
> > > > +	struct rpc_rqst *req = task->tk_rqstp;
> > > > +	struct rpc_xprt *bc_xprt = req->rq_xprt;
> > > > +	struct svc_xprt	*xprt;
> > > > +	struct svc_sock         *svsk;
> > > > +	u32                     len;
> > > > +
> > > > +	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
> > > > +	/*
> > > > +	 * Get the server socket associated with this callback xprt
> > > > +	 */
> > > > +	svsk = bc_xprt->bc_sock;
> > > > +	xprt = &svsk->sk_xprt;
> > > > +
> > > > +	mutex_lock(&xprt->xpt_mutex);
> > >           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > >
> > > Eh? What's this, in which patch is it defined, and why is it at all
> > > needed?
> > >
> > > > +	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
> > >                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
> > > Where is this defined, and why is it needed? The xprt already has a
> > > connected/unconnected flag.
> > >
> > > > +		len = -ENOTCONN;
> > > > +	else
> > > > +		len = bc_sendto(req);
> > > > +	mutex_unlock(&xprt->xpt_mutex);
> > > > +
> > > > +	return 0;
> > > > +
> > > > +}
> > > > +
> > > > +/*
> > > > + * The close routine. Since this is client initiated, we do nothing
> > > > + */
> > > > +
> > > > +static void bc_close(struct rpc_xprt *xprt)
> > > > +{
> > > > +	return;
> > > > +}
> > > > +
> > > > +/*
> > > > + * The xprt destroy routine. Again, because this connection is client
> > > > + * initiated, we do nothing
> > > > + */
> > > > +
> > > > +static void bc_destroy(struct rpc_xprt *xprt)
> > > > +{
> > > > +	return;
> > > > +}
> > > > +
> > > >  static struct rpc_xprt_ops xs_udp_ops = {
> > > >  	.set_buffer_size	= xs_udp_set_buffer_size,
> > > >  	.reserve_xprt		= xprt_reserve_xprt_cong,
> > > > @@ -1999,6 +2213,24 @@ static struct rpc_xprt_ops xs_tcp_ops = {
> > > >  	.print_stats		= xs_tcp_print_stats,
> > > >  };
> > > >
> > > > +/*
> > > > + * The rpc_xprt_ops for the server backchannel
> > > > + */
> > > > +
> > > > +static struct rpc_xprt_ops bc_tcp_ops = {
> > > > +	.reserve_xprt		= xprt_reserve_xprt,
> > > > +	.release_xprt		= xprt_release_xprt,
> > > > +	.set_port		= bc_set_port,
> > > > +	.connect		= bc_connect,
> > > > +	.buf_alloc		= bc_malloc,
> > > > +	.buf_free		= bc_free,
> > > > +	.send_request		= bc_send_request,
> > > > +	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
> > > > +	.close			= bc_close,
> > > > +	.destroy		= bc_destroy,
> > > > +	.print_stats		= xs_tcp_print_stats,
> > > > +};
> > > > +
> > > >  static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
> > > >  				      unsigned int slot_table_size)
> > > >  {
> > > > @@ -2131,13 +2363,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
> > > >  	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
> > > >  	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
> > > >
> > > > -	xprt->bind_timeout = XS_BIND_TO;
> > > > -	xprt->connect_timeout = XS_TCP_CONN_TO;
> > > > -	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> > > > -	xprt->idle_timeout = XS_IDLE_DISC_TO;
> > > > +	if (args->bc_sock) {
> > > > +		/* backchannel */
> > > > +		xprt_set_bound(xprt);
> > > > +		INIT_DELAYED_WORK(&transport->connect_worker,
> > > > +				  bc_connect_worker);
> > >
> > > Errm.... Is it really such a good idea to tell the RPC layer that it can
> > > reconnect at any time using a routine that will BUG()?
> > >
> > > > +		xprt->bind_timeout = 0;
> > > > +		xprt->connect_timeout = 0;
> > > > +		xprt->reestablish_timeout = 0;
> > > > +		xprt->idle_timeout = (~0);
> > > >
> > > > -	xprt->ops = &xs_tcp_ops;
> > > > -	xprt->timeout = &xs_tcp_default_timeout;
> > > > +		/*
> > > > +		 * The backchannel uses the same socket connection as the
> > > > +		 * forechannel
> > > > +		 */
> > > > +		xprt->bc_sock = args->bc_sock;
> > > > +		xprt->bc_sock->sk_bc_xprt = xprt;
> > > > +		transport->sock = xprt->bc_sock->sk_sock;
> > > > +		transport->inet = xprt->bc_sock->sk_sk;
> > > > +
> > > > +		xprt->ops = &bc_tcp_ops;
> > > > +
> > > > +		goto next;
> > > > +	}
> > > >
> > > >  	switch (addr->sa_family) {
> > > >  	case AF_INET:
> > > > @@ -2145,13 +2393,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
> > > >  			xprt_set_bound(xprt);
> > > >
> > > >  		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
> > > > -		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
> > > >  		break;
> > > >  	case AF_INET6:
> > > >  		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
> > > >  			xprt_set_bound(xprt);
> > > >
> > > >  		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
> > > > +		break;
> > > > +	}
> > > > +	xprt->bind_timeout = XS_BIND_TO;
> > > > +	xprt->connect_timeout = XS_TCP_CONN_TO;
> > > > +	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
> > > > +	xprt->idle_timeout = XS_IDLE_DISC_TO;
> > > > +
> > > > +	xprt->ops = &xs_tcp_ops;
> > > > +
> > > > +next:
> > > > +	xprt->timeout = &xs_tcp_default_timeout;
> > > > +
> > > > +	switch (addr->sa_family) {
> > >
> > > Why do we suddenly need 2 switch statements here?
> > >
> > > > +	case AF_INET:
> > > > +		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
> > > > +		break;
> > > > +	case AF_INET6:
> > > >  		xs_format_ipv6_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
> > > >  		break;
> > > >  	default:
> > >
> > >
> > > _______________________________________________
> > > pNFS mailing list
> > > pNFS@linux-nfs.org
> > > http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
> > _______________________________________________
> > pNFS mailing list
> > pNFS@linux-nfs.org
> > http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
       [not found]         ` <273FE88A07F5D445824060902F70034405FE3129-hX7t0kiaRRpT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
  2009-06-02  0:52           ` J. Bruce Fields
@ 2009-06-02  4:51           ` Benny Halevy
       [not found]             ` <273FE88A07F5D445824060902F70034402030375@SACMVEXC1-PRD.hq.netapp.com>
  1 sibling, 1 reply; 29+ messages in thread
From: Benny Halevy @ 2009-06-02  4:51 UTC (permalink / raw)
  To: Labiaga, Ricardo
  Cc: Trond Myklebust, Adamson, Andy, J. Bruce Fields, pnfs, linux-nfs

On Jun. 02, 2009, 3:33 +0300, "Labiaga, Ricardo" <Ricardo.Labiaga@netapp.com> wrote:
> Trond, Bruce,
> 
> Alexandros has coded a number of patches that address the issues raised
> here by Trond.  Do you want the fixes squashed into the original patch
> or do you want them submitted separately?

Can you please also send the unsquashed cleanup patches separately.
In the given time frame before the upcoming Bakeathon I'm afraid
we'd want to queue the squashed series for 2.6.31 in the
nfs{,d}41-for-2.6.31 branches and commit the cleanup patches onto
our 2.6.30 based development branches.

Benny

> 
> Thanks,
> 
> - ricardo
> 
>> -----Original Message-----
>> From: Trond Myklebust [mailto:trond.myklebust@fys.uio.no]
>> Sent: Thursday, April 30, 2009 5:05 PM
>> To: Benny Halevy
>> Cc: Adamson, Andy; J. Bruce Fields; pnfs@linux-nfs.org; linux-
>> nfs@vger.kernel.org
>> Subject: Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side
>> backchannel handling
>>
>> On Fri, 2009-05-01 at 02:05 +0300, Benny Halevy wrote:
>>> From: Rahul Iyer <iyer@netapp.com>
>>>
>>> FIXME: bhalevy: write up commit message
>>>
>>> Signed-off-by: Rahul Iyer <iyer@netapp.com>
>>> Signed-off-by: Mike Sager <sager@netapp.com>
>>> Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
>>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>>
>>> When the call direction is a reply, copy the xid and call direction
> into
>> the
>>> req->rq_private_buf.head[0].iov_base otherwise rpc_verify_header
> returns
>>> rpc_garbage.
>>>
>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>> [get rid of CONFIG_NFSD_V4_1]
>>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>> ---
>>>  include/linux/sunrpc/clnt.h    |    1 +
>>>  include/linux/sunrpc/svcsock.h |    1 +
>>>  include/linux/sunrpc/xprt.h    |    2 +
>>>  net/sunrpc/clnt.c              |    1 +
>>>  net/sunrpc/svcsock.c           |   68 ++++++++++-
>>>  net/sunrpc/xprt.c              |   41 ++++++-
>>>  net/sunrpc/xprtsock.c          |  278
>> +++++++++++++++++++++++++++++++++++++++-
>>>  7 files changed, 381 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/include/linux/sunrpc/clnt.h
> b/include/linux/sunrpc/clnt.h
>>> index c39a210..cf9a8ec 100644
>>> --- a/include/linux/sunrpc/clnt.h
>>> +++ b/include/linux/sunrpc/clnt.h
>>> @@ -110,6 +110,7 @@ struct rpc_create_args {
>>>  	rpc_authflavor_t	authflavor;
>>>  	unsigned long		flags;
>>>  	char			*client_name;
>>> +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
> */
>>>  };
>>>
>>>  /* Values for "flags" field */
>>> diff --git a/include/linux/sunrpc/svcsock.h
>> b/include/linux/sunrpc/svcsock.h
>>> index 8271631..19228f4 100644
>>> --- a/include/linux/sunrpc/svcsock.h
>>> +++ b/include/linux/sunrpc/svcsock.h
>>> @@ -28,6 +28,7 @@ struct svc_sock {
>>>  	/* private TCP part */
>>>  	u32			sk_reclen;	/* length of record */
>>>  	u32			sk_tcplen;	/* current read length
> */
>>> +	struct rpc_xprt	       *sk_bc_xprt;	/* NFSv4.1 backchannel
> xprt
>> */
>>>  };
>>>
>>>  /*
>>> diff --git a/include/linux/sunrpc/xprt.h
> b/include/linux/sunrpc/xprt.h
>>> index 1758d9f..063a6a7 100644
>>> --- a/include/linux/sunrpc/xprt.h
>>> +++ b/include/linux/sunrpc/xprt.h
>>> @@ -174,6 +174,7 @@ struct rpc_xprt {
>>>  	spinlock_t		reserve_lock;	/* lock slot table */
>>>  	u32			xid;		/* Next XID value to use
> */
>>>  	struct rpc_task *	snd_task;	/* Task blocked in send
> */
>>> +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
> */
>>>  	struct list_head	recv;
>>>
>>>  	struct {
>>> @@ -197,6 +198,7 @@ struct xprt_create {
>>>  	struct sockaddr *	srcaddr;	/* optional local
> address */
>>>  	struct sockaddr *	dstaddr;	/* remote peer address
> */
>>>  	size_t			addrlen;
>>> +	struct svc_sock		*bc_sock;	/* NFSv4.1 backchannel
> */
>>>  };
>>>
>>>  struct xprt_class {
>>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>>> index 5abab09..3dc847f 100644
>>> --- a/net/sunrpc/clnt.c
>>> +++ b/net/sunrpc/clnt.c
>>> @@ -266,6 +266,7 @@ struct rpc_clnt *rpc_create(struct
> rpc_create_args
>> *args)
>>>  		.srcaddr = args->saddress,
>>>  		.dstaddr = args->address,
>>>  		.addrlen = args->addrsize,
>>> +		.bc_sock = args->bc_sock,
>>>  	};
>>>  	char servername[48];
>>>
>>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>>> index 4e6d406..619764e 100644
>>> --- a/net/sunrpc/svcsock.c
>>> +++ b/net/sunrpc/svcsock.c
>>> @@ -49,6 +49,7 @@
>>>  #include <linux/sunrpc/msg_prot.h>
>>>  #include <linux/sunrpc/svcsock.h>
>>>  #include <linux/sunrpc/stats.h>
>>> +#include <linux/sunrpc/xprt.h>
>>>
>>>  #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
>>>
>>> @@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
> *rqstp)
>>>  	int		len;
>>>  	struct kvec *vec;
>>>  	int pnum, vlen;
>>> +	struct rpc_rqst *req = NULL;
>>>
>>>  	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
>>>  		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
>>> @@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct svc_rqst
>> *rqstp)
>>>  	len = svsk->sk_reclen;
>>>  	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
>>>
>>> +	/*
>>> +	 * We have enough data for the whole tcp record. Let's try and
> read
>> the
>>> +	 * first 8 bytes to get the xid and the call direction. We can
> use
>> this
>>> +	 * to figure out if this is a call or a reply to a callback. If
>>> +	 * sk_reclen is < 8 (xid and calldir), then this is a malformed
>> packet.
>>> +	 * In that case, don't bother with the calldir and just read the
>> data.
>>> +	 * It will be rejected in svc_process.
>>> +	 */
>>> +
>>>  	vec = rqstp->rq_vec;
>>>  	vec[0] = rqstp->rq_arg.head[0];
>>>  	vlen = PAGE_SIZE;
>>> +
>>> +	if (len >= 8) {
>>> +		u32 *p;
>>> +		u32 xid;
>>> +		u32 calldir;
>>> +
>>> +		len = svc_recvfrom(rqstp, vec, 1, 8);
>>> +		if (len < 0)
>>> +			goto error;
>>> +
>>> +		p = (u32 *)rqstp->rq_arg.head[0].iov_base;
>>> +		xid = *p++;
>>> +		calldir = *p;
>>> +
>>> +		if (calldir) {
>>> +			/* REPLY */
>>> +			if (svsk->sk_bc_xprt)
>>> +				req = xprt_lookup_rqst(svsk->sk_bc_xprt,
> xid);
>>> +			if (req) {
>>> +				memcpy(&req->rq_private_buf,
> &req->rq_rcv_buf,
>>> +					sizeof(struct xdr_buf));
>>> +				/* copy the xid and call direction */
>>> +
> memcpy(req->rq_private_buf.head[0].iov_base,
>>> +					rqstp->rq_arg.head[0].iov_base,
> 8);
>>> +				vec[0] = req->rq_private_buf.head[0];
>>> +			} else
>>> +				printk(KERN_NOTICE
>>> +					"%s: Got unrecognized reply: "
>>> +					"calldir 0x%x sk_bc_xprt %p xid
> %08x\n",
>>> +					__func__, ntohl(calldir),
>>> +					svsk->sk_bc_xprt, xid);
>>> +		}
>>> +
>>> +		if (!calldir || !req)
>>> +			vec[0] = rqstp->rq_arg.head[0];
>>> +
>>> +		vec[0].iov_base += 8;
>>> +		vec[0].iov_len -= 8;
>>> +		len = svsk->sk_reclen - 8;
>>> +		vlen -= 8;
>>> +	}
>>> +
>>>  	pnum = 1;
>>>  	while (vlen < len) {
>>> -		vec[pnum].iov_base =
> page_address(rqstp->rq_pages[pnum]);
>>> +		vec[pnum].iov_base = (req) ?
>>> +			page_address(req->rq_private_buf.pages[pnum -
> 1]) :
>>> +			page_address(rqstp->rq_pages[pnum]);
>>>  		vec[pnum].iov_len = PAGE_SIZE;
>>>  		pnum++;
>>>  		vlen += PAGE_SIZE;
>>> @@ -908,6 +963,16 @@ static int svc_tcp_recvfrom(struct svc_rqst
> *rqstp)
>>>  	if (len < 0)
>>>  		goto error;
>>>
>>> +	/*
>>> +	 * Account for the 8 bytes we read earlier
>>> +	 */
>>> +	len += 8;
>>> +
>>> +	if (req) {
>>> +		xprt_complete_rqst(req->rq_task, len);
>>> +		len = 0;
>>> +		goto out;
>>> +	}
>>>  	dprintk("svc: TCP complete record (%d bytes)\n", len);
>>>  	rqstp->rq_arg.len = len;
>>>  	rqstp->rq_arg.page_base = 0;
>>> @@ -921,6 +986,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
> *rqstp)
>>>  	rqstp->rq_xprt_ctxt   = NULL;
>>>  	rqstp->rq_prot	      = IPPROTO_TCP;
>>>
>>> +out:
>>>  	/* Reset TCP read info */
>>>  	svsk->sk_reclen = 0;
>>>  	svsk->sk_tcplen = 0;
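For illustration only (not from the patchset): after the 4-byte record marker, the first two 32-bit words of every RPC message on the wire are the XID and the call direction (0 = CALL, 1 = REPLY), which is exactly what the 8-byte peek in the hunk above inspects.  A minimal userspace sketch of that check, with assumed names:

#include <stdint.h>
#include <arpa/inet.h>

#define RPC_CALL	0	/* call direction values from the RPC protocol */
#define RPC_REPLY	1

/* Return nonzero if the 8 bytes at 'msg' start a REPLY (i.e. a backchannel
 * response rather than a new call), and hand back the XID in host order. */
static int rpc_msg_is_reply(const uint32_t *msg, uint32_t *xid_out)
{
	*xid_out = ntohl(msg[0]);		/* word 0: transaction id */
	return ntohl(msg[1]) == RPC_REPLY;	/* word 1: call direction */
}

A reply is then matched to a pending backchannel request via xprt_lookup_rqst(); anything else falls through to the normal svc_process() path, as in the code above.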
>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>> index a0bfe53..03f175e 100644
>>> --- a/net/sunrpc/xprt.c
>>> +++ b/net/sunrpc/xprt.c
>>> @@ -1015,6 +1015,27 @@ void xprt_release(struct rpc_task *task)
>>>  	spin_unlock(&xprt->reserve_lock);
>>>  }
>>>
>>> +/*
>>> + * The autoclose function for the back channel
>>> + *
>>> + * The callback channel should never close the channel,
>>> + * let the forechannel do that.
>>> + */
>>> +static void bc_autoclose(struct work_struct *work)
>>> +{
>>> +	return;
>>> +}
>>> +
>>> +
>>> +/*
>>> + * The autodisconnect routine for the back channel. We never
> disconnect
>>> + */
>>> +static void
>>> +bc_init_autodisconnect(unsigned long data)
>>> +{
>>> +	return;
>>> +}
>>> +
>>>  /**
>>>   * xprt_create_transport - create an RPC transport
>>>   * @args: rpc transport creation arguments
>>> @@ -1051,9 +1072,16 @@ found:
>>>
>>>  	INIT_LIST_HEAD(&xprt->free);
>>>  	INIT_LIST_HEAD(&xprt->recv);
>>> -	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
>>> -	setup_timer(&xprt->timer, xprt_init_autodisconnect,
>>> -			(unsigned long)xprt);
>>> +	if (args->bc_sock) {
>>> +		INIT_WORK(&xprt->task_cleanup, bc_autoclose);
>>> +		setup_timer(&xprt->timer, bc_init_autodisconnect,
>>> +			    (unsigned long)xprt);
>> Hrmph... Why do you need dummy routines here?
>>
>>> +	} else {
>>> +		INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
>>> +		setup_timer(&xprt->timer, xprt_init_autodisconnect,
>>> +			    (unsigned long)xprt);
>>> +	}
>>> +
>>>  	xprt->last_used = jiffies;
>>>  	xprt->cwnd = RPC_INITCWND;
>>>  	xprt->bind_index = 0;
>>> @@ -1073,6 +1101,13 @@ found:
>>>  	dprintk("RPC:       created transport %p with %u slots\n", xprt,
>>>  			xprt->max_reqs);
>>>
>>> +	/*
>>> +	 * Since we don't want connections for the backchannel, we set
>>> +	 * the xprt status to connected
>>> +	 */
>>> +	if (args->bc_sock)
>>> +		xprt_set_connected(xprt);
>>> +
>>>  	return xprt;
>>>  }
>>>
>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>>> index d40ff50..067d205 100644
>>> --- a/net/sunrpc/xprtsock.c
>>> +++ b/net/sunrpc/xprtsock.c
>>> @@ -32,6 +32,7 @@
>>>  #include <linux/tcp.h>
>>>  #include <linux/sunrpc/clnt.h>
>>>  #include <linux/sunrpc/sched.h>
>>> +#include <linux/sunrpc/svcsock.h>
>>>  #include <linux/sunrpc/xprtsock.h>
>>>  #include <linux/file.h>
>>>
>>> @@ -1966,6 +1967,219 @@ static void xs_tcp_print_stats(struct
> rpc_xprt
>> *xprt, struct seq_file *seq)
>>>  			xprt->stat.bklog_u);
>>>  }
>>>
>>> +/*
>>> + * The connect worker for the backchannel
>>> + * This should never be called as we should never need to connect
>>> + */
>>> +static void bc_connect_worker(struct work_struct *work)
>>> +{
>>> +	BUG();
>>> +}
>>> +
>>> +/*
>>> + * The set_port routine of the rpc_xprt_ops. This is related to the
>> portmapper
>>> + * and should never be called
>>> + */
>>> +
>>> +static void bc_set_port(struct rpc_xprt *xprt, unsigned short port)
>>> +{
>>> +	BUG();
>>> +}
>>> +
>>> +/*
>>> + * The connect routine for the backchannel rpc_xprt ops
>>> + * Again, should never be called!
>>> + */
>>> +
>>> +static void bc_connect(struct rpc_task *task)
>>> +{
>>> +	BUG();
>>> +}
>>> +
>>> +struct rpc_buffer {
>>> +	size_t	len;
>>> +	char	data[];
>>> +};
>>> +/*
>>> + * Allocate a bunch of pages for a scratch buffer for the rpc code.
> The
>> reason
>>> + * we allocate pages instead doing a kmalloc like rpc_malloc is
> because
>> we want
>>> + * to use the server side send routines.
>>> + */
>>> +void *bc_malloc(struct rpc_task *task, size_t size)
>>> +{
>>> +	struct page *page;
>>> +	struct rpc_buffer *buf;
>>> +
>>> +	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
>>> +	page = alloc_page(GFP_KERNEL);
>>> +
>>> +	if (!page)
>>> +		return NULL;
>>> +
>>> +	buf = page_address(page);
>>> +	buf->len = PAGE_SIZE;
>>> +
>>> +	return buf->data;
>>> +}
>>> +
>> __get_free_page()? Why can't you kmalloc() here?
>>
>>> +/*
>>> + * Free the space allocated in the bc_alloc routine
>>> + */
>>> +void bc_free(void *buffer)
>>> +{
>>> +	struct rpc_buffer *buf;
>>> +
>>> +	if (!buffer)
>>> +		return;
>>> +
>>> +	buf = container_of(buffer, struct rpc_buffer, data);
>>> +	free_pages((unsigned long)buf, get_order(buf->len));
>> This looks funky... Why can't you just call free_page()? You already
>> know from bc_malloc() that this is an order 0 page allocation.
>>
>>> +}
>>> +
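For what it's worth, a rough sketch of the kmalloc()-based pair being asked about above (illustrative only, helper names assumed; the patch keeps page-backed buffers so the server-side sendpage() path can be reused):

static void *bc_malloc_kmalloc(struct rpc_task *task, size_t size)
{
	struct rpc_buffer *buf;

	/* keep the length in front of the payload, as rpc_buffer does */
	buf = kmalloc(sizeof(*buf) + size, GFP_KERNEL);
	if (!buf)
		return NULL;
	buf->len = size;
	return buf->data;
}

static void bc_free_kfree(void *buffer)
{
	struct rpc_buffer *buf;

	if (!buffer)
		return;
	buf = container_of(buffer, struct rpc_buffer, data);
	kfree(buf);
}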
>>> +/*
>>> + * Use the svc_sock to send the callback. Must be called with svsk-
>>> sk_mutex
>>> + * held. Borrows heavily from svc_tcp_sendto and
> xs_tcp_semd_request.
>>> + */
>>> +static int bc_sendto(struct rpc_rqst *req)
>>> +{
>>> +	int total_len;
>>> +	int len;
>>> +	int size;
>>> +	int result;
>>> +	struct xdr_buf *xbufp = &req->rq_snd_buf;
>>> +	struct page **pages = xbufp->pages;
>>> +	unsigned int flags = MSG_MORE;
>>> +	unsigned int pglen = xbufp->page_len;
>>> +	size_t base = xbufp->page_base;
>>> +	struct rpc_xprt *xprt = req->rq_xprt;
>>> +	struct sock_xprt *transport =
>>> +				container_of(xprt, struct sock_xprt,
> xprt);
>>> +	struct socket *sock = transport->sock;
>>> +
>>> +	total_len = xbufp->len;
>>> +
>>> +	/*
>>> +	 * Set up the rpc header and record marker stuff
>>> +	 */
>>> +	xs_encode_tcp_record_marker(xbufp);
>>> +
>>> +	/*
>>> +	 * The RPC message is divided into 3 pieces:
>>> +	 * - The header: This is what most of the smaller RPC messages
>> consist
>>> +	 *   of. Often the whole message is in this.
>>> +	 *
>>> +	 *   - xdr->pages: This is a list of pages that contain data,
> for
>>> +	 *   example in a write request or while using rpcsec gss
>>> +	 *
>>> +	 *   - The tail: This is the rest of the rpc message
>>> +	 *
>>> +	 *  First we send the header, then the pages and then finally
> the
>> tail.
>>> +	 *  The code borrows heavily from svc_sendto.
>>> +	 */
>>> +
>>> +	/*
>>> +	 * Send the head
>>> +	 */
>>> +	if (total_len == xbufp->head[0].iov_len)
>>> +		flags = 0;
>>> +
>>> +	len = sock->ops->sendpage(sock, virt_to_page(xbufp-
>>> head[0].iov_base),
>>> +			(unsigned long)xbufp->head[0].iov_base &
> ~PAGE_MASK,
>>> +			xbufp->head[0].iov_len, flags);
>> Why do you need to do this? The head iovec is supposed to be reserved
>> for kmalloc()ed memory, which cannot be used together with sendpage().
>> Somebody, some day is going to mess up and try to put a kmalloced
> buffer
>> in here, and will wonder why the above doesn't work.
>>
>> If you are sending pages, then please put them in the page list part
> of
>> the xdr_buf. There is no rule that the RPC call _must_ have a non-zero
>> head.
>>
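To make the alternative above concrete, a sketch (assumed helper, not in the patch) of pushing a kmalloc()ed head with kernel_sendmsg() instead of ->sendpage():

static int bc_send_head(struct socket *sock, struct kvec *head, int more)
{
	struct msghdr msg = {
		.msg_flags = more ? MSG_MORE : 0,
	};

	/* kernel_sendmsg() copies from the kvec, so the head may live in
	 * ordinary kmalloc()ed memory, unlike ->sendpage(). */
	return kernel_sendmsg(sock, &msg, head, 1, head->iov_len);
}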
>>> +
>>> +	if (len != xbufp->head[0].iov_len)
>>> +		goto out;
>>> +
>>> +	/*
>>> +	 * send page data
>>> +	 *
>>> +	 * Check the amount of data to be sent. If it is less than the
>>> +	 * remaining page, then send it else send the current page
>>> +	 */
>>> +
>>> +	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
>>> +	while (pglen > 0) {
>>> +		if (total_len == size)
>>> +			flags = 0;
>>> +		result = sock->ops->sendpage(sock, *pages, base, size,
> flags);
>>> +		if (result > 0)
>>> +			len += result;
>>> +		if (result != size)
>>> +			goto out;
>>> +		total_len -= size;
>>> +		pglen -= size;
>>> +		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
>>> +		base = 0;
>>> +		pages++;
>>> +	}
>>> +	/*
>>> +	 * send tail
>>> +	 */
>>> +	if (xbufp->tail[0].iov_len) {
>>> +		result = sock->ops->sendpage(sock,
>>> +			xbufp->tail[0].iov_base,
>>> +			(unsigned long)xbufp->tail[0].iov_base &
> ~PAGE_MASK,
>>> +			xbufp->tail[0].iov_len,
>>> +			0);
>> Ditto.
>>
>>> +
>>> +		if (result > 0)
>>> +			len += result;
>>> +	}
>>> +out:
>>> +	if (len != xbufp->len)
>>> +		printk(KERN_NOTICE "Error sending entire callback!\n");
>>              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> Then what? Shouldn't you be closing the connection here?
>>
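One possible answer to the question above, sketched only (placement and name assumed, not from the patchset): a short send leaves the record stream out of sync, so the transport should be torn down rather than merely logged:

static void bc_handle_short_send(struct rpc_xprt *xprt, int sent, int expected)
{
	if (sent == expected)
		return;
	printk(KERN_NOTICE "RPC: backchannel short send (%d of %d bytes)\n",
	       sent, expected);
	/* drop the shared connection so client and server resynchronize */
	xprt_force_disconnect(xprt);
}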
>>> +
>>> +	return len;
>>> +}
>>> +
>>> +/*
>>> + * The send routine. Borrows from svc_send
>>> + */
>>> +static int bc_send_request(struct rpc_task *task)
>>> +{
>>> +	struct rpc_rqst *req = task->tk_rqstp;
>>> +	struct rpc_xprt *bc_xprt = req->rq_xprt;
>>> +	struct svc_xprt	*xprt;
>>> +	struct svc_sock         *svsk;
>>> +	u32                     len;
>>> +
>>> +	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
>>> +	/*
>>> +	 * Get the server socket associated with this callback xprt
>>> +	 */
>>> +	svsk = bc_xprt->bc_sock;
>>> +	xprt = &svsk->sk_xprt;
>>> +
>>> +	mutex_lock(&xprt->xpt_mutex);
>>           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>
>> Eh? What's this, in which patch is it defined, and why is it at all
>> needed?
>>
>>> +	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
>>                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
>> Where is this defined, and why is it needed? The xprt already has a
>> connected/unconnected flag.
>>
>>> +		len = -ENOTCONN;
>>> +	else
>>> +		len = bc_sendto(req);
>>> +	mutex_unlock(&xprt->xpt_mutex);
>>> +
>>> +	return 0;
>>> +
>>> +}
>>> +
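As a rough illustration of the point about the existing connected flag (sketch only, reusing names from the patch, not a drop-in replacement):

static int bc_send_request_sketch(struct rpc_task *task)
{
	struct rpc_rqst *req = task->tk_rqstp;

	/* the rpc_xprt already tracks connection state, so the svc-level
	 * XPT_DEAD test could become an xprt_connected() check; how the
	 * actual send is serialized is a separate question */
	if (!xprt_connected(req->rq_xprt))
		return -ENOTCONN;

	bc_sendto(req);
	return 0;
}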
>>> +/*
>>> + * The close routine. Since this is client initiated, we do nothing
>>> + */
>>> +
>>> +static void bc_close(struct rpc_xprt *xprt)
>>> +{
>>> +	return;
>>> +}
>>> +
>>> +/*
>>> + * The xprt destroy routine. Again, because this connection is
> client
>>> + * initiated, we do nothing
>>> + */
>>> +
>>> +static void bc_destroy(struct rpc_xprt *xprt)
>>> +{
>>> +	return;
>>> +}
>>> +
>>>  static struct rpc_xprt_ops xs_udp_ops = {
>>>  	.set_buffer_size	= xs_udp_set_buffer_size,
>>>  	.reserve_xprt		= xprt_reserve_xprt_cong,
>>> @@ -1999,6 +2213,24 @@ static struct rpc_xprt_ops xs_tcp_ops = {
>>>  	.print_stats		= xs_tcp_print_stats,
>>>  };
>>>
>>> +/*
>>> + * The rpc_xprt_ops for the server backchannel
>>> + */
>>> +
>>> +static struct rpc_xprt_ops bc_tcp_ops = {
>>> +	.reserve_xprt		= xprt_reserve_xprt,
>>> +	.release_xprt		= xprt_release_xprt,
>>> +	.set_port		= bc_set_port,
>>> +	.connect		= bc_connect,
>>> +	.buf_alloc		= bc_malloc,
>>> +	.buf_free		= bc_free,
>>> +	.send_request		= bc_send_request,
>>> +	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
>>> +	.close			= bc_close,
>>> +	.destroy		= bc_destroy,
>>> +	.print_stats		= xs_tcp_print_stats,
>>> +};
>>> +
>>>  static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
>>>  				      unsigned int slot_table_size)
>>>  {
>>> @@ -2131,13 +2363,29 @@ static struct rpc_xprt *xs_setup_tcp(struct
>> xprt_create *args)
>>>  	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
>>>  	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
>>>
>>> -	xprt->bind_timeout = XS_BIND_TO;
>>> -	xprt->connect_timeout = XS_TCP_CONN_TO;
>>> -	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
>>> -	xprt->idle_timeout = XS_IDLE_DISC_TO;
>>> +	if (args->bc_sock) {
>>> +		/* backchannel */
>>> +		xprt_set_bound(xprt);
>>> +		INIT_DELAYED_WORK(&transport->connect_worker,
>>> +				  bc_connect_worker);
>> Errm.... Is it really such a good idea to tell the RPC layer that it
> can
>> reconnect at any time using a routine that will BUG()?
>>
>>> +		xprt->bind_timeout = 0;
>>> +		xprt->connect_timeout = 0;
>>> +		xprt->reestablish_timeout = 0;
>>> +		xprt->idle_timeout = (~0);
>>>
>>> -	xprt->ops = &xs_tcp_ops;
>>> -	xprt->timeout = &xs_tcp_default_timeout;
>>> +		/*
>>> +		 * The backchannel uses the same socket connection as
> the
>>> +		 * forechannel
>>> +		 */
>>> +		xprt->bc_sock = args->bc_sock;
>>> +		xprt->bc_sock->sk_bc_xprt = xprt;
>>> +		transport->sock = xprt->bc_sock->sk_sock;
>>> +		transport->inet = xprt->bc_sock->sk_sk;
>>> +
>>> +		xprt->ops = &bc_tcp_ops;
>>> +
>>> +		goto next;
>>> +	}
>>>
>>>  	switch (addr->sa_family) {
>>>  	case AF_INET:
>>> @@ -2145,13 +2393,29 @@ static struct rpc_xprt *xs_setup_tcp(struct
>> xprt_create *args)
>>>  			xprt_set_bound(xprt);
>>>
>>>  		INIT_DELAYED_WORK(&transport->connect_worker,
>> xs_tcp_connect_worker4);
>>> -		xs_format_ipv4_peer_addresses(xprt, "tcp",
> RPCBIND_NETID_TCP);
>>>  		break;
>>>  	case AF_INET6:
>>>  		if (((struct sockaddr_in6 *)addr)->sin6_port !=
> htons(0))
>>>  			xprt_set_bound(xprt);
>>>
>>>  		INIT_DELAYED_WORK(&transport->connect_worker,
>> xs_tcp_connect_worker6);
>>> +		break;
>>> +	}
>>> +	xprt->bind_timeout = XS_BIND_TO;
>>> +	xprt->connect_timeout = XS_TCP_CONN_TO;
>>> +	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
>>> +	xprt->idle_timeout = XS_IDLE_DISC_TO;
>>> +
>>> +	xprt->ops = &xs_tcp_ops;
>>> +
>>> +next:
>>> +	xprt->timeout = &xs_tcp_default_timeout;
>>> +
>>> +	switch (addr->sa_family) {
>> Why do we suddenly need 2 switch statements here?
>>
>>> +	case AF_INET:
>>> +		xs_format_ipv4_peer_addresses(xprt, "tcp",
> RPCBIND_NETID_TCP);
>>> +		break;
>>> +	case AF_INET6:
>>>  		xs_format_ipv6_peer_addresses(xprt, "tcp",
> RPCBIND_NETID_TCP6);
>>>  		break;
>>>  	default:
>>
>> _______________________________________________
>> pNFS mailing list
>> pNFS@linux-nfs.org
>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling
       [not found]               ` <273FE88A07F5D445824060902F70034402030375-hX7t0kiaRRpT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
@ 2009-06-03  8:44                 ` Benny Halevy
  0 siblings, 0 replies; 29+ messages in thread
From: Benny Halevy @ 2009-06-03  8:44 UTC (permalink / raw)
  To: Labiaga, Ricardo
  Cc: Trond Myklebust, Adamson, Andy, J. Bruce Fields, pnfs, linux-nfs

On Jun. 03, 2009, 9:45 +0300, "Labiaga, Ricardo" <Ricardo.Labiaga@netapp.com> wrote:
> We can do that, but it will need to be a subset of the patches since a
> number of them are re-implementing the server backchannel callbacks as
> asynchronous RPCs.  That will require rebasing to bruce's 'for-2.6.31'. 
> Is a subset of the patches useful for the development branch?

If there are squashme's applicable to the development branch, then yes.
I'd like to keep them as close as possible, but don't bother too much
about that since 2.6.31 is right around the corner.

Benny

> 
> - ricardo
> 
> 
> -----Original Message-----
> From: Benny Halevy [mailto:bhalevy@panasas.com]
> Sent: Mon 6/1/2009 9:51 PM
> To: Labiaga, Ricardo
> Cc: Trond Myklebust; Adamson, Andy; J. Bruce Fields; pnfs@linux-nfs.org;
> linux-nfs@vger.kernel.org
> Subject: Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side
> backchannel handling
> 
> On Jun. 02, 2009, 3:33 +0300, "Labiaga, Ricardo"
> <Ricardo.Labiaga@netapp.com> wrote:
>> Trond, Bruce,
>>
>> Alexandros has coded a number of patches that address the issues raised
>> here by Trond.  Do you want the fixes squashed into the original patch
>> or do you want them submitted separately?
> 
> Can you please also send the unsquashed cleanup patches separately.
> In the given time frame before the upcoming Bakeathon I'm afraid
> we'd want to queue the squashed series for 2.6.31 in the
> nfs{,d}41-for-2.6.31 branches and commit the cleanup patches onto
> our 2.6.30 based development branches.
> 
> Benny
> 
>>
>> Thanks,
>>
>> - ricardo
>>
>>> -----Original Message-----
>>> From: Trond Myklebust [mailto:trond.myklebust@fys.uio.no]
>>> Sent: Thursday, April 30, 2009 5:05 PM
>>> To: Benny Halevy
>>> Cc: Adamson, Andy; J. Bruce Fields; pnfs@linux-nfs.org; linux-
>>> nfs@vger.kernel.org
>>> Subject: Re: [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side
>>> backchannel handling
>>>
>>> On Fri, 2009-05-01 at 02:05 +0300, Benny Halevy wrote:
>>>> From: Rahul Iyer <iyer@netapp.com>
>>>>
>>>> FIXME: bhalevy: write up commit message
>>>>
>>>> Signed-off-by: Rahul Iyer <iyer@netapp.com>
>>>> Signed-off-by: Mike Sager <sager@netapp.com>
>>>> Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
>>>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>>>
>>>> When the call direction is a reply, copy the xid and call direction
>> into
>>> the
>>>> req->rq_private_buf.head[0].iov_base otherwise rpc_verify_header
>> returns
>>>> rpc_garbage.
>>>>
>>>> Signed-off-by: Andy Adamson <andros@netapp.com>
>>>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>>> [get rid of CONFIG_NFSD_V4_1]
>>>> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>>> ---
>>>>  include/linux/sunrpc/clnt.h    |    1 +
>>>>  include/linux/sunrpc/svcsock.h |    1 +
>>>>  include/linux/sunrpc/xprt.h    |    2 +
>>>>  net/sunrpc/clnt.c              |    1 +
>>>>  net/sunrpc/svcsock.c           |   68 ++++++++++-
>>>>  net/sunrpc/xprt.c              |   41 ++++++-
>>>>  net/sunrpc/xprtsock.c          |  278
>>> +++++++++++++++++++++++++++++++++++++++-
>>>>  7 files changed, 381 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/include/linux/sunrpc/clnt.h
>> b/include/linux/sunrpc/clnt.h
>>>> index c39a210..cf9a8ec 100644
>>>> --- a/include/linux/sunrpc/clnt.h
>>>> +++ b/include/linux/sunrpc/clnt.h
>>>> @@ -110,6 +110,7 @@ struct rpc_create_args {
>>>>     rpc_authflavor_t        authflavor;
>>>>     unsigned long           flags;
>>>>     char                    *client_name;
>>>> +   struct svc_sock         *bc_sock;       /* NFSv4.1 backchannel
>> */
>>>>  };
>>>>
>>>>  /* Values for "flags" field */
>>>> diff --git a/include/linux/sunrpc/svcsock.h
>>> b/include/linux/sunrpc/svcsock.h
>>>> index 8271631..19228f4 100644
>>>> --- a/include/linux/sunrpc/svcsock.h
>>>> +++ b/include/linux/sunrpc/svcsock.h
>>>> @@ -28,6 +28,7 @@ struct svc_sock {
>>>>     /* private TCP part */
>>>>     u32                     sk_reclen;      /* length of record */
>>>>     u32                     sk_tcplen;      /* current read length
>> */
>>>> +   struct rpc_xprt        *sk_bc_xprt;     /* NFSv4.1 backchannel
>> xprt
>>> */
>>>>  };
>>>>
>>>>  /*
>>>> diff --git a/include/linux/sunrpc/xprt.h
>> b/include/linux/sunrpc/xprt.h
>>>> index 1758d9f..063a6a7 100644
>>>> --- a/include/linux/sunrpc/xprt.h
>>>> +++ b/include/linux/sunrpc/xprt.h
>>>> @@ -174,6 +174,7 @@ struct rpc_xprt {
>>>>     spinlock_t              reserve_lock;   /* lock slot table */
>>>>     u32                     xid;            /* Next XID value to use
>> */
>>>>     struct rpc_task *       snd_task;       /* Task blocked in send
>> */
>>>> +   struct svc_sock         *bc_sock;       /* NFSv4.1 backchannel
>> */
>>>>     struct list_head        recv;
>>>>
>>>>     struct {
>>>> @@ -197,6 +198,7 @@ struct xprt_create {
>>>>     struct sockaddr *       srcaddr;        /* optional local
>> address */
>>>>     struct sockaddr *       dstaddr;        /* remote peer address
>> */
>>>>     size_t                  addrlen;
>>>> +   struct svc_sock         *bc_sock;       /* NFSv4.1 backchannel
>> */
>>>>  };
>>>>
>>>>  struct xprt_class {
>>>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>>>> index 5abab09..3dc847f 100644
>>>> --- a/net/sunrpc/clnt.c
>>>> +++ b/net/sunrpc/clnt.c
>>>> @@ -266,6 +266,7 @@ struct rpc_clnt *rpc_create(struct
>> rpc_create_args
>>> *args)
>>>>             .srcaddr = args->saddress,
>>>>             .dstaddr = args->address,
>>>>             .addrlen = args->addrsize,
>>>> +           .bc_sock = args->bc_sock,
>>>>     };
>>>>     char servername[48];
>>>>
>>>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>>>> index 4e6d406..619764e 100644
>>>> --- a/net/sunrpc/svcsock.c
>>>> +++ b/net/sunrpc/svcsock.c
>>>> @@ -49,6 +49,7 @@
>>>>  #include <linux/sunrpc/msg_prot.h>
>>>>  #include <linux/sunrpc/svcsock.h>
>>>>  #include <linux/sunrpc/stats.h>
>>>> +#include <linux/sunrpc/xprt.h>
>>>>
>>>>  #define RPCDBG_FACILITY    RPCDBG_SVCXPRT
>>>>
>>>> @@ -825,6 +826,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
>> *rqstp)
>>>>     int             len;
>>>>     struct kvec *vec;
>>>>     int pnum, vlen;
>>>> +   struct rpc_rqst *req = NULL;
>>>>
>>>>     dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
>>>>             svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
>>>> @@ -891,12 +893,65 @@ static int svc_tcp_recvfrom(struct svc_rqst
>>> *rqstp)
>>>>     len = svsk->sk_reclen;
>>>>     set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
>>>>
>>>> +   /*
>>>> +    * We have enough data for the whole tcp record. Let's try and
>> read
>>> the
>>>> +    * first 8 bytes to get the xid and the call direction. We can
>> use
>>> this
>>>> +    * to figure out if this is a call or a reply to a callback. If
>>>> +    * sk_reclen is < 8 (xid and calldir), then this is a malformed
>>> packet.
>>>> +    * In that case, don't bother with the calldir and just read the
>>> data.
>>>> +    * It will be rejected in svc_process.
>>>> +    */
>>>> +
>>>>     vec = rqstp->rq_vec;
>>>>     vec[0] = rqstp->rq_arg.head[0];
>>>>     vlen = PAGE_SIZE;
>>>> +
>>>> +   if (len >= 8) {
>>>> +           u32 *p;
>>>> +           u32 xid;
>>>> +           u32 calldir;
>>>> +
>>>> +           len = svc_recvfrom(rqstp, vec, 1, 8);
>>>> +           if (len < 0)
>>>> +                   goto error;
>>>> +
>>>> +           p = (u32 *)rqstp->rq_arg.head[0].iov_base;
>>>> +           xid = *p++;
>>>> +           calldir = *p;
>>>> +
>>>> +           if (calldir) {
>>>> +                   /* REPLY */
>>>> +                   if (svsk->sk_bc_xprt)
>>>> +                           req = xprt_lookup_rqst(svsk->sk_bc_xprt,
>> xid);
>>>> +                   if (req) {
>>>> +                           memcpy(&req->rq_private_buf,
>> &req->rq_rcv_buf,
>>>> +                                   sizeof(struct xdr_buf));
>>>> +                           /* copy the xid and call direction */
>>>> +
>> memcpy(req->rq_private_buf.head[0].iov_base,
>>>> +                                   rqstp->rq_arg.head[0].iov_base,
>> 8);
>>>> +                           vec[0] = req->rq_private_buf.head[0];
>>>> +                   } else
>>>> +                           printk(KERN_NOTICE
>>>> +                                   "%s: Got unrecognized reply: "
>>>> +                                   "calldir 0x%x sk_bc_xprt %p xid
>> %08x\n",
>>>> +                                   __func__, ntohl(calldir),
>>>> +                                   svsk->sk_bc_xprt, xid);
>>>> +           }
>>>> +
>>>> +           if (!calldir || !req)
>>>> +                   vec[0] = rqstp->rq_arg.head[0];
>>>> +
>>>> +           vec[0].iov_base += 8;
>>>> +           vec[0].iov_len -= 8;
>>>> +           len = svsk->sk_reclen - 8;
>>>> +           vlen -= 8;
>>>> +   }
>>>> +
>>>>     pnum = 1;
>>>>     while (vlen < len) {
>>>> -           vec[pnum].iov_base =
>> page_address(rqstp->rq_pages[pnum]);
>>>> +           vec[pnum].iov_base = (req) ?
>>>> +                   page_address(req->rq_private_buf.pages[pnum -
>> 1]) :
>>>> +                   page_address(rqstp->rq_pages[pnum]);
>>>>             vec[pnum].iov_len = PAGE_SIZE;
>>>>             pnum++;
>>>>             vlen += PAGE_SIZE;
>>>> @@ -908,6 +963,16 @@ static int svc_tcp_recvfrom(struct svc_rqst
>> *rqstp)
>>>>     if (len < 0)
>>>>             goto error;
>>>>
>>>> +   /*
>>>> +    * Account for the 8 bytes we read earlier
>>>> +    */
>>>> +   len += 8;
>>>> +
>>>> +   if (req) {
>>>> +           xprt_complete_rqst(req->rq_task, len);
>>>> +           len = 0;
>>>> +           goto out;
>>>> +   }
>>>>     dprintk("svc: TCP complete record (%d bytes)\n", len);
>>>>     rqstp->rq_arg.len = len;
>>>>     rqstp->rq_arg.page_base = 0;
>>>> @@ -921,6 +986,7 @@ static int svc_tcp_recvfrom(struct svc_rqst
>> *rqstp)
>>>>     rqstp->rq_xprt_ctxt   = NULL;
>>>>     rqstp->rq_prot        = IPPROTO_TCP;
>>>>
>>>> +out:
>>>>     /* Reset TCP read info */
>>>>     svsk->sk_reclen = 0;
>>>>     svsk->sk_tcplen = 0;
>>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>>> index a0bfe53..03f175e 100644
>>>> --- a/net/sunrpc/xprt.c
>>>> +++ b/net/sunrpc/xprt.c
>>>> @@ -1015,6 +1015,27 @@ void xprt_release(struct rpc_task *task)
>>>>     spin_unlock(&xprt->reserve_lock);
>>>>  }
>>>>
>>>> +/*
>>>> + * The autoclose function for the back channel
>>>> + *
>>>> + * The callback channel should never close the channel,
>>>> + * let the forechannel do that.
>>>> + */
>>>> +static void bc_autoclose(struct work_struct *work)
>>>> +{
>>>> +   return;
>>>> +}
>>>> +
>>>> +
>>>> +/*
>>>> + * The autodisconnect routine for the back channel. We never
>> disconnect
>>>> + */
>>>> +static void
>>>> +bc_init_autodisconnect(unsigned long data)
>>>> +{
>>>> +   return;
>>>> +}
>>>> +
>>>>  /**
>>>>   * xprt_create_transport - create an RPC transport
>>>>   * @args: rpc transport creation arguments
>>>> @@ -1051,9 +1072,16 @@ found:
>>>>
>>>>     INIT_LIST_HEAD(&xprt->free);
>>>>     INIT_LIST_HEAD(&xprt->recv);
>>>> -   INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
>>>> -   setup_timer(&xprt->timer, xprt_init_autodisconnect,
>>>> -                   (unsigned long)xprt);
>>>> +   if (args->bc_sock) {
>>>> +           INIT_WORK(&xprt->task_cleanup, bc_autoclose);
>>>> +           setup_timer(&xprt->timer, bc_init_autodisconnect,
>>>> +                       (unsigned long)xprt);
>>> Hrmph... Why do you need dummy routines here?
>>>
>>>> +   } else {
>>>> +           INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
>>>> +           setup_timer(&xprt->timer, xprt_init_autodisconnect,
>>>> +                       (unsigned long)xprt);
>>>> +   }
>>>> +
>>>>     xprt->last_used = jiffies;
>>>>     xprt->cwnd = RPC_INITCWND;
>>>>     xprt->bind_index = 0;
>>>> @@ -1073,6 +1101,13 @@ found:
>>>>     dprintk("RPC:       created transport %p with %u slots\n", xprt,
>>>>                     xprt->max_reqs);
>>>>
>>>> +   /*
>>>> +    * Since we don't want connections for the backchannel, we set
>>>> +    * the xprt status to connected
>>>> +    */
>>>> +   if (args->bc_sock)
>>>> +           xprt_set_connected(xprt);
>>>> +
>>>>     return xprt;
>>>>  }
>>>>
>>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>>>> index d40ff50..067d205 100644
>>>> --- a/net/sunrpc/xprtsock.c
>>>> +++ b/net/sunrpc/xprtsock.c
>>>> @@ -32,6 +32,7 @@
>>>>  #include <linux/tcp.h>
>>>>  #include <linux/sunrpc/clnt.h>
>>>>  #include <linux/sunrpc/sched.h>
>>>> +#include <linux/sunrpc/svcsock.h>
>>>>  #include <linux/sunrpc/xprtsock.h>
>>>>  #include <linux/file.h>
>>>>
>>>> @@ -1966,6 +1967,219 @@ static void xs_tcp_print_stats(struct
>> rpc_xprt
>>> *xprt, struct seq_file *seq)
>>>>                     xprt->stat.bklog_u);
>>>>  }
>>>>
>>>> +/*
>>>> + * The connect worker for the backchannel
>>>> + * This should never be called as we should never need to connect
>>>> + */
>>>> +static void bc_connect_worker(struct work_struct *work)
>>>> +{
>>>> +   BUG();
>>>> +}
>>>> +
>>>> +/*
>>>> + * The set_port routine of the rpc_xprt_ops. This is related to the
>>> portmapper
>>>> + * and should never be called
>>>> + */
>>>> +
>>>> +static void bc_set_port(struct rpc_xprt *xprt, unsigned short port)
>>>> +{
>>>> +   BUG();
>>>> +}
>>>> +
>>>> +/*
>>>> + * The connect routine for the backchannel rpc_xprt ops
>>>> + * Again, should never be called!
>>>> + */
>>>> +
>>>> +static void bc_connect(struct rpc_task *task)
>>>> +{
>>>> +   BUG();
>>>> +}
>>>> +
>>>> +struct rpc_buffer {
>>>> +   size_t  len;
>>>> +   char    data[];
>>>> +};
>>>> +/*
>>>> + * Allocate a bunch of pages for a scratch buffer for the rpc code.
>> The
>>> reason
>>>> + * we allocate pages instead doing a kmalloc like rpc_malloc is
>> because
>>> we want
>>>> + * to use the server side send routines.
>>>> + */
>>>> +void *bc_malloc(struct rpc_task *task, size_t size)
>>>> +{
>>>> +   struct page *page;
>>>> +   struct rpc_buffer *buf;
>>>> +
>>>> +   BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
>>>> +   page = alloc_page(GFP_KERNEL);
>>>> +
>>>> +   if (!page)
>>>> +           return NULL;
>>>> +
>>>> +   buf = page_address(page);
>>>> +   buf->len = PAGE_SIZE;
>>>> +
>>>> +   return buf->data;
>>>> +}
>>>> +
>>> __get_free_page()? Why can't you kmalloc() here?
>>>
>>>> +/*
>>>> + * Free the space allocated in the bc_alloc routine
>>>> + */
>>>> +void bc_free(void *buffer)
>>>> +{
>>>> +   struct rpc_buffer *buf;
>>>> +
>>>> +   if (!buffer)
>>>> +           return;
>>>> +
>>>> +   buf = container_of(buffer, struct rpc_buffer, data);
>>>> +   free_pages((unsigned long)buf, get_order(buf->len));
>>> This looks funky... Why can't you just call free_page()? You already
>>> know from bc_malloc() that this is an order 0 page allocation.
>>>
>>>> +}
>>>> +
>>>> +/*
>>>> + * Use the svc_sock to send the callback. Must be called with svsk-
>>>> sk_mutex
>>>> + * held. Borrows heavily from svc_tcp_sendto and
>> xs_tcp_semd_request.
>>>> + */
>>>> +static int bc_sendto(struct rpc_rqst *req)
>>>> +{
>>>> +   int total_len;
>>>> +   int len;
>>>> +   int size;
>>>> +   int result;
>>>> +   struct xdr_buf *xbufp = &req->rq_snd_buf;
>>>> +   struct page **pages = xbufp->pages;
>>>> +   unsigned int flags = MSG_MORE;
>>>> +   unsigned int pglen = xbufp->page_len;
>>>> +   size_t base = xbufp->page_base;
>>>> +   struct rpc_xprt *xprt = req->rq_xprt;
>>>> +   struct sock_xprt *transport =
>>>> +                           container_of(xprt, struct sock_xprt,
>> xprt);
>>>> +   struct socket *sock = transport->sock;
>>>> +
>>>> +   total_len = xbufp->len;
>>>> +
>>>> +   /*
>>>> +    * Set up the rpc header and record marker stuff
>>>> +    */
>>>> +   xs_encode_tcp_record_marker(xbufp);
>>>> +
>>>> +   /*
>>>> +    * The RPC message is divided into 3 pieces:
>>>> +    * - The header: This is what most of the smaller RPC messages
>>> consist
>>>> +    *   of. Often the whole message is in this.
>>>> +    *
>>>> +    *   - xdr->pages: This is a list of pages that contain data,
>> for
>>>> +    *   example in a write request or while using rpcsec gss
>>>> +    *
>>>> +    *   - The tail: This is the rest of the rpc message
>>>> +    *
>>>> +    *  First we send the header, then the pages and then finally
>> the
>>> tail.
>>>> +    *  The code borrows heavily from svc_sendto.
>>>> +    */
>>>> +
>>>> +   /*
>>>> +    * Send the head
>>>> +    */
>>>> +   if (total_len == xbufp->head[0].iov_len)
>>>> +           flags = 0;
>>>> +
>>>> +   len = sock->ops->sendpage(sock, virt_to_page(xbufp-
>>>> head[0].iov_base),
>>>> +                   (unsigned long)xbufp->head[0].iov_base &
>> ~PAGE_MASK,
>>>> +                   xbufp->head[0].iov_len, flags);
>>> Why do you need to do this? The head iovec is supposed to be reserved
>>> for kmalloc()ed memory, which cannot be used together with sendpage().
>>> Somebody, some day is going to mess up and try to put a kmalloced
>> buffer
>>> in here, and will wonder why the above doesn't work.
>>>
>>> If you are sending pages, then please put them in the page list part
>> of
>>> the xdr_buf. There is no rule that the RPC call _must_ have a non-zero
>>> head.
>>>
>>>> +
>>>> +   if (len != xbufp->head[0].iov_len)
>>>> +           goto out;
>>>> +
>>>> +   /*
>>>> +    * send page data
>>>> +    *
>>>> +    * Check the amount of data to be sent. If it is less than the
>>>> +    * remaining page, then send it else send the current page
>>>> +    */
>>>> +
>>>> +   size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
>>>> +   while (pglen > 0) {
>>>> +           if (total_len == size)
>>>> +                   flags = 0;
>>>> +           result = sock->ops->sendpage(sock, *pages, base, size,
>> flags);
>>>> +           if (result > 0)
>>>> +                   len += result;
>>>> +           if (result != size)
>>>> +                   goto out;
>>>> +           total_len -= size;
>>>> +           pglen -= size;
>>>> +           size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
>>>> +           base = 0;
>>>> +           pages++;
>>>> +   }
>>>> +   /*
>>>> +    * send tail
>>>> +    */
>>>> +   if (xbufp->tail[0].iov_len) {
>>>> +           result = sock->ops->sendpage(sock,
>>>> +                   xbufp->tail[0].iov_base,
>>>> +                   (unsigned long)xbufp->tail[0].iov_base &
>> ~PAGE_MASK,
>>>> +                   xbufp->tail[0].iov_len,
>>>> +                   0);
>>> Ditto.
>>>
>>>> +
>>>> +           if (result > 0)
>>>> +                   len += result;
>>>> +   }
>>>> +out:
>>>> +   if (len != xbufp->len)
>>>> +           printk(KERN_NOTICE "Error sending entire callback!\n");
>>>              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>> Then what? Shouldn't you be closing the connection here?
>>>
>>>> +
>>>> +   return len;
>>>> +}
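
The three pieces described in the comment at the top of bc_sendto() correspond
to the fields of struct xdr_buf; an abridged sketch of its definition (from
include/linux/sunrpc/xdr.h of roughly this vintage, fields trimmed and comments
paraphrased) for reference:

	struct xdr_buf {
		struct kvec	head[1];	/* RPC header + non-page data    */
		struct kvec	tail[1];	/* appended after the page data  */

		struct page   **pages;		/* array of contiguous pages     */
		unsigned int	page_base;	/* offset of data in first page  */
		unsigned int	page_len;	/* length of the page data       */

		unsigned int	buflen;		/* total size of the buffer      */
		unsigned int	len;		/* length of the encoded message */
	};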
>>>> +
>>>> +/*
>>>> + * The send routine. Borrows from svc_send
>>>> + */
>>>> +static int bc_send_request(struct rpc_task *task)
>>>> +{
>>>> +   struct rpc_rqst *req = task->tk_rqstp;
>>>> +   struct rpc_xprt *bc_xprt = req->rq_xprt;
>>>> +   struct svc_xprt *xprt;
>>>> +   struct svc_sock         *svsk;
>>>> +   u32                     len;
>>>> +
>>>> +   dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
>>>> +   /*
>>>> +    * Get the server socket associated with this callback xprt
>>>> +    */
>>>> +   svsk = bc_xprt->bc_sock;
>>>> +   xprt = &svsk->sk_xprt;
>>>> +
>>>> +   mutex_lock(&xprt->xpt_mutex);
>>>           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>>
>>> Eh? What's this, in which patch is it defined, and why is it at all
>>> needed?
>>>
>>>> +   if (test_bit(XPT_DEAD, &xprt->xpt_flags))
>>>                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
>>> Where is this defined, and why is it needed? The xprt already has a
>>> connected/unconnected flag.
>>>
>>>> +           len = -ENOTCONN;
>>>> +   else
>>>> +           len = bc_sendto(req);
>>>> +   mutex_unlock(&xprt->xpt_mutex);
>>>> +
>>>> +   return 0;
>>>> +
>>>> +}
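
As a hedged sketch of where the two questions above seem to point, the check
could lean on the generic rpc_xprt connection state instead of the server-side
svc_xprt flags; this is only an illustration, not the patchset's code:

	static int bc_send_request(struct rpc_task *task)
	{
		struct rpc_rqst *req = task->tk_rqstp;

		/* Use the transport's own connected bit rather than XPT_DEAD. */
		if (!xprt_connected(req->rq_xprt))
			return -ENOTCONN;

		return bc_sendto(req);
	}

Serializing backchannel sends against forward-channel replies on the shared
socket would still have to be handled by whatever locking the send path keeps.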
>>>> +
>>>> +/*
>>>> + * The close routine. Since this is client initiated, we do nothing
>>>> + */
>>>> +
>>>> +static void bc_close(struct rpc_xprt *xprt)
>>>> +{
>>>> +   return;
>>>> +}
>>>> +
>>>> +/*
>>>> + * The xprt destroy routine. Again, because this connection is client
>>>> + * initiated, we do nothing
>>>> + */
>>>> +
>>>> +static void bc_destroy(struct rpc_xprt *xprt)
>>>> +{
>>>> +   return;
>>>> +}
>>>> +
>>>>  static struct rpc_xprt_ops xs_udp_ops = {
>>>>     .set_buffer_size        = xs_udp_set_buffer_size,
>>>>     .reserve_xprt           = xprt_reserve_xprt_cong,
>>>> @@ -1999,6 +2213,24 @@ static struct rpc_xprt_ops xs_tcp_ops = {
>>>>     .print_stats            = xs_tcp_print_stats,
>>>>  };
>>>>
>>>> +/*
>>>> + * The rpc_xprt_ops for the server backchannel
>>>> + */
>>>> +
>>>> +static struct rpc_xprt_ops bc_tcp_ops = {
>>>> +   .reserve_xprt           = xprt_reserve_xprt,
>>>> +   .release_xprt           = xprt_release_xprt,
>>>> +   .set_port               = bc_set_port,
>>>> +   .connect                = bc_connect,
>>>> +   .buf_alloc              = bc_malloc,
>>>> +   .buf_free               = bc_free,
>>>> +   .send_request           = bc_send_request,
>>>> +   .set_retrans_timeout    = xprt_set_retrans_timeout_def,
>>>> +   .close                  = bc_close,
>>>> +   .destroy                = bc_destroy,
>>>> +   .print_stats            = xs_tcp_print_stats,
>>>> +};
>>>> +
>>>>  static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
>>>>                                   unsigned int slot_table_size)
>>>>  {
>>>> @@ -2131,13 +2363,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
>>>>     xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
>>>>     xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
>>>>
>>>> -   xprt->bind_timeout = XS_BIND_TO;
>>>> -   xprt->connect_timeout = XS_TCP_CONN_TO;
>>>> -   xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
>>>> -   xprt->idle_timeout = XS_IDLE_DISC_TO;
>>>> +   if (args->bc_sock) {
>>>> +           /* backchannel */
>>>> +           xprt_set_bound(xprt);
>>>> +           INIT_DELAYED_WORK(&transport->connect_worker,
>>>> +                             bc_connect_worker);
>>> Errm.... Is it really such a good idea to tell the RPC layer that it can
>>> reconnect at any time using a routine that will BUG()?
>>>
>>>> +           xprt->bind_timeout = 0;
>>>> +           xprt->connect_timeout = 0;
>>>> +           xprt->reestablish_timeout = 0;
>>>> +           xprt->idle_timeout = (~0);
>>>>
>>>> -   xprt->ops = &xs_tcp_ops;
>>>> -   xprt->timeout = &xs_tcp_default_timeout;
>>>> +           /*
>>>> +            * The backchannel uses the same socket connection as the
>>>> +            * forechannel
>>>> +            */
>>>> +           xprt->bc_sock = args->bc_sock;
>>>> +           xprt->bc_sock->sk_bc_xprt = xprt;
>>>> +           transport->sock = xprt->bc_sock->sk_sock;
>>>> +           transport->inet = xprt->bc_sock->sk_sk;
>>>> +
>>>> +           xprt->ops = &bc_tcp_ops;
>>>> +
>>>> +           goto next;
>>>> +   }
>>>>
>>>>     switch (addr->sa_family) {
>>>>     case AF_INET:
>>>> @@ -2145,13 +2393,29 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
>>>>                     xprt_set_bound(xprt);
>>>>
>>>>             INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
>>>> -           xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
>>>>             break;
>>>>     case AF_INET6:
>>>>             if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
>>>>                     xprt_set_bound(xprt);
>>>>
>>>>             INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
>>>> +           break;
>>>> +   }
>>>> +   xprt->bind_timeout = XS_BIND_TO;
>>>> +   xprt->connect_timeout = XS_TCP_CONN_TO;
>>>> +   xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
>>>> +   xprt->idle_timeout = XS_IDLE_DISC_TO;
>>>> +
>>>> +   xprt->ops = &xs_tcp_ops;
>>>> +
>>>> +next:
>>>> +   xprt->timeout = &xs_tcp_default_timeout;
>>>> +
>>>> +   switch (addr->sa_family) {
>>> Why do we suddenly need 2 switch statements here?
>>>
>>>> +   case AF_INET:
>>>> +           xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
>>>> +           break;
>>>> +   case AF_INET6:
>>>>             xs_format_ipv6_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
>>>>             break;
>>>>     default:
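
Purely as a sketch of the single-switch shape the question above hints at (not
part of this series), the connect-worker setup could stay inside one
address-family switch and simply be skipped for the backchannel case:

	switch (addr->sa_family) {
	case AF_INET:
		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
			xprt_set_bound(xprt);
		if (!args->bc_sock)
			INIT_DELAYED_WORK(&transport->connect_worker,
					  xs_tcp_connect_worker4);
		xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
		break;
	case AF_INET6:
		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
			xprt_set_bound(xprt);
		if (!args->bc_sock)
			INIT_DELAYED_WORK(&transport->connect_worker,
					  xs_tcp_connect_worker6);
		xs_format_ipv6_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
		break;
	default:
		/* unsupported address family, unchanged from the existing code */
		break;
	}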
>>>
>>> _______________________________________________
>>> pNFS mailing list
>>> pNFS@linux-nfs.org
>>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [RFC 0/10] nfsd41 server backchannel for 2.6.31
@ 2009-05-20  2:07 Labiaga, Ricardo
  0 siblings, 0 replies; 29+ messages in thread
From: Labiaga, Ricardo @ 2009-05-20  2:07 UTC (permalink / raw)
  To: bfields; +Cc: linux-nfs, pnfs, andros, batsakis, bhalevy

[resend with extended cc list]

Bruce,

We've reworked the v4.1 backchannel patches to use asynchronous RPCs.
We've also addressed the comments you made on the previous series (RFC
03, RFC 04, and RFC 05).  We are still working to address Trond's
comments for RFC 03, but do not want to hold off the review of the rest
of the patches.  Can you please consider the set for inclusion in
2.6.31, with the note that an update to RFC 03 will be coming soon?

[RFC 01/11] nfsd: cleanup nfs4.0 callback encode routines
[RFC 02/11] nfsd: minorversion support for the back channel
[RFC 03/11] nfsd41: sunrpc: svc_tcp_recv_record()
[RFC 04/11] nfsd41: sunrpc: Added rpc server-side backchannel handling
[RFC 05/11] nfsd41: callback infrastructure
[RFC 06/11] nfsd41: Backchannel: Add sequence arguments to callback RPC
arguments
[RFC 07/11] nfsd41: Backchannel: Server backchannel RPC wait queue
[RFC 08/11] nfsd41: Backchannel: Setup sequence information
[RFC 09/11] nfsd41: cb_sequence callback
[RFC 10/11] nfsd41: Backchannel: Implement cb_recall over NFSv4.1
[RFC 11/11] nfsd41: Refactor create_client()

Thanks,

- ricardo

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2009-06-03  8:44 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-30 23:00 [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
2009-04-30 23:05 ` [RFC 01/10] nfsd: cleanup nfs4.0 callback encode routines Benny Halevy
2009-04-30 23:05 ` [RFC 02/10] nfsd: minorversion support for the back channel Benny Halevy
2009-04-30 23:05 ` [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Benny Halevy
2009-05-01  0:05   ` [pnfs] " Trond Myklebust
     [not found]     ` <1241136328.15476.124.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org>
2009-05-01  0:13       ` Labiaga, Ricardo
2009-06-02  0:33       ` Labiaga, Ricardo
     [not found]         ` <273FE88A07F5D445824060902F70034405FE3129-hX7t0kiaRRpT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
2009-06-02  0:52           ` J. Bruce Fields
2009-06-02  1:24             ` [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-sidebackchannel handling Labiaga, Ricardo
2009-06-02  4:51           ` [pnfs] [RFC 03/10] nfsd41: sunrpc: Added rpc server-side backchannel handling Benny Halevy
     [not found]             ` <273FE88A07F5D445824060902F70034402030375@SACMVEXC1-PRD.hq.netapp.com>
     [not found]               ` <273FE88A07F5D445824060902F70034402030375-hX7t0kiaRRpT+ZUat5FNkAK/GNPrWCqfQQ4Iyu8u01E@public.gmane.org>
2009-06-03  8:44                 ` Benny Halevy
2009-05-03 20:36   ` J. Bruce Fields
2009-04-30 23:06 ` [RFC 04/10] nfsd41: Remember the auth flavor to use for callbacks Benny Halevy
2009-05-03 20:42   ` J. Bruce Fields
2009-05-05  2:51     ` [RFC 04/10] nfsd41: Remember the auth flavor to use forcallbacks Labiaga, Ricardo
2009-04-30 23:06 ` [RFC 05/10] nfsd41: callback infrastructure Benny Halevy
2009-05-03 20:49   ` J. Bruce Fields
2009-04-30 23:06 ` [RFC 06/10] nfsd41: Backchannel: Add sequence arguments to callback RPC arguments Benny Halevy
2009-04-30 23:06 ` [RFC 07/10] nfsd41: Backchannel: Server backchannel RPC wait queue Benny Halevy
2009-04-30 23:06 ` [RFC 08/10] nfsd41: Backchannel: Setup sequence information Benny Halevy
2009-04-30 23:06 ` [RFC 09/10] nfsd41: cb_sequence callback Benny Halevy
2009-04-30 23:52   ` [pnfs] " Trond Myklebust
     [not found]     ` <1241135565.15476.111.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org>
2009-05-01  8:33       ` Benny Halevy
2009-04-30 23:07 ` [RFC 10/10] nfsd41: cb_recall callback Benny Halevy
2009-04-30 23:12 ` [pnfs] [RFC 0/10] nfsd41 server backchannel for 2.6.31 Benny Halevy
2009-05-03 20:53 ` J. Bruce Fields
2009-05-06  4:11   ` Labiaga, Ricardo
2009-05-06 21:24     ` J. Bruce Fields
2009-05-20  2:07 Labiaga, Ricardo
