All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/6] nfs: renewd fixes
@ 2010-02-03  0:06 Alexandros Batsakis
  2010-02-03  0:06 ` [PATCH 1/6] nfs: kill renewd before clearing client minor version Alexandros Batsakis
  0 siblings, 1 reply; 15+ messages in thread
From: Alexandros Batsakis @ 2010-02-03  0:06 UTC (permalink / raw)
  To: linux-nfs; +Cc: trond, Alexandros Batsakis

This set of patches is yet another attempt to fix a couple of issues with renewd.
Patch 1/6 makes sure that renewd is synchronously killed before sending a destroy_session to the server.
Patch 2/6 schedules new renewd request only if a previous one returns (error or success).
Patch 3-4/6 fix the umount race by forcing renewd to take/put a client reference, so access to nfs_free_client is serialized.
Patch 5/6 sets a timeout (similar to timeo) for the nfs_client->rpcclient that equals the server advertised lease_time. This ensures that NFS requests fail (so they can be retried) within a lease period.
Patch 6/6 makes the internal timeouts of the RPC layer sensitive to the major timeout value; IOW there is no point in sleeping waiting for an event for longer than the remaining time before a major timeout.


Alexandros Batsakis (6):
  nfs: kill renewd before clearing client minor version
  nfs: prevent backlogging of renewd requests
  nfs41: fix race between umount and renewd sequence operations
  nfs4: fix race between umount and renewd renew operations
  nfs4: adjust rpc timeout for nfs_client rpc client based on the
    lease_time
  RPC: adjust timeout for connect, bind, restablish so that they
    sensitive to the major time out value

 fs/nfs/client.c       |   45 +++++++++++++++++++++----------------------
 fs/nfs/nfs4proc.c     |   50 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/nfs4renewd.c   |   19 ++++++-----------
 net/sunrpc/clnt.c     |    2 +-
 net/sunrpc/sunrpc.h   |    2 +
 net/sunrpc/xprt.c     |   18 ++++++++++++++--
 net/sunrpc/xprtsock.c |   12 +++++++++-
 7 files changed, 105 insertions(+), 43 deletions(-)


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/6] nfs: kill renewd before clearing client minor version
  2010-02-03  0:06 [PATCH 0/6] nfs: renewd fixes Alexandros Batsakis
@ 2010-02-03  0:06 ` Alexandros Batsakis
  2010-02-03  0:06   ` [PATCH 2/6] nfs: prevent backlogging of renewd requests Alexandros Batsakis
  0 siblings, 1 reply; 15+ messages in thread
From: Alexandros Batsakis @ 2010-02-03  0:06 UTC (permalink / raw)
  To: linux-nfs; +Cc: trond, Alexandros Batsakis

Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
---
 fs/nfs/client.c |   45 ++++++++++++++++++++++-----------------------
 1 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d0b060a..f712e52 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -164,17 +164,20 @@ error_0:
 	return ERR_PTR(err);
 }
 
-static void nfs4_shutdown_client(struct nfs_client *clp)
+/*
+ * Clears/puts all minor version specific parts from an nfs_client struct
+ * reverting it to minorversion 0.
+ */
+static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 {
-#ifdef CONFIG_NFS_V4
-	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
-		nfs4_kill_renewd(clp);
-	BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
-	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
-		nfs_idmap_delete(clp);
+#ifdef CONFIG_NFS_V4_1
+	if (nfs4_has_session(clp)) {
+		nfs4_destroy_session(clp->cl_session);
+		clp->cl_session = NULL;
+	}
 
-	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
-#endif
+	clp->cl_call_sync = _nfs4_call_sync;
+#endif /* CONFIG_NFS_V4_1 */
 }
 
 /*
@@ -188,22 +191,18 @@ static void nfs4_destroy_callback(struct nfs_client *clp)
 #endif /* CONFIG_NFS_V4 */
 }
 
-/*
- * Clears/puts all minor version specific parts from an nfs_client struct
- * reverting it to minorversion 0.
- */
-static void nfs4_clear_client_minor_version(struct nfs_client *clp)
+static void nfs4_shutdown_client(struct nfs_client *clp)
 {
-#ifdef CONFIG_NFS_V4_1
-	if (nfs4_has_session(clp)) {
-		nfs4_destroy_session(clp->cl_session);
-		clp->cl_session = NULL;
-	}
-
-	clp->cl_call_sync = _nfs4_call_sync;
-#endif /* CONFIG_NFS_V4_1 */
-
+#ifdef CONFIG_NFS_V4
+	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
+		nfs4_kill_renewd(clp);
+	nfs4_clear_client_minor_version(clp);
 	nfs4_destroy_callback(clp);
+	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
+		nfs_idmap_delete(clp);
+
+	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
+#endif
 }
 
 /*
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/6] nfs: prevent backlogging of renewd requests
  2010-02-03  0:06 ` [PATCH 1/6] nfs: kill renewd before clearing client minor version Alexandros Batsakis
@ 2010-02-03  0:06   ` Alexandros Batsakis
  2010-02-03  0:06     ` [PATCH 3/6] nfs41: fix race between umount and renewd sequence operations Alexandros Batsakis
  0 siblings, 1 reply; 15+ messages in thread
From: Alexandros Batsakis @ 2010-02-03  0:06 UTC (permalink / raw)
  To: linux-nfs; +Cc: trond, Alexandros Batsakis

Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
---
 fs/nfs/nfs4proc.c   |    3 +++
 fs/nfs/nfs4renewd.c |   19 +++++++------------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 84b53d3..ce44b5a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3162,6 +3162,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
 	if (time_before(clp->cl_last_renewal,timestamp))
 		clp->cl_last_renewal = timestamp;
 	spin_unlock(&clp->cl_lock);
+	nfs4_schedule_state_renewal(clp);
 }
 
 static const struct rpc_call_ops nfs4_renew_ops = {
@@ -5040,6 +5041,8 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 	}
 	dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
 
+	nfs4_schedule_state_renewal(clp);
+
 	kfree(task->tk_msg.rpc_argp);
 	kfree(task->tk_msg.rpc_resp);
 
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 0156c01..eec481b 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,7 +63,7 @@ nfs4_renew_state(struct work_struct *work)
 	struct nfs_client *clp =
 		container_of(work, struct nfs_client, cl_renewd.work);
 	struct rpc_cred *cred;
-	long lease, timeout;
+	long lease;
 	unsigned long last, now;
 
 	ops = nfs4_state_renewal_ops[clp->cl_minorversion];
@@ -75,7 +75,6 @@ nfs4_renew_state(struct work_struct *work)
 	lease = clp->cl_lease_time;
 	last = clp->cl_last_renewal;
 	now = jiffies;
-	timeout = (2 * lease) / 3 + (long)last - (long)now;
 	/* Are we close to a lease timeout? */
 	if (time_after(now, last + lease/3)) {
 		cred = ops->get_state_renewal_cred_locked(clp);
@@ -90,19 +89,15 @@ nfs4_renew_state(struct work_struct *work)
 			/* Queue an asynchronous RENEW. */
 			ops->sched_state_renewal(clp, cred);
 			put_rpccred(cred);
+			goto out_exp;
 		}
-		timeout = (2 * lease) / 3;
-		spin_lock(&clp->cl_lock);
-	} else
+	} else {
 		dprintk("%s: failed to call renewd. Reason: lease not expired \n",
 				__func__);
-	if (timeout < 5 * HZ)    /* safeguard */
-		timeout = 5 * HZ;
-	dprintk("%s: requeueing work. Lease period = %ld\n",
-			__func__, (timeout + HZ - 1) / HZ);
-	cancel_delayed_work(&clp->cl_renewd);
-	schedule_delayed_work(&clp->cl_renewd, timeout);
-	spin_unlock(&clp->cl_lock);
+		spin_unlock(&clp->cl_lock);
+	}
+	nfs4_schedule_state_renewal(clp);
+out_exp:
 	nfs_expire_unreferenced_delegations(clp);
 out:
 	dprintk("%s: done\n", __func__);
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/6] nfs41: fix race between umount and renewd sequence operations
  2010-02-03  0:06   ` [PATCH 2/6] nfs: prevent backlogging of renewd requests Alexandros Batsakis
@ 2010-02-03  0:06     ` Alexandros Batsakis
  2010-02-03  0:06       ` [PATCH 4/6] nfs4: fix race between umount and renewd renew operations Alexandros Batsakis
  0 siblings, 1 reply; 15+ messages in thread
From: Alexandros Batsakis @ 2010-02-03  0:06 UTC (permalink / raw)
  To: linux-nfs; +Cc: trond, Alexandros Batsakis

Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
---
 fs/nfs/nfs4proc.c |   23 +++++++++++++++++++----
 1 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ce44b5a..87fd43a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -419,7 +419,8 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 			clp->cl_last_renewal = timestamp;
 		spin_unlock(&clp->cl_lock);
 		/* Check sequence flags */
-		nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+		if (atomic_read(&clp->cl_count) > 1)
+			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
 	}
 out:
 	/* The session may be reset by one of the error handlers. */
@@ -5032,6 +5033,8 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 
 	if (task->tk_status < 0) {
 		dprintk("%s ERROR %d\n", __func__, task->tk_status);
+		if (atomic_read(&clp->cl_count) == 1)
+			goto out;
 
 		if (_nfs4_async_handle_error(task, NULL, clp, NULL)
 								== -EAGAIN) {
@@ -5041,8 +5044,9 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 	}
 	dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
 
-	nfs4_schedule_state_renewal(clp);
-
+	if (atomic_read(&clp->cl_count) > 1)
+		nfs4_schedule_state_renewal(clp);
+out:
 	kfree(task->tk_msg.rpc_argp);
 	kfree(task->tk_msg.rpc_resp);
 
@@ -5064,9 +5068,15 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
 	rpc_call_start(task);
 }
 
+static void nfs41_sequence_release(void *calldata)
+{
+	nfs_put_client((struct nfs_client *) calldata);
+}
+
 static const struct rpc_call_ops nfs41_sequence_ops = {
 	.rpc_call_done = nfs41_sequence_call_done,
 	.rpc_call_prepare = nfs41_sequence_prepare,
+	.rpc_release = nfs41_sequence_release,
 };
 
 static int nfs41_proc_async_sequence(struct nfs_client *clp,
@@ -5079,11 +5089,16 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
 		.rpc_cred = cred,
 	};
 
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		return -EIO;
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
+	if (!args) {
+		nfs_put_client(clp);
 		return -ENOMEM;
+	}
 	res = kzalloc(sizeof(*res), GFP_KERNEL);
 	if (!res) {
+		nfs_put_client(clp);
 		kfree(args);
 		return -ENOMEM;
 	}
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 4/6] nfs4: fix race between umount and renewd renew operations
  2010-02-03  0:06     ` [PATCH 3/6] nfs41: fix race between umount and renewd sequence operations Alexandros Batsakis
@ 2010-02-03  0:06       ` Alexandros Batsakis
  2010-02-03  0:06         ` [PATCH 5/6] nfs4: adjust rpc timeout for nfs_client rpc client based on the lease_time Alexandros Batsakis
  0 siblings, 1 reply; 15+ messages in thread
From: Alexandros Batsakis @ 2010-02-03  0:06 UTC (permalink / raw)
  To: linux-nfs; +Cc: trond, Alexandros Batsakis

Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
---
 fs/nfs/nfs4proc.c |   11 ++++++++++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 87fd43a..ffc84d4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3163,11 +3163,18 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
 	if (time_before(clp->cl_last_renewal,timestamp))
 		clp->cl_last_renewal = timestamp;
 	spin_unlock(&clp->cl_lock);
-	nfs4_schedule_state_renewal(clp);
+	if (atomic_read(&clp->cl_count) > 1)
+		nfs4_schedule_state_renewal(clp);
+}
+
+static void nfs4_renew_release(void *calldata)
+{
+	nfs_put_client((struct nfs_client *) calldata);
 }
 
 static const struct rpc_call_ops nfs4_renew_ops = {
 	.rpc_call_done = nfs4_renew_done,
+	.rpc_release = nfs4_renew_release,
 };
 
 int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3178,6 +3185,8 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 		.rpc_cred	= cred,
 	};
 
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		return -EIO;
 	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
 			&nfs4_renew_ops, (void *)jiffies);
 }
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 5/6] nfs4: adjust rpc timeout for nfs_client rpc client based on the lease_time
  2010-02-03  0:06       ` [PATCH 4/6] nfs4: fix race between umount and renewd renew operations Alexandros Batsakis
@ 2010-02-03  0:06         ` Alexandros Batsakis
  2010-02-03  0:06           ` [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value Alexandros Batsakis
  0 siblings, 1 reply; 15+ messages in thread
From: Alexandros Batsakis @ 2010-02-03  0:06 UTC (permalink / raw)
  To: linux-nfs; +Cc: trond, Alexandros Batsakis

Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
---
 fs/nfs/nfs4proc.c |   19 +++++++++++++++++++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ffc84d4..efa3feb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3490,6 +3490,23 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	return _nfs4_async_handle_error(task, server, server->nfs_client, state);
 }
 
+static void nfs4_set_rpc_timeout(struct nfs_client *clp, __u32 lease_time)
+{
+	struct rpc_timeout to;
+
+	to.to_retries = NFS_DEF_TCP_RETRANS;
+	to.to_initval = lease_time * HZ / (to.to_retries + 1);
+	to.to_increment = to.to_initval;
+	to.to_maxval = to.to_initval + (to.to_increment * to.to_retries);
+	if (to.to_maxval > NFS_MAX_TCP_TIMEOUT)
+		to.to_maxval = NFS_MAX_TCP_TIMEOUT;
+	to.to_exponential = 0;
+
+	memcpy(&clp->cl_rpcclient->cl_timeout_default, &to,
+	       sizeof(clp->cl_rpcclient->cl_timeout_default));
+	clp->cl_rpcclient->cl_timeout = &clp->cl_rpcclient->cl_timeout_default;
+}
+
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
 {
 	nfs4_verifier sc_verifier;
@@ -3560,6 +3577,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
 	if (status == 0) {
 		spin_lock(&clp->cl_lock);
 		clp->cl_lease_time = fsinfo.lease_time * HZ;
+		nfs4_set_rpc_timeout(clp, fsinfo.lease_time);
 		clp->cl_last_renewal = now;
 		spin_unlock(&clp->cl_lock);
 	}
@@ -4578,6 +4596,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 		nfs_restart_rpc(task, data->clp);
 		return;
 	}
+	nfs4_set_rpc_timeout(data->clp, data->res->lr_fsinfo->lease_time);
 	dprintk("<-- %s\n", __func__);
 }
 
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
  2010-02-03  0:06         ` [PATCH 5/6] nfs4: adjust rpc timeout for nfs_client rpc client based on the lease_time Alexandros Batsakis
@ 2010-02-03  0:06           ` Alexandros Batsakis
  2010-02-05 20:12             ` Chuck Lever
  0 siblings, 1 reply; 15+ messages in thread
From: Alexandros Batsakis @ 2010-02-03  0:06 UTC (permalink / raw)
  To: linux-nfs; +Cc: trond, Alexandros Batsakis

Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
---
 net/sunrpc/clnt.c     |    2 +-
 net/sunrpc/sunrpc.h   |    2 ++
 net/sunrpc/xprt.c     |   18 +++++++++++++++---
 net/sunrpc/xprtsock.c |   12 ++++++++++--
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 154034b..8e552cd 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1019,7 +1019,7 @@ call_bind(struct rpc_task *task)
 	task->tk_action = call_connect;
 	if (!xprt_bound(xprt)) {
 		task->tk_action = call_bind_status;
-		task->tk_timeout = xprt->bind_timeout;
+		xprt_set_timeout_sane(task, xprt->bind_timeout);
 		xprt->ops->rpcbind(task);
 	}
 }
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 90c292e..87c7aaa 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -37,6 +37,8 @@ struct rpc_buffer {
 	char	data[];
 };
 
+void xprt_set_timeout_sane(struct rpc_task *task, unsigned long timeout);
+
 static inline int rpc_reply_expected(struct rpc_task *task)
 {
 	return (task->tk_msg.rpc_proc != NULL) &&
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 469de29..f6f3988 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -482,7 +482,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = req->rq_xprt;
 
-	task->tk_timeout = req->rq_timeout;
+	xprt_set_timeout_sane(task, req->rq_timeout);
 	rpc_sleep_on(&xprt->pending, task, action);
 }
 EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
@@ -518,7 +518,7 @@ EXPORT_SYMBOL_GPL(xprt_write_space);
  */
 void xprt_set_retrans_timeout_def(struct rpc_task *task)
 {
-	task->tk_timeout = task->tk_rqstp->rq_timeout;
+	xprt_set_timeout_sane(task, task->tk_rqstp->rq_timeout);
 }
 EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
 
@@ -682,6 +682,18 @@ out_abort:
 	spin_unlock(&xprt->transport_lock);
 }
 
+void xprt_set_timeout_sane(struct rpc_task *task, unsigned long timeout)
+{
+	unsigned long majorto = task->tk_rqstp->rq_majortimeo;
+
+	if (majorto <= jiffies)
+		task->tk_timeout = 5 * HZ;
+	else if (timeout > majorto - jiffies)
+		task->tk_timeout = 2 * (majorto - jiffies) / 3;
+	else
+		task->tk_timeout = timeout;
+}
+
 /**
  * xprt_connect - schedule a transport connect operation
  * @task: RPC task that is requesting the connect
@@ -710,7 +722,7 @@ void xprt_connect(struct rpc_task *task)
 		if (task->tk_rqstp)
 			task->tk_rqstp->rq_bytes_sent = 0;
 
-		task->tk_timeout = xprt->connect_timeout;
+		xprt_set_timeout_sane(task, xprt->connect_timeout);
 		rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
 		xprt->stat.connect_start = jiffies;
 		xprt->ops->connect(task);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 721bafd..7fcc97f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2020,9 +2020,17 @@ static void xs_connect(struct rpc_task *task)
 		return;
 
 	if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
+		unsigned long majorto = task->tk_rqstp->rq_majortimeo;
+
+		if (majorto <= jiffies)
+			xprt->reestablish_timeout = 5 * HZ;
+		else if (xprt->reestablish_timeout > majorto - jiffies)
+			xprt->reestablish_timeout = 2 * (majorto - jiffies)
+			  / 3;
+
 		dprintk("RPC:       xs_connect delayed xprt %p for %lu "
-				"seconds\n",
-				xprt, xprt->reestablish_timeout / HZ);
+			"seconds\n",
+			xprt, xprt->reestablish_timeout + (HZ - 1) / HZ);
 		queue_delayed_work(rpciod_workqueue,
 				   &transport->connect_worker,
 				   xprt->reestablish_timeout);
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
  2010-02-03  0:06           ` [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value Alexandros Batsakis
@ 2010-02-05 20:12             ` Chuck Lever
  2010-02-05 22:14               ` Batsakis, Alexandros
  0 siblings, 1 reply; 15+ messages in thread
From: Chuck Lever @ 2010-02-05 20:12 UTC (permalink / raw)
  To: Alexandros Batsakis; +Cc: linux-nfs, trond

Hi Alexandros-

I think we need a little more than the short description for these 
patches.  Can you explain why the extra logic in xs_connect is 
necessary?  What exactly happens if the RENEW doesn't reach the server 
because the transport can't be reconnected?

In other words, what problem are you trying to address here?

On 02/02/2010 07:06 PM, Alexandros Batsakis wrote:
> Signed-off-by: Alexandros Batsakis<batsakis@netapp.com>
> ---
>   net/sunrpc/clnt.c     |    2 +-
>   net/sunrpc/sunrpc.h   |    2 ++
>   net/sunrpc/xprt.c     |   18 +++++++++++++++---
>   net/sunrpc/xprtsock.c |   12 ++++++++++--
>   4 files changed, 28 insertions(+), 6 deletions(-)
>
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 154034b..8e552cd 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -1019,7 +1019,7 @@ call_bind(struct rpc_task *task)
>   	task->tk_action = call_connect;
>   	if (!xprt_bound(xprt)) {
>   		task->tk_action = call_bind_status;
> -		task->tk_timeout = xprt->bind_timeout;
> +		xprt_set_timeout_sane(task, xprt->bind_timeout);
>   		xprt->ops->rpcbind(task);
>   	}
>   }
> diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
> index 90c292e..87c7aaa 100644
> --- a/net/sunrpc/sunrpc.h
> +++ b/net/sunrpc/sunrpc.h
> @@ -37,6 +37,8 @@ struct rpc_buffer {
>   	char	data[];
>   };
>
> +void xprt_set_timeout_sane(struct rpc_task *task, unsigned long timeout);
> +
>   static inline int rpc_reply_expected(struct rpc_task *task)
>   {
>   	return (task->tk_msg.rpc_proc != NULL)&&
> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> index 469de29..f6f3988 100644
> --- a/net/sunrpc/xprt.c
> +++ b/net/sunrpc/xprt.c
> @@ -482,7 +482,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
>   	struct rpc_rqst *req = task->tk_rqstp;
>   	struct rpc_xprt *xprt = req->rq_xprt;
>
> -	task->tk_timeout = req->rq_timeout;
> +	xprt_set_timeout_sane(task, req->rq_timeout);
>   	rpc_sleep_on(&xprt->pending, task, action);
>   }
>   EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
> @@ -518,7 +518,7 @@ EXPORT_SYMBOL_GPL(xprt_write_space);
>    */
>   void xprt_set_retrans_timeout_def(struct rpc_task *task)
>   {
> -	task->tk_timeout = task->tk_rqstp->rq_timeout;
> +	xprt_set_timeout_sane(task, task->tk_rqstp->rq_timeout);
>   }
>   EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
>
> @@ -682,6 +682,18 @@ out_abort:
>   	spin_unlock(&xprt->transport_lock);
>   }
>
> +void xprt_set_timeout_sane(struct rpc_task *task, unsigned long timeout)

xprt_set_timeout_sane?  That's amusing, but it isn't very descriptive to 
someone who doesn't know why this extra function is needed.

> +{
> +	unsigned long majorto = task->tk_rqstp->rq_majortimeo;
> +
> +	if (majorto<= jiffies)
> +		task->tk_timeout = 5 * HZ;
> +	else if (timeout>  majorto - jiffies)
> +		task->tk_timeout = 2 * (majorto - jiffies) / 3;
> +	else
> +		task->tk_timeout = timeout;
> +}
> +
>   /**
>    * xprt_connect - schedule a transport connect operation
>    * @task: RPC task that is requesting the connect
> @@ -710,7 +722,7 @@ void xprt_connect(struct rpc_task *task)
>   		if (task->tk_rqstp)
>   			task->tk_rqstp->rq_bytes_sent = 0;
>
> -		task->tk_timeout = xprt->connect_timeout;
> +		xprt_set_timeout_sane(task, xprt->connect_timeout);
>   		rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
>   		xprt->stat.connect_start = jiffies;
>   		xprt->ops->connect(task);
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 721bafd..7fcc97f 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -2020,9 +2020,17 @@ static void xs_connect(struct rpc_task *task)
>   		return;
>
>   	if (transport->sock != NULL&&  !RPC_IS_SOFTCONN(task)) {
> +		unsigned long majorto = task->tk_rqstp->rq_majortimeo;
> +
> +		if (majorto<= jiffies)
> +			xprt->reestablish_timeout = 5 * HZ;
> +		else if (xprt->reestablish_timeout>  majorto - jiffies)
> +			xprt->reestablish_timeout = 2 * (majorto - jiffies)
> +			  / 3;

This looks like the code you added above in _sane.  Can you reuse that 
code here?

> +
>   		dprintk("RPC:       xs_connect delayed xprt %p for %lu "
> -				"seconds\n",
> -				xprt, xprt->reestablish_timeout / HZ);
> +			"seconds\n",
> +			xprt, xprt->reestablish_timeout + (HZ - 1) / HZ);
>   		queue_delayed_work(rpciod_workqueue,
>   				&transport->connect_worker,
>   				   xprt->reestablish_timeout);


-- 
chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
  2010-02-05 20:12             ` Chuck Lever
@ 2010-02-05 22:14               ` Batsakis, Alexandros
  2010-02-05 22:45                 ` Chuck Lever
  0 siblings, 1 reply; 15+ messages in thread
From: Batsakis, Alexandros @ 2010-02-05 22:14 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Alexandros Batsakis, linux-nfs, trond

Yeah sure,

So imagine that for a specific connection the remaining major timeo  
value is 30secs. Xs_connect has a default timeout before attempting to  
reconnect of 60secs. The user (NFS) expects to "hear back" from the  
rpc layer within the timeout as in often cases e.g. lease renewal,  
it's of no benefit for an operation to reach the server at a later  
time and miss the critical time because it was sleeping for an  
arbitrary amount of time.

My patch ensures that the reconnection attempts happen within the user- 
specified limits by replacing the 60secs default timeout with a value  
that is less than the time to expire. If this is not ideal the user  
should readjust the timeout value.

Does it make sense?

-alexandros

On Feb 5, 2010, at 12:14, "Chuck Lever" <chuck.lever@oracle.com> wrote:

> Hi Alexandros-
>
> I think we need a little more than the short description for these  
> patches.  Can you explain why the extra logic in xs_connect is  
> necessary?  What exactly happens if the RENEW doesn't reach the  
> server because the transport can't be reconnected?
>
> In other words, what problem are you trying to address here?
>
> On 02/02/2010 07:06 PM, Alexandros Batsakis wrote:
>> Signed-off-by: Alexandros Batsakis<batsakis@netapp.com>
>> ---
>>  net/sunrpc/clnt.c     |    2 +-
>>  net/sunrpc/sunrpc.h   |    2 ++
>>  net/sunrpc/xprt.c     |   18 +++++++++++++++---
>>  net/sunrpc/xprtsock.c |   12 ++++++++++--
>>  4 files changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>> index 154034b..8e552cd 100644
>> --- a/net/sunrpc/clnt.c
>> +++ b/net/sunrpc/clnt.c
>> @@ -1019,7 +1019,7 @@ call_bind(struct rpc_task *task)
>>      task->tk_action = call_connect;
>>      if (!xprt_bound(xprt)) {
>>          task->tk_action = call_bind_status;
>> -        task->tk_timeout = xprt->bind_timeout;
>> +        xprt_set_timeout_sane(task, xprt->bind_timeout);
>>          xprt->ops->rpcbind(task);
>>      }
>>  }
>> diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
>> index 90c292e..87c7aaa 100644
>> --- a/net/sunrpc/sunrpc.h
>> +++ b/net/sunrpc/sunrpc.h
>> @@ -37,6 +37,8 @@ struct rpc_buffer {
>>      char    data[];
>>  };
>>
>> +void xprt_set_timeout_sane(struct rpc_task *task, unsigned long  
>> timeout);
>> +
>>  static inline int rpc_reply_expected(struct rpc_task *task)
>>  {
>>      return (task->tk_msg.rpc_proc != NULL)&&
>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>> index 469de29..f6f3988 100644
>> --- a/net/sunrpc/xprt.c
>> +++ b/net/sunrpc/xprt.c
>> @@ -482,7 +482,7 @@ void xprt_wait_for_buffer_space(struct rpc_task  
>> *task, rpc_action action)
>>      struct rpc_rqst *req = task->tk_rqstp;
>>      struct rpc_xprt *xprt = req->rq_xprt;
>>
>> -    task->tk_timeout = req->rq_timeout;
>> +    xprt_set_timeout_sane(task, req->rq_timeout);
>>      rpc_sleep_on(&xprt->pending, task, action);
>>  }
>>  EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
>> @@ -518,7 +518,7 @@ EXPORT_SYMBOL_GPL(xprt_write_space);
>>   */
>>  void xprt_set_retrans_timeout_def(struct rpc_task *task)
>>  {
>> -    task->tk_timeout = task->tk_rqstp->rq_timeout;
>> +    xprt_set_timeout_sane(task, task->tk_rqstp->rq_timeout);
>>  }
>>  EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
>>
>> @@ -682,6 +682,18 @@ out_abort:
>>      spin_unlock(&xprt->transport_lock);
>>  }
>>
>> +void xprt_set_timeout_sane(struct rpc_task *task, unsigned long  
>> timeout)
>
> xprt_set_timeout_sane?  That's amusing, but it isn't very  
> descriptive to someone who doesn't know why this extra function is  
> needed.
>
>> +{
>> +    unsigned long majorto = task->tk_rqstp->rq_majortimeo;
>> +
>> +    if (majorto<= jiffies)
>> +        task->tk_timeout = 5 * HZ;
>> +    else if (timeout>  majorto - jiffies)
>> +        task->tk_timeout = 2 * (majorto - jiffies) / 3;
>> +    else
>> +        task->tk_timeout = timeout;
>> +}
>> +
>>  /**
>>   * xprt_connect - schedule a transport connect operation
>>   * @task: RPC task that is requesting the connect
>> @@ -710,7 +722,7 @@ void xprt_connect(struct rpc_task *task)
>>          if (task->tk_rqstp)
>>              task->tk_rqstp->rq_bytes_sent = 0;
>>
>> -        task->tk_timeout = xprt->connect_timeout;
>> +        xprt_set_timeout_sane(task, xprt->connect_timeout);
>>          rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
>>          xprt->stat.connect_start = jiffies;
>>          xprt->ops->connect(task);
>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>> index 721bafd..7fcc97f 100644
>> --- a/net/sunrpc/xprtsock.c
>> +++ b/net/sunrpc/xprtsock.c
>> @@ -2020,9 +2020,17 @@ static void xs_connect(struct rpc_task *task)
>>          return;
>>
>>      if (transport->sock != NULL&&  !RPC_IS_SOFTCONN(task)) {
>> +        unsigned long majorto = task->tk_rqstp->rq_majortimeo;
>> +
>> +        if (majorto<= jiffies)
>> +            xprt->reestablish_timeout = 5 * HZ;
>> +        else if (xprt->reestablish_timeout>  majorto - jiffies)
>> +            xprt->reestablish_timeout = 2 * (majorto - jiffies)
>> +              / 3;
>
> This looks like the code you added above in _sane.  Can you reuse  
> that code here?
>
>> +
>>          dprintk("RPC:       xs_connect delayed xprt %p for %lu "
>> -                "seconds\n",
>> -                xprt, xprt->reestablish_timeout / HZ);
>> +            "seconds\n",
>> +            xprt, xprt->reestablish_timeout + (HZ - 1) / HZ);
>>          queue_delayed_work(rpciod_workqueue,
>>                  &transport->connect_worker,
>>                     xprt->reestablish_timeout);
>
>
> -- 
> chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
  2010-02-05 22:14               ` Batsakis, Alexandros
@ 2010-02-05 22:45                 ` Chuck Lever
  2010-02-05 23:04                   ` Batsakis, Alexandros
  0 siblings, 1 reply; 15+ messages in thread
From: Chuck Lever @ 2010-02-05 22:45 UTC (permalink / raw)
  To: Batsakis, Alexandros; +Cc: Alexandros Batsakis, linux-nfs, trond

On 02/05/2010 05:14 PM, Batsakis, Alexandros wrote:
> Yeah sure,
>
> So imagine that for a specific connection the remaining major timeo
> value is 30secs. Xs_connect has a default timeout before attempting to
> reconnect of 60secs. The user (NFS) expects to "hear back" from the rpc
> layer within the timeout as in often cases e.g. lease renewal, it's of
> no benefit for an operation to reach the server at a later time and miss
> the critical time because it was sleeping for an arbitrary amount of time.
>
> My patch ensures that the reconnection attempts happen within the
> user-specified limits by replacing the 60secs default timeout with a
> value that is less than the time to expire. If this is not ideal the
> user should readjust the timeout value.
>
> Does it make sense?

That's better, but this explanation needs to accompany the patch, as the 
long description.  It's hard for reviewers who are not familiar with the 
constraints on NFSv4 RENEW to understand what you're trying to fix. 
Same comment applies to the other patches in your series, I think.  (And 
see further comments in the code below).

You are changing a generic part of the RPC client to fix an issue with 
one specific NFSv4 procedure, it seems to me.  Did you actually observe 
a situation where the reconnect delay caused the server to miss a RENEW 
request?

There are good reasons why there is a timeout before reestablishing a 
connection.  Have you tested this patch with NFSv3 servers that are 
going up and down repeatedly, for example?  I think skipping the 
reconnect delay could have consequences for the cases which the original 
xs_connect logic is supposed to address, and it's not something we want 
in many other cases.

Perhaps a better idea would be to mark these particular RPCs with some 
kind of indication that the RPC client has to connect immediately for 
this request, if possible.  Similar to RPC_TASK_SOFTCONN.

In general, sunrpc.ko has a problem with this kind of "urgent" RPC 
request.  If the RPC backlog queue is large for a particular rpc_clnt, 
it can often take many seconds (like longer than the major timeout) for 
a request to actually get down to the transport and get sent.  I don't 
see that these timeout changes necessarily address that at all.

I would suggest that the best solution might be a separate transport for 
such requests, but that has other consequences.  Maybe our RPC client 
needs a separate "urgent" request queue that bypasses the normal backlog 
queue for such time-dependent requests?

> -alexandros
>
> On Feb 5, 2010, at 12:14, "Chuck Lever" <chuck.lever@oracle.com> wrote:
>
>> Hi Alexandros-
>>
>> I think we need a little more than the short description for these
>> patches. Can you explain why the extra logic in xs_connect is
>> necessary? What exactly happens if the RENEW doesn't reach the server
>> because the transport can't be reconnected?
>>
>> In other words, what problem are you trying to address here?
>>
>> On 02/02/2010 07:06 PM, Alexandros Batsakis wrote:
>>> Signed-off-by: Alexandros Batsakis<batsakis@netapp.com>
>>> ---
>>> net/sunrpc/clnt.c | 2 +-
>>> net/sunrpc/sunrpc.h | 2 ++
>>> net/sunrpc/xprt.c | 18 +++++++++++++++---
>>> net/sunrpc/xprtsock.c | 12 ++++++++++--
>>> 4 files changed, 28 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>>> index 154034b..8e552cd 100644
>>> --- a/net/sunrpc/clnt.c
>>> +++ b/net/sunrpc/clnt.c
>>> @@ -1019,7 +1019,7 @@ call_bind(struct rpc_task *task)
>>> task->tk_action = call_connect;
>>> if (!xprt_bound(xprt)) {
>>> task->tk_action = call_bind_status;
>>> - task->tk_timeout = xprt->bind_timeout;
>>> + xprt_set_timeout_sane(task, xprt->bind_timeout);
>>> xprt->ops->rpcbind(task);
>>> }
>>> }
>>> diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
>>> index 90c292e..87c7aaa 100644
>>> --- a/net/sunrpc/sunrpc.h
>>> +++ b/net/sunrpc/sunrpc.h
>>> @@ -37,6 +37,8 @@ struct rpc_buffer {
>>> char data[];
>>> };
>>>
>>> +void xprt_set_timeout_sane(struct rpc_task *task, unsigned long
>>> timeout);
>>> +
>>> static inline int rpc_reply_expected(struct rpc_task *task)
>>> {
>>> return (task->tk_msg.rpc_proc != NULL)&&
>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>> index 469de29..f6f3988 100644
>>> --- a/net/sunrpc/xprt.c
>>> +++ b/net/sunrpc/xprt.c
>>> @@ -482,7 +482,7 @@ void xprt_wait_for_buffer_space(struct rpc_task
>>> *task, rpc_action action)
>>> struct rpc_rqst *req = task->tk_rqstp;
>>> struct rpc_xprt *xprt = req->rq_xprt;
>>>
>>> - task->tk_timeout = req->rq_timeout;
>>> + xprt_set_timeout_sane(task, req->rq_timeout);
>>> rpc_sleep_on(&xprt->pending, task, action);
>>> }
>>> EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
>>> @@ -518,7 +518,7 @@ EXPORT_SYMBOL_GPL(xprt_write_space);
>>> */
>>> void xprt_set_retrans_timeout_def(struct rpc_task *task)
>>> {
>>> - task->tk_timeout = task->tk_rqstp->rq_timeout;
>>> + xprt_set_timeout_sane(task, task->tk_rqstp->rq_timeout);
>>> }
>>> EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
>>>
>>> @@ -682,6 +682,18 @@ out_abort:
>>> spin_unlock(&xprt->transport_lock);
>>> }
>>>
>>> +void xprt_set_timeout_sane(struct rpc_task *task, unsigned long
>>> timeout)
>>
>> xprt_set_timeout_sane? That's amusing, but it isn't very descriptive
>> to someone who doesn't know why this extra function is needed.
>>
>>> +{
>>> + unsigned long majorto = task->tk_rqstp->rq_majortimeo;
>>> +
>>> + if (majorto<= jiffies)
>>> + task->tk_timeout = 5 * HZ;
>>> + else if (timeout> majorto - jiffies)
>>> + task->tk_timeout = 2 * (majorto - jiffies) / 3;
>>> + else
>>> + task->tk_timeout = timeout;
>>> +}
>>> +
>>> /**
>>> * xprt_connect - schedule a transport connect operation
>>> * @task: RPC task that is requesting the connect
>>> @@ -710,7 +722,7 @@ void xprt_connect(struct rpc_task *task)
>>> if (task->tk_rqstp)
>>> task->tk_rqstp->rq_bytes_sent = 0;
>>>
>>> - task->tk_timeout = xprt->connect_timeout;
>>> + xprt_set_timeout_sane(task, xprt->connect_timeout);
>>> rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
>>> xprt->stat.connect_start = jiffies;
>>> xprt->ops->connect(task);
>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>>> index 721bafd..7fcc97f 100644
>>> --- a/net/sunrpc/xprtsock.c
>>> +++ b/net/sunrpc/xprtsock.c
>>> @@ -2020,9 +2020,17 @@ static void xs_connect(struct rpc_task *task)
>>> return;
>>>
>>> if (transport->sock != NULL&& !RPC_IS_SOFTCONN(task)) {
>>> + unsigned long majorto = task->tk_rqstp->rq_majortimeo;
>>> +
>>> + if (majorto<= jiffies)
>>> + xprt->reestablish_timeout = 5 * HZ;
>>> + else if (xprt->reestablish_timeout> majorto - jiffies)
>>> + xprt->reestablish_timeout = 2 * (majorto - jiffies)
>>> + / 3;
>>
>> This looks like the code you added above in _sane. Can you reuse that
>> code here?
>>
>>> +
>>> dprintk("RPC: xs_connect delayed xprt %p for %lu "
>>> - "seconds\n",
>>> - xprt, xprt->reestablish_timeout / HZ);
>>> + "seconds\n",
>>> + xprt, xprt->reestablish_timeout + (HZ - 1) / HZ);
>>> queue_delayed_work(rpciod_workqueue,
>>> &transport->connect_worker,
>>> xprt->reestablish_timeout);
>>
>>
>> --
>> chuck[dot]lever[at]oracle[dot]com


-- 
chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
  2010-02-05 22:45                 ` Chuck Lever
@ 2010-02-05 23:04                   ` Batsakis, Alexandros
  2010-02-06  0:11                     ` Chuck Lever
  0 siblings, 1 reply; 15+ messages in thread
From: Batsakis, Alexandros @ 2010-02-05 23:04 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Alexandros Batsakis, linux-nfs, trond



On Feb 5, 2010, at 14:47, "Chuck Lever" <chuck.lever@oracle.com> wrote:

> On 02/05/2010 05:14 PM, Batsakis, Alexandros wrote:
>> Yeah sure,
>>
>> So imagine that for a specific connection the remaining major timeo
>> value is 30secs. Xs_connect has a default timeout before attempting  
>> to
>> reconnect of 60secs. The user (NFS) expects to "hear back" from the  
>> rpc
>> layer within the timeout as in often cases e.g. lease renewal, it's  
>> of
>> no benefit for an operation to reach the server at a later time and  
>> miss
>> the critical time because it was sleeping for an arbitrary amount  
>> of time.
>>
>> My patch ensures that the reconnection attempts happen within the
>> user-specified limits by replacing the 60secs default timeout with a
>> value that is less than the time to expire. If this is not ideal the
>> user should readjust the timeout value.
>>
>> Does it make sense?
>
> That's better, but this explanation needs to accompany the patch, as  
> the long description.  It's hard for reviewers who are not familiar  
> with the constraints on NFSv4 RENEW to understand what you're trying  
> to fix. Same comment applies to the other patches in your series, I  
> think.  (And see further comments in the code below).
>

Yeah you are right about that

> You are changing a generic part of the RPC client to fix an issue  
> with one specific NFSv4 procedure, it seems to me.  Did you actually  
> observe a situation where the reconnect delay caused the server to  
> miss a RENEW request?
>

Yeah, it's very easy to reproduce too

> There are good reasons why there is a timeout before reestablishing  
> a connection.  Have you tested this patch with NFSv3 servers that  
> are going up and down repeatedly, for example?  I think skipping the  
> reconnect delay could have consequences for the cases which the  
> original xs_connect logic is supposed to address, and it's not  
> something we want in many other cases.
>

I am not skipping the reconnect delay. What i am saying is that it  
seems wrong to me to hard-code the reconnection delay. Why 60secs for  
example ? To me it seems that this value should be related to the  
timeout. Do you disagree ?

> Perhaps a better idea would be to mark these particular RPCs with  
> some kind of indication that the RPC client has to connect  
> immediately for this request, if possible.  Similar to  
> RPC_TASK_SOFTCONN.

> In general, sunrpc.ko has a problem with this kind of "urgent" RPC  
> request.  If the RPC backlog queue is large for a particular  
> rpc_clnt, it can often take many seconds (like longer than the major  
> timeout) for a request to actually get down to the transport and get  
> sent.  I don't see that these timeout changes necessarily address  
> that at all.
>

Bu if the timeout has expired rpc_execute will quit the task anyways.
What is the downside of instead of sleeping for a long, arbitrary  
period to wake up and poll the server at intervals that actually make  
some sense to the client?

> I would suggest that the best solution might be a separate transport  
> for such requests, but that has other consequences.  Maybe our RPC  
> client needs a separate "urgent" request queue that bypasses the  
> normal backlog queue for such time-dependent requests?
>

A seperate transport for requests that will fail as soon as the  
timeout expires and not later? Because, as I said, the request is  
gonna fail anyways.
I dont't think that this should be treated as a special case, but  
that's just my opinion. Maybe your experience says differently.

I ll address the code comments when we aggree how to proceed.

Thanks!
-alexandros

>> -alexandros
>>
>> On Feb 5, 2010, at 12:14, "Chuck Lever" <chuck.lever@oracle.com>  
>> wrote:
>>
>>> Hi Alexandros-
>>>
>>> I think we need a little more than the short description for these
>>> patches. Can you explain why the extra logic in xs_connect is
>>> necessary? What exactly happens if the RENEW doesn't reach the  
>>> server
>>> because the transport can't be reconnected?
>>>
>>> In other words, what problem are you trying to address here?
>>>
>>> On 02/02/2010 07:06 PM, Alexandros Batsakis wrote:
>>>> Signed-off-by: Alexandros Batsakis<batsakis@netapp.com>
>>>> ---
>>>> net/sunrpc/clnt.c | 2 +-
>>>> net/sunrpc/sunrpc.h | 2 ++
>>>> net/sunrpc/xprt.c | 18 +++++++++++++++---
>>>> net/sunrpc/xprtsock.c | 12 ++++++++++--
>>>> 4 files changed, 28 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>>>> index 154034b..8e552cd 100644
>>>> --- a/net/sunrpc/clnt.c
>>>> +++ b/net/sunrpc/clnt.c
>>>> @@ -1019,7 +1019,7 @@ call_bind(struct rpc_task *task)
>>>> task->tk_action = call_connect;
>>>> if (!xprt_bound(xprt)) {
>>>> task->tk_action = call_bind_status;
>>>> - task->tk_timeout = xprt->bind_timeout;
>>>> + xprt_set_timeout_sane(task, xprt->bind_timeout);
>>>> xprt->ops->rpcbind(task);
>>>> }
>>>> }
>>>> diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
>>>> index 90c292e..87c7aaa 100644
>>>> --- a/net/sunrpc/sunrpc.h
>>>> +++ b/net/sunrpc/sunrpc.h
>>>> @@ -37,6 +37,8 @@ struct rpc_buffer {
>>>> char data[];
>>>> };
>>>>
>>>> +void xprt_set_timeout_sane(struct rpc_task *task, unsigned long
>>>> timeout);
>>>> +
>>>> static inline int rpc_reply_expected(struct rpc_task *task)
>>>> {
>>>> return (task->tk_msg.rpc_proc != NULL)&&
>>>> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
>>>> index 469de29..f6f3988 100644
>>>> --- a/net/sunrpc/xprt.c
>>>> +++ b/net/sunrpc/xprt.c
>>>> @@ -482,7 +482,7 @@ void xprt_wait_for_buffer_space(struct rpc_task
>>>> *task, rpc_action action)
>>>> struct rpc_rqst *req = task->tk_rqstp;
>>>> struct rpc_xprt *xprt = req->rq_xprt;
>>>>
>>>> - task->tk_timeout = req->rq_timeout;
>>>> + xprt_set_timeout_sane(task, req->rq_timeout);
>>>> rpc_sleep_on(&xprt->pending, task, action);
>>>> }
>>>> EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
>>>> @@ -518,7 +518,7 @@ EXPORT_SYMBOL_GPL(xprt_write_space);
>>>> */
>>>> void xprt_set_retrans_timeout_def(struct rpc_task *task)
>>>> {
>>>> - task->tk_timeout = task->tk_rqstp->rq_timeout;
>>>> + xprt_set_timeout_sane(task, task->tk_rqstp->rq_timeout);
>>>> }
>>>> EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
>>>>
>>>> @@ -682,6 +682,18 @@ out_abort:
>>>> spin_unlock(&xprt->transport_lock);
>>>> }
>>>>
>>>> +void xprt_set_timeout_sane(struct rpc_task *task, unsigned long
>>>> timeout)
>>>
>>> xprt_set_timeout_sane? That's amusing, but it isn't very descriptive
>>> to someone who doesn't know why this extra function is needed.
>>>
>>>> +{
>>>> + unsigned long majorto = task->tk_rqstp->rq_majortimeo;
>>>> +
>>>> + if (majorto<= jiffies)
>>>> + task->tk_timeout = 5 * HZ;
>>>> + else if (timeout> majorto - jiffies)
>>>> + task->tk_timeout = 2 * (majorto - jiffies) / 3;
>>>> + else
>>>> + task->tk_timeout = timeout;
>>>> +}
>>>> +
>>>> /**
>>>> * xprt_connect - schedule a transport connect operation
>>>> * @task: RPC task that is requesting the connect
>>>> @@ -710,7 +722,7 @@ void xprt_connect(struct rpc_task *task)
>>>> if (task->tk_rqstp)
>>>> task->tk_rqstp->rq_bytes_sent = 0;
>>>>
>>>> - task->tk_timeout = xprt->connect_timeout;
>>>> + xprt_set_timeout_sane(task, xprt->connect_timeout);
>>>> rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
>>>> xprt->stat.connect_start = jiffies;
>>>> xprt->ops->connect(task);
>>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>>>> index 721bafd..7fcc97f 100644
>>>> --- a/net/sunrpc/xprtsock.c
>>>> +++ b/net/sunrpc/xprtsock.c
>>>> @@ -2020,9 +2020,17 @@ static void xs_connect(struct rpc_task  
>>>> *task)
>>>> return;
>>>>
>>>> if (transport->sock != NULL&& !RPC_IS_SOFTCONN(task)) {
>>>> + unsigned long majorto = task->tk_rqstp->rq_majortimeo;
>>>> +
>>>> + if (majorto<= jiffies)
>>>> + xprt->reestablish_timeout = 5 * HZ;
>>>> + else if (xprt->reestablish_timeout> majorto - jiffies)
>>>> + xprt->reestablish_timeout = 2 * (majorto - jiffies)
>>>> + / 3;
>>>
>>> This looks like the code you added above in _sane. Can you reuse  
>>> that
>>> code here?
>>>
>>>> +
>>>> dprintk("RPC: xs_connect delayed xprt %p for %lu "
>>>> - "seconds\n",
>>>> - xprt, xprt->reestablish_timeout / HZ);
>>>> + "seconds\n",
>>>> + xprt, xprt->reestablish_timeout + (HZ - 1) / HZ);
>>>> queue_delayed_work(rpciod_workqueue,
>>>> &transport->connect_worker,
>>>> xprt->reestablish_timeout);
>>>
>>>
>>> --
>>> chuck[dot]lever[at]oracle[dot]com
>
>
> -- 
> chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
  2010-02-05 23:04                   ` Batsakis, Alexandros
@ 2010-02-06  0:11                     ` Chuck Lever
       [not found]                       ` <B9364369CA66BF45806C2CD86EAB8BA60259D23D@SACMVEXC3-PRD.hq.netapp.com>
  0 siblings, 1 reply; 15+ messages in thread
From: Chuck Lever @ 2010-02-06  0:11 UTC (permalink / raw)
  To: Batsakis, Alexandros; +Cc: Alexandros Batsakis, linux-nfs, trond

On 02/05/2010 06:04 PM, Batsakis, Alexandros wrote:
>
>
> On Feb 5, 2010, at 14:47, "Chuck Lever" <chuck.lever@oracle.com> wrote:
>
>> On 02/05/2010 05:14 PM, Batsakis, Alexandros wrote:
>>> Yeah sure,
>>>
>>> So imagine that for a specific connection the remaining major timeo
>>> value is 30secs. Xs_connect has a default timeout before attempting to
>>> reconnect of 60secs. The user (NFS) expects to "hear back" from the rpc
>>> layer within the timeout as in often cases e.g. lease renewal, it's of
>>> no benefit for an operation to reach the server at a later time and miss
>>> the critical time because it was sleeping for an arbitrary amount of
>>> time.

Maybe you want RPC_TASK_SOFTCONN for NFSv4 renewals instead of 
RPC_TASK_SOFT.  This would cause the RENEW request to fail immediately 
if the transport can't connect.

>>>
>>> My patch ensures that the reconnection attempts happen within the
>>> user-specified limits by replacing the 60secs default timeout with a
>>> value that is less than the time to expire. If this is not ideal the
>>> user should readjust the timeout value.
>>>
>>> Does it make sense?
>>
>> That's better, but this explanation needs to accompany the patch, as
>> the long description. It's hard for reviewers who are not familiar
>> with the constraints on NFSv4 RENEW to understand what you're trying
>> to fix. Same comment applies to the other patches in your series, I
>> think. (And see further comments in the code below).
>>
>
> Yeah you are right about that
>
>> You are changing a generic part of the RPC client to fix an issue with
>> one specific NFSv4 procedure, it seems to me. Did you actually observe
>> a situation where the reconnect delay caused the server to miss a
>> RENEW request?
>>
>
> Yeah, it's very easy to reproduce too

Can you describe the scenario more precisely?  If I wanted to reproduce 
this here, how would I do it?

>> There are good reasons why there is a timeout before reestablishing a
>> connection. Have you tested this patch with NFSv3 servers that are
>> going up and down repeatedly, for example? I think skipping the
>> reconnect delay could have consequences for the cases which the
>> original xs_connect logic is supposed to address, and it's not
>> something we want in many other cases.
>>
>
> I am not skipping the reconnect delay. What i am saying is that it seems
> wrong to me to hard-code the reconnection delay. Why 60secs for example
> ? To me it seems that this value should be related to the timeout. Do
> you disagree ?

Hrm.  It looks like the TCP re-establish timeout starts at 3 seconds, 
and then backs off to 5 minutes.  So, it's not fixed, it's related to 
how quickly the server responds to the client's connect() attempt.

The reestablish timer is meant to prevent a client with pending requests 
from hammering an overloaded (or rebooting) server with connection 
requests.  It's purpose is markedly different than the purpose of an 
individual RPC retransmit timer.

>> Perhaps a better idea would be to mark these particular RPCs with some
>> kind of indication that the RPC client has to connect immediately for
>> this request, if possible. Similar to RPC_TASK_SOFTCONN.
>
>> In general, sunrpc.ko has a problem with this kind of "urgent" RPC
>> request. If the RPC backlog queue is large for a particular rpc_clnt,
>> it can often take many seconds (like longer than the major timeout)
>> for a request to actually get down to the transport and get sent. I
>> don't see that these timeout changes necessarily address that at all.
>>
>
> Bu if the timeout has expired rpc_execute will quit the task anyways.
> What is the downside of instead of sleeping for a long, arbitrary period
> to wake up and poll the server at intervals that actually make some
> sense to the client?

If the 5 minute backoff maximum is too long, then you can easily reduce 
that maximum (which is probably not unreasonable).  We originally had 
the client retrying to connect every 3 seconds, but it was thought that 
would put unnecessary load on servers.

-- 
chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
       [not found]                       ` <B9364369CA66BF45806C2CD86EAB8BA60259D23D@SACMVEXC3-PRD.hq.netapp.com>
@ 2010-02-07  0:53                         ` Chuck Lever
       [not found]                           ` <2CDC4373-10AD-4F84-BA44-3C2106D590BE@netapp.com>
  0 siblings, 1 reply; 15+ messages in thread
From: Chuck Lever @ 2010-02-07  0:53 UTC (permalink / raw)
  To: Batsakis, Alexandros; +Cc: linux-nfs, Myklebust, Trond

On 02/05/2010 11:05 PM, Batsakis, Alexandros wrote:
>
> My replies marked with the "AB" prefix
>
> -----Original Message-----
> From: Chuck Lever [mailto:chuck.lever@oracle.com]
> Sent: Fri 2/5/2010 4:11 PM
> To: Batsakis, Alexandros
> Cc: Batsakis, Alexandros; linux-nfs@vger.kernel.org; Myklebust, Trond
> Subject: Re: [PATCH 6/6] RPC: adjust timeout for connect, bind,
> restablish so that they sensitive to the major time out value
>
> On 02/05/2010 06:04 PM, Batsakis, Alexandros wrote:
>  >
>  >
>  > On Feb 5, 2010, at 14:47, "Chuck Lever" <chuck.lever@oracle.com> wrote:
>  >
>  >> On 02/05/2010 05:14 PM, Batsakis, Alexandros wrote:
>  >>> Yeah sure,
>  >>>
>  >>> So imagine that for a specific connection the remaining major timeo
>  >>> value is 30secs. Xs_connect has a default timeout before attempting to
>  >>> reconnect of 60secs. The user (NFS) expects to "hear back" from the rpc
>  >>> layer within the timeout as in often cases e.g. lease renewal, it's of
>  >>> no benefit for an operation to reach the server at a later time and
> miss
>  >>> the critical time because it was sleeping for an arbitrary amount of
>  >>> time.
>
> Maybe you want RPC_TASK_SOFTCONN for NFSv4 renewals instead of
> RPC_TASK_SOFT. This would cause the RENEW request to fail immediately
> if the transport can't connect.
>
> AB: is this a new flag ? I am not familiar with it. Or are you proposing
> to add such a flag?
> It's not an unreasonable thing to do

The flag was added recently (maybe in 2.6.33-rc?).  It causes an 
individual RPC request to fail immediately if the underlying transport 
cannot be connected.  It bypasses the reconnect timeout if the transport 
is not already connected.

> Can you describe the scenario more precisely? If I wanted to reproduce
> this here, how would I do it?
>
> AB: mount a 4.1 server (or v4 with state, e.g. an open/lock etc.)
> Then kill the server. With a lease time of 90 seconds the client
> attempts to reconnect will come
> roughly at 60-80secs and 120-140 secs. So the lease period of 90 secs is
> lost.

That doesn't sound right to me.  xs_connect is supposed to retry after 3 
seconds and then back off exponentially until it is retrying once every 
5 minutes.  How do you determine when the client is retrying the 
connect: by watching for SYN packets or by enabling dprintk messages in 
xprtsock?

>  >> There are good reasons why there is a timeout before reestablishing a
>  >> connection. Have you tested this patch with NFSv3 servers that are
>  >> going up and down repeatedly, for example? I think skipping the
>  >> reconnect delay could have consequences for the cases which the
>  >> original xs_connect logic is supposed to address, and it's not
>  >> something we want in many other cases.
>  >>
>  >
>  > I am not skipping the reconnect delay. What i am saying is that it seems
>  > wrong to me to hard-code the reconnection delay. Why 60secs for example
>  > ? To me it seems that this value should be related to the timeout. Do
>  > you disagree ?
>
> Hrm. It looks like the TCP re-establish timeout starts at 3 seconds,
> and then backs off to 5 minutes. So, it's not fixed, it's related to
> how quickly the server responds to the client's connect() attempt.
>
> AB: Agreed. All I am saying is let's cap this value.

It's capped today at 5 minutes.  See XS_TCP_MAX_REEST_TO.  Perhaps it 
could be lowered to 60 seconds.

But I don't yet see a need to link the reconnect timeout to the 
retransmit timeout.  In nearly every case, a major timeout should occur 
if the client can't reconnect, and there's no harm done if that's 
delayed longer than is optimal.

RENEW is really quite the exception here.  This procedure is the only 
one I can think of where the request has to start _and_ finish within a 
particular time limit.

I don't know as much about NFSv4 and especially RENEW as I should, but 
it seems to me there are a number of reasons why RENEW should perhaps be 
given its own transport.

1.  If a RENEW is sent every 90 seconds, that means there's no 
possibility for the underlying transports to idle out.

2.  We only need one RENEW per client-server pair, I think.  Isn't it 
tied to the client ID?

3.  Because of its timing requirements, we can't depend on a RENEW 
getting through a heavily loaded rpc_clnt at a given time.

4.  It seems to want different retransmit behavior than any other procedure.

So perhaps the best solution is for the client to set up a separate 
rpc_clnt/rpc_xprt for each mounted server (but not each mount point) 
that is dedicated to handling RENEW requests for that server.  It would 
have retransmit timeouts that are determined by the lease timer, and not 
by the retransmit timeout specified on the mount command line.

This works around the fact that TCP streams suffer from head-of-queue 
blocking, as well as attempting to work around a long RPC client backlog 
queue.

The only downside I can think of is this might consume excess privileged 
ports on a client that mounts hundreds of different servers.

>  >> Perhaps a better idea would be to mark these particular RPCs with some
>  >> kind of indication that the RPC client has to connect immediately for
>  >> this request, if possible. Similar to RPC_TASK_SOFTCONN.
>  >
>  >> In general, sunrpc.ko has a problem with this kind of "urgent" RPC
>  >> request. If the RPC backlog queue is large for a particular rpc_clnt,
>  >> it can often take many seconds (like longer than the major timeout)
>  >> for a request to actually get down to the transport and get sent. I
>  >> don't see that these timeout changes necessarily address that at all.
>
> AB: again here I think that the correct solution would be to increase
> the timeout value rather than ignore it. What is the purpose of the
> timeout anyways then ?
>
>  > Bu if the timeout has expired rpc_execute will quit the task anyways.
>  > What is the downside of instead of sleeping for a long, arbitrary period
>  > to wake up and poll the server at intervals that actually make some
>  > sense to the client?
>
> If the 5 minute backoff maximum is too long, then you can easily reduce
> that maximum (which is probably not unreasonable). We originally had
> the client retrying to connect every 3 seconds, but it was thought that
> would put unnecessary load on servers.
>
> AB: Again, my argument is that we shouldn't select these values arbitrarily.
> In all, I see your points. If I understand correctly
> you don't consider the tcp reconnection policy that I am describing an
> important issue. I assume that you are OK
> with the timeout variability in the state machine ?

I think changing the timeout logic in both places is probably overkill, 
and may even have negative effects on nonidempotent requests.  I think 
it's wise to depend on retransmit settings as little as possible, as 
retransmitting an RPC is basically a hack, when you get right down to it.

I'm still open to discussion, though.

-- 
chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
       [not found]                           ` <2CDC4373-10AD-4F84-BA44-3C2106D590BE@netapp.com>
@ 2010-02-08 18:43                             ` Chuck Lever
  2010-02-08 23:13                               ` Batsakis, Alexandros
  0 siblings, 1 reply; 15+ messages in thread
From: Chuck Lever @ 2010-02-08 18:43 UTC (permalink / raw)
  To: Batsakis, Alexandros; +Cc: linux-nfs

On 02/06/2010 11:11 PM, Batsakis, Alexandros wrote:
>
>
> On Feb 6, 2010, at 16:53, "Chuck Lever" <chuck.lever@oracle.com> wrote:
>
>> On 02/05/2010 11:05 PM, Batsakis, Alexandros wrote:
>>>
>>> My replies marked with the "AB" prefix
>>>
>>> -----Original Message-----
>>> From: Chuck Lever [mailto:chuck.lever@oracle.com]
>>> Sent: Fri 2/5/2010 4:11 PM
>>> To: Batsakis, Alexandros
>>> Cc: Batsakis, Alexandros; linux-nfs@vger.kernel.org; Myklebust, Trond
>>> Subject: Re: [PATCH 6/6] RPC: adjust timeout for connect, bind,
>>> restablish so that they sensitive to the major time out value
>>>
>>> On 02/05/2010 06:04 PM, Batsakis, Alexandros wrote:
>>> >
>>> >
>>> > On Feb 5, 2010, at 14:47, "Chuck Lever" <chuck.lever@oracle.com>
>>> wrote:
>>> >
>>> >> On 02/05/2010 05:14 PM, Batsakis, Alexandros wrote:
>>> >>> Yeah sure,
>>> >>>
>>> >>> So imagine that for a specific connection the remaining major timeo
>>> >>> value is 30secs. Xs_connect has a default timeout before
>>> attempting to
>>> >>> reconnect of 60secs. The user (NFS) expects to "hear back" from
>>> the rpc
>>> >>> layer within the timeout as in often cases e.g. lease renewal,
>>> it's of
>>> >>> no benefit for an operation to reach the server at a later time and
>>> miss
>>> >>> the critical time because it was sleeping for an arbitrary amount of
>>> >>> time.
>>>
>>> Maybe you want RPC_TASK_SOFTCONN for NFSv4 renewals instead of
>>> RPC_TASK_SOFT. This would cause the RENEW request to fail immediately
>>> if the transport can't connect.
>>>
>>> AB: is this a new flag ? I am not familiar with it. Or are you proposing
>>> to add such a flag?
>>> It's not an unreasonable thing to do
>>
>> The flag was added recently (maybe in 2.6.33-rc?). It causes an
>> individual RPC request to fail immediately if the underlying transport
>> cannot be connected. It bypasses the reconnect timeout if the
>> transport is not already connected.
>>
>
> Oh OK. Maybe then it's a reasonable workaround to the reconnection
> policy changes. I think though that the rest of the changes wrt the
> major timeout are still valid. Also IMHO the max of 5min seems a lot,
> especially for operations that are state-oriented like in v4.0 and v4.1.

I'm not averse to reducing the maximum reconnect delay to something like 
60 seconds.  This might even be an acceptable work around for some of 
the issues you've raised.  Additional illumination of current reconnect 
behavior may find that it no longer behaves as expected in the quick 
server reboot cases, and that also should be addressed.

However, the fact that _all_ NFSv4 state-changing operations now have 
additional delivery constraints makes this an issue larger than RENEWD 
(which is the subject line of your original postings).  IMO sunrpc.ko is 
not currently prepared to handle that kind of timing constraint 
adequately.  Adjusting the retransmit behavior is simply not sufficient 
to address these problems (and perhaps it is even orthogonal to them).

-- 
chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value
  2010-02-08 18:43                             ` Chuck Lever
@ 2010-02-08 23:13                               ` Batsakis, Alexandros
  0 siblings, 0 replies; 15+ messages in thread
From: Batsakis, Alexandros @ 2010-02-08 23:13 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-nfs



> -----Original Message-----
> From: Chuck Lever [mailto:chuck.lever@oracle.com]
> Sent: Monday, February 08, 2010 10:44 AM
> To: Batsakis, Alexandros
> Cc: linux-nfs@vger.kernel.org
> Subject: Re: [PATCH 6/6] RPC: adjust timeout for connect, bind,
> restablish so that they sensitive to the major time out value
> 
> >
> > Oh OK. Maybe then it's a reasonable workaround to the reconnection
> > policy changes. I think though that the rest of the changes wrt the
> > major timeout are still valid. Also IMHO the max of 5min seems a
lot,
> > especially for operations that are state-oriented like in v4.0 and
> v4.1.
> 
> I'm not averse to reducing the maximum reconnect delay to something
> like
> 60 seconds.  This might even be an acceptable work around for some of
> the issues you've raised.  Additional illumination of current
reconnect
> behavior may find that it no longer behaves as expected in the quick
> server reboot cases, and that also should be addressed.
> 

OK I agree that then I could only change the rest of the timeout values
to something that is in accordance with the NFS timeout and leave the
reconnection policy as it is. This is also acceptable because the
connect_timeout supersedes the reestablish_timeout and the task can
return while the connect_worker will retry to reestablish the
connection. 
AFACIS though the last point brings up another issue with the current
design as a new task would wait for a connection establishment for
XS_TCP_CONN_TO that is << XS_TCP_MAX_REEST_TO, so it may return (up to
XS_TCP_MAX_REEST_TO/XS_TCP_CONN_TO) times without even trying to
reconnect. Am I missing something ?

> However, the fact that _all_ NFSv4 state-changing operations now have
> additional delivery constraints makes this an issue larger than RENEWD
> (which is the subject line of your original postings).  IMO sunrpc.ko
> is
> not currently prepared to handle that kind of timing constraint
> adequately.  Adjusting the retransmit behavior is simply not
sufficient
> to address these problems (and perhaps it is even orthogonal to them).
> 
> --

Hm I am not sure. First, by making the RPC layer's queuing sensitive to
the timeout value and by forcing the timeout to be <= lease time we
ensure that operations will return to the NFS layer within the specified
time limits. Also, this doesn't have to be a strict requirement: forcing
state operations (except async RENEWs and SEQUENCEs) to be executed
timely is more of an optimization rather than a correctness requirement.
You are right though that I should think whether this will affect v2/v3.

Anyway, your feedback was very helpful. I ll resend patch 6/6 and we can
take it from there.

-alexandros


> chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2010-02-08 23:13 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-02-03  0:06 [PATCH 0/6] nfs: renewd fixes Alexandros Batsakis
2010-02-03  0:06 ` [PATCH 1/6] nfs: kill renewd before clearing client minor version Alexandros Batsakis
2010-02-03  0:06   ` [PATCH 2/6] nfs: prevent backlogging of renewd requests Alexandros Batsakis
2010-02-03  0:06     ` [PATCH 3/6] nfs41: fix race between umount and renewd sequence operations Alexandros Batsakis
2010-02-03  0:06       ` [PATCH 4/6] nfs4: fix race between umount and renewd renew operations Alexandros Batsakis
2010-02-03  0:06         ` [PATCH 5/6] nfs4: adjust rpc timeout for nfs_client rpc client based on the lease_time Alexandros Batsakis
2010-02-03  0:06           ` [PATCH 6/6] RPC: adjust timeout for connect, bind, restablish so that they sensitive to the major time out value Alexandros Batsakis
2010-02-05 20:12             ` Chuck Lever
2010-02-05 22:14               ` Batsakis, Alexandros
2010-02-05 22:45                 ` Chuck Lever
2010-02-05 23:04                   ` Batsakis, Alexandros
2010-02-06  0:11                     ` Chuck Lever
     [not found]                       ` <B9364369CA66BF45806C2CD86EAB8BA60259D23D@SACMVEXC3-PRD.hq.netapp.com>
2010-02-07  0:53                         ` Chuck Lever
     [not found]                           ` <2CDC4373-10AD-4F84-BA44-3C2106D590BE@netapp.com>
2010-02-08 18:43                             ` Chuck Lever
2010-02-08 23:13                               ` Batsakis, Alexandros

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.