[RFC] sunrpc: Fix race between work-queue and rpc_killall_tasks.

* [RFC] sunrpc:  Fix race between work-queue and rpc_killall_tasks.
@ 2011-07-06 22:49 greearb
  2011-07-06 23:45   ` Trond Myklebust
  0 siblings, 1 reply; 20+ messages in thread
From: greearb @ 2011-07-06 22:49 UTC (permalink / raw)
  To: linux-nfs, linux-kernel; +Cc: Ben Greear

From: Ben Greear <greearb@candelatech.com>

The rpc_killall_tasks logic is not locked against
the work-queue thread, but it still directly modifies
function pointers and data in the task objects.

This patch changes the killall-tasks logic to set a flag
that tells the work-queue thread to terminate the task
instead of directly calling the terminate logic.

Signed-off-by: Ben Greear <greearb@candelatech.com>
---

NOTE:  This needs review, as I am still struggling to understand
the rpc code, and it's quite possible this patch either doesn't
fully fix the problem or actually causes other issues.  That said,
my nfs stress test seems to run a bit more stable with this patch applied.

:100644 100644 fe2d8e6... b238944... M	include/linux/sunrpc/sched.h
:100644 100644 8c91415... 6851f84... M	net/sunrpc/clnt.c
:100644 100644 1cbbed5... 0fc559e... M	net/sunrpc/sched.c
 include/linux/sunrpc/sched.h |   10 ++++++++++
 net/sunrpc/clnt.c            |    3 +--
 net/sunrpc/sched.c           |    6 ++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index fe2d8e6..b238944 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -76,6 +76,7 @@ struct rpc_task {
 
 	pid_t			tk_owner;	/* Process id for batching tasks */
 	int			tk_status;	/* result of last operation */
+	int			tk_killme_errno;/* For RPC_TASK_KILLME */
 	unsigned short		tk_flags;	/* misc flags */
 	unsigned short		tk_timeouts;	/* maj timeouts */
 
@@ -130,6 +131,7 @@ struct rpc_task_setup {
 #define RPC_TASK_SOFTCONN	0x0400		/* Fail if can't connect */
 #define RPC_TASK_SENT		0x0800		/* message was sent */
 #define RPC_TASK_TIMEOUT	0x1000		/* fail with ETIMEDOUT on timeout */
+#define RPC_TASK_KILLME		0x2000		/* Need to die ASAP. */
 
 #define RPC_IS_ASYNC(t)		((t)->tk_flags & RPC_TASK_ASYNC)
 #define RPC_IS_SWAPPER(t)	((t)->tk_flags & RPC_TASK_SWAPPER)
@@ -138,6 +140,7 @@ struct rpc_task_setup {
 #define RPC_IS_SOFT(t)		((t)->tk_flags & (RPC_TASK_SOFT|RPC_TASK_TIMEOUT))
 #define RPC_IS_SOFTCONN(t)	((t)->tk_flags & RPC_TASK_SOFTCONN)
 #define RPC_WAS_SENT(t)		((t)->tk_flags & RPC_TASK_SENT)
+#define RPC_SHOULD_KILLME(t)	((t)->tk_flags & RPC_TASK_KILLME)
 
 #define RPC_TASK_RUNNING	0
 #define RPC_TASK_QUEUED		1
@@ -269,4 +272,11 @@ static inline const char * rpc_qname(struct rpc_wait_queue *q)
 }
 #endif
 
+static inline void rpc_task_killme(struct rpc_task *task, int exit_errno)
+{
+	task->tk_killme_errno = exit_errno;
+	task->tk_flags |= RPC_TASK_KILLME;
+}
+
+
 #endif /* _LINUX_SUNRPC_SCHED_H_ */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8c91415..6851f84 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -437,8 +437,7 @@ void rpc_killall_tasks(struct rpc_clnt *clnt)
 		if (!RPC_IS_ACTIVATED(rovr))
 			continue;
 		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
-			rovr->tk_flags |= RPC_TASK_KILLED;
-			rpc_exit(rovr, -EIO);
+			rpc_task_killme(rovr, -EIO);
 			if (RPC_IS_QUEUED(rovr))
 				rpc_wake_up_queued_task(rovr->tk_waitqueue,
 							rovr);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 1cbbed5..0fc559e 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -646,6 +646,12 @@ static void __rpc_execute(struct rpc_task *task)
 			task->tk_action(task);
 		}
 
+		/* If we should die, do it now. */
+		if (RPC_SHOULD_KILLME(task)) {
+			task->tk_flags |= RPC_TASK_KILLED;
+			rpc_exit(task, task->tk_killme_errno);
+		}
+
 		/*
 		 * Lockless check for whether task is sleeping or not.
 		 */
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 20+ messages in thread