All of lore.kernel.org
 help / color / mirror / Atom feed
From: James Simmons <jsimmons@infradead.org>
To: lustre-devel@lists.lustre.org
Subject: [lustre-devel] [PATCH 05/25] lustre: lnet: fix race in lnet shutdown path
Date: Tue, 25 Sep 2018 22:47:57 -0400	[thread overview]
Message-ID: <1537930097-11624-6-git-send-email-jsimmons@infradead.org> (raw)
In-Reply-To: <1537930097-11624-1-git-send-email-jsimmons@infradead.org>

From: Olaf Weber <olaf.weber@hpe.com>

The locking changes for the lnet_net_lock made for Multi-Rail
introduce a race in the LNet shutdown path. The code keeps two
states in the_lnet.ln_shutdown: 0 means LNet is either up and
running or shut down, while 1 means lnet is shutting down. In
lnet_select_pathway() if we need to restart and drop and relock
the lnet_net_lock we can find that LNet went from running to
stopped, and not be able to tell the difference.

Replace ln_shutdown with a three-state ln_state patterned on
ln_rc_state: states are LNET_STATE_SHUTDOWN, LNET_STATE_RUNNING,
and LNET_STATE_STOPPING. Most checks against ln_shutdown now test
ln_state against LNET_STATE_RUNNING. LNet moves to RUNNING state
in lnet_startup_lndnets().

Signed-off-by: Olaf Weber <olaf.weber@hpe.com>
WC-bug-id: https://jira.whamcloud.com/browse/LU-9119
Reviewed-on: https://review.whamcloud.com/26690
Reviewed-by: Doug Oucharek <dougso@me.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 drivers/staging/lustre/include/linux/lnet/lib-types.h |  9 +++++++--
 drivers/staging/lustre/lnet/lnet/api-ni.c             | 15 ++++++++++++---
 drivers/staging/lustre/lnet/lnet/lib-move.c           |  2 +-
 drivers/staging/lustre/lnet/lnet/lib-ptl.c            |  4 ++--
 drivers/staging/lustre/lnet/lnet/peer.c               | 10 +++++-----
 drivers/staging/lustre/lnet/lnet/router.c             |  6 +++---
 6 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
index 18e2665..6abac19 100644
--- a/drivers/staging/lustre/include/linux/lnet/lib-types.h
+++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h
@@ -726,6 +726,11 @@ struct lnet_msg_container {
 #define LNET_RC_STATE_RUNNING		1	/* started up OK */
 #define LNET_RC_STATE_STOPPING		2	/* telling thread to stop */
 
+/* LNet states */
+#define LNET_STATE_SHUTDOWN		0	/* not started */
+#define LNET_STATE_RUNNING		1	/* started up OK */
+#define LNET_STATE_STOPPING		2	/* telling thread to stop */
+
 struct lnet {
 	/* CPU partition table of LNet */
 	struct cfs_cpt_table		 *ln_cpt_table;
@@ -805,8 +810,8 @@ struct lnet {
 	int				  ln_niinit_self;
 	/* LNetNIInit/LNetNIFini counter */
 	int				  ln_refcount;
-	/* shutdown in progress */
-	int				  ln_shutdown;
+	/* SHUTDOWN/RUNNING/STOPPING */
+	int				  ln_state;
 
 	int				  ln_routing;	/* am I a router? */
 	lnet_pid_t			  ln_pid;	/* requested pid */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
index 2d430d0..7c907a3 100644
--- a/drivers/staging/lustre/lnet/lnet/api-ni.c
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -1277,11 +1277,11 @@ struct lnet_ni *
 	/* NB called holding the global mutex */
 
 	/* All quiet on the API front */
-	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 	LASSERT(!the_lnet.ln_refcount);
 
 	lnet_net_lock(LNET_LOCK_EX);
-	the_lnet.ln_shutdown = 1;	/* flag shutdown */
+	the_lnet.ln_state = LNET_STATE_STOPPING;
 
 	while (!list_empty(&the_lnet.ln_nets)) {
 		/*
@@ -1309,7 +1309,7 @@ struct lnet_ni *
 	}
 
 	lnet_net_lock(LNET_LOCK_EX);
-	the_lnet.ln_shutdown = 0;
+	the_lnet.ln_state = LNET_STATE_SHUTDOWN;
 	lnet_net_unlock(LNET_LOCK_EX);
 }
 
@@ -1583,6 +1583,15 @@ struct lnet_ni *
 	int rc;
 	int ni_count = 0;
 
+	/*
+	 * Change to running state before bringing up the LNDs. This
+	 * allows lnet_shutdown_lndnets() to assert that we've passed
+	 * through here.
+	 */
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_state = LNET_STATE_RUNNING;
+	lnet_net_unlock(LNET_LOCK_EX);
+
 	while (!list_empty(netlist)) {
 		net = list_entry(netlist->next, struct lnet_net, net_list);
 		list_del_init(&net->net_list);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
index a213387..ea7e2c3 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-move.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -1242,7 +1242,7 @@
 
 	seq = lnet_get_dlc_seq_locked();
 
-	if (the_lnet.ln_shutdown) {
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
 		lnet_net_unlock(cpt);
 		return -ESHUTDOWN;
 	}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
index d403353..6fa5bbf 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-ptl.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
@@ -589,7 +589,7 @@ struct list_head *
 	mtable = lnet_mt_of_match(info, msg);
 	lnet_res_lock(mtable->mt_cpt);
 
-	if (the_lnet.ln_shutdown) {
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
 		rc = LNET_MATCHMD_DROP;
 		goto out1;
 	}
@@ -951,7 +951,7 @@ struct list_head *
 				list_move(&msg->msg_list, &zombies);
 		}
 	} else {
-		if (the_lnet.ln_shutdown)
+		if (the_lnet.ln_state != LNET_STATE_RUNNING)
 			CWARN("Active lazy portal %d on exit\n", portal);
 		else
 			CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
index ae3ffca..2fbf93a 100644
--- a/drivers/staging/lustre/lnet/lnet/peer.c
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -428,7 +428,7 @@ void lnet_peer_uninit(void)
 	struct lnet_peer_table *ptable;
 	int i;
 
-	LASSERT(the_lnet.ln_shutdown || net);
+	LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net);
 	/*
 	 * If just deleting the peers for a NI, get rid of any routes these
 	 * peers are gateways for.
@@ -458,7 +458,7 @@ void lnet_peer_uninit(void)
 	struct list_head	*peers;
 	struct lnet_peer_ni	*lp;
 
-	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
 	peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
 	list_for_each_entry(lp, peers, lpni_hashlist) {
@@ -1000,7 +1000,7 @@ struct lnet_peer_ni *
 	struct lnet_peer_ni *lpni = NULL;
 	int rc;
 
-	if (the_lnet.ln_shutdown) /* it's shutting down */
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
 		return ERR_PTR(-ESHUTDOWN);
 
 	/*
@@ -1034,7 +1034,7 @@ struct lnet_peer_ni *
 	struct lnet_peer_ni *lpni = NULL;
 	int rc;
 
-	if (the_lnet.ln_shutdown) /* it's shutting down */
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
 		return ERR_PTR(-ESHUTDOWN);
 
 	/*
@@ -1063,7 +1063,7 @@ struct lnet_peer_ni *
 	 * Shutdown is only set under the ln_api_lock, so a single
 	 * check here is sufficent.
 	 */
-	if (the_lnet.ln_shutdown) {
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
 		lpni = ERR_PTR(-ESHUTDOWN);
 		goto out_mutex_unlock;
 	}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
index a0483f9..d3aef8b 100644
--- a/drivers/staging/lustre/lnet/lnet/router.c
+++ b/drivers/staging/lustre/lnet/lnet/router.c
@@ -252,7 +252,7 @@ struct lnet_remotenet *
 	struct lnet_remotenet *rnet;
 	struct list_head *rn_list;
 
-	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
 	rn_list = lnet_net2rnethash(net);
 	list_for_each_entry(rnet, rn_list, lrn_list) {
@@ -374,7 +374,7 @@ static void lnet_shuffle_seed(void)
 		return rc;
 	}
 	route->lr_gateway = lpni;
-	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
 	rnet2 = lnet_find_rnet_locked(net);
 	if (!rnet2) {
@@ -1775,7 +1775,7 @@ int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg)
 
 	lnet_net_lock(cpt);
 
-	if (the_lnet.ln_shutdown) {
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
 		lnet_net_unlock(cpt);
 		return -ESHUTDOWN;
 	}
-- 
1.8.3.1

  parent reply	other threads:[~2018-09-26  2:47 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-09-26  2:47 [lustre-devel] [PATCH 00/25] lustre: lnet: remaining fixes for multi-rail James Simmons
2018-09-26  2:47 ` [lustre-devel] [PATCH 01/25] lustre: lnet: remove ni from lnet_finalize James Simmons
2018-09-26 23:57   ` NeilBrown
2018-09-30  2:19     ` James Simmons
2018-10-02  4:24       ` NeilBrown
2018-09-26  2:47 ` [lustre-devel] [PATCH 02/25] lustre: lnet: Allow min stats to be reset in peers and nis James Simmons
2018-09-26 23:59   ` NeilBrown
2018-09-26  2:47 ` [lustre-devel] [PATCH 03/25] lustre: lnet: remove debug ioctl James Simmons
2018-09-26  2:47 ` [lustre-devel] [PATCH 04/25] lustre: lnet: Normalize ioctl interface James Simmons
2018-09-26  2:47 ` James Simmons [this message]
2018-09-27  0:03   ` [lustre-devel] [PATCH 05/25] lustre: lnet: fix race in lnet shutdown path NeilBrown
2018-09-27  1:14     ` NeilBrown
2018-09-26  2:47 ` [lustre-devel] [PATCH 06/25] lustre: lnet: loopback NID in lnet_select_pathway() James Simmons
2018-09-26  2:47 ` [lustre-devel] [PATCH 07/25] lustre: lnet: rename LNET_MAX_INTERFACES James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 08/25] lustre: lnet: selftest MR fix James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 09/25] lustre: lnet: prevent assert on ln_state James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 10/25] lustre: lnet: increment per NI stats James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 11/25] lustre: lnet: Fix lost lock James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 12/25] lustre: lnet: correct locking in legacy add net James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 13/25] lustre: lnet: fix lnet_cpt_of_md() James Simmons
2018-09-27  1:03   ` NeilBrown
2018-09-27  1:17     ` NeilBrown
2018-09-26  2:48 ` [lustre-devel] [PATCH 14/25] lustre: lnet: safe access to msg James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 15/25] lustre: o2iblnd: reconnect peer for REJ_INVALID_SERVICE_ID James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 16/25] lustre: o2iblnd: kill timedout txs from ibp_tx_queue James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 17/25] lustre: o2iblnd: multiple sges for work request James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 18/25] lustre: lnd: Turn on 2 sges by default James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 19/25] lustre: lnd: Don't Assert On Reconnect with MultiQP James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 20/25] lustre: lnet: handle empty CPTs James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 21/25] lustre: lnet: set LND tunables properly James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 22/25] lustre: lnd: Don't Page Align remote_addr with FastReg James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 23/25] lustre: lnd: pending transmits dropped silently James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 24/25] lustre: socklnd: propagate errors on send failure James Simmons
2018-09-26  2:48 ` [lustre-devel] [PATCH 25/25] lustre: ko2iblnd: allow for discontiguous fragments James Simmons
2018-09-27  1:19 ` [lustre-devel] [PATCH 00/25] lustre: lnet: remaining fixes for multi-rail NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1537930097-11624-6-git-send-email-jsimmons@infradead.org \
    --to=jsimmons@infradead.org \
    --cc=lustre-devel@lists.lustre.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.