All of lore.kernel.org
 help / color / mirror / Atom feed
From: NeilBrown <neilb@suse.com>
To: lustre-devel@lists.lustre.org
Subject: [lustre-devel] [PATCH 26/34] LU-7734 lnet: Routing fixes part 2
Date: Tue, 25 Sep 2018 11:07:15 +1000	[thread overview]
Message-ID: <153783763587.32103.5037367646271689437.stgit@noble> (raw)
In-Reply-To: <153783752960.32103.8394391715843917125.stgit@noble>

From: Amir Shehata <amir.shehata@intel.com>

Fix lnet_select_pathway() to handle the routing cases correctly.
The following general cases are handled:
. Non-MR directly connected
. Non-MR not directly connected
. MR directly connected
. MR not directly connected
  . No gateway
  . Gateway is non-MR
  . Gateway is MR

Signed-off-by: Amir Shehata <amir.shehata@intel.com>
Change-Id: If2d16b797b94421e78a9f2a254a250a440f8b244
Reviewed-on: http://review.whamcloud.com/21167
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/staging/lustre/lnet/lnet/lib-move.c |  214 ++++++++++++++++++---------
 drivers/staging/lustre/lnet/lnet/peer.c     |   29 +++-
 2 files changed, 167 insertions(+), 76 deletions(-)

diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
index 58521b014ef3..12bc80d060e9 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-move.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -1145,6 +1145,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 	__u32 seq;
 	int cpt, cpt2, rc;
 	bool routing;
+	bool routing2;
 	bool ni_is_pref;
 	bool preferred;
 	int best_credits;
@@ -1168,6 +1169,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 	best_gw = NULL;
 	local_net = NULL;
 	routing = false;
+	routing2 = false;
 
 	seq = lnet_get_dlc_seq_locked();
 
@@ -1201,7 +1203,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 	}
 
 	/*
-	 * STEP 1: first jab at determineing best_ni
+	 * STEP 1: first jab at determining best_ni
 	 * if src_nid is explicitly specified, then best_ni is already
 	 * pre-determiend for us. Otherwise we need to select the best
 	 * one to use later on
@@ -1215,17 +1217,122 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 				      libcfs_nid2str(src_nid));
 			return -EINVAL;
 		}
+	}
+
+	if (msg->msg_type == LNET_MSG_REPLY ||
+	    msg->msg_type == LNET_MSG_ACK ||
+	    !peer->lp_multi_rail) {
+		/*
+		 * for replies we want to respond on the same peer_ni we
+		 * received the message on if possible. If not, then pick
+		 * a peer_ni to send to
+		 *
+		 * if the peer is non-multi-rail then you want to send to
+		 * the dst_nid provided as well.
+		 *
+		 * It is expected to find the lpni using dst_nid, since we
+		 * created it earlier.
+		 */
+		best_lpni = lnet_find_peer_ni_locked(dst_nid);
+		if (best_lpni)
+			lnet_peer_ni_decref_locked(best_lpni);
 
-		if (best_ni->ni_net->net_id != LNET_NIDNET(dst_nid)) {
+		if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) {
+			/*
+			 * this lpni is not on a local network so we need
+			 * to route this reply.
+			 */
+			best_gw = lnet_find_route_locked(NULL,
+							 best_lpni->lpni_nid,
+							 rtr_nid);
+			if (best_gw) {
+				/*
+				 * RULE: Each node considers only the next-hop
+				 *
+				 * We're going to route the message,
+				 * so change the peer to the router.
+				 */
+				LASSERT(best_gw->lpni_peer_net);
+				LASSERT(best_gw->lpni_peer_net->lpn_peer);
+				peer = best_gw->lpni_peer_net->lpn_peer;
+
+				/*
+				 * if the router is not multi-rail
+				 * then use the best_gw found to send
+				 * the message to
+				 */
+				if (!peer->lp_multi_rail)
+					best_lpni = best_gw;
+				else
+					best_lpni = NULL;
+
+				routing = true;
+			} else {
+				best_lpni = NULL;
+			}
+		} else if (!best_lpni) {
 			lnet_net_unlock(cpt);
-			LCONSOLE_WARN("No route to %s via from %s\n",
-				      libcfs_nid2str(dst_nid),
-				      libcfs_nid2str(src_nid));
+			CERROR("unable to send msg_type %d to originating %s. Destination NID not in DB\n",
+			       msg->msg_type, libcfs_nid2str(dst_nid));
 			return -EINVAL;
 		}
-		goto pick_peer;
 	}
 
+	/*
+	 * if the peer is not MR capable, then we should always send to it
+	 * using the first NI in the NET we determined.
+	 */
+	if (!peer->lp_multi_rail) {
+		if (!best_lpni) {
+			lnet_net_unlock(cpt);
+			CERROR("no route to %s\n",
+			       libcfs_nid2str(dst_nid));
+			return -EHOSTUNREACH;
+		}
+
+		/* best ni could be set because src_nid was provided */
+		if (!best_ni) {
+			best_ni = lnet_net2ni_locked(
+				best_lpni->lpni_net->net_id, cpt);
+			if (!best_ni) {
+				lnet_net_unlock(cpt);
+				CERROR("no path to %s from net %s\n",
+				       libcfs_nid2str(best_lpni->lpni_nid),
+				       libcfs_net2str(best_lpni->lpni_net->net_id));
+				return -EHOSTUNREACH;
+			}
+		}
+	}
+
+	if (best_ni == the_lnet.ln_loni) {
+		/* No send credit hassles with LOLND */
+		lnet_ni_addref_locked(best_ni, cpt);
+		msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
+		if (!msg->msg_routing)
+			msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
+		msg->msg_target.nid = best_ni->ni_nid;
+		lnet_msg_commit(msg, cpt);
+		msg->msg_txni = best_ni;
+		lnet_net_unlock(cpt);
+
+		return LNET_CREDIT_OK;
+	}
+
+	/*
+	 * if we already found a best_ni because src_nid is specified and
+	 * best_lpni because we are replying to a message then just send
+	 * the message
+	 */
+	if (best_ni && best_lpni)
+		goto send;
+
+	/*
+	 * If we already found a best_ni because src_nid is specified then
+	 * pick the peer then send the message
+	 */
+	if (best_ni)
+		goto pick_peer;
+
 	/*
 	 * Decide whether we need to route to peer_ni.
 	 * Get the local net that I need to be on to be able to directly
@@ -1242,7 +1349,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 			continue;
 
 		local_net = lnet_get_net_locked(peer_net->lpn_net_id);
-		if (!local_net) {
+		if (!local_net && !routing) {
 			struct lnet_peer_ni *net_gw;
 			/*
 			 * go through each peer_ni on that peer_net and
@@ -1263,14 +1370,11 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 
 				if (!best_gw) {
 					best_gw = net_gw;
-					best_lpni = lpni;
 				} else  {
 					rc = lnet_compare_peers(net_gw,
 								best_gw);
-					if (rc > 0) {
+					if (rc > 0)
 						best_gw = net_gw;
-						best_lpni = lpni;
-					}
 				}
 			}
 
@@ -1279,9 +1383,9 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 
 			local_net = lnet_get_net_locked
 					(LNET_NIDNET(best_gw->lpni_nid));
-			routing = true;
+			routing2 = true;
 		} else {
-			routing = false;
+			routing2 = false;
 			best_gw = NULL;
 		}
 
@@ -1342,12 +1446,17 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 		}
 	}
 
-	/*
-	 * if the peer is not MR capable, then we should always send to it
-	 * using the first NI in the NET we determined.
-	 */
-	if (!peer->lp_multi_rail && local_net)
-		best_ni = lnet_net2ni_locked(local_net->net_id, cpt);
+	if (routing2) {
+		/*
+		 * RULE: Each node considers only the next-hop
+		 *
+		 * We're going to route the message, so change the peer to
+		 * the router.
+		 */
+		LASSERT(best_gw->lpni_peer_net);
+		LASSERT(best_gw->lpni_peer_net->lpn_peer);
+		peer = best_gw->lpni_peer_net->lpn_peer;
+	}
 
 	if (!best_ni) {
 		lnet_net_unlock(cpt);
@@ -1363,43 +1472,11 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 	 */
 	best_ni->ni_seq++;
 
-	if (routing)
-		goto send;
-
 pick_peer:
-	if (best_ni == the_lnet.ln_loni) {
-		/* No send credit hassles with LOLND */
-		lnet_ni_addref_locked(best_ni, cpt);
-		msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
-		if (!msg->msg_routing)
-			msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
-		msg->msg_target.nid = best_ni->ni_nid;
-		lnet_msg_commit(msg, cpt);
-		msg->msg_txni = best_ni;
-		lnet_net_unlock(cpt);
-
-		return LNET_CREDIT_OK;
-	}
-
-	if (msg->msg_type == LNET_MSG_REPLY ||
-	    msg->msg_type == LNET_MSG_ACK) {
-		/*
-		 * for replies we want to respond on the same peer_ni we
-		 * received the message on if possible. If not, then pick
-		 * a peer_ni to send to
-		 */
-		best_lpni = lnet_find_peer_ni_locked(dst_nid);
-		if (best_lpni) {
-			lnet_peer_ni_decref_locked(best_lpni);
-			goto send;
-		} else {
-			CDEBUG(D_NET,
-			       "unable to send msg_type %d to originating %s\n",
-			       msg->msg_type,
-			       libcfs_nid2str(dst_nid));
-		}
-	}
-
+	/*
+	 * At this point the best_ni is on a local network on which
+	 * the peer has a peer_ni as well
+	 */
 	peer_net = lnet_peer_get_net_locked(peer,
 					    best_ni->ni_net->net_id);
 	/*
@@ -1429,13 +1506,16 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 			libcfs_nid2str(best_gw->lpni_nid),
 			lnet_msgtyp2str(msg->msg_type), msg->msg_len);
 
-		best_lpni = lnet_find_peer_ni_locked(dst_nid);
-		LASSERT(best_lpni);
-		lnet_peer_ni_decref_locked(best_lpni);
-
-		routing = true;
-
-		goto send;
+		routing2 = true;
+		/*
+		 * RULE: Each node considers only the next-hop
+		 *
+		 * We're going to route the message, so change the peer to
+		 * the router.
+		 */
+		LASSERT(best_gw->lpni_peer_net);
+		LASSERT(best_gw->lpni_peer_net->lpn_peer);
+		peer = best_gw->lpni_peer_net->lpn_peer;
 	} else if (!lnet_is_peer_net_healthy_locked(peer_net)) {
 		/*
 		 * this peer_net is unhealthy but we still have an opportunity
@@ -1459,6 +1539,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 	lpni = NULL;
 	best_lpni_credits = INT_MIN;
 	preferred = false;
+	best_lpni = NULL;
 	while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
 		/*
 		 * if this peer ni is not healthy just skip it, no point in
@@ -1513,19 +1594,14 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
 	}
 
 send:
+	routing = routing || routing2;
+
 	/*
 	 * Increment sequence number of the peer selected so that we
 	 * pick the next one in Round Robin.
 	 */
 	best_lpni->lpni_seq++;
 
-	/*
-	 * When routing the best gateway found acts as the best peer
-	 * NI to send to.
-	 */
-	if (routing)
-		best_lpni = best_gw;
-
 	/*
 	 * grab a reference on the peer_ni so it sticks around even if
 	 * we need to drop and relock the lnet_net_lock below.
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
index 9cecfb49db87..d757f4df1f39 100644
--- a/drivers/staging/lustre/lnet/lnet/peer.c
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -225,11 +225,18 @@ lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni)
 }
 
 /* called with lnet_net_lock LNET_LOCK_EX held */
-static void
+static int
 lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni)
 {
 	struct lnet_peer_table *ptable = NULL;
 
+	/* don't remove a peer_ni if it's also a gateway */
+	if (lpni->lpni_rtr_refcount > 0) {
+		CERROR("Peer NI %s is a gateway. Can not delete it\n",
+		       libcfs_nid2str(lpni->lpni_nid));
+		return -EBUSY;
+	}
+
 	lnet_peer_remove_from_remote_list(lpni);
 
 	/* remove peer ni from the hash list. */
@@ -260,6 +267,8 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni)
 
 	/* decrement reference on peer */
 	lnet_peer_ni_decref_locked(lpni);
+
+	return 0;
 }
 
 void lnet_peer_uninit(void)
@@ -313,17 +322,22 @@ lnet_peer_tables_create(void)
 	return 0;
 }
 
-static void
+static int
 lnet_peer_del_locked(struct lnet_peer *peer)
 {
 	struct lnet_peer_ni *lpni = NULL, *lpni2;
+	int rc = 0, rc2 = 0;
 
 	lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
 	while (lpni) {
 		lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
-		lnet_peer_ni_del_locked(lpni);
+		rc = lnet_peer_ni_del_locked(lpni);
+		if (rc != 0)
+			rc2 = rc;
 		lpni = lpni2;
 	}
+
+	return rc2;
 }
 
 static void
@@ -899,6 +913,7 @@ lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid)
 	lnet_nid_t local_nid;
 	struct lnet_peer *peer;
 	struct lnet_peer_ni *lpni;
+	int rc;
 
 	if (key_nid == LNET_NID_ANY)
 		return -EINVAL;
@@ -919,17 +934,17 @@ lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid)
 		 * entire peer
 		 */
 		lnet_net_lock(LNET_LOCK_EX);
-		lnet_peer_del_locked(peer);
+		rc = lnet_peer_del_locked(peer);
 		lnet_net_unlock(LNET_LOCK_EX);
 
-		return 0;
+		return rc;
 	}
 
 	lnet_net_lock(LNET_LOCK_EX);
-	lnet_peer_ni_del_locked(lpni);
+	rc = lnet_peer_ni_del_locked(lpni);
 	lnet_net_unlock(LNET_LOCK_EX);
 
-	return 0;
+	return rc;
 }
 
 void

  parent reply	other threads:[~2018-09-25  1:07 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-09-25  1:07 [lustre-devel] [PATCH 00/34] lustre: remainder of multi-rail series NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 02/34] lnet: change struct lnet_peer to struct lnet_peer_ni NeilBrown
2018-09-29 22:47   ` James Simmons
2018-09-25  1:07 ` [lustre-devel] [PATCH 01/34] lnet: replace all lp_ fields with lpni_ NeilBrown
2018-09-29 22:45   ` James Simmons
2018-09-25  1:07 ` [lustre-devel] [PATCH 03/34] lnet: Change lpni_refcount to atomic_t NeilBrown
2018-09-29 22:47   ` James Simmons
2018-09-25  1:07 ` NeilBrown [this message]
2018-09-25  1:07 ` [lustre-devel] [PATCH 28/34] LU-7734 lnet: Fix crash in router_proc.c NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 13/34] LU-7734 lnet: Primary NID and traffic distribution NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 06/34] lnet: introduce lnet_find_peer_ni_locked() NeilBrown
2018-09-29 22:48   ` James Simmons
2018-09-25  1:07 ` [lustre-devel] [PATCH 12/34] LU-7734 lnet: NUMA support NeilBrown
2018-09-30  1:49   ` James Simmons
2018-09-25  1:07 ` [lustre-devel] [PATCH 08/34] LU-7734 lnet: Multi-Rail peer split NeilBrown
2018-09-29 23:01   ` James Simmons
2018-10-02  3:10     ` NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 04/34] lnet: change some function names - add 'ni' NeilBrown
2018-09-29 22:47   ` James Simmons
2018-09-25  1:07 ` [lustre-devel] [PATCH 09/34] LU-7734 lnet: Multi-Rail local_ni/peer_ni selection NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 15/34] LU-7734 lnet: handle N NIs to 1 LND peer NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 19/34] LU-7734 lnet: proper cpt locking NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 17/34] LU-7734 lnet: Add peer_ni and NI stats for DLC NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 24/34] LU-7734 lnet: fix lnet_select_pathway() NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 22/34] LU-7734 lnet: fix lnet_peer_table_cleanup_locked() NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 07/34] lnet: lnet_peer_tables_cleanup: use an exclusive lock NeilBrown
2018-09-29 22:53   ` James Simmons
2018-10-02  2:25     ` NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 14/34] LU-7734 lnet: handle non-MR peers NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 21/34] LU-7734 lnet: simplify and fix lnet_select_pathway() NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 27/34] LU-7734 lnet: fix routing selection NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 23/34] LU-7734 lnet: configuration fixes NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 18/34] LU-7734 lnet: peer/peer_ni handling adjustments NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 25/34] LU-7734 lnet: Routing fixes part 1 NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 10/34] LU-7734 lnet: configure peers from DLC NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 16/34] LU-7734 lnet: rename LND peer to peer_ni NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 20/34] LU-7734 lnet: protect peer_ni credits NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 11/34] LU-7734 lnet: configure local NI from DLC NeilBrown
2018-09-29 21:05   ` James Simmons
2018-10-02  3:19     ` NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 05/34] lnet: make lnet_nid_cpt_hash non-static NeilBrown
2018-09-29 22:48   ` James Simmons
2018-09-25  1:07 ` [lustre-devel] [PATCH 30/34] LU-7734 lnet: set primary NID in ptlrpc_connection_get() NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 29/34] LU-7734 lnet: double free in lnet_add_net_common() NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 34/34] LU-7734 lnet: cpt locking NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 32/34] LU-7734 lnet: rename peer key_nid to prim_nid NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 33/34] lnet: use BIT() macro for LNET_MD_* flags NeilBrown
2018-09-28 16:25   ` James Simmons
2018-10-02  3:31     ` NeilBrown
2018-09-25  1:07 ` [lustre-devel] [PATCH 31/34] LU-7734 lnet: fix NULL access in lnet_peer_aliveness_enabled NeilBrown
2018-09-30  2:17 ` [lustre-devel] [PATCH 00/34] lustre: remainder of multi-rail series James Simmons
2018-10-02  3:41   ` NeilBrown
2018-10-01  2:06 ` James Simmons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=153783763587.32103.5037367646271689437.stgit@noble \
    --to=neilb@suse.com \
    --cc=lustre-devel@lists.lustre.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.