All of lore.kernel.org
 help / color / mirror / Atom feed
* [net  1/1] tipc: fix failover problem
@ 2018-09-26 19:00 Jon Maloy
  2018-09-29 18:46 ` David Miller
  0 siblings, 1 reply; 2+ messages in thread
From: Jon Maloy @ 2018-09-26 19:00 UTC (permalink / raw)
  To: davem, netdev
  Cc: gordan.mihaljevic, tung.q.nguyen, hoang.h.le, jon.maloy,
	canh.d.luu, ying.xue, tipc-discussion

From: LUU Duc Canh <canh.d.luu@dektech.com.au>

We see the following scenario:
1) Link endpoint B on node 1 discovers that its peer endpoint is gone.
   Since there is a second working link, failover procedure is started.
2) Link endpoint A on node 1 sends a FAILOVER message to peer endpoint
   A on node 2. The node item 1->2 goes to state FAILINGOVER.
3) Linke endpoint A/2 receives the failover, and is supposed to take
   down its parallell link endpoint B/2, while producing a FAILOVER
   message to send back to A/1.
4) However, B/2 has already been deleted, so no FAILOVER message can
   created.
5) Node 1->2 remains in state FAILINGOVER forever, refusing to receive
   any messages that can bring B/1 up again. We are left with a non-
   redundant link between node 1 and 2.

We fix this with letting endpoint A/2 build a dummy FAILOVER message
to send to back to A/1, so that the situation can be resolved.

Signed-off-by: LUU Duc Canh <canh.d.luu@dektech.com.au>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/link.c | 35 +++++++++++++++++++++++++++++++++++
 net/tipc/link.h |  3 +++
 net/tipc/node.c | 11 +++++++++++
 3 files changed, 49 insertions(+)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 26cc033..4ed650c 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -410,6 +410,11 @@ char *tipc_link_name(struct tipc_link *l)
 	return l->name;
 }
 
+u32 tipc_link_state(struct tipc_link *l)
+{
+	return l->state;
+}
+
 /**
  * tipc_link_create - create a new link
  * @n: pointer to associated node
@@ -1385,6 +1390,36 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
 	__skb_queue_tail(xmitq, skb);
 }
 
+void tipc_link_create_dummy_tnl_msg(struct tipc_link *l,
+				    struct sk_buff_head *xmitq)
+{
+	u32 onode = tipc_own_addr(l->net);
+	struct tipc_msg *hdr, *ihdr;
+	struct sk_buff_head tnlq;
+	struct sk_buff *skb;
+	u32 dnode = l->addr;
+
+	skb_queue_head_init(&tnlq);
+	skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG,
+			      INT_H_SIZE, BASIC_H_SIZE,
+			      dnode, onode, 0, 0, 0);
+	if (!skb) {
+		pr_warn("%sunable to create tunnel packet\n", link_co_err);
+		return;
+	}
+
+	hdr = buf_msg(skb);
+	msg_set_msgcnt(hdr, 1);
+	msg_set_bearer_id(hdr, l->peer_bearer_id);
+
+	ihdr = (struct tipc_msg *)msg_data(hdr);
+	tipc_msg_init(onode, ihdr, TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
+		      BASIC_H_SIZE, dnode);
+	msg_set_errcode(ihdr, TIPC_ERR_NO_PORT);
+	__skb_queue_tail(&tnlq, skb);
+	tipc_link_xmit(l, &tnlq, xmitq);
+}
+
 /* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets
  * with contents of the link's transmit and backlog queues.
  */
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 7bc494a3..90488c5 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -88,6 +88,8 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
 			 struct tipc_link **link);
 void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
 			   int mtyp, struct sk_buff_head *xmitq);
+void tipc_link_create_dummy_tnl_msg(struct tipc_link *tnl,
+				    struct sk_buff_head *xmitq);
 void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
 int tipc_link_fsm_evt(struct tipc_link *l, int evt);
 bool tipc_link_is_up(struct tipc_link *l);
@@ -107,6 +109,7 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l);
 u16 tipc_link_acked(struct tipc_link *l);
 u32 tipc_link_id(struct tipc_link *l);
 char *tipc_link_name(struct tipc_link *l);
+u32 tipc_link_state(struct tipc_link *l);
 char tipc_link_plane(struct tipc_link *l);
 int tipc_link_prio(struct tipc_link *l);
 int tipc_link_window(struct tipc_link *l);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 68014f1..b0ee25f 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -111,6 +111,7 @@ struct tipc_node {
 	int action_flags;
 	struct list_head list;
 	int state;
+	bool failover_sent;
 	u16 sync_point;
 	int link_cnt;
 	u16 working_links;
@@ -680,6 +681,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
 		*slot0 = bearer_id;
 		*slot1 = bearer_id;
 		tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT);
+		n->failover_sent = false;
 		n->action_flags |= TIPC_NOTIFY_NODE_UP;
 		tipc_link_set_active(nl, true);
 		tipc_bcast_add_peer(n->net, nl, xmitq);
@@ -1615,6 +1617,15 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
 			tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl),
 							tipc_link_inputq(l));
 		}
+		/* If parallel link was already down, and this happened before
+		 * the tunnel link came up, FAILOVER was never sent. Ensure that
+		 * FAILOVER is sent to get peer out of NODE_FAILINGOVER state.
+		 */
+		if (n->state != NODE_FAILINGOVER && !n->failover_sent) {
+			tipc_link_create_dummy_tnl_msg(l, xmitq);
+			n->failover_sent = true;
+		}
+
 		/* If pkts arrive out of order, use lowest calculated syncpt */
 		if (less(syncpt, n->sync_point))
 			n->sync_point = syncpt;
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [net 1/1] tipc: fix failover problem
  2018-09-26 19:00 [net 1/1] tipc: fix failover problem Jon Maloy
@ 2018-09-29 18:46 ` David Miller
  0 siblings, 0 replies; 2+ messages in thread
From: David Miller @ 2018-09-29 18:46 UTC (permalink / raw)
  To: jon.maloy
  Cc: netdev, gordan.mihaljevic, tung.q.nguyen, hoang.h.le, canh.d.luu,
	ying.xue, tipc-discussion

From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 26 Sep 2018 21:00:54 +0200

> From: LUU Duc Canh <canh.d.luu@dektech.com.au>
> 
> We see the following scenario:
> 1) Link endpoint B on node 1 discovers that its peer endpoint is gone.
>    Since there is a second working link, failover procedure is started.
> 2) Link endpoint A on node 1 sends a FAILOVER message to peer endpoint
>    A on node 2. The node item 1->2 goes to state FAILINGOVER.
> 3) Linke endpoint A/2 receives the failover, and is supposed to take
>    down its parallell link endpoint B/2, while producing a FAILOVER
>    message to send back to A/1.
> 4) However, B/2 has already been deleted, so no FAILOVER message can
>    created.
> 5) Node 1->2 remains in state FAILINGOVER forever, refusing to receive
>    any messages that can bring B/1 up again. We are left with a non-
>    redundant link between node 1 and 2.
> 
> We fix this with letting endpoint A/2 build a dummy FAILOVER message
> to send to back to A/1, so that the situation can be resolved.
> 
> Signed-off-by: LUU Duc Canh <canh.d.luu@dektech.com.au>
> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>

Applied.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2018-09-30  1:16 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-26 19:00 [net 1/1] tipc: fix failover problem Jon Maloy
2018-09-29 18:46 ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.