All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sctp: Implement quick failover draft from tsvwg
@ 2012-07-13 18:26 ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-13 18:26 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly stop using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
---
 Documentation/networking/ip-sysctl.txt |   14 +++++++++++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |    4 +++
 include/net/sctp/user.h                |    1 +
 net/sctp/associola.c                   |   33 +++++++++++++++++++++++++------
 net/sctp/outqueue.c                    |    6 +++-
 net/sctp/sm_sideeffect.c               |   33 ++++++++++++++++++++++++++++---
 net/sctp/sysctl.c                      |    9 ++++++++
 net/sctp/transport.c                   |    3 +-
 9 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans >= path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..22825abe 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -160,6 +160,7 @@ extern struct sctp_globals {
 	int max_retrans_association;
 	int max_retrans_path;
 	int max_retrans_init;
+	int pf_retrans;
 
 	/*
 	 * Policy for preforming sctp/socket accounting
@@ -258,6 +259,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -1660,6 +1662,8 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..cece1bf 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -649,6 +649,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..f3ebc23 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -840,6 +842,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +856,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +882,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
 	default:
 		return;
 	}
@@ -878,12 +893,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +917,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and pf_retrans is less than pathmaxrxt, then mark
+	 * this transport as Partially Failed, see SCTP Quick Failover
+	 * Draft, section 5.1, point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..4639ba2 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -585,7 +585,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH] sctp: Implement quick failover draft from tsvwg
@ 2012-07-13 18:26 ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-13 18:26 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly stop using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
---
 Documentation/networking/ip-sysctl.txt |   14 +++++++++++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |    4 +++
 include/net/sctp/user.h                |    1 +
 net/sctp/associola.c                   |   33 +++++++++++++++++++++++++------
 net/sctp/outqueue.c                    |    6 +++-
 net/sctp/sm_sideeffect.c               |   33 ++++++++++++++++++++++++++++---
 net/sctp/sysctl.c                      |    9 ++++++++
 net/sctp/transport.c                   |    3 +-
 9 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans >= path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..22825abe 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -160,6 +160,7 @@ extern struct sctp_globals {
 	int max_retrans_association;
 	int max_retrans_path;
 	int max_retrans_init;
+	int pf_retrans;
 
 	/*
 	 * Policy for preforming sctp/socket accounting
@@ -258,6 +259,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -1660,6 +1662,8 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..cece1bf 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -649,6 +649,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..f3ebc23 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -840,6 +842,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +856,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +882,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
 	default:
 		return;
 	}
@@ -878,12 +893,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +917,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and pf_retrans is less than pathmaxrxt, then mark
+	 * this transport as Partially Failed, see SCTP Quick Failover
+	 * Draft, section 5.1, point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..4639ba2 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -585,7 +585,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH] sctp: Implement quick failover draft from tsvwg
  2012-07-13 18:26 ` Neil Horman
@ 2012-07-14 18:12   ` Vlad Yasevich
  -1 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-14 18:12 UTC (permalink / raw)
  To: Neil Horman, netdev; +Cc: Sridhar Samudrala, David S. Miller, linux-sctp

Neil Horman <nhorman@tuxdriver.com> wrote:

>I've seen several attempts recently made to do quick failover of sctp
>transports
>by reducing various retransmit timers and counters.  While its possible
>to
>implement a faster failover on multihomed sctp associations, its not
>particularly robust, in that it can lead to unneeded retransmits, as
>well as
>false connection failures due to intermittent latency on a network.
>
>Instead, lets implement the new ietf quick failover draft found here:
>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
>This will let the sctp stack identify transports that have had a small
>number of
>errors, and avoid using them quickly until their reliability can be
>re-established.  I've tested this out on two virt guests connected via
>multiple
>isolated virt networks and believe its in compliance with the above
>draft and
>works well.
>
>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>CC: Vlad Yasevich <vyasevich@gmail.com>
>CC: Sridhar Samudrala <sri@us.ibm.com>
>CC: "David S. Miller" <davem@davemloft.net>
>CC: linux-sctp@vger.kernel.org
>---
> Documentation/networking/ip-sysctl.txt |   14 +++++++++++++
> include/net/sctp/constants.h           |    1 +
> include/net/sctp/structs.h             |    4 +++
> include/net/sctp/user.h                |    1 +
>net/sctp/associola.c                   |   33
>+++++++++++++++++++++++++------
> net/sctp/outqueue.c                    |    6 +++-
>net/sctp/sm_sideeffect.c               |   33
>++++++++++++++++++++++++++++---
> net/sctp/sysctl.c                      |    9 ++++++++
> net/sctp/transport.c                   |    3 +-
> 9 files changed, 90 insertions(+), 14 deletions(-)
>
>diff --git a/Documentation/networking/ip-sysctl.txt
>b/Documentation/networking/ip-sysctl.txt
>index 47b6c79..c636f9c 100644
>--- a/Documentation/networking/ip-sysctl.txt
>+++ b/Documentation/networking/ip-sysctl.txt
>@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
> 
> 	Default: 5
> 
>+pf_retrans - INTEGER
>+	The number of retransmissions that will be attempted on a given path
>+	before traffic is redirected to an alternate transport (should one
>+	exist).  Note this is distinct from path_max_retrans, as a path that
>+	passes the pf_retrans threshold can still be used.  Its only
>+	deprioritized when a transmission path is selected by the stack. 
>This
>+	setting is primarily used to enable fast failover mechanisms without
>+	having to reduce path_max_retrans to a very low value.  See:
>+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+	for details.  Note also that a value of pf_retrans > path_max_retrans
>+	disables this feature
>+
>+	Default: 0
>+
> rto_initial - INTEGER
>	The initial round trip timeout value in milliseconds that will be used
> 	in calculating round trip times.  This is the initial time interval
>diff --git a/include/net/sctp/constants.h
>b/include/net/sctp/constants.h
>index 942b864..d053d2e 100644
>--- a/include/net/sctp/constants.h
>+++ b/include/net/sctp/constants.h
>@@ -334,6 +334,7 @@ typedef enum {
> typedef enum {
> 	SCTP_TRANSPORT_UP,
> 	SCTP_TRANSPORT_DOWN,
>+	SCTP_TRANSPORT_PF,
> } sctp_transport_cmd_t;
> 
> /* These are the address scopes defined mainly for IPv4 addresses
>diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
>index e4652fe..22825abe 100644
>--- a/include/net/sctp/structs.h
>+++ b/include/net/sctp/structs.h
>@@ -160,6 +160,7 @@ extern struct sctp_globals {
> 	int max_retrans_association;
> 	int max_retrans_path;
> 	int max_retrans_init;
>+	int pf_retrans;
> 
> 	/*
> 	 * Policy for preforming sctp/socket accounting
>@@ -258,6 +259,7 @@ extern struct sctp_globals {
> #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
> #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
> #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
>+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
> #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
> #define sctp_sack_timeout		(sctp_globals.sack_timeout)
> #define sctp_hb_interval		(sctp_globals.hb_interval)
>@@ -1660,6 +1662,8 @@ struct sctp_association {
> 	 */
> 	int max_retrans;
> 
>+	int pf_retrans;
>+
> 	/* Maximum number of times the endpoint will retransmit INIT  */
> 	__u16 max_init_attempts;
> 
>diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
>index 0842ef0..cece1bf 100644
>--- a/include/net/sctp/user.h
>+++ b/include/net/sctp/user.h
>@@ -649,6 +649,7 @@ struct sctp_paddrinfo {
>  */
> enum sctp_spinfo_state {
> 	SCTP_INACTIVE,
>+	SCTP_PF,
> 	SCTP_ACTIVE,
> 	SCTP_UNCONFIRMED,
> 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
>diff --git a/net/sctp/associola.c b/net/sctp/associola.c
>index 5bc9ab1..f3ebc23 100644
>--- a/net/sctp/associola.c
>+++ b/net/sctp/associola.c
>@@ -124,6 +124,8 @@ static struct sctp_association
>*sctp_association_init(struct sctp_association *a
> 	 * socket values.
> 	 */
> 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
>+	asoc->pf_retrans  = sctp_pf_retrans;
>+
> 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
> 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
> 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
>@@ -840,6 +842,7 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 	struct sctp_ulpevent *event;
> 	struct sockaddr_storage addr;
> 	int spc_state = 0;
>+	bool ulp_notify = true;
> 
> 	/* Record the transition on the transport.  */
> 	switch (command) {
>@@ -853,6 +856,14 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 			spc_state = SCTP_ADDR_CONFIRMED;
> 		else
> 			spc_state = SCTP_ADDR_AVAILABLE;
>+		/* Don't inform ULP about transition from PF to
>+		 * active state and set cwnd to 1, see SCTP
>+		 * Quick failover draft section 5.1, point 5
>+		 */
>+		if (transport->state == SCTP_PF) {
>+			ulp_notify = false;
>+			transport->cwnd = 1;
>+		}
> 		transport->state = SCTP_ACTIVE;
> 		break;
> 
>@@ -871,6 +882,10 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 		spc_state = SCTP_ADDR_UNREACHABLE;
> 		break;
> 
>+	case SCTP_TRANSPORT_PF:
>+		transport->state = SCTP_PF;
>+		ulp_notify = false;
>+		break;
> 	default:
> 		return;
> 	}
>@@ -878,12 +893,15 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
> 	 * user.
> 	 */
>-	memset(&addr, 0, sizeof(struct sockaddr_storage));
>-	memcpy(&addr, &transport->ipaddr,
>transport->af_specific->sockaddr_len);
>-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
>-				0, spc_state, error, GFP_ATOMIC);
>-	if (event)
>-		sctp_ulpq_tail_event(&asoc->ulpq, event);
>+	if (ulp_notify) {
>+		memset(&addr, 0, sizeof(struct sockaddr_storage));
>+		memcpy(&addr, &transport->ipaddr,
>+		       transport->af_specific->sockaddr_len);
>+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
>+					0, spc_state, error, GFP_ATOMIC);
>+		if (event)
>+			sctp_ulpq_tail_event(&asoc->ulpq, event);
>+	}
> 
> 	/* Select new active and retran paths. */
> 
>@@ -899,7 +917,8 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 			transports) {
> 
> 		if ((t->state == SCTP_INACTIVE) ||
>-		    (t->state == SCTP_UNCONFIRMED))
>+		    (t->state == SCTP_UNCONFIRMED) ||
>+		    (t->state == SCTP_PF))
> 			continue;
> 		if (!first || t->last_time_heard > first->last_time_heard) {
> 			second = first;
>diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
>index a0fa19f..e7aa177c 100644
>--- a/net/sctp/outqueue.c
>+++ b/net/sctp/outqueue.c
>@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
>rtx_timeout)
> 			if (!new_transport)
> 				new_transport = asoc->peer.active_path;
> 		} else if ((new_transport->state == SCTP_INACTIVE) ||
>-			   (new_transport->state == SCTP_UNCONFIRMED)) {
>+			   (new_transport->state == SCTP_UNCONFIRMED) ||
>+			   (new_transport->state == SCTP_PF)) {
> 			/* If the chunk is Heartbeat or Heartbeat Ack,
> 			 * send it to chunk->transport, even if it's
> 			 * inactive.
>@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
>rtx_timeout)
> 			new_transport = chunk->transport;
> 			if (!new_transport ||
> 			    ((new_transport->state == SCTP_INACTIVE) ||
>-			     (new_transport->state == SCTP_UNCONFIRMED)))
>+			     (new_transport->state == SCTP_UNCONFIRMED) ||
>+			     (new_transport->state == SCTP_PF)))
> 				new_transport = asoc->peer.active_path;
> 			if (new_transport->state == SCTP_UNCONFIRMED)
> 				continue;
>diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
>index c96d1a8..285e26a 100644
>--- a/net/sctp/sm_sideeffect.c
>+++ b/net/sctp/sm_sideeffect.c
>@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type,
>sctp_subtype_t subtype,
> 			     sctp_cmd_seq_t *commands,
> 			     gfp_t gfp);
> 
>+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
>+				     struct sctp_transport *t);
> /********************************************************************
>  * Helper functions
>  ********************************************************************/
>@@ -470,7 +472,8 @@ sctp_timer_event_t
>*sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
>  * notification SHOULD be sent to the upper layer.
>  *
>  */
>-static void sctp_do_8_2_transport_strike(struct sctp_association
>*asoc,
>+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
>+					 struct sctp_association *asoc,
> 					 struct sctp_transport *transport,
> 					 int is_hb)
> {
>@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct
>sctp_association *asoc,
> 			transport->error_count++;
> 	}
> 
>+	/* If the transport error count is greater than the pf_retrans
>+	 * threshold, and less than pathmaxrxt, then mark this transport
>+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
>+	 * point 1
>+	 */
>+	if ((transport->state != SCTP_PF) &&
>+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
>+	   (transport->error_count > asoc->pf_retrans)) {
>+
>+		sctp_assoc_control_transport(asoc, transport,
>+					     SCTP_TRANSPORT_PF,
>+					     0);
>+
>+		/* Update the hb timer to resend a heartbeat every rto */
>+		sctp_cmd_hb_timer_update(commands, transport);
>+	}
>+
> 	if (transport->state != SCTP_INACTIVE &&
> 	    (transport->error_count > transport->pathmaxrxt)) {
> 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
>@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t
>*cmds,
> 					     SCTP_HEARTBEAT_SUCCESS);
> 	}
> 
>+	if (t->state == SCTP_PF)
>+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
>+					     SCTP_HEARTBEAT_SUCCESS);
>+
> 	/* The receiver of the HEARTBEAT ACK should also perform an
> 	 * RTT measurement for that destination transport address
> 	 * using the time value carried in the HEARTBEAT ACK chunk.
>@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t
>event_type,
> 
> 		case SCTP_CMD_STRIKE:
> 			/* Mark one strike against a transport.  */
>-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
>-						    0);
>+			sctp_do_8_2_transport_strike(commands, asoc,
>+						    cmd->obj.transport, 0);
> 			break;
> 
> 		case SCTP_CMD_TRANSPORT_IDLE:
>@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t
>event_type,
> 
> 		case SCTP_CMD_TRANSPORT_HB_SENT:
> 			t = cmd->obj.transport;
>-			sctp_do_8_2_transport_strike(asoc, t, 1);
>+			sctp_do_8_2_transport_strike(commands, asoc,
>+						     t, 1);
> 			t->hb_sent = 1;
> 			break;
> 
>diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
>index e5fe639..2b2bfe9 100644
>--- a/net/sctp/sysctl.c
>+++ b/net/sctp/sysctl.c
>@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> 		.extra2		= &int_max
> 	},
> 	{
>+		.procname	= "pf_retrans",
>+		.data		= &sctp_pf_retrans,
>+		.maxlen		= sizeof(int),
>+		.mode		= 0644,
>+		.proc_handler	= proc_dointvec_minmax,
>+		.extra1		= &zero,
>+		.extra2		= &int_max
>+	},
>+	{
> 		.procname	= "max_init_retransmits",
> 		.data		= &sctp_max_retrans_init,
> 		.maxlen		= sizeof(int),
>diff --git a/net/sctp/transport.c b/net/sctp/transport.c
>index b026ba0..4639ba2 100644
>--- a/net/sctp/transport.c
>+++ b/net/sctp/transport.c
>@@ -585,7 +585,8 @@ unsigned long sctp_transport_timeout(struct
>sctp_transport *t)
> {
> 	unsigned long timeout;
> 	timeout = t->rto + sctp_jitter(t->rto);
>-	if (t->state != SCTP_UNCONFIRMED)
>+	if ((t->state != SCTP_UNCONFIRMED) &&
>+	    (t->state != SCTP_PF))
> 		timeout += t->hbinterval;
> 	timeout += jiffies;
> 	return timeout;
>-- 
>1.7.7.6

One thing that seems to be missing is the API.  As a result you don't carry the value per transport which we'll need.  That caused you to add assoc parameter to some functions.  That's really the only missing item.
-- 
Sent from my Android phone with SkitMail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] sctp: Implement quick failover draft from tsvwg
@ 2012-07-14 18:12   ` Vlad Yasevich
  0 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-14 18:12 UTC (permalink / raw)
  To: Neil Horman, netdev; +Cc: Sridhar Samudrala, David S. Miller, linux-sctp

Neil Horman <nhorman@tuxdriver.com> wrote:

>I've seen several attempts recently made to do quick failover of sctp
>transports
>by reducing various retransmit timers and counters.  While its possible
>to
>implement a faster failover on multihomed sctp associations, its not
>particularly robust, in that it can lead to unneeded retransmits, as
>well as
>false connection failures due to intermittent latency on a network.
>
>Instead, lets implement the new ietf quick failover draft found here:
>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
>This will let the sctp stack identify transports that have had a small
>number of
>errors, and avoid using them quickly until their reliability can be
>re-established.  I've tested this out on two virt guests connected via
>multiple
>isolated virt networks and believe its in compliance with the above
>draft and
>works well.
>
>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>CC: Vlad Yasevich <vyasevich@gmail.com>
>CC: Sridhar Samudrala <sri@us.ibm.com>
>CC: "David S. Miller" <davem@davemloft.net>
>CC: linux-sctp@vger.kernel.org
>---
> Documentation/networking/ip-sysctl.txt |   14 +++++++++++++
> include/net/sctp/constants.h           |    1 +
> include/net/sctp/structs.h             |    4 +++
> include/net/sctp/user.h                |    1 +
>net/sctp/associola.c                   |   33
>+++++++++++++++++++++++++------
> net/sctp/outqueue.c                    |    6 +++-
>net/sctp/sm_sideeffect.c               |   33
>++++++++++++++++++++++++++++---
> net/sctp/sysctl.c                      |    9 ++++++++
> net/sctp/transport.c                   |    3 +-
> 9 files changed, 90 insertions(+), 14 deletions(-)
>
>diff --git a/Documentation/networking/ip-sysctl.txt
>b/Documentation/networking/ip-sysctl.txt
>index 47b6c79..c636f9c 100644
>--- a/Documentation/networking/ip-sysctl.txt
>+++ b/Documentation/networking/ip-sysctl.txt
>@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
> 
> 	Default: 5
> 
>+pf_retrans - INTEGER
>+	The number of retransmissions that will be attempted on a given path
>+	before traffic is redirected to an alternate transport (should one
>+	exist).  Note this is distinct from path_max_retrans, as a path that
>+	passes the pf_retrans threshold can still be used.  Its only
>+	deprioritized when a transmission path is selected by the stack. 
>This
>+	setting is primarily used to enable fast failover mechanisms without
>+	having to reduce path_max_retrans to a very low value.  See:
>+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+	for details.  Note also that a value of pf_retrans > path_max_retrans
>+	disables this feature
>+
>+	Default: 0
>+
> rto_initial - INTEGER
>	The initial round trip timeout value in milliseconds that will be used
> 	in calculating round trip times.  This is the initial time interval
>diff --git a/include/net/sctp/constants.h
>b/include/net/sctp/constants.h
>index 942b864..d053d2e 100644
>--- a/include/net/sctp/constants.h
>+++ b/include/net/sctp/constants.h
>@@ -334,6 +334,7 @@ typedef enum {
> typedef enum {
> 	SCTP_TRANSPORT_UP,
> 	SCTP_TRANSPORT_DOWN,
>+	SCTP_TRANSPORT_PF,
> } sctp_transport_cmd_t;
> 
> /* These are the address scopes defined mainly for IPv4 addresses
>diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
>index e4652fe..22825abe 100644
>--- a/include/net/sctp/structs.h
>+++ b/include/net/sctp/structs.h
>@@ -160,6 +160,7 @@ extern struct sctp_globals {
> 	int max_retrans_association;
> 	int max_retrans_path;
> 	int max_retrans_init;
>+	int pf_retrans;
> 
> 	/*
> 	 * Policy for preforming sctp/socket accounting
>@@ -258,6 +259,7 @@ extern struct sctp_globals {
> #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
> #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
> #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
>+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
> #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
> #define sctp_sack_timeout		(sctp_globals.sack_timeout)
> #define sctp_hb_interval		(sctp_globals.hb_interval)
>@@ -1660,6 +1662,8 @@ struct sctp_association {
> 	 */
> 	int max_retrans;
> 
>+	int pf_retrans;
>+
> 	/* Maximum number of times the endpoint will retransmit INIT  */
> 	__u16 max_init_attempts;
> 
>diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
>index 0842ef0..cece1bf 100644
>--- a/include/net/sctp/user.h
>+++ b/include/net/sctp/user.h
>@@ -649,6 +649,7 @@ struct sctp_paddrinfo {
>  */
> enum sctp_spinfo_state {
> 	SCTP_INACTIVE,
>+	SCTP_PF,
> 	SCTP_ACTIVE,
> 	SCTP_UNCONFIRMED,
> 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
>diff --git a/net/sctp/associola.c b/net/sctp/associola.c
>index 5bc9ab1..f3ebc23 100644
>--- a/net/sctp/associola.c
>+++ b/net/sctp/associola.c
>@@ -124,6 +124,8 @@ static struct sctp_association
>*sctp_association_init(struct sctp_association *a
> 	 * socket values.
> 	 */
> 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
>+	asoc->pf_retrans  = sctp_pf_retrans;
>+
> 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
> 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
> 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
>@@ -840,6 +842,7 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 	struct sctp_ulpevent *event;
> 	struct sockaddr_storage addr;
> 	int spc_state = 0;
>+	bool ulp_notify = true;
> 
> 	/* Record the transition on the transport.  */
> 	switch (command) {
>@@ -853,6 +856,14 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 			spc_state = SCTP_ADDR_CONFIRMED;
> 		else
> 			spc_state = SCTP_ADDR_AVAILABLE;
>+		/* Don't inform ULP about transition from PF to
>+		 * active state and set cwnd to 1, see SCTP
>+		 * Quick failover draft section 5.1, point 5
>+		 */
>+		if (transport->state == SCTP_PF) {
>+			ulp_notify = false;
>+			transport->cwnd = 1;
>+		}
> 		transport->state = SCTP_ACTIVE;
> 		break;
> 
>@@ -871,6 +882,10 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 		spc_state = SCTP_ADDR_UNREACHABLE;
> 		break;
> 
>+	case SCTP_TRANSPORT_PF:
>+		transport->state = SCTP_PF;
>+		ulp_notify = false;
>+		break;
> 	default:
> 		return;
> 	}
>@@ -878,12 +893,15 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
> 	 * user.
> 	 */
>-	memset(&addr, 0, sizeof(struct sockaddr_storage));
>-	memcpy(&addr, &transport->ipaddr,
>transport->af_specific->sockaddr_len);
>-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
>-				0, spc_state, error, GFP_ATOMIC);
>-	if (event)
>-		sctp_ulpq_tail_event(&asoc->ulpq, event);
>+	if (ulp_notify) {
>+		memset(&addr, 0, sizeof(struct sockaddr_storage));
>+		memcpy(&addr, &transport->ipaddr,
>+		       transport->af_specific->sockaddr_len);
>+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
>+					0, spc_state, error, GFP_ATOMIC);
>+		if (event)
>+			sctp_ulpq_tail_event(&asoc->ulpq, event);
>+	}
> 
> 	/* Select new active and retran paths. */
> 
>@@ -899,7 +917,8 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 			transports) {
> 
> 		if ((t->state == SCTP_INACTIVE) ||
>-		    (t->state == SCTP_UNCONFIRMED))
>+		    (t->state == SCTP_UNCONFIRMED) ||
>+		    (t->state == SCTP_PF))
> 			continue;
> 		if (!first || t->last_time_heard > first->last_time_heard) {
> 			second = first;
>diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
>index a0fa19f..e7aa177c 100644
>--- a/net/sctp/outqueue.c
>+++ b/net/sctp/outqueue.c
>@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
>rtx_timeout)
> 			if (!new_transport)
> 				new_transport = asoc->peer.active_path;
> 		} else if ((new_transport->state == SCTP_INACTIVE) ||
>-			   (new_transport->state == SCTP_UNCONFIRMED)) {
>+			   (new_transport->state == SCTP_UNCONFIRMED) ||
>+			   (new_transport->state == SCTP_PF)) {
> 			/* If the chunk is Heartbeat or Heartbeat Ack,
> 			 * send it to chunk->transport, even if it's
> 			 * inactive.
>@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
>rtx_timeout)
> 			new_transport = chunk->transport;
> 			if (!new_transport ||
> 			    ((new_transport->state == SCTP_INACTIVE) ||
>-			     (new_transport->state == SCTP_UNCONFIRMED)))
>+			     (new_transport->state == SCTP_UNCONFIRMED) ||
>+			     (new_transport->state == SCTP_PF)))
> 				new_transport = asoc->peer.active_path;
> 			if (new_transport->state == SCTP_UNCONFIRMED)
> 				continue;
>diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
>index c96d1a8..285e26a 100644
>--- a/net/sctp/sm_sideeffect.c
>+++ b/net/sctp/sm_sideeffect.c
>@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type,
>sctp_subtype_t subtype,
> 			     sctp_cmd_seq_t *commands,
> 			     gfp_t gfp);
> 
>+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
>+				     struct sctp_transport *t);
> /********************************************************************
>  * Helper functions
>  ********************************************************************/
>@@ -470,7 +472,8 @@ sctp_timer_event_t
>*sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
>  * notification SHOULD be sent to the upper layer.
>  *
>  */
>-static void sctp_do_8_2_transport_strike(struct sctp_association
>*asoc,
>+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
>+					 struct sctp_association *asoc,
> 					 struct sctp_transport *transport,
> 					 int is_hb)
> {
>@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct
>sctp_association *asoc,
> 			transport->error_count++;
> 	}
> 
>+	/* If the transport error count is greater than the pf_retrans
>+	 * threshold, and less than pathmaxrxt, then mark this transport
>+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
>+	 * point 1
>+	 */
>+	if ((transport->state != SCTP_PF) &&
>+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
>+	   (transport->error_count > asoc->pf_retrans)) {
>+
>+		sctp_assoc_control_transport(asoc, transport,
>+					     SCTP_TRANSPORT_PF,
>+					     0);
>+
>+		/* Update the hb timer to resend a heartbeat every rto */
>+		sctp_cmd_hb_timer_update(commands, transport);
>+	}
>+
> 	if (transport->state != SCTP_INACTIVE &&
> 	    (transport->error_count > transport->pathmaxrxt)) {
> 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
>@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t
>*cmds,
> 					     SCTP_HEARTBEAT_SUCCESS);
> 	}
> 
>+	if (t->state == SCTP_PF)
>+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
>+					     SCTP_HEARTBEAT_SUCCESS);
>+
> 	/* The receiver of the HEARTBEAT ACK should also perform an
> 	 * RTT measurement for that destination transport address
> 	 * using the time value carried in the HEARTBEAT ACK chunk.
>@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t
>event_type,
> 
> 		case SCTP_CMD_STRIKE:
> 			/* Mark one strike against a transport.  */
>-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
>-						    0);
>+			sctp_do_8_2_transport_strike(commands, asoc,
>+						    cmd->obj.transport, 0);
> 			break;
> 
> 		case SCTP_CMD_TRANSPORT_IDLE:
>@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t
>event_type,
> 
> 		case SCTP_CMD_TRANSPORT_HB_SENT:
> 			t = cmd->obj.transport;
>-			sctp_do_8_2_transport_strike(asoc, t, 1);
>+			sctp_do_8_2_transport_strike(commands, asoc,
>+						     t, 1);
> 			t->hb_sent = 1;
> 			break;
> 
>diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
>index e5fe639..2b2bfe9 100644
>--- a/net/sctp/sysctl.c
>+++ b/net/sctp/sysctl.c
>@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> 		.extra2		= &int_max
> 	},
> 	{
>+		.procname	= "pf_retrans",
>+		.data		= &sctp_pf_retrans,
>+		.maxlen		= sizeof(int),
>+		.mode		= 0644,
>+		.proc_handler	= proc_dointvec_minmax,
>+		.extra1		= &zero,
>+		.extra2		= &int_max
>+	},
>+	{
> 		.procname	= "max_init_retransmits",
> 		.data		= &sctp_max_retrans_init,
> 		.maxlen		= sizeof(int),
>diff --git a/net/sctp/transport.c b/net/sctp/transport.c
>index b026ba0..4639ba2 100644
>--- a/net/sctp/transport.c
>+++ b/net/sctp/transport.c
>@@ -585,7 +585,8 @@ unsigned long sctp_transport_timeout(struct
>sctp_transport *t)
> {
> 	unsigned long timeout;
> 	timeout = t->rto + sctp_jitter(t->rto);
>-	if (t->state != SCTP_UNCONFIRMED)
>+	if ((t->state != SCTP_UNCONFIRMED) &&
>+	    (t->state != SCTP_PF))
> 		timeout += t->hbinterval;
> 	timeout += jiffies;
> 	return timeout;
>-- 
>1.7.7.6

One thing that seems to be missing is the API.  As a result you don't carry the value per transport which we'll need.  That caused you to add assoc parameter to some functions.  That's really the only missing item.
-- 
Sent from my Android phone with SkitMail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] sctp: Implement quick failover draft from tsvwg
  2012-07-14 18:12   ` Vlad Yasevich
@ 2012-07-14 21:22     ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-14 21:22 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp

On Sat, Jul 14, 2012 at 02:12:36PM -0400, Vlad Yasevich wrote:
> Neil Horman <nhorman@tuxdriver.com> wrote:
> 
> >I've seen several attempts recently made to do quick failover of sctp
> >transports
> >by reducing various retransmit timers and counters.  While its possible
> >to
> >implement a faster failover on multihomed sctp associations, its not
> >particularly robust, in that it can lead to unneeded retransmits, as
> >well as
> >false connection failures due to intermittent latency on a network.
> >
> >Instead, lets implement the new ietf quick failover draft found here:
> >http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> >
> >This will let the sctp stack identify transports that have had a small
> >number of
> >errors, and avoid using them quickly until their reliability can be
> >re-established.  I've tested this out on two virt guests connected via
> >multiple
> >isolated virt networks and believe its in compliance with the above
> >draft and
> >works well.
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >CC: Vlad Yasevich <vyasevich@gmail.com>
> >CC: Sridhar Samudrala <sri@us.ibm.com>
> >CC: "David S. Miller" <davem@davemloft.net>
> >CC: linux-sctp@vger.kernel.org
> >---
> > Documentation/networking/ip-sysctl.txt |   14 +++++++++++++
> > include/net/sctp/constants.h           |    1 +
> > include/net/sctp/structs.h             |    4 +++
> > include/net/sctp/user.h                |    1 +
> >net/sctp/associola.c                   |   33
> >+++++++++++++++++++++++++------
> > net/sctp/outqueue.c                    |    6 +++-
> >net/sctp/sm_sideeffect.c               |   33
> >++++++++++++++++++++++++++++---
> > net/sctp/sysctl.c                      |    9 ++++++++
> > net/sctp/transport.c                   |    3 +-
> > 9 files changed, 90 insertions(+), 14 deletions(-)
> >
> >diff --git a/Documentation/networking/ip-sysctl.txt
> >b/Documentation/networking/ip-sysctl.txt
> >index 47b6c79..c636f9c 100644
> >--- a/Documentation/networking/ip-sysctl.txt
> >+++ b/Documentation/networking/ip-sysctl.txt
> >@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
> > 
> > 	Default: 5
> > 
> >+pf_retrans - INTEGER
> >+	The number of retransmissions that will be attempted on a given path
> >+	before traffic is redirected to an alternate transport (should one
> >+	exist).  Note this is distinct from path_max_retrans, as a path that
> >+	passes the pf_retrans threshold can still be used.  Its only
> >+	deprioritized when a transmission path is selected by the stack. 
> >This
> >+	setting is primarily used to enable fast failover mechanisms without
> >+	having to reduce path_max_retrans to a very low value.  See:
> >+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> >+	for details.  Note also that a value of pf_retrans > path_max_retrans
> >+	disables this feature
> >+
> >+	Default: 0
> >+
> > rto_initial - INTEGER
> >	The initial round trip timeout value in milliseconds that will be used
> > 	in calculating round trip times.  This is the initial time interval
> >diff --git a/include/net/sctp/constants.h
> >b/include/net/sctp/constants.h
> >index 942b864..d053d2e 100644
> >--- a/include/net/sctp/constants.h
> >+++ b/include/net/sctp/constants.h
> >@@ -334,6 +334,7 @@ typedef enum {
> > typedef enum {
> > 	SCTP_TRANSPORT_UP,
> > 	SCTP_TRANSPORT_DOWN,
> >+	SCTP_TRANSPORT_PF,
> > } sctp_transport_cmd_t;
> > 
> > /* These are the address scopes defined mainly for IPv4 addresses
> >diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> >index e4652fe..22825abe 100644
> >--- a/include/net/sctp/structs.h
> >+++ b/include/net/sctp/structs.h
> >@@ -160,6 +160,7 @@ extern struct sctp_globals {
> > 	int max_retrans_association;
> > 	int max_retrans_path;
> > 	int max_retrans_init;
> >+	int pf_retrans;
> > 
> > 	/*
> > 	 * Policy for preforming sctp/socket accounting
> >@@ -258,6 +259,7 @@ extern struct sctp_globals {
> > #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
> > #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
> > #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
> >+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
> > #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
> > #define sctp_sack_timeout		(sctp_globals.sack_timeout)
> > #define sctp_hb_interval		(sctp_globals.hb_interval)
> >@@ -1660,6 +1662,8 @@ struct sctp_association {
> > 	 */
> > 	int max_retrans;
> > 
> >+	int pf_retrans;
> >+
> > 	/* Maximum number of times the endpoint will retransmit INIT  */
> > 	__u16 max_init_attempts;
> > 
> >diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
> >index 0842ef0..cece1bf 100644
> >--- a/include/net/sctp/user.h
> >+++ b/include/net/sctp/user.h
> >@@ -649,6 +649,7 @@ struct sctp_paddrinfo {
> >  */
> > enum sctp_spinfo_state {
> > 	SCTP_INACTIVE,
> >+	SCTP_PF,
> > 	SCTP_ACTIVE,
> > 	SCTP_UNCONFIRMED,
> > 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
> >diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> >index 5bc9ab1..f3ebc23 100644
> >--- a/net/sctp/associola.c
> >+++ b/net/sctp/associola.c
> >@@ -124,6 +124,8 @@ static struct sctp_association
> >*sctp_association_init(struct sctp_association *a
> > 	 * socket values.
> > 	 */
> > 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
> >+	asoc->pf_retrans  = sctp_pf_retrans;
> >+
> > 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
> > 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
> > 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
> >@@ -840,6 +842,7 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 	struct sctp_ulpevent *event;
> > 	struct sockaddr_storage addr;
> > 	int spc_state = 0;
> >+	bool ulp_notify = true;
> > 
> > 	/* Record the transition on the transport.  */
> > 	switch (command) {
> >@@ -853,6 +856,14 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 			spc_state = SCTP_ADDR_CONFIRMED;
> > 		else
> > 			spc_state = SCTP_ADDR_AVAILABLE;
> >+		/* Don't inform ULP about transition from PF to
> >+		 * active state and set cwnd to 1, see SCTP
> >+		 * Quick failover draft section 5.1, point 5
> >+		 */
> >+		if (transport->state == SCTP_PF) {
> >+			ulp_notify = false;
> >+			transport->cwnd = 1;
> >+		}
> > 		transport->state = SCTP_ACTIVE;
> > 		break;
> > 
> >@@ -871,6 +882,10 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 		spc_state = SCTP_ADDR_UNREACHABLE;
> > 		break;
> > 
> >+	case SCTP_TRANSPORT_PF:
> >+		transport->state = SCTP_PF;
> >+		ulp_notify = false;
> >+		break;
> > 	default:
> > 		return;
> > 	}
> >@@ -878,12 +893,15 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
> > 	 * user.
> > 	 */
> >-	memset(&addr, 0, sizeof(struct sockaddr_storage));
> >-	memcpy(&addr, &transport->ipaddr,
> >transport->af_specific->sockaddr_len);
> >-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> >-				0, spc_state, error, GFP_ATOMIC);
> >-	if (event)
> >-		sctp_ulpq_tail_event(&asoc->ulpq, event);
> >+	if (ulp_notify) {
> >+		memset(&addr, 0, sizeof(struct sockaddr_storage));
> >+		memcpy(&addr, &transport->ipaddr,
> >+		       transport->af_specific->sockaddr_len);
> >+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> >+					0, spc_state, error, GFP_ATOMIC);
> >+		if (event)
> >+			sctp_ulpq_tail_event(&asoc->ulpq, event);
> >+	}
> > 
> > 	/* Select new active and retran paths. */
> > 
> >@@ -899,7 +917,8 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 			transports) {
> > 
> > 		if ((t->state == SCTP_INACTIVE) ||
> >-		    (t->state == SCTP_UNCONFIRMED))
> >+		    (t->state == SCTP_UNCONFIRMED) ||
> >+		    (t->state == SCTP_PF))
> > 			continue;
> > 		if (!first || t->last_time_heard > first->last_time_heard) {
> > 			second = first;
> >diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
> >index a0fa19f..e7aa177c 100644
> >--- a/net/sctp/outqueue.c
> >+++ b/net/sctp/outqueue.c
> >@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
> >rtx_timeout)
> > 			if (!new_transport)
> > 				new_transport = asoc->peer.active_path;
> > 		} else if ((new_transport->state == SCTP_INACTIVE) ||
> >-			   (new_transport->state == SCTP_UNCONFIRMED)) {
> >+			   (new_transport->state == SCTP_UNCONFIRMED) ||
> >+			   (new_transport->state == SCTP_PF)) {
> > 			/* If the chunk is Heartbeat or Heartbeat Ack,
> > 			 * send it to chunk->transport, even if it's
> > 			 * inactive.
> >@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
> >rtx_timeout)
> > 			new_transport = chunk->transport;
> > 			if (!new_transport ||
> > 			    ((new_transport->state == SCTP_INACTIVE) ||
> >-			     (new_transport->state == SCTP_UNCONFIRMED)))
> >+			     (new_transport->state == SCTP_UNCONFIRMED) ||
> >+			     (new_transport->state == SCTP_PF)))
> > 				new_transport = asoc->peer.active_path;
> > 			if (new_transport->state == SCTP_UNCONFIRMED)
> > 				continue;
> >diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> >index c96d1a8..285e26a 100644
> >--- a/net/sctp/sm_sideeffect.c
> >+++ b/net/sctp/sm_sideeffect.c
> >@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type,
> >sctp_subtype_t subtype,
> > 			     sctp_cmd_seq_t *commands,
> > 			     gfp_t gfp);
> > 
> >+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
> >+				     struct sctp_transport *t);
> > /********************************************************************
> >  * Helper functions
> >  ********************************************************************/
> >@@ -470,7 +472,8 @@ sctp_timer_event_t
> >*sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
> >  * notification SHOULD be sent to the upper layer.
> >  *
> >  */
> >-static void sctp_do_8_2_transport_strike(struct sctp_association
> >*asoc,
> >+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
> >+					 struct sctp_association *asoc,
> > 					 struct sctp_transport *transport,
> > 					 int is_hb)
> > {
> >@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct
> >sctp_association *asoc,
> > 			transport->error_count++;
> > 	}
> > 
> >+	/* If the transport error count is greater than the pf_retrans
> >+	 * threshold, and less than pathmaxrxt, then mark this transport
> >+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
> >+	 * point 1
> >+	 */
> >+	if ((transport->state != SCTP_PF) &&
> >+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
> >+	   (transport->error_count > asoc->pf_retrans)) {
> >+
> >+		sctp_assoc_control_transport(asoc, transport,
> >+					     SCTP_TRANSPORT_PF,
> >+					     0);
> >+
> >+		/* Update the hb timer to resend a heartbeat every rto */
> >+		sctp_cmd_hb_timer_update(commands, transport);
> >+	}
> >+
> > 	if (transport->state != SCTP_INACTIVE &&
> > 	    (transport->error_count > transport->pathmaxrxt)) {
> > 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
> >@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t
> >*cmds,
> > 					     SCTP_HEARTBEAT_SUCCESS);
> > 	}
> > 
> >+	if (t->state == SCTP_PF)
> >+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
> >+					     SCTP_HEARTBEAT_SUCCESS);
> >+
> > 	/* The receiver of the HEARTBEAT ACK should also perform an
> > 	 * RTT measurement for that destination transport address
> > 	 * using the time value carried in the HEARTBEAT ACK chunk.
> >@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t
> >event_type,
> > 
> > 		case SCTP_CMD_STRIKE:
> > 			/* Mark one strike against a transport.  */
> >-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
> >-						    0);
> >+			sctp_do_8_2_transport_strike(commands, asoc,
> >+						    cmd->obj.transport, 0);
> > 			break;
> > 
> > 		case SCTP_CMD_TRANSPORT_IDLE:
> >@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t
> >event_type,
> > 
> > 		case SCTP_CMD_TRANSPORT_HB_SENT:
> > 			t = cmd->obj.transport;
> >-			sctp_do_8_2_transport_strike(asoc, t, 1);
> >+			sctp_do_8_2_transport_strike(commands, asoc,
> >+						     t, 1);
> > 			t->hb_sent = 1;
> > 			break;
> > 
> >diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> >index e5fe639..2b2bfe9 100644
> >--- a/net/sctp/sysctl.c
> >+++ b/net/sctp/sysctl.c
> >@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> > 		.extra2		= &int_max
> > 	},
> > 	{
> >+		.procname	= "pf_retrans",
> >+		.data		= &sctp_pf_retrans,
> >+		.maxlen		= sizeof(int),
> >+		.mode		= 0644,
> >+		.proc_handler	= proc_dointvec_minmax,
> >+		.extra1		= &zero,
> >+		.extra2		= &int_max
> >+	},
> >+	{
> > 		.procname	= "max_init_retransmits",
> > 		.data		= &sctp_max_retrans_init,
> > 		.maxlen		= sizeof(int),
> >diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> >index b026ba0..4639ba2 100644
> >--- a/net/sctp/transport.c
> >+++ b/net/sctp/transport.c
> >@@ -585,7 +585,8 @@ unsigned long sctp_transport_timeout(struct
> >sctp_transport *t)
> > {
> > 	unsigned long timeout;
> > 	timeout = t->rto + sctp_jitter(t->rto);
> >-	if (t->state != SCTP_UNCONFIRMED)
> >+	if ((t->state != SCTP_UNCONFIRMED) &&
> >+	    (t->state != SCTP_PF))
> > 		timeout += t->hbinterval;
> > 	timeout += jiffies;
> > 	return timeout;
> >-- 
> >1.7.7.6
> 
> One thing that seems to be missing is the API.  As a result you don't carry the value per transport which we'll need.  That caused you to add assoc parameter to some functions.  That's really the only missing item.

I definitely agree that we need a way to set a per-association pf threshold from
userspace (ostensibly via a socket option), but I'm not quite sure we're on the
same page about the semantics.  You say above that I don't carry the value
per transport (I presume you mean the pf threshold).  According to the draft the
threshold is maintained per association, see point one of section 5.1:
1.  The sender maintains a new tunable parameter called Potentially-
       failed.Max.Retrans (PFMR).  The recommended value of PFMR = 0
       when quick failover is used.  When an association's PFMR >= PMR,
       quick failover is turned off.

So yes, we should have a way to change it programmatically from the default via
an option, but I think the threshold is stored in the correct place (the
association struct).

Or am I misunderstanding what you're saying?

Regards
Neil

> -- 
> Sent from my Android phone with SkitMail. Please excuse my brevity.
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH] sctp: Implement quick failover draft from tsvwg
@ 2012-07-14 21:22     ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-14 21:22 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp

On Sat, Jul 14, 2012 at 02:12:36PM -0400, Vlad Yasevich wrote:
> Neil Horman <nhorman@tuxdriver.com> wrote:
> 
> >I've seen several attempts recently made to do quick failover of sctp
> >transports
> >by reducing various retransmit timers and counters.  While its possible
> >to
> >implement a faster failover on multihomed sctp associations, its not
> >particularly robust, in that it can lead to unneeded retransmits, as
> >well as
> >false connection failures due to intermittent latency on a network.
> >
> >Instead, lets implement the new ietf quick failover draft found here:
> >http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> >
> >This will let the sctp stack identify transports that have had a small
> >number of
> >errors, and avoid using them quickly until their reliability can be
> >re-established.  I've tested this out on two virt guests connected via
> >multiple
> >isolated virt networks and believe its in compliance with the above
> >draft and
> >works well.
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >CC: Vlad Yasevich <vyasevich@gmail.com>
> >CC: Sridhar Samudrala <sri@us.ibm.com>
> >CC: "David S. Miller" <davem@davemloft.net>
> >CC: linux-sctp@vger.kernel.org
> >---
> > Documentation/networking/ip-sysctl.txt |   14 +++++++++++++
> > include/net/sctp/constants.h           |    1 +
> > include/net/sctp/structs.h             |    4 +++
> > include/net/sctp/user.h                |    1 +
> >net/sctp/associola.c                   |   33
> >+++++++++++++++++++++++++------
> > net/sctp/outqueue.c                    |    6 +++-
> >net/sctp/sm_sideeffect.c               |   33
> >++++++++++++++++++++++++++++---
> > net/sctp/sysctl.c                      |    9 ++++++++
> > net/sctp/transport.c                   |    3 +-
> > 9 files changed, 90 insertions(+), 14 deletions(-)
> >
> >diff --git a/Documentation/networking/ip-sysctl.txt
> >b/Documentation/networking/ip-sysctl.txt
> >index 47b6c79..c636f9c 100644
> >--- a/Documentation/networking/ip-sysctl.txt
> >+++ b/Documentation/networking/ip-sysctl.txt
> >@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
> > 
> > 	Default: 5
> > 
> >+pf_retrans - INTEGER
> >+	The number of retransmissions that will be attempted on a given path
> >+	before traffic is redirected to an alternate transport (should one
> >+	exist).  Note this is distinct from path_max_retrans, as a path that
> >+	passes the pf_retrans threshold can still be used.  Its only
> >+	deprioritized when a transmission path is selected by the stack. 
> >This
> >+	setting is primarily used to enable fast failover mechanisms without
> >+	having to reduce path_max_retrans to a very low value.  See:
> >+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> >+	for details.  Note also that a value of pf_retrans > path_max_retrans
> >+	disables this feature
> >+
> >+	Default: 0
> >+
> > rto_initial - INTEGER
> >	The initial round trip timeout value in milliseconds that will be used
> > 	in calculating round trip times.  This is the initial time interval
> >diff --git a/include/net/sctp/constants.h
> >b/include/net/sctp/constants.h
> >index 942b864..d053d2e 100644
> >--- a/include/net/sctp/constants.h
> >+++ b/include/net/sctp/constants.h
> >@@ -334,6 +334,7 @@ typedef enum {
> > typedef enum {
> > 	SCTP_TRANSPORT_UP,
> > 	SCTP_TRANSPORT_DOWN,
> >+	SCTP_TRANSPORT_PF,
> > } sctp_transport_cmd_t;
> > 
> > /* These are the address scopes defined mainly for IPv4 addresses
> >diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> >index e4652fe..22825abe 100644
> >--- a/include/net/sctp/structs.h
> >+++ b/include/net/sctp/structs.h
> >@@ -160,6 +160,7 @@ extern struct sctp_globals {
> > 	int max_retrans_association;
> > 	int max_retrans_path;
> > 	int max_retrans_init;
> >+	int pf_retrans;
> > 
> > 	/*
> > 	 * Policy for preforming sctp/socket accounting
> >@@ -258,6 +259,7 @@ extern struct sctp_globals {
> > #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
> > #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
> > #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
> >+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
> > #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
> > #define sctp_sack_timeout		(sctp_globals.sack_timeout)
> > #define sctp_hb_interval		(sctp_globals.hb_interval)
> >@@ -1660,6 +1662,8 @@ struct sctp_association {
> > 	 */
> > 	int max_retrans;
> > 
> >+	int pf_retrans;
> >+
> > 	/* Maximum number of times the endpoint will retransmit INIT  */
> > 	__u16 max_init_attempts;
> > 
> >diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
> >index 0842ef0..cece1bf 100644
> >--- a/include/net/sctp/user.h
> >+++ b/include/net/sctp/user.h
> >@@ -649,6 +649,7 @@ struct sctp_paddrinfo {
> >  */
> > enum sctp_spinfo_state {
> > 	SCTP_INACTIVE,
> >+	SCTP_PF,
> > 	SCTP_ACTIVE,
> > 	SCTP_UNCONFIRMED,
> > 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
> >diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> >index 5bc9ab1..f3ebc23 100644
> >--- a/net/sctp/associola.c
> >+++ b/net/sctp/associola.c
> >@@ -124,6 +124,8 @@ static struct sctp_association
> >*sctp_association_init(struct sctp_association *a
> > 	 * socket values.
> > 	 */
> > 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
> >+	asoc->pf_retrans  = sctp_pf_retrans;
> >+
> > 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
> > 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
> > 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
> >@@ -840,6 +842,7 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 	struct sctp_ulpevent *event;
> > 	struct sockaddr_storage addr;
> > 	int spc_state = 0;
> >+	bool ulp_notify = true;
> > 
> > 	/* Record the transition on the transport.  */
> > 	switch (command) {
> >@@ -853,6 +856,14 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 			spc_state = SCTP_ADDR_CONFIRMED;
> > 		else
> > 			spc_state = SCTP_ADDR_AVAILABLE;
> >+		/* Don't inform ULP about transition from PF to
> >+		 * active state and set cwnd to 1, see SCTP
> >+		 * Quick failover draft section 5.1, point 5
> >+		 */
> >+		if (transport->state = SCTP_PF) {
> >+			ulp_notify = false;
> >+			transport->cwnd = 1;
> >+		}
> > 		transport->state = SCTP_ACTIVE;
> > 		break;
> > 
> >@@ -871,6 +882,10 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 		spc_state = SCTP_ADDR_UNREACHABLE;
> > 		break;
> > 
> >+	case SCTP_TRANSPORT_PF:
> >+		transport->state = SCTP_PF;
> >+		ulp_notify = false;
> >+		break;
> > 	default:
> > 		return;
> > 	}
> >@@ -878,12 +893,15 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
> > 	 * user.
> > 	 */
> >-	memset(&addr, 0, sizeof(struct sockaddr_storage));
> >-	memcpy(&addr, &transport->ipaddr,
> >transport->af_specific->sockaddr_len);
> >-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> >-				0, spc_state, error, GFP_ATOMIC);
> >-	if (event)
> >-		sctp_ulpq_tail_event(&asoc->ulpq, event);
> >+	if (ulp_notify) {
> >+		memset(&addr, 0, sizeof(struct sockaddr_storage));
> >+		memcpy(&addr, &transport->ipaddr,
> >+		       transport->af_specific->sockaddr_len);
> >+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> >+					0, spc_state, error, GFP_ATOMIC);
> >+		if (event)
> >+			sctp_ulpq_tail_event(&asoc->ulpq, event);
> >+	}
> > 
> > 	/* Select new active and retran paths. */
> > 
> >@@ -899,7 +917,8 @@ void sctp_assoc_control_transport(struct
> >sctp_association *asoc,
> > 			transports) {
> > 
> > 		if ((t->state = SCTP_INACTIVE) ||
> >-		    (t->state = SCTP_UNCONFIRMED))
> >+		    (t->state = SCTP_UNCONFIRMED) ||
> >+		    (t->state = SCTP_PF))
> > 			continue;
> > 		if (!first || t->last_time_heard > first->last_time_heard) {
> > 			second = first;
> >diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
> >index a0fa19f..e7aa177c 100644
> >--- a/net/sctp/outqueue.c
> >+++ b/net/sctp/outqueue.c
> >@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
> >rtx_timeout)
> > 			if (!new_transport)
> > 				new_transport = asoc->peer.active_path;
> > 		} else if ((new_transport->state = SCTP_INACTIVE) ||
> >-			   (new_transport->state = SCTP_UNCONFIRMED)) {
> >+			   (new_transport->state = SCTP_UNCONFIRMED) ||
> >+			   (new_transport->state = SCTP_PF)) {
> > 			/* If the chunk is Heartbeat or Heartbeat Ack,
> > 			 * send it to chunk->transport, even if it's
> > 			 * inactive.
> >@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
> >rtx_timeout)
> > 			new_transport = chunk->transport;
> > 			if (!new_transport ||
> > 			    ((new_transport->state = SCTP_INACTIVE) ||
> >-			     (new_transport->state = SCTP_UNCONFIRMED)))
> >+			     (new_transport->state = SCTP_UNCONFIRMED) ||
> >+			     (new_transport->state = SCTP_PF)))
> > 				new_transport = asoc->peer.active_path;
> > 			if (new_transport->state = SCTP_UNCONFIRMED)
> > 				continue;
> >diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> >index c96d1a8..285e26a 100644
> >--- a/net/sctp/sm_sideeffect.c
> >+++ b/net/sctp/sm_sideeffect.c
> >@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type,
> >sctp_subtype_t subtype,
> > 			     sctp_cmd_seq_t *commands,
> > 			     gfp_t gfp);
> > 
> >+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
> >+				     struct sctp_transport *t);
> > /********************************************************************
> >  * Helper functions
> >  ********************************************************************/
> >@@ -470,7 +472,8 @@ sctp_timer_event_t
> >*sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
> >  * notification SHOULD be sent to the upper layer.
> >  *
> >  */
> >-static void sctp_do_8_2_transport_strike(struct sctp_association
> >*asoc,
> >+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
> >+					 struct sctp_association *asoc,
> > 					 struct sctp_transport *transport,
> > 					 int is_hb)
> > {
> >@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct
> >sctp_association *asoc,
> > 			transport->error_count++;
> > 	}
> > 
> >+	/* If the transport error count is greater than the pf_retrans
> >+	 * threshold, and less than pathmaxrtx, then mark this transport
> >+	 * as Partially Failed, ee SCTP Quick Failover Draft, secon 5.1,
> >+	 * point 1
> >+	 */
> >+	if ((transport->state != SCTP_PF) &&
> >+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
> >+	   (transport->error_count > asoc->pf_retrans)) {
> >+
> >+		sctp_assoc_control_transport(asoc, transport,
> >+					     SCTP_TRANSPORT_PF,
> >+					     0);
> >+
> >+		/* Update the hb timer to resend a heartbeat every rto */
> >+		sctp_cmd_hb_timer_update(commands, transport);
> >+	}
> >+
> > 	if (transport->state != SCTP_INACTIVE &&
> > 	    (transport->error_count > transport->pathmaxrxt)) {
> > 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
> >@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t
> >*cmds,
> > 					     SCTP_HEARTBEAT_SUCCESS);
> > 	}
> > 
> >+	if (t->state = SCTP_PF)
> >+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
> >+					     SCTP_HEARTBEAT_SUCCESS);
> >+
> > 	/* The receiver of the HEARTBEAT ACK should also perform an
> > 	 * RTT measurement for that destination transport address
> > 	 * using the time value carried in the HEARTBEAT ACK chunk.
> >@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t
> >event_type,
> > 
> > 		case SCTP_CMD_STRIKE:
> > 			/* Mark one strike against a transport.  */
> >-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
> >-						    0);
> >+			sctp_do_8_2_transport_strike(commands, asoc,
> >+						    cmd->obj.transport, 0);
> > 			break;
> > 
> > 		case SCTP_CMD_TRANSPORT_IDLE:
> >@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t
> >event_type,
> > 
> > 		case SCTP_CMD_TRANSPORT_HB_SENT:
> > 			t = cmd->obj.transport;
> >-			sctp_do_8_2_transport_strike(asoc, t, 1);
> >+			sctp_do_8_2_transport_strike(commands, asoc,
> >+						     t, 1);
> > 			t->hb_sent = 1;
> > 			break;
> > 
> >diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> >index e5fe639..2b2bfe9 100644
> >--- a/net/sctp/sysctl.c
> >+++ b/net/sctp/sysctl.c
> >@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> > 		.extra2		= &int_max
> > 	},
> > 	{
> >+		.procname	= "pf_retrans",
> >+		.data		= &sctp_pf_retrans,
> >+		.maxlen		= sizeof(int),
> >+		.mode		= 0644,
> >+		.proc_handler	= proc_dointvec_minmax,
> >+		.extra1		= &zero,
> >+		.extra2		= &int_max
> >+	},
> >+	{
> > 		.procname	= "max_init_retransmits",
> > 		.data		= &sctp_max_retrans_init,
> > 		.maxlen		= sizeof(int),
> >diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> >index b026ba0..4639ba2 100644
> >--- a/net/sctp/transport.c
> >+++ b/net/sctp/transport.c
> >@@ -585,7 +585,8 @@ unsigned long sctp_transport_timeout(struct
> >sctp_transport *t)
> > {
> > 	unsigned long timeout;
> > 	timeout = t->rto + sctp_jitter(t->rto);
> >-	if (t->state != SCTP_UNCONFIRMED)
> >+	if ((t->state != SCTP_UNCONFIRMED) &&
> >+	    (t->state != SCTP_PF))
> > 		timeout += t->hbinterval;
> > 	timeout += jiffies;
> > 	return timeout;
> >-- 
> >1.7.7.6
> 
> One thing that seems to be missing is the API.  As a result you don't carry the value per transport which we'll need.  That caused you to add assoc parameter to some functions.  That's really the only missing item.

I definitely agree that we need a way to set a per-association pf threshold from
userspace (ostensibly via a socket option), but I'm not quite sure we're on the
same page about the semantics.  You say above that I don't carry the value
per transport (I presume you mean the pf threshold).  According to the draft, the
threshold is maintained per association; see point one of section 5.1:
1.  The sender maintains a new tunable parameter called Potentially-
       failed.Max.Retrans (PFMR).  The recommended value of PFMR = 0
       when quick failover is used.  When an association's PFMR >= PMR,
       quick failover is turned off.

So yes, we should have a way to change it programmatically from the default via
an option, but I think the threshold is stored in the correct place (the
association struct).

Or am I misunderstanding what you're saying?

Regards
Neil

> -- 
> Sent from my Android phone with SkitMail. Please excuse my brevity.
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v2] sctp: Implement quick failover draft from tsvwg
  2012-07-13 18:26 ` Neil Horman
@ 2012-07-18 18:01   ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-18 18:01 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations this way, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new ietf quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   11 +++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   36 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 ++++++++++-
 net/sctp/socket.c                      |   96 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 206 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans >= path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..f70726c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -160,6 +160,7 @@ extern struct sctp_globals {
 	int max_retrans_association;
 	int max_retrans_path;
 	int max_retrans_init;
+	int pf_retrans;
 
 	/*
 	 * Policy for preforming sctp/socket accounting
@@ -258,6 +259,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +989,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1667,8 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..b357195 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
 	default:
 		return;
 	}
@@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +920,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..dfffece 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,52 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		asoc->pf_retrans = val.spt_pathpfthld;
+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3665,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5539,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+			val.spt_assoc_id);
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5721,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH v2] sctp: Implement quick failover draft from tsvwg
@ 2012-07-18 18:01   ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-18 18:01 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and avoid using them quickly until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   11 +++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   36 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 ++++++++++-
 net/sctp/socket.c                      |   96 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 206 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans > path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..f70726c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -160,6 +160,7 @@ extern struct sctp_globals {
 	int max_retrans_association;
 	int max_retrans_path;
 	int max_retrans_init;
+	int pf_retrans;
 
 	/*
 	 * Policy for preforming sctp/socket accounting
@@ -258,6 +259,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +989,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1667,8 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..b357195 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
 	default:
 		return;
 	}
@@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +920,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..dfffece 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,52 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		asoc->pf_retrans = val.spt_pathpfthld;
+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3665,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5539,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+			val.spt_assoc_id);
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5721,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
  2012-07-18 18:01   ` Neil Horman
@ 2012-07-18 20:30     ` Joe Perches
  -1 siblings, 0 replies; 48+ messages in thread
From: Joe Perches @ 2012-07-18 20:30 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller, linux-sctp

On Wed, 2012-07-18 at 14:01 -0400, Neil Horman wrote:
> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.

trivia:

> diff --git a/net/sctp/associola.c b/net/sctp/associola.c

> @@ -871,6 +885,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  		spc_state = SCTP_ADDR_UNREACHABLE;
>  		break;
>  
> +	case SCTP_TRANSPORT_PF:
> +		transport->state = SCTP_PF;
> +		ulp_notify = false;
> +		break;

nicer to add a newline here

>  	default:
>  		return;
>  	}
> @@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
[]
> +	if (ulp_notify) {
> +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> +		memcpy(&addr, &transport->ipaddr,
> +		       transport->af_specific->sockaddr_len);

Perhaps it's better to do the memcpy then the memset of the
space left instead.

		memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
		memset((char *)&addr + transport->af_specific->sockaddr_len, 0,
		       sizeof(struct sockaddr_storage) - transport->af_specific->sockaddr_len);
		       

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
@ 2012-07-18 20:30     ` Joe Perches
  0 siblings, 0 replies; 48+ messages in thread
From: Joe Perches @ 2012-07-18 20:30 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller, linux-sctp

On Wed, 2012-07-18 at 14:01 -0400, Neil Horman wrote:
> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.

trivia:

> diff --git a/net/sctp/associola.c b/net/sctp/associola.c

> @@ -871,6 +885,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  		spc_state = SCTP_ADDR_UNREACHABLE;
>  		break;
>  
> +	case SCTP_TRANSPORT_PF:
> +		transport->state = SCTP_PF;
> +		ulp_notify = false;
> +		break;

nicer to add a newline here

>  	default:
>  		return;
>  	}
> @@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
[]
> +	if (ulp_notify) {
> +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> +		memcpy(&addr, &transport->ipaddr,
> +		       transport->af_specific->sockaddr_len);

Perhaps it's better to do the memcpy then the memset of the
space left instead.

		memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
		memset((char *)&addr + transport->af_specific->sockaddr_len, 0,
		       sizeof(struct sockaddr_storage) - transport->af_specific->sockaddr_len);
		       



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
  2012-07-18 18:01   ` Neil Horman
@ 2012-07-18 21:23     ` Vlad Yasevich
  -1 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-18 21:23 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp

On 07/18/2012 02:01 PM, Neil Horman wrote:
> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.
>
> Instead, lets implement the new ietf quick failover draft found here:
> http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
> This will let the sctp stack identify transports that have had a small number of
> errors, and avoid using them quickly until their reliability can be
> re-established.  I've tested this out on two virt guests connected via multiple
> isolated virt networks and believe its in compliance with the above draft and
> works well.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Vlad Yasevich <vyasevich@gmail.com>
> CC: Sridhar Samudrala <sri@us.ibm.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: linux-sctp@vger.kernel.org
>
> ---
> Change notes:
>
> V2)
> - Added socket option API from section 6.1 of the specification, as per
> request from Vlad. Adding this socket option allows us to alter both the path
> maximum retransmit value and the path partial failure threshold for each
> transport and the association as a whole.
>
> - Added a per transport pf_retrans value, and initialized it from the
> association value.  This makes each transport independently configurable as per
> the socket option above, and prevents changes in the sysctl from bleeding into
> an already created association.
> ---
>   Documentation/networking/ip-sysctl.txt |   14 +++++
>   include/net/sctp/constants.h           |    1 +
>   include/net/sctp/structs.h             |   11 +++-
>   include/net/sctp/user.h                |   11 ++++
>   net/sctp/associola.c                   |   36 ++++++++++--
>   net/sctp/outqueue.c                    |    6 +-
>   net/sctp/sm_sideeffect.c               |   33 ++++++++++-
>   net/sctp/socket.c                      |   96 ++++++++++++++++++++++++++++++++
>   net/sctp/sysctl.c                      |    9 +++
>   net/sctp/transport.c                   |    4 +-
>   10 files changed, 206 insertions(+), 15 deletions(-)
>

[ snip ]

> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index b3b8a8d..dfffece 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3470,6 +3470,52 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
>   }
>
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to alter the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    unsigned int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> +			   optlen))
> +		return -EFAULT;

What if optlen is bigger?  You're going to trash the stack.

> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> +				    transports) {
> +			trans->pathmaxrxt = val.spt_pathmaxrxt;
> +			trans->pf_retrans = val.spt_pathpfthld;

You want to make sure that the values aren't 0.  Otherwise, you'll set 
the pathmaxrxt to 0 and that would be bad.

> +		}
> +
> +		asoc->pf_retrans = val.spt_pathpfthld;
> +		asoc->pathmaxrxt = val.spt_pathmaxrxt;

Ditto.

> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		trans->pathmaxrxt = val.spt_pathmaxrxt;
> +		trans->pf_retrans = val.spt_pathpfthld;

Ditto.

> +	}
> +
> +	return 0;
> +}
> +
>   /* API 6.2 setsockopt(), getsockopt()
>    *
>    * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -3619,6 +3665,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
>   	case SCTP_AUTO_ASCONF:
>   		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
>   		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> +		break;
>   	default:
>   		retval = -ENOPROTOOPT;
>   		break;
> @@ -5490,6 +5539,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
>   	return 0;
>   }
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to fetch the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> +		return -EFAULT;

Again, trashing the stack if optlen and optval are bigger.

-vlad
> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +			val.spt_assoc_id);
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +
> +		val.spt_pathpfthld = asoc->pf_retrans;
> +		val.spt_pathmaxrxt = asoc->pathmaxrxt;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		val.spt_pathmaxrxt = trans->pathmaxrxt;
> +		val.spt_pathpfthld = trans->pf_retrans;
> +	}
> +
> +	if (copy_to_user(optval, &val, optlen))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
>   SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>   				char __user *optval, int __user *optlen)
>   {
> @@ -5628,6 +5721,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>   	case SCTP_AUTO_ASCONF:
>   		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
>   		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
> +		break;
>   	default:
>   		retval = -ENOPROTOOPT;
>   		break;
> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> index e5fe639..2b2bfe9 100644
> --- a/net/sctp/sysctl.c
> +++ b/net/sctp/sysctl.c
> @@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
>   		.extra2		= &int_max
>   	},
>   	{
> +		.procname	= "pf_retrans",
> +		.data		= &sctp_pf_retrans,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &int_max
> +	},
> +	{
>   		.procname	= "max_init_retransmits",
>   		.data		= &sctp_max_retrans_init,
>   		.maxlen		= sizeof(int),
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index b026ba0..194d0f3 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
>
>   	/* Initialize the default path max_retrans.  */
>   	peer->pathmaxrxt  = sctp_max_retrans_path;
> +	peer->pf_retrans  = sctp_pf_retrans;
>
>   	INIT_LIST_HEAD(&peer->transmitted);
>   	INIT_LIST_HEAD(&peer->send_ready);
> @@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
>   {
>   	unsigned long timeout;
>   	timeout = t->rto + sctp_jitter(t->rto);
> -	if (t->state != SCTP_UNCONFIRMED)
> +	if ((t->state != SCTP_UNCONFIRMED) &&
> +	    (t->state != SCTP_PF))
>   		timeout += t->hbinterval;
>   	timeout += jiffies;
>   	return timeout;
>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
@ 2012-07-18 21:23     ` Vlad Yasevich
  0 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-18 21:23 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp

On 07/18/2012 02:01 PM, Neil Horman wrote:
> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.
>
> Instead, lets implement the new ietf quick failover draft found here:
> http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
> This will let the sctp stack identify transports that have had a small number of
> errors, and avoid using them quickly until their reliability can be
> re-established.  I've tested this out on two virt guests connected via multiple
> isolated virt networks and believe its in compliance with the above draft and
> works well.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Vlad Yasevich <vyasevich@gmail.com>
> CC: Sridhar Samudrala <sri@us.ibm.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: linux-sctp@vger.kernel.org
>
> ---
> Change notes:
>
> V2)
> - Added socket option API from section 6.1 of the specification, as per
> request from Vlad. Adding this socket option allows us to alter both the path
> maximum retransmit value and the path partial failure threshold for each
> transport and the association as a whole.
>
> - Added a per transport pf_retrans value, and initialized it from the
> association value.  This makes each transport independently configurable as per
> the socket option above, and prevents changes in the sysctl from bleeding into
> an already created association.
> ---
>   Documentation/networking/ip-sysctl.txt |   14 +++++
>   include/net/sctp/constants.h           |    1 +
>   include/net/sctp/structs.h             |   11 +++-
>   include/net/sctp/user.h                |   11 ++++
>   net/sctp/associola.c                   |   36 ++++++++++--
>   net/sctp/outqueue.c                    |    6 +-
>   net/sctp/sm_sideeffect.c               |   33 ++++++++++-
>   net/sctp/socket.c                      |   96 ++++++++++++++++++++++++++++++++
>   net/sctp/sysctl.c                      |    9 +++
>   net/sctp/transport.c                   |    4 +-
>   10 files changed, 206 insertions(+), 15 deletions(-)
>

[ snip ]

> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index b3b8a8d..dfffece 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3470,6 +3470,52 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
>   }
>
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to alter the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    unsigned int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> +			   optlen))
> +		return -EFAULT;

What if optlen is bigger?  You going to trash the stack.

> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> +				    transports) {
> +			trans->pathmaxrxt = val.spt_pathmaxrxt;
> +			trans->pf_retrans = val.spt_pathpfthld;

You want to make sure that the values aren't 0.  Otherwise, you'll set 
the pathmaxrxt to 0 and that would be bad.

> +		}
> +
> +		asoc->pf_retrans = val.spt_pathpfthld;
> +		asoc->pathmaxrxt = val.spt_pathmaxrxt;

Ditto.

> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		trans->pathmaxrxt = val.spt_pathmaxrxt;
> +		trans->pf_retrans = val.spt_pathpfthld;

Ditto.

> +	}
> +
> +	return 0;
> +}
> +
>   /* API 6.2 setsockopt(), getsockopt()
>    *
>    * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -3619,6 +3665,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
>   	case SCTP_AUTO_ASCONF:
>   		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
>   		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> +		break;
>   	default:
>   		retval = -ENOPROTOOPT;
>   		break;
> @@ -5490,6 +5539,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
>   	return 0;
>   }
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to fetch the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> +		return -EFAULT;

Again, trashing the stack if optlen and optval are bigger.

-vlad
> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +			val.spt_assoc_id);
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +
> +		val.spt_pathpfthld = asoc->pf_retrans;
> +		val.spt_pathmaxrxt = asoc->pathmaxrxt;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		val.spt_pathmaxrxt = trans->pathmaxrxt;
> +		val.spt_pathpfthld = trans->pf_retrans;
> +	}
> +
> +	if (copy_to_user(optval, &val, optlen))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
>   SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>   				char __user *optval, int __user *optlen)
>   {
> @@ -5628,6 +5721,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>   	case SCTP_AUTO_ASCONF:
>   		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
>   		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
> +		break;
>   	default:
>   		retval = -ENOPROTOOPT;
>   		break;
> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> index e5fe639..2b2bfe9 100644
> --- a/net/sctp/sysctl.c
> +++ b/net/sctp/sysctl.c
> @@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
>   		.extra2		= &int_max
>   	},
>   	{
> +		.procname	= "pf_retrans",
> +		.data		= &sctp_pf_retrans,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &int_max
> +	},
> +	{
>   		.procname	= "max_init_retransmits",
>   		.data		= &sctp_max_retrans_init,
>   		.maxlen		= sizeof(int),
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index b026ba0..194d0f3 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
>
>   	/* Initialize the default path max_retrans.  */
>   	peer->pathmaxrxt  = sctp_max_retrans_path;
> +	peer->pf_retrans  = sctp_pf_retrans;
>
>   	INIT_LIST_HEAD(&peer->transmitted);
>   	INIT_LIST_HEAD(&peer->send_ready);
> @@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
>   {
>   	unsigned long timeout;
>   	timeout = t->rto + sctp_jitter(t->rto);
> -	if (t->state != SCTP_UNCONFIRMED)
> +	if ((t->state != SCTP_UNCONFIRMED) &&
> +	    (t->state != SCTP_PF))
>   		timeout += t->hbinterval;
>   	timeout += jiffies;
>   	return timeout;
>



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
  2012-07-18 20:30     ` Joe Perches
@ 2012-07-19 10:45       ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-19 10:45 UTC (permalink / raw)
  To: Joe Perches
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller, linux-sctp

On Wed, Jul 18, 2012 at 01:30:58PM -0700, Joe Perches wrote:
> On Wed, 2012-07-18 at 14:01 -0400, Neil Horman wrote:
> > I've seen several attempts recently made to do quick failover of sctp transports
> > by reducing various retransmit timers and counters.  While its possible to
> > implement a faster failover on multihomed sctp associations, its not
> > particularly robust, in that it can lead to unneeded retransmits, as well as
> > false connection failures due to intermittent latency on a network.
> 
> trivia:
> 
> > diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> 
> > @@ -871,6 +885,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
> >  		spc_state = SCTP_ADDR_UNREACHABLE;
> >  		break;
> >  
> > +	case SCTP_TRANSPORT_PF:
> > +		transport->state = SCTP_PF;
> > +		ulp_notify = false;
> > +		break;
> 
> nicer to add a newline here
> 
Ack, I'll fix that.

> >  	default:
> >  		return;
> >  	}
> > @@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
> []
> > +	if (ulp_notify) {
> > +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> > +		memcpy(&addr, &transport->ipaddr,
> > +		       transport->af_specific->sockaddr_len);
> 
> Perhaps it's better to do the memcpy then the memset of the
> space left instead.
> 
> 		memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
> 		memset((char *)&addr) + transport->af_specific->sockaddr_len, 0,
> 		       sizeof(struct sockaddr_storage) - transport->af_specific->sockaddr_len);
> 		       
> 
hmm, not sure about that. It works either way for me, but I've not changed that
code, just the condition under which it was executed.  I'd rather save cleanups
like that for a separate patch if you don't mind.
Neil

> 
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
@ 2012-07-19 10:45       ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-19 10:45 UTC (permalink / raw)
  To: Joe Perches
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller, linux-sctp

On Wed, Jul 18, 2012 at 01:30:58PM -0700, Joe Perches wrote:
> On Wed, 2012-07-18 at 14:01 -0400, Neil Horman wrote:
> > I've seen several attempts recently made to do quick failover of sctp transports
> > by reducing various retransmit timers and counters.  While its possible to
> > implement a faster failover on multihomed sctp associations, its not
> > particularly robust, in that it can lead to unneeded retransmits, as well as
> > false connection failures due to intermittent latency on a network.
> 
> trivia:
> 
> > diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> 
> > @@ -871,6 +885,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
> >  		spc_state = SCTP_ADDR_UNREACHABLE;
> >  		break;
> >  
> > +	case SCTP_TRANSPORT_PF:
> > +		transport->state = SCTP_PF;
> > +		ulp_notify = false;
> > +		break;
> 
> nicer to add a newline here
> 
Ack, I'll fix that.

> >  	default:
> >  		return;
> >  	}
> > @@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
> []
> > +	if (ulp_notify) {
> > +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> > +		memcpy(&addr, &transport->ipaddr,
> > +		       transport->af_specific->sockaddr_len);
> 
> Perhaps it's better to do the memcpy then the memset of the
> space left instead.
> 
> 		memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
> 		memset((char *)&addr) + transport->af_specific->sockaddr_len, 0,
> 		       sizeof(struct sockaddr_storage) - transport->af_specific->sockaddr_len);
> 		       
> 
hmm, not sure about that. It works either way for me, but I've not changed that
code, just the condition under which it was executed.  I'd rather save cleanups
like that for a separate patch if you don't mind.
Neil

> 
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
  2012-07-18 21:23     ` Vlad Yasevich
@ 2012-07-19 10:46       ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-19 10:46 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp

On Wed, Jul 18, 2012 at 05:23:05PM -0400, Vlad Yasevich wrote:
> On 07/18/2012 02:01 PM, Neil Horman wrote:
> >I've seen several attempts recently made to do quick failover of sctp transports
> >by reducing various retransmit timers and counters.  While its possible to
> >implement a faster failover on multihomed sctp associations, its not
> >particularly robust, in that it can lead to unneeded retransmits, as well as
> >false connection failures due to intermittent latency on a network.
> >
> >Instead, lets implement the new ietf quick failover draft found here:
> >http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> >
> >This will let the sctp stack identify transports that have had a small number of
> >errors, and avoid using them quickly until their reliability can be
> >re-established.  I've tested this out on two virt guests connected via multiple
> >isolated virt networks and believe its in compliance with the above draft and
> >works well.
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >CC: Vlad Yasevich <vyasevich@gmail.com>
> >CC: Sridhar Samudrala <sri@us.ibm.com>
> >CC: "David S. Miller" <davem@davemloft.net>
> >CC: linux-sctp@vger.kernel.org
> >
> >---
> >Change notes:
> >
> >V2)
> >- Added socket option API from section 6.1 of the specification, as per
> >request from Vlad. Adding this socket option allows us to alter both the path
> >maximum retransmit value and the path partial failure threshold for each
> >transport and the association as a whole.
> >
> >- Added a per transport pf_retrans value, and initialized it from the
> >association value.  This makes each transport independently configurable as per
> >the socket option above, and prevents changes in the sysctl from bleeding into
> >an already created association.
> >---
> >  Documentation/networking/ip-sysctl.txt |   14 +++++
> >  include/net/sctp/constants.h           |    1 +
> >  include/net/sctp/structs.h             |   11 +++-
> >  include/net/sctp/user.h                |   11 ++++
> >  net/sctp/associola.c                   |   36 ++++++++++--
> >  net/sctp/outqueue.c                    |    6 +-
> >  net/sctp/sm_sideeffect.c               |   33 ++++++++++-
> >  net/sctp/socket.c                      |   96 ++++++++++++++++++++++++++++++++
> >  net/sctp/sysctl.c                      |    9 +++
> >  net/sctp/transport.c                   |    4 +-
> >  10 files changed, 206 insertions(+), 15 deletions(-)
> >
> 
> [ snip ]
> 
> >diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> >index b3b8a8d..dfffece 100644
> >--- a/net/sctp/socket.c
> >+++ b/net/sctp/socket.c
> >@@ -3470,6 +3470,52 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
> >  }
> >
> >
> >+/*
> >+ * SCTP_PEER_ADDR_THLDS
> >+ *
> >+ * This option allows us to alter the partially failed threshold for one or all
> >+ * transports in an association.  See Section 6.1 of:
> >+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> >+ */
> >+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> >+					    char __user *optval,
> >+					    unsigned int optlen)
> >+{
> >+	struct sctp_paddrthlds val;
> >+	struct sctp_transport *trans;
> >+	struct sctp_association *asoc;
> >+
> >+	if (optlen < sizeof(struct sctp_paddrthlds))
> >+		return -EINVAL;
> >+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> >+			   optlen))
> >+		return -EFAULT;
> 
> What if optlen is bigger?  You going to trash the stack.
> 
> >+
> >+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> >+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> >+		if (!asoc)
> >+			return -ENOENT;
> >+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> >+				    transports) {
> >+			trans->pathmaxrxt = val.spt_pathmaxrxt;
> >+			trans->pf_retrans = val.spt_pathpfthld;
> 
> You want to make sure that the values aren't 0.  Otherwise, you'll
> set the pathmaxrxt to 0 and that would be bad.
> 
> >+		}
> >+
> >+		asoc->pf_retrans = val.spt_pathpfthld;
> >+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
> 
> Ditto.
> 
> >+	} else {
> >+		trans = sctp_addr_id2transport(sk, &val.spt_address,
> >+					       val.spt_assoc_id);
> >+		if (!trans)
> >+			return -ENOENT;
> >+
> >+		trans->pathmaxrxt = val.spt_pathmaxrxt;
> >+		trans->pf_retrans = val.spt_pathpfthld;
> 
> Ditto.
> 
> >+	}
> >+
> >+	return 0;
> >+}
> >+
> >  /* API 6.2 setsockopt(), getsockopt()
> >   *
> >   * Applications use setsockopt() and getsockopt() to set or retrieve
> >@@ -3619,6 +3665,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
> >  	case SCTP_AUTO_ASCONF:
> >  		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
> >  		break;
> >+	case SCTP_PEER_ADDR_THLDS:
> >+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> >+		break;
> >  	default:
> >  		retval = -ENOPROTOOPT;
> >  		break;
> >@@ -5490,6 +5539,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
> >  	return 0;
> >  }
> >
> >+/*
> >+ * SCTP_PEER_ADDR_THLDS
> >+ *
> >+ * This option allows us to fetch the partially failed threshold for one or all
> >+ * transports in an association.  See Section 6.1 of:
> >+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> >+ */
> >+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> >+					    char __user *optval,
> >+					    int optlen)
> >+{
> >+	struct sctp_paddrthlds val;
> >+	struct sctp_transport *trans;
> >+	struct sctp_association *asoc;
> >+
> >+	if (optlen < sizeof(struct sctp_paddrthlds))
> >+		return -EINVAL;
> >+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> >+		return -EFAULT;
> 
> Again, trashing the stack if optlen and optval are bigger.
> 
> -vlad


Ack, I'll fix these up and repost.  Thanks!
Neil

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
@ 2012-07-19 10:46       ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-19 10:46 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp

On Wed, Jul 18, 2012 at 05:23:05PM -0400, Vlad Yasevich wrote:
> On 07/18/2012 02:01 PM, Neil Horman wrote:
> >I've seen several attempts recently made to do quick failover of sctp transports
> >by reducing various retransmit timers and counters.  While its possible to
> >implement a faster failover on multihomed sctp associations, its not
> >particularly robust, in that it can lead to unneeded retransmits, as well as
> >false connection failures due to intermittent latency on a network.
> >
> >Instead, lets implement the new ietf quick failover draft found here:
> >http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> >
> >This will let the sctp stack identify transports that have had a small number of
> >errors, and avoid using them quickly until their reliability can be
> >re-established.  I've tested this out on two virt guests connected via multiple
> >isolated virt networks and believe its in compliance with the above draft and
> >works well.
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >CC: Vlad Yasevich <vyasevich@gmail.com>
> >CC: Sridhar Samudrala <sri@us.ibm.com>
> >CC: "David S. Miller" <davem@davemloft.net>
> >CC: linux-sctp@vger.kernel.org
> >
> >---
> >Change notes:
> >
> >V2)
> >- Added socket option API from section 6.1 of the specification, as per
> >request from Vlad. Adding this socket option allows us to alter both the path
> >maximum retransmit value and the path partial failure threshold for each
> >transport and the association as a whole.
> >
> >- Added a per transport pf_retrans value, and initialized it from the
> >association value.  This makes each transport independently configurable as per
> >the socket option above, and prevents changes in the sysctl from bleeding into
> >an already created association.
> >---
> >  Documentation/networking/ip-sysctl.txt |   14 +++++
> >  include/net/sctp/constants.h           |    1 +
> >  include/net/sctp/structs.h             |   11 +++-
> >  include/net/sctp/user.h                |   11 ++++
> >  net/sctp/associola.c                   |   36 ++++++++++--
> >  net/sctp/outqueue.c                    |    6 +-
> >  net/sctp/sm_sideeffect.c               |   33 ++++++++++-
> >  net/sctp/socket.c                      |   96 ++++++++++++++++++++++++++++++++
> >  net/sctp/sysctl.c                      |    9 +++
> >  net/sctp/transport.c                   |    4 +-
> >  10 files changed, 206 insertions(+), 15 deletions(-)
> >
> 
> [ snip ]
> 
> >diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> >index b3b8a8d..dfffece 100644
> >--- a/net/sctp/socket.c
> >+++ b/net/sctp/socket.c
> >@@ -3470,6 +3470,52 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
> >  }
> >
> >
> >+/*
> >+ * SCTP_PEER_ADDR_THLDS
> >+ *
> >+ * This option allows us to alter the partially failed threshold for one or all
> >+ * transports in an association.  See Section 6.1 of:
> >+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> >+ */
> >+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> >+					    char __user *optval,
> >+					    unsigned int optlen)
> >+{
> >+	struct sctp_paddrthlds val;
> >+	struct sctp_transport *trans;
> >+	struct sctp_association *asoc;
> >+
> >+	if (optlen < sizeof(struct sctp_paddrthlds))
> >+		return -EINVAL;
> >+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> >+			   optlen))
> >+		return -EFAULT;
> 
> What if optlen is bigger?  You going to trash the stack.
> 
> >+
> >+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> >+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> >+		if (!asoc)
> >+			return -ENOENT;
> >+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> >+				    transports) {
> >+			trans->pathmaxrxt = val.spt_pathmaxrxt;
> >+			trans->pf_retrans = val.spt_pathpfthld;
> 
> You want to make sure that the values aren't 0.  Otherwise, you'll
> set the pathmaxrxt to 0 and that would be bad.
> 
> >+		}
> >+
> >+		asoc->pf_retrans = val.spt_pathpfthld;
> >+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
> 
> Ditto.
> 
> >+	} else {
> >+		trans = sctp_addr_id2transport(sk, &val.spt_address,
> >+					       val.spt_assoc_id);
> >+		if (!trans)
> >+			return -ENOENT;
> >+
> >+		trans->pathmaxrxt = val.spt_pathmaxrxt;
> >+		trans->pf_retrans = val.spt_pathpfthld;
> 
> Ditto.
> 
> >+	}
> >+
> >+	return 0;
> >+}
> >+
> >  /* API 6.2 setsockopt(), getsockopt()
> >   *
> >   * Applications use setsockopt() and getsockopt() to set or retrieve
> >@@ -3619,6 +3665,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
> >  	case SCTP_AUTO_ASCONF:
> >  		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
> >  		break;
> >+	case SCTP_PEER_ADDR_THLDS:
> >+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> >+		break;
> >  	default:
> >  		retval = -ENOPROTOOPT;
> >  		break;
> >@@ -5490,6 +5539,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
> >  	return 0;
> >  }
> >
> >+/*
> >+ * SCTP_PEER_ADDR_THLDS
> >+ *
> >+ * This option allows us to fetch the partially failed threshold for one or all
> >+ * transports in an association.  See Section 6.1 of:
> >+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> >+ */
> >+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> >+					    char __user *optval,
> >+					    int optlen)
> >+{
> >+	struct sctp_paddrthlds val;
> >+	struct sctp_transport *trans;
> >+	struct sctp_association *asoc;
> >+
> >+	if (optlen < sizeof(struct sctp_paddrthlds))
> >+		return -EINVAL;
> >+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> >+		return -EFAULT;
> 
> Again, trashing the stack if optlen and optval are bigger.
> 
> -vlad


Ack, I'll fix these up and repost.  Thanks!
Neil


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v3] sctp: Implement quick failover draft from tsvwg
  2012-07-13 18:26 ` Neil Horman
@ 2012-07-19 16:51   ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-19 16:51 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   11 +++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   37 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 +++++++++-
 net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 211 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans > path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..f70726c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -160,6 +160,7 @@ extern struct sctp_globals {
 	int max_retrans_association;
 	int max_retrans_path;
 	int max_retrans_init;
+	int pf_retrans;
 
 	/*
 	 * Policy for preforming sctp/socket accounting
@@ -258,6 +259,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +989,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1667,8 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..90fe36b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
+
 	default:
 		return;
 	}
@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..fef9bfa 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+	/* path_max_retrans shouldn't ever be zero */
+	if (!val.spt_pathmaxrxt)
+		return -EINVAL;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		asoc->pf_retrans = val.spt_pathpfthld;
+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	optlen = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH v3] sctp: Implement quick failover draft from tsvwg
@ 2012-07-19 16:51   ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-19 16:51 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   11 +++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   37 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 +++++++++-
 net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 211 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans >= path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..f70726c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -160,6 +160,7 @@ extern struct sctp_globals {
 	int max_retrans_association;
 	int max_retrans_path;
 	int max_retrans_init;
+	int pf_retrans;
 
 	/*
 	 * Policy for preforming sctp/socket accounting
@@ -258,6 +259,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +989,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1667,8 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..90fe36b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
+
 	default:
 		return;
 	}
@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..fef9bfa 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+	/* path_max_retrans shouldn't ever be zero */
+	if (!val.spt_pathmaxrxt)
+		return -EINVAL;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		asoc->pf_retrans = val.spt_pathpfthld;
+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	optlen = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
  2012-07-19 10:45       ` Neil Horman
@ 2012-07-19 16:54         ` Joe Perches
  -1 siblings, 0 replies; 48+ messages in thread
From: Joe Perches @ 2012-07-19 16:54 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller, linux-sctp

On Thu, 2012-07-19 at 06:45 -0400, Neil Horman wrote:
> On Wed, Jul 18, 2012 at 01:30:58PM -0700, Joe Perches wrote:
> > On Wed, 2012-07-18 at 14:01 -0400, Neil Horman wrote:
> > > I've seen several attempts recently made to do quick failover of sctp transports
> > > by reducing various retransmit timers and counters.  While its possible to
> > > implement a faster failover on multihomed sctp associations, its not
> > > particularly robust, in that it can lead to unneeded retransmits, as well as
> > > false connection failures due to intermittent latency on a network.
[]
> > > @@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
> > []
> > > +	if (ulp_notify) {
> > > +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> > > +		memcpy(&addr, &transport->ipaddr,
> > > +		       transport->af_specific->sockaddr_len);
> > 
> > Perhaps it's better to do the memcpy then the memset of the
> > space left instead.
> > 
> > 		memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
> > 		memset((char *)&addr + transport->af_specific->sockaddr_len, 0,
> > 		       sizeof(struct sockaddr_storage) - transport->af_specific->sockaddr_len);
> > 
> hmm, not sure about that. It works either way for me, but I've not changed that
> code, just the condition under which it was executed.  I'd rather save cleanups
> like that for a separate patch if you don't mind.

Not a bit.

It's almost certain reversing the order is slower for v4
addresses anyway.  It might be slower for v6 too given
the arithmetic.

cheers, Joe

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
@ 2012-07-19 16:54         ` Joe Perches
  0 siblings, 0 replies; 48+ messages in thread
From: Joe Perches @ 2012-07-19 16:54 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller, linux-sctp

On Thu, 2012-07-19 at 06:45 -0400, Neil Horman wrote:
> On Wed, Jul 18, 2012 at 01:30:58PM -0700, Joe Perches wrote:
> > On Wed, 2012-07-18 at 14:01 -0400, Neil Horman wrote:
> > > I've seen several attempts recently made to do quick failover of sctp transports
> > > by reducing various retransmit timers and counters.  While its possible to
> > > implement a faster failover on multihomed sctp associations, its not
> > > particularly robust, in that it can lead to unneeded retransmits, as well as
> > > false connection failures due to intermittent latency on a network.
[]
> > > @@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
> > []
> > > +	if (ulp_notify) {
> > > +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> > > +		memcpy(&addr, &transport->ipaddr,
> > > +		       transport->af_specific->sockaddr_len);
> > 
> > Perhaps it's better to do the memcpy then the memset of the
> > space left instead.
> > 
> > 		memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
> > 		memset((char *)&addr + transport->af_specific->sockaddr_len, 0,
> > 		       sizeof(struct sockaddr_storage) - transport->af_specific->sockaddr_len);
> > 
> hmm, not sure about that. It works either way for me, but I've not changed that
> code, just the condition under which it was executed.  I'd rather save cleanups
> like that for a separate patch if you don't mind.

Not a bit.

It's almost certain reversing the order is slower for v4
addresses anyway.  It might be slower for v6 too given
the arithmetic.

cheers, Joe


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3] sctp: Implement quick failover draft from tsvwg
  2012-07-19 16:51   ` Neil Horman
@ 2012-07-20 16:51     ` Flavio Leitner
  -1 siblings, 0 replies; 48+ messages in thread
From: Flavio Leitner @ 2012-07-20 16:51 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

On Thu, 19 Jul 2012 12:51:44 -0400
Neil Horman <nhorman@tuxdriver.com> wrote:
[...]
>  
> +pf_retrans - INTEGER
> +	The number of retransmissions that will be attempted on a given path
> +	before traffic is redirected to an alternate transport (should one
> +	exist).  Note this is distinct from path_max_retrans, as a path that
> +	passes the pf_retrans threshold can still be used.  Its only
> +	deprioritized when a transmission path is selected by the stack.  This
> +	setting is primarily used to enable fast failover mechanisms without
> +	having to reduce path_max_retrans to a very low value.  See:
> +	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> +	for details.  Note also that a value of pf_retrans > path_max_retrans
> +	disables this feature
> +
> +	Default: 0
> +
>  rto_initial - INTEGER
[...]
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index e4652fe..f70726c 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -160,6 +160,7 @@ extern struct sctp_globals {
>  	int max_retrans_association;
>  	int max_retrans_path;
>  	int max_retrans_init;
> +	int pf_retrans;

[...]
>  
> +	/* This is the partially failed retrans value for the transport
> +	 * and will be initialized from the assocs value.  This can be changed
> +	 * using the SCTP_PEER_ADDR_THLDS socket option
> +	 */
> +	int pf_retrans;
>  	/* PMTU	      : The current known path MTU.  */
>  	__u32 pathmtu;
>  
> @@ -1660,6 +1667,8 @@ struct sctp_association {
>  	 */
>  	int max_retrans;
>  
> +	int pf_retrans;
> +


You've documented it in one place, but not in the others.  I suggest including
a reference like this
       /* See the description in struct sctp_transport */
at the two places where it is missing.

Nice feature,
fbl

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v3] sctp: Implement quick failover draft from tsvwg
@ 2012-07-20 16:51     ` Flavio Leitner
  0 siblings, 0 replies; 48+ messages in thread
From: Flavio Leitner @ 2012-07-20 16:51 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

On Thu, 19 Jul 2012 12:51:44 -0400
Neil Horman <nhorman@tuxdriver.com> wrote:
[...]
>  
> +pf_retrans - INTEGER
> +	The number of retransmissions that will be attempted on a given path
> +	before traffic is redirected to an alternate transport (should one
> +	exist).  Note this is distinct from path_max_retrans, as a path that
> +	passes the pf_retrans threshold can still be used.  Its only
> +	deprioritized when a transmission path is selected by the stack.  This
> +	setting is primarily used to enable fast failover mechanisms without
> +	having to reduce path_max_retrans to a very low value.  See:
> +	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> +	for details.  Note also that a value of pf_retrans > path_max_retrans
> +	disables this feature
> +
> +	Default: 0
> +
>  rto_initial - INTEGER
[...]
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index e4652fe..f70726c 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -160,6 +160,7 @@ extern struct sctp_globals {
>  	int max_retrans_association;
>  	int max_retrans_path;
>  	int max_retrans_init;
> +	int pf_retrans;

[...]
>  
> +	/* This is the partially failed retrans value for the transport
> +	 * and will be initialized from the assocs value.  This can be changed
> +	 * using the SCTP_PEER_ADDR_THLDS socket option
> +	 */
> +	int pf_retrans;
>  	/* PMTU	      : The current known path MTU.  */
>  	__u32 pathmtu;
>  
> @@ -1660,6 +1667,8 @@ struct sctp_association {
>  	 */
>  	int max_retrans;
>  
> +	int pf_retrans;
> +


You've documented it in one place, but not in the others.  I suggest including
a reference like this
       /* See the description in struct sctp_transport */
at the two places where it is missing.

Nice feature,
fbl

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v4] sctp: Implement quick failover draft from tsvwg
  2012-07-13 18:26 ` Neil Horman
@ 2012-07-20 17:19   ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-20 17:19 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations this way, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new ietf quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly stop using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)

V4)
- Added additional documentation (Flavio Leitner)
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   20 ++++++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   37 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 +++++++++-
 net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 220 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It is only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans >= path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..cee0678 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -161,6 +161,12 @@ extern struct sctp_globals {
 	int max_retrans_path;
 	int max_retrans_init;
 
+	/* Potentially-Failed.Max.Retrans sysctl value
+	 * taken from:
+	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
+	 */
+	int pf_retrans;
+
 	/*
 	 * Policy for preforming sctp/socket accounting
 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
@@ -258,6 +264,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +994,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1672,12 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	/* This is the partially failed retrans value for the association.
+	 * It is initialized from the pf_retrans sysctl, is copied to each
+	 * new transport, and can be changed using SCTP_PEER_ADDR_THLDS
+	 */
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..90fe36b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* When leaving the PF state, set cwnd to 1 and don't
+		 * inform the ULP about the transition to active, see
+		 * SCTP Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
+
 	default:
 		return;
 	}
@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..fef9bfa 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the path max retrans and partially failed
+ * thresholds for one or all transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+	/* path_max_retrans shouldn't ever be zero */
+	if (!val.spt_pathmaxrxt)
+		return -EINVAL;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		asoc->pf_retrans = val.spt_pathpfthld;
+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the path max retrans and partially failed
+ * thresholds of one transport or of the association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	optlen = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH v4] sctp: Implement quick failover draft from tsvwg
@ 2012-07-20 17:19   ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-20 17:19 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations this way, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new ietf quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly stop using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)

V4)
- Added additional documentation (Flavio Leitner)
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   20 ++++++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   37 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 +++++++++-
 net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 220 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It is only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans >= path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..cee0678 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -161,6 +161,12 @@ extern struct sctp_globals {
 	int max_retrans_path;
 	int max_retrans_init;
 
+	/* Potentially-Failed.Max.Retrans sysctl value
+	 * taken from:
+	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
+	 */
+	int pf_retrans;
+
 	/*
 	 * Policy for preforming sctp/socket accounting
 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
@@ -258,6 +264,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +994,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1672,12 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	/* This is the partially failed retrans value for the association.
+	 * It is initialized from the pf_retrans sysctl and can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option.
+	 */
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..90fe36b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
+
 	default:
 		return;
 	}
@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..fef9bfa 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+	/* path_max_retrans shouldn't ever be zero */
+	if (!val.spt_pathmaxrxt)
+		return -EINVAL;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		asoc->pf_retrans = val.spt_pathpfthld;
+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	optlen = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH v4] sctp: Implement quick failover draft from tsvwg
  2012-07-20 17:19   ` Neil Horman
@ 2012-07-20 17:55     ` Vlad Yasevich
  -1 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-20 17:55 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp, joe

On 07/20/2012 01:19 PM, Neil Horman wrote:
> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.
>
> Instead, lets implement the new ietf quick failover draft found here:
> http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
> This will let the sctp stack identify transports that have had a small number of
> errors, and avoid using them quickly until their reliability can be
> re-established.  I've tested this out on two virt guests connected via multiple
> isolated virt networks and believe its in compliance with the above draft and
> works well.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Vlad Yasevich <vyasevich@gmail.com>
> CC: Sridhar Samudrala <sri@us.ibm.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: linux-sctp@vger.kernel.org
> CC: joe@perches.com
>
> ---
> Change notes:
>
> V2)
> - Added socket option API from section 6.1 of the specification, as per
> request from Vlad. Adding this socket option allows us to alter both the path
> maximum retransmit value and the path partial failure threshold for each
> transport and the association as a whole.
>
> - Added a per transport pf_retrans value, and initialized it from the
> association value.  This makes each transport independently configurable as per
> the socket option above, and prevents changes in the sysctl from bleeding into
> an already created association.
>
> V3)
> - Cleaned up some line spacing (Joe Perches)
> - Fixed some socket option user data sanitization (Vlad Yasevich)
>
> V4)
> - Added additional documentation (Flavio Leitner)
> ---
>   Documentation/networking/ip-sysctl.txt |   14 +++++
>   include/net/sctp/constants.h           |    1 +
>   include/net/sctp/structs.h             |   20 ++++++-
>   include/net/sctp/user.h                |   11 ++++
>   net/sctp/associola.c                   |   37 ++++++++++--
>   net/sctp/outqueue.c                    |    6 +-
>   net/sctp/sm_sideeffect.c               |   33 +++++++++-
>   net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
>   net/sctp/sysctl.c                      |    9 +++
>   net/sctp/transport.c                   |    4 +-
>   10 files changed, 220 insertions(+), 15 deletions(-)
>

[ snip ]

>
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index b3b8a8d..fef9bfa 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
>   }
>
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to alter the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    unsigned int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> +			   sizeof(struct sctp_paddrthlds)))
> +		return -EFAULT;
> +
> +	/* path_max_retrans shouldn't ever be zero */
> +	if (!val.spt_pathmaxrxt)
> +		return -EINVAL;

I am not sure I like this solution.  This means that the application 
must fetch the pathmaxrxt and then write the same value back here.
Why not simply ignore the pathmaxrxt if it's 0?  That way someone can 
just tweak the pf value without changing the pathmaxrxt.



> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> +				    transports) {
> +			trans->pathmaxrxt = val.spt_pathmaxrxt;
> +			trans->pf_retrans = val.spt_pathpfthld;
> +		}
> +
> +		asoc->pf_retrans = val.spt_pathpfthld;
> +		asoc->pathmaxrxt = val.spt_pathmaxrxt;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		trans->pathmaxrxt = val.spt_pathmaxrxt;
> +		trans->pf_retrans = val.spt_pathpfthld;
> +	}
> +
> +	return 0;
> +}
> +
>   /* API 6.2 setsockopt(), getsockopt()
>    *
>    * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
>   	case SCTP_AUTO_ASCONF:
>   		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
>   		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> +		break;
>   	default:
>   		retval = -ENOPROTOOPT;
>   		break;
> @@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
>   	return 0;
>   }
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to fetch the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	optlen = sizeof(struct sctp_paddrthlds);
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> +		return -EFAULT;
> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +
> +		val.spt_pathpfthld = asoc->pf_retrans;
> +		val.spt_pathmaxrxt = asoc->pathmaxrxt;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		val.spt_pathmaxrxt = trans->pathmaxrxt;
> +		val.spt_pathpfthld = trans->pf_retrans;
> +	}
> +
> +	if (copy_to_user(optval, &val, optlen))
> +		return -EFAULT;
> +

getsockopt typically returns the length of the option data that was 
written to the user.

> +	return 0;
> +}
> +
>   SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>   				char __user *optval, int __user *optlen)
>   {
> @@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>   	case SCTP_AUTO_ASCONF:
>   		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
>   		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
> +		break;

You are passing the len.  The user may have passed in a bigger buffer 
and is expecting back the length of the option.

-vlad

>   	default:
>   		retval = -ENOPROTOOPT;
>   		break;
> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> index e5fe639..2b2bfe9 100644
> --- a/net/sctp/sysctl.c
> +++ b/net/sctp/sysctl.c
> @@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
>   		.extra2		= &int_max
>   	},
>   	{
> +		.procname	= "pf_retrans",
> +		.data		= &sctp_pf_retrans,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &int_max
> +	},
> +	{
>   		.procname	= "max_init_retransmits",
>   		.data		= &sctp_max_retrans_init,
>   		.maxlen		= sizeof(int),
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index b026ba0..194d0f3 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
>
>   	/* Initialize the default path max_retrans.  */
>   	peer->pathmaxrxt  = sctp_max_retrans_path;
> +	peer->pf_retrans  = sctp_pf_retrans;
>
>   	INIT_LIST_HEAD(&peer->transmitted);
>   	INIT_LIST_HEAD(&peer->send_ready);
> @@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
>   {
>   	unsigned long timeout;
>   	timeout = t->rto + sctp_jitter(t->rto);
> -	if (t->state != SCTP_UNCONFIRMED)
> +	if ((t->state != SCTP_UNCONFIRMED) &&
> +	    (t->state != SCTP_PF))
>   		timeout += t->hbinterval;
>   	timeout += jiffies;
>   	return timeout;
>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v4] sctp: Implement quick failover draft from tsvwg
@ 2012-07-20 17:55     ` Vlad Yasevich
  0 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-20 17:55 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp, joe

On 07/20/2012 01:19 PM, Neil Horman wrote:
> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.
>
> Instead, lets implement the new ietf quick failover draft found here:
> http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
> This will let the sctp stack identify transports that have had a small number of
> errors, and avoid using them quickly until their reliability can be
> re-established.  I've tested this out on two virt guests connected via multiple
> isolated virt networks and believe its in compliance with the above draft and
> works well.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Vlad Yasevich <vyasevich@gmail.com>
> CC: Sridhar Samudrala <sri@us.ibm.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: linux-sctp@vger.kernel.org
> CC: joe@perches.com
>
> ---
> Change notes:
>
> V2)
> - Added socket option API from section 6.1 of the specification, as per
> request from Vlad. Adding this socket option allows us to alter both the path
> maximum retransmit value and the path partial failure threshold for each
> transport and the association as a whole.
>
> - Added a per transport pf_retrans value, and initialized it from the
> association value.  This makes each transport independently configurable as per
> the socket option above, and prevents changes in the sysctl from bleeding into
> an already created association.
>
> V3)
> - Cleaned up some line spacing (Joe Perches)
> - Fixed some socket option user data sanitization (Vlad Yasevich)
>
> V4)
> - Added additional documentation (Flavio Leitner)
> ---
>   Documentation/networking/ip-sysctl.txt |   14 +++++
>   include/net/sctp/constants.h           |    1 +
>   include/net/sctp/structs.h             |   20 ++++++-
>   include/net/sctp/user.h                |   11 ++++
>   net/sctp/associola.c                   |   37 ++++++++++--
>   net/sctp/outqueue.c                    |    6 +-
>   net/sctp/sm_sideeffect.c               |   33 +++++++++-
>   net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
>   net/sctp/sysctl.c                      |    9 +++
>   net/sctp/transport.c                   |    4 +-
>   10 files changed, 220 insertions(+), 15 deletions(-)
>

[ snip ]

>
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index b3b8a8d..fef9bfa 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
>   }
>
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to alter the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    unsigned int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> +			   sizeof(struct sctp_paddrthlds)))
> +		return -EFAULT;
> +
> +	/* path_max_retrans shouldn't ever be zero */
> +	if (!val.spt_pathmaxrxt)
> +		return -EINVAL;

I am not sure I like this solution.  This means that the application 
must fetch the pathmaxrxt and then write the same value back here.
Why not simply ignore the pathmaxrxt if it's 0?  That way someone can 
just tweak the pf value without changing the pathmaxrxt.



> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> +				    transports) {
> +			trans->pathmaxrxt = val.spt_pathmaxrxt;
> +			trans->pf_retrans = val.spt_pathpfthld;
> +		}
> +
> +		asoc->pf_retrans = val.spt_pathpfthld;
> +		asoc->pathmaxrxt = val.spt_pathmaxrxt;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		trans->pathmaxrxt = val.spt_pathmaxrxt;
> +		trans->pf_retrans = val.spt_pathpfthld;
> +	}
> +
> +	return 0;
> +}
> +
>   /* API 6.2 setsockopt(), getsockopt()
>    *
>    * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
>   	case SCTP_AUTO_ASCONF:
>   		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
>   		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> +		break;
>   	default:
>   		retval = -ENOPROTOOPT;
>   		break;
> @@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
>   	return 0;
>   }
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to fetch the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	optlen = sizeof(struct sctp_paddrthlds);
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> +		return -EFAULT;
> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +
> +		val.spt_pathpfthld = asoc->pf_retrans;
> +		val.spt_pathmaxrxt = asoc->pathmaxrxt;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		val.spt_pathmaxrxt = trans->pathmaxrxt;
> +		val.spt_pathpfthld = trans->pf_retrans;
> +	}
> +
> +	if (copy_to_user(optval, &val, optlen))
> +		return -EFAULT;
> +

getsockopt typically returns the length of the option data that was 
written to the user.

> +	return 0;
> +}
> +
>   SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>   				char __user *optval, int __user *optlen)
>   {
> @@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>   	case SCTP_AUTO_ASCONF:
>   		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
>   		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
> +		break;

You are passing the len.  The user may have passed in a bigger buffer 
and is expecting back the length of the option.

-vlad

>   	default:
>   		retval = -ENOPROTOOPT;
>   		break;
> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> index e5fe639..2b2bfe9 100644
> --- a/net/sctp/sysctl.c
> +++ b/net/sctp/sysctl.c
> @@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
>   		.extra2		= &int_max
>   	},
>   	{
> +		.procname	= "pf_retrans",
> +		.data		= &sctp_pf_retrans,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &int_max
> +	},
> +	{
>   		.procname	= "max_init_retransmits",
>   		.data		= &sctp_max_retrans_init,
>   		.maxlen		= sizeof(int),
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index b026ba0..194d0f3 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
>
>   	/* Initialize the default path max_retrans.  */
>   	peer->pathmaxrxt  = sctp_max_retrans_path;
> +	peer->pf_retrans  = sctp_pf_retrans;
>
>   	INIT_LIST_HEAD(&peer->transmitted);
>   	INIT_LIST_HEAD(&peer->send_ready);
> @@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
>   {
>   	unsigned long timeout;
>   	timeout = t->rto + sctp_jitter(t->rto);
> -	if (t->state != SCTP_UNCONFIRMED)
> +	if ((t->state != SCTP_UNCONFIRMED) &&
> +	    (t->state != SCTP_PF))
>   		timeout += t->hbinterval;
>   	timeout += jiffies;
>   	return timeout;
>



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v4] sctp: Implement quick failover draft from tsvwg
  2012-07-20 17:55     ` Vlad Yasevich
@ 2012-07-20 18:36       ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-20 18:36 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp, joe

On Fri, Jul 20, 2012 at 01:55:35PM -0400, Vlad Yasevich wrote:
> On 07/20/2012 01:19 PM, Neil Horman wrote:
> >I've seen several attempts recently made to do quick failover of sctp transports
> >by reducing various retransmit timers and counters.  While its possible to
> >implement a faster failover on multihomed sctp associations, its not
> >particularly robust, in that it can lead to unneeded retransmits, as well as
> >false connection failures due to intermittent latency on a network.
> >
> >Instead, lets implement the new ietf quick failover draft found here:
> >http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> >
> >This will let the sctp stack identify transports that have had a small number of
> >errors, and avoid using them quickly until their reliability can be
> >re-established.  I've tested this out on two virt guests connected via multiple
> >isolated virt networks and believe its in compliance with the above draft and
> >works well.
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >CC: Vlad Yasevich <vyasevich@gmail.com>
> >CC: Sridhar Samudrala <sri@us.ibm.com>
> >CC: "David S. Miller" <davem@davemloft.net>
> >CC: linux-sctp@vger.kernel.org
> >CC: joe@perches.com
> >
> >---
> >Change notes:
> >
> >V2)
> >- Added socket option API from section 6.1 of the specification, as per
> >request from Vlad. Adding this socket option allows us to alter both the path
> >maximum retransmit value and the path partial failure threshold for each
> >transport and the association as a whole.
> >
> >- Added a per transport pf_retrans value, and initialized it from the
> >association value.  This makes each transport independently configurable as per
> >the socket option above, and prevents changes in the sysctl from bleeding into
> >an already created association.
> >
> >V3)
> >- Cleaned up some line spacing (Joe Perches)
> >- Fixed some socket option user data sanitization (Vlad Yasevich)
> >
> >V4)
> >- Added additional documentation (Flavio Leitner)
> >---
> >  Documentation/networking/ip-sysctl.txt |   14 +++++
> >  include/net/sctp/constants.h           |    1 +
> >  include/net/sctp/structs.h             |   20 ++++++-
> >  include/net/sctp/user.h                |   11 ++++
> >  net/sctp/associola.c                   |   37 ++++++++++--
> >  net/sctp/outqueue.c                    |    6 +-
> >  net/sctp/sm_sideeffect.c               |   33 +++++++++-
> >  net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
> >  net/sctp/sysctl.c                      |    9 +++
> >  net/sctp/transport.c                   |    4 +-
> >  10 files changed, 220 insertions(+), 15 deletions(-)
> >
> 
> [ snip ]
> 
> >
> >diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> >index b3b8a8d..fef9bfa 100644
> >--- a/net/sctp/socket.c
> >+++ b/net/sctp/socket.c
> >@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
> >  }
> >
> >
> >+/*
> >+ * SCTP_PEER_ADDR_THLDS
> >+ *
> >+ * This option allows us to alter the partially failed threshold for one or all
> >+ * transports in an association.  See Section 6.1 of:
> >+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> >+ */
> >+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> >+					    char __user *optval,
> >+					    unsigned int optlen)
> >+{
> >+	struct sctp_paddrthlds val;
> >+	struct sctp_transport *trans;
> >+	struct sctp_association *asoc;
> >+
> >+	if (optlen < sizeof(struct sctp_paddrthlds))
> >+		return -EINVAL;
> >+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> >+			   sizeof(struct sctp_paddrthlds)))
> >+		return -EFAULT;
> >+
> >+	/* path_max_retrans shouldn't ever be zero */
> >+	if (!val.spt_pathmaxrxt)
> >+		return -EINVAL;
> 
> I am not sure I like this solution.  This means that the application
> must fetch the pathmaxrxt and then write the same value back here.
> Why not simply ignore the pathmaxrxt if it's 0?  That way someone
> can just tweak the pf value without changing the pathmaxrxt.
> 
> 
Yeah, I can make that change.
Neil

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v4] sctp: Implement quick failover draft from tsvwg
@ 2012-07-20 18:36       ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-20 18:36 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp, joe

On Fri, Jul 20, 2012 at 01:55:35PM -0400, Vlad Yasevich wrote:
> On 07/20/2012 01:19 PM, Neil Horman wrote:
> >I've seen several attempts recently made to do quick failover of sctp transports
> >by reducing various retransmit timers and counters.  While its possible to
> >implement a faster failover on multihomed sctp associations, its not
> >particularly robust, in that it can lead to unneeded retransmits, as well as
> >false connection failures due to intermittent latency on a network.
> >
> >Instead, lets implement the new ietf quick failover draft found here:
> >http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> >
> >This will let the sctp stack identify transports that have had a small number of
> >errors, and avoid using them quickly until their reliability can be
> >re-established.  I've tested this out on two virt guests connected via multiple
> >isolated virt networks and believe its in compliance with the above draft and
> >works well.
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >CC: Vlad Yasevich <vyasevich@gmail.com>
> >CC: Sridhar Samudrala <sri@us.ibm.com>
> >CC: "David S. Miller" <davem@davemloft.net>
> >CC: linux-sctp@vger.kernel.org
> >CC: joe@perches.com
> >
> >---
> >Change notes:
> >
> >V2)
> >- Added socket option API from section 6.1 of the specification, as per
> >request from Vlad. Adding this socket option allows us to alter both the path
> >maximum retransmit value and the path partial failure threshold for each
> >transport and the association as a whole.
> >
> >- Added a per transport pf_retrans value, and initialized it from the
> >association value.  This makes each transport independently configurable as per
> >the socket option above, and prevents changes in the sysctl from bleeding into
> >an already created association.
> >
> >V3)
> >- Cleaned up some line spacing (Joe Perches)
> >- Fixed some socket option user data sanitization (Vlad Yasevich)
> >
> >V4)
> >- Added additional documentation (Flavio Leitner)
> >---
> >  Documentation/networking/ip-sysctl.txt |   14 +++++
> >  include/net/sctp/constants.h           |    1 +
> >  include/net/sctp/structs.h             |   20 ++++++-
> >  include/net/sctp/user.h                |   11 ++++
> >  net/sctp/associola.c                   |   37 ++++++++++--
> >  net/sctp/outqueue.c                    |    6 +-
> >  net/sctp/sm_sideeffect.c               |   33 +++++++++-
> >  net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
> >  net/sctp/sysctl.c                      |    9 +++
> >  net/sctp/transport.c                   |    4 +-
> >  10 files changed, 220 insertions(+), 15 deletions(-)
> >
> 
> [ snip ]
> 
> >
> >diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> >index b3b8a8d..fef9bfa 100644
> >--- a/net/sctp/socket.c
> >+++ b/net/sctp/socket.c
> >@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
> >  }
> >
> >
> >+/*
> >+ * SCTP_PEER_ADDR_THLDS
> >+ *
> >+ * This option allows us to alter the partially failed threshold for one or all
> >+ * transports in an association.  See Section 6.1 of:
> >+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> >+ */
> >+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> >+					    char __user *optval,
> >+					    unsigned int optlen)
> >+{
> >+	struct sctp_paddrthlds val;
> >+	struct sctp_transport *trans;
> >+	struct sctp_association *asoc;
> >+
> >+	if (optlen < sizeof(struct sctp_paddrthlds))
> >+		return -EINVAL;
> >+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> >+			   sizeof(struct sctp_paddrthlds)))
> >+		return -EFAULT;
> >+
> >+	/* path_max_retrans shouldn't ever be zero */
> >+	if (!val.spt_pathmaxrxt)
> >+		return -EINVAL;
> 
> I am not sure I like this solution.  This means that the application
> must fetch the pathmaxrxt and then write the same value back here.
> Why not simply ignore the pathmaxrxt if it's 0?  That way someone
> can just tweak the pf value without changing the pathmaxrxt.
> 
> 
Yeah, I can make that change.
Neil


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v5] sctp: Implement quick failover draft from tsvwg
  2012-07-13 18:26 ` Neil Horman
@ 2012-07-20 18:51   ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-20 18:51 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)

V4)
- Added additional documentation (Flavio Leitner)

V5)
- Modified setsockopt option to ignore 0 pathmaxrxt rather than return
  error (Vlad Yasevich)
- Modified getsockopt to return option length written (Vlad Y.)
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   20 ++++++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   37 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 +++++++++-
 net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 220 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans > path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..cee0678 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -161,6 +161,12 @@ extern struct sctp_globals {
 	int max_retrans_path;
 	int max_retrans_init;
 
+	/* Potentially-Failed.Max.Retrans sysctl value
+	 * taken from:
+	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
+	 */
+	int pf_retrans;
+
 	/*
 	 * Policy for preforming sctp/socket accounting
 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
@@ -258,6 +264,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +994,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1672,12 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	/* This is the partially failed retrans value for the association;
+	 * it is initialized from the pf_retrans sysctl value.  This can be
+	 * changed using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..90fe36b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
+
 	default:
 		return;
 	}
@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..bba551f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			if (val.spt_pathmaxrxt)
+				trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		if (val.spt_pathmaxrxt)
+			asoc->pathmaxrxt = val.spt_pathmaxrxt;
+		asoc->pf_retrans = val.spt_pathpfthld;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		if (val.spt_pathmaxrxt)
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	optlen = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return optlen;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH v5] sctp: Implement quick failover draft from tsvwg
@ 2012-07-20 18:51   ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-20 18:51 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)

V4)
- Added additional documentation (Flavio Leitner)

V5)
- Modified setsockopt option to ignore 0 pathmaxrxt rather than return
  error (Vlad Yasevich)
- Modified getsockopt to return option length written (Vlad Y.)
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   20 ++++++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   37 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 +++++++++-
 net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 220 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans > path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..cee0678 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -161,6 +161,12 @@ extern struct sctp_globals {
 	int max_retrans_path;
 	int max_retrans_init;
 
+	/* Potentially-Failed.Max.Retrans sysctl value
+	 * taken from:
+	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
+	 */
+	int pf_retrans;
+
 	/*
 	 * Policy for preforming sctp/socket accounting
 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
@@ -258,6 +264,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +994,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1672,12 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	/* This is the partially failed retrans value for the association
+	 * and will be initialized from the sctp_pf_retrans sysctl value.
+	 * This can be changed using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..90fe36b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
+
 	default:
 		return;
 	}
@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..bba551f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			if (val.spt_pathmaxrxt)
+				trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		if (val.spt_pathmaxrxt)
+			asoc->pathmaxrxt = val.spt_pathmaxrxt;
+		asoc->pf_retrans = val.spt_pathpfthld;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		if (val.spt_pathmaxrxt)
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	optlen = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return optlen;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
  2012-07-20 18:51   ` Neil Horman
@ 2012-07-20 19:10     ` Flavio Leitner
  -1 siblings, 0 replies; 48+ messages in thread
From: Flavio Leitner @ 2012-07-20 19:10 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

On Fri, 20 Jul 2012 14:51:59 -0400
Neil Horman <nhorman@tuxdriver.com> wrote:

> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.
> 
> Instead, lets implement the new ietf quick failover draft found here:
> http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> 
> This will let the sctp stack identify transports that have had a small number of
> errors, and avoid using them quickly until their reliability can be
> re-established.  I've tested this out on two virt guests connected via multiple
> isolated virt networks and believe its in compliance with the above draft and
> works well.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Vlad Yasevich <vyasevich@gmail.com>
> CC: Sridhar Samudrala <sri@us.ibm.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: linux-sctp@vger.kernel.org
> CC: joe@perches.com
> 
> ---
> Change notes:
> 
> V2)
> - Added socket option API from section 6.1 of the specification, as per
> request from Vlad. Adding this socket option allows us to alter both the path
> maximum retransmit value and the path partial failure threshold for each
> transport and the association as a whole.
> 
> - Added a per transport pf_retrans value, and initialized it from the
> association value.  This makes each transport independently configurable as per
> the socket option above, and prevents changes in the sysctl from bleeding into
> an already created association.
> 
> V3)
> - Cleaned up some line spacing (Joe Perches)
> - Fixed some socket option user data sanitization (Vlad Yasevich)
> 
> V4)
> - Added additional documentation (Flavio Leitner)
> 
> V5)
> - Modified setsockopt option to ignore 0 pathmaxrxt rather than return
>   error (Vlad Yasevich)
> - Modified getsocopt to return option length written (Vlad Y.)
> ---
>  Documentation/networking/ip-sysctl.txt |   14 +++++
>  include/net/sctp/constants.h           |    1 +
>  include/net/sctp/structs.h             |   20 ++++++-
>  include/net/sctp/user.h                |   11 ++++
>  net/sctp/associola.c                   |   37 ++++++++++--
>  net/sctp/outqueue.c                    |    6 +-
>  net/sctp/sm_sideeffect.c               |   33 +++++++++-
>  net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
>  net/sctp/sysctl.c                      |    9 +++
>  net/sctp/transport.c                   |    4 +-
>  10 files changed, 220 insertions(+), 15 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index 47b6c79..c636f9c 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
>  
>  	Default: 5
>  
> +pf_retrans - INTEGER
> +	The number of retransmissions that will be attempted on a given path
> +	before traffic is redirected to an alternate transport (should one
> +	exist).  Note this is distinct from path_max_retrans, as a path that
> +	passes the pf_retrans threshold can still be used.  Its only
> +	deprioritized when a transmission path is selected by the stack.  This
> +	setting is primarily used to enable fast failover mechanisms without
> +	having to reduce path_max_retrans to a very low value.  See:
> +	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> +	for details.  Note also that a value of pf_retrans > path_max_retrans
> +	disables this feature
> +
> +	Default: 0
> +
>  rto_initial - INTEGER
>  	The initial round trip timeout value in milliseconds that will be used
>  	in calculating round trip times.  This is the initial time interval
> diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
> index 942b864..d053d2e 100644
> --- a/include/net/sctp/constants.h
> +++ b/include/net/sctp/constants.h
> @@ -334,6 +334,7 @@ typedef enum {
>  typedef enum {
>  	SCTP_TRANSPORT_UP,
>  	SCTP_TRANSPORT_DOWN,
> +	SCTP_TRANSPORT_PF,
>  } sctp_transport_cmd_t;
>  
>  /* These are the address scopes defined mainly for IPv4 addresses
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index e4652fe..cee0678 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -161,6 +161,12 @@ extern struct sctp_globals {
>  	int max_retrans_path;
>  	int max_retrans_init;
>  
> +	/* Potentially-Failed.Max.Retrans sysctl value
> +	 * taken from:
> +	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> +	 */
> +	int pf_retrans;
> +
>  	/*
>  	 * Policy for preforming sctp/socket accounting
>  	 * 0   - do socket level accounting, all assocs share sk_sndbuf
> @@ -258,6 +264,7 @@ extern struct sctp_globals {
>  #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
>  #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
>  #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
> +#define sctp_pf_retrans			(sctp_globals.pf_retrans)
>  #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
>  #define sctp_sack_timeout		(sctp_globals.sack_timeout)
>  #define sctp_hb_interval		(sctp_globals.hb_interval)
> @@ -987,10 +994,15 @@ struct sctp_transport {
>  
>  	/* This is the max_retrans value for the transport and will
>  	 * be initialized from the assocs value.  This can be changed
> -	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
> +	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
>  	 */
>  	__u16 pathmaxrxt;
>  
> +	/* This is the partially failed retrans value for the transport
> +	 * and will be initialized from the assocs value.  This can be changed
> +	 * using the SCTP_PEER_ADDR_THLDS socket option
> +	 */
> +	int pf_retrans;
>  	/* PMTU	      : The current known path MTU.  */
>  	__u32 pathmtu;
>  
> @@ -1660,6 +1672,12 @@ struct sctp_association {
>  	 */
>  	int max_retrans;
>  
> +	/* This is the partially failed retrans value for the transport
> +	 * and will be initialized from the assocs value.  This can be
> +	 * changed using the SCTP_PEER_ADDR_THLDS socket option
> +	 */
> +	int pf_retrans;
> +
>  	/* Maximum number of times the endpoint will retransmit INIT  */
>  	__u16 max_init_attempts;
>  
> diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
> index 0842ef0..1b02d7a 100644
> --- a/include/net/sctp/user.h
> +++ b/include/net/sctp/user.h
> @@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
>  #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
>  #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
>  #define SCTP_AUTO_ASCONF       30
> +#define SCTP_PEER_ADDR_THLDS	31
>  
>  /* Internal Socket Options. Some of the sctp library functions are
>   * implemented using these socket options.
> @@ -649,6 +650,7 @@ struct sctp_paddrinfo {
>   */
>  enum sctp_spinfo_state {
>  	SCTP_INACTIVE,
> +	SCTP_PF,
>  	SCTP_ACTIVE,
>  	SCTP_UNCONFIRMED,
>  	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
> @@ -741,4 +743,13 @@ typedef struct {
>  	int sd;
>  } sctp_peeloff_arg_t;
>  
> +/*
> + *  Peer Address Thresholds socket option
> + */
> +struct sctp_paddrthlds {
> +	sctp_assoc_t spt_assoc_id;
> +	struct sockaddr_storage spt_address;
> +	__u16 spt_pathmaxrxt;
> +	__u16 spt_pathpfthld;
> +};
>  #endif /* __net_sctp_user_h__ */
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 5bc9ab1..90fe36b 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
>  	 * socket values.
>  	 */
>  	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
> +	asoc->pf_retrans  = sctp_pf_retrans;
> +
>  	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
>  	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
>  	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
> @@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
>  	/* Set the path max_retrans.  */
>  	peer->pathmaxrxt = asoc->pathmaxrxt;
>  
> +	/* And the partial failure retrnas threshold */
> +	peer->pf_retrans = asoc->pf_retrans;
> +
>  	/* Initialize the peer's SACK delay timeout based on the
>  	 * association configured value.
>  	 */
> @@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  	struct sctp_ulpevent *event;
>  	struct sockaddr_storage addr;
>  	int spc_state = 0;
> +	bool ulp_notify = true;
>  
>  	/* Record the transition on the transport.  */
>  	switch (command) {
> @@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  			spc_state = SCTP_ADDR_CONFIRMED;
>  		else
>  			spc_state = SCTP_ADDR_AVAILABLE;
> +		/* Don't inform ULP about transition from PF to
> +		 * active state and set cwnd to 1, see SCTP
> +		 * Quick failover draft section 5.1, point 5
> +		 */
> +		if (transport->state == SCTP_PF) {
> +			ulp_notify = false;
> +			transport->cwnd = 1;
> +		}
>  		transport->state = SCTP_ACTIVE;
>  		break;
>  
> @@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  		spc_state = SCTP_ADDR_UNREACHABLE;
>  		break;
>  
> +	case SCTP_TRANSPORT_PF:
> +		transport->state = SCTP_PF;
> +		ulp_notify = false;
> +		break;
> +
>  	default:
>  		return;
>  	}
> @@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
>  	 * user.
>  	 */
> -	memset(&addr, 0, sizeof(struct sockaddr_storage));
> -	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
> -	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> -				0, spc_state, error, GFP_ATOMIC);
> -	if (event)
> -		sctp_ulpq_tail_event(&asoc->ulpq, event);
> +	if (ulp_notify) {
> +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> +		memcpy(&addr, &transport->ipaddr,
> +		       transport->af_specific->sockaddr_len);
> +		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> +					0, spc_state, error, GFP_ATOMIC);
> +		if (event)
> +			sctp_ulpq_tail_event(&asoc->ulpq, event);
> +	}
>  
>  	/* Select new active and retran paths. */
>  
> @@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  			transports) {
>  
>  		if ((t->state == SCTP_INACTIVE) ||
> -		    (t->state == SCTP_UNCONFIRMED))
> +		    (t->state == SCTP_UNCONFIRMED) ||
> +		    (t->state == SCTP_PF))
>  			continue;
>  		if (!first || t->last_time_heard > first->last_time_heard) {
>  			second = first;
> diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
> index a0fa19f..e7aa177c 100644
> --- a/net/sctp/outqueue.c
> +++ b/net/sctp/outqueue.c
> @@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
>  			if (!new_transport)
>  				new_transport = asoc->peer.active_path;
>  		} else if ((new_transport->state == SCTP_INACTIVE) ||
> -			   (new_transport->state == SCTP_UNCONFIRMED)) {
> +			   (new_transport->state == SCTP_UNCONFIRMED) ||
> +			   (new_transport->state == SCTP_PF)) {
>  			/* If the chunk is Heartbeat or Heartbeat Ack,
>  			 * send it to chunk->transport, even if it's
>  			 * inactive.
> @@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
>  			new_transport = chunk->transport;
>  			if (!new_transport ||
>  			    ((new_transport->state == SCTP_INACTIVE) ||
> -			     (new_transport->state == SCTP_UNCONFIRMED)))
> +			     (new_transport->state == SCTP_UNCONFIRMED) ||
> +			     (new_transport->state == SCTP_PF)))
>  				new_transport = asoc->peer.active_path;
>  			if (new_transport->state == SCTP_UNCONFIRMED)
>  				continue;
> diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> index c96d1a8..285e26a 100644
> --- a/net/sctp/sm_sideeffect.c
> +++ b/net/sctp/sm_sideeffect.c
> @@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
>  			     sctp_cmd_seq_t *commands,
>  			     gfp_t gfp);
>  
> +static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
> +				     struct sctp_transport *t);
>  /********************************************************************
>   * Helper functions
>   ********************************************************************/
> @@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
>   * notification SHOULD be sent to the upper layer.
>   *
>   */
> -static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
> +static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
> +					 struct sctp_association *asoc,
>  					 struct sctp_transport *transport,
>  					 int is_hb)
>  {
> @@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
>  			transport->error_count++;
>  	}
>  
> +	/* If the transport error count is greater than the pf_retrans
> +	 * threshold, and less than pathmaxrtx, then mark this transport
> +	 * as Partially Failed, ee SCTP Quick Failover Draft, secon 5.1,
> +	 * point 1
> +	 */
> +	if ((transport->state != SCTP_PF) &&
> +	   (asoc->pf_retrans < transport->pathmaxrxt) &&
> +	   (transport->error_count > asoc->pf_retrans)) {
> +
> +		sctp_assoc_control_transport(asoc, transport,
> +					     SCTP_TRANSPORT_PF,
> +					     0);
> +
> +		/* Update the hb timer to resend a heartbeat every rto */
> +		sctp_cmd_hb_timer_update(commands, transport);
> +	}
> +
>  	if (transport->state != SCTP_INACTIVE &&
>  	    (transport->error_count > transport->pathmaxrxt)) {
>  		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
> @@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
>  					     SCTP_HEARTBEAT_SUCCESS);
>  	}
>  
> +	if (t->state == SCTP_PF)
> +		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
> +					     SCTP_HEARTBEAT_SUCCESS);
> +
>  	/* The receiver of the HEARTBEAT ACK should also perform an
>  	 * RTT measurement for that destination transport address
>  	 * using the time value carried in the HEARTBEAT ACK chunk.
> @@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
>  
>  		case SCTP_CMD_STRIKE:
>  			/* Mark one strike against a transport.  */
> -			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
> -						    0);
> +			sctp_do_8_2_transport_strike(commands, asoc,
> +						    cmd->obj.transport, 0);
>  			break;
>  
>  		case SCTP_CMD_TRANSPORT_IDLE:
> @@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
>  
>  		case SCTP_CMD_TRANSPORT_HB_SENT:
>  			t = cmd->obj.transport;
> -			sctp_do_8_2_transport_strike(asoc, t, 1);
> +			sctp_do_8_2_transport_strike(commands, asoc,
> +						     t, 1);
>  			t->hb_sent = 1;
>  			break;
>  
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index b3b8a8d..bba551f 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
>  }
>  
>  
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to alter the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    unsigned int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> +			   sizeof(struct sctp_paddrthlds)))
> +		return -EFAULT;
> +
> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> +				    transports) {
> +			if (val.spt_pathmaxrxt)
> +				trans->pathmaxrxt = val.spt_pathmaxrxt;
> +			trans->pf_retrans = val.spt_pathpfthld;
> +		}
> +
> +		if (val.spt_pathmaxrxt)
> +			asoc->pathmaxrxt = val.spt_pathmaxrxt;
> +		asoc->pf_retrans = val.spt_pathpfthld;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		if (val.spt_pathmaxrxt)
> +			trans->pathmaxrxt = val.spt_pathmaxrxt;
> +		trans->pf_retrans = val.spt_pathpfthld;
> +	}
> +
> +	return 0;
> +}
> +
>  /* API 6.2 setsockopt(), getsockopt()
>   *
>   * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
>  	case SCTP_AUTO_ASCONF:
>  		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
>  		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> +		break;
>  	default:
>  		retval = -ENOPROTOOPT;
>  		break;
> @@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
>  	return 0;
>  }
>  
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to fetch the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	optlen = sizeof(struct sctp_paddrthlds);
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> +		return -EFAULT;
> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +
> +		val.spt_pathpfthld = asoc->pf_retrans;
> +		val.spt_pathmaxrxt = asoc->pathmaxrxt;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		val.spt_pathmaxrxt = trans->pathmaxrxt;
> +		val.spt_pathpfthld = trans->pf_retrans;
> +	}
> +
> +	if (copy_to_user(optval, &val, optlen))
> +		return -EFAULT;
> +
> +	return optlen;
> +}
> +
>  SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>  				char __user *optval, int __user *optlen)
>  {
> @@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>  	case SCTP_AUTO_ASCONF:
>  		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
>  		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
> +		break;
>  	default:
>  		retval = -ENOPROTOOPT;
>  		break;
> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> index e5fe639..2b2bfe9 100644
> --- a/net/sctp/sysctl.c
> +++ b/net/sctp/sysctl.c
> @@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
>  		.extra2		= &int_max
>  	},
>  	{
> +		.procname	= "pf_retrans",
> +		.data		= &sctp_pf_retrans,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &int_max
> +	},
> +	{
>  		.procname	= "max_init_retransmits",
>  		.data		= &sctp_max_retrans_init,
>  		.maxlen		= sizeof(int),
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index b026ba0..194d0f3 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
>  
>  	/* Initialize the default path max_retrans.  */
>  	peer->pathmaxrxt  = sctp_max_retrans_path;
> +	peer->pf_retrans  = sctp_pf_retrans;
>  
>  	INIT_LIST_HEAD(&peer->transmitted);
>  	INIT_LIST_HEAD(&peer->send_ready);
> @@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
>  {
>  	unsigned long timeout;
>  	timeout = t->rto + sctp_jitter(t->rto);
> -	if (t->state != SCTP_UNCONFIRMED)
> +	if ((t->state != SCTP_UNCONFIRMED) &&
> +	    (t->state != SCTP_PF))
>  		timeout += t->hbinterval;
>  	timeout += jiffies;
>  	return timeout;

Reviewed-by: Flavio Leitner <fbl@redhat.com>

fbl

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
@ 2012-07-20 19:10     ` Flavio Leitner
  0 siblings, 0 replies; 48+ messages in thread
From: Flavio Leitner @ 2012-07-20 19:10 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

On Fri, 20 Jul 2012 14:51:59 -0400
Neil Horman <nhorman@tuxdriver.com> wrote:

> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.
> 
> Instead, lets implement the new ietf quick failover draft found here:
> http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> 
> This will let the sctp stack identify transports that have had a small number of
> errors, and avoid using them quickly until their reliability can be
> re-established.  I've tested this out on two virt guests connected via multiple
> isolated virt networks and believe its in compliance with the above draft and
> works well.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Vlad Yasevich <vyasevich@gmail.com>
> CC: Sridhar Samudrala <sri@us.ibm.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: linux-sctp@vger.kernel.org
> CC: joe@perches.com
> 
> ---
> Change notes:
> 
> V2)
> - Added socket option API from section 6.1 of the specification, as per
> request from Vlad. Adding this socket option allows us to alter both the path
> maximum retransmit value and the path partial failure threshold for each
> transport and the association as a whole.
> 
> - Added a per transport pf_retrans value, and initialized it from the
> association value.  This makes each transport independently configurable as per
> the socket option above, and prevents changes in the sysctl from bleeding into
> an already created association.
> 
> V3)
> - Cleaned up some line spacing (Joe Perches)
> - Fixed some socket option user data sanitization (Vlad Yasevich)
> 
> V4)
> - Added additional documentation (Flavio Leitner)
> 
> V5)
> - Modified setsockopt option to ignore 0 pathmaxrxt rather than return
>   error (Vlad Yasevich)
> - Modified getsockopt to return option length written (Vlad Y.)
> ---
>  Documentation/networking/ip-sysctl.txt |   14 +++++
>  include/net/sctp/constants.h           |    1 +
>  include/net/sctp/structs.h             |   20 ++++++-
>  include/net/sctp/user.h                |   11 ++++
>  net/sctp/associola.c                   |   37 ++++++++++--
>  net/sctp/outqueue.c                    |    6 +-
>  net/sctp/sm_sideeffect.c               |   33 +++++++++-
>  net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
>  net/sctp/sysctl.c                      |    9 +++
>  net/sctp/transport.c                   |    4 +-
>  10 files changed, 220 insertions(+), 15 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index 47b6c79..c636f9c 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
>  
>  	Default: 5
>  
> +pf_retrans - INTEGER
> +	The number of retransmissions that will be attempted on a given path
> +	before traffic is redirected to an alternate transport (should one
> +	exist).  Note this is distinct from path_max_retrans, as a path that
> +	passes the pf_retrans threshold can still be used.  It's only
> +	deprioritized when a transmission path is selected by the stack.  This
> +	setting is primarily used to enable fast failover mechanisms without
> +	having to reduce path_max_retrans to a very low value.  See:
> +	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> +	for details.  Note also that a value of pf_retrans > path_max_retrans
> +	disables this feature
> +
> +	Default: 0
> +
>  rto_initial - INTEGER
>  	The initial round trip timeout value in milliseconds that will be used
>  	in calculating round trip times.  This is the initial time interval
> diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
> index 942b864..d053d2e 100644
> --- a/include/net/sctp/constants.h
> +++ b/include/net/sctp/constants.h
> @@ -334,6 +334,7 @@ typedef enum {
>  typedef enum {
>  	SCTP_TRANSPORT_UP,
>  	SCTP_TRANSPORT_DOWN,
> +	SCTP_TRANSPORT_PF,
>  } sctp_transport_cmd_t;
>  
>  /* These are the address scopes defined mainly for IPv4 addresses
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index e4652fe..cee0678 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -161,6 +161,12 @@ extern struct sctp_globals {
>  	int max_retrans_path;
>  	int max_retrans_init;
>  
> +	/* Potentially-Failed.Max.Retrans sysctl value
> +	 * taken from:
> +	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
> +	 */
> +	int pf_retrans;
> +
>  	/*
>  	 * Policy for preforming sctp/socket accounting
>  	 * 0   - do socket level accounting, all assocs share sk_sndbuf
> @@ -258,6 +264,7 @@ extern struct sctp_globals {
>  #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
>  #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
>  #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
> +#define sctp_pf_retrans			(sctp_globals.pf_retrans)
>  #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
>  #define sctp_sack_timeout		(sctp_globals.sack_timeout)
>  #define sctp_hb_interval		(sctp_globals.hb_interval)
> @@ -987,10 +994,15 @@ struct sctp_transport {
>  
>  	/* This is the max_retrans value for the transport and will
>  	 * be initialized from the assocs value.  This can be changed
> -	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
> +	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
>  	 */
>  	__u16 pathmaxrxt;
>  
> +	/* This is the partially failed retrans value for the transport
> +	 * and will be initialized from the assocs value.  This can be changed
> +	 * using the SCTP_PEER_ADDR_THLDS socket option
> +	 */
> +	int pf_retrans;
>  	/* PMTU	      : The current known path MTU.  */
>  	__u32 pathmtu;
>  
> @@ -1660,6 +1672,12 @@ struct sctp_association {
>  	 */
>  	int max_retrans;
>  
> +	/* This is the partially failed retrans value for the transport
> +	 * and will be initialized from the assocs value.  This can be
> +	 * changed using the SCTP_PEER_ADDR_THLDS socket option
> +	 */
> +	int pf_retrans;
> +
>  	/* Maximum number of times the endpoint will retransmit INIT  */
>  	__u16 max_init_attempts;
>  
> diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
> index 0842ef0..1b02d7a 100644
> --- a/include/net/sctp/user.h
> +++ b/include/net/sctp/user.h
> @@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
>  #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
>  #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
>  #define SCTP_AUTO_ASCONF       30
> +#define SCTP_PEER_ADDR_THLDS	31
>  
>  /* Internal Socket Options. Some of the sctp library functions are
>   * implemented using these socket options.
> @@ -649,6 +650,7 @@ struct sctp_paddrinfo {
>   */
>  enum sctp_spinfo_state {
>  	SCTP_INACTIVE,
> +	SCTP_PF,
>  	SCTP_ACTIVE,
>  	SCTP_UNCONFIRMED,
>  	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
> @@ -741,4 +743,13 @@ typedef struct {
>  	int sd;
>  } sctp_peeloff_arg_t;
>  
> +/*
> + *  Peer Address Thresholds socket option
> + */
> +struct sctp_paddrthlds {
> +	sctp_assoc_t spt_assoc_id;
> +	struct sockaddr_storage spt_address;
> +	__u16 spt_pathmaxrxt;
> +	__u16 spt_pathpfthld;
> +};
>  #endif /* __net_sctp_user_h__ */
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 5bc9ab1..90fe36b 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
>  	 * socket values.
>  	 */
>  	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
> +	asoc->pf_retrans  = sctp_pf_retrans;
> +
>  	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
>  	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
>  	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
> @@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
>  	/* Set the path max_retrans.  */
>  	peer->pathmaxrxt = asoc->pathmaxrxt;
>  
> +	/* And the partial failure retrans threshold */
> +	peer->pf_retrans = asoc->pf_retrans;
> +
>  	/* Initialize the peer's SACK delay timeout based on the
>  	 * association configured value.
>  	 */
> @@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  	struct sctp_ulpevent *event;
>  	struct sockaddr_storage addr;
>  	int spc_state = 0;
> +	bool ulp_notify = true;
>  
>  	/* Record the transition on the transport.  */
>  	switch (command) {
> @@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  			spc_state = SCTP_ADDR_CONFIRMED;
>  		else
>  			spc_state = SCTP_ADDR_AVAILABLE;
> +		/* Don't inform ULP about transition from PF to
> +		 * active state and set cwnd to 1, see SCTP
> +		 * Quick failover draft section 5.1, point 5
> +		 */
> +		if (transport->state == SCTP_PF) {
> +			ulp_notify = false;
> +			transport->cwnd = 1;
> +		}
>  		transport->state = SCTP_ACTIVE;
>  		break;
>  
> @@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  		spc_state = SCTP_ADDR_UNREACHABLE;
>  		break;
>  
> +	case SCTP_TRANSPORT_PF:
> +		transport->state = SCTP_PF;
> +		ulp_notify = false;
> +		break;
> +
>  	default:
>  		return;
>  	}
> @@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
>  	 * user.
>  	 */
> -	memset(&addr, 0, sizeof(struct sockaddr_storage));
> -	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
> -	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> -				0, spc_state, error, GFP_ATOMIC);
> -	if (event)
> -		sctp_ulpq_tail_event(&asoc->ulpq, event);
> +	if (ulp_notify) {
> +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> +		memcpy(&addr, &transport->ipaddr,
> +		       transport->af_specific->sockaddr_len);
> +		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> +					0, spc_state, error, GFP_ATOMIC);
> +		if (event)
> +			sctp_ulpq_tail_event(&asoc->ulpq, event);
> +	}
>  
>  	/* Select new active and retran paths. */
>  
> @@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  			transports) {
>  
>  		if ((t->state == SCTP_INACTIVE) ||
> -		    (t->state == SCTP_UNCONFIRMED))
> +		    (t->state == SCTP_UNCONFIRMED) ||
> +		    (t->state == SCTP_PF))
>  			continue;
>  		if (!first || t->last_time_heard > first->last_time_heard) {
>  			second = first;
> diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
> index a0fa19f..e7aa177c 100644
> --- a/net/sctp/outqueue.c
> +++ b/net/sctp/outqueue.c
> @@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
>  			if (!new_transport)
>  				new_transport = asoc->peer.active_path;
>  		} else if ((new_transport->state == SCTP_INACTIVE) ||
> -			   (new_transport->state == SCTP_UNCONFIRMED)) {
> +			   (new_transport->state == SCTP_UNCONFIRMED) ||
> +			   (new_transport->state == SCTP_PF)) {
>  			/* If the chunk is Heartbeat or Heartbeat Ack,
>  			 * send it to chunk->transport, even if it's
>  			 * inactive.
> @@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
>  			new_transport = chunk->transport;
>  			if (!new_transport ||
>  			    ((new_transport->state == SCTP_INACTIVE) ||
> -			     (new_transport->state == SCTP_UNCONFIRMED)))
> +			     (new_transport->state == SCTP_UNCONFIRMED) ||
> +			     (new_transport->state == SCTP_PF)))
>  				new_transport = asoc->peer.active_path;
>  			if (new_transport->state == SCTP_UNCONFIRMED)
>  				continue;
> diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> index c96d1a8..285e26a 100644
> --- a/net/sctp/sm_sideeffect.c
> +++ b/net/sctp/sm_sideeffect.c
> @@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
>  			     sctp_cmd_seq_t *commands,
>  			     gfp_t gfp);
>  
> +static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
> +				     struct sctp_transport *t);
>  /********************************************************************
>   * Helper functions
>   ********************************************************************/
> @@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
>   * notification SHOULD be sent to the upper layer.
>   *
>   */
> -static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
> +static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
> +					 struct sctp_association *asoc,
>  					 struct sctp_transport *transport,
>  					 int is_hb)
>  {
> @@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
>  			transport->error_count++;
>  	}
>  
> +	/* If the transport error count is greater than the pf_retrans
> +	 * threshold, and less than pathmaxrxt, then mark this transport
> +	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
> +	 * point 1
> +	 */
> +	if ((transport->state != SCTP_PF) &&
> +	   (asoc->pf_retrans < transport->pathmaxrxt) &&
> +	   (transport->error_count > asoc->pf_retrans)) {
> +
> +		sctp_assoc_control_transport(asoc, transport,
> +					     SCTP_TRANSPORT_PF,
> +					     0);
> +
> +		/* Update the hb timer to resend a heartbeat every rto */
> +		sctp_cmd_hb_timer_update(commands, transport);
> +	}
> +
>  	if (transport->state != SCTP_INACTIVE &&
>  	    (transport->error_count > transport->pathmaxrxt)) {
>  		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
> @@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
>  					     SCTP_HEARTBEAT_SUCCESS);
>  	}
>  
> +	if (t->state == SCTP_PF)
> +		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
> +					     SCTP_HEARTBEAT_SUCCESS);
> +
>  	/* The receiver of the HEARTBEAT ACK should also perform an
>  	 * RTT measurement for that destination transport address
>  	 * using the time value carried in the HEARTBEAT ACK chunk.
> @@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
>  
>  		case SCTP_CMD_STRIKE:
>  			/* Mark one strike against a transport.  */
> -			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
> -						    0);
> +			sctp_do_8_2_transport_strike(commands, asoc,
> +						    cmd->obj.transport, 0);
>  			break;
>  
>  		case SCTP_CMD_TRANSPORT_IDLE:
> @@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
>  
>  		case SCTP_CMD_TRANSPORT_HB_SENT:
>  			t = cmd->obj.transport;
> -			sctp_do_8_2_transport_strike(asoc, t, 1);
> +			sctp_do_8_2_transport_strike(commands, asoc,
> +						     t, 1);
>  			t->hb_sent = 1;
>  			break;
>  
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index b3b8a8d..bba551f 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
>  }
>  
>  
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to alter the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    unsigned int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> +			   sizeof(struct sctp_paddrthlds)))
> +		return -EFAULT;
> +
> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> +				    transports) {
> +			if (val.spt_pathmaxrxt)
> +				trans->pathmaxrxt = val.spt_pathmaxrxt;
> +			trans->pf_retrans = val.spt_pathpfthld;
> +		}
> +
> +		if (val.spt_pathmaxrxt)
> +			asoc->pathmaxrxt = val.spt_pathmaxrxt;
> +		asoc->pf_retrans = val.spt_pathpfthld;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		if (val.spt_pathmaxrxt)
> +			trans->pathmaxrxt = val.spt_pathmaxrxt;
> +		trans->pf_retrans = val.spt_pathpfthld;
> +	}
> +
> +	return 0;
> +}
> +
>  /* API 6.2 setsockopt(), getsockopt()
>   *
>   * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
>  	case SCTP_AUTO_ASCONF:
>  		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
>  		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> +		break;
>  	default:
>  		retval = -ENOPROTOOPT;
>  		break;
> @@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
>  	return 0;
>  }
>  
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to fetch the partially failed threshold for one or all
> + * transports in an association.  See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> +					    char __user *optval,
> +					    int optlen)
> +{
> +	struct sctp_paddrthlds val;
> +	struct sctp_transport *trans;
> +	struct sctp_association *asoc;
> +
> +	if (optlen < sizeof(struct sctp_paddrthlds))
> +		return -EINVAL;
> +	optlen = sizeof(struct sctp_paddrthlds);
> +	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> +		return -EFAULT;
> +
> +	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> +		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> +		if (!asoc)
> +			return -ENOENT;
> +
> +		val.spt_pathpfthld = asoc->pf_retrans;
> +		val.spt_pathmaxrxt = asoc->pathmaxrxt;
> +	} else {
> +		trans = sctp_addr_id2transport(sk, &val.spt_address,
> +					       val.spt_assoc_id);
> +		if (!trans)
> +			return -ENOENT;
> +
> +		val.spt_pathmaxrxt = trans->pathmaxrxt;
> +		val.spt_pathpfthld = trans->pf_retrans;
> +	}
> +
> +	if (copy_to_user(optval, &val, optlen))
> +		return -EFAULT;
> +
> +	return optlen;
> +}
> +
>  SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>  				char __user *optval, int __user *optlen)
>  {
> @@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
>  	case SCTP_AUTO_ASCONF:
>  		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
>  		break;
> +	case SCTP_PEER_ADDR_THLDS:
> +		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
> +		break;
>  	default:
>  		retval = -ENOPROTOOPT;
>  		break;
> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> index e5fe639..2b2bfe9 100644
> --- a/net/sctp/sysctl.c
> +++ b/net/sctp/sysctl.c
> @@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
>  		.extra2		= &int_max
>  	},
>  	{
> +		.procname	= "pf_retrans",
> +		.data		= &sctp_pf_retrans,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &int_max
> +	},
> +	{
>  		.procname	= "max_init_retransmits",
>  		.data		= &sctp_max_retrans_init,
>  		.maxlen		= sizeof(int),
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index b026ba0..194d0f3 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
>  
>  	/* Initialize the default path max_retrans.  */
>  	peer->pathmaxrxt  = sctp_max_retrans_path;
> +	peer->pf_retrans  = sctp_pf_retrans;
>  
>  	INIT_LIST_HEAD(&peer->transmitted);
>  	INIT_LIST_HEAD(&peer->send_ready);
> @@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
>  {
>  	unsigned long timeout;
>  	timeout = t->rto + sctp_jitter(t->rto);
> -	if (t->state != SCTP_UNCONFIRMED)
> +	if ((t->state != SCTP_UNCONFIRMED) &&
> +	    (t->state != SCTP_PF))
>  		timeout += t->hbinterval;
>  	timeout += jiffies;
>  	return timeout;

Reviewed-by: Flavio Leitner <fbl@redhat.com>

fbl

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
  2012-07-20 19:10     ` Flavio Leitner
@ 2012-07-20 19:31       ` David Miller
  -1 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2012-07-20 19:31 UTC (permalink / raw)
  To: fbl; +Cc: nhorman, netdev, vyasevich, sri, linux-sctp, joe


Please DO NOT quote an entire large patch just to add your
signoff.  It's a waste of bandwidth, and folks like me have
to scroll down the entire thing to see if you actually
have real patch feedback or not.

Just quote the commit message or similar.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
@ 2012-07-20 19:31       ` David Miller
  0 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2012-07-20 19:31 UTC (permalink / raw)
  To: fbl; +Cc: nhorman, netdev, vyasevich, sri, linux-sctp, joe


Please DO NOT quote an entire large patch just to add your
signoff.  It's a waste of bandwidth, and folks like me have
to scroll down the entire thing to see if you actually
have real patch feedback or not.

Just quote the commit message or similar.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
  2012-07-20 18:51   ` Neil Horman
@ 2012-07-21  6:45     ` Vlad Yasevich
  -1 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-21  6:45 UTC (permalink / raw)
  To: Neil Horman, netdev; +Cc: Sridhar Samudrala, David S. Miller, linux-sctp, joe

Neil Horman <nhorman@tuxdriver.com> wrote:

>I've seen several attempts recently made to do quick failover of sctp
>transports
>by reducing various retransmit timers and counters.  While its possible
>to
>implement a faster failover on multihomed sctp associations, its not
>particularly robust, in that it can lead to unneeded retransmits, as
>well as
>false connection failures due to intermittent latency on a network.
>
>Instead, lets implement the new ietf quick failover draft found here:
>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
>This will let the sctp stack identify transports that have had a small
>number of
>errors, and avoid using them quickly until their reliability can be
>re-established.  I've tested this out on two virt guests connected via
>multiple
>isolated virt networks and believe its in compliance with the above
>draft and
>works well.
>
>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>CC: Vlad Yasevich <vyasevich@gmail.com>
>CC: Sridhar Samudrala <sri@us.ibm.com>
>CC: "David S. Miller" <davem@davemloft.net>
>CC: linux-sctp@vger.kernel.org
>CC: joe@perches.com
>
>---
>Change notes:
>
>V2)
>- Added socket option API from section 6.1 of the specification, as per
>request from Vlad. Adding this socket option allows us to alter both
>the path
>maximum retransmit value and the path partial failure threshold for
>each
>transport and the association as a whole.
>
>- Added a per transport pf_retrans value, and initialized it from the
>association value.  This makes each transport independently
>configurable as per
>the socket option above, and prevents changes in the sysctl from
>bleeding into
>an already created association.
>
>V3)
>- Cleaned up some line spacing (Joe Perches)
>- Fixed some socket option user data sanitization (Vlad Yasevich)
>
>V4)
>- Added additional documentation (Flavio Leitner)
>
>V5)
>- Modified setsockopt option to ignore 0 pathmaxrxt rather than return
>  error (Vlad Yasevich)
>- Modified getsockopt to return option length written (Vlad Y.)
>---
> Documentation/networking/ip-sysctl.txt |   14 +++++
> include/net/sctp/constants.h           |    1 +
> include/net/sctp/structs.h             |   20 ++++++-
> include/net/sctp/user.h                |   11 ++++
> net/sctp/associola.c                   |   37 ++++++++++--
> net/sctp/outqueue.c                    |    6 +-
> net/sctp/sm_sideeffect.c               |   33 +++++++++-
>net/sctp/socket.c                      |  100
>++++++++++++++++++++++++++++++++
> net/sctp/sysctl.c                      |    9 +++
> net/sctp/transport.c                   |    4 +-
> 10 files changed, 220 insertions(+), 15 deletions(-)
>
>> /* API 6.2 setsockopt(), getsockopt()
>  *
>  * Applications use setsockopt() and getsockopt() to set or retrieve
>@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk,
>int level, int optname,
> 	case SCTP_AUTO_ASCONF:
> 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
> 		break;
>+	case SCTP_PEER_ADDR_THLDS:
>+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
>+		break;
> 	default:
> 		retval = -ENOPROTOOPT;
> 		break;
>@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock
>*sk, int len,
> 	return 0;
> }
> 
>+/*
>+ * SCTP_PEER_ADDR_THLDS
>+ *
>+ * This option allows us to fetch the partially failed threshold for
>one or all
>+ * transports in an association.  See Section 6.1 of:
>+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+ */
>+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
>+					    char __user *optval,
>+					    int optlen)
>+{
>+	struct sctp_paddrthlds val;
>+	struct sctp_transport *trans;
>+	struct sctp_association *asoc;
>+
>+	if (optlen < sizeof(struct sctp_paddrthlds))
>+		return -EINVAL;
>+	optlen = sizeof(struct sctp_paddrthlds);
>+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
>optlen))
>+		return -EFAULT;
>+
>+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
>+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
>+		if (!asoc)
>+			return -ENOENT;
>+
>+		val.spt_pathpfthld = asoc->pf_retrans;
>+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
>+	} else {
>+		trans = sctp_addr_id2transport(sk, &val.spt_address,
>+					       val.spt_assoc_id);
>+		if (!trans)
>+			return -ENOENT;
>+
>+		val.spt_pathmaxrxt = trans->pathmaxrxt;
>+		val.spt_pathpfthld = trans->pf_retrans;
>+	}
>+
>+	if (copy_to_user(optval, &val, optlen))
>+		return -EFAULT;
>+
>+	return optlen;

I don't think you can simply return this.  You have to call put_user() with the value to write it back to the User.  See how other get calls are done.

-Vlad
>+}
>+
>SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int
>optname,
> 				char __user *optval, int __user *optlen)
> {
>@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk,
>int level, int optname,
> 	case SCTP_AUTO_ASCONF:
> 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
> 		break;
>+	case SCTP_PEER_ADDR_THLDS:
>+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
>+		break;
> 	default:
> 		retval = -ENOPROTOOPT;
> 		break;
>diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
>index e5fe639..2b2bfe9 100644
>--- a/net/sctp/sysctl.c
>+++ b/net/sctp/sysctl.c
>@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> 		.extra2		= &int_max
> 	},
> 	{
>+		.procname	= "pf_retrans",
>+		.data		= &sctp_pf_retrans,
>+		.maxlen		= sizeof(int),
>+		.mode		= 0644,
>+		.proc_handler	= proc_dointvec_minmax,
>+		.extra1		= &zero,
>+		.extra2		= &int_max
>+	},
>+	{
> 		.procname	= "max_init_retransmits",
> 		.data		= &sctp_max_retrans_init,
> 		.maxlen		= sizeof(int),
>diff --git a/net/sctp/transport.c b/net/sctp/transport.c
>index b026ba0..194d0f3 100644
>--- a/net/sctp/transport.c
>+++ b/net/sctp/transport.c
>@@ -85,6 +85,7 @@ static struct sctp_transport
>*sctp_transport_init(struct sctp_transport *peer,
> 
> 	/* Initialize the default path max_retrans.  */
> 	peer->pathmaxrxt  = sctp_max_retrans_path;
>+	peer->pf_retrans  = sctp_pf_retrans;
> 
> 	INIT_LIST_HEAD(&peer->transmitted);
> 	INIT_LIST_HEAD(&peer->send_ready);
>@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct
>sctp_transport *t)
> {
> 	unsigned long timeout;
> 	timeout = t->rto + sctp_jitter(t->rto);
>-	if (t->state != SCTP_UNCONFIRMED)
>+	if ((t->state != SCTP_UNCONFIRMED) &&
>+	    (t->state != SCTP_PF))
> 		timeout += t->hbinterval;
> 	timeout += jiffies;
> 	return timeout;
>-- 
>1.7.7.6


-- 
Sent from my Android phone with SkitMail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
@ 2012-07-21  6:45     ` Vlad Yasevich
  0 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-21  6:45 UTC (permalink / raw)
  To: Neil Horman, netdev; +Cc: Sridhar Samudrala, David S. Miller, linux-sctp, joe

Neil Horman <nhorman@tuxdriver.com> wrote:

>I've seen several attempts recently made to do quick failover of sctp
>transports
>by reducing various retransmit timers and counters.  While its possible
>to
>implement a faster failover on multihomed sctp associations, its not
>particularly robust, in that it can lead to unneeded retransmits, as
>well as
>false connection failures due to intermittent latency on a network.
>
>Instead, lets implement the new ietf quick failover draft found here:
>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
>This will let the sctp stack identify transports that have had a small
>number of
>errors, and avoid using them quickly until their reliability can be
>re-established.  I've tested this out on two virt guests connected via
>multiple
>isolated virt networks and believe its in compliance with the above
>draft and
>works well.
>
>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>CC: Vlad Yasevich <vyasevich@gmail.com>
>CC: Sridhar Samudrala <sri@us.ibm.com>
>CC: "David S. Miller" <davem@davemloft.net>
>CC: linux-sctp@vger.kernel.org
>CC: joe@perches.com
>
>---
>Change notes:
>
>V2)
>- Added socket option API from section 6.1 of the specification, as per
>request from Vlad. Adding this socket option allows us to alter both
>the path
>maximum retransmit value and the path partial failure threshold for
>each
>transport and the association as a whole.
>
>- Added a per transport pf_retrans value, and initialized it from the
>association value.  This makes each transport independently
>configurable as per
>the socket option above, and prevents changes in the sysctl from
>bleeding into
>an already created association.
>
>V3)
>- Cleaned up some line spacing (Joe Perches)
>- Fixed some socket option user data sanitization (Vlad Yasevich)
>
>V4)
>- Added additional documentation (Flavio Leitner)
>
>V5)
>- Modified setsockopt option to ignore 0 pathmaxrxt rather than return
>  error (Vlad Yasevich)
>- Modified getsocopt to return option length written (Vlad Y.)
>---
> Documentation/networking/ip-sysctl.txt |   14 +++++
> include/net/sctp/constants.h           |    1 +
> include/net/sctp/structs.h             |   20 ++++++-
> include/net/sctp/user.h                |   11 ++++
> net/sctp/associola.c                   |   37 ++++++++++--
> net/sctp/outqueue.c                    |    6 +-
> net/sctp/sm_sideeffect.c               |   33 +++++++++-
>net/sctp/socket.c                      |  100
>++++++++++++++++++++++++++++++++
> net/sctp/sysctl.c                      |    9 +++
> net/sctp/transport.c                   |    4 +-
> 10 files changed, 220 insertions(+), 15 deletions(-)
>
>> /* API 6.2 setsockopt(), getsockopt()
>  *
>  * Applications use setsockopt() and getsockopt() to set or retrieve
>@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk,
>int level, int optname,
> 	case SCTP_AUTO_ASCONF:
> 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
> 		break;
>+	case SCTP_PEER_ADDR_THLDS:
>+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
>+		break;
> 	default:
> 		retval = -ENOPROTOOPT;
> 		break;
>@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock
>*sk, int len,
> 	return 0;
> }
> 
>+/*
>+ * SCTP_PEER_ADDR_THLDS
>+ *
>+ * This option allows us to fetch the partially failed threshold for
>one or all
>+ * transports in an association.  See Section 6.1 of:
>+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+ */
>+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
>+					    char __user *optval,
>+					    int optlen)
>+{
>+	struct sctp_paddrthlds val;
>+	struct sctp_transport *trans;
>+	struct sctp_association *asoc;
>+
>+	if (optlen < sizeof(struct sctp_paddrthlds))
>+		return -EINVAL;
>+	optlen = sizeof(struct sctp_paddrthlds);
>+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
>optlen))
>+		return -EFAULT;
>+
>+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
>+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
>+		if (!asoc)
>+			return -ENOENT;
>+
>+		val.spt_pathpfthld = asoc->pf_retrans;
>+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
>+	} else {
>+		trans = sctp_addr_id2transport(sk, &val.spt_address,
>+					       val.spt_assoc_id);
>+		if (!trans)
>+			return -ENOENT;
>+
>+		val.spt_pathmaxrxt = trans->pathmaxrxt;
>+		val.spt_pathpfthld = trans->pf_retrans;
>+	}
>+
>+	if (copy_to_user(optval, &val, optlen))
>+		return -EFAULT;
>+
>+	return optlen;

I don't think you can simply return this.  You have to call put_user() with the value to write it back to the User.  See how other get calls are done.

-Vlad
>+}
>+
>SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int
>optname,
> 				char __user *optval, int __user *optlen)
> {
>@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk,
>int level, int optname,
> 	case SCTP_AUTO_ASCONF:
> 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
> 		break;
>+	case SCTP_PEER_ADDR_THLDS:
>+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
>+		break;
> 	default:
> 		retval = -ENOPROTOOPT;
> 		break;
>diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
>index e5fe639..2b2bfe9 100644
>--- a/net/sctp/sysctl.c
>+++ b/net/sctp/sysctl.c
>@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> 		.extra2		= &int_max
> 	},
> 	{
>+		.procname	= "pf_retrans",
>+		.data		= &sctp_pf_retrans,
>+		.maxlen		= sizeof(int),
>+		.mode		= 0644,
>+		.proc_handler	= proc_dointvec_minmax,
>+		.extra1		= &zero,
>+		.extra2		= &int_max
>+	},
>+	{
> 		.procname	= "max_init_retransmits",
> 		.data		= &sctp_max_retrans_init,
> 		.maxlen		= sizeof(int),
>diff --git a/net/sctp/transport.c b/net/sctp/transport.c
>index b026ba0..194d0f3 100644
>--- a/net/sctp/transport.c
>+++ b/net/sctp/transport.c
>@@ -85,6 +85,7 @@ static struct sctp_transport
>*sctp_transport_init(struct sctp_transport *peer,
> 
> 	/* Initialize the default path max_retrans.  */
> 	peer->pathmaxrxt  = sctp_max_retrans_path;
>+	peer->pf_retrans  = sctp_pf_retrans;
> 
> 	INIT_LIST_HEAD(&peer->transmitted);
> 	INIT_LIST_HEAD(&peer->send_ready);
>@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct
>sctp_transport *t)
> {
> 	unsigned long timeout;
> 	timeout = t->rto + sctp_jitter(t->rto);
>-	if (t->state != SCTP_UNCONFIRMED)
>+	if ((t->state != SCTP_UNCONFIRMED) &&
>+	    (t->state != SCTP_PF))
> 		timeout += t->hbinterval;
> 	timeout += jiffies;
> 	return timeout;
>-- 
>1.7.7.6


-- 
Sent from my Android phone with SkitMail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
  2012-07-21  6:45     ` Vlad Yasevich
@ 2012-07-21 11:03       ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-21 11:03 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp, joe

On Sat, Jul 21, 2012 at 02:45:03AM -0400, Vlad Yasevich wrote:
> Neil Horman <nhorman@tuxdriver.com> wrote:
><snip>
> >+
> >+		val.spt_pathmaxrxt = trans->pathmaxrxt;
> >+		val.spt_pathpfthld = trans->pf_retrans;
> >+	}
> >+
> >+	if (copy_to_user(optval, &val, optlen))
> >+		return -EFAULT;
> >+
> >+	return optlen;
> 
> I don't think you can simply return this.  You have to call put_user() with the value to write it back to the User.  See how other get calls are done.
> 
> -Vlad
> 
Yeah, sorry, I had assumed that the put_user for the return code was part of the
common sctp_getsockopt path and didn't go check.  Thanks, I'll respin this later
tonight.
Neil

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
@ 2012-07-21 11:03       ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-21 11:03 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp, joe

On Sat, Jul 21, 2012 at 02:45:03AM -0400, Vlad Yasevich wrote:
> Neil Horman <nhorman@tuxdriver.com> wrote:
><snip>
> >+
> >+		val.spt_pathmaxrxt = trans->pathmaxrxt;
> >+		val.spt_pathpfthld = trans->pf_retrans;
> >+	}
> >+
> >+	if (copy_to_user(optval, &val, optlen))
> >+		return -EFAULT;
> >+
> >+	return optlen;
> 
> I don't think you can simply return this.  You have to call put_user() with the value to write it back to the User.  See how other get calls are done.
> 
> -Vlad
> 
Yeah, sorry, I had assumed that the put_user for the return code was part of the
common sctp_getsockopt path and didn't go check.  Thanks, I'll respin this later
tonight.
Neil


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH v6] sctp: Implement quick failover draft from tsvwg
  2012-07-13 18:26 ` Neil Horman
@ 2012-07-21 17:56   ` Neil Horman
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-21 17:56 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations that way, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)

V4)
- Added additional documentation (Flavio Leitner)

V5)
- Modified setsockopt option to ignore 0 pathmaxrxt rather than return
  error (Vlad Yasevich)
- Modified getsockopt to return option length written (Vlad Y.)

V6)
- Fixed stupid mistake about returning optlen incorrectly; getsockopt now
  writes the length back with put_user() instead of returning it (Vlad Y)
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   20 ++++++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   37 +++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 +++++++++-
 net/sctp/socket.c                      |  101 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 221 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans > path_max_retrans
+	disables this feature.
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..cee0678 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -161,6 +161,12 @@ extern struct sctp_globals {
 	int max_retrans_path;
 	int max_retrans_init;
 
+	/* Potentially-Failed.Max.Retrans sysctl value
+	 * taken from:
+	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
+	 */
+	int pf_retrans;
+
 	/*
 	 * Policy for preforming sctp/socket accounting
 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
@@ -258,6 +264,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +994,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1672,12 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be
+	 * changed using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..90fe36b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
+
 	default:
 		return;
 	}
@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed; see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..8665b81 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			if (val.spt_pathmaxrxt)
+				trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		if (val.spt_pathmaxrxt)
+			asoc->pathmaxrxt = val.spt_pathmaxrxt;
+		asoc->pf_retrans = val.spt_pathpfthld;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		if (val.spt_pathmaxrxt)
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5543,51 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int len,
+					    int __user *optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (len < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	len = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5726,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH v6] sctp: Implement quick failover draft from tsvwg
@ 2012-07-21 17:56   ` Neil Horman
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman @ 2012-07-21 17:56 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp, joe

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations that way, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)

V4)
- Added additional documentation (Flavio Leitner)

V5)
- Modified setsockopt option to ignore 0 pathmaxrxt rather than return
  error (Vlad Yasevich)
- Modified getsockopt to return option length written (Vlad Y.)

V6)
- Fixed stupid mistake about returning optlen incorrectly; getsockopt now
  writes the length back with put_user() instead of returning it (Vlad Y)
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   20 ++++++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   37 +++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 +++++++++-
 net/sctp/socket.c                      |  101 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 221 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  It's only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans > path_max_retrans
+	disables this feature
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..cee0678 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -161,6 +161,12 @@ extern struct sctp_globals {
 	int max_retrans_path;
 	int max_retrans_init;
 
+	/* Potentially-Failed.Max.Retrans sysctl value
+	 * taken from:
+	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
+	 */
+	int pf_retrans;
+
 	/*
 	 * Policy for preforming sctp/socket accounting
 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
@@ -258,6 +264,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +994,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1672,12 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	/* This is the partially failed retrans value for the association.
+	 * It is initialized from the pf_retrans sysctl and copied to each new
+	 * transport.  It can be changed using the SCTP_PEER_ADDR_THLDS option
+	 */
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..90fe36b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrans threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
+
 	default:
 		return;
 	}
@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrxt, then mark this transport
+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..8665b81 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			if (val.spt_pathmaxrxt)
+				trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		if (val.spt_pathmaxrxt)
+			asoc->pathmaxrxt = val.spt_pathmaxrxt;
+		asoc->pf_retrans = val.spt_pathpfthld;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		if (val.spt_pathmaxrxt)
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5543,51 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int len,
+					    int __user *optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (len < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	len = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5726,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH v6] sctp: Implement quick failover draft from tsvwg
  2012-07-21 17:56   ` Neil Horman
@ 2012-07-22 18:18     ` Vlad Yasevich
  -1 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-22 18:18 UTC (permalink / raw)
  To: Neil Horman, netdev; +Cc: Sridhar Samudrala, David S. Miller, linux-sctp, joe

Neil Horman <nhorman@tuxdriver.com> wrote:

>I've seen several attempts recently made to do quick failover of sctp
>transports
>by reducing various retransmit timers and counters.  While its possible
>to
>implement a faster failover on multihomed sctp associations, its not
>particularly robust, in that it can lead to unneeded retransmits, as
>well as
>false connection failures due to intermittent latency on a network.
>
>Instead, lets implement the new ietf quick failover draft found here:
>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
>This will let the sctp stack identify transports that have had a small
>number of
>errors, and avoid using them quickly until their reliability can be
>re-established.  I've tested this out on two virt guests connected via
>multiple
>isolated virt networks and believe its in compliance with the above
>draft and
>works well.
>
>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>CC: Vlad Yasevich <vyasevich@gmail.com>
>CC: Sridhar Samudrala <sri@us.ibm.com>
>CC: "David S. Miller" <davem@davemloft.net>
>CC: linux-sctp@vger.kernel.org
>CC: joe@perches.com
>

Acked-by: Vlad Yasevich <vyasevich@gmail.com>

-vlad

>---
>Change notes:
>
>V2)
>- Added socket option API from section 6.1 of the specification, as per
>request from Vlad. Adding this socket option allows us to alter both
>the path
>maximum retransmit value and the path partial failure threshold for
>each
>transport and the association as a whole.
>
>- Added a per transport pf_retrans value, and initialized it from the
>association value.  This makes each transport independently
>configurable as per
>the socket option above, and prevents changes in the sysctl from
>bleeding into
>an already created association.
>
>V3)
>- Cleaned up some line spacing (Joe Perches)
>- Fixed some socket option user data sanitization (Vlad Yasevich)
>
>V4)
>- Added additional documentation (Flavio Leitner)
>
>V5)
>- Modified setsockopt option to ignore 0 pathmaxrxt rather than return
>  error (Vlad Yasevich)
>- Modified getsockopt to return option length written (Vlad Y.)
>
>V6)
>- Fixed stupid mistake about returning optval incorrectly (Vlad Y)
>---
> Documentation/networking/ip-sysctl.txt |   14 +++++
> include/net/sctp/constants.h           |    1 +
> include/net/sctp/structs.h             |   20 ++++++-
> include/net/sctp/user.h                |   11 ++++
> net/sctp/associola.c                   |   37 +++++++++--
> net/sctp/outqueue.c                    |    6 +-
> net/sctp/sm_sideeffect.c               |   33 +++++++++-
>net/sctp/socket.c                      |  101
>++++++++++++++++++++++++++++++++
> net/sctp/sysctl.c                      |    9 +++
> net/sctp/transport.c                   |    4 +-
> 10 files changed, 221 insertions(+), 15 deletions(-)
>
>diff --git a/Documentation/networking/ip-sysctl.txt
>b/Documentation/networking/ip-sysctl.txt
>index 47b6c79..c636f9c 100644
>--- a/Documentation/networking/ip-sysctl.txt
>+++ b/Documentation/networking/ip-sysctl.txt
>@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
> 
> 	Default: 5
> 
>+pf_retrans - INTEGER
>+	The number of retransmissions that will be attempted on a given path
>+	before traffic is redirected to an alternate transport (should one
>+	exist).  Note this is distinct from path_max_retrans, as a path that
>+	passes the pf_retrans threshold can still be used.  Its only
>+	deprioritized when a transmission path is selected by the stack. 
>This
>+	setting is primarily used to enable fast failover mechanisms without
>+	having to reduce path_max_retrans to a very low value.  See:
>+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+	for details.  Note also that a value of pf_retrans > path_max_retrans
>+	disables this feature
>+
>+	Default: 0
>+
> rto_initial - INTEGER
>	The initial round trip timeout value in milliseconds that will be used
> 	in calculating round trip times.  This is the initial time interval
>diff --git a/include/net/sctp/constants.h
>b/include/net/sctp/constants.h
>index 942b864..d053d2e 100644
>--- a/include/net/sctp/constants.h
>+++ b/include/net/sctp/constants.h
>@@ -334,6 +334,7 @@ typedef enum {
> typedef enum {
> 	SCTP_TRANSPORT_UP,
> 	SCTP_TRANSPORT_DOWN,
>+	SCTP_TRANSPORT_PF,
> } sctp_transport_cmd_t;
> 
> /* These are the address scopes defined mainly for IPv4 addresses
>diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
>index e4652fe..cee0678 100644
>--- a/include/net/sctp/structs.h
>+++ b/include/net/sctp/structs.h
>@@ -161,6 +161,12 @@ extern struct sctp_globals {
> 	int max_retrans_path;
> 	int max_retrans_init;
> 
>+	/* Potentially-Failed.Max.Retrans sysctl value
>+	 * taken from:
>+	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>+	 */
>+	int pf_retrans;
>+
> 	/*
> 	 * Policy for preforming sctp/socket accounting
> 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
>@@ -258,6 +264,7 @@ extern struct sctp_globals {
> #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
> #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
> #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
>+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
> #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
> #define sctp_sack_timeout		(sctp_globals.sack_timeout)
> #define sctp_hb_interval		(sctp_globals.hb_interval)
>@@ -987,10 +994,15 @@ struct sctp_transport {
> 
> 	/* This is the max_retrans value for the transport and will
> 	 * be initialized from the assocs value.  This can be changed
>-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
>+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
> 	 */
> 	__u16 pathmaxrxt;
> 
>+	/* This is the partially failed retrans value for the transport
>+	 * and will be initialized from the assocs value.  This can be
>changed
>+	 * using the SCTP_PEER_ADDR_THLDS socket option
>+	 */
>+	int pf_retrans;
> 	/* PMTU	      : The current known path MTU.  */
> 	__u32 pathmtu;
> 
>@@ -1660,6 +1672,12 @@ struct sctp_association {
> 	 */
> 	int max_retrans;
> 
>+	/* This is the partially failed retrans value for the transport
>+	 * and will be initialized from the assocs value.  This can be
>+	 * changed using the SCTP_PEER_ADDR_THLDS socket option
>+	 */
>+	int pf_retrans;
>+
> 	/* Maximum number of times the endpoint will retransmit INIT  */
> 	__u16 max_init_attempts;
> 
>diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
>index 0842ef0..1b02d7a 100644
>--- a/include/net/sctp/user.h
>+++ b/include/net/sctp/user.h
>@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
> #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
> #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
> #define SCTP_AUTO_ASCONF       30
>+#define SCTP_PEER_ADDR_THLDS	31
> 
> /* Internal Socket Options. Some of the sctp library functions are
>  * implemented using these socket options.
>@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
>  */
> enum sctp_spinfo_state {
> 	SCTP_INACTIVE,
>+	SCTP_PF,
> 	SCTP_ACTIVE,
> 	SCTP_UNCONFIRMED,
> 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
>@@ -741,4 +743,13 @@ typedef struct {
> 	int sd;
> } sctp_peeloff_arg_t;
> 
>+/*
>+ *  Peer Address Thresholds socket option
>+ */
>+struct sctp_paddrthlds {
>+	sctp_assoc_t spt_assoc_id;
>+	struct sockaddr_storage spt_address;
>+	__u16 spt_pathmaxrxt;
>+	__u16 spt_pathpfthld;
>+};
> #endif /* __net_sctp_user_h__ */
>diff --git a/net/sctp/associola.c b/net/sctp/associola.c
>index 5bc9ab1..90fe36b 100644
>--- a/net/sctp/associola.c
>+++ b/net/sctp/associola.c
>@@ -124,6 +124,8 @@ static struct sctp_association
>*sctp_association_init(struct sctp_association *a
> 	 * socket values.
> 	 */
> 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
>+	asoc->pf_retrans  = sctp_pf_retrans;
>+
> 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
> 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
> 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
>@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct
>sctp_association *asoc,
> 	/* Set the path max_retrans.  */
> 	peer->pathmaxrxt = asoc->pathmaxrxt;
> 
>+	/* And the partial failure retrans threshold */
>+	peer->pf_retrans = asoc->pf_retrans;
>+
> 	/* Initialize the peer's SACK delay timeout based on the
> 	 * association configured value.
> 	 */
>@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 	struct sctp_ulpevent *event;
> 	struct sockaddr_storage addr;
> 	int spc_state = 0;
>+	bool ulp_notify = true;
> 
> 	/* Record the transition on the transport.  */
> 	switch (command) {
>@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 			spc_state = SCTP_ADDR_CONFIRMED;
> 		else
> 			spc_state = SCTP_ADDR_AVAILABLE;
>+		/* Don't inform ULP about transition from PF to
>+		 * active state and set cwnd to 1, see SCTP
>+		 * Quick failover draft section 5.1, point 5
>+		 */
>+		if (transport->state == SCTP_PF) {
>+			ulp_notify = false;
>+			transport->cwnd = 1;
>+		}
> 		transport->state = SCTP_ACTIVE;
> 		break;
> 
>@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 		spc_state = SCTP_ADDR_UNREACHABLE;
> 		break;
> 
>+	case SCTP_TRANSPORT_PF:
>+		transport->state = SCTP_PF;
>+		ulp_notify = false;
>+		break;
>+
> 	default:
> 		return;
> 	}
>@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
> 	 * user.
> 	 */
>-	memset(&addr, 0, sizeof(struct sockaddr_storage));
>-	memcpy(&addr, &transport->ipaddr,
>transport->af_specific->sockaddr_len);
>-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
>-				0, spc_state, error, GFP_ATOMIC);
>-	if (event)
>-		sctp_ulpq_tail_event(&asoc->ulpq, event);
>+	if (ulp_notify) {
>+		memset(&addr, 0, sizeof(struct sockaddr_storage));
>+		memcpy(&addr, &transport->ipaddr,
>+		       transport->af_specific->sockaddr_len);
>+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
>+					0, spc_state, error, GFP_ATOMIC);
>+		if (event)
>+			sctp_ulpq_tail_event(&asoc->ulpq, event);
>+	}
> 
> 	/* Select new active and retran paths. */
> 
>@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 			transports) {
> 
> 		if ((t->state == SCTP_INACTIVE) ||
>-		    (t->state == SCTP_UNCONFIRMED))
>+		    (t->state == SCTP_UNCONFIRMED) ||
>+		    (t->state == SCTP_PF))
> 			continue;
> 		if (!first || t->last_time_heard > first->last_time_heard) {
> 			second = first;
>diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
>index a0fa19f..e7aa177c 100644
>--- a/net/sctp/outqueue.c
>+++ b/net/sctp/outqueue.c
>@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
>rtx_timeout)
> 			if (!new_transport)
> 				new_transport = asoc->peer.active_path;
> 		} else if ((new_transport->state == SCTP_INACTIVE) ||
>-			   (new_transport->state == SCTP_UNCONFIRMED)) {
>+			   (new_transport->state == SCTP_UNCONFIRMED) ||
>+			   (new_transport->state == SCTP_PF)) {
> 			/* If the chunk is Heartbeat or Heartbeat Ack,
> 			 * send it to chunk->transport, even if it's
> 			 * inactive.
>@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
>rtx_timeout)
> 			new_transport = chunk->transport;
> 			if (!new_transport ||
> 			    ((new_transport->state == SCTP_INACTIVE) ||
>-			     (new_transport->state == SCTP_UNCONFIRMED)))
>+			     (new_transport->state == SCTP_UNCONFIRMED) ||
>+			     (new_transport->state == SCTP_PF)))
> 				new_transport = asoc->peer.active_path;
> 			if (new_transport->state == SCTP_UNCONFIRMED)
> 				continue;
>diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
>index c96d1a8..285e26a 100644
>--- a/net/sctp/sm_sideeffect.c
>+++ b/net/sctp/sm_sideeffect.c
>@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type,
>sctp_subtype_t subtype,
> 			     sctp_cmd_seq_t *commands,
> 			     gfp_t gfp);
> 
>+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
>+				     struct sctp_transport *t);
> /********************************************************************
>  * Helper functions
>  ********************************************************************/
>@@ -470,7 +472,8 @@ sctp_timer_event_t
>*sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
>  * notification SHOULD be sent to the upper layer.
>  *
>  */
>-static void sctp_do_8_2_transport_strike(struct sctp_association
>*asoc,
>+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
>+					 struct sctp_association *asoc,
> 					 struct sctp_transport *transport,
> 					 int is_hb)
> {
>@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct
>sctp_association *asoc,
> 			transport->error_count++;
> 	}
> 
>+	/* If the transport error count is greater than the pf_retrans
>+	 * threshold, and less than pathmaxrxt, then mark this transport
>+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
>+	 * point 1
>+	 */
>+	if ((transport->state != SCTP_PF) &&
>+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
>+	   (transport->error_count > asoc->pf_retrans)) {
>+
>+		sctp_assoc_control_transport(asoc, transport,
>+					     SCTP_TRANSPORT_PF,
>+					     0);
>+
>+		/* Update the hb timer to resend a heartbeat every rto */
>+		sctp_cmd_hb_timer_update(commands, transport);
>+	}
>+
> 	if (transport->state != SCTP_INACTIVE &&
> 	    (transport->error_count > transport->pathmaxrxt)) {
> 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
>@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t
>*cmds,
> 					     SCTP_HEARTBEAT_SUCCESS);
> 	}
> 
>+	if (t->state == SCTP_PF)
>+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
>+					     SCTP_HEARTBEAT_SUCCESS);
>+
> 	/* The receiver of the HEARTBEAT ACK should also perform an
> 	 * RTT measurement for that destination transport address
> 	 * using the time value carried in the HEARTBEAT ACK chunk.
>@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t
>event_type,
> 
> 		case SCTP_CMD_STRIKE:
> 			/* Mark one strike against a transport.  */
>-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
>-						    0);
>+			sctp_do_8_2_transport_strike(commands, asoc,
>+						    cmd->obj.transport, 0);
> 			break;
> 
> 		case SCTP_CMD_TRANSPORT_IDLE:
>@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t
>event_type,
> 
> 		case SCTP_CMD_TRANSPORT_HB_SENT:
> 			t = cmd->obj.transport;
>-			sctp_do_8_2_transport_strike(asoc, t, 1);
>+			sctp_do_8_2_transport_strike(commands, asoc,
>+						     t, 1);
> 			t->hb_sent = 1;
> 			break;
> 
>diff --git a/net/sctp/socket.c b/net/sctp/socket.c
>index b3b8a8d..8665b81 100644
>--- a/net/sctp/socket.c
>+++ b/net/sctp/socket.c
>@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct
>sock *sk, char __user *optval,
> }
> 
> 
>+/*
>+ * SCTP_PEER_ADDR_THLDS
>+ *
>+ * This option allows us to alter the partially failed threshold for
>one or all
>+ * transports in an association.  See Section 6.1 of:
>+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+ */
>+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
>+					    char __user *optval,
>+					    unsigned int optlen)
>+{
>+	struct sctp_paddrthlds val;
>+	struct sctp_transport *trans;
>+	struct sctp_association *asoc;
>+
>+	if (optlen < sizeof(struct sctp_paddrthlds))
>+		return -EINVAL;
>+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
>+			   sizeof(struct sctp_paddrthlds)))
>+		return -EFAULT;
>+
>+
>+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
>+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
>+		if (!asoc)
>+			return -ENOENT;
>+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
>+				    transports) {
>+			if (val.spt_pathmaxrxt)
>+				trans->pathmaxrxt = val.spt_pathmaxrxt;
>+			trans->pf_retrans = val.spt_pathpfthld;
>+		}
>+
>+		if (val.spt_pathmaxrxt)
>+			asoc->pathmaxrxt = val.spt_pathmaxrxt;
>+		asoc->pf_retrans = val.spt_pathpfthld;
>+	} else {
>+		trans = sctp_addr_id2transport(sk, &val.spt_address,
>+					       val.spt_assoc_id);
>+		if (!trans)
>+			return -ENOENT;
>+
>+		if (val.spt_pathmaxrxt)
>+			trans->pathmaxrxt = val.spt_pathmaxrxt;
>+		trans->pf_retrans = val.spt_pathpfthld;
>+	}
>+
>+	return 0;
>+}
>+
> /* API 6.2 setsockopt(), getsockopt()
>  *
>  * Applications use setsockopt() and getsockopt() to set or retrieve
>@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk,
>int level, int optname,
> 	case SCTP_AUTO_ASCONF:
> 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
> 		break;
>+	case SCTP_PEER_ADDR_THLDS:
>+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
>+		break;
> 	default:
> 		retval = -ENOPROTOOPT;
> 		break;
>@@ -5490,6 +5543,51 @@ static int sctp_getsockopt_assoc_ids(struct sock
>*sk, int len,
> 	return 0;
> }
> 
>+/*
>+ * SCTP_PEER_ADDR_THLDS
>+ *
>+ * This option allows us to fetch the partially failed threshold for
>one or all
>+ * transports in an association.  See Section 6.1 of:
>+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+ */
>+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
>+					    char __user *optval,
>+					    int len,
>+					    int __user *optlen)
>+{
>+	struct sctp_paddrthlds val;
>+	struct sctp_transport *trans;
>+	struct sctp_association *asoc;
>+
>+	if (len < sizeof(struct sctp_paddrthlds))
>+		return -EINVAL;
>+	len = sizeof(struct sctp_paddrthlds);
>+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
>len))
>+		return -EFAULT;
>+
>+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
>+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
>+		if (!asoc)
>+			return -ENOENT;
>+
>+		val.spt_pathpfthld = asoc->pf_retrans;
>+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
>+	} else {
>+		trans = sctp_addr_id2transport(sk, &val.spt_address,
>+					       val.spt_assoc_id);
>+		if (!trans)
>+			return -ENOENT;
>+
>+		val.spt_pathmaxrxt = trans->pathmaxrxt;
>+		val.spt_pathpfthld = trans->pf_retrans;
>+	}
>+
>+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
>+		return -EFAULT;
>+
>+	return 0;
>+}
>+
>SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int
>optname,
> 				char __user *optval, int __user *optlen)
> {
>@@ -5628,6 +5726,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk,
>int level, int optname,
> 	case SCTP_AUTO_ASCONF:
> 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
> 		break;
>+	case SCTP_PEER_ADDR_THLDS:
>+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
>+		break;
> 	default:
> 		retval = -ENOPROTOOPT;
> 		break;
>diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
>index e5fe639..2b2bfe9 100644
>--- a/net/sctp/sysctl.c
>+++ b/net/sctp/sysctl.c
>@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> 		.extra2		= &int_max
> 	},
> 	{
>+		.procname	= "pf_retrans",
>+		.data		= &sctp_pf_retrans,
>+		.maxlen		= sizeof(int),
>+		.mode		= 0644,
>+		.proc_handler	= proc_dointvec_minmax,
>+		.extra1		= &zero,
>+		.extra2		= &int_max
>+	},
>+	{
> 		.procname	= "max_init_retransmits",
> 		.data		= &sctp_max_retrans_init,
> 		.maxlen		= sizeof(int),
>diff --git a/net/sctp/transport.c b/net/sctp/transport.c
>index b026ba0..194d0f3 100644
>--- a/net/sctp/transport.c
>+++ b/net/sctp/transport.c
>@@ -85,6 +85,7 @@ static struct sctp_transport
>*sctp_transport_init(struct sctp_transport *peer,
> 
> 	/* Initialize the default path max_retrans.  */
> 	peer->pathmaxrxt  = sctp_max_retrans_path;
>+	peer->pf_retrans  = sctp_pf_retrans;
> 
> 	INIT_LIST_HEAD(&peer->transmitted);
> 	INIT_LIST_HEAD(&peer->send_ready);
>@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct
>sctp_transport *t)
> {
> 	unsigned long timeout;
> 	timeout = t->rto + sctp_jitter(t->rto);
>-	if (t->state != SCTP_UNCONFIRMED)
>+	if ((t->state != SCTP_UNCONFIRMED) &&
>+	    (t->state != SCTP_PF))
> 		timeout += t->hbinterval;
> 	timeout += jiffies;
> 	return timeout;
>-- 
>1.7.7.6


-- 
Sent from my Android phone with SkitMail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v6] sctp: Implement quick failover draft from tsvwg
@ 2012-07-22 18:18     ` Vlad Yasevich
  0 siblings, 0 replies; 48+ messages in thread
From: Vlad Yasevich @ 2012-07-22 18:18 UTC (permalink / raw)
  To: Neil Horman, netdev; +Cc: Sridhar Samudrala, David S. Miller, linux-sctp, joe

Neil Horman <nhorman@tuxdriver.com> wrote:

>I've seen several attempts recently made to do quick failover of sctp
>transports
>by reducing various retransmit timers and counters.  While its possible
>to
>implement a faster failover on multihomed sctp associations, its not
>particularly robust, in that it can lead to unneeded retransmits, as
>well as
>false connection failures due to intermittent latency on a network.
>
>Instead, lets implement the new ietf quick failover draft found here:
>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
>This will let the sctp stack identify transports that have had a small
>number of
>errors, and avoid using them quickly until their reliability can be
>re-established.  I've tested this out on two virt guests connected via
>multiple
>isolated virt networks and believe its in compliance with the above
>draft and
>works well.
>
>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>CC: Vlad Yasevich <vyasevich@gmail.com>
>CC: Sridhar Samudrala <sri@us.ibm.com>
>CC: "David S. Miller" <davem@davemloft.net>
>CC: linux-sctp@vger.kernel.org
>CC: joe@perches.com
>

Acked-by: Vlad Yasevich <vyasevich@gmail.com>

-vlad

>---
>Change notes:
>
>V2)
>- Added socket option API from section 6.1 of the specification, as per
>request from Vlad. Adding this socket option allows us to alter both
>the path
>maximum retransmit value and the path partial failure threshold for
>each
>transport and the association as a whole.
>
>- Added a per transport pf_retrans value, and initialized it from the
>association value.  This makes each transport independently
>configurable as per
>the socket option above, and prevents changes in the sysctl from
>bleeding into
>an already created association.
>
>V3)
>- Cleaned up some line spacing (Joe Perches)
>- Fixed some socket option user data sanitization (Vlad Yasevich)
>
>V4)
>- Added additional documentation (Flavio Leitner)
>
>V5)
>- Modified setsockopt option to ignore 0 pathmaxrxt rather than return
>  error (Vlad Yasevich)
>- Modified getsockopt to return option length written (Vlad Y.)
>
>V6)
>- Fixed stupid mistake about returning optval incorrectly (Vlad Y)
>---
> Documentation/networking/ip-sysctl.txt |   14 +++++
> include/net/sctp/constants.h           |    1 +
> include/net/sctp/structs.h             |   20 ++++++-
> include/net/sctp/user.h                |   11 ++++
> net/sctp/associola.c                   |   37 +++++++++--
> net/sctp/outqueue.c                    |    6 +-
> net/sctp/sm_sideeffect.c               |   33 +++++++++-
>net/sctp/socket.c                      |  101
>++++++++++++++++++++++++++++++++
> net/sctp/sysctl.c                      |    9 +++
> net/sctp/transport.c                   |    4 +-
> 10 files changed, 221 insertions(+), 15 deletions(-)
>
>diff --git a/Documentation/networking/ip-sysctl.txt
>b/Documentation/networking/ip-sysctl.txt
>index 47b6c79..c636f9c 100644
>--- a/Documentation/networking/ip-sysctl.txt
>+++ b/Documentation/networking/ip-sysctl.txt
>@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
> 
> 	Default: 5
> 
>+pf_retrans - INTEGER
>+	The number of retransmissions that will be attempted on a given path
>+	before traffic is redirected to an alternate transport (should one
>+	exist).  Note this is distinct from path_max_retrans, as a path that
>+	passes the pf_retrans threshold can still be used.  It's only
>+	deprioritized when a transmission path is selected by the stack. 
>This
>+	setting is primarily used to enable fast failover mechanisms without
>+	having to reduce path_max_retrans to a very low value.  See:
>+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+	for details.  Note also that a value of pf_retrans > path_max_retrans
>+	disables this feature
>+
>+	Default: 0
>+
> rto_initial - INTEGER
>	The initial round trip timeout value in milliseconds that will be used
> 	in calculating round trip times.  This is the initial time interval
>diff --git a/include/net/sctp/constants.h
>b/include/net/sctp/constants.h
>index 942b864..d053d2e 100644
>--- a/include/net/sctp/constants.h
>+++ b/include/net/sctp/constants.h
>@@ -334,6 +334,7 @@ typedef enum {
> typedef enum {
> 	SCTP_TRANSPORT_UP,
> 	SCTP_TRANSPORT_DOWN,
>+	SCTP_TRANSPORT_PF,
> } sctp_transport_cmd_t;
> 
> /* These are the address scopes defined mainly for IPv4 addresses
>diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
>index e4652fe..cee0678 100644
>--- a/include/net/sctp/structs.h
>+++ b/include/net/sctp/structs.h
>@@ -161,6 +161,12 @@ extern struct sctp_globals {
> 	int max_retrans_path;
> 	int max_retrans_init;
> 
>+	/* Potentially-Failed.Max.Retrans sysctl value
>+	 * taken from:
>+	 * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>+	 */
>+	int pf_retrans;
>+
> 	/*
> 	 * Policy for preforming sctp/socket accounting
> 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
>@@ -258,6 +264,7 @@ extern struct sctp_globals {
> #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
> #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
> #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
>+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
> #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
> #define sctp_sack_timeout		(sctp_globals.sack_timeout)
> #define sctp_hb_interval		(sctp_globals.hb_interval)
>@@ -987,10 +994,15 @@ struct sctp_transport {
> 
> 	/* This is the max_retrans value for the transport and will
> 	 * be initialized from the assocs value.  This can be changed
>-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
>+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
> 	 */
> 	__u16 pathmaxrxt;
> 
>+	/* This is the partially failed retrans value for the transport
>+	 * and will be initialized from the assocs value.  This can be
>changed
>+	 * using the SCTP_PEER_ADDR_THLDS socket option
>+	 */
>+	int pf_retrans;
> 	/* PMTU	      : The current known path MTU.  */
> 	__u32 pathmtu;
> 
>@@ -1660,6 +1672,12 @@ struct sctp_association {
> 	 */
> 	int max_retrans;
> 
>+	/* This is the partially failed retrans value for the association,
>+	 * initialized from the pf_retrans sysctl value.  This can be
>+	 * changed using the SCTP_PEER_ADDR_THLDS socket option
>+	 */
>+	int pf_retrans;
>+
> 	/* Maximum number of times the endpoint will retransmit INIT  */
> 	__u16 max_init_attempts;
> 
>diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
>index 0842ef0..1b02d7a 100644
>--- a/include/net/sctp/user.h
>+++ b/include/net/sctp/user.h
>@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
> #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
> #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
> #define SCTP_AUTO_ASCONF       30
>+#define SCTP_PEER_ADDR_THLDS	31
> 
> /* Internal Socket Options. Some of the sctp library functions are
>  * implemented using these socket options.
>@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
>  */
> enum sctp_spinfo_state {
> 	SCTP_INACTIVE,
>+	SCTP_PF,
> 	SCTP_ACTIVE,
> 	SCTP_UNCONFIRMED,
> 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
>@@ -741,4 +743,13 @@ typedef struct {
> 	int sd;
> } sctp_peeloff_arg_t;
> 
>+/*
>+ *  Peer Address Thresholds socket option
>+ */
>+struct sctp_paddrthlds {
>+	sctp_assoc_t spt_assoc_id;
>+	struct sockaddr_storage spt_address;
>+	__u16 spt_pathmaxrxt;
>+	__u16 spt_pathpfthld;
>+};
> #endif /* __net_sctp_user_h__ */
>diff --git a/net/sctp/associola.c b/net/sctp/associola.c
>index 5bc9ab1..90fe36b 100644
>--- a/net/sctp/associola.c
>+++ b/net/sctp/associola.c
>@@ -124,6 +124,8 @@ static struct sctp_association
>*sctp_association_init(struct sctp_association *a
> 	 * socket values.
> 	 */
> 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
>+	asoc->pf_retrans  = sctp_pf_retrans;
>+
> 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
> 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
> 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
>@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct
>sctp_association *asoc,
> 	/* Set the path max_retrans.  */
> 	peer->pathmaxrxt = asoc->pathmaxrxt;
> 
>+	/* And the partial failure retrans threshold */
>+	peer->pf_retrans = asoc->pf_retrans;
>+
> 	/* Initialize the peer's SACK delay timeout based on the
> 	 * association configured value.
> 	 */
>@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 	struct sctp_ulpevent *event;
> 	struct sockaddr_storage addr;
> 	int spc_state = 0;
>+	bool ulp_notify = true;
> 
> 	/* Record the transition on the transport.  */
> 	switch (command) {
>@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 			spc_state = SCTP_ADDR_CONFIRMED;
> 		else
> 			spc_state = SCTP_ADDR_AVAILABLE;
>+		/* Don't inform ULP about transition from PF to
>+		 * active state and set cwnd to 1, see SCTP
>+		 * Quick failover draft section 5.1, point 5
>+		 */
>+		if (transport->state == SCTP_PF) {
>+			ulp_notify = false;
>+			transport->cwnd = 1;
>+		}
> 		transport->state = SCTP_ACTIVE;
> 		break;
> 
>@@ -871,6 +885,11 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 		spc_state = SCTP_ADDR_UNREACHABLE;
> 		break;
> 
>+	case SCTP_TRANSPORT_PF:
>+		transport->state = SCTP_PF;
>+		ulp_notify = false;
>+		break;
>+
> 	default:
> 		return;
> 	}
>@@ -878,12 +897,15 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
> 	 * user.
> 	 */
>-	memset(&addr, 0, sizeof(struct sockaddr_storage));
>-	memcpy(&addr, &transport->ipaddr,
>transport->af_specific->sockaddr_len);
>-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
>-				0, spc_state, error, GFP_ATOMIC);
>-	if (event)
>-		sctp_ulpq_tail_event(&asoc->ulpq, event);
>+	if (ulp_notify) {
>+		memset(&addr, 0, sizeof(struct sockaddr_storage));
>+		memcpy(&addr, &transport->ipaddr,
>+		       transport->af_specific->sockaddr_len);
>+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
>+					0, spc_state, error, GFP_ATOMIC);
>+		if (event)
>+			sctp_ulpq_tail_event(&asoc->ulpq, event);
>+	}
> 
> 	/* Select new active and retran paths. */
> 
>@@ -899,7 +921,8 @@ void sctp_assoc_control_transport(struct
>sctp_association *asoc,
> 			transports) {
> 
> 		if ((t->state == SCTP_INACTIVE) ||
>-		    (t->state == SCTP_UNCONFIRMED))
>+		    (t->state == SCTP_UNCONFIRMED) ||
>+		    (t->state == SCTP_PF))
> 			continue;
> 		if (!first || t->last_time_heard > first->last_time_heard) {
> 			second = first;
>diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
>index a0fa19f..e7aa177c 100644
>--- a/net/sctp/outqueue.c
>+++ b/net/sctp/outqueue.c
>@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
>rtx_timeout)
> 			if (!new_transport)
> 				new_transport = asoc->peer.active_path;
> 		} else if ((new_transport->state == SCTP_INACTIVE) ||
>-			   (new_transport->state == SCTP_UNCONFIRMED)) {
>+			   (new_transport->state == SCTP_UNCONFIRMED) ||
>+			   (new_transport->state == SCTP_PF)) {
> 			/* If the chunk is Heartbeat or Heartbeat Ack,
> 			 * send it to chunk->transport, even if it's
> 			 * inactive.
>@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int
>rtx_timeout)
> 			new_transport = chunk->transport;
> 			if (!new_transport ||
> 			    ((new_transport->state == SCTP_INACTIVE) ||
>-			     (new_transport->state == SCTP_UNCONFIRMED)))
>+			     (new_transport->state == SCTP_UNCONFIRMED) ||
>+			     (new_transport->state == SCTP_PF)))
> 				new_transport = asoc->peer.active_path;
> 			if (new_transport->state == SCTP_UNCONFIRMED)
> 				continue;
>diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
>index c96d1a8..285e26a 100644
>--- a/net/sctp/sm_sideeffect.c
>+++ b/net/sctp/sm_sideeffect.c
>@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type,
>sctp_subtype_t subtype,
> 			     sctp_cmd_seq_t *commands,
> 			     gfp_t gfp);
> 
>+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
>+				     struct sctp_transport *t);
> /********************************************************************
>  * Helper functions
>  ********************************************************************/
>@@ -470,7 +472,8 @@ sctp_timer_event_t
>*sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
>  * notification SHOULD be sent to the upper layer.
>  *
>  */
>-static void sctp_do_8_2_transport_strike(struct sctp_association
>*asoc,
>+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
>+					 struct sctp_association *asoc,
> 					 struct sctp_transport *transport,
> 					 int is_hb)
> {
>@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct
>sctp_association *asoc,
> 			transport->error_count++;
> 	}
> 
>+	/* If the transport error count is greater than the pf_retrans
>+	 * threshold, and less than pathmaxrxt, then mark this transport
>+	 * as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
>+	 * point 1
>+	 */
>+	if ((transport->state != SCTP_PF) &&
>+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
>+	   (transport->error_count > asoc->pf_retrans)) {
>+
>+		sctp_assoc_control_transport(asoc, transport,
>+					     SCTP_TRANSPORT_PF,
>+					     0);
>+
>+		/* Update the hb timer to resend a heartbeat every rto */
>+		sctp_cmd_hb_timer_update(commands, transport);
>+	}
>+
> 	if (transport->state != SCTP_INACTIVE &&
> 	    (transport->error_count > transport->pathmaxrxt)) {
> 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
>@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t
>*cmds,
> 					     SCTP_HEARTBEAT_SUCCESS);
> 	}
> 
>+	if (t->state == SCTP_PF)
>+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
>+					     SCTP_HEARTBEAT_SUCCESS);
>+
> 	/* The receiver of the HEARTBEAT ACK should also perform an
> 	 * RTT measurement for that destination transport address
> 	 * using the time value carried in the HEARTBEAT ACK chunk.
>@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t
>event_type,
> 
> 		case SCTP_CMD_STRIKE:
> 			/* Mark one strike against a transport.  */
>-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
>-						    0);
>+			sctp_do_8_2_transport_strike(commands, asoc,
>+						    cmd->obj.transport, 0);
> 			break;
> 
> 		case SCTP_CMD_TRANSPORT_IDLE:
>@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t
>event_type,
> 
> 		case SCTP_CMD_TRANSPORT_HB_SENT:
> 			t = cmd->obj.transport;
>-			sctp_do_8_2_transport_strike(asoc, t, 1);
>+			sctp_do_8_2_transport_strike(commands, asoc,
>+						     t, 1);
> 			t->hb_sent = 1;
> 			break;
> 
>diff --git a/net/sctp/socket.c b/net/sctp/socket.c
>index b3b8a8d..8665b81 100644
>--- a/net/sctp/socket.c
>+++ b/net/sctp/socket.c
>@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct
>sock *sk, char __user *optval,
> }
> 
> 
>+/*
>+ * SCTP_PEER_ADDR_THLDS
>+ *
>+ * This option allows us to alter the partially failed threshold for
>one or all
>+ * transports in an association.  See Section 6.1 of:
>+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+ */
>+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
>+					    char __user *optval,
>+					    unsigned int optlen)
>+{
>+	struct sctp_paddrthlds val;
>+	struct sctp_transport *trans;
>+	struct sctp_association *asoc;
>+
>+	if (optlen < sizeof(struct sctp_paddrthlds))
>+		return -EINVAL;
>+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
>+			   sizeof(struct sctp_paddrthlds)))
>+		return -EFAULT;
>+
>+
>+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
>+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
>+		if (!asoc)
>+			return -ENOENT;
>+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
>+				    transports) {
>+			if (val.spt_pathmaxrxt)
>+				trans->pathmaxrxt = val.spt_pathmaxrxt;
>+			trans->pf_retrans = val.spt_pathpfthld;
>+		}
>+
>+		if (val.spt_pathmaxrxt)
>+			asoc->pathmaxrxt = val.spt_pathmaxrxt;
>+		asoc->pf_retrans = val.spt_pathpfthld;
>+	} else {
>+		trans = sctp_addr_id2transport(sk, &val.spt_address,
>+					       val.spt_assoc_id);
>+		if (!trans)
>+			return -ENOENT;
>+
>+		if (val.spt_pathmaxrxt)
>+			trans->pathmaxrxt = val.spt_pathmaxrxt;
>+		trans->pf_retrans = val.spt_pathpfthld;
>+	}
>+
>+	return 0;
>+}
>+
> /* API 6.2 setsockopt(), getsockopt()
>  *
>  * Applications use setsockopt() and getsockopt() to set or retrieve
>@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk,
>int level, int optname,
> 	case SCTP_AUTO_ASCONF:
> 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
> 		break;
>+	case SCTP_PEER_ADDR_THLDS:
>+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
>+		break;
> 	default:
> 		retval = -ENOPROTOOPT;
> 		break;
>@@ -5490,6 +5543,51 @@ static int sctp_getsockopt_assoc_ids(struct sock
>*sk, int len,
> 	return 0;
> }
> 
>+/*
>+ * SCTP_PEER_ADDR_THLDS
>+ *
>+ * This option allows us to fetch the partially failed threshold for
>one or all
>+ * transports in an association.  See Section 6.1 of:
>+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
>+ */
>+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
>+					    char __user *optval,
>+					    int len,
>+					    int __user *optlen)
>+{
>+	struct sctp_paddrthlds val;
>+	struct sctp_transport *trans;
>+	struct sctp_association *asoc;
>+
>+	if (len < sizeof(struct sctp_paddrthlds))
>+		return -EINVAL;
>+	len = sizeof(struct sctp_paddrthlds);
>+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
>len))
>+		return -EFAULT;
>+
>+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
>+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
>+		if (!asoc)
>+			return -ENOENT;
>+
>+		val.spt_pathpfthld = asoc->pf_retrans;
>+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
>+	} else {
>+		trans = sctp_addr_id2transport(sk, &val.spt_address,
>+					       val.spt_assoc_id);
>+		if (!trans)
>+			return -ENOENT;
>+
>+		val.spt_pathmaxrxt = trans->pathmaxrxt;
>+		val.spt_pathpfthld = trans->pf_retrans;
>+	}
>+
>+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
>+		return -EFAULT;
>+
>+	return 0;
>+}
>+
>SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int
>optname,
> 				char __user *optval, int __user *optlen)
> {
>@@ -5628,6 +5726,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk,
>int level, int optname,
> 	case SCTP_AUTO_ASCONF:
> 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
> 		break;
>+	case SCTP_PEER_ADDR_THLDS:
>+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
>+		break;
> 	default:
> 		retval = -ENOPROTOOPT;
> 		break;
>diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
>index e5fe639..2b2bfe9 100644
>--- a/net/sctp/sysctl.c
>+++ b/net/sctp/sysctl.c
>@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> 		.extra2		= &int_max
> 	},
> 	{
>+		.procname	= "pf_retrans",
>+		.data		= &sctp_pf_retrans,
>+		.maxlen		= sizeof(int),
>+		.mode		= 0644,
>+		.proc_handler	= proc_dointvec_minmax,
>+		.extra1		= &zero,
>+		.extra2		= &int_max
>+	},
>+	{
> 		.procname	= "max_init_retransmits",
> 		.data		= &sctp_max_retrans_init,
> 		.maxlen		= sizeof(int),
>diff --git a/net/sctp/transport.c b/net/sctp/transport.c
>index b026ba0..194d0f3 100644
>--- a/net/sctp/transport.c
>+++ b/net/sctp/transport.c
>@@ -85,6 +85,7 @@ static struct sctp_transport
>*sctp_transport_init(struct sctp_transport *peer,
> 
> 	/* Initialize the default path max_retrans.  */
> 	peer->pathmaxrxt  = sctp_max_retrans_path;
>+	peer->pf_retrans  = sctp_pf_retrans;
> 
> 	INIT_LIST_HEAD(&peer->transmitted);
> 	INIT_LIST_HEAD(&peer->send_ready);
>@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct
>sctp_transport *t)
> {
> 	unsigned long timeout;
> 	timeout = t->rto + sctp_jitter(t->rto);
>-	if (t->state != SCTP_UNCONFIRMED)
>+	if ((t->state != SCTP_UNCONFIRMED) &&
>+	    (t->state != SCTP_PF))
> 		timeout += t->hbinterval;
> 	timeout += jiffies;
> 	return timeout;
>-- 
>1.7.7.6


-- 
Sent from my Android phone with SkitMail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v6] sctp: Implement quick failover draft from tsvwg
  2012-07-22 18:18     ` Vlad Yasevich
@ 2012-07-22 19:14       ` David Miller
  -1 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2012-07-22 19:14 UTC (permalink / raw)
  To: vyasevich; +Cc: nhorman, netdev, sri, linux-sctp, joe

From: Vlad Yasevich <vyasevich@gmail.com>
Date: Sun, 22 Jul 2012 14:18:12 -0400

> Neil Horman <nhorman@tuxdriver.com> wrote:
> 
>>I've seen several attempts recently made to do quick failover of sctp
>>transports
>>by reducing various retransmit timers and counters.  While its possible
>>to
>>implement a faster failover on multihomed sctp associations, its not
>>particularly robust, in that it can lead to unneeded retransmits, as
>>well as
>>false connection failures due to intermittent latency on a network.
>>
>>Instead, lets implement the new ietf quick failover draft found here:
>>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>>
>>This will let the sctp stack identify transports that have had a small
>>number of
>>errors, and avoid using them quickly until their reliability can be
>>re-established.  I've tested this out on two virt guests connected via
>>multiple
>>isolated virt networks and believe its in compliance with the above
>>draft and
>>works well.
>>
>>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
 ...
> Acked-by: Vlad Yasevich <vyasevich@gmail.com>

Applied, thanks everyone.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v6] sctp: Implement quick failover draft from tsvwg
@ 2012-07-22 19:14       ` David Miller
  0 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2012-07-22 19:14 UTC (permalink / raw)
  To: vyasevich; +Cc: nhorman, netdev, sri, linux-sctp, joe

From: Vlad Yasevich <vyasevich@gmail.com>
Date: Sun, 22 Jul 2012 14:18:12 -0400

> Neil Horman <nhorman@tuxdriver.com> wrote:
> 
>>I've seen several attempts recently made to do quick failover of sctp
>>transports
>>by reducing various retransmit timers and counters.  While its possible
>>to
>>implement a faster failover on multihomed sctp associations, its not
>>particularly robust, in that it can lead to unneeded retransmits, as
>>well as
>>false connection failures due to intermittent latency on a network.
>>
>>Instead, lets implement the new ietf quick failover draft found here:
>>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>>
>>This will let the sctp stack identify transports that have had a small
>>number of
>>errors, and avoid using them quickly until their reliability can be
>>re-established.  I've tested this out on two virt guests connected via
>>multiple
>>isolated virt networks and believe its in compliance with the above
>>draft and
>>works well.
>>
>>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
 ...
> Acked-by: Vlad Yasevich <vyasevich@gmail.com>

Applied, thanks everyone.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v6] sctp: Implement quick failover draft from tsvwg
  2012-07-22 19:14       ` David Miller
@ 2012-07-22 19:18         ` Neil Horman <nhorman@tuxdriver.com>
  -1 siblings, 0 replies; 48+ messages in thread
From: Neil Horman <nhorman@tuxdriver.com> @ 2012-07-22 19:18 UTC (permalink / raw)
  To: David Miller, vyasevich; +Cc: netdev, sri, linux-sctp, joe

Thanks all!

David Miller <davem@davemloft.net> wrote:

>From: Vlad Yasevich <vyasevich@gmail.com>
>Date: Sun, 22 Jul 2012 14:18:12 -0400
>
>> Neil Horman <nhorman@tuxdriver.com> wrote:
>> 
>>>I've seen several attempts recently made to do quick failover of sctp
>>>transports
>>>by reducing various retransmit timers and counters.  While its
>possible
>>>to
>>>implement a faster failover on multihomed sctp associations, its not
>>>particularly robust, in that it can lead to unneeded retransmits, as
>>>well as
>>>false connection failures due to intermittent latency on a network.
>>>
>>>Instead, lets implement the new ietf quick failover draft found here:
>>>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>>>
>>>This will let the sctp stack identify transports that have had a
>small
>>>number of
>>>errors, and avoid using them quickly until their reliability can be
>>>re-established.  I've tested this out on two virt guests connected
>via
>>>multiple
>>>isolated virt networks and believe its in compliance with the above
>>>draft and
>>>works well.
>>>
>>>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> ...
>> Acked-by: Vlad Yasevich <vyasevich@gmail.com>
>
>Applied, thanks everyone.

-- 
Sent from my Android phone with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v6] sctp: Implement quick failover draft from tsvwg
@ 2012-07-22 19:18         ` Neil Horman <nhorman@tuxdriver.com>
  0 siblings, 0 replies; 48+ messages in thread
From: Neil Horman <nhorman@tuxdriver.com> @ 2012-07-22 19:18 UTC (permalink / raw)
  To: David Miller, vyasevich; +Cc: netdev, sri, linux-sctp, joe

Thanks all!

David Miller <davem@davemloft.net> wrote:

>From: Vlad Yasevich <vyasevich@gmail.com>
>Date: Sun, 22 Jul 2012 14:18:12 -0400
>
>> Neil Horman <nhorman@tuxdriver.com> wrote:
>> 
>>>I've seen several attempts recently made to do quick failover of sctp
>>>transports
>>>by reducing various retransmit timers and counters.  While its
>possible
>>>to
>>>implement a faster failover on multihomed sctp associations, its not
>>>particularly robust, in that it can lead to unneeded retransmits, as
>>>well as
>>>false connection failures due to intermittent latency on a network.
>>>
>>>Instead, lets implement the new ietf quick failover draft found here:
>>>http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>>>
>>>This will let the sctp stack identify transports that have had a
>small
>>>number of
>>>errors, and avoid using them quickly until their reliability can be
>>>re-established.  I've tested this out on two virt guests connected
>via
>>>multiple
>>>isolated virt networks and believe its in compliance with the above
>>>draft and
>>>works well.
>>>
>>>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> ...
>> Acked-by: Vlad Yasevich <vyasevich@gmail.com>
>
>Applied, thanks everyone.

-- 
Sent from my Android phone with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
  2012-07-20 19:31       ` David Miller
@ 2012-07-23 14:28         ` Flavio Leitner
  -1 siblings, 0 replies; 48+ messages in thread
From: Flavio Leitner @ 2012-07-23 14:28 UTC (permalink / raw)
  To: David Miller; +Cc: nhorman, netdev, vyasevich, sri, linux-sctp, joe

On Fri, 20 Jul 2012 12:31:09 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:
[...]
> Just quote the commit message or similar.

Sure thing. I was going to clean up when I accidentally sent that email.
The previous ones were fine though :)
Sorry about that.
fbl

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH v5] sctp: Implement quick failover draft from tsvwg
@ 2012-07-23 14:28         ` Flavio Leitner
  0 siblings, 0 replies; 48+ messages in thread
From: Flavio Leitner @ 2012-07-23 14:28 UTC (permalink / raw)
  To: David Miller; +Cc: nhorman, netdev, vyasevich, sri, linux-sctp, joe

On Fri, 20 Jul 2012 12:31:09 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:
[...]
> Just quote the commit message or similar.

Sure thing. I was going to clean up when I accidentally sent that email.
The previous ones were fine though :)
Sorry about that.
fbl

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2012-07-23 14:29 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-13 18:26 [PATCH] sctp: Implement quick failover draft from tsvwg Neil Horman
2012-07-13 18:26 ` Neil Horman
2012-07-14 18:12 ` Vlad Yasevich
2012-07-14 18:12   ` Vlad Yasevich
2012-07-14 21:22   ` Neil Horman
2012-07-14 21:22     ` Neil Horman
2012-07-18 18:01 ` [PATCH v2] " Neil Horman
2012-07-18 18:01   ` Neil Horman
2012-07-18 20:30   ` Joe Perches
2012-07-18 20:30     ` Joe Perches
2012-07-19 10:45     ` Neil Horman
2012-07-19 10:45       ` Neil Horman
2012-07-19 16:54       ` Joe Perches
2012-07-19 16:54         ` Joe Perches
2012-07-18 21:23   ` Vlad Yasevich
2012-07-18 21:23     ` Vlad Yasevich
2012-07-19 10:46     ` Neil Horman
2012-07-19 10:46       ` Neil Horman
2012-07-19 16:51 ` [PATCH v3] " Neil Horman
2012-07-19 16:51   ` Neil Horman
2012-07-20 16:51   ` Flavio Leitner
2012-07-20 16:51     ` Flavio Leitner
2012-07-20 17:19 ` [PATCH v4] " Neil Horman
2012-07-20 17:19   ` Neil Horman
2012-07-20 17:55   ` Vlad Yasevich
2012-07-20 17:55     ` Vlad Yasevich
2012-07-20 18:36     ` Neil Horman
2012-07-20 18:36       ` Neil Horman
2012-07-20 18:51 ` [PATCH v5] " Neil Horman
2012-07-20 18:51   ` Neil Horman
2012-07-20 19:10   ` Flavio Leitner
2012-07-20 19:10     ` Flavio Leitner
2012-07-20 19:31     ` David Miller
2012-07-20 19:31       ` David Miller
2012-07-23 14:28       ` Flavio Leitner
2012-07-23 14:28         ` Flavio Leitner
2012-07-21  6:45   ` Vlad Yasevich
2012-07-21  6:45     ` Vlad Yasevich
2012-07-21 11:03     ` Neil Horman
2012-07-21 11:03       ` Neil Horman
2012-07-21 17:56 ` [PATCH v6] " Neil Horman
2012-07-21 17:56   ` Neil Horman
2012-07-22 18:18   ` Vlad Yasevich
2012-07-22 18:18     ` Vlad Yasevich
2012-07-22 19:14     ` David Miller
2012-07-22 19:14       ` David Miller
2012-07-22 19:18       ` Neil Horman
2012-07-22 19:18         ` Neil Horman

This is an external index of several public inboxes;
see mirroring instructions on how to clone and mirror
all data and code used by this external index.