All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 net-next] tcp: Change txhash on every SYN and RTO retransmit
@ 2016-09-27 21:23 Lawrence Brakmo
  0 siblings, 0 replies; only message in thread
From: Lawrence Brakmo @ 2016-09-27 21:23 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Eric Dumazet, Yuchung Cheng

The current code changes txhash (flowlables) on every retransmitted
SYN/ACK, but only after the 2nd retransmitted SYN and only after
tcp_retries1 RTO retransmits.

With this patch:
1) txhash is changed with every SYN retransmits
2) adds the option for the txhash to be changed before tcp_retries1
   RTO retransmits. A new sysctl tcp_rto_txhash_prob represents the
   probability that txhash will be changed. The default value is 0
   which maintains previous behavior and a value of 100 will always
   change it.

The result is that we can start re-routing around failed (or very
congested paths) as soon as possible. Otherwise application health
checks may fail and the connection may be terminated before we start
to change txhash.

v2: Added sysctl documentation and cleaned code

Tested with packetdrill tests

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 Documentation/networking/ip-sysctl.txt | 11 +++++++++++
 include/net/tcp.h                      |  1 +
 net/ipv4/sysctl_net_ipv4.c             | 10 ++++++++++
 net/ipv4/tcp_input.c                   |  2 ++
 net/ipv4/tcp_timer.c                   |  4 ++++
 5 files changed, 28 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 3db8c67..0e7f9ac 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -472,6 +472,17 @@ tcp_max_reordering - INTEGER
 	if paths are using per packet load balancing (like bonding rr mode)
 	Default: 300
 
+tcp_rto_txhash_prob - INTEGER
+	Probability [0 to 100] that we will recalculate txhash when a
+	packet is resent due to an RTO and the RTO for this packet has
+	fired less than tcp_retries1 times. It is always recalculated
+	after tcp_retries_times.
+
+	Setting it to 100 helps re-route around failed (or very congested
+	paths) as soon as possible. Otherwise application health checks may
+	fail and the connection may be terminated before the txhash has
+	a chance to change.
+
 tcp_retrans_collapse - BOOLEAN
 	Bug-to-bug compatibility with some broken printers.
 	On retransmit try to send bigger packets to work around bugs in
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f83b7f2..406d474 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -271,6 +271,7 @@ extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
 extern int sysctl_tcp_pacing_ss_ratio;
 extern int sysctl_tcp_pacing_ca_ratio;
+extern int sysctl_tcp_rto_txhash_prob;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1cb67de..0b185a1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,6 +28,7 @@
 static int zero;
 static int one = 1;
 static int four = 4;
+static int hundred = 100;
 static int thousand = 1000;
 static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
@@ -624,6 +625,15 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_ms_jiffies,
 	},
 	{
+		.procname	= "tcp_rto_txhash_prob",
+		.data		= &sysctl_tcp_rto_txhash_prob,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &hundred,
+	},
+	{
 		.procname	= "icmp_msgs_per_sec",
 		.data		= &sysctl_icmp_msgs_per_sec,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8c6ad2d..2fea29d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -101,6 +101,8 @@ int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 
+int sysctl_tcp_rto_txhash_prob __read_mostly = 100;
+
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f712b41..8bdb215 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -192,6 +192,8 @@ static int tcp_write_timeout(struct sock *sk)
 			if (tp->syn_data && icsk->icsk_retransmits == 1)
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+		} else if (!tp->syn_data && !tp->syn_fastopen) {
+			sk_rethink_txhash(sk);
 		}
 		retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		syn_set = true;
@@ -213,6 +215,8 @@ static int tcp_write_timeout(struct sock *sk)
 			tcp_mtu_probing(icsk, sk);
 
 			dst_negative_advice(sk);
+		} else if (prandom_u32_max(100) < sysctl_tcp_rto_txhash_prob) {
+			sk_rethink_txhash(sk);
 		}
 
 		retry_until = net->ipv4.sysctl_tcp_retries2;
-- 
2.9.3

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2016-09-27 21:23 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-27 21:23 [PATCH v2 net-next] tcp: Change txhash on every SYN and RTO retransmit Lawrence Brakmo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.