All of lore.kernel.org
 help / color / mirror / Atom feed
* [net-next PATCH v3 2/3] net: TCP thin linear timeouts
@ 2010-02-11 12:07 Andreas Petlund
  2010-02-12  3:52 ` Eric Dumazet
  2010-02-12 11:19 ` William Allen Simpson
  0 siblings, 2 replies; 5+ messages in thread
From: Andreas Petlund @ 2010-02-11 12:07 UTC (permalink / raw)
  To: netdev
  Cc: Ilpo Järvinen, Eric Dumazet, Arnd Hannemann, LKML,
	shemminger, David Miller, william.allen.simpson, damian

Major changes:
      -Possible to disable mechanisms by socket option
      -Socket option value boundary check


Signed-off-by: Andreas Petlund <apetlund@simula.no>
---
 include/linux/sysctl.h     |    1 +
 include/linux/tcp.h        |    3 +++
 include/net/tcp.h          |    4 ++++
 net/ipv4/sysctl_net_ipv4.c |    7 +++++++
 net/ipv4/tcp.c             |    7 +++++++
 net/ipv4/tcp_timer.c       |   19 ++++++++++++++++++-
 6 files changed, 40 insertions(+), 1 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 9f236cd..d840d75 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -425,6 +425,7 @@ enum
 	NET_TCP_ALLOWED_CONG_CONTROL=123,
 	NET_TCP_MAX_SSTHRESH=124,
 	NET_TCP_FRTO_RESPONSE=125,
+	NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS=126,
 };
 
 enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7fee8a4..67da706 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -103,6 +103,7 @@ enum {
 #define TCP_CONGESTION		13	/* Congestion control algorithm */
 #define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
 #define TCP_COOKIE_TRANSACTIONS	15	/* TCP Cookie Transactions */
+#define TCP_THIN_LT             16      /* Use linear timeouts for thin streams*/
 
 /* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
@@ -341,6 +342,8 @@ struct tcp_sock {
 	u16	advmss;		/* Advertised MSS			*/
 	u8	frto_counter;	/* Number of new acks after RTO */
 	u8	nonagle;	/* Disable Nagle algorithm?             */
+	u8      thin_lt     : 1,/* Use linear timeouts for thin streams */
+		thin_undef  : 7;
 
 /* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e5e2056..bc5856a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCP_NAGLE_CORK		2	/* Socket is corked	    */
 #define TCP_NAGLE_PUSH		4	/* Cork is overridden for already queued data */
 
+/* TCP thin-stream limits */
+#define TCP_THIN_LT_RETRIES     6       /* After 6 linear retries, do exp. backoff */
+
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
@@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_cookie_size;
+extern int sysctl_tcp_force_thin_linear_timeouts;
 
 extern atomic_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712c..cb2ed35 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname       = "tcp_force_thin_linear_timeouts",
+		.data           = &sysctl_tcp_force_thin_linear_timeouts,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5d69ea..ce9aeb0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2229,6 +2229,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		}
 		break;
 
+	case TCP_THIN_LT:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_lt = val;
+		break;
+
 	case TCP_CORK:
 		/* When set indicates to always queue non-full frames.
 		 * Later the user clears this option and we transmit
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index de7d1bf..a682479 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_force_thin_linear_timeouts __read_mostly;
 
 static void tcp_write_timer(unsigned long);
 static void tcp_delack_timer(unsigned long);
@@ -415,7 +416,23 @@ void tcp_retransmit_timer(struct sock *sk)
 	icsk->icsk_retransmits++;
 
 out_reset_timer:
-	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+	 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+	 * might be increased if the stream oscillates between thin and thick,
+	 * thus the old value might already be too high compared to the value
+	 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+	 * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential
+	 * backoff behaviour to avoid continue hammering linear-timeout
+	 * retransmissions into a black hole*/
+	if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
+	    tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
+	    icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
+		icsk->icsk_backoff = 0;
+		icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+	} else {
+		/* Use normal (exponential) backoff */
+		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
 	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
 		__sk_dst_reset(sk);
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [net-next PATCH v3 2/3] net: TCP thin linear timeouts
  2010-02-11 12:07 [net-next PATCH v3 2/3] net: TCP thin linear timeouts Andreas Petlund
@ 2010-02-12  3:52 ` Eric Dumazet
  2010-02-13 15:49   ` Andreas Petlund
  2010-02-12 11:19 ` William Allen Simpson
  1 sibling, 1 reply; 5+ messages in thread
From: Eric Dumazet @ 2010-02-12  3:52 UTC (permalink / raw)
  To: Andreas Petlund
  Cc: netdev, Ilpo Järvinen, Arnd Hannemann, LKML, shemminger,
	David Miller, william.allen.simpson, damian

Le jeudi 11 février 2010 à 13:07 +0100, Andreas Petlund a écrit :
> Major changes:
>       -Possible to disable mechanisms by socket option
>       -Socket option value boundary check
> 
> 
> Signed-off-by: Andreas Petlund <apetlund@simula.no>
> ---
>  include/linux/sysctl.h     |    1 +
>  include/linux/tcp.h        |    3 +++
>  include/net/tcp.h          |    4 ++++
>  net/ipv4/sysctl_net_ipv4.c |    7 +++++++
>  net/ipv4/tcp.c             |    7 +++++++
>  net/ipv4/tcp_timer.c       |   19 ++++++++++++++++++-
>  6 files changed, 40 insertions(+), 1 deletions(-)
> 
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 9f236cd..d840d75 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -425,6 +425,7 @@ enum
>  	NET_TCP_ALLOWED_CONG_CONTROL=123,
>  	NET_TCP_MAX_SSTHRESH=124,
>  	NET_TCP_FRTO_RESPONSE=125,
> +	NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS=126,
>  };
>  
>  enum {
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 7fee8a4..67da706 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -103,6 +103,7 @@ enum {
>  #define TCP_CONGESTION		13	/* Congestion control algorithm */
>  #define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
>  #define TCP_COOKIE_TRANSACTIONS	15	/* TCP Cookie Transactions */
> +#define TCP_THIN_LT             16      /* Use linear timeouts for thin streams*/
>  
>  /* for TCP_INFO socket option */
>  #define TCPI_OPT_TIMESTAMPS	1
> @@ -341,6 +342,8 @@ struct tcp_sock {
>  	u16	advmss;		/* Advertised MSS			*/
>  	u8	frto_counter;	/* Number of new acks after RTO */
>  	u8	nonagle;	/* Disable Nagle algorithm?             */
> +	u8      thin_lt     : 1,/* Use linear timeouts for thin streams */
> +		thin_undef  : 7;
>  
>  /* RTT measurement */
>  	u32	srtt;		/* smoothed round trip time << 3	*/
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index e5e2056..bc5856a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
>  #define TCP_NAGLE_CORK		2	/* Socket is corked	    */
>  #define TCP_NAGLE_PUSH		4	/* Cork is overridden for already queued data */
>  
> +/* TCP thin-stream limits */
> +#define TCP_THIN_LT_RETRIES     6       /* After 6 linear retries, do exp. backoff */
> +
>  extern struct inet_timewait_death_row tcp_death_row;
>  
>  /* sysctl variables for tcp */
> @@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows;
>  extern int sysctl_tcp_slow_start_after_idle;
>  extern int sysctl_tcp_max_ssthresh;
>  extern int sysctl_tcp_cookie_size;
> +extern int sysctl_tcp_force_thin_linear_timeouts;
>  
>  extern atomic_t tcp_memory_allocated;
>  extern struct percpu_counter tcp_sockets_allocated;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 7e3712c..cb2ed35 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = {
>  		.proc_handler	= proc_dointvec
>  	},
>  	{
> +		.procname       = "tcp_force_thin_linear_timeouts",
> +		.data           = &sysctl_tcp_force_thin_linear_timeouts,
> +		.maxlen         = sizeof(int),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec
> +	},
> +	{
>  		.procname	= "udp_mem",
>  		.data		= &sysctl_udp_mem,
>  		.maxlen		= sizeof(sysctl_udp_mem),
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index d5d69ea..ce9aeb0 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -2229,6 +2229,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
>  		}
>  		break;
>  
> +	case TCP_THIN_LT:
> +		if (val < 0 || val > 1)
> +			err = -EINVAL;
> +		else
> +			tp->thin_lt = val;
> +		break;
> +
>  	case TCP_CORK:
>  		/* When set indicates to always queue non-full frames.
>  		 * Later the user clears this option and we transmit
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index de7d1bf..a682479 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
>  int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
>  int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
>  int sysctl_tcp_orphan_retries __read_mostly;
> +int sysctl_tcp_force_thin_linear_timeouts __read_mostly;
>  
>  static void tcp_write_timer(unsigned long);
>  static void tcp_delack_timer(unsigned long);
> @@ -415,7 +416,23 @@ void tcp_retransmit_timer(struct sock *sk)
>  	icsk->icsk_retransmits++;
>  
>  out_reset_timer:
> -	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
> +	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
> +	 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
> +	 * might be increased if the stream oscillates between thin and thick,
> +	 * thus the old value might already be too high compared to the value
> +	 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
> +	 * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential
> +	 * backoff behaviour to avoid continue hammering linear-timeout
> +	 * retransmissions into a black hole*/
> +	if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
> +	    tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
> +	    icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
> +		icsk->icsk_backoff = 0;
> +		icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
> +	} else {
> +		/* Use normal (exponential) backoff */
> +		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
> +	}
>  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
>  	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
>  		__sk_dst_reset(sk);

Hi Anreas

Could you include a section in Documentation/networking/ip-sysctl.txt
about tcp_force_thin_linear_timeouts setting ?

Also, you should provide some documentation to be included in man pages
(man 7 tcp), to Michael Kerrisk ( <mtk.manpages@gmail.com> ), both for
tcp_force_thin_linear_timeouts and TCP_THIN_LT

(Same applies for patch 3/3 and tcp_force_thin_dupack /
TCP_THIN_DUPACK )




^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [net-next PATCH v3 2/3] net: TCP thin linear timeouts
  2010-02-11 12:07 [net-next PATCH v3 2/3] net: TCP thin linear timeouts Andreas Petlund
  2010-02-12  3:52 ` Eric Dumazet
@ 2010-02-12 11:19 ` William Allen Simpson
  2010-02-13 15:50   ` Andreas Petlund
  1 sibling, 1 reply; 5+ messages in thread
From: William Allen Simpson @ 2010-02-12 11:19 UTC (permalink / raw)
  To: Andreas Petlund
  Cc: netdev, Ilpo Järvinen, Eric Dumazet, Arnd Hannemann, LKML,
	shemminger, David Miller, damian

Last year, I'm pretty sure I was on record as thinking this is only a
marginally good idea, that would be better at the application layer.

Also that naming was a bit dicey.  Now the names are more descriptive,
but the "force" is a bit overkill.

How about?
   NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS    -> NET_TCP_THIN_LINEAR_TIMEOUTS
   TCP_THIN_LT                           -> TCP_THIN_LINEAR_TIMEOUTS
   TCP_THIN_LT_RETRIES                   -> TCP_THIN_LINEAR_RETRIES
   tcp_force_thin_linear_timeouts        -> tcp_thin_linear_timeouts
   sysctl_tcp_force_thin_linear_timeouts -> sysctl_tcp_thin_linear_timeouts
   tp->thin_lt                           -> tp->thin_lto

The latter mostly traditional "to" for "timeout", as used most everywhere.


Andreas Petlund wrote:
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index de7d1bf..a682479 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
>  int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
>  int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
>  int sysctl_tcp_orphan_retries __read_mostly;
> +int sysctl_tcp_force_thin_linear_timeouts __read_mostly;
>  
Where is the sysctl initialized?


> +	if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
> +	    tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
> +	    icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {

Just for efficiency, I'd reorder this
   +	if (sk->sk_state == TCP_ESTABLISHED &&
   +	    (tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
   +	    tcp_stream_is_thin(sk) &&
   +	    icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [net-next PATCH v3 2/3] net: TCP thin linear timeouts
  2010-02-12  3:52 ` Eric Dumazet
@ 2010-02-13 15:49   ` Andreas Petlund
  0 siblings, 0 replies; 5+ messages in thread
From: Andreas Petlund @ 2010-02-13 15:49 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, Ilpo Järvinen, Arnd Hannemann, LKML, shemminger,
	David Miller, william.allen.simpson, damian

On 12. feb. 2010 04:52, Eric Dumazet wrote:
>
> Hi Anreas
>
> Could you include a section in Documentation/networking/ip-sysctl.txt
> about tcp_force_thin_linear_timeouts setting ?
>

I 'll submit a draft with the next iteration. thanks for the heads up on the procedure.
It may also be helpful with a separate textfile to describe the thin-stream latency
issues and the mechanisms. I will submit a draft for this also.

> Also, you should provide some documentation to be included in man pages
> (man 7 tcp), to Michael Kerrisk ( <mtk.manpages@gmail.com> ), both for
> tcp_force_thin_linear_timeouts and TCP_THIN_LT
>
> (Same applies for patch 3/3 and tcp_force_thin_dupack /
> TCP_THIN_DUPACK )
>   

I'll make a draft, and mail him as soon as the naming issues are settled.

Best regards,
Andreas

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [net-next PATCH v3 2/3] net: TCP thin linear timeouts
  2010-02-12 11:19 ` William Allen Simpson
@ 2010-02-13 15:50   ` Andreas Petlund
  0 siblings, 0 replies; 5+ messages in thread
From: Andreas Petlund @ 2010-02-13 15:50 UTC (permalink / raw)
  To: William Allen Simpson
  Cc: netdev, Ilpo Järvinen, Eric Dumazet, Arnd Hannemann, LKML,
	shemminger, David Miller, damian

On 12. feb. 2010 12:19, William Allen Simpson wrote:
> Last year, I'm pretty sure I was on record as thinking this is only a
> marginally good idea, that would be better at the application layer.
> 
> Also that naming was a bit dicey.  Now the names are more descriptive,
> but the "force" is a bit overkill.
> 
> How about?
>   NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS    -> NET_TCP_THIN_LINEAR_TIMEOUTS
>   TCP_THIN_LT                           -> TCP_THIN_LINEAR_TIMEOUTS
>   TCP_THIN_LT_RETRIES                   -> TCP_THIN_LINEAR_RETRIES
>   tcp_force_thin_linear_timeouts        -> tcp_thin_linear_timeouts
>   sysctl_tcp_force_thin_linear_timeouts -> sysctl_tcp_thin_linear_timeouts
>   tp->thin_lt                           -> tp->thin_lto
> 
> The latter mostly traditional "to" for "timeout", as used most everywhere.
> 

I agree that the _force_-part should be taken out for both patches, and 
renaming the lt to lto also makes sense. I'll fix it in the next iteration.

> Just for efficiency, I'd reorder this
>   +    if (sk->sk_state == TCP_ESTABLISHED &&
>   +        (tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
>   +        tcp_stream_is_thin(sk) &&
>   +        icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {

Thank you for this suggestion. I'll reorder in the next iteration.

Best regards,
Andreas

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2010-02-13 15:58 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-02-11 12:07 [net-next PATCH v3 2/3] net: TCP thin linear timeouts Andreas Petlund
2010-02-12  3:52 ` Eric Dumazet
2010-02-13 15:49   ` Andreas Petlund
2010-02-12 11:19 ` William Allen Simpson
2010-02-13 15:50   ` Andreas Petlund

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.