netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] tcp: Export TCP Delayed ACK parameters to user
@ 2011-10-27 23:07 Daniel Baluta
  2011-10-28  0:01 ` Eric Dumazet
  2011-10-28 21:14 ` [RFC v2] " Daniel Baluta
  0 siblings, 2 replies; 17+ messages in thread
From: Daniel Baluta @ 2011-10-27 23:07 UTC (permalink / raw)
  To: davem
  Cc: kuznet, jmorris, yoshfuji, kaber, netdev, eric.dumazet, Daniel Baluta

RFC2581 ($4.2) specifies when an ACK should be generated as follows:

" .. an ACK SHOULD be generated for at least every second
  full-sized segment, and MUST be generated within 500 ms
  of the arrival of the first unacknowledged packet.
"

We export the number of segments and the timeout limits
specified above, so that a user can tune them according
to its needs.

Specifically:
	* /proc/sys/net/ipv4/tcp_delack_segs, represents
	the threshold for the number of segments.
	* /proc/sys/net/ipv4/tcp_delack_min, specifies
	the minimum timeout value
	* /proc/sys/net/ipv4/tcp_delack_max, specifies
	the maximum timeout value.

Signed-off-by: Daniel Baluta <dbaluta@ixiacom.com>
---
 include/net/tcp.h          |   20 +++++++++++++++++---
 net/ipv4/sysctl_net_ipv4.c |   21 +++++++++++++++++++++
 net/ipv4/tcp.c             |    5 +++--
 net/ipv4/tcp_input.c       |    7 +++++--
 net/ipv4/tcp_output.c      |    4 +++-
 5 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e147f42..f3b0c17 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -111,14 +111,21 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 				  * TIME-WAIT timer.
 				  */
 
-#define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
+/* default maximum time to delay before sending an ACK */
+#define TCP_DELACK_MAX_DEFAULT	((unsigned)(HZ/5))
+
 #if HZ >= 100
-#define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
+/* default minimum time to delay before sending an ACK */
+#define TCP_DELACK_MIN_DEFAULT	((unsigned)(HZ/25))
 #define TCP_ATO_MIN	((unsigned)(HZ/25))
 #else
-#define TCP_DELACK_MIN	4U
+#define TCP_DELACK_MIN_DEFAULT	4U
 #define TCP_ATO_MIN	4U
 #endif
+
+#define TCP_DELACK_MIN sysctl_tcp_delack_min
+#define TCP_DELACK_MAX sysctl_tcp_delack_max
+
 #define TCP_RTO_MAX	((unsigned)(120*HZ))
 #define TCP_RTO_MIN	((unsigned)(HZ/5))
 #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
@@ -251,6 +258,9 @@ extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
+extern int sysctl_tcp_delack_segs;
+extern int sysctl_tcp_delack_min;
+extern int sysctl_tcp_delack_max;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1557,6 +1567,10 @@ static inline struct tcp_extend_values *tcp_xv(struct request_values *rvp)
 {
 	return (struct tcp_extend_values *)rvp;
 }
+static inline int tcp_snd_thresh(struct sock *sk)
+{
+	return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
+}
 
 extern void tcp_v4_init(void);
 extern void tcp_init(void);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 69fd720..c22c4c5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -639,6 +639,27 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler   = proc_dointvec
 	},
 	{
+		.procname	= "tcp_delack_segs",
+		.data		= &sysctl_tcp_delack_segs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_delack_min",
+		.data		= &sysctl_tcp_delack_min,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies
+	},
+	{
+		.procname	= "tcp_delack_max",
+		.data		= &sysctl_tcp_delack_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34f5db1..0aad29b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1204,8 +1204,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		   /* Delayed ACKs frequently hit locked sockets during bulk
 		    * receive. */
 		if (icsk->icsk_ack.blocked ||
-		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
-		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
+		    /* More than once-per-tcp_delack_segs-segments ACK
+		     * was not sent by tcp_input.c */
+		    tp->rcv_nxt - tp->rcv_wup > tcp_snd_thresh(sk) ||
 		    /*
 		     * If this read emptied read buffer, we send ACK, if
 		     * connection is not bidirectional, user drained
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 52b5c2d..1e02a80 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,6 +98,9 @@ int sysctl_tcp_thin_dupack __read_mostly;
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
 
+int sysctl_tcp_delack_segs __read_mostly = 1;
+
+
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
@@ -4993,8 +4996,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	    /* More than one full frame received... */
-	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+	    /* More than tcp_delack_segs full frame(s) received... */
+	if (((tp->rcv_nxt - tp->rcv_wup) > tcp_snd_thresh(sk) &&
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise). Or...
 	      */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 980b98f..0ec31af 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -63,6 +63,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+int sysctl_tcp_delack_min __read_mostly = TCP_DELACK_MIN_DEFAULT;
+int sysctl_tcp_delack_max __read_mostly = TCP_DELACK_MAX_DEFAULT;
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -2685,7 +2687,7 @@ void tcp_send_delayed_ack(struct sock *sk)
 		 * directly.
 		 */
 		if (tp->srtt) {
-			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
+			int rtt = max_t(unsigned, tp->srtt >> 3, TCP_DELACK_MIN);
 
 			if (rtt < max_ato)
 				max_ato = rtt;
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [RFC] tcp: Export TCP Delayed ACK parameters to user
  2011-10-27 23:07 [RFC] tcp: Export TCP Delayed ACK parameters to user Daniel Baluta
@ 2011-10-28  0:01 ` Eric Dumazet
  2011-10-28  8:01   ` Daniel Baluta
  2011-10-28 21:14 ` [RFC v2] " Daniel Baluta
  1 sibling, 1 reply; 17+ messages in thread
From: Eric Dumazet @ 2011-10-28  0:01 UTC (permalink / raw)
  To: Daniel Baluta; +Cc: davem, kuznet, jmorris, yoshfuji, kaber, netdev

Le vendredi 28 octobre 2011 à 02:07 +0300, Daniel Baluta a écrit :
> RFC2581 ($4.2) specifies when an ACK should be generated as follows:
> 
> " .. an ACK SHOULD be generated for at least every second
>   full-sized segment, and MUST be generated within 500 ms
>   of the arrival of the first unacknowledged packet.
> "
> 
> We export the number of segments and the timeout limits
> specified above, so that a user can tune them according
> to its needs.
> 

Well, this requires that the user has exclusive use of the machine :)

> Specifically:
> 	* /proc/sys/net/ipv4/tcp_delack_segs, represents
> 	the threshold for the number of segments.
> 	* /proc/sys/net/ipv4/tcp_delack_min, specifies
> 	the minimum timeout value
> 	* /proc/sys/net/ipv4/tcp_delack_max, specifies
> 	the maximum timeout value.
> 


> Signed-off-by: Daniel Baluta <dbaluta@ixiacom.com>
> ---
>  include/net/tcp.h          |   20 +++++++++++++++++---
>  net/ipv4/sysctl_net_ipv4.c |   21 +++++++++++++++++++++
>  net/ipv4/tcp.c             |    5 +++--
>  net/ipv4/tcp_input.c       |    7 +++++--
>  net/ipv4/tcp_output.c      |    4 +++-
>  5 files changed, 49 insertions(+), 8 deletions(-)
> 

Missing Documentation changes

> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index e147f42..f3b0c17 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -111,14 +111,21 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
>  				  * TIME-WAIT timer.
>  				  */
>  
> -#define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
> +/* default maximum time to delay before sending an ACK */
> +#define TCP_DELACK_MAX_DEFAULT	((unsigned)(HZ/5))
> +
>  #if HZ >= 100
> -#define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
> +/* default minimum time to delay before sending an ACK */
> +#define TCP_DELACK_MIN_DEFAULT	((unsigned)(HZ/25))
>  #define TCP_ATO_MIN	((unsigned)(HZ/25))
>  #else
> -#define TCP_DELACK_MIN	4U
> +#define TCP_DELACK_MIN_DEFAULT	4U
>  #define TCP_ATO_MIN	4U
>  #endif
> +
> +#define TCP_DELACK_MIN sysctl_tcp_delack_min
> +#define TCP_DELACK_MAX sysctl_tcp_delack_max

Hmm, please try to compile dccp as a module :)

You need some EXPORT_SYMBOL() definitions.

Frankly, I suggest removing TCP_DELACK_{MIN|MAX} to avoid an unnecessary
layer, and using sysctl_tcp_delack_{min|max} directly instead.


> +
>  #define TCP_RTO_MAX	((unsigned)(120*HZ))
>  #define TCP_RTO_MIN	((unsigned)(HZ/5))
>  #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
> @@ -251,6 +258,9 @@ extern int sysctl_tcp_max_ssthresh;
>  extern int sysctl_tcp_cookie_size;
>  extern int sysctl_tcp_thin_linear_timeouts;
>  extern int sysctl_tcp_thin_dupack;
> +extern int sysctl_tcp_delack_segs;
> +extern int sysctl_tcp_delack_min;
> +extern int sysctl_tcp_delack_max;
>  
>  extern atomic_long_t tcp_memory_allocated;
>  extern struct percpu_counter tcp_sockets_allocated;
> @@ -1557,6 +1567,10 @@ static inline struct tcp_extend_values *tcp_xv(struct request_values *rvp)
>  {
>  	return (struct tcp_extend_values *)rvp;
>  }


> +static inline int tcp_snd_thresh(struct sock *sk)

I am not sure the name is properly chosen; it's about delack, isn't it?

const struct sock *sk

> +{
> +	return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
> +}
>  

Thanks !

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28  0:01 ` Eric Dumazet
@ 2011-10-28  8:01   ` Daniel Baluta
  2011-10-28  8:44     ` Eric Dumazet
  0 siblings, 1 reply; 17+ messages in thread
From: Daniel Baluta @ 2011-10-28  8:01 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, kuznet, jmorris, yoshfuji, kaber, netdev

On Fri, Oct 28, 2011 at 3:01 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le vendredi 28 octobre 2011 à 02:07 +0300, Daniel Baluta a écrit :
>> RFC2581 ($4.2) specifies when an ACK should be generated as follows:
>>
>> " .. an ACK SHOULD be generated for at least every second
>>   full-sized segment, and MUST be generated within 500 ms
>>   of the arrival of the first unacknowledged packet.
>> "
>>
>> We export the number of segments and the timeout limits
>> specified above, so that a user can tune them according
>> to its needs.
>>
>
> Well, this requires user has a machine exclusive use :)

So, this means that setting parameters system wide
isn't an option?

On Windows there is a global setting TcpAckFrequency [1],
which is similar to our tcp_delack_{min,max}.

On Solaris there is a global option tcp_deferred_acks_max [2],
which is similar to our tcp_delack_segs.

Thanks for your comments, I will post an updated patch asap.

Daniel.

[1] http://support.microsoft.com/kb/328890
[2] http://www.sean.de/Solaris/soltune.html

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28  8:01   ` Daniel Baluta
@ 2011-10-28  8:44     ` Eric Dumazet
  2011-10-28 16:38       ` Rick Jones
  0 siblings, 1 reply; 17+ messages in thread
From: Eric Dumazet @ 2011-10-28  8:44 UTC (permalink / raw)
  To: Daniel Baluta; +Cc: davem, kuznet, jmorris, yoshfuji, kaber, netdev

Le vendredi 28 octobre 2011 à 11:01 +0300, Daniel Baluta a écrit :
> On Fri, Oct 28, 2011 at 3:01 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Le vendredi 28 octobre 2011 à 02:07 +0300, Daniel Baluta a écrit :
> >> RFC2581 ($4.2) specifies when an ACK should be generated as follows:
> >>
> >> " .. an ACK SHOULD be generated for at least every second
> >>   full-sized segment, and MUST be generated within 500 ms
> >>   of the arrival of the first unacknowledged packet.
> >> "
> >>
> >> We export the number of segments and the timeout limits
> >> specified above, so that a user can tune them according
> >> to its needs.
> >>
> >
> > Well, this requires user has a machine exclusive use :)
> 
> So, this means that setting parameters system wide
> isn't an option?
> 

It is a first step, but we can notice a global setting might please one
application but negatively impact other applications.

I guess some users will want a per socket option, but this can come
later. Another idea to save space on socket structures would be to
select two sets of values depending on TOS/TCLASS.

I can imagine ssh (lowdelay) and scp (throughput) wanting different
behavior here.

> On Windows there is a global setting TcpAckFrequency [1],
> which is similar with our tcp_delack_{min,max}.
> 
> On Solaris there is a global option tcp_deferred_acks_max [2],
> which is similar with our tcp_delack_segs.
> 

and also has tcp_deferred_ack_interval

> Thanks for your comments, I will post an updated patch asap.
> 
> Daniel.
> 
> [1] http://support.microsoft.com/kb/328890
> [2] http://www.sean.de/Solaris/soltune.html

Don't forget to CC Andy Lutomirski <luto@amacapital.net>, he might be
interested in being part of the process.

Thanks

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28  8:44     ` Eric Dumazet
@ 2011-10-28 16:38       ` Rick Jones
  0 siblings, 0 replies; 17+ messages in thread
From: Rick Jones @ 2011-10-28 16:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Daniel Baluta, davem, kuznet, jmorris, yoshfuji, kaber, netdev

>> On Solaris there is a global option tcp_deferred_acks_max [2],
>> which is similar with our tcp_delack_segs.
>>
>
> and also has tcp_deferred_ack_interval

And those have similar settings in HP-UX 11.X.

For the sake of completeness, the ACK avoidance heuristic in HP-UX, and 
I presume Solaris (as they share a common "Mentat" heritage) includes a 
mechanism to reduce the per-connection effective number of segments per 
ACKnowledgement.  I believe this is done to handle cases where the 
sender may have reduced her cwnd.  That would have deployment going back 
to 1997 in the case of HP-UX 11.0, and presumably a few years before 
that in the case of Solaris.  That mechanism in their ACK avoidance 
heuristics may be the reason neither have gone so far as to make the 
settings per-route or per-connection (though I could be wrong).  I 
believe that Solaris does though have two deferred ACK limits - one for 
perceived to be local connections and one (lower) for perceived to be 
remote connections.

There can be "fun" interactions with senders which increase cwnd per ACK 
rather than per bytes ACKed.

Still, I myself am somewhat fond of ACK avoidance heuristics.

rick jones

PS - when discussing the performance benefits of an ACK avoidance 
heuristic, feel free to use netperf and service demand numbers :)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-27 23:07 [RFC] tcp: Export TCP Delayed ACK parameters to user Daniel Baluta
  2011-10-28  0:01 ` Eric Dumazet
@ 2011-10-28 21:14 ` Daniel Baluta
  2011-10-28 21:19   ` David Miller
  2011-10-28 21:53   ` Andy Lutomirski
  1 sibling, 2 replies; 17+ messages in thread
From: Daniel Baluta @ 2011-10-28 21:14 UTC (permalink / raw)
  To: davem, eric.dumazet
  Cc: kuznet, jmorris, yoshfuji, kaber, netdev, luto, rick.jones2,
	Daniel Baluta

RFC2581 ($4.2) specifies when an ACK should be generated as follows:

" .. an ACK SHOULD be generated for at least every second
  full-sized segment, and MUST be generated within 500 ms
  of the arrival of the first unacknowledged packet.
"

We export the number of segments and the timeout limits
specified above, so that a user can tune them according
to its needs.

Specifically:
	* /proc/sys/net/ipv4/tcp_delack_segs, represents
	the threshold for the number of segments.
	* /proc/sys/net/ipv4/tcp_delack_min, specifies
	the minimum timeout value
	* /proc/sys/net/ipv4/tcp_delack_max, specifies
	the maximum timeout value.

Signed-off-by: Daniel Baluta <dbaluta@ixiacom.com>
---
Changes since v1:
	* added documentation for newly introduced /proc entries.
	* exported symbols sysctl_tcp_delack_{min|max}.
	* removed TCP_DELACK_{MIN|MAX} and used
	sysctl_tcp_delack_{min|max} directly.
	* renamed tcp_snd_thresh to tcp_delack_thresh.
	* added const qualifier to struct sock *sk.
---
 Documentation/networking/ip-sysctl.txt |   13 +++++++++++++
 include/net/tcp.h                      |   18 +++++++++++++++---
 net/dccp/output.c                      |    2 +-
 net/dccp/timer.c                       |    2 +-
 net/ipv4/sysctl_net_ipv4.c             |   21 +++++++++++++++++++++
 net/ipv4/tcp.c                         |    5 +++--
 net/ipv4/tcp_input.c                   |    8 +++++---
 net/ipv4/tcp_output.c                  |   13 +++++++++----
 net/ipv4/tcp_timer.c                   |    3 ++-
 9 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index cb7f314..efbd1b4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -524,6 +524,19 @@ tcp_thin_dupack - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
+tcp_delack_segs - INTEGER
+	Sets the number of full-sized TCP segments that must be
+	exceeded by received data before an ACK should be sent.
+	Default: 1 (as specified in RFC2581, S4.2)
+
+tcp_delack_min - INTEGER
+	Sets the minimum time (in milliseconds) to delay before sending an ACK.
+	Default: 40ms
+
+tcp_delack_max - INTEGER
+	Sets the maximum time (in milliseconds) to delay before sending an ACK.
+	Default: 200ms
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e147f42..9e29a9d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -111,14 +111,18 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 				  * TIME-WAIT timer.
 				  */
 
-#define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
+/* default maximum time to delay before sending an ACK */
+#define TCP_DELACK_MAX_DEFAULT	((unsigned)(HZ/5))
+
 #if HZ >= 100
-#define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
+/* default minimum time to delay before sending an ACK */
+#define TCP_DELACK_MIN_DEFAULT	((unsigned)(HZ/25))
 #define TCP_ATO_MIN	((unsigned)(HZ/25))
 #else
-#define TCP_DELACK_MIN	4U
+#define TCP_DELACK_MIN_DEFAULT	4U
 #define TCP_ATO_MIN	4U
 #endif
+
 #define TCP_RTO_MAX	((unsigned)(120*HZ))
 #define TCP_RTO_MIN	((unsigned)(HZ/5))
 #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
@@ -251,6 +255,9 @@ extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
+extern int sysctl_tcp_delack_segs;
+extern int sysctl_tcp_delack_min;
+extern int sysctl_tcp_delack_max;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1558,6 +1565,11 @@ static inline struct tcp_extend_values *tcp_xv(struct request_values *rvp)
 	return (struct tcp_extend_values *)rvp;
 }
 
+static inline int tcp_delack_thresh(const struct sock *sk)
+{
+	return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
+}
+
 extern void tcp_v4_init(void);
 extern void tcp_init(void);
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index dede3ed..9b5b0c4 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -577,7 +577,7 @@ void dccp_send_ack(struct sock *sk)
 			inet_csk_schedule_ack(sk);
 			inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-						  TCP_DELACK_MAX,
+						  sysctl_tcp_delack_max,
 						  DCCP_RTO_MAX);
 			return;
 		}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 7587870..7bae11e 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -202,7 +202,7 @@ static void dccp_delack_timer(unsigned long data)
 		icsk->icsk_ack.blocked = 1;
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
 		sk_reset_timer(sk, &icsk->icsk_delack_timer,
-			       jiffies + TCP_DELACK_MIN);
+			       jiffies + sysctl_tcp_delack_min);
 		goto out;
 	}
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 69fd720..c22c4c5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -639,6 +639,27 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler   = proc_dointvec
 	},
 	{
+		.procname	= "tcp_delack_segs",
+		.data		= &sysctl_tcp_delack_segs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_delack_min",
+		.data		= &sysctl_tcp_delack_min,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies
+	},
+	{
+		.procname	= "tcp_delack_max",
+		.data		= &sysctl_tcp_delack_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34f5db1..731e284 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1204,8 +1204,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		   /* Delayed ACKs frequently hit locked sockets during bulk
 		    * receive. */
 		if (icsk->icsk_ack.blocked ||
-		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
-		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
+		    /* More than once-per-tcp_delack_segs-segments ACK
+		     * was not sent by tcp_input.c */
+		    tp->rcv_nxt - tp->rcv_wup > tcp_delack_thresh(sk) ||
 		    /*
 		     * If this read emptied read buffer, we send ACK, if
 		     * connection is not bidirectional, user drained
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 52b5c2d..f2893a9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,6 +98,8 @@ int sysctl_tcp_thin_dupack __read_mostly;
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
 
+int sysctl_tcp_delack_segs __read_mostly = 1;
+
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
@@ -4993,8 +4995,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	    /* More than one full frame received... */
-	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+	    /* More than tcp_delack_segs full frame(s) received... */
+	if (((tp->rcv_nxt - tp->rcv_wup) > tcp_delack_thresh(sk) &&
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise). Or...
 	      */
@@ -5689,7 +5691,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			tcp_incr_quickack(sk);
 			tcp_enter_quickack_mode(sk);
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-						  TCP_DELACK_MAX, TCP_RTO_MAX);
+						  sysctl_tcp_delack_max, TCP_RTO_MAX);
 
 discard:
 			__kfree_skb(skb);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 980b98f..f4e7614 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -63,6 +63,11 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+int sysctl_tcp_delack_min __read_mostly = TCP_DELACK_MIN_DEFAULT;
+EXPORT_SYMBOL(sysctl_tcp_delack_min);
+
+int sysctl_tcp_delack_max __read_mostly = TCP_DELACK_MAX_DEFAULT;
+EXPORT_SYMBOL(sysctl_tcp_delack_max);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -2670,13 +2675,13 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
-	if (ato > TCP_DELACK_MIN) {
+	if (ato > sysctl_tcp_delack_min) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
 
 		if (icsk->icsk_ack.pingpong ||
 		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
-			max_ato = TCP_DELACK_MAX;
+			max_ato = sysctl_tcp_delack_max;
 
 		/* Slow path, intersegment interval is "high". */
 
@@ -2685,7 +2690,7 @@ void tcp_send_delayed_ack(struct sock *sk)
 		 * directly.
 		 */
 		if (tp->srtt) {
-			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
+			int rtt = max_t(unsigned, tp->srtt >> 3, sysctl_tcp_delack_min);
 
 			if (rtt < max_ato)
 				max_ato = rtt;
@@ -2734,7 +2739,7 @@ void tcp_send_ack(struct sock *sk)
 		inet_csk_schedule_ack(sk);
 		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-					  TCP_DELACK_MAX, TCP_RTO_MAX);
+					  sysctl_tcp_delack_max, TCP_RTO_MAX);
 		return;
 	}
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 2e0f0af..1bdc1c4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -219,7 +219,8 @@ static void tcp_delack_timer(unsigned long data)
 		/* Try again later. */
 		icsk->icsk_ack.blocked = 1;
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
-		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
+		sk_reset_timer(sk, &icsk->icsk_delack_timer,
+			       jiffies + sysctl_tcp_delack_min);
 		goto out_unlock;
 	}
 
-- 
1.7.2.5

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28 21:14 ` [RFC v2] " Daniel Baluta
@ 2011-10-28 21:19   ` David Miller
  2011-10-28 21:35     ` Daniel Baluta
  2011-10-28 21:53   ` Andy Lutomirski
  1 sibling, 1 reply; 17+ messages in thread
From: David Miller @ 2011-10-28 21:19 UTC (permalink / raw)
  To: dbaluta
  Cc: eric.dumazet, kuznet, jmorris, yoshfuji, kaber, netdev, luto,
	rick.jones2

From: Daniel Baluta <dbaluta@ixiacom.com>
Date: Sat, 29 Oct 2011 00:14:03 +0300

> +static inline int tcp_delack_thresh(const struct sock *sk)
> +{
> +	return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
> +}
> +

Please turn this into a shift or something, you're adding a multiply
into a core code path.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28 21:19   ` David Miller
@ 2011-10-28 21:35     ` Daniel Baluta
  2011-10-28 22:31       ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Daniel Baluta @ 2011-10-28 21:35 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, kuznet, jmorris, yoshfuji, kaber, netdev, luto,
	rick.jones2

On Sat, Oct 29, 2011 at 12:19 AM, David Miller <davem@davemloft.net> wrote:
> From: Daniel Baluta <dbaluta@ixiacom.com>
> Date: Sat, 29 Oct 2011 00:14:03 +0300
>
>> +static inline int tcp_delack_thresh(const struct sock *sk)
>> +{
>> +     return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
>> +}
>> +
>
> Please turn this into a shift or something, you're adding a multiply
> into a core code path.

Is there any generic API to do this? Default case is not
affected since tcp_delack_segs is 1.

Daniel.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28 21:14 ` [RFC v2] " Daniel Baluta
  2011-10-28 21:19   ` David Miller
@ 2011-10-28 21:53   ` Andy Lutomirski
  1 sibling, 0 replies; 17+ messages in thread
From: Andy Lutomirski @ 2011-10-28 21:53 UTC (permalink / raw)
  To: Daniel Baluta
  Cc: davem, eric.dumazet, kuznet, jmorris, yoshfuji, kaber, netdev,
	rick.jones2

On Fri, Oct 28, 2011 at 2:14 PM, Daniel Baluta <dbaluta@ixiacom.com> wrote:
> RFC2581 ($4.2) specifies when an ACK should be generated as follows:
>
> " .. an ACK SHOULD be generated for at least every second
>  full-sized segment, and MUST be generated within 500 ms
>  of the arrival of the first unacknowledged packet.
> "
>
> We export the number of segments and the timeout limits
> specified above, so that a user can tune them according
> to its needs.
>
> Specifically:
>        * /proc/sys/net/ipv4/tcp_delack_segs, represents
>        the threshold for the number of segments.
>        * /proc/sys/net/ipv4/tcp_delack_min, specifies
>        the minimum timeout value
>        * /proc/sys/net/ipv4/tcp_delack_max, specifies
>        the maximum timeout value.

This is neat, but IMO it should be per socket -- I (and possibly most
other people who would use it) want to do this kind of tuning per
flow, not per-route or per-interface.

--Andy

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28 21:35     ` Daniel Baluta
@ 2011-10-28 22:31       ` David Miller
  2011-10-28 22:40         ` Rick Jones
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2011-10-28 22:31 UTC (permalink / raw)
  To: dbaluta
  Cc: eric.dumazet, kuznet, jmorris, yoshfuji, kaber, netdev, luto,
	rick.jones2

From: Daniel Baluta <dbaluta@ixiacom.com>
Date: Sat, 29 Oct 2011 00:35:24 +0300

> On Sat, Oct 29, 2011 at 12:19 AM, David Miller <davem@davemloft.net> wrote:
>> From: Daniel Baluta <dbaluta@ixiacom.com>
>> Date: Sat, 29 Oct 2011 00:14:03 +0300
>>
>>> +static inline int tcp_delack_thresh(const struct sock *sk)
>>> +{
>>> +     return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
>>> +}
>>> +
>>
>> Please turn this into a shift or something, you're adding a multiply
>> into a core code path.
> 
> Is there any generic API to do this? Default case is not
> affected since tcp_delack_segs is 1.

I'm saying make the tunable a shift count instead of something to
multiply against.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28 22:31       ` David Miller
@ 2011-10-28 22:40         ` Rick Jones
  2011-10-29  2:24           ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Rick Jones @ 2011-10-28 22:40 UTC (permalink / raw)
  To: David Miller
  Cc: dbaluta, eric.dumazet, kuznet, jmorris, yoshfuji, kaber, netdev, luto

On 10/28/2011 03:31 PM, David Miller wrote:
> From: Daniel Baluta<dbaluta@ixiacom.com>
> Date: Sat, 29 Oct 2011 00:35:24 +0300
>
>> On Sat, Oct 29, 2011 at 12:19 AM, David Miller<davem@davemloft.net>  wrote:
>>> From: Daniel Baluta<dbaluta@ixiacom.com>
>>> Date: Sat, 29 Oct 2011 00:14:03 +0300
>>>
>>>> +static inline int tcp_delack_thresh(const struct sock *sk)
>>>> +{
>>>> +     return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
>>>> +}
>>>> +
>>>
>>> Please turn this into a shift or something, you're adding a multiply
>>> into a core code path.
>>
>> Is there any generic API to do this? Default case is not
>> affected since tcp_delack_segs is 1.
>
> I'm saying make the tunable a shift count instead of something to
> multiply against.

That would be loads faster, but won't that have issues with granularity? 
  It will allow 1, 2, 4, 8, 16, 32, etc segments but none of the umpteen 
values in between.  FWIW, HP-UX defaults to 22 segments, which IIRC has 
its basis in how many "typical" segments could fit in a 32KB window.

If the mss and the delack segs are being converted into an octet count, 
and multiplication or successive addition etc are too expensive, how 
about using an octet count directly?

rick jones

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-28 22:40         ` Rick Jones
@ 2011-10-29  2:24           ` David Miller
  2011-10-29 12:32             ` Daniel Baluta
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2011-10-29  2:24 UTC (permalink / raw)
  To: rick.jones2
  Cc: dbaluta, eric.dumazet, kuznet, jmorris, yoshfuji, kaber, netdev, luto

From: Rick Jones <rick.jones2@hp.com>
Date: Fri, 28 Oct 2011 15:40:24 -0700

> That would be loads faster, but won't that have issues with
> granularity?

Frankly, I don't care.

For an obscure feature I don't even like to begin with, I refuse
to allow a multiply into a core code path.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-29  2:24           ` David Miller
@ 2011-10-29 12:32             ` Daniel Baluta
  2011-10-30  4:13               ` David Miller
  2011-10-31 18:10               ` Rick Jones
  0 siblings, 2 replies; 17+ messages in thread
From: Daniel Baluta @ 2011-10-29 12:32 UTC (permalink / raw)
  To: David Miller
  Cc: rick.jones2, eric.dumazet, kuznet, jmorris, yoshfuji, kaber,
	netdev, luto

On Sat, Oct 29, 2011 at 5:24 AM, David Miller <davem@davemloft.net> wrote:
> From: Rick Jones <rick.jones2@hp.com>
> Date: Fri, 28 Oct 2011 15:40:24 -0700
>
>> That would be loads faster, but won't that have issues with
>> granularity?
>
> Frankly, I don't care.
>
> For an obscure feature I don't even like to begin with, I refuse
> to allow a multiply into a core code path.

I agree that there is no place for multiplication.

I think the best way to go right now is as follows:
* make TCP Delack params configurable per socket.
* count number of bytes instead of number of segments.

Although I would prefer to have tcp_delack_thresh
measured in number of segments instead of number
of bytes, I don't yet see a neat way to avoid
the multiplication (a shift count is not acceptable because it
limits the available number of segments).

Any comments?

thanks,
Daniel.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-29 12:32             ` Daniel Baluta
@ 2011-10-30  4:13               ` David Miller
  2011-10-31 18:10               ` Rick Jones
  1 sibling, 0 replies; 17+ messages in thread
From: David Miller @ 2011-10-30  4:13 UTC (permalink / raw)
  To: dbaluta
  Cc: rick.jones2, eric.dumazet, kuznet, jmorris, yoshfuji, kaber,
	netdev, luto

From: Daniel Baluta <dbaluta@ixiacom.com>
Date: Sat, 29 Oct 2011 15:32:25 +0300

> * count number of bytes instead of number of segments.

The standard way Linux TCP analyzes connection state is based upon
packets, not bytes.

I don't see any value for changing something so fundamental just
for the sake of avoiding the multiply in an obscure facility.

I asked you to make an incision less invasive, yet your proposal
here is more invasive.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-29 12:32             ` Daniel Baluta
  2011-10-30  4:13               ` David Miller
@ 2011-10-31 18:10               ` Rick Jones
  2011-10-31 20:02                 ` Daniel Baluta
  1 sibling, 1 reply; 17+ messages in thread
From: Rick Jones @ 2011-10-31 18:10 UTC (permalink / raw)
  To: Daniel Baluta
  Cc: David Miller, eric.dumazet, kuznet, jmorris, yoshfuji, kaber,
	netdev, luto

Whether tracked as bytes or segments, my take is that to ask 
applications to have to think about another non-portable socket option 
is ungood.  I would suggest taking the time to work-out the automagic 
heuristic to drop the deferred ACK count on connections where it being 
large is un-desirable and then not need to worry about the limits being 
global.

Given the stack's existing propensity to try to decide when to increase 
the window I might even go so far as to suggest the sense of the 
heuristic be flipped and it seek to decide when it is ok to increase the 
number of segments/bytes per ACK.  To what extent one needs to go beyond 
what happens already with the stretching of ACKs via GRO/LRO or if that 
mechanism can serve as part of the logic of the heuristic is probably a 
fertile area for discussion.

If I recall correctly, in one of your earlier posts you mentioned 
something about a 20% performance boost.  What were the specific 
conditions of that testing?  Was it over a setup where the receiver 
already had LRO/GRO or was it over a more plain receiver NIC without 
that functionality?

rick jones

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-31 18:10               ` Rick Jones
@ 2011-10-31 20:02                 ` Daniel Baluta
  2011-10-31 21:29                   ` Rick Jones
  0 siblings, 1 reply; 17+ messages in thread
From: Daniel Baluta @ 2011-10-31 20:02 UTC (permalink / raw)
  To: Rick Jones
  Cc: David Miller, eric.dumazet, kuznet, jmorris, yoshfuji, kaber,
	netdev, luto

On Mon, Oct 31, 2011 at 8:10 PM, Rick Jones <rick.jones2@hp.com> wrote:
> Whether tracked as bytes or segments, my take is that to ask applications to
> have to think about another non-portable socket option is ungood.  I would
> suggest taking the time to work-out the automagic heuristic to drop the
> deferred ACK count on connections where it being large is un-desirable and
> then not need to worry about the limits being global.

Your suggestion deserves further investigation, it looks tricky to
find a good heuristic for increasing/decreasing the ACK deferred count.

>
> Given the stack's existing propensity to try to decide when to increase the
> window I might even go so far as to suggest the sense of the heuristic be
> flipped and it seek to decide when it is ok to increase the number of
> segments/bytes per ACK.  To what extent one needs to go beyond what happens
> already with the stretching of ACKs via GRO/LRO or if that mechanism can
> serve as part of the logic of the heuristic is probably a fertile area for
> discussion.
>
> If I recall correctly, in one of your earlier posts you mentioned something
> about a 20% performance boost.  What were the specific conditions of that
> testing?  Was it over a setup where the receiver already had LRO/GRO or was
> it over a more plain receiver NIC without that functionality?

If I remember correctly on the receiver side there was no LRO/GRO, but we
tweaked some of /proc/sys/net/ipv4 parameters (e.g. tcp_rmem).
Also, the traffic was highly unidirectional with many clients feeding multimedia
content to a server.

Anyhow, we used our custom kernel which is an older kernel version.
Are there any recommended benchmarks/tools for testing this kind of parameters?

Daniel.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC v2] tcp: Export TCP Delayed ACK parameters to user
  2011-10-31 20:02                 ` Daniel Baluta
@ 2011-10-31 21:29                   ` Rick Jones
  0 siblings, 0 replies; 17+ messages in thread
From: Rick Jones @ 2011-10-31 21:29 UTC (permalink / raw)
  To: Daniel Baluta
  Cc: David Miller, eric.dumazet, kuznet, jmorris, yoshfuji, kaber,
	netdev, luto

On 10/31/2011 01:02 PM, Daniel Baluta wrote:
> On Mon, Oct 31, 2011 at 8:10 PM, Rick Jones<rick.jones2@hp.com>  wrote:
>> Whether tracked as bytes or segments, my take is that to ask applications to
>> have to think about another non-portable socket option is ungood.  I would
>> suggest taking the time to work-out the automagic heuristic to drop the
>> deferred ACK count on connections where it being large is un-desirable and
>> then not need to worry about the limits being global.
>
> Your suggestion deserves further investigation, it looks tricky to
> find a good heuristic for increasing/decreasing the ACK deferred count.

Well, presumably you can observe the behaviour of some HP-UX and/or 
Solaris receivers to get some ideas.

>> If I recall correctly, in one of your earlier posts you mentioned something
>> about a 20% performance boost.  What were the specific conditions of that
>> testing?  Was it over a setup where the receiver already had LRO/GRO or was
>> it over a more plain receiver NIC without that functionality?
>
> If I remember correctly on the receiver side there was no LRO/GRO, but we
> tweaked some of /proc/sys/net/ipv4 parameters (e.g. tcp_rmem).
> Also, the traffic was highly unidirectional with many clients feeding multimedia
> content to a server.
>
> Anyhow, we used our custom kernel which is an older kernel version.
> Are there any recommended benchmarks/tools for testing this kind of parameters?

Well, the last time I was tilting after the ACK avoidance windmill I 
used my favorite tool, netperf.  I believe I posted some HP-UX data 
showing the effect of different values of tcp_deferred_ack_max.  Both on 
throughput, and on CPU utilization/service demand.  Of course, I have 
something of a bias in that regard :)

rick jones

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2011-10-31 21:29 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-10-27 23:07 [RFC] tcp: Export TCP Delayed ACK parameters to user Daniel Baluta
2011-10-28  0:01 ` Eric Dumazet
2011-10-28  8:01   ` Daniel Baluta
2011-10-28  8:44     ` Eric Dumazet
2011-10-28 16:38       ` Rick Jones
2011-10-28 21:14 ` [RFC v2] " Daniel Baluta
2011-10-28 21:19   ` David Miller
2011-10-28 21:35     ` Daniel Baluta
2011-10-28 22:31       ` David Miller
2011-10-28 22:40         ` Rick Jones
2011-10-29  2:24           ` David Miller
2011-10-29 12:32             ` Daniel Baluta
2011-10-30  4:13               ` David Miller
2011-10-31 18:10               ` Rick Jones
2011-10-31 20:02                 ` Daniel Baluta
2011-10-31 21:29                   ` Rick Jones
2011-10-28 21:53   ` Andy Lutomirski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).