* high latency with TCP connections
@ 2006-08-30 10:07 Alexander Vodomerov
  2006-08-30 17:27 ` Stephen Hemminger
  2006-09-04 16:00 ` [PATCH][RFC] " Alexey Kuznetsov
  0 siblings, 2 replies; 43+ messages in thread
From: Alexander Vodomerov @ 2006-08-30 10:07 UTC (permalink / raw)
  To: netdev

Hello!

I'm writing an application that works over TCP. Total traffic is very
low (~10 kb/sec), but performance is very bad. I've tried to
investigate the problem with tcpdump and strace, and they show that the
application does multiple writes, but TCP buffers them and sends them
after some delay (about 40 msec). Due to the nature of my application,
it is essential to send any available data ASAP (decreased bandwidth is
not important). I've set the TCP_NODELAY option on the socket, but it
doesn't help.

We've written a simple program to reproduce the effect. It sends 10
small packets, then sleeps for 0.1 sec. Another node tries to receive
the data. Strace shows that 2 packets are sent immediately and the
other 8 are grouped together and delayed by 40 msec.

It is interesting that this effect can be seen not only on Ethernet
links, but also on loopback (with the same magic constant of 40 msec).

Here is a test run:
  server (should be run first):
$ ./a.out 1 5000
Server: begin send_all
Server: total time 14.216441
  client:
$ ./a.out 2 5000 localhost
Client: connected to localhost:5000
Client: begin receive_all
Client: total time 14.223265

The expected time is 10.0 sec (~100 sleeps of 0.1 sec each), instead of
the observed 14.0 sec. If packets are received more often (the DELIM
constant is set to 1 or 2), the effect disappears.

Is this the desired behaviour? How can I specify that packets should be
sent truly immediately after each write?

Some people reported that this program runs in 9.997 sec when run on
FreeBSD.

Please cc me on replies, as I'm not subscribed to the mailing list.

With best regards, 
   Alexander.

Listing follows:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <netdb.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <time.h>

int TOTAL_SENDS = 1000;
int DELIM = 10;

int sock = -1;

int init_server(int port)
{
	struct sockaddr_in sin;
	struct sockaddr_in new_sin;
	int new_sock;
	int val;
	int sockaddrlen;

	sock = socket(PF_INET, SOCK_STREAM, 0);
	if (sock == -1) return -1;

	val = 1;
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)) < 0) return -1;

	memset(&sin, 0, sizeof(struct sockaddr_in));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);

	if (-1 == bind(sock, (struct sockaddr*)&sin, sizeof(sin))) return -2;
	if (-1 == listen(sock, 1)) return -3;

	sockaddrlen = sizeof(struct sockaddr_in);
	new_sock = accept(sock, (struct sockaddr*)&new_sin, (socklen_t*)&sockaddrlen);
	if (new_sock == -1) return -4;

	sock = new_sock;
	val = 1;
	if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) != 0) return -5;
	return 0;
}

int init_client(char* hostname, int port)
{
	int val = 1;
	int res;
	struct hostent* he;
	struct sockaddr_in sin;

	sock = socket(PF_INET, SOCK_STREAM, 0);
	if (sock == -1) return -1;

	he = gethostbyname(hostname);
	if (he == NULL) return -2;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	memcpy(&sin.sin_addr, he->h_addr, sizeof(sin.sin_addr));

	if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) != 0) return -3;

	res = connect(sock, (struct sockaddr*)&sin, sizeof(sin));
	if (res == -1) return -4;
	printf("Client: connected to %s:%d\n", hostname, port);
	return 0;
}

void send_all(unsigned long delay)
{
	int i;
	char buf[1024] = { 0 };	/* payload content is irrelevant here */
	printf("Server: begin send_all\n");
	for (i = 1; i < TOTAL_SENDS; ++i) {
		write(sock, buf, 1);
		if (i % DELIM == 0) read(sock, buf, 1);	/* wait for the peer's reply byte */
		if (i % 10 == 0) usleep(delay);		/* pause before the next batch of data */
	}
}

void receive_all(unsigned long delay)
{
	int i;
	char buf[1024];
	printf("Client: begin receive_all\n");
	for (i = 1; i < TOTAL_SENDS; ++i) {
		read(sock, buf, 1);
		if (i % DELIM == 0) write(sock, buf, 1);	/* reply byte every DELIM-th packet */
		if (i % 10 == 0) usleep(delay);
	}
}

int main(int argc, char* argv[])
{
	int port;
	char* host;
	int me;
	struct timeval tv1, tv2;
	double tt;

	assert(argc > 2);
	me = atoi(argv[1]);
	switch (me) {
		case 1:
			port = atoi(argv[2]);
			if (init_server(port)) {
				printf("Server initialization failed!\n");
				return 1;
			}
			gettimeofday(&tv1, 0);
			send_all(100000);
			gettimeofday(&tv2, 0);
			tt = tv2.tv_sec - tv1.tv_sec + (tv2.tv_usec - tv1.tv_usec) * 0.000001;
			printf("Server: total time %f\n", tt);
			break;
		case 2:
			assert(argc == 4);
			port = atoi(argv[2]);
			host = argv[3];
			if (init_client(host, port)) {
				printf("Client initialization failed!\n");
				return 1;
			}
			gettimeofday(&tv1, 0);
			receive_all(100000);
			gettimeofday(&tv2, 0);
			tt = tv2.tv_sec - tv1.tv_sec + (tv2.tv_usec - tv1.tv_usec) * 0.000001;
			printf("Client: total time %f\n", tt);

			break;
		default:
			printf("Wrong parameter\n");
			return 1;
	}
	shutdown(sock, SHUT_RDWR);
	close(sock);
	return 0;
}
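
The program builds with plain gcc, no special flags needed (any file
name works; test.c here is arbitrary):

$ gcc -O2 -o a.out test.c
$ ./a.out 1 5000 &            # server
$ ./a.out 2 5000 localhost    # client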



* Re: high latency with TCP connections
  2006-08-30 10:07 high latency with TCP connections Alexander Vodomerov
@ 2006-08-30 17:27 ` Stephen Hemminger
  2006-08-30 21:39   ` David Miller
  2006-08-31 21:46   ` Alexey Kuznetsov
  2006-09-04 16:00 ` [PATCH][RFC] " Alexey Kuznetsov
  1 sibling, 2 replies; 43+ messages in thread
From: Stephen Hemminger @ 2006-08-30 17:27 UTC (permalink / raw)
  To: Alexander Vodomerov; +Cc: netdev

On Wed, 30 Aug 2006 14:07:34 +0400
Alexander Vodomerov <alex@sectorb.msk.ru> wrote:

> Hello!
> 
> I'm writing an application that works over TCP. Total traffic is very
> low (~10 kb/sec), but performance is very bad. I've tried to
> investigate the problem with tcpdump and strace, and they show that the
> application does multiple writes, but TCP buffers them and sends them
> after some delay (about 40 msec). Due to the nature of my application,
> it is essential to send any available data ASAP (decreased bandwidth is
> not important). I've set the TCP_NODELAY option on the socket, but it
> doesn't help.

Linux TCP implements "Appropriate Byte Count" (ABC, RFC 3465), and this
penalizes applications that do small sends. The problem is that the
other side may be delaying acknowledgments.  If the receiver doesn't
acknowledge, the sender will limit itself to the congestion window. If
the flow is light, then you will be limited to 4 packets.
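
The knob for this is the tcp_abc sysctl (documented in
Documentation/networking/ip-sysctl.txt); setting it to 0 should disable
the byte-counting behaviour if you want to test that theory:

# sysctl -w net.ipv4.tcp_abc=0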

> We've written a simple program to reproduce the effect. It sends 10
> small packets, then sleeps for 0.1 sec. Another node tries to receive
> the data. Strace shows that 2 packets are sent immediately and the
> other 8 are grouped together and delayed by 40 msec.
> 
> It is interesting that this effect can be seen not only on Ethernet
> links, but also on loopback (with the same magic constant of 40 msec).
> 
> Here is a test run:
>   server (should be run first):
> $ ./a.out 1 5000
> Server: begin send_all
> Server: total time 14.216441
>   client:
> $ ./a.out 2 5000 localhost
> Client: connected to localhost:5000
> Client: begin receive_all
> Client: total time 14.223265
> 
> The expected time is 10.0 sec, instead of the observed 14.0 sec.
> If packets are received more often (the DELIM constant is set to 1
> or 2), the effect disappears.
> 
> Is this the desired behaviour? How can I specify that packets should
> be sent truly immediately after each write?
> 
> Some people reported that this program runs in 9.997 sec when run on
> FreeBSD.
> 
> Please cc me on replies, as I'm not subscribed to mailing list.
> 
> With best regards, 
>    Alexander.
> 

> 
> void send_all(unsigned long delay)
> {
> 	int i;
> 	char buf[1024];
> 	printf("Server: begin send_all\n");
> 	for (i = 1; i < TOTAL_SENDS; ++i) {
>                 write(sock, buf, 1);

Expecting any performance with one-byte writes is silly.


* Re: high latency with TCP connections
  2006-08-30 17:27 ` Stephen Hemminger
@ 2006-08-30 21:39   ` David Miller
  2006-08-30 22:04     ` Stephen Hemminger
                       ` (3 more replies)
  2006-08-31 21:46   ` Alexey Kuznetsov
  1 sibling, 4 replies; 43+ messages in thread
From: David Miller @ 2006-08-30 21:39 UTC (permalink / raw)
  To: shemminger; +Cc: alex, netdev

From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 30 Aug 2006 10:27:27 -0700

> Linux TCP implements "Appropriate Byte Count" (ABC, RFC 3465), and this
> penalizes applications that do small sends. The problem is that the
> other side may be delaying acknowledgments.  If the receiver doesn't
> acknowledge, the sender will limit itself to the congestion window. If
> the flow is light, then you will be limited to 4 packets.

Right.

However, it occurred to me the other day that ABC could be made smarter.
If we sent small frames, ABC should account for that.

The problem with ABC is that it prevents CWND growth not just during
ACK division, but also when we truly are sending smaller-sized frames.

In fact, for "chatty" protocols, the real load on a router from small
packets is much less than that from full-sized frames.  So it is
in fact these small-frame sending cases for which we can be less
conservative, whatever that means here.

So my suggestion is that ABC should look in the retransmit queue
and see how many "real packets" are being fully ACK'd, rather than
assuming the send queue is composed of MSS-sized frames.
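
Roughly something like this (untested sketch; pkts_acked would then feed
the cwnd growth decision instead of a bytes_acked / mss_cache estimate):

	/* Count whole skbs released by this ACK, instead of assuming
	 * the retransmit queue held MSS-sized frames.
	 */
	int pkts_acked = 0;
	struct sk_buff *skb;

	skb_queue_walk(&sk->sk_write_queue, skb) {
		if (skb == sk->sk_send_head ||
		    after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			break;
		pkts_acked++;
	}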

I also think we should seriously consider changing the ABC default to
"2" rather than "1".

> Expecting any performance with one-byte writes is silly.

This is absolutely true.  TCP_NODELAY can only save you when you are
sending a small amount of data "in aggregate", such as in an SSH or
telnet session, whereas in the case shown here a large amount of
data is being sent in small chunks, which will always get bad
performance.



* Re: high latency with TCP connections
  2006-08-30 21:39   ` David Miller
@ 2006-08-30 22:04     ` Stephen Hemminger
  2006-08-30 23:00     ` Rick Jones
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2006-08-30 22:04 UTC (permalink / raw)
  To: David Miller; +Cc: alex, netdev

On Wed, 30 Aug 2006 14:39:55 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@osdl.org>
> Date: Wed, 30 Aug 2006 10:27:27 -0700
> 
> > Linux TCP implements "Appropriate Byte Count" (ABC, RFC 3465), and this
> > penalizes applications that do small sends. The problem is that the
> > other side may be delaying acknowledgments.  If the receiver doesn't
> > acknowledge, the sender will limit itself to the congestion window. If
> > the flow is light, then you will be limited to 4 packets.
> 
> Right.
> 
> However it occured to me the other day that ABC could be made smarter.
> If we sent small frames, ABC should account for that.
> 
> The problem with ABC is that it prevents CWND growth not just during
> ACK division, but also when we truly are sending smaller sized frames.
> 
> In fact, for "chatty" protocols, the real load on a router for the
> small packets is much less than that for full sized frames.  So it is
> in fact these small frame sending cases for which we can be less
> conservative, whatever that means here.
> 
> So my suggestion is that ABC should go look in the retransmit queue
> and see how many "real packets" are being fully ACK'd, rather than
> assuming the send queue is composed of MSS sized frames.
> 
> I also think we should seriously consider changing the ABC default to
> "2" rather than "1".

That would be a good simple first step. It can't hurt and seems
reasonable.

-- 
Stephen Hemminger <shemminger@osdl.org>


* Re: high latency with TCP connections
  2006-08-30 21:39   ` David Miller
  2006-08-30 22:04     ` Stephen Hemminger
@ 2006-08-30 23:00     ` Rick Jones
  2006-08-31  8:14     ` Alexander Vodomerov
  2006-08-31 18:22     ` Kelly Burkhart
  3 siblings, 0 replies; 43+ messages in thread
From: Rick Jones @ 2006-08-30 23:00 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, alex, netdev

David Miller wrote:
> From: Stephen Hemminger <shemminger@osdl.org>
> Date: Wed, 30 Aug 2006 10:27:27 -0700
> 
> 
>>Linux TCP implements "Appropriate Byte Count" (ABC, RFC 3465), and this
>>penalizes applications that do small sends. The problem is that the
>>other side may be delaying acknowledgments.  If the receiver doesn't
>>acknowledge, the sender will limit itself to the congestion window. If
>>the flow is light, then you will be limited to 4 packets.
> 
> 
> Right.
> 
> However it occured to me the other day that ABC could be made smarter.
> If we sent small frames, ABC should account for that.

Is that part of applying a byte-based RFC to a packet-counting cwnd?

rick jones


* Re: high latency with TCP connections
  2006-08-30 21:39   ` David Miller
  2006-08-30 22:04     ` Stephen Hemminger
  2006-08-30 23:00     ` Rick Jones
@ 2006-08-31  8:14     ` Alexander Vodomerov
  2006-08-31 15:44       ` Sridhar Samudrala
  2006-08-31 18:22     ` Kelly Burkhart
  3 siblings, 1 reply; 43+ messages in thread
From: Alexander Vodomerov @ 2006-08-31  8:14 UTC (permalink / raw)
  To: David Miller, netdev

On Wed, Aug 30, 2006 at 02:39:55PM -0700, David Miller wrote:
> > Expecting any performance with one-byte writes is silly.
> 
> This is absolutely true.  TCP_NODELAY can only save you when you are
> sending a small amount of data "in aggregate", such as in an SSH or
> telnet session, whereas in the case being shown here a large amount of
> data is being sent in small chunks which will always get bad
> performance.

Information is sent with one-byte writes because it is not available at
the moment of sending (it may be read from a hardware device or from the
user). If I change 1 to 10 or 100, nothing changes. I'm afraid there is
a bit of misunderstanding here. Only a very small amount of data is
being sent over the network. The total traffic in the example I sent is
only 10 bytes/s. After every 10th packet the program does usleep(100000)
to simulate a pause before the next data becomes available.

There are really 3 factors:
1) the total amount of data is small;
2) the data to transfer arrives in small portions from an external
source;
3) it is very important that each portion be delivered to the receiver
as soon as possible.
Is TCP a good choice for such a transfer, or is some other protocol
better suited?

With best regards,
   Alexander.


* Re: high latency with TCP connections
  2006-08-31  8:14     ` Alexander Vodomerov
@ 2006-08-31 15:44       ` Sridhar Samudrala
  0 siblings, 0 replies; 43+ messages in thread
From: Sridhar Samudrala @ 2006-08-31 15:44 UTC (permalink / raw)
  To: Alexander Vodomerov; +Cc: David Miller, netdev

Alexander Vodomerov wrote:
> On Wed, Aug 30, 2006 at 02:39:55PM -0700, David Miller wrote:
>   
>>> Expecting any performance with one-byte writes is silly.
>>>       
>> This is absolutely true.  TCP_NODELAY can only save you when you are
>> sending a small amount of data "in aggregate", such as in an SSH or
>> telnet session, whereas in the case being shown here a large amount of
>> data is being sent in small chunks which will always get bad
>> performance.
>>     
>
> Information is sent with one-byte writes because it is not available at
> the moment of sending (it may be read from a hardware device or from the
> user). If I change 1 to 10 or 100, nothing changes. I'm afraid there is
> a bit of misunderstanding here. Only a very small amount of data is
> being sent over the network. The total traffic in the example I sent is
> only 10 bytes/s. After every 10th packet the program does usleep(100000)
> to simulate a pause before the next data becomes available.
>
> There are really 3 factors:
> 1) the total amount of data is small;
> 2) the data to transfer arrives in small portions from an external
> source;
> 3) it is very important that each portion be delivered to the receiver
> as soon as possible.
> Is TCP a good choice for such a transfer, or is some other protocol
> better suited?
>   
If message boundary preservation is a useful feature for your app, you 
could try SCTP.
You should be able to do this by replacing IPPROTO_TCP with IPPROTO_SCTP and
TCP_NODELAY with SCTP_NODELAY.
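
An untested sketch (assumes the lksctp-tools headers, which define
SCTP_NODELAY; SOCK_STREAM selects the one-to-one API, so the rest of
the test program can stay unchanged):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

int init_sctp_socket(void)
{
	int val = 1;
	/* One-to-one style SCTP socket, a drop-in replacement for TCP. */
	int s = socket(PF_INET, SOCK_STREAM, IPPROTO_SCTP);
	if (s == -1) return -1;
	/* Analogous to TCP_NODELAY: don't delay bundling of user messages. */
	if (setsockopt(s, IPPROTO_SCTP, SCTP_NODELAY, &val, sizeof(val)) < 0)
		return -2;
	return s;
}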

Thanks
Sridhar
> With best regards,
>    Alexander.




* Re: high latency with TCP connections
  2006-08-30 21:39   ` David Miller
                       ` (2 preceding siblings ...)
  2006-08-31  8:14     ` Alexander Vodomerov
@ 2006-08-31 18:22     ` Kelly Burkhart
  2006-08-31 19:40       ` Rick Jones
  2006-08-31 21:08       ` Ian McDonald
  3 siblings, 2 replies; 43+ messages in thread
From: Kelly Burkhart @ 2006-08-31 18:22 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, alex, netdev

On 8/30/06, David Miller <davem@davemloft.net> wrote:
> From: Stephen Hemminger <shemminger@osdl.org>
> > Expecting any performance with one-byte writes is silly.
>
> This is absolutely true.  TCP_NODELAY can only save you when you are
> sending a small amount of data "in aggregate", such as in an SSH or
> telnet session, whereas in the case being shown here a large amount of
> data is being sent in small chunks which will always get bad
> performance.


The word "performance" on this list seems to always mean "throughput".
It seems, though, that there could be some knob to tweak for those of
us who don't care so much about throughput but care a great deal about
latency.

-K


* Re: high latency with TCP connections
  2006-08-31 18:22     ` Kelly Burkhart
@ 2006-08-31 19:40       ` Rick Jones
  2006-08-31 21:08       ` Ian McDonald
  1 sibling, 0 replies; 43+ messages in thread
From: Rick Jones @ 2006-08-31 19:40 UTC (permalink / raw)
  To: Kelly Burkhart; +Cc: David Miller, shemminger, alex, netdev

Kelly Burkhart wrote:
> On 8/30/06, David Miller <davem@davemloft.net> wrote:
> 
>> From: Stephen Hemminger <shemminger@osdl.org>
>> > Expecting any performance with one-byte writes is silly.
>>
>> This is absolutely true.  TCP_NODELAY can only save you when you are
>> sending a small amount of data "in aggregate", such as in an SSH or
>> telnet session, whereas in the case being shown here a large amount of
>> data is being sent in small chunks which will always get bad
>> performance.
> 
> 
> 
> The word performance in this list seems to always mean 'throughput'.
> It seems though that there could be some knob to tweak for those of us
> who don't care so much about throughput but care a great deal about
> latency.

IIRC, apart from interactions with Nagle (TCP_NODELAY) or the mixing of
packet-based and byte-based congestion control and avoidance heuristics,
there really isn't much of anything else to tweak in TCP.  If it can
send data, it sends data.

Where there _is_ a knob to turn these days might be down in the
drivers and their NICs' use of interrupt coalescing:

ftp://ftp.cup.hp.com/dist/networking/briefs/nic_latency_vs_tput.txt
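
For example (support and exact behaviour are driver-dependent):

$ ethtool -c eth0              # show current coalescing settings
# ethtool -C eth0 rx-usecs 0   # trade CPU for latency: interrupt per packet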

rick jones


* Re: high latency with TCP connections
  2006-08-31 18:22     ` Kelly Burkhart
  2006-08-31 19:40       ` Rick Jones
@ 2006-08-31 21:08       ` Ian McDonald
  1 sibling, 0 replies; 43+ messages in thread
From: Ian McDonald @ 2006-08-31 21:08 UTC (permalink / raw)
  To: Kelly Burkhart; +Cc: David Miller, shemminger, alex, netdev

> The word performance in this list seems to always mean 'throughput'.
> It seems though that there could be some knob to tweak for those of us
> who don't care so much about throughput but care a great deal about
> latency.
>
SCTP has been mentioned. There is also DCCP - http://www.read.cs.ucla.edu/dccp/

-- 
Ian McDonald
Web: http://wand.net.nz/~iam4
Blog: http://imcdnzl.blogspot.com
WAND Network Research Group
Department of Computer Science
University of Waikato
New Zealand


* Re: high latency with TCP connections
  2006-08-30 17:27 ` Stephen Hemminger
  2006-08-30 21:39   ` David Miller
@ 2006-08-31 21:46   ` Alexey Kuznetsov
  2006-08-31 22:14     ` Stephen Hemminger
  1 sibling, 1 reply; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-08-31 21:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Alexander Vodomerov, netdev

Hello!

> > Expecting any performance with one-byte writes is silly.

I am not sure why you are so confident about the status of ABC.
I missed the discussions when it was implemented. Apparently,
it was noticed that ABC in its pure form does not make sense
with snd_cwnd counted in packets, and there were some reasons
why it still was not adapted.

Of this I am sure: if a congestion control scheme causes delack
syndrome, it is difficult to advocate for it. It must be repaired,
if there is even the smallest possibility of doing so.


The first idea:

Behaviour is broken when we have < 2*MSS in flight. In that case the
receiver falls back to the delack timeout and breaks the ACK clock.

	thresh = tp->mss_cache;

	if (tp->snd_nxt - tp->snd_una <= tp->mss_cache) {
		thresh = 1;
		if (tp->send_head)
			thresh = tp->send_head->len;
		/* If send_head is empty, it disables ABC.
		 * If send_head is good mss sized segment, ABC is in work.
		 * If send_head is a small segment, it is half-way.
		 */
	}

	if (tp->bytes_acked < thresh)
	       return;

Probably, do this only when TCP_NODELAY is set, but actually it does not
matter: if the flow is nagled/corked, cwnd will not be used/increased anyway.







* Re: high latency with TCP connections
  2006-08-31 21:46   ` Alexey Kuznetsov
@ 2006-08-31 22:14     ` Stephen Hemminger
  2006-08-31 22:44       ` David Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2006-08-31 22:14 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: Alexander Vodomerov, netdev

On Fri, 1 Sep 2006 01:46:35 +0400
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:

> Hello!
> 
> > > Expecting any performance with one-byte writes is silly.
> 
> I am not sure why you are so confident about status of ABC.
> I missed the discussions, when it was implemented. Apparently,
> it was noticed that ABC in its pure form does not make sense
> with snd_cwnd counted in packets and there were some reasons,
> why it still was not adapted.

I implemented it, but I don't think ABC is the correct thing to be
doing in all cases.

If you read RFC 3465, the problem it is trying to address is that of
small packets causing growth of the congestion window beyond the
capacity of the link.

It makes a number of assumptions that may not be true for Linux:
  * ABC doesn't take into account that congestion window validation
    (RFC 2861) already prevents most of the problem of inflated growth.
  * ABC assumes that the "true" capacity of the link is limited by
    byte count, not packet count.

> That's what I am sure: if congestion control causes delack syndrome,
> it is difficult to advocate this congestion control scheme.
> It must be repaired, if there is even the smallest possibility
> to do this.
> 
> 
> The first idea:
> 
> Behaviour is broken, when we have < 2*MSS in flight. In that
> case receiver falls to delack timeout and breaks ACK clock.
> 
> 	thresh = tp->mss_cache;
> 
> 	if (tp->snd_nxt - tp->snd_una <= tp->mss_cache) {
> 		thresh = 1;
> 		if (tp->send_head)
> 			thresh = tp->send_head->len;
> 		/* If send_head is empty, it disables ABC.
> 		 * If send_head is good mss sized segment, ABC is in work.
> 		 * If send_head is a small segment, it is half-way.
> 		 */
> 	}
> 
> 	if (tp->bytes_acked < thresh)
> 	       return;
> 
> Probably, do this only when TCP_NODELAY is set, but actually it does not
> matter. If it is nagled/corked, cwnd will not be used/increased in any case.

Maybe it makes better sense to add a bonus to bytes_acked on each ACK,
to make small packets get an increase sooner and to make the
cwnd * mss_cache estimate more accurate. In the ideal case, during
slow start with small packets, cwnd should increase at the same rate
that it would with larger packets.
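
Untested sketch of that idea ("acked" standing for however many bytes
this ACK actually covered):

	/* Hypothetical bonus: credit at least one full MSS per ACK
	 * that covered new data, so sub-MSS senders grow cwnd too.
	 */
	if ((flag & FLAG_DATA_ACKED) && acked < tp->mss_cache)
		tp->bytes_acked += tp->mss_cache - acked;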


* Re: high latency with TCP connections
  2006-08-31 22:14     ` Stephen Hemminger
@ 2006-08-31 22:44       ` David Miller
  2006-08-31 23:29         ` Alexey Kuznetsov
  0 siblings, 1 reply; 43+ messages in thread
From: David Miller @ 2006-08-31 22:44 UTC (permalink / raw)
  To: shemminger; +Cc: kuznet, alex, netdev

From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 31 Aug 2006 15:14:56 -0700

> On Fri, 1 Sep 2006 01:46:35 +0400
> Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:
> 
> > > Expecting any performance with one-byte writes is silly.
> > 
> > I am not sure why you are so confident about status of ABC.
> > I missed the discussions, when it was implemented. Apparently,
> > it was noticed that ABC in its pure form does not make sense
> > with snd_cwnd counted in packets and there were some reasons,
> > why it still was not adapted.
> 
> I implemented it but don't think ABC is the correct thing to be doing
> in all cases.
> 
> If you read the RFC3465, the problem it is trying to address is that of
> small packets causing growth of congestion window beyond the capacity
> of the link.
> 
> It makes a number of assumptions that may not be true for Linux:
>   * ABC doesn't take into account congestion window validation RFC2861
>     already prevents most of the problem of inflated growth.
>   * ABC assumes that the "true" capacity of the link is limited by
>     byte count not packet count.

It seems to me that the things gained by ABC are twofold:

1) protection against ACK division
2) a way to take delayed ACKs into account for cwnd growth

Both of these can be obtained by simply validating the ACK
against the retransmit queue, returning the number of true
packets ACK'd.

I would even go so far as to suggest that we should drop ACKs which do
not fall on packetization boundaries.  Perhaps only when not in LOSS
state, but I doubt that this matters in practice.

Cases where a mid-packet ACK is valid are truly marginal ones involving
repacketization wrt. MSS/MTU changes, and these would self-correct
eventually.

I agree that ABC has some problems.  The solution is good; the
implementation is just horrible :-)


* Re: high latency with TCP connections
  2006-08-31 22:44       ` David Miller
@ 2006-08-31 23:29         ` Alexey Kuznetsov
  2006-08-31 23:57           ` David Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-08-31 23:29 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, alex, netdev

Hello!

> 2) a way to take delayed ACKs into account for cwnd growth

This part is OK now, right?


> 1) protection against ACK division

But Linux never had this problem... Congestion window was increased
only when a whole skb is ACKed, flag FLAG_DATA_ACKED. (TSO could
break this, but should not). Otherwise, this ACK just advanced snd_una
and nothing more.
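
From memory, the gate in tcp_ack() looks roughly like this (simplified,
not the literal source):

	/* cwnd may grow only if this ACK released at least one whole
	 * skb from the retransmit queue; a partial ACK merely advances
	 * snd_una.
	 */
	if (flag & FLAG_DATA_ACKED)
		tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);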

This aspect of ABC is crucial for BSD. TCP_NODELAY sockets did not obey
congestion control there: from the very beginning, before slow start,
they could send thousands of 1-byte segments.

The only "too-aggressive" problem Linux had was that we could develop a
large cwnd while sending small segments, and then switch to sending
mss-sized segments. It does not look scary, to be honest. :-)

Linux had troubles with slow start even before ABC. Actually, some
applications can suffer from the same syndrome even with ABC disabled.
With ABC it becomes TROUBLE: cwnd has no chance to develop at all.

Probably, aspect 1 of ABC should just be disabled. And my first
suggestion looks workable too.

Alexey


* Re: high latency with TCP connections
  2006-08-31 23:29         ` Alexey Kuznetsov
@ 2006-08-31 23:57           ` David Miller
  2006-09-01  3:23             ` Stephen Hemminger
                               ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: David Miller @ 2006-08-31 23:57 UTC (permalink / raw)
  To: kuznet; +Cc: shemminger, alex, netdev

From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Fri, 1 Sep 2006 03:29:23 +0400

> > 2) a way to take delayed ACKs into account for cwnd growth
> 
> This part is OK now, right?

This part of ABC is not on by default, and was broken until last week
:-)

Test in tcp_slow_start() used to be:

	tp->bytes_acked > 2*tp->mss_cache

but now it is the correct:

	tp->bytes_acked >= 2*tp->mss_cache

It allows making two congestion window increases from one ACK when
noticing a delayed ACK.

Non-ABC code did not do this, but could figure this kind of thing
out while scanning the retransmit queue.

> > 1) protection against ACK division
> 
> But Linux never had this problem... Congestion window was increased
> only when a whole skb is ACKed, flag FLAG_DATA_ACKED. (TSO could
> break this, but should not). Otherwise, this ACK just advanced snd_una
> and nothing more.

Ugh, I missed this. :-/

The TSO code is careful to only trim TSO skbs on proper boundaries,
and this ensures proper setting of FLAG_DATA_ACKED too.  So no
problems here.

> The only problem of kind "too-aggressive" with Linux was that we could
> develop large cwnd sending small segments, and then switch to sending
> mss-sized segments. It does not look scary, to be honest. :-)

Agreed.

> Linux had troubles with slow start even before ABC. Actually,
> some of applications can suffer of the same syndrome even if ABC disabled.
> With ABC it becomes TROUBLE, cwnd has no chances to develop at all.

I've discussed that very issue here before, some time ago, with
John Heffner.  It was in response to a user reporting a similar
problem.  The problem is really at the receiver, because we only
ACK every other full-sized frame.  I had the idea to ACK every 2
frames regardless of size, but that might have other problems.

There is an asymmetry between how we do congestion control on
sending (packet counting) and our ACK policy on receive (packet-size
based).

> Probably, aspect 1 of ABC just should be disabled. And the first my
> suggestion looks working too.

I'm ready to rip out ABC entirely, to be honest.  Or at least
turn it off by default.


* Re: high latency with TCP connections
  2006-08-31 23:57           ` David Miller
@ 2006-09-01  3:23             ` Stephen Hemminger
  2006-09-01  3:39               ` Ian McDonald
  2006-09-01  9:44             ` Pekka Savola
  2006-09-01  9:47             ` Alexey Kuznetsov
  2 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2006-09-01  3:23 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, alex, netdev

On Thu, 31 Aug 2006 16:57:01 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
> Date: Fri, 1 Sep 2006 03:29:23 +0400
> 
> > > 2) a way to take delayed ACKs into account for cwnd growth
> > 
> > This part is OK now, right?
> 
> This part of ABC is not on by default, and was broken until last week
> :-)
> 
> Test in tcp_slow_start() used to be:
> 
> 	tp->bytes_acked > 2*tp->mss_cache
> 
> but now it is the correct:
> 
> 	tp->bytes_acked >= 2*tp->mss_cache
> 
> It allows to make two congestion window increases from one ACK, when
> noticing delayed ACK.
> 
> Non-ABC code did not do this, but could figure this kind of thing
> out while scanning retransmit queue.
> 
> > > 1) protection against ACK division
> > 
> > But Linux never had this problem... Congestion window was increased
> > only when a whole skb is ACKed, flag FLAG_DATA_ACKED. (TSO could
> > break this, but should not). Otherwise, this ACK just advanced snd_una
> > and nothing more.
> 
> Ugh, I missed this. :-/
> 
> The TSO code is carefuly to only trim TSO skbs on proper boundaries,
> and this ensures proper setting of FLAG_DATA_ACKED too.  So no
> problems here.
> 
> > The only problem of kind "too-aggressive" with Linux was that we could
> > develop large cwnd sending small segments, and then switch to sending
> > mss-sized segments. It does not look scary, to be honest. :-)
> 
> Agreed.
> 
> > Linux had troubles with slow start even before ABC. Actually,
> > some of applications can suffer of the same syndrome even if ABC disabled.
> > With ABC it becomes TROUBLE, cwnd has no chances to develop at all.
> 
> I've discussed that very issue here before, some time ago, with
> John Heffner.  It was in response to a user reporting a similar
> problem.  The problem is really at the receiver because we only
> ACK every other full sized frame.  I had the idea to ACK every 2
> frames, regardless of size, but that might have other problems.
> 
> There is an assymetry between how we do congestion control on
> sending (packet counting) and our ACK policy on receive (packet
> sized based).
> 
> > Probably, aspect 1 of ABC just should be disabled. And the first my
> > suggestion looks working too.
> 
> I'm ready to rip out ABC entirely, to be honest.  Or at least
> turn it off by default.

Turn it off by default for 2.6.18, then evaluate more for 2.6.19.



* Re: high latency with TCP connections
  2006-09-01  3:23             ` Stephen Hemminger
@ 2006-09-01  3:39               ` Ian McDonald
  2006-09-01  6:23                 ` David Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Ian McDonald @ 2006-09-01  3:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, kuznet, alex, netdev

> > I'm ready to rip out ABC entirely, to be honest.  Or at least
> > turn it off by default.
>
> Turn it off for 2.6.18, by default then evaluate more for 2.6.19
>
If it goes out in 2.6.18, there could probably be a good argument for
it going into the stable tree as well... to stop the likes of the
JVM-type issues that users keep hitting (which is fixed, or going to be
fixed, by Sun).
-- 
Ian McDonald
Web: http://wand.net.nz/~iam4
Blog: http://imcdnzl.blogspot.com
WAND Network Research Group
Department of Computer Science
University of Waikato
New Zealand


* Re: high latency with TCP connections
  2006-09-01  3:39               ` Ian McDonald
@ 2006-09-01  6:23                 ` David Miller
  0 siblings, 0 replies; 43+ messages in thread
From: David Miller @ 2006-09-01  6:23 UTC (permalink / raw)
  To: ian.mcdonald; +Cc: shemminger, kuznet, alex, netdev

From: "Ian McDonald" <ian.mcdonald@jandi.co.nz>
Date: Fri, 1 Sep 2006 15:39:37 +1200

> > > I'm ready to rip out ABC entirely, to be honest.  Or at least
> > > turn it off by default.
> >
> > Turn it off for 2.6.18, by default then evaluate more for 2.6.19
> >
> If it goes out in 2.6.18 there could probably be a good argument for
> going into the stable tree as well... to stop the likes of the JVM
> type issues that users keep hitting (which is fixed or going to be
> fixed by Sun).

I completely agree.


* Re: high latency with TCP connections
  2006-08-31 23:57           ` David Miller
  2006-09-01  3:23             ` Stephen Hemminger
@ 2006-09-01  9:44             ` Pekka Savola
  2006-09-01  9:49               ` David Miller
  2006-09-01  9:47             ` Alexey Kuznetsov
  2 siblings, 1 reply; 43+ messages in thread
From: Pekka Savola @ 2006-09-01  9:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Thu, 31 Aug 2006, David Miller wrote:
...
>> Probably, aspect 1 of ABC just should be disabled. And the first my
>> suggestion looks working too.
>
> I'm ready to rip out ABC entirely, to be honest.  Or at least
> turn it off by default.

Just as a curious observer:  do you think these issues are due to the
ABC implementation, or due to the ABC specification?

I.e., should the ABC specification be 1) revised to fix specification 
bugs/omissions, 2) revised to document assumptions better, 3) 
deprecated due to FOO, 4) recommended for use, or 5) something else?

-- 
Pekka Savola                 "You each name yourselves king, yet the
Netcore Oy                    kingdom bleeds."
Systems. Networks. Security. -- George R.R. Martin: A Clash of Kings


* Re: high latency with TCP connections
  2006-08-31 23:57           ` David Miller
  2006-09-01  3:23             ` Stephen Hemminger
  2006-09-01  9:44             ` Pekka Savola
@ 2006-09-01  9:47             ` Alexey Kuznetsov
  2006-09-01 11:00               ` Evgeniy Polyakov
  2 siblings, 1 reply; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-09-01  9:47 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, alex, netdev

Hello!

> problem.  The problem is really at the receiver because we only
> ACK every other full sized frame.  I had the idea to ACK every 2
> frames, regardless of size,

This would solve lots of problems.


>			 but that might have other problems.

BSD used to do this; everyone is happy.

It could expose some new problems in our stack, because we have
quite a lot of code to mitigate problems caused by ACKing every 2*MSS.
But they should not be essential. Actually, we could make it adjustable:
normally ACK every 2 frames, but make an effort to coarsen ACKs up to
2*MSS when lots of small frames are ACKed back to back.


> There is an assymetry between how we do congestion control on
> sending (packet counting) and our ACK policy on receive (packet
> sized based).

It is a curious fact: BSD has exactly the opposite asymmetry.
ACKs are sent for every second packet, but congestion avoidance is done by size. :-)


> turn it off by default.

Yes.

Alexey



* Re: high latency with TCP connections
  2006-09-01  9:44             ` Pekka Savola
@ 2006-09-01  9:49               ` David Miller
  0 siblings, 0 replies; 43+ messages in thread
From: David Miller @ 2006-09-01  9:49 UTC (permalink / raw)
  To: pekkas; +Cc: netdev

From: Pekka Savola <pekkas@netcore.fi>
Date: Fri, 1 Sep 2006 12:44:48 +0300 (EEST)

> On Thu, 31 Aug 2006, David Miller wrote:
> ...
> >> Probably, aspect 1 of ABC just should be disabled. And the first my
> >> suggestion looks working too.
> >
> > I'm ready to rip out ABC entirely, to be honest.  Or at least
> > turn it off by default.
> 
> Just as a curious observer:  do you think these issues are due to ABC 
> implementation, or due to ABC specification?

It simply doesn't apply to us, as Alexey explained, because we already
prevent ACK division when we apply the ACK in the retransmit queue
purging loop.  If we didn't free any whole packets, we don't advance
the congestion window.

The other bit, dealing with delayed ACKs, we could handle another
way.

ABC is a very BSD-specific algorithm, as Alexey also mentioned.



* Re: high latency with TCP connections
  2006-09-01  9:47             ` Alexey Kuznetsov
@ 2006-09-01 11:00               ` Evgeniy Polyakov
       [not found]                 ` <20060901090046.69b3d583@localhost.localdomain>
  2006-09-04  9:10                 ` high latency with TCP connections Alexey Kuznetsov
  0 siblings, 2 replies; 43+ messages in thread
From: Evgeniy Polyakov @ 2006-09-01 11:00 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, shemminger, alex, netdev

On Fri, Sep 01, 2006 at 01:47:15PM +0400, Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) wrote:
> Hello!
> 
> > problem.  The problem is really at the receiver because we only
> > ACK every other full sized frame.  I had the idea to ACK every 2
> > frames, regardless of size,
> 
> This would solve lots of problems.

At least for slow start it is safe, but experiments with atcp for
netchannels showed that it is better not to send an excessive number of
ACKs when slow start is over; instead we can introduce some tricky
ACK-avoidance scheme and ACK every 2-3-4 packets, or a full MSS, instead
of every two mss-sized frames.

> Alexey

-- 
	Evgeniy Polyakov


* [PATCH] tcp: turn ABC off
       [not found]                 ` <20060901090046.69b3d583@localhost.localdomain>
@ 2006-09-01 20:55                   ` Stephen Hemminger
  2006-09-02  7:22                     ` Evgeniy Polyakov
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2006-09-01 20:55 UTC (permalink / raw)
  To: David S. Miller; +Cc: Evgeniy Polyakov, Alexey Kuznetsov, alex, netdev

Turn Appropriate Byte Count off by default because it unfairly penalizes
applications that do small writes.
Add better documentation to describe what it is so users will understand
why they might want to turn it on.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

---

 Documentation/networking/ip-sysctl.txt |   12 +++++++++---
 net/ipv4/tcp_input.c                   |    2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

3988eb0b87288c0d30c165742af7fc7a733100ab
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 3e0c017..90ed781 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -102,9 +102,15 @@ inet_peer_gc_maxtime - INTEGER
 TCP variables: 
 
 tcp_abc - INTEGER
-	Controls Appropriate Byte Count defined in RFC3465. If set to
-	0 then does congestion avoid once per ack. 1 is conservative
-	value, and 2 is more agressive.
+	Controls Appropriate Byte Count (ABC) defined in RFC3465.
+	ABC is a way of increasing congestion window (cwnd) more slowly
+	in response to partial acknowledgments.
+	Possible values are:
+		0 increase cwnd once per acknowledgment (no ABC)
+		1 increase cwnd once per acknowledgment of full sized segment
+		2 allow increase cwnd by two if acknowledgment is
+		  of two segments to compensate for delayed acknowledgments.
+	Default: 0 (off)
 
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 111ff39..159fa3f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -89,7 +89,7 @@ int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
-int sysctl_tcp_abc = 1;
+int sysctl_tcp_abc;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
-- 
1.2.4




* Re: [PATCH] tcp: turn ABC off
  2006-09-01 20:55                   ` [PATCH] tcp: turn ABC off Stephen Hemminger
@ 2006-09-02  7:22                     ` Evgeniy Polyakov
  2006-09-02  8:10                       ` Herbert Xu
  0 siblings, 1 reply; 43+ messages in thread
From: Evgeniy Polyakov @ 2006-09-02  7:22 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, Alexey Kuznetsov, alex, netdev

On Fri, Sep 01, 2006 at 01:55:15PM -0700, Stephen Hemminger (shemminger@osdl.org) wrote:
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 111ff39..159fa3f 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -89,7 +89,7 @@ int sysctl_tcp_frto;
>  int sysctl_tcp_nometrics_save;
>  
>  int sysctl_tcp_moderate_rcvbuf = 1;
> -int sysctl_tcp_abc = 1;
> +int sysctl_tcp_abc;

Since it is not static, are you sure it will be zero?

-- 
	Evgeniy Polyakov



* Re: [PATCH] tcp: turn ABC off
  2006-09-02  7:22                     ` Evgeniy Polyakov
@ 2006-09-02  8:10                       ` Herbert Xu
  0 siblings, 0 replies; 43+ messages in thread
From: Herbert Xu @ 2006-09-02  8:10 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: shemminger, davem, kuznet, alex, netdev

Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> On Fri, Sep 01, 2006 at 01:55:15PM -0700, Stephen Hemminger (shemminger@osdl.org) wrote:
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index 111ff39..159fa3f 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -89,7 +89,7 @@ int sysctl_tcp_frto;
>>  int sysctl_tcp_nometrics_save;
>>  
>>  int sysctl_tcp_moderate_rcvbuf = 1;
>> -int sysctl_tcp_abc = 1;
>> +int sysctl_tcp_abc;
> 
> Since it is not static are you sure it will be zero?

Outside a function, the static modifier merely controls whether the
symbol is visible externally.  It does not control whether the variable
gets zeroed.  And yes, this one will get zeroed.
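
A minimal illustration; both counters live in .bss and are guaranteed
to start at zero (C99 6.7.8p10):

#include <stdio.h>

int visible_counter;		/* external linkage, zero-initialized */
static int hidden_counter;	/* internal linkage, also zero-initialized */

int main(void)
{
	/* prints "0 0" regardless of the static keyword */
	printf("%d %d\n", visible_counter, hidden_counter);
	return 0;
}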

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt



* Re: high latency with TCP connections
  2006-09-01 11:00               ` Evgeniy Polyakov
       [not found]                 ` <20060901090046.69b3d583@localhost.localdomain>
@ 2006-09-04  9:10                 ` Alexey Kuznetsov
  1 sibling, 0 replies; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-09-04  9:10 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: David Miller, shemminger, alex, netdev

Hello!

> At least for slow start it is safe, but experiments with atcp for
> netchannels showed that it is better not to send excessive number of
> acks when slow start is over,

If this thing is done from tcp_cleanup_rbuf(), it should not affect
performance too much.

Note that with ABC, and other pathological cases which do not allow
sending more than a fixed number of segments [we have lots of them,
e.g. sending tiny segments can hit the sndbuf limit], we are dealing
with a case where slow start is _never_ over.


>				 instead we can introduce some tricky
> ack avoidance scheme and ack at least 2-3-4 packets or full MSS instead
> of two mss-sized frames.

One smart scheme was used at some stage (2000; probably never merged in
this form to mainstream): TCP counted the number of unACKed small
segments in ack.rcv_small and kept a threshold in ack.rcv_thresh.

+
+       /* If we ever saw N>1 small segments from peer, it has
+        * enough of send buffer to send N packets and does not nagle.
+        * Hence, we may delay acks more aggresively.
+        */
+       if (tp->ack.rcv_small > tp->ack.rcv_thresh+1)
+               tp->ack.rcv_thresh = tp->ack.rcv_small-1;
+       tp->ack.rcv_small = 0;


That was too much trouble for such a simple thing, so eventually it was
replaced with a much dumber scheme. Look at the current
tcp_cleanup_rbuf(): it forces an ACK each time it sees that some small
segment was received. It has survived for 6 years, so I guess it did
not hurt anybody. :-)

What I would suggest doing now is to replace:

	(copied > 0 &&
	(icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
	!icsk->icsk_ack.pingpong &&
	!atomic_read(&sk->sk_rmem_alloc)))
		time_to_ack = 1;

with:

	(copied > 0 &&
	(icsk->icsk_ack.unacked > 1 ||
	(icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !icsk->icsk_ack.pingpong) &&
	!atomic_read(&sk->sk_rmem_alloc)))
		time_to_ack = 1;

I would not hesitate even a minute if the variable "unacked" could be
calculated from some existing state variables.

Alexey



* [PATCH][RFC] Re: high latency with TCP connections
  2006-08-30 10:07 high latency with TCP connections Alexander Vodomerov
  2006-08-30 17:27 ` Stephen Hemminger
@ 2006-09-04 16:00 ` Alexey Kuznetsov
  2006-09-05 17:55   ` Rick Jones
  2006-09-18  7:31   ` David Miller
  1 sibling, 2 replies; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-09-04 16:00 UTC (permalink / raw)
  To: Alexander Vodomerov; +Cc: netdev, davem

Hello!

> Some people reported that this program runs in 9.997 sec when run on
> FreeBSD.

Try the enclosed patch. I have no idea why 9.997 sec is so magic, but I
get exactly this number on my notebook. :-)

Alexey

=================

This patch enables sending ACKs for every 2nd received segment.
It does not affect either mss-sized connections (obviously) or connections
controlled by Nagle (because there is only one small segment in flight).

The idea is to record the fact that a small segment arrives
on a connection where one small segment has already been received
and is still un-ACKed. In this case an ACK is forced after tcp_recvmsg()
drains the receive buffer.

In other words, it is a "soft" every-2nd-segment ACK, which is enough
to preserve the ACK clock even when ABC is enabled.

Signed-off-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>



diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 9bf73fe..de4e83b 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -147,7 +147,8 @@ extern struct sock *inet_csk_clone(struc
 enum inet_csk_ack_state_t {
 	ICSK_ACK_SCHED	= 1,
 	ICSK_ACK_TIMER  = 2,
-	ICSK_ACK_PUSHED = 4
+	ICSK_ACK_PUSHED = 4,
+	ICSK_ACK_PUSHED2 = 8
 };
 
 extern void inet_csk_init_xmit_timers(struct sock *sk,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 934396b..4f3b76f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -955,8 +955,11 @@ #endif
 		     * receive buffer and there was a small segment
 		     * in queue.
 		     */
-		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
-		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+		    (copied > 0 &&
+		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
+		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+		       !icsk->icsk_ack.pingpong) &&
+		      !atomic_read(&sk->sk_rmem_alloc)))
 			time_to_ack = 1;
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 111ff39..5877920 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -156,6 +156,8 @@ static void tcp_measure_rcv_mss(struct s
 				return;
 			}
 		}
+		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
+			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
 		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
 	}
 }


* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-04 16:00 ` [PATCH][RFC] " Alexey Kuznetsov
@ 2006-09-05 17:55   ` Rick Jones
  2006-09-05 22:13     ` Alexey Kuznetsov
  2006-09-18  7:39     ` David Miller
  2006-09-18  7:31   ` David Miller
  1 sibling, 2 replies; 43+ messages in thread
From: Rick Jones @ 2006-09-05 17:55 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: Alexander Vodomerov, netdev, davem

Alexey Kuznetsov wrote:
> Hello!
> 
> 
>>Some people reported that this program runs in 9.997 sec when run on
>>FreeBSD.
> 
> 
> Try enclosed patch. I have no idea why 9.997 sec is so magic, but I
> get exactly this number on my notebook. :-)
> 
> Alexey
> 
> =================
> 
> This patch enables sending ACKs each 2d received segment.
> It does not affect either mss-sized connections (obviously) or connections
> controlled by Nagle (because there is only one small segment in flight).
> 
> The idea is to record the fact that a small segment arrives
> on a connection, where one small segment has already been received
> and still not-ACKed. In this case ACK is forced after tcp_recvmsg()
> drains receive buffer.
> 
> In other words, it is a "soft" each-2d-segment ACK, which is enough
> to preserve ACK clock even when ABC is enabled.

Is this really necessary?  I thought that the problems with ABC were in
trying to apply byte-based heuristics from the RFC(s) to a
packet-oriented cwnd in the stack?

rick jones


* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-05 17:55   ` Rick Jones
@ 2006-09-05 22:13     ` Alexey Kuznetsov
  2006-09-18  7:39     ` David Miller
  1 sibling, 0 replies; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-09-05 22:13 UTC (permalink / raw)
  To: Rick Jones; +Cc: Alexander Vodomerov, netdev, davem

Hello!

> Is this really necessary?

No, of course not. We lived for ages without this and would live for another age.



>			I thought that the problems with ABC were in 
> trying to apply byte-based heuristics from the RFC(s) to a 
> packet-oriented cwnd in the stack?

It was just the last straw.

Even with ABC disabled, that test shows gaps in latency summing up
to ~300 msec. Almost invisible, but not good.

Too-aggressive delack has many other issues. Even without ABC
we suppress cwnd quadratically on TCP_NODELAY connections
compared to BSD: at the sender side we suppress it by counting
cwnd in packets, at the receiver side by ACKing by byte counter.

Each time another victim sees the artificial latencies introduced
by aggressive delayed ACKs, even though he requested TCP_NODELAY,
our best argument is "Stupid, you are doing it all wrong, how could
you expect decent performance?" :-)

Probably, we stand for a feature which really is not worth standing
for, and which causes nothing but permanent pain in the ass.

Alexey


* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-04 16:00 ` [PATCH][RFC] " Alexey Kuznetsov
  2006-09-05 17:55   ` Rick Jones
@ 2006-09-18  7:31   ` David Miller
  2006-09-18 10:37     ` Alexey Kuznetsov
  1 sibling, 1 reply; 43+ messages in thread
From: David Miller @ 2006-09-18  7:31 UTC (permalink / raw)
  To: kuznet; +Cc: alex, netdev

From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Mon, 4 Sep 2006 20:00:45 +0400

> Try enclosed patch. I have no idea why 9.997 sec is so magic, but I
> get exactly this number on my notebook. :-)
> 
> =================
> 
> This patch enables sending ACKs each 2d received segment.
> It does not affect either mss-sized connections (obviously) or connections
> controlled by Nagle (because there is only one small segment in flight).
> 
> The idea is to record the fact that a small segment arrives
> on a connection, where one small segment has already been received
> and still not-ACKed. In this case ACK is forced after tcp_recvmsg()
> drains receive buffer.
> 
> In other words, it is a "soft" each-2d-segment ACK, which is enough
> to preserve ACK clock even when ABC is enabled.
> 
> Signed-off-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>

This looks exactly like the kind of patch I tried to formulate,
very unsuccessfully, the last time this topic came up a year or
so ago.

It looks perfectly fine to me; would you like me to apply it,
Alexey?


* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-05 17:55   ` Rick Jones
  2006-09-05 22:13     ` Alexey Kuznetsov
@ 2006-09-18  7:39     ` David Miller
  2006-09-18 17:11       ` Rick Jones
  1 sibling, 1 reply; 43+ messages in thread
From: David Miller @ 2006-09-18  7:39 UTC (permalink / raw)
  To: rick.jones2; +Cc: kuznet, alex, netdev

From: Rick Jones <rick.jones2@hp.com>
Date: Tue, 05 Sep 2006 10:55:16 -0700

> Is this really necessary?  I thought that the problems with ABC were in 
> trying to apply byte-based heuristics from the RFC(s) to a 
> packet-oriented cwnd in the stack?

This is receiver side, and it helps a sender who does congestion
control based upon packet counting, as Linux does.  It really
is less related to ABC than Alexey implies; we've always had
this kind of problem, as I mentioned in previous discussions of
this issue.


* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-18  7:31   ` David Miller
@ 2006-09-18 10:37     ` Alexey Kuznetsov
  2006-09-18 13:56       ` David Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-09-18 10:37 UTC (permalink / raw)
  To: David Miller; +Cc: alex, netdev

Hello!

> It looks perfectly fine to me, would you like me to apply it
> Alexey?

Yes, I think it is safe.


Theoretically, there is one place where it could be not so good.
A nicely nagling TCP connection, which makes lots of small write()s,
will send MSS-sized frames due to delayed ACKs. But if we ACK
every other segment, more segments will come out incomplete,
which could result in some decrease of throughput.

But the trap for this case was set 6 years ago. For unidirectional
sessions, ACKs were sent not merely for every second segment, but for
each small segment. :-) This did not show any problems for those 6
years. I guess it means that the problem does not exist.

Alexey

^ permalink raw reply	[flat|nested] 43+ messages in thread
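
To make the scenario Alexey describes concrete, here is a hypothetical
sender of the kind he means (a sketch, not code from the thread): lots
of tiny write()s with Nagle left enabled. Delayed ACKs give the send
queue time to coalesce into MSS-sized frames; a peer that ACKs every
other segment releases the queued bytes earlier and in smaller pieces.

#include <unistd.h>

/* Sketch: many 1-byte writes on a connected TCP socket fd, with Nagle
 * on (no TCP_NODELAY).  While one small segment is in flight unACKed,
 * the remaining bytes queue up and later leave as larger frames; the
 * sooner ACKs come back, the smaller the frames that go out. */
static void many_small_writes(int fd, int total)
{
	const char byte = 'x';
	int i;

	for (i = 0; i < total; i++)
		if (write(fd, &byte, 1) < 0)
			break;
}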

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-18 10:37     ` Alexey Kuznetsov
@ 2006-09-18 13:56       ` David Miller
  2006-09-20 22:44         ` Stephen Hemminger
  0 siblings, 1 reply; 43+ messages in thread
From: David Miller @ 2006-09-18 13:56 UTC (permalink / raw)
  To: kuznet; +Cc: alex, netdev

From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Mon, 18 Sep 2006 14:37:05 +0400

> > It looks perfectly fine to me, would you like me to apply it,
> > Alexey?
> 
> Yes, I think it is safe.

Ok, I'll put this into net-2.6.19 for now.  Thanks.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-18  7:39     ` David Miller
@ 2006-09-18 17:11       ` Rick Jones
  2006-09-18 20:41         ` Alexey Kuznetsov
  0 siblings, 1 reply; 43+ messages in thread
From: Rick Jones @ 2006-09-18 17:11 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, alex, netdev

David Miller wrote:
> From: Rick Jones <rick.jones2@hp.com>
> Date: Tue, 05 Sep 2006 10:55:16 -0700
> 
> 
>>Is this really necessary?  I thought that the problems with ABC were in 
>>trying to apply byte-based heuristics from the RFC(s) to a 
>>packet-oriented cwnd in the stack?
> 
> 
> This is receiver side, and it helps a sender who does congestion
> control based upon packet counting, as Linux does.  It really
> is less related to ABC than Alexey implies; we've always had
> this kind of problem, as I mentioned in previous discussions
> of this issue.

For a connection receiving nothing but sub-MSS segments this is going to 
non-trivially increase the number of ACKs sent, no?  I would expect 
the service demand on something like a "burst 
enabled" (./configure --enable-burst) netperf TCP_RR test:

netperf -t TCP_RR -H foo -- -b N   # N > 1

to increase unpleasantly as a result.  Pipelined HTTP would be like 
that, some NFS over TCP stuff too, maybe X traffic, other 
"transactional" workloads as well - maybe Tuxedo.

rick jones

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-18 17:11       ` Rick Jones
@ 2006-09-18 20:41         ` Alexey Kuznetsov
  2006-09-18 21:24           ` Rick Jones
  0 siblings, 1 reply; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-09-18 20:41 UTC (permalink / raw)
  To: Rick Jones; +Cc: David Miller, alex, netdev

Hello!

Of course the number of ACKs increases. That is the goal. :-)


> the service demand on something like a "burst 
> enabled" (./configure --enable-burst) netperf TCP_RR test:
> 
> netperf -t TCP_RR -H foo -- -b N   # N > 1

foo=localhost

b	patched		orig
2	105874.83	105143.71
3	114208.53	114023.07
4	120493.99	120851.27
5	128087.48	128573.33
10	151328.48	151056.00

Perhaps the test is done wrong, but I see no difference.


>to increase unpleasantly as a result.  Pipelined HTTP would be like 
>that, some NFS over TCP stuff too, maybe X traffic,

X will be excited about the better latency.

As for protocols not interested in latency, they will be a little
happier if transactions are processed asynchronously.

But actually, it is not about increasing or decreasing the number of
ACKs. It is about killing that pain in the ass which we used to have
because we pretended to be too smart.

Alexey

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-18 20:41         ` Alexey Kuznetsov
@ 2006-09-18 21:24           ` Rick Jones
  2006-09-18 22:51             ` Alexey Kuznetsov
  0 siblings, 1 reply; 43+ messages in thread
From: Rick Jones @ 2006-09-18 21:24 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, alex, netdev

Alexey Kuznetsov wrote:
> Hello!
> 
> Of course the number of ACKs increases. That is the goal. :-)
> 
>>the service demand on something like a "burst 
>>enabled" (./configure --enable-burst) netperf TCP_RR test:
>>
>>netperf -t TCP_RR -H foo -- -b N   # N > 1
> 
> foo=localhost

There isn't any sort of clever short-circuiting in loopback, is there?  I 
do like the convenience of testing things over loopback, but always fret 
about not including drivers and actual hardware interrupts, etc.

> b	patched		orig
> 2	105874.83	105143.71
> 3	114208.53	114023.07
> 4	120493.99	120851.27
> 5	128087.48	128573.33
> 10	151328.48	151056.00
 >
> Perhaps the test is done wrong, but I see no difference.

Regardless, kudos for running the test.  The only thing missing is the 
-c and -C options to enable the CPU utilization measurements which will 
then give the service demand on a CPU time per transaction basis.  Or 
was this a UP system that was taken to CPU saturation?

>>to increase unpleasantly as a result.  Pipelined HTTP would be like 
>>that, some NFS over TCP stuff too, maybe X traffic,
> 
> 
> X will be excited about the better latency.
> 
> As for protocols not interested in latency, they will be a little
> happier if transactions are processed asynchronously.

What I'm thinking about isn't so much the latency as the 
aggregate throughput a system can do with lots of these 
protocols/connections going at the same time.  Hence the concern about 
increases in service demand.

> But actually, it is not about increasing or decreasing the number of
> ACKs. It is about killing that pain in the ass which we used to have
> because we pretended to be too smart.

:)

rick jones

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-18 21:24           ` Rick Jones
@ 2006-09-18 22:51             ` Alexey Kuznetsov
  2006-09-19  0:37               ` Rick Jones
  0 siblings, 1 reply; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-09-18 22:51 UTC (permalink / raw)
  To: Rick Jones; +Cc: David Miller, alex, netdev

Hello!

> There isn't any sort of clever short-circuiting in loopback, is there?

No, not as far as I know.


> I do like the convenience of testing things over loopback, but always fret 
> about not including drivers and actual hardware interrupts, etc.

Well, if the test is right, it should show the cost of redundant ACKs.


> Regardless, kudos for running the test.  The only thing missing is the 
> -c and -C options to enable the CPU utilization measurements which will 
> then give the service demand on a CPU time per transaction basis.  Or 
> was this a UP system that was taken to CPU saturation?

It is my notebook. :-) Of course, CPU consumption is 100%.
(Actually, netperf shows 100.10. :-))

I will redo the test on a real network. What range of -b should I test?


> What I'm thinking about isn't so much the latency

I understand.

Actually, I did those tests ages ago for a pure throughput case,
when nothing goes in the opposite direction. I did not find a difference
then. And nobody even noticed that Linux has been ACKing _each_ small
segment on unidirectional connections for all those years. :-)

Alexey

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-18 22:51             ` Alexey Kuznetsov
@ 2006-09-19  0:37               ` Rick Jones
  2006-09-22 13:46                 ` Alexey Kuznetsov
  0 siblings, 1 reply; 43+ messages in thread
From: Rick Jones @ 2006-09-19  0:37 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, alex, netdev

>>Regardless, kudos for running the test.  The only thing missing is the 
>>-c and -C options to enable the CPU utilization measurements which will 
>>then give the service demand on a CPU time per transaction basis.  Or 
>>was this a UP system that was taken to CPU saturation?
> 
> 
> It is my notebook. :-) Of course, CPU consumption is 100%.
> (Actually, netperf shows 100.10. :-))

Gotta love the accuracy. :)

> 
> I will redo the test on a real network. What range of -b should I test?
> 

I suppose that depends on your patience :) In theory, as you increase 
(e.g. double) the -b setting you should reach a point of diminishing 
returns wrt transaction rate.  If you see that, and see the service 
demand flattening out, I'd say it is probably time to stop.

I'm also not quite sure if "abc" needs to be disabled or not.

I do know that I left out one very important netperf option.  The 
command line should be:

netperf -t TCP_RR -H foo -- -b N -D

where "-D" is added to set TCP_NODELAY.  Otherwise, the ratio of 
transactions to data segments is fubar.  That issue is also why I wonder 
about the setting of tcp_abc.

[I have this quixotic pipe dream about being able to --enable-burst, set 
-D and say that the number of TCP segments exchanged on the network is 
2X the transaction count when request and response size are < MSS.  The 
raison d'etre for this pipe dream is maximizing PPS with TCP_RR tests 
without _having_ to have hundreds if not thousands of simultaneous 
netperfs/connections - say with just as many netperfs/connections as 
there are CPUs or threads/strands in the system. It was while trying to 
make this pipe dream a reality that I first noticed that HP-UX 11i, which 
normally has a very nice ACK-avoidance heuristic, would send an 
immediate ACK if it received back-to-back sub-MSS segments - thus 
ruining my pipe dream when it came to HP-UX testing.  Happily, I noticed 
that "linux" didn't seem to be doing the same thing. Hence my tweaking 
when seeing this patch come along...]

>>What I'm thinking about isn't so much the latency
> 
> 
> I understand.
> 
> Actually, I did those tests ages ago for a pure throughput case,
> when nothing goes in the opposite direction. I did not find a difference
> then. And nobody even noticed that Linux has been ACKing _each_ small
> segment on unidirectional connections for all those years. :-)

Not everyone looks very closely (alas, sometimes myself included).

If all anyone does is look at throughput, they won't notice until they 
saturate the CPU.  Heck, before netperf and TCP_RR tests, and sadly even 
still today, most people just look at how fast a single-connection, 
unidirectional data transfer goes and leave it at that :(

Thankfully, the set of "most people" and "netdev" aren't completely 
overlapping.

rick jones

^ permalink raw reply	[flat|nested] 43+ messages in thread
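
Putting Rick's pieces together, the invocation he is converging on --
burst mode, TCP_NODELAY, and CPU measurement on both ends -- would look
something like this (assuming a netperf built with --enable-burst;
"foo" stands for the remote host):

netperf -t TCP_RR -H foo -c -C -- -b 8 -D

The -c and -C global options request local and remote CPU utilization,
from which netperf derives the per-transaction service demand.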

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-18 13:56       ` David Miller
@ 2006-09-20 22:44         ` Stephen Hemminger
  2006-09-20 22:47           ` David Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2006-09-20 22:44 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, alex, netdev

On Mon, 18 Sep 2006 06:56:55 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
> Date: Mon, 18 Sep 2006 14:37:05 +0400
> 
> > > It looks perfectly fine to me, would you like me to apply it,
> > > Alexey?
> > 
> > Yes, I think it is safe.
> 
> Ok, I'll put this into net-2.6.19 for now.  Thanks.

Did you try this on a desktop system?  Something is wrong with net-2.6.19;
basic web browsing seems slower.

-- 
Stephen Hemminger <shemminger@osdl.org>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-20 22:44         ` Stephen Hemminger
@ 2006-09-20 22:47           ` David Miller
  2006-09-20 22:55             ` Stephen Hemminger
  0 siblings, 1 reply; 43+ messages in thread
From: David Miller @ 2006-09-20 22:47 UTC (permalink / raw)
  To: shemminger; +Cc: kuznet, alex, netdev

From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 20 Sep 2006 15:44:06 -0700

> On Mon, 18 Sep 2006 06:56:55 -0700 (PDT)
> David Miller <davem@davemloft.net> wrote:
> 
> > Ok, I'll put this into net-2.6.19 for now.  Thanks.
> 
> Did you try this on a desktop system?  Something is wrong with net-2.6.19;
> basic web browsing seems slower.

It might be due to other changes, please verify that it's
truly caused by Alexey's change by backing it out and
retesting.

Note that I had to use an updated version of Alexey's change,
which he sent me privately, because the first version didn't
compile :)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-20 22:47           ` David Miller
@ 2006-09-20 22:55             ` Stephen Hemminger
  0 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2006-09-20 22:55 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, alex, netdev

On Wed, 20 Sep 2006 15:47:56 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@osdl.org>
> Date: Wed, 20 Sep 2006 15:44:06 -0700
> 
> > On Mon, 18 Sep 2006 06:56:55 -0700 (PDT)
> > David Miller <davem@davemloft.net> wrote:
> > 
> > > Ok, I'll put this into net-2.6.19 for now.  Thanks.
> > 
> > Did you try this on a desktop system?  Something is wrong with net-2.6.19;
> > basic web browsing seems slower.
> 
> It might be due to other changes, please verify that it's
> truly caused by Alexey's change by backing it out and
> retesting.
> 
> Note that I had to use an updated version of Alexey's change,
> which he sent me privately, because the first version didn't
> compile :)

It might be something else; there are a lot of changes from 2.6.18 to net-2.6.19.



-- 
Stephen Hemminger <shemminger@osdl.org>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-19  0:37               ` Rick Jones
@ 2006-09-22 13:46                 ` Alexey Kuznetsov
  2006-09-22 17:15                   ` Rick Jones
  0 siblings, 1 reply; 43+ messages in thread
From: Alexey Kuznetsov @ 2006-09-22 13:46 UTC (permalink / raw)
  To: Rick Jones; +Cc: David Miller, alex, netdev

Hello!

> transactions to data segments is fubar.  That issue is also why I wonder 
> about the setting of tcp_abc.

Yes, switching ABC on/off has a visible impact on the number of segments.
When ABC is off, the number of segments is almost the same as the number
of transactions. When it is on, ~1.5% are merged. But this is invisible
in the throughput/CPU numbers.

The numbers:

1Gig link. The first column is "b"; a "-" separates runs of netperf in
the backward direction. The remaining columns are the transaction rate,
local/remote CPU utilization (%), and local/remote service demand.

Run #1. One host is slower.

        old,abc=0
	 new,abc=0
	  new,abc=1
	   old,abc=1

2	23652.00  6.31   21.11  10.665  8.924
	 23622.16  6.47   21.01  10.951  8.893
	  23625.05  6.21   21.01  10.512  8.891
	   23725.12  6.46   20.31  10.898  8.559
	-
	23594.87  21.90  6.44   9.283   10.912
	 23631.52  20.30  6.36   8.592   10.766
	  23609.55  21.00  6.26   8.896   10.599
	   23633.75  21.10  5.44   8.929   9.206

4	36349.11  8.71   31.21  9.584   8.585
	 36461.37  8.65   30.81  9.492   8.449
	  36723.72  8.22   31.31  8.949   8.526
	   35801.24  8.58   30.51  9.589   8.521
	-
	35127.34  33.80  8.43   9.621   9.605
	 36165.50  30.90  8.48   8.545   9.381
	  36201.45  31.10  8.31   8.592   9.185
	   35269.76  30.00  8.58   8.507   9.732

8	41148.23  10.39  42.30  10.101  10.281
	 41270.06  11.04  31.31  10.698  7.585
	  41181.56  5.66   48.61  5.496   11.803
	   40372.37  9.68   56.50  9.591   13.996
	-
	40392.14  47.00  11.89  11.637  11.775
	 40613.80  36.90  9.16   9.086   9.019
	  40504.66  53.60  7.73   13.234  7.639
	   40388.99  48.70  11.93  12.058  11.814

16	67952.27  16.27  43.70  9.576   6.432
	 68031.40  10.56  53.70  6.206   7.894
	  67777.95  12.81  46.90  7.559   6.920
	   67814.41  16.13  46.50  9.517   6.857
	-
	68031.46  51.30  11.53  7.541   6.781
	 68044.57  40.70  8.48   5.982   4.986
	  67808.13  39.60  15.86  5.840   9.355
	   67818.32  52.90  11.51  7.801   6.791

32	90445.09  15.41  99.90  6.817   11.045
	 90210.34  16.11  100.00 7.143   11.085
	  90221.84  17.31  98.90  7.676   10.962
	   90712.78  18.41  99.40  8.120   10.958
	-
	89155.51  99.90  12.89  11.205  5.782
	 90058.54  99.90  16.16  11.093  7.179
	  90092.31  98.60  15.41  10.944  6.840
	   88688.96  99.00  17.59  11.163  7.933

64	89983.76  13.66  100.00 6.071   11.113
	 90504.24  17.54  100.00 7.750   11.049
	  92043.36  17.44  99.70  7.580   10.832
	   90979.29  16.01  99.90  7.038   10.981
	-
	88615.27  99.90  14.91  11.273  6.729
	 89316.13  99.90  17.28  11.185  7.740
	  90622.85  99.90  16.81  11.024  7.420
	   89084.85  99.90  17.51  11.214  7.861

Run #2. The slower host is replaced with a better one. ABC=0.
No runs in the backward direction.

	new
	 old

2	24009.73  8.80   6.49   3.667   10.806
	 24008.43  8.00   6.32   3.334   10.524
4	40012.53  18.30  8.79   4.574   8.783
	 39999.84  19.40  8.86   4.851   8.857
8	60500.29  26.30  12.78  4.348   8.452
	 60397.79  26.30  11.73  4.355   7.769
16	69619.95  39.80  14.03  5.717   8.063
	 70528.72  24.90  14.43  3.531   8.184
32	132522.01  53.20  21.28  4.015   6.424
	 132602.93  57.70  22.59  4.351   6.813
64	145738.83  60.30  25.01  4.138   6.865
	 143129.55  73.20  24.19  5.114   6.759
128	148184.21  69.70  24.96  4.704   6.739
	 148143.47  71.00  25.01  4.793   6.753
256	144798.91  69.40  25.01  4.793   6.908
	 144086.01  73.00  24.61  5.067   6.832

Frankly, I do not see any statistically valid correlations.



> that "linux" didn't seem to be doing the same thing. Hence my tweaking 
> when seeing this patch come along...]

netperf does not catch this. :-)

Even with this patch Linux does not ACK every second segment dumbly;
it waits for some conditions, mostly read() emptying the receive queue.
To model this it is necessary to insert some gaps between the
bursted segments or to use a slow network.

I have no doubt it is easy to model a situation where we send
lots of useless ACKs, e.g. by inserting 20ms gaps between requests.
To see the effect on throughput/CPU, we could start enough
connections doing the same thing.

Alexey

^ permalink raw reply	[flat|nested] 43+ messages in thread
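
A sketch of the modelling Alexey suggests (hypothetical code, not the
thread's test program): a client that spaces small requests 20 ms apart
gives the receiver's delayed-ACK timer time to expire between segments,
so watching the wire with tcpdump should show close to one ACK per
segment.

#include <unistd.h>

/* Sketch: send one small request every 20 ms on a connected TCP
 * socket fd.  The gaps defeat ACK coalescing, so the cost of the
 * extra ACKs becomes visible once enough such connections run. */
static void send_with_gaps(int fd, int total)
{
	const char req = 'x';
	int i;

	for (i = 0; i < total; i++) {
		if (write(fd, &req, 1) < 0)
			break;
		usleep(20000);  /* 20 ms gap between requests */
	}
}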

* Re: [PATCH][RFC] Re: high latency with TCP connections
  2006-09-22 13:46                 ` Alexey Kuznetsov
@ 2006-09-22 17:15                   ` Rick Jones
  0 siblings, 0 replies; 43+ messages in thread
From: Rick Jones @ 2006-09-22 17:15 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, alex, netdev

Alexey Kuznetsov wrote:
> Hello!
> 
> 
>>transactions to data segments is fubar.  That issue is also why I wonder 
>>about the setting of tcp_abc.
> 
> 
> Yes, switching ABC on/off has a visible impact on the number of segments.
> When ABC is off, the number of segments is almost the same as the number
> of transactions. When it is on, ~1.5% are merged. But this is invisible
> in the throughput/CPU numbers.

Hmm, that would seem to suggest that for "new" the netperf/netserver 
were being fast enough that the code didn't perceive the receipt of 
back-to-back sub-MSS segments? (Is that even possible once -b is fairly 
large?)  Otherwise, with new I would have expected the segment count to 
be meaningfully greater than the transaction count?

> 
> The numbers:
> 
> 1Gig link. The first column is "b"; a "-" separates runs of netperf in
> the backward direction. The remaining columns are the transaction rate,
> local/remote CPU utilization (%), and local/remote service demand.
> 
> Run #1. One host is slower.
> 
>         old,abc=0
> 	 new,abc=0
> 	  new,abc=1
> 	   old,abc=1
> 
> 2	23652.00  6.31   21.11  10.665  8.924
> 	 23622.16  6.47   21.01  10.951  8.893
> 	  23625.05  6.21   21.01  10.512  8.891
> 	   23725.12  6.46   20.31  10.898  8.559
> 	-
> 	23594.87  21.90  6.44   9.283   10.912
> 	 23631.52  20.30  6.36   8.592   10.766
> 	  23609.55  21.00  6.26   8.896   10.599
> 	   23633.75  21.10  5.44   8.929   9.206
> 
> 4	36349.11  8.71   31.21  9.584   8.585
> 	 36461.37  8.65   30.81  9.492   8.449
> 	  36723.72  8.22   31.31  8.949   8.526
> 	   35801.24  8.58   30.51  9.589   8.521
> 	-
> 	35127.34  33.80  8.43   9.621   9.605
> 	 36165.50  30.90  8.48   8.545   9.381
> 	  36201.45  31.10  8.31   8.592   9.185
> 	   35269.76  30.00  8.58   8.507   9.732
> 
> 8	41148.23  10.39  42.30  10.101  10.281
> 	 41270.06  11.04  31.31  10.698  7.585
> 	  41181.56  5.66   48.61  5.496   11.803
> 	   40372.37  9.68   56.50  9.591   13.996
> 	-
> 	40392.14  47.00  11.89  11.637  11.775
> 	 40613.80  36.90  9.16   9.086   9.019
> 	  40504.66  53.60  7.73   13.234  7.639
> 	   40388.99  48.70  11.93  12.058  11.814
> 
> 16	67952.27  16.27  43.70  9.576   6.432
> 	 68031.40  10.56  53.70  6.206   7.894
> 	  67777.95  12.81  46.90  7.559   6.920
> 	   67814.41  16.13  46.50  9.517   6.857
> 	-
> 	68031.46  51.30  11.53  7.541   6.781
> 	 68044.57  40.70  8.48   5.982   4.986
> 	  67808.13  39.60  15.86  5.840   9.355
> 	   67818.32  52.90  11.51  7.801   6.791
> 
> 32	90445.09  15.41  99.90  6.817   11.045
> 	 90210.34  16.11  100.00 7.143   11.085
> 	  90221.84  17.31  98.90  7.676   10.962
> 	   90712.78  18.41  99.40  8.120   10.958
> 	-
> 	89155.51  99.90  12.89  11.205  5.782
> 	 90058.54  99.90  16.16  11.093  7.179
> 	  90092.31  98.60  15.41  10.944  6.840
> 	   88688.96  99.00  17.59  11.163  7.933
> 
> 64	89983.76  13.66  100.00 6.071   11.113
> 	 90504.24  17.54  100.00 7.750   11.049
> 	  92043.36  17.44  99.70  7.580   10.832
> 	   90979.29  16.01  99.90  7.038   10.981
> 	-
> 	88615.27  99.90  14.91  11.273  6.729
> 	 89316.13  99.90  17.28  11.185  7.740
> 	  90622.85  99.90  16.81  11.024  7.420
> 	   89084.85  99.90  17.51  11.214  7.861
> 
> Run #2. The slower host is replaced with a better one. ABC=0.
> No runs in the backward direction.
> 
> 	new
> 	 old
> 
> 2	24009.73  8.80   6.49   3.667   10.806
> 	 24008.43  8.00   6.32   3.334   10.524
> 4	40012.53  18.30  8.79   4.574   8.783
> 	 39999.84  19.40  8.86   4.851   8.857
> 8	60500.29  26.30  12.78  4.348   8.452
> 	 60397.79  26.30  11.73  4.355   7.769
> 16	69619.95  39.80  14.03  5.717   8.063
> 	 70528.72  24.90  14.43  3.531   8.184
> 32	132522.01  53.20  21.28  4.015   6.424
> 	 132602.93  57.70  22.59  4.351   6.813
> 64	145738.83  60.30  25.01  4.138   6.865
> 	 143129.55  73.20  24.19  5.114   6.759
> 128	148184.21  69.70  24.96  4.704   6.739
> 	 148143.47  71.00  25.01  4.793   6.753
> 256	144798.91  69.40  25.01  4.793   6.908
> 	 144086.01  73.00  24.61  5.067   6.832
> 
> Frankly, I do not see any statistically valid correlations.

It does look like it jumps around quite a bit - for example, run #2 with 
-b 16 had the CPU utilization all over the map on the netperf side.  That 
wasn't by any chance an SMP system?

>>that "linux" didn't seem to be doing the same thing. Hence my tweaking 
>>when seeing this patch come along...]
> 
> 
> netperf does not catch this. :-)

Nope :(  One of these days.... I need to teach netperf how to extract 
TCP statistics from as many platforms as possible.  Meanwhile it relies, 
as always, on the kindness of benchmarkers :) (My apologies to Tennessee 
Williams :)

> Even with this patch Linux does not ACK every second segment dumbly;
> it waits for some conditions, mostly read() emptying the receive queue.

Good.  HP-UX is indeed dumb about this, but I'm assured it will be 
changing.  I forget what Solaris does in this situation - I thought I 
looked a while ago but cannot recall the result.

> To model this it is necessary to insert some gaps between the
> bursted segments or to use a slow network.
> 
> I have no doubt it is easy to model a situation where we send
> lots of useless ACKs, e.g. by inserting 20ms gaps between requests.
> To see the effect on throughput/CPU, we could start enough
> connections doing the same thing.

Adding --enable-intervals might work there.  I don't recall how well it 
gets along with --enable-burst though, and you have already made lots of 
runs as it is.

rick

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2006-09-22 17:15 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-08-30 10:07 high latency with TCP connections Alexander Vodomerov
2006-08-30 17:27 ` Stephen Hemminger
2006-08-30 21:39   ` David Miller
2006-08-30 22:04     ` Stephen Hemminger
2006-08-30 23:00     ` Rick Jones
2006-08-31  8:14     ` Alexander Vodomerov
2006-08-31 15:44       ` Sridhar Samudrala
2006-08-31 18:22     ` Kelly Burkhart
2006-08-31 19:40       ` Rick Jones
2006-08-31 21:08       ` Ian McDonald
2006-08-31 21:46   ` Alexey Kuznetsov
2006-08-31 22:14     ` Stephen Hemminger
2006-08-31 22:44       ` David Miller
2006-08-31 23:29         ` Alexey Kuznetsov
2006-08-31 23:57           ` David Miller
2006-09-01  3:23             ` Stephen Hemminger
2006-09-01  3:39               ` Ian McDonald
2006-09-01  6:23                 ` David Miller
2006-09-01  9:44             ` Pekka Savola
2006-09-01  9:49               ` David Miller
2006-09-01  9:47             ` Alexey Kuznetsov
2006-09-01 11:00               ` Evgeniy Polyakov
     [not found]                 ` <20060901090046.69b3d583@localhost.localdomain>
2006-09-01 20:55                   ` [PATCH] tcp: turn ABC off Stephen Hemminger
2006-09-02  7:22                     ` Evgeniy Polyakov
2006-09-02  8:10                       ` Herbert Xu
2006-09-04  9:10                 ` high latency with TCP connections Alexey Kuznetsov
2006-09-04 16:00 ` [PATCH][RFC] " Alexey Kuznetsov
2006-09-05 17:55   ` Rick Jones
2006-09-05 22:13     ` Alexey Kuznetsov
2006-09-18  7:39     ` David Miller
2006-09-18 17:11       ` Rick Jones
2006-09-18 20:41         ` Alexey Kuznetsov
2006-09-18 21:24           ` Rick Jones
2006-09-18 22:51             ` Alexey Kuznetsov
2006-09-19  0:37               ` Rick Jones
2006-09-22 13:46                 ` Alexey Kuznetsov
2006-09-22 17:15                   ` Rick Jones
2006-09-18  7:31   ` David Miller
2006-09-18 10:37     ` Alexey Kuznetsov
2006-09-18 13:56       ` David Miller
2006-09-20 22:44         ` Stephen Hemminger
2006-09-20 22:47           ` David Miller
2006-09-20 22:55             ` Stephen Hemminger
