All of lore.kernel.org
 help / color / mirror / Atom feed
* resurrecting tcphealth
@ 2012-07-12 20:55 Piotr Sawuk
  2012-07-12 21:35 ` Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-12 20:55 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

hello! I haven't done any kernel-hacking before, so be patient.

I got as far as to make tcphealth run, but now I need some help:
how does read-locking work in the tcp_sock struct?
the original code (for 2.5.1) made a read_lock(&head->lock) with
struct tcp_ehash_bucket *head = &tcp_ehash[i];
at the beginning of the for-loop over all sk=head->chain.
i.e.
       for (i=0; i < tcp_ehash_size; i++) {
               struct tcp_ehash_bucket *head = &tcp_ehash[i];
               struct sock *sk;
               struct tcp_opt *tp;

               read_lock(&head->lock);
               for (sk=head->chain; sk; sk=sk->next) {
                       if (!TCP_INET_FAMILY(sk->family))
                               continue;

and in the new kernel this construction has been replaced by

        if (st->state==TCP_SEQ_STATE_ESTABLISHED)
...
static struct tcp_seq_afinfo tcphealth_seq_afinfo
...

and the example with proc/net/tcp is seemingly not locking anything. do I
actually need a read-lock for diagnostic data from tcp_sock? why did the
original author need to lock the whole chain of tcp_sock?

here's my patch against 3.4.4 so far, any further comments welcome:

diff -rub linux-3.4.4/include/linux/tcp.h
linux-3.4.4-heal-lsm/include/linux/tcp.h
--- linux-3.4.4/include/linux/tcp.h	2012-06-22 20:37:50.000000000 +0200
+++ linux-3.4.4-heal-lsm/include/linux/tcp.h	2012-07-06 10:23:13.000000000
+0200
@@ -7,6 +7,8 @@
  *
  * Version:	@(#)tcp.h	1.0.2	04/28/93
  *
+ *      Federico D. Sacerdoti	:	Added TCP health counters.
+ *
  * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *
  *		This program is free software; you can redistribute it and/or
@@ -472,6 +474,15 @@
 	 * contains related tcp_cookie_transactions fields.
 	 */
 	struct tcp_cookie_values  *cookie_values;
+
+	/*
+	 * TCP health monitoring counters.
+	 */
+	__u32	dup_acks_sent;
+	__u32	dup_pkts_recv;
+	__u32	acks_sent;
+	__u32	pkts_recv;
+		__u32	last_ack_sent;	/* Sequence number of the last ack we sent. */
 };

 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -rub linux-3.4.4/net/ipv4/tcp_input.c
linux-3.4.4-heal-lsm/net/ipv4/tcp_input.c
--- linux-3.4.4/net/ipv4/tcp_input.c	2012-06-22 20:37:50.000000000 +0200
+++ linux-3.4.4-heal-lsm/net/ipv4/tcp_input.c	2012-07-06
10:12:12.000000000 +0200
@@ -59,7 +59,8 @@
  *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
  *					engine. Lots of bugs are found.
  *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
- */
+ *		Federico D. Sacerdoti:	Added TCP health monitoring.
+*/

 #define pr_fmt(fmt) "TCP: " fmt

@@ -4414,6 +4415,8 @@
 		}

 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+			/* Course retransmit inefficiency- this packet has been received
twice. */
+			tp->dup_pkts_recv++;
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
@@ -4664,6 +4667,10 @@
 		return;
 	}

+	/* A packet is a "duplicate" if it contains bytes we have already
received. */
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+		tp->dup_pkts_recv++;
+
 	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 		/* A retransmit, 2nd most common case.  Force an immediate ack. */
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
@@ -5375,6 +5382,14 @@

 	tp->rx_opt.saw_tstamp = 0;

+	/*
+	 *	Tcp health monitoring is interested in
+	 *	total per-connection packet arrivals.
+	 *	This is in the fast path, but is quick.
+	 */
+
+	tp->pkts_recv++;
+
 	/*	pred_flags is 0xS?10 << 16 + snd_wnd
 	 *	if header_prediction is to be made
 	 *	'S' will always be tp->tcp_header_len >> 2
diff -rub linux-3.4.4/net/ipv4/tcp_ipv4.c
linux-3.4.4-heal-lsm/net/ipv4/tcp_ipv4.c
--- linux-3.4.4/net/ipv4/tcp_ipv4.c	2012-06-22 20:37:50.000000000 +0200
+++ linux-3.4.4-heal-lsm/net/ipv4/tcp_ipv4.c	2012-07-11 09:34:22.000000000
+0200
@@ -2533,6 +2533,82 @@
 	return 0;
 }

+
+/*
+ *	Output /proc/net/tcphealth
+ */
+#define LINESZ 128
+
+int tcp_health_seq_show(struct seq_file *seq, void *v)
+{
+	int len, num;
+       char srcIP[32], destIP[32];
+
+	unsigned long  SmoothedRttEstimate,
+		AcksSent, DupAcksSent, PktsRecv, DupPktsRecv;
+	struct tcp_iter_state *st;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+		"TCP Health Monitoring (established connections only)\n"
+		" -Duplicate ACKs indicate lost or reordered packets on the connection.\n"
+		" -Duplicate Packets Received signal a slow and badly inefficient
connection.\n"
+		" -RttEst estimates how long future packets will take on a round trip
over the connection.\n"
+		"id   Local Address        Remote Address       RttEst(ms) AcksSent "
+		"DupAcksSent PktsRecv DupPktsRecv\n");
+		goto out;
+	}
+
+	/* Loop through established TCP connections */
+	st = seq->private;
+
+
+	if (st->state==TCP_SEQ_STATE_ESTABLISHED)
+	{
+/*	; //insert read-lock here */
+		const struct tcp_sock *tp = tcp_sk(v);
+		const struct inet_sock *inet = inet_sk(v);
+		__be32 dest = inet->inet_daddr;
+		__be32 src = inet->inet_rcv_saddr;
+		__u16 destp = ntohs(inet->inet_dport);
+		__u16 srcp = ntohs(inet->inet_sport);
+
+		num=st->num;
+		SmoothedRttEstimate = (tp->srtt >> 3);
+		AcksSent = tp->acks_sent;
+		DupAcksSent = tp->dup_acks_sent;
+		PktsRecv = tp->pkts_recv;
+		DupPktsRecv = tp->dup_pkts_recv;
+
+		sprintf(srcIP, "%lu.%lu.%lu.%lu:%u",
+			((src >> 24) & 0xFF), ((src >> 16) & 0xFF), ((src >> 8) & 0xFF), (src
& 0xFF),
+			srcp);
+		sprintf(destIP, "%3d.%3d.%3d.%3d:%u",
+			((dest >> 24) & 0xFF), ((dest >> 16) & 0xFF), ((dest >> 8) & 0xFF),
(dest & 0xFF),
+			destp);
+
+		seq_printf(seq, "%d: %-21s %-21s "
+				"%8lu %8lu %8lu %8lu %8lu%n",
+				num,
+				srcIP,
+				destIP,
+				SmoothedRttEstimate,
+				AcksSent,
+				DupAcksSent,
+				PktsRecv,
+				DupPktsRecv,
+
+				&len
+			);
+
+		seq_printf(seq, "%*s\n", LINESZ - 1 - len, "");
+/*	; //insert read-unlock here */
+	}
+
+out:
+	return 0;
+}
+
 static const struct file_operations tcp_afinfo_seq_fops = {
 	.owner   = THIS_MODULE,
 	.open    = tcp_seq_open,
@@ -2541,6 +2617,15 @@
 	.release = seq_release_net
 };

+static struct tcp_seq_afinfo tcphealth_seq_afinfo = {
+	.name		= "tcphealth",
+	.family		= AF_INET,
+	.seq_fops	= &tcp_afinfo_seq_fops,
+	.seq_ops	= {
+		.show		= tcp_health_seq_show,
+	},
+};
+
 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
 	.name		= "tcp",
 	.family		= AF_INET,
@@ -2552,12 +2637,15 @@

 static int __net_init tcp4_proc_init_net(struct net *net)
 {
-	return tcp_proc_register(net, &tcp4_seq_afinfo);
+	int ret=tcp_proc_register(net, &tcp4_seq_afinfo);
+	if(ret==0) ret=tcp_proc_register(net, &tcphealth_seq_afinfo);
+	return ret;
 }

 static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+	tcp_proc_unregister(net, &tcphealth_seq_afinfo);
 }

 static struct pernet_operations tcp4_net_ops = {
diff -rub linux-3.4.4/net/ipv4/tcp_output.c
linux-3.4.4-heal-lsm/net/ipv4/tcp_output.c
--- linux-3.4.4/net/ipv4/tcp_output.c	2012-06-22 20:37:50.000000000 +0200
+++ linux-3.4.4-heal-lsm/net/ipv4/tcp_output.c	2012-07-06
17:15:14.000000000 +0200
@@ -31,6 +31,7 @@
  *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
  *		Cacophonix Gaul :	draft-minshall-nagle-01
  *		J Hadi Salim	:	ECN support
+ *	Federico D. Sacerdoti	:	Added TCP health monitoring.
  *
  */

@@ -2754,8 +2755,15 @@
 	skb_reserve(buff, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

+		/* If the rcv_nxt has not advanced since sending our last ACK, this is
a duplicate. */
+		if (tcp_sk(sk)->rcv_nxt == tcp_sk(sk)->last_ack_sent)
+			tcp_sk(sk)->dup_acks_sent++;
+		/* Record the total number of acks sent on this connection. */
+		tcp_sk(sk)->acks_sent++;
+
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+		tcp_sk(sk)->last_ack_sent = tcp_sk(sk)->rcv_nxt;
 	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
 }



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-12 20:55 resurrecting tcphealth Piotr Sawuk
@ 2012-07-12 21:35 ` Stephen Hemminger
  2012-07-12 22:29 ` Randy Dunlap
  2012-07-14 21:48 ` Stephen Hemminger
  2 siblings, 0 replies; 24+ messages in thread
From: Stephen Hemminger @ 2012-07-12 21:35 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On Thu, 12 Jul 2012 22:55:57 +0200
"Piotr Sawuk" <a9702387@unet.univie.ac.at> wrote:

> diff -rub linux-3.4.4/net/ipv4/tcp_input.c
> linux-3.4.4-heal-lsm/net/ipv4/tcp_input.c
> --- linux-3.4.4/net/ipv4/tcp_input.c	2012-06-22 20:37:50.000000000 +0200
> +++ linux-3.4.4-heal-lsm/net/ipv4/tcp_input.c	2012-07-06
> 10:12:12.000000000 +0200
> @@ -59,7 +59,8 @@
>   *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
>   *					engine. Lots of bugs are found.
>   *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
> - */
> + *		Federico D. Sacerdoti:	Added TCP health monitoring.

Please don't do this.
The kernel community no longer maintains a list of contributors
in the comments. The history is maintained in the git commit log.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-12 20:55 resurrecting tcphealth Piotr Sawuk
  2012-07-12 21:35 ` Stephen Hemminger
@ 2012-07-12 22:29 ` Randy Dunlap
  2012-07-14 21:48 ` Stephen Hemminger
  2 siblings, 0 replies; 24+ messages in thread
From: Randy Dunlap @ 2012-07-12 22:29 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On 07/12/2012 01:55 PM, Piotr Sawuk wrote:

> hello! I haven't done any kernel-hacking before, so be patient.
> 
> I got as far as to make tcphealth run, but now I need some help:
> how does read-locking work in the tcp_sock struct?
> the original code (for 2.5.1) made a read_lock(&head->lock) with
> struct tcp_ehash_bucket *head = &tcp_ehash[i];
> at the beginning of the for-loop over all sk=head->chain.
> i.e.
>        for (i=0; i < tcp_ehash_size; i++) {
>                struct tcp_ehash_bucket *head = &tcp_ehash[i];
>                struct sock *sk;
>                struct tcp_opt *tp;
> 
>                read_lock(&head->lock);
>                for (sk=head->chain; sk; sk=sk->next) {
>                        if (!TCP_INET_FAMILY(sk->family))
>                                continue;
> 
> and in the new kernel this construction has been replaced by
> 
>         if (st->state==TCP_SEQ_STATE_ESTABLISHED)
> ...
> static struct tcp_seq_afinfo tcphealth_seq_afinfo
> ...
> 
> and the example with proc/net/tcp is seemingly not locking anything. do I
> actually need a read-lock for diagnostic data from tcp_sock? why did the
> original author need to lock the whole chain of tcp_sock?
> 
> here's my patch against 3.4.4 so far, any further comments welcome:


Along with what Mr. Hemminger said:

Don't make patches to a stable kernel version.  Just make them
to mainline (e.g., 3.5-rc6).

More comments below.


> 
> diff -rub linux-3.4.4/include/linux/tcp.h
> linux-3.4.4-heal-lsm/include/linux/tcp.h
> --- linux-3.4.4/include/linux/tcp.h	2012-06-22 20:37:50.000000000 +0200
> +++ linux-3.4.4-heal-lsm/include/linux/tcp.h	2012-07-06 10:23:13.000000000
> +0200



See the 2 lines above.  They should be all on one line (the +0200 is
on a separate line).  Something along the way split this long line for
you, but this is bad.  It corrupts the patch.


> @@ -472,6 +474,15 @@
>  	 * contains related tcp_cookie_transactions fields.
>  	 */
>  	struct tcp_cookie_values  *cookie_values;
> +
> +	/*
> +	 * TCP health monitoring counters.
> +	 */
> +	__u32	dup_acks_sent;
> +	__u32	dup_pkts_recv;
> +	__u32	acks_sent;
> +	__u32	pkts_recv;
> +		__u32	last_ack_sent;	/* Sequence number of the last ack we sent.


Extra tab before __u32 above.

 */
>  };
> 
>  static inline struct tcp_sock *tcp_sk(const struct sock *sk)
> diff -rub linux-3.4.4/net/ipv4/tcp_input.c
> linux-3.4.4-heal-lsm/net/ipv4/tcp_input.c
> --- linux-3.4.4/net/ipv4/tcp_input.c	2012-06-22 20:37:50.000000000 +0200
> +++ linux-3.4.4-heal-lsm/net/ipv4/tcp_input.c	2012-07-06
> 10:12:12.000000000 +0200


Bad line(s) above.

> @@ -5375,6 +5382,14 @@
> 
>  	tp->rx_opt.saw_tstamp = 0;
> 
> +	/*
> +	 *	Tcp health monitoring is interested in
> +	 *	total per-connection packet arrivals.
> +	 *	This is in the fast path, but is quick.
> +	 */
> +


The	*/
is enough white space; don't add an extra line.

> +	tp->pkts_recv++;
> +
>  	/*	pred_flags is 0xS?10 << 16 + snd_wnd
>  	 *	if header_prediction is to be made
>  	 *	'S' will always be tp->tcp_header_len >> 2
> diff -rub linux-3.4.4/net/ipv4/tcp_ipv4.c
> linux-3.4.4-heal-lsm/net/ipv4/tcp_ipv4.c
> --- linux-3.4.4/net/ipv4/tcp_ipv4.c	2012-06-22 20:37:50.000000000 +0200
> +++ linux-3.4.4-heal-lsm/net/ipv4/tcp_ipv4.c	2012-07-11 09:34:22.000000000
> +0200
> @@ -2533,6 +2533,82 @@
>  	return 0;
>  }
> 
> +
> +/*
> + *	Output /proc/net/tcphealth
> + */
> +#define LINESZ 128
> +
> +int tcp_health_seq_show(struct seq_file *seq, void *v)
> +{
> +	int len, num;
> +       char srcIP[32], destIP[32];


Use tab above for indentation (don't use spaces).

> +
> +	unsigned long  SmoothedRttEstimate,
> +		AcksSent, DupAcksSent, PktsRecv, DupPktsRecv;
> +	struct tcp_iter_state *st;
> +
> +	if (v == SEQ_START_TOKEN) {
> +		seq_printf(seq,
> +		"TCP Health Monitoring (established connections only)\n"
> +		" -Duplicate ACKs indicate lost or reordered packets on the connection.\n"
> +		" -Duplicate Packets Received signal a slow and badly inefficient
> connection.\n"
> +		" -RttEst estimates how long future packets will take on a round trip
> over the connection.\n"
> +		"id   Local Address        Remote Address       RttEst(ms) AcksSent "
> +		"DupAcksSent PktsRecv DupPktsRecv\n");
> +		goto out;
> +	}
> +
> +	/* Loop through established TCP connections */
> +	st = seq->private;
> +
> +
> +	if (st->state==TCP_SEQ_STATE_ESTABLISHED)
> +	{


Kernel style is:

	if (st->state == TCP_SEQ_STATE_ESTABLISHED) {


> +/*	; //insert read-lock here */
> +		const struct tcp_sock *tp = tcp_sk(v);
> +		const struct inet_sock *inet = inet_sk(v);
> +		__be32 dest = inet->inet_daddr;
> +		__be32 src = inet->inet_rcv_saddr;
> +		__u16 destp = ntohs(inet->inet_dport);
> +		__u16 srcp = ntohs(inet->inet_sport);
> +
> +		num=st->num;


		num = st->num;

> +		SmoothedRttEstimate = (tp->srtt >> 3);
> +		AcksSent = tp->acks_sent;
> +		DupAcksSent = tp->dup_acks_sent;
> +		PktsRecv = tp->pkts_recv;
> +		DupPktsRecv = tp->dup_pkts_recv;
> +
> +		sprintf(srcIP, "%lu.%lu.%lu.%lu:%u",
> +			((src >> 24) & 0xFF), ((src >> 16) & 0xFF), ((src >> 8) & 0xFF), (src
> & 0xFF),
> +			srcp);
> +		sprintf(destIP, "%3d.%3d.%3d.%3d:%u",
> +			((dest >> 24) & 0xFF), ((dest >> 16) & 0xFF), ((dest >> 8) & 0xFF),
> (dest & 0xFF),
> +			destp);
> +
> +		seq_printf(seq, "%d: %-21s %-21s "
> +				"%8lu %8lu %8lu %8lu %8lu%n",
> +				num,
> +				srcIP,
> +				destIP,
> +				SmoothedRttEstimate,
> +				AcksSent,
> +				DupAcksSent,
> +				PktsRecv,
> +				DupPktsRecv,
> +
> +				&len
> +			);
> +
> +		seq_printf(seq, "%*s\n", LINESZ - 1 - len, "");
> +/*	; //insert read-unlock here */
> +	}
> +
> +out:
> +	return 0;
> +}
> +
>  static const struct file_operations tcp_afinfo_seq_fops = {
>  	.owner   = THIS_MODULE,
>  	.open    = tcp_seq_open,
> @@ -2541,6 +2617,15 @@
>  	.release = seq_release_net
>  };
> 
> +static struct tcp_seq_afinfo tcphealth_seq_afinfo = {
> +	.name		= "tcphealth",
> +	.family		= AF_INET,
> +	.seq_fops	= &tcp_afinfo_seq_fops,
> +	.seq_ops	= {
> +		.show		= tcp_health_seq_show,
> +	},
> +};
> +
>  static struct tcp_seq_afinfo tcp4_seq_afinfo = {
>  	.name		= "tcp",
>  	.family		= AF_INET,
> @@ -2552,12 +2637,15 @@
> 
>  static int __net_init tcp4_proc_init_net(struct net *net)
>  {
> -	return tcp_proc_register(net, &tcp4_seq_afinfo);
> +	int ret=tcp_proc_register(net, &tcp4_seq_afinfo);


	int ret = tcp_proc_register(...);

> +	if(ret==0) ret=tcp_proc_register(net, &tcphealth_seq_afinfo);


	if (ret == 0)
		ret = tcp_proc_register(...);

> +	return ret;
>  }
> 
>  static void __net_exit tcp4_proc_exit_net(struct net *net)
>  {
>  	tcp_proc_unregister(net, &tcp4_seq_afinfo);
> +	tcp_proc_unregister(net, &tcphealth_seq_afinfo);
>  }
> 
>  static struct pernet_operations tcp4_net_ops = {
> diff -rub linux-3.4.4/net/ipv4/tcp_output.c
> linux-3.4.4-heal-lsm/net/ipv4/tcp_output.c
> --- linux-3.4.4/net/ipv4/tcp_output.c	2012-06-22 20:37:50.000000000 +0200
> +++ linux-3.4.4-heal-lsm/net/ipv4/tcp_output.c	2012-07-06
> 17:15:14.000000000 +0200


Bad split lines above.

> @@ -2754,8 +2755,15 @@
>  	skb_reserve(buff, MAX_TCP_HEADER);
>  	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
> 
> +		/* If the rcv_nxt has not advanced since sending our last ACK, this is
> a duplicate. */
> +		if (tcp_sk(sk)->rcv_nxt == tcp_sk(sk)->last_ack_sent)
> +			tcp_sk(sk)->dup_acks_sent++;
> +		/* Record the total number of acks sent on this connection. */
> +		tcp_sk(sk)->acks_sent++;
> +
>  	/* Send it off, this clears delayed acks for us. */
>  	TCP_SKB_CB(buff)->when = tcp_time_stamp;
> +		tcp_sk(sk)->last_ack_sent = tcp_sk(sk)->rcv_nxt;


Extra tab indentation above.

>  	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
>  }
> 
> 
> --


-- 
~Randy

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-12 20:55 resurrecting tcphealth Piotr Sawuk
  2012-07-12 21:35 ` Stephen Hemminger
  2012-07-12 22:29 ` Randy Dunlap
@ 2012-07-14 21:48 ` Stephen Hemminger
  2012-07-14 23:43   ` Piotr Sawuk
  2 siblings, 1 reply; 24+ messages in thread
From: Stephen Hemminger @ 2012-07-14 21:48 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

Maybe it would be better to use netlink info (for ss command)
rather than a /proc/net interface.
See how existing TCP values and MEMINFO are handled.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-14 21:48 ` Stephen Hemminger
@ 2012-07-14 23:43   ` Piotr Sawuk
  2012-07-15  7:16     ` Eric Dumazet
  0 siblings, 1 reply; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-14 23:43 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On Sa, 14.07.2012, 23:48, Stephen Hemminger wrote:
> Maybe it would be better to use netlink info (for ss command)
> rather than a /proc/net interface.
> See how existing TCP values and MEMINFO are handled.
>

I'm confused, what exactly do you mean?
of course a library-interface might be more interesting than proc/net.
the proc/net/tcphealth functionality probably could be done in userspace,
although I'm wondering how userspace could traverse all tcp_sock structures
for all clients -- but the gain would be a bit more security. and the rest
of the patch still would need to stay in the kernel since that info it
collects would otherwise be lost completely. so I somehow doubt this would
be worth the effort. better would be optional tcphealth disabled by default
for security-reasons. (i.e. if you're a server you don't want to enable it.)

@"David Miller":
sorry, I forgot an important info about this patch, an info that can be
found in the paper I quoted:

'We concentrate on client side metrics to make our results more applicable
to web clients. We chose this strategy to help answer the often asked
question: "Why is my Internet connection so slow?"
...
Since we only have access to the client node and not the server, we must
find performance data using only the information coming downstream to us.
Savage showed that the TCP protocol maintains data that we can leverage to
our advantage and we use that idea to monitor per-connection TCP health at
the client. This task is made difficult because of the TCP philosophy of
"smart sender, dumb receiver". There is simply more information about the
connection on the server side than on the client.' [1]

tcp_info is such a server-side diagnosis, clients usually don't send enough
data to make the info therein meaningful for checking the "health" of tcp.
it's all about the senders, and the desktop-pc simply isn't that big of a
tcp-sender to slashdot and co.

@"Eric Dumazet":
the same goes for you too. netstat -s and whatever kind of pooling together
statistics on all internet-connections is useful for a server-admin only,
slowness of particular sites can only be investigated with ping and this
tcphealth patch when you're on client-side.

as for userspace-tools, apart from tcp-health display in each
downloading-progress-bar I can think of cron-jobs which wait for better
tcp-health before spidering some site, or an altered wget in mirror-mode.
the gkrellm-plugin that comes with this patch might count together all the
data of tcphealth, but when a tcp_sock isn't stored anymore that data is
forgotten and you get current info most likely for only one site, since
the client won't have many connections open and will be closing old
connections frequently.

oh, and again I recommend the really short although outdated thesis

[1] https://sacerdoti.org/tcphealth/tcphealth-paper.pdf


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-14 23:43   ` Piotr Sawuk
@ 2012-07-15  7:16     ` Eric Dumazet
  2012-07-15  9:17       ` Piotr Sawuk
  0 siblings, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2012-07-15  7:16 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On Sun, 2012-07-15 at 01:43 +0200, Piotr Sawuk wrote:

> oh, and again I recommend the really short although outdated thesis
> 
> [1] https://sacerdoti.org/tcphealth/tcphealth-paper.pdf

A thesis saying SACK are not useful is highly suspect.

Instead of finding why they behave not so good and fix the bugs, just
say "SACK addition to TCP is not critical"

Really ?



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-15  7:16     ` Eric Dumazet
@ 2012-07-15  9:17       ` Piotr Sawuk
  2012-07-15  9:53         ` Eric Dumazet
  0 siblings, 1 reply; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-15  9:17 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On So, 15.07.2012, 09:16, Eric Dumazet wrote:
> On Sun, 2012-07-15 at 01:43 +0200, Piotr Sawuk wrote:
>
>> oh, and again I recommend the really short although outdated thesis
>>
>> [1] https://sacerdoti.org/tcphealth/tcphealth-paper.pdf
>
> A thesis saying SACK are not useful is highly suspect.
>
> Instead of finding why they behave not so good and fix the bugs, just
> say "SACK addition to TCP is not critical"
the actual quotation is "We also found that the number of unnecessary
duplicate packets were quite small potentially indicating that the SACK
addition to TCP is not critical."
>
> Really ?

no, not really. He actually said that SACK has been made mostly obsolete
by "Linux 2.2 implements fast retransmits for up to two packet gaps, thus
reducing the need for course grained timeouts due to the lack of SACK." and
he was a bit more careful and admitted that further tests with tcphealth are
needed to check if SACK really makes that big a difference. he admitted "It
could be that SACK's advantage lies in other areas such as very large
downloads or when using slow and unreliable network links." all these things
could be checked again nowadays, with larger files available and wlan-users
and higher traffic -- just find something without SACK...


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-15  9:17       ` Piotr Sawuk
@ 2012-07-15  9:53         ` Eric Dumazet
  2012-07-15 22:17           ` Piotr Sawuk
  0 siblings, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2012-07-15  9:53 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On Sun, 2012-07-15 at 11:17 +0200, Piotr Sawuk wrote:
> On So, 15.07.2012, 09:16, Eric Dumazet wrote:
> > On Sun, 2012-07-15 at 01:43 +0200, Piotr Sawuk wrote:
> >
> >> oh, and again I recommend the really short although outdated thesis
> >>
> >> [1] https://sacerdoti.org/tcphealth/tcphealth-paper.pdf
> >
> > A thesis saying SACK are not useful is highly suspect.
> >
> > Instead of finding why they behave not so good and fix the bugs, just
> > say "SACK addition to TCP is not critical"
> the actual quotation is "We also found that the number of unnecessary
> duplicate packets were quite small potentially indicating that the SACK
> addition to TCP is not critical."
> >
> > Really ?
> 
> no, not really. he he actually said that SACK has been made mostly obsolete
> by "Linux 2.2 implements fast retransmits for up to two packet gaps, thus
> reducing the need for course grained timeouts due to the lack of SACK." and
> he was a bit more careful and admitted that further tests with tcphealth are
> needed to check if SACK really makes that big a difference. he admitted "It
> could be that SACK's advantage lies in other areas such as very large
> downloads or when using slow and unreliable network links." all these things
> could be checked again nowadays, with larger files available and wlan-users
> and higher traffic -- just find something without SACK...

There are hundred of papers about TCP behavior. Many are very good.

This one seems not the best of them by far, and is based on measures
done on 2001 (!!!), on a single computer (!!!), connected to a
particular ISP (!!!), using a wireless pcmcia network card. (!!!)

At that time, almost no clients were using SACK. Because windows 98/XP
dont negociate SACK by default (you need to tweak registry)

Its like saying ECN is useless : If ECN users are under 1 % of total
number of users, network is still under pressure and ECN benefits cannot
rise because of misbehavior of other flows.

With RTT of 100 ms, SACK are clearly a win for long transferts.

A single drop shall retransmit a single packet instead of ~500 packets.
Only fools could deny this fact. Studying DuplicateAcks to detect
retransmits is clearly wrong.

Really, dont recommend this paper, it contains a lot of false
statements.

One example : "we discovered some surprising things as the high
percentage of lost or reordered packets from supposedly highly reliable
and fast services as Akamai networks". 

I cant believe such nonsense can be written, and recommended.

So you can add more counters to TCP stack because having them is good to
better understand TCP behavior and what can be done to improve it, but
dont base this work on dubious 'tcphealth'.




^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-15  9:53         ` Eric Dumazet
@ 2012-07-15 22:17           ` Piotr Sawuk
  0 siblings, 0 replies; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-15 22:17 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On So, 15.07.2012, 11:53, Eric Dumazet wrote:
> On Sun, 2012-07-15 at 11:17 +0200, Piotr Sawuk wrote:
>> On So, 15.07.2012, 09:16, Eric Dumazet wrote:
>> > On Sun, 2012-07-15 at 01:43 +0200, Piotr Sawuk wrote:
>> >
>> >> oh, and again I recommend the really short although outdated thesis
>> >>
>> >> [1] https://sacerdoti.org/tcphealth/tcphealth-paper.pdf
>> >
>> > A thesis saying SACK are not useful is highly suspect.
>> >
>> > Instead of finding why they behave not so good and fix the bugs, just
>> > say "SACK addition to TCP is not critical"
>> the actual quotation is "We also found that the number of unnecessary
>> duplicate packets were quite small potentially indicating that the SACK
>> addition to TCP is not critical."
>> >
>> > Really ?
>>
>> no, not really. He actually said that SACK has been made mostly
>> obsolete
>> by "Linux 2.2 implements fast retransmits for up to two packet gaps, thus
>> reducing the need for course grained timeouts due to the lack of SACK."
>> and
>> he was a bit more careful and admitted that further tests with tcphealth
>> are
>> needed to check if SACK really makes that big a difference. he admitted
>> "It
>> could be that SACK's advantage lies in other areas such as very large
>> downloads or when using slow and unreliable network links." all these
>> things
>> could be checked again nowadays, with larger files available and
>> wlan-users
>> and higher traffic -- just find something without SACK...
>
> There are hundred of papers about TCP behavior. Many are very good.
>
> This one seems not the best of them by far, and is based on measures
> done on 2001 (!!!), on a single computer (!!!), connected to a
> particular ISP (!!!), using a wireless pcmcia network card. (!!!)
>
> At that time, almost no clients were using SACK. Because windows 98/XP
> dont negociate SACK by default (you need to tweak registry)
>
> Its like saying ECN is useless : If ECN users are under 1 % of total
> number of users, network is still under pressure and ECN benefits cannot
> rise because of misbehavior of other flows.

I don't think it's in any way similar, but then you definitely are more
knowledgeable on these matters. but it's quite straight-forward: SACK is
supposed to speed up things by informing when a packet has been received and
doesn't need to be resent. if the client doesn't get duplicate packets then
SACK won't make a difference. tcphealth counts how many packets have been
received correctly two or more times. why would tcphealth be useless for
answering if you'll need SACK?
>
> With RTT of 100 ms, SACK are clearly a win for long transferts.
>
> A single drop shall retransmit a single packet instead of ~500 packets.
> Only fools could deny this fact. Studying DuplicateAcks to detect
> retransmits is clearly wrong.

as far as I understood DuplicateAcks means either the Acks were lost in the
transfer or the same packet has been received twice one after the other.
i.e. duplicate packets received should always be smaller-or-equal than
duplicate acks sent. am I wrong? that's why tcphealth is only interested in
these two values.
>
> Really, dont recommend this paper, it contains a lot of false
> statements.
>
> One example : "we discovered some surprising things as the high
> percentage of lost or reordered packets from supposedly highly reliable
> and fast services as Akamai networks".
>
> I cant believe such nonsense can be written, and recommended.

I agree, it's quite some mickey-mouse thesis. but still a manual for what
tcphealth actually does do!
>
> So you can add more counters to TCP stack because having them is good to
> better understand TCP behavior and what can be done to improve it, but
> dont base this work on dubious 'tcphealth'.

what counters are you thinking of? how do you suggest an ordinary user could
improve the download-speed from sites? imho tcphealth offers a good method:
the more DuplicateAcks you send, the less responsive the site you want to
reach is, so try at another time and do some other downloads now...


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-20 14:06         ` Yuchung Cheng
@ 2012-07-21 10:34           ` Piotr Sawuk
  0 siblings, 0 replies; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-21 10:34 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On Fr, 20.07.2012, 16:06, Yuchung Cheng wrote:
> On Mon, Jul 16, 2012 at 6:03 AM, Piotr Sawuk <a9702387@unet.univie.ac.at>
> wrote:
>> On Mo, 16.07.2012, 13:46, Eric Dumazet wrote:
>>> On Mon, 2012-07-16 at 13:33 +0200, Piotr Sawuk wrote:
>>>> On Sa, 14.07.2012, 01:55, Stephen Hemminger wrote:
>>>> > I am not sure if the is really necessary since the most
>>>> > of the stats are available elsewhere.
>>>>
>>>> if by "most" you mean address and port then you're right.
>>>> but even the rtt reported by "ss -i" seems to differ from tcphealth.
>>>
>>> Thats because tcphealth is wrong, it assumes HZ=1000 ?
>>>
>>> tp->srtt unit is jiffies, not ms.
>>
>> thanks. any conversion-functions in the kernel for that?
>>>
>>> tcphealth is a gross hack.
>>
>> what would you do if you tried making it less gross?
>>
>> I've not found any similar functionality, in the kernel.
>> I want to know an estimate for the percentage of data lost in tcp.
>> and I want to know that without actually sending much packets.
>> afterall I'm on the receiving end most of the time.
>> percentage of duplicate packets received is nice too.
>> you have any suggestions?
>
> counting dupack may not be as reliable as you'd like.
> say the remote sends you 100 packets and only the first one is lost,
> you'll see 99 dupacks. Moreover any small degree reordering (<3)
> will generate substantial dupacks but the network is perfectly fine

I understand that.
but you must consider the difference between network-health and tcp-health.
network-health on my end I can see by looking at wlan-signal strength.
slow downloads can have many causes, some loose cable is only one possibility.

for example I once played a lan-game, 2 computers connected directly.
however, one computer was 10 times slower than the other.
so when the slow computer would act as server, the game would never start.
the reason wasn't bad connection, it was packet-loss caused by slowness.
and it had to do with the protocol being used (i.e. not tcp).

so when in tcp I get high percentage of dupack I see something's wrong.
not necessarily with the physical connection, but with protocol-handling.
the paper showed dupacks indicate spikes in network-usage.
and as we all know the bottleneck isn't the cable, it's data-processing.
when there is a spike, lots of users connecting, network itself is ok.
however, reordering and lost packets indicate something's up with the server.
and that's actually the info I'm after.

if I were the net-admin I would be interested in network-health too.
bad connection indicated by packet-loss itself means I've got to check cables.
but a user might have much wider area of interest.
the user can't do anything about the cables, but yet is interested in them.
i.e. useless info for the net-admin could be interesting for the user.
that's why I do not recommend tcphealth for servers, useless overhead.

so, if you want to judge usefulness of this patch, consider the situation:
you are powerless but interested in responsiveness of thousands of servers.
you want to learn how those servers behave at different times of a day.
isn't dupacks and dup-packets the best info on that you can possibly get?

> (see Craig Partridge's "reordering is not pathological" paper).

thanks, will look into that.

> unfortunately receiver can't and does not have to distinguish loss

true, not needed for the protocol.
on a higher level it still can be interesting though.
most of the work for preventing packetloss must be done by the sender.
but as I said before, the receiver can do something too: avoid traffic-jams!
in a network many things are predictable, can be reprogrammed.
this way a network could become more efficient as a whole.
that's what piques my interest in tcphealth, thinking more globally.

>  or reordering. you can infer that but it should not be kernel's job.

that's why I made it an option as opposed to what the original author did.
theoretically it should be possible to get the same functionality without it.
just read the raw network-data and emulate the work of tcp and tcphealth.
but that definitely would add a big overhead as tcp-handling is duplicated.

> there are public tools that inspect tcpdump traces to do that

good example. so to figure out dupacks you filter out the acks.
and you must somehow compare them, or you parse them the way the kernel does.
in the latter case you'll have to recreate the kernel's internal data.
definitely faster, but could result in duplicate code, that requires space.

also you should consider that not all users have privileges for tcpdump.
and if they had, it would add another security-risk to their computer.
and you'd have to consider multiple users on one computer, using that service.
I can imagine a daemon in the background doing what tcphealth does.
that's the alternative, allows for more fine-grained security.
it could disallow spying on what connections other users have.
(of course then you'd need to remove /proc/net/tcp output too.)
but imagine the nightmare of keeping that daemon secure.
after all it must be privileged to read all network data.

if the kernel would provide what I'm looking for, this daemon could still run.
but then it wouldn't need those risky privileges, could focus on other stuff.
the task of preventing users from seeing eachothers connections is enough...
>
> exposing duplicate packets received can be done via getsockopt(TCP_INFO)
> although I don't know what that gives you. the remote can be too
> aggressive in retransmission (not just because of a bad RTO) or
> the network RTT fluctuates.

TCP_INFO contains only duplicate packets *sent* (retransmits), not received!
am I missing something? can you give a code-example that can obtain such info?
if running that code in userspace results in same values as tcphealth...
well, actually dupacks is more interesting than dup-packets.
afterall in usual usage the latter will always be zero.
>
> I don't know if tracking last_ack_sent (the latest RCV.NXT) without
> knowing the ISN is useful.

so you suggest I should store and compare ISN too, for accuracy?
you think the gain in accuracy justifies the added overhead?
>
> btw the term project paper cited concludes SACK is not useful is simply
> wrong. This makes me suspicious about how rigorous and thoughtful of
> its design.

isn't my paper.
but I'd guess the usefulness of SACK is only doubted from pov of users.
remember, users connect to many servers.
if a server behaves badly, choose another one.
servers do not have such a choice, for them SACK naturally is important.

a server would just need to look at TCP_INFO to see how useful SACK is.
so I would conclude the author was quite thoughtful about users' pov.
(and quite ignorant about the servers.)

no matter how little knowledge the authors have, tcphealth is interesting.
maybe it was a random discovery by sheer luck.
the correlation between the data it provides and reality is compelling.
if we'd judge inventions by the stupidity of their inventors...


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-16 13:03       ` Piotr Sawuk
@ 2012-07-20 14:06         ` Yuchung Cheng
  2012-07-21 10:34           ` Piotr Sawuk
  0 siblings, 1 reply; 24+ messages in thread
From: Yuchung Cheng @ 2012-07-20 14:06 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On Mon, Jul 16, 2012 at 6:03 AM, Piotr Sawuk <a9702387@unet.univie.ac.at> wrote:
> On Mo, 16.07.2012, 13:46, Eric Dumazet wrote:
>> On Mon, 2012-07-16 at 13:33 +0200, Piotr Sawuk wrote:
>>> On Sa, 14.07.2012, 01:55, Stephen Hemminger wrote:
>>> > I am not sure if the is really necessary since the most
>>> > of the stats are available elsewhere.
>>>
>>> if by "most" you mean address and port then you're right.
>>> but even the rtt reported by "ss -i" seems to differ from tcphealth.
>>
>> Thats because tcphealth is wrong, it assumes HZ=1000 ?
>>
>> tp->srtt unit is jiffies, not ms.
>
> thanks. any conversion-functions in the kernel for that?
>>
>> tcphealth is a gross hack.
>
> what would you do if you tried making it less gross?
>
> I've not found any similar functionality, in the kernel.
> I want to know an estimate for the percentage of data lost in tcp.
> and I want to know that without actually sending much packets.
> afterall I'm on the receiving end most of the time.
> percentage of duplicate packets received is nice too.
> you have any suggestions?

counting dupack may not be as reliable as you'd like.
say the remote sends you 100 packets and only the first one is lost,
you'll see 99 dupacks. Moreover any small degree reordering (<3)
will generate substantial dupacks but the network is perfectly fine
(see Craig Partridge's "reordering is not pathological" paper).
unfortunately receiver can't and does not have to distinguish loss
 or reordering. you can infer that but it should not be kernel's job.
there are public tools that inspect tcpdump traces to do that

exposing duplicate packets received can be done via getsockopt(TCP_INFO)
although I don't know what that gives you. the remote can be too
aggressive in retransmission (not just because of a bad RTO) or
the network RTT fluctuates.

I don't know if tracking last_ack_sent (the latest RCV.NXT) without
knowing the ISN is useful.

btw the term project paper cited concludes SACK is not useful is simply
wrong. This makes me suspicious about how rigorous and thoughtful of
its design.

>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-16 15:24     ` Christoph Paasch
@ 2012-07-19 10:37       ` Piotr Sawuk
  0 siblings, 0 replies; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-19 10:37 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On Mo, 16.07.2012, 17:24, Christoph Paasch wrote:

> You should do jiffies_to_msecs(tp->srtt) >> 3.
>
> The RTT is already exposed by tcp_info anyway... (see tcp_get_info() - where
> you also see the bitshift)

thanks a lot. rtt is output for completion's sake, it helps in diagnosis.
here my hopefully final version. it comes with tcp_info interface too:

diff -rub A/include/linux/tcp.h B/include/linux/tcp.h
--- A/include/linux/tcp.h	2012-07-15 00:40:28.000000000 +0200
+++ B/include/linux/tcp.h	2012-07-18 13:03:50.000000000 +0200
@@ -183,6 +183,12 @@
 	__u32	tcpi_rcv_space;

 	__u32	tcpi_total_retrans;
+
+	/* TCP Health */
+	__u32	tcpi_dup_acks;
+	__u32	tcpi_dup_pkts;
+	__u32	tcpi_acks;
+	__u32	tcpi_pkts;
 };

 /* for TCP_MD5SIG socket option */
@@ -492,6 +498,17 @@
 	 * contains related tcp_cookie_transactions fields.
 	 */
 	struct tcp_cookie_values  *cookie_values;
+
+#ifdef CONFIG_TCPHEALTH
+	/*
+	 * TCP health monitoring counters.
+	 */
+	__u32	dup_acks_sent;
+	__u32	dup_pkts_recv;
+	__u32	acks_sent;
+	__u32	pkts_recv;
+	__u32	last_ack_sent;	/* Sequence number of the last ack we sent. */
+#endif
 };

 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -rub A/net/ipv4/Kconfig B/net/ipv4/Kconfig
--- A/net/ipv4/Kconfig	2012-07-15 00:40:28.000000000 +0200
+++ B/net/ipv4/Kconfig	2012-07-16 20:47:54.000000000 +0200
@@ -619,6 +619,28 @@
 	default "reno" if DEFAULT_RENO
 	default "cubic"

+config TCPHEALTH
+	bool "TCP client-side health-statistics (/proc/net/tcphealth)"
+	default n
+	---help---
+	TCP Health Monitoring (established connections only):
+	 -Duplicate ACKs indicate there could be lost or reordered packets
+	  on the connection.
+	 -Duplicate Packets Received signal a slow and badly inefficient
+	  connection.
+	 -RttEst estimates how long future packets will take on a round trip
+	  over the connection.
+
+	Additionally you get total amount of sent ACKs and received Packets.
+	All these values are displayed seperately for each connection.
+	If you are running a dedicated server you wont need this.
+	Duplicate ACKs refers only to those sent upon receiving a Packet.
+	A server most likely doesn't receive much Packets to count.
+	Hence for a server these statistics wont be meaningful.
+	especially since they are split into individual connections.
+
+	If you plan to investigate why some download is slow, say Y.
+
 config TCP_MD5SIG
 	bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff -rub A/net/ipv4/tcp.c B/net/ipv4/tcp.c
--- A/net/ipv4/tcp.c	2012-07-15 00:40:28.000000000 +0200
+++ B/net/ipv4/tcp.c	2012-07-18 13:04:08.000000000 +0200
@@ -2723,6 +2723,13 @@
 	info->tcpi_rcv_space = tp->rcvq_space.space;

 	info->tcpi_total_retrans = tp->total_retrans;
+
+#ifdef TCPHEALTH
+	tcpi_dup_acks = tp->dup_acks_sent;
+	tcpi_dup_pkts = tp->dup_pkts_recv;
+	tcpi_acks = tp->acks_sent;
+	tcpi_pkts = tp->pkts_recv;
+#endif
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);

diff -rub A/net/ipv4/tcp_input.c B/net/ipv4/tcp_input.c
--- A/net/ipv4/tcp_input.c	2012-07-15 00:40:28.000000000 +0200
+++ B/net/ipv4/tcp_input.c	2012-07-16 20:47:54.000000000 +0200
@@ -4492,6 +4492,11 @@
 		}

 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+#ifdef CONFIG_TCPHEALTH
+			/* Course-Grained Timeout caused retransmit inefficiency-
+			 * this packet has been received twice. */
+			tp->dup_pkts_recv++;
+#endif
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
@@ -4824,6 +4829,12 @@
 		return;
 	}

+#ifdef CONFIG_TCPHEALTH
+	/* A packet is a "duplicate" if it contains bytes we have already
received. */
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+		tp->dup_pkts_recv++;
+#endif
+
 	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 		/* A retransmit, 2nd most common case.  Force an immediate ack. */
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
@@ -5535,6 +5546,12 @@

 	tp->rx_opt.saw_tstamp = 0;

+#ifdef CONFIG_TCPHEALTH
+	/*
+	 *	total per-connection packet arrivals.
+	 */
+	tp->pkts_recv++;
+#endif
 	/*	pred_flags is 0xS?10 << 16 + snd_wnd
 	 *	if header_prediction is to be made
 	 *	'S' will always be tp->tcp_header_len >> 2
diff -rub A/net/ipv4/tcp_ipv4.c B/net/ipv4/tcp_ipv4.c
--- A/net/ipv4/tcp_ipv4.c	2012-07-15 00:40:28.000000000 +0200
+++ B/net/ipv4/tcp_ipv4.c	2012-07-18 11:56:32.000000000 +0200
@@ -2500,6 +2500,68 @@
 	return 0;
 }

+#ifdef CONFIG_TCPHEALTH
+/*
+ *	Output /proc/net/tcphealth
+ */
+#define LINESZ 128
+
+int tcp_health_seq_show(struct seq_file *seq, void *v)
+{
+	int len, tab;
+	struct tcp_iter_state *st;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+		"id   Local Address        Remote Address       RttEst(ms) AcksSent "
+		"DupAcksSent PktsRecv DupPktsRecv\n");
+		goto out;
+	}
+
+	/* Loop through established TCP connections */
+	st = seq->private;
+
+
+	if (st->state == TCP_SEQ_STATE_ESTABLISHED)
+	{
+		const struct tcp_sock *tp = tcp_sk(v);
+		const struct inet_sock *inet = inet_sk(v);
+
+		seq_printf(seq, "%d: %pI4:%u%n",
+				st->num,
+				&inet->inet_rcv_saddr,
+				ntohs(inet->inet_sport),
+				&tab
+			);
+		len = 3 + 21; /* 3 is minimum length for "%d: " */
+		if (tab < len) seq_printf(seq, "%*s", len - tab, "");
+		else len = tab;
+		seq_printf(seq, " %pI4:%u%n",
+				&inet->inet_daddr,
+				ntohs(inet->inet_dport),
+				&tab
+			);
+		tab += len;
+		len = 5 + 21 + 22; /* is num ever > 999? */
+		if (tab < len)  seq_printf(seq, "%*s", len - tab, "");
+		else len = tab;
+		seq_printf(seq, " %8u %8lu %8lu %8lu %8lu%n",
+				jiffies_to_msecs(tp->srtt)>>3,
+				(unsigned long)tp->acks_sent,
+				(unsigned long)tp->dup_acks_sent,
+				(unsigned long)tp->pkts_recv,
+				(unsigned long)tp->dup_pkts_recv,
+				&tab
+			);
+
+		seq_printf(seq, "%*s\n", LINESZ - 1 - len - tab, "");
+	}
+
+out:
+	return 0;
+}
+#endif /* CONFIG_TCPHEALTH */
+
 static const struct file_operations tcp_afinfo_seq_fops = {
 	.owner   = THIS_MODULE,
 	.open    = tcp_seq_open,
@@ -2508,6 +2570,17 @@
 	.release = seq_release_net
 };

+#ifdef CONFIG_TCPHEALTH
+static struct tcp_seq_afinfo tcphealth_seq_afinfo = {
+	.name		= "tcphealth",
+	.family		= AF_INET,
+	.seq_fops	= &tcp_afinfo_seq_fops,
+	.seq_ops	= {
+		.show		= tcp_health_seq_show,
+	},
+};
+#endif
+
 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
 	.name		= "tcp",
 	.family		= AF_INET,
@@ -2519,12 +2592,20 @@

 static int __net_init tcp4_proc_init_net(struct net *net)
 {
-	return tcp_proc_register(net, &tcp4_seq_afinfo);
+	int ret = tcp_proc_register(net, &tcp4_seq_afinfo);
+#ifdef CONFIG_TCPHEALTH
+	if(ret == 0)
+		ret = tcp_proc_register(net, &tcphealth_seq_afinfo);
+#endif
+	return ret;
 }

 static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+#ifdef CONFIG_TCPHEALTH
+	tcp_proc_unregister(net, &tcphealth_seq_afinfo);
+#endif
 }

 static struct pernet_operations tcp4_net_ops = {
diff -rub A/net/ipv4/tcp_output.c B/net/ipv4/tcp_output.c
--- A/net/ipv4/tcp_output.c	2012-07-15 00:40:28.000000000 +0200
+++ B/net/ipv4/tcp_output.c	2012-07-16 20:47:54.000000000 +0200
@@ -2772,8 +2772,19 @@
 	skb_reserve(buff, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

+#ifdef CONFIG_TCPHEALTH
+	/* If the rcv_nxt has not advanced since sending our last ACK, this is a
duplicate. */
+	if (tcp_sk(sk)->rcv_nxt == tcp_sk(sk)->last_ack_sent)
+		tcp_sk(sk)->dup_acks_sent++;
+	/* Record the total number of acks sent on this connection. */
+	tcp_sk(sk)->acks_sent++;
+#endif
+
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+#ifdef CONFIG_TCPHEALTH
+	tcp_sk(sk)->last_ack_sent = tcp_sk(sk)->rcv_nxt;
+#endif
 	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
 }



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-16 15:12   ` Piotr Sawuk
@ 2012-07-16 15:24     ` Christoph Paasch
  2012-07-19 10:37       ` Piotr Sawuk
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Paasch @ 2012-07-16 15:24 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On Monday 16 July 2012 17:12:26 Piotr Sawuk wrote:
> +               seq_printf(seq, "%d: %-21pI4:%u %-21pI4:%u "
> +                               "%8u %8lu %8lu %8lu %8lu%n",
> +                               st->num,
> +                               &inet->inet_rcv_saddr,
> +                               ntohs(inet->inet_sport),
> +                               &inet->inet_daddr,
> +                               ntohs(inet->inet_dport),
> +                               jiffies_to_msecs(tp->srtt),

This still doesn't gives you the correct RTT.
srtt is in jiffies * 8.

You should do jiffies_to_msecs(tp->srtt) >> 3.

The RTT is already exposed by tcp_info anyway... (see tcp_get_info() - where 
you also see the bitshift)


Christoph

-- 
IP Networking Lab --- http://inl.info.ucl.ac.be
MultiPath TCP in the Linux Kernel --- http://mptcp.info.ucl.ac.be
Université Catholique de Louvain
--

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-16 13:32 ` Ben Hutchings
@ 2012-07-16 15:12   ` Piotr Sawuk
  2012-07-16 15:24     ` Christoph Paasch
  0 siblings, 1 reply; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-16 15:12 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On Mo, 16.07.2012, 15:32, Ben Hutchings wrote:
> On Sat, 2012-07-14 at 09:56 +0200, Piotr Sawuk wrote:
>> On Sa, 14.07.2012, 03:31, valdis.kletnieks@vt.edu wrote:
>> > On Fri, 13 Jul 2012 16:55:44 -0700, Stephen Hemminger said:
>> >
>> >> >+			/* Course retransmit inefficiency- this packet has been received
>> >> twice. */
>> >> >+			tp->dup_pkts_recv++;
>> >> I don't understand that comment, could you use a better sentence
>> please?
>> >
>> > I think what was intended was:
>> >
>> > /* Curse you, retransmit inefficiency! This packet has been received at
>> least twice */
>> >
>>
>> LOL, no. I think "course retransmit" is short for "course-grained timeout
>> caused retransmit" but I can't be sure since I'm not the author of these
>> lines. I'll replace that comment with the non-shorthand version though.
>> however, I think the real comment here should be:
> [...]
>
> The word you are looking for is 'coarse' not 'course' (they are
> generally pronounced the same, to confuse you).

that was my first thought too.
but then I noticed the word "course" in the kernel's comments.
judging by context it describes the events of a round-trip.
so I guess Course-Grained means RTT-grained.
especially since this misspelling was consistent in the author's paper.

anyway, new patch, made some mistakes in my previous version.
also I added the jiffies_to_msecs noone dared to mention
comments and suggestions as always welcome:

diff -rub A/include/linux/tcp.h B/include/linux/tcp.h
--- A/include/linux/tcp.h	2012-07-08 02:23:56.000000000 +0200
+++ B/include/linux/tcp.h	2012-07-16 16:42:08.000000000 +0200
@@ -492,6 +492,17 @@
 	 * contains related tcp_cookie_transactions fields.
 	 */
 	struct tcp_cookie_values  *cookie_values;
+
+#ifdef CONFIG_TCPHEALTH
+	/*
+	 * TCP health monitoring counters.
+	 */
+	__u32	dup_acks_sent;
+	__u32	dup_pkts_recv;
+	__u32	acks_sent;
+	__u32	pkts_recv;
+	__u32	last_ack_sent;	/* Sequence number of the last ack we sent. */
+#endif
 };

 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -rub A/net/ipv4/Kconfig B/net/ipv4/Kconfig
--- A/net/ipv4/Kconfig	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/Kconfig	2012-07-16 11:56:15.000000000 +0200
@@ -619,6 +619,28 @@
 	default "reno" if DEFAULT_RENO
 	default "cubic"

+config TCPHEALTH
+	bool "TCP client-side health-statistics (/proc/net/tcphealth)"
+	default n
+	---help---
+	TCP Health Monitoring (established connections only):
+	 -Duplicate ACKs indicate there could be lost or reordered packets
+	  on the connection.
+	 -Duplicate Packets Received signal a slow and badly inefficient
+	  connection.
+	 -RttEst estimates how long future packets will take on a round trip
+	  over the connection.
+
+	Additionally you get total amount of sent ACKs and received Packets.
+	All these values are displayed seperately for each connection.
+	If you are running a dedicated server you wont need this.
+	Duplicate ACKs refers only to those sent upon receiving a Packet.
+	A server most likely doesn't receive much Packets to count.
+	Hence for a server these statistics wont be meaningful.
+	especially since they are split into individual connections.
+
+	If you plan to investigate why some download is slow, say Y.
+
 config TCP_MD5SIG
 	bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff -rub A/net/ipv4/tcp_input.c B/net/ipv4/tcp_input.c
--- A/net/ipv4/tcp_input.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_input.c	2012-07-16 16:45:17.000000000 +0200
@@ -4492,6 +4492,11 @@
 		}

 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+#ifdef CONFIG_TCPHEALTH
+			/* Course-Grained Timeout caused retransmit inefficiency-
+			 * this packet has been received twice. */
+			tp->dup_pkts_recv++;
+#endif
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
@@ -4824,6 +4829,12 @@
 		return;
 	}

+#ifdef CONFIG_TCPHEALTH
+	/* A packet is a "duplicate" if it contains bytes we have already
received. */
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+		tp->dup_pkts_recv++;
+#endif
+
 	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 		/* A retransmit, 2nd most common case.  Force an immediate ack. */
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
@@ -5535,6 +5546,12 @@

 	tp->rx_opt.saw_tstamp = 0;

+#ifdef CONFIG_TCPHEALTH
+	/*
+	 *	total per-connection packet arrivals.
+	 */
+	tp->pkts_recv++;
+#endif
 	/*	pred_flags is 0xS?10 << 16 + snd_wnd
 	 *	if header_prediction is to be made
 	 *	'S' will always be tp->tcp_header_len >> 2
diff -rub A/net/ipv4/tcp_ipv4.c B/net/ipv4/tcp_ipv4.c
--- A/net/ipv4/tcp_ipv4.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_ipv4.c	2012-07-16 16:29:05.000000000 +0200
@@ -2500,6 +2500,57 @@
 	return 0;
 }

+#ifdef CONFIG_TCPHEALTH
+/*
+ *	Output /proc/net/tcphealth
+ */
+#define LINESZ 128
+
+int tcp_health_seq_show(struct seq_file *seq, void *v)
+{
+	int len;
+	struct tcp_iter_state *st;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+		"id   Local Address        Remote Address       RttEst(ms) AcksSent "
+		"DupAcksSent PktsRecv DupPktsRecv\n");
+		goto out;
+	}
+
+	/* Loop through established TCP connections */
+	st = seq->private;
+
+
+	if (st->state == TCP_SEQ_STATE_ESTABLISHED)
+	{
+		const struct tcp_sock *tp = tcp_sk(v);
+		const struct inet_sock *inet = inet_sk(v);
+
+		seq_printf(seq, "%d: %-21pI4:%u %-21pI4:%u "
+				"%8u %8lu %8lu %8lu %8lu%n",
+				st->num,
+				&inet->inet_rcv_saddr,
+				ntohs(inet->inet_sport),
+				&inet->inet_daddr,
+				ntohs(inet->inet_dport),
+				jiffies_to_msecs(tp->srtt),
+				tp->acks_sent,
+				tp->dup_acks_sent,
+				tp->pkts_recv,
+				tp->dup_pkts_recv,
+
+				&len
+			);
+
+		seq_printf(seq, "%*s\n", LINESZ - 1 - len, "");
+	}
+
+out:
+	return 0;
+}
+#endif /* CONFIG_TCPHEALTH */
+
 static const struct file_operations tcp_afinfo_seq_fops = {
 	.owner   = THIS_MODULE,
 	.open    = tcp_seq_open,
@@ -2508,6 +2559,17 @@
 	.release = seq_release_net
 };

+#ifdef CONFIG_TCPHEALTH
+static struct tcp_seq_afinfo tcphealth_seq_afinfo = {
+	.name		= "tcphealth",
+	.family		= AF_INET,
+	.seq_fops	= &tcp_afinfo_seq_fops,
+	.seq_ops	= {
+		.show		= tcp_health_seq_show,
+	},
+};
+#endif
+
 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
 	.name		= "tcp",
 	.family		= AF_INET,
@@ -2519,12 +2581,20 @@

 static int __net_init tcp4_proc_init_net(struct net *net)
 {
-	return tcp_proc_register(net, &tcp4_seq_afinfo);
+	int ret = tcp_proc_register(net, &tcp4_seq_afinfo);
+#ifdef CONFIG_TCPHEALTH
+	if(ret == 0)
+		ret = tcp_proc_register(net, &tcphealth_seq_afinfo);
+#endif
+	return ret;
 }

 static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+#ifdef CONFIG_TCPHEALTH
+	tcp_proc_unregister(net, &tcphealth_seq_afinfo);
+#endif
 }

 static struct pernet_operations tcp4_net_ops = {
diff -rub A/net/ipv4/tcp_output.c B/net/ipv4/tcp_output.c
--- A/net/ipv4/tcp_output.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_output.c	2012-07-16 09:44:02.000000000 +0200
@@ -2772,8 +2772,19 @@
 	skb_reserve(buff, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

+#ifdef CONFIG_TCPHEALTH
+	/* If the rcv_nxt has not advanced since sending our last ACK, this is a
duplicate. */
+	if (tcp_sk(sk)->rcv_nxt == tcp_sk(sk)->last_ack_sent)
+		tcp_sk(sk)->dup_acks_sent++;
+	/* Record the total number of acks sent on this connection. */
+	tcp_sk(sk)->acks_sent++;
+#endif
+
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+#ifdef CONFIG_TCPHEALTH
+	tcp_sk(sk)->last_ack_sent = tcp_sk(sk)->rcv_nxt;
+#endif
 	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
 }



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-14  7:56 Piotr Sawuk
  2012-07-14  8:27 ` Eric Dumazet
@ 2012-07-16 13:32 ` Ben Hutchings
  2012-07-16 15:12   ` Piotr Sawuk
  1 sibling, 1 reply; 24+ messages in thread
From: Ben Hutchings @ 2012-07-16 13:32 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On Sat, 2012-07-14 at 09:56 +0200, Piotr Sawuk wrote:
> On Sa, 14.07.2012, 03:31, valdis.kletnieks@vt.edu wrote:
> > On Fri, 13 Jul 2012 16:55:44 -0700, Stephen Hemminger said:
> >
> >> >+			/* Course retransmit inefficiency- this packet has been received
> >> twice. */
> >> >+			tp->dup_pkts_recv++;
> >> I don't understand that comment, could you use a better sentence please?
> >
> > I think what was intended was:
> >
> > /* Curse you, retransmit inefficiency! This packet has been received at
> least twice */
> >
> 
> LOL, no. I think "course retransmit" is short for "course-grained timeout
> caused retransmit" but I can't be sure since I'm not the author of these
> lines. I'll replace that comment with the non-shorthand version though.
> however, I think the real comment here should be:
[...]

The word you are looking for is 'coarse' not 'course' (they are
generally pronounced the same, to confuse you).

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-16 11:46     ` Eric Dumazet
@ 2012-07-16 13:03       ` Piotr Sawuk
  2012-07-20 14:06         ` Yuchung Cheng
  0 siblings, 1 reply; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-16 13:03 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On Mo, 16.07.2012, 13:46, Eric Dumazet wrote:
> On Mon, 2012-07-16 at 13:33 +0200, Piotr Sawuk wrote:
>> On Sa, 14.07.2012, 01:55, Stephen Hemminger wrote:
>> > I am not sure if the is really necessary since the most
>> > of the stats are available elsewhere.
>>
>> if by "most" you mean address and port then you're right.
>> but even the rtt reported by "ss -i" seems to differ from tcphealth.
>
> Thats because tcphealth is wrong, it assumes HZ=1000 ?
>
> tp->srtt unit is jiffies, not ms.

thanks. any conversion-functions in the kernel for that?
>
> tcphealth is a gross hack.

what would you do if you tried making it less gross?

I've not found any similar functionality, in the kernel.
I want to know an estimate for the percentage of data lost in tcp.
and I want to know that without actually sending much packets.
afterall I'm on the receiving end most of the time.
percentage of duplicate packets received is nice too.
you have any suggestions?


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-16 11:33   ` Piotr Sawuk
@ 2012-07-16 11:46     ` Eric Dumazet
  2012-07-16 13:03       ` Piotr Sawuk
  0 siblings, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2012-07-16 11:46 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On Mon, 2012-07-16 at 13:33 +0200, Piotr Sawuk wrote:
> On Sa, 14.07.2012, 01:55, Stephen Hemminger wrote:
> > I am not sure if the is really necessary since the most
> > of the stats are available elsewhere.
> 
> if by "most" you mean address and port then you're right.
> but even the rtt reported by "ss -i" seems to differ from tcphealth.

Thats because tcphealth is wrong, it assumes HZ=1000 ?

tp->srtt unit is jiffies, not ms.

tcphealth is a gross hack.



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-13 23:55 ` Stephen Hemminger
  2012-07-14  1:31   ` valdis.kletnieks
@ 2012-07-16 11:33   ` Piotr Sawuk
  2012-07-16 11:46     ` Eric Dumazet
  1 sibling, 1 reply; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-16 11:33 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On Sa, 14.07.2012, 01:55, Stephen Hemminger wrote:
> I am not sure if the is really necessary since the most
> of the stats are available elsewhere.

if by "most" you mean address and port then you're right.
but even the rtt reported by "ss -i" seems to differ from tcphealth.

however, if instead by "elsewhere" you mean "on the server"...
>

>>+		seq_printf(seq,
>>+		"TCP Health Monitoring (established connections only)\n"
>>+		" -Duplicate ACKs indicate lost or reordered packets on the
>>connection.\n"
>>+		" -Duplicate Packets Received signal a slow and badly inefficient
>>connection.\n"
>>+		" -RttEst estimates how long future packets will take on a round trip
>>over the connection.\n"
>>+		"id   Local Address        Remote Address       RttEst(ms) AcksSent "
>
> Header seems excessive, just put one line of header please.

I guess the header was sort of documentation for this patch.
I've put it into Kconfig instead.
>
>
>>+		"DupAcksSent PktsRecv DupPktsRecv\n");
>>+		goto out;
>>+	}
>>+
>>+	/* Loop through established TCP connections */
>>+	st = seq->private;
>>+
>>+
>>+	if (st->state == TCP_SEQ_STATE_ESTABLISHED)
>>+	{
>>+/*	; //insert read-lock here */
>
> Don't think you need read-lock

you mean I won't get a segfault reading a tcp_sock that's gone?
>

> Kernel has %pI4 to print IP addresses.

thanks, I didn't know.
>

>>+		seq_printf(seq, "%*s\n", LINESZ - 1 - len, "");
>
> This padding of line is bogus, just print variable length line.
> Are you trying to make it fixed length record file?

I guess so, /proc/net/tcp is doing the same.
won't question the authors of that user-interface.

OK, new version, this time with Kconfig changed:

diff -rub A/include/linux/tcp.h B/include/linux/tcp.h
--- A/include/linux/tcp.h	2012-07-08 02:23:56.000000000 +0200
+++ B/include/linux/tcp.h	2012-07-16 09:04:54.000000000 +0200
@@ -492,6 +492,17 @@
 	 * contains related tcp_cookie_transactions fields.
 	 */
 	struct tcp_cookie_values  *cookie_values;
+
+#ifdef CONFIG_TCPHEALTH
+	/*
+	 * TCP health monitoring counters.
+	 */
+	__u32	dup_acks_sent;
+	__u32	dup_pkts_recv;
+	__u32	acks_sent;
+	__u32	pkts_recv;
+	__u32	last_ack_sent;	/* Sequence number of the last ack we sent. */
+#endif
 };

 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
Only in B/include/linux: tcp.h.orig
diff -rub A/net/ipv4/Kconfig B/net/ipv4/Kconfig
--- A/net/ipv4/Kconfig	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/Kconfig	2012-07-16 11:56:15.000000000 +0200
@@ -619,6 +619,28 @@
 	default "reno" if DEFAULT_RENO
 	default "cubic"

+config TCPHEALTH
+	bool "TCP: client-side health-statistics (/proc/net/tcphealth)"
+	default n
+	---help---
+	IPv4 TCP Health Monitoring (established connections only):
+	 -Duplicate ACKs indicate there could be lost or reordered packets
+	  on the connection.
+	 -Duplicate Packets Received signal a slow and badly inefficient
+	  connection.
+	 -RttEst estimates how long future packets will take on a round trip
+	  over the connection.
+
+	Additionally you get total amount of sent ACKs and received Packets.
+	All these values are displayed seperately for each connection.
+	If you are running a dedicated server you wont need this.
+	Duplicate ACKs refers only to those sent upon receiving a Packet.
+	A server most likely doesn't receive much Packets to count.
+	Hence for a server these statistics wont be meaningful.
+	especially since they are split into individual connections.
+
+	If you plan to investigate why some download is slow, say Y.
+
 config TCP_MD5SIG
 	bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
Only in B/net/ipv4: Kconfig~
diff -rub A/net/ipv4/tcp_input.c B/net/ipv4/tcp_input.c
--- A/net/ipv4/tcp_input.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_input.c	2012-07-16 09:28:23.000000000 +0200
@@ -4492,6 +4492,11 @@
 		}

 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+#ifdef CONFIG_TCPHEALTH
+			/* Course Timeout caused retransmit inefficiency-
+			 * this packet has been received twice. */
+			tp->dup_pkts_recv++;
+#endif
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
@@ -4824,6 +4829,12 @@
 		return;
 	}

+#ifdef CONFIG_TCPHEALTH
+	/* A packet is a "duplicate" if it contains bytes we have already
received. */
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+		tp->dup_pkts_recv++;
+#endif
+
 	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 		/* A retransmit, 2nd most common case.  Force an immediate ack. */
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
@@ -5535,6 +5546,12 @@

 	tp->rx_opt.saw_tstamp = 0;

+#ifdef CONFIG_TCPHEALTH
+	/*
+	 *	total per-connection packet arrivals.
+	 */
+	tp->pkts_recv++;
+#endif
 	/*	pred_flags is 0xS?10 << 16 + snd_wnd
 	 *	if header_prediction is to be made
 	 *	'S' will always be tp->tcp_header_len >> 2
diff -rub A/net/ipv4/tcp_ipv4.c B/net/ipv4/tcp_ipv4.c
--- A/net/ipv4/tcp_ipv4.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_ipv4.c	2012-07-16 10:12:48.000000000 +0200
@@ -2500,6 +2500,57 @@
 	return 0;
 }

+#ifdef CONFIG_TCPHEALTH
+/*
+ *	Output /proc/net/tcphealth
+ */
+#define LINESZ 128
+
+int tcp_health_seq_show(struct seq_file *seq, void *v)
+{
+	int len, num;
+	struct tcp_iter_state *st;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+		"id   Local Address        Remote Address       RttEst(ms) AcksSent "
+		"DupAcksSent PktsRecv DupPktsRecv\n");
+		goto out;
+	}
+
+	/* Loop through established TCP connections */
+	st = seq->private;
+
+
+	if (st->state == TCP_SEQ_STATE_ESTABLISHED)
+	{
+		const struct tcp_sock *tp = tcp_sk(v);
+		const struct inet_sock *inet = inet_sk(v);
+
+		seq_printf(seq, "%d: %-21pI4:%u %-21pI4:%u "
+				"%8lu %8lu %8lu %8lu %8lu%n",
+				st->num,
+				inet->inet_rcv_saddr,
+				ntohs(inet->inet_sport),
+				inet->inet_daddr,
+				ntohs(inet->inet_dport),
+				tp->srtt >> 3,
+				tp->acks_sent,
+				tp->dup_acks_sent,
+				tp->pkts_recv,
+				tp->dup_pkts_recv,
+
+				&len
+			);
+
+		seq_printf(seq, "%*s\n", LINESZ - 1 - len, "");
+	}
+
+out:
+	return 0;
+}
+#endif /* CONFIG_TCPHEALTH */
+
 static const struct file_operations tcp_afinfo_seq_fops = {
 	.owner   = THIS_MODULE,
 	.open    = tcp_seq_open,
@@ -2508,6 +2559,17 @@
 	.release = seq_release_net
 };

+#ifdef CONFIG_TCPHEALTH
+static struct tcp_seq_afinfo tcphealth_seq_afinfo = {
+	.name		= "tcphealth",
+	.family		= AF_INET,
+	.seq_fops	= &tcp_afinfo_seq_fops,
+	.seq_ops	= {
+		.show		= tcp_health_seq_show,
+	},
+};
+#endif
+
 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
 	.name		= "tcp",
 	.family		= AF_INET,
@@ -2519,12 +2581,20 @@

 static int __net_init tcp4_proc_init_net(struct net *net)
 {
-	return tcp_proc_register(net, &tcp4_seq_afinfo);
+	int ret = tcp_proc_register(net, &tcp4_seq_afinfo);
+#ifdef CONFIG_TCPHEALTH
+	if(ret == 0)
+		ret = tcp_proc_register(net, &tcphealth_seq_afinfo);
+#endif
+	return ret;
 }

 static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+#ifdef CONFIG_TCPHEALTH
+	tcp_proc_unregister(net, &tcphealth_seq_afinfo);
+#endif
 }

 static struct pernet_operations tcp4_net_ops = {
Only in B/net/ipv4: tcp_ipv4.c~
Only in B/net/ipv4: tcp_ipv4.c.orig
diff -rub A/net/ipv4/tcp_output.c B/net/ipv4/tcp_output.c
--- A/net/ipv4/tcp_output.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_output.c	2012-07-16 09:44:02.000000000 +0200
@@ -2772,8 +2772,19 @@
 	skb_reserve(buff, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

+#ifdef CONFIG_TCPHEALTH
+	/* If the rcv_nxt has not advanced since sending our last ACK, this is a
duplicate. */
+	if (tcp_sk(sk)->rcv_nxt == tcp_sk(sk)->last_ack_sent)
+		tcp_sk(sk)->dup_acks_sent++;
+	/* Record the total number of acks sent on this connection. */
+	tcp_sk(sk)->acks_sent++;
+#endif
+
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+#ifdef CONFIG_TCPHEALTH
+	tcp_sk(sk)->last_ack_sent = tcp_sk(sk)->rcv_nxt;
+#endif
 	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
 }



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-14  8:27 ` Eric Dumazet
@ 2012-07-14 19:29   ` David Miller
  0 siblings, 0 replies; 24+ messages in thread
From: David Miller @ 2012-07-14 19:29 UTC (permalink / raw)
  To: eric.dumazet; +Cc: a9702387, netdev, linux-kernel


Don't we report all of this shit in tcp_info already?

I really hate such cruddy patches like this.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-14  7:56 Piotr Sawuk
@ 2012-07-14  8:27 ` Eric Dumazet
  2012-07-14 19:29   ` David Miller
  2012-07-16 13:32 ` Ben Hutchings
  1 sibling, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2012-07-14  8:27 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

On Sat, 2012-07-14 at 09:56 +0200, Piotr Sawuk wrote:
> On Sa, 14.07.2012, 03:31, valdis.kletnieks@vt.edu wrote:
> > On Fri, 13 Jul 2012 16:55:44 -0700, Stephen Hemminger said:
> >
> >> >+			/* Course retransmit inefficiency- this packet has been received
> >> twice. */
> >> >+			tp->dup_pkts_recv++;
> >> I don't understand that comment, could you use a better sentence please?
> >
> > I think what was intended was:
> >
> > /* Curse you, retransmit inefficiency! This packet has been received at
> least twice */
> >
> 
> LOL, no. I think "course retransmit" is short for "course-grained timeout
> caused retransmit" but I can't be sure since I'm not the author of these
> lines. I'll replace that comment with the non-shorthand version though.
> however, I think the real comment here should be:
> 
> /*A perceived shortcoming of the standard TCP implementation: A
> TCP receiver can get duplicate packets from the sender because it cannot
> acknowledge packets that arrive out of order. These duplicates would happen
> when the sender mistakenly thinks some packets have been lost by the network
> because it does not receive acks for them but in reality they were
> successfully received out of order. Since the receiver has no way of letting
> the sender know about the receipt of these packets, they could potentially
> be re-sent and re-received at the receiver. Not only do duplicate packets
> waste precious Internet bandwidth but they hurt performance because the
> sender mistakenly detects congestion from packet losses. The SACK TCP
> extension speci\fcally addresses this issue. A large number of duplicate
> packets received would indicate a signi\fcant bene\ft to the wide adoption of
> SACK. The duplicatepacketsreceived metric is computed at the
> receiver and counts these packets on a per-connection basis.*/
> 
> as copied from his thesis at [1]. also in the thesis he writes:
> 
> In our limited experiment, the results indicated no duplicate packets were
> received on any connection in the 18 hour run. This leads us to several
> conclusions. Since duplicate ACKs were seen on many connections we know that
> some packets were lost or reordered, but unACKed reordered packets never
> caused a /coursegrainedtimeouts/ on our connections. Only these timeouts
> will cause duplicate packets to be received since less severe out-of-order
> conditions will be resolved with fast retransmits. The lack of course
> timeouts
> may be due to the quality of UCSD's ActiveWeb network or the paucity of
> large gaps between received packet groups. It should be noted that Linux 2.2
> implements fast retransmits for up to two packet gaps, thus reducing the
> need for course grained timeouts due to the lack of SACK.
> 
> [1] https://sacerdoti.org/tcphealth/tcphealth-paper.pdf

Not sure how pertinent is this paper today in 2012

I would prefer you add global counters, instead of per tcp counters that
most applications wont use at all.

Example of a more useful patch : add a counter of packets queued in Out
Of Order queue ( in tcp_data_queue_ofo() )

"netstat -s" will display the total count, without any changes in
userland tools/applications.




^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
@ 2012-07-14  7:56 Piotr Sawuk
  2012-07-14  8:27 ` Eric Dumazet
  2012-07-16 13:32 ` Ben Hutchings
  0 siblings, 2 replies; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-14  7:56 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On Sa, 14.07.2012, 03:31, valdis.kletnieks@vt.edu wrote:
> On Fri, 13 Jul 2012 16:55:44 -0700, Stephen Hemminger said:
>
>> >+			/* Course retransmit inefficiency- this packet has been received
>> twice. */
>> >+			tp->dup_pkts_recv++;
>> I don't understand that comment, could you use a better sentence please?
>
> I think what was intended was:
>
> /* Curse you, retransmit inefficiency! This packet has been received at
least twice */
>

LOL, no. I think "course retransmit" is short for "course-grained timeout
caused retransmit" but I can't be sure since I'm not the author of these
lines. I'll replace that comment with the non-shorthand version though.
however, I think the real comment here should be:

/*A perceived shortcoming of the standard TCP implementation: A
TCP receiver can get duplicate packets from the sender because it cannot
acknowledge packets that arrive out of order. These duplicates would happen
when the sender mistakenly thinks some packets have been lost by the network
because it does not receive acks for them but in reality they were
successfully received out of order. Since the receiver has no way of letting
the sender know about the receipt of these packets, they could potentially
be re-sent and re-received at the receiver. Not only do duplicate packets
waste precious Internet bandwidth but they hurt performance because the
sender mistakenly detects congestion from packet losses. The SACK TCP
extension speci\fcally addresses this issue. A large number of duplicate
packets received would indicate a signi\fcant bene\ft to the wide adoption of
SACK. The duplicatepacketsreceived metric is computed at the
receiver and counts these packets on a per-connection basis.*/

as copied from his thesis at [1]. also in the thesis he writes:

In our limited experiment, the results indicated no duplicate packets were
received on any connection in the 18 hour run. This leads us to several
conclusions. Since duplicate ACKs were seen on many connections we know that
some packets were lost or reordered, but unACKed reordered packets never
caused a /coursegrainedtimeouts/ on our connections. Only these timeouts
will cause duplicate packets to be received since less severe out-of-order
conditions will be resolved with fast retransmits. The lack of course
timeouts
may be due to the quality of UCSD's ActiveWeb network or the paucity of
large gaps between received packet groups. It should be noted that Linux 2.2
implements fast retransmits for up to two packet gaps, thus reducing the
need for course grained timeouts due to the lack of SACK.

[1] https://sacerdoti.org/tcphealth/tcphealth-paper.pdf






^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-13 23:55 ` Stephen Hemminger
@ 2012-07-14  1:31   ` valdis.kletnieks
  2012-07-16 11:33   ` Piotr Sawuk
  1 sibling, 0 replies; 24+ messages in thread
From: valdis.kletnieks @ 2012-07-14  1:31 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Piotr Sawuk, netdev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 368 bytes --]

On Fri, 13 Jul 2012 16:55:44 -0700, Stephen Hemminger said:

> >+			/* Course retransmit inefficiency- this packet has been received twice. */
> >+			tp->dup_pkts_recv++;
>
> I don't understand that comment, could you use a better sentence please?

I think what was intended was:

/* Curse you, retransmit inefficiency! This packet has been received at least twice */

[-- Attachment #2: Type: application/pgp-signature, Size: 865 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
  2012-07-13  7:33 Piotr Sawuk
@ 2012-07-13 23:55 ` Stephen Hemminger
  2012-07-14  1:31   ` valdis.kletnieks
  2012-07-16 11:33   ` Piotr Sawuk
  0 siblings, 2 replies; 24+ messages in thread
From: Stephen Hemminger @ 2012-07-13 23:55 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel

I am not sure if the is really necessary since the most
of the stats are available elsewhere.

Here are some comments on getting the simplified to match
the kernel style.

>
> static inline struct tcp_sock *tcp_sk(const struct sock *sk)
>diff -rub A/net/ipv4/tcp_input.c B/net/ipv4/tcp_input.c
>--- A/net/ipv4/tcp_input.c	2012-06-22 20:37:50.000000000 +0200
>+++ B/net/ipv4/tcp_input.c	2012-07-06 10:12:12.000000000 +0200
>@@ -4414,6 +4415,8 @@
> 		}
>
> 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
>+			/* Course retransmit inefficiency- this packet has been received twice. */
>+			tp->dup_pkts_recv++;

I don't understand that comment, could you use a better sentence please?

>
> 	tp->rx_opt.saw_tstamp = 0;
>
>+	/*
>+	 *	Tcp health monitoring is interested in
>+	 *	total per-connection packet arrivals.
>+	 *	This is in the fast path, but is quick.
>+	 */
>+	tp->pkts_recv++;
>+

Comment seems bigger justification than necessary for simple
operation.

>diff -rub A/net/ipv4/tcp_ipv4.c B/net/ipv4/tcp_ipv4.c
>--- A/net/ipv4/tcp_ipv4.c	2012-06-22 20:37:50.000000000 +0200
>+++ B/net/ipv4/tcp_ipv4.c	2012-07-11 09:34:22.000000000 +0200
>@@ -2533,6 +2533,82 @@
> 	return 0;
> }
>
>+
>+/*
>+ *	Output /proc/net/tcphealth
>+ */
>+#define LINESZ 128
>+
>+int tcp_health_seq_show(struct seq_file *seq, void *v)
>+{
>+	int len, num;
>+	char srcIP[32], destIP[32];
Unnecessary see below

>+
>+	unsigned long  SmoothedRttEstimate,
>+		AcksSent, DupAcksSent, PktsRecv, DupPktsRecv;

Do not use CamelCase in kernel code.

>+	struct tcp_iter_state *st;
>+
>+	if (v == SEQ_START_TOKEN) {
>+		seq_printf(seq,
>+		"TCP Health Monitoring (established connections only)\n"
>+		" -Duplicate ACKs indicate lost or reordered packets on the
>connection.\n"
>+		" -Duplicate Packets Received signal a slow and badly inefficient
>connection.\n"
>+		" -RttEst estimates how long future packets will take on a round trip
>over the connection.\n"
>+		"id   Local Address        Remote Address       RttEst(ms) AcksSent "

Header seems excessive, just put one line of header please.


>+		"DupAcksSent PktsRecv DupPktsRecv\n");
>+		goto out;
>+	}
>+
>+	/* Loop through established TCP connections */
>+	st = seq->private;
>+
>+
>+	if (st->state == TCP_SEQ_STATE_ESTABLISHED)
>+	{
>+/*	; //insert read-lock here */

Don't think you need read-lock

>+		const struct tcp_sock *tp = tcp_sk(v);
>+		const struct inet_sock *inet = inet_sk(v);
>+		__be32 dest = inet->inet_daddr;
>+		__be32 src = inet->inet_rcv_saddr;
>+		__u16 destp = ntohs(inet->inet_dport);
>+		__u16 srcp = ntohs(inet->inet_sport);
>+

These temp variables aren't redundant.

>+		num = st->num;
>+		SmoothedRttEstimate = (tp->srtt >> 3);
>+		AcksSent = tp->acks_sent;
>+		DupAcksSent = tp->dup_acks_sent;
>+		PktsRecv = tp->pkts_recv;
>+		DupPktsRecv = tp->dup_pkts_recv;
>+
>+		sprintf(srcIP, "%lu.%lu.%lu.%lu:%u",
>+			((src >> 24) & 0xFF), ((src >> 16) & 0xFF), ((src >> 8) & 0xFF), (src &
>0xFF),
>+			srcp);
>+		sprintf(destIP, "%3d.%3d.%3d.%3d:%u",
>+			((dest >> 24) & 0xFF), ((dest >> 16) & 0xFF), ((dest >> 8) & 0xFF),
>(dest & 0xFF),
>+			destp);
>+
>+		seq_printf(seq, "%d: %-21s %-21s "
>+				"%8lu %8lu %8lu %8lu %8lu%n",
>+				num,
>+				srcIP,
>+				destIP,
>+				SmoothedRttEstimate,
>+				AcksSent,
>+				DupAcksSent,
>+				PktsRecv,
>+				DupPktsRecv,
>+
>+				&len
>+			);
>+

Kernel has %pI4 to print IP addresses. 

	seq_printf(seq, "%d: %-21pI4 %-21pI4 "
			"%8lu %8lu %8lu %8lu %8lu\n",
			num,
			&inet->inet_rcv_saddr,
			&inet->inet_daddr,
			tp->srtt >> 3,
			tp->acks_sent,
			tp->dup_acks_sent,
			tp->pkts_recv,
			tp->dup_pkts_recv);
    
>+		seq_printf(seq, "%*s\n", LINESZ - 1 - len, "");

This padding of line is bogus, just print variable length line.
Are you trying to make it fixed length record file?



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: resurrecting tcphealth
@ 2012-07-13  7:33 Piotr Sawuk
  2012-07-13 23:55 ` Stephen Hemminger
  0 siblings, 1 reply; 24+ messages in thread
From: Piotr Sawuk @ 2012-07-13  7:33 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

On Do, 12.07.2012, 23:35, Stephen Hemminger wrote:
> On Thu, 12 Jul 2012 22:55:57 +0200
> "Piotr Sawuk" <a9702387@unet.univie.ac.at> wrote:
>
>> + *		Federico D. Sacerdoti:	Added TCP health monitoring.
>
> Please don't do this.
> The kernel community no longer maintains a list of contributors
> in the comments. The history is maintained in the git commit log.
>

thanks for the proof-reading, to Randy Dunlap too. now I have tested the
patch against mainline.

so, anyone has a comment on my actual question about the need for a read-lock?

currently my patch looks like this (again comments are welcome):

diff -rub A/include/linux/tcp.h B/include/linux/tcp.h
--- A/include/linux/tcp.h	2012-06-22 20:37:50.000000000 +0200
+++ B/include/linux/tcp.h	2012-07-06 10:23:13.000000000 +0200
@@ -472,6 +474,15 @@
 	 * contains related tcp_cookie_transactions fields.
 	 */
 	struct tcp_cookie_values  *cookie_values;
+
+	/*
+	 * TCP health monitoring counters.
+	 */
+	__u32	dup_acks_sent;
+	__u32	dup_pkts_recv;
+	__u32	acks_sent;
+	__u32	pkts_recv;
+	__u32	last_ack_sent;	/* Sequence number of the last ack we sent. */
 };

 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -rub A/net/ipv4/tcp_input.c B/net/ipv4/tcp_input.c
--- A/net/ipv4/tcp_input.c	2012-06-22 20:37:50.000000000 +0200
+++ B/net/ipv4/tcp_input.c	2012-07-06 10:12:12.000000000 +0200
@@ -4414,6 +4415,8 @@
 		}

 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+			/* Course retransmit inefficiency- this packet has been received twice. */
+			tp->dup_pkts_recv++;
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
@@ -4664,6 +4667,10 @@
 		return;
 	}

+	/* A packet is a "duplicate" if it contains bytes we have already
received. */
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+		tp->dup_pkts_recv++;
+
 	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 		/* A retransmit, 2nd most common case.  Force an immediate ack. */
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
@@ -5375,6 +5382,13 @@

 	tp->rx_opt.saw_tstamp = 0;

+	/*
+	 *	Tcp health monitoring is interested in
+	 *	total per-connection packet arrivals.
+	 *	This is in the fast path, but is quick.
+	 */
+	tp->pkts_recv++;
+
 	/*	pred_flags is 0xS?10 << 16 + snd_wnd
 	 *	if header_prediction is to be made
 	 *	'S' will always be tp->tcp_header_len >> 2
diff -rub A/net/ipv4/tcp_ipv4.c B/net/ipv4/tcp_ipv4.c
--- A/net/ipv4/tcp_ipv4.c	2012-06-22 20:37:50.000000000 +0200
+++ B/net/ipv4/tcp_ipv4.c	2012-07-11 09:34:22.000000000 +0200
@@ -2533,6 +2533,82 @@
 	return 0;
 }

+
+/*
+ *	Output /proc/net/tcphealth
+ */
+#define LINESZ 128
+
+int tcp_health_seq_show(struct seq_file *seq, void *v)
+{
+	int len, num;
+	char srcIP[32], destIP[32];
+
+	unsigned long  SmoothedRttEstimate,
+		AcksSent, DupAcksSent, PktsRecv, DupPktsRecv;
+	struct tcp_iter_state *st;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+		"TCP Health Monitoring (established connections only)\n"
+		" -Duplicate ACKs indicate lost or reordered packets on the
connection.\n"
+		" -Duplicate Packets Received signal a slow and badly inefficient
connection.\n"
+		" -RttEst estimates how long future packets will take on a round trip
over the connection.\n"
+		"id   Local Address        Remote Address       RttEst(ms) AcksSent "
+		"DupAcksSent PktsRecv DupPktsRecv\n");
+		goto out;
+	}
+
+	/* Loop through established TCP connections */
+	st = seq->private;
+
+
+	if (st->state == TCP_SEQ_STATE_ESTABLISHED)
+	{
+/*	; //insert read-lock here */
+		const struct tcp_sock *tp = tcp_sk(v);
+		const struct inet_sock *inet = inet_sk(v);
+		__be32 dest = inet->inet_daddr;
+		__be32 src = inet->inet_rcv_saddr;
+		__u16 destp = ntohs(inet->inet_dport);
+		__u16 srcp = ntohs(inet->inet_sport);
+
+		num = st->num;
+		SmoothedRttEstimate = (tp->srtt >> 3);
+		AcksSent = tp->acks_sent;
+		DupAcksSent = tp->dup_acks_sent;
+		PktsRecv = tp->pkts_recv;
+		DupPktsRecv = tp->dup_pkts_recv;
+
+		sprintf(srcIP, "%lu.%lu.%lu.%lu:%u",
+			((src >> 24) & 0xFF), ((src >> 16) & 0xFF), ((src >> 8) & 0xFF), (src &
0xFF),
+			srcp);
+		sprintf(destIP, "%3d.%3d.%3d.%3d:%u",
+			((dest >> 24) & 0xFF), ((dest >> 16) & 0xFF), ((dest >> 8) & 0xFF),
(dest & 0xFF),
+			destp);
+
+		seq_printf(seq, "%d: %-21s %-21s "
+				"%8lu %8lu %8lu %8lu %8lu%n",
+				num,
+				srcIP,
+				destIP,
+				SmoothedRttEstimate,
+				AcksSent,
+				DupAcksSent,
+				PktsRecv,
+				DupPktsRecv,
+
+				&len
+			);
+
+		seq_printf(seq, "%*s\n", LINESZ - 1 - len, "");
+/*	; //insert read-unlock here */
+	}
+
+out:
+	return 0;
+}
+
 static const struct file_operations tcp_afinfo_seq_fops = {
 	.owner   = THIS_MODULE,
 	.open    = tcp_seq_open,
@@ -2541,6 +2617,15 @@
 	.release = seq_release_net
 };

+static struct tcp_seq_afinfo tcphealth_seq_afinfo = {
+	.name		= "tcphealth",
+	.family		= AF_INET,
+	.seq_fops	= &tcp_afinfo_seq_fops,
+	.seq_ops	= {
+		.show		= tcp_health_seq_show,
+	},
+};
+
 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
 	.name		= "tcp",
 	.family		= AF_INET,
@@ -2552,12 +2637,16 @@

 static int __net_init tcp4_proc_init_net(struct net *net)
 {
-	return tcp_proc_register(net, &tcp4_seq_afinfo);
+	int ret = tcp_proc_register(net, &tcp4_seq_afinfo);
+	if(ret == 0)
+		ret = tcp_proc_register(net, &tcphealth_seq_afinfo);
+	return ret;
 }

 static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+	tcp_proc_unregister(net, &tcphealth_seq_afinfo);
 }

 static struct pernet_operations tcp4_net_ops = {
diff -rub A/net/ipv4/tcp_output.c B/net/ipv4/tcp_output.c
--- A/net/ipv4/tcp_output.c	2012-06-22 20:37:50.000000000 +0200
+++ B/net/ipv4/tcp_output.c	2012-07-06 17:15:14.000000000 +0200
@@ -2754,8 +2755,15 @@
 	skb_reserve(buff, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

+		/* If the rcv_nxt has not advanced since sending our last ACK, this is a
duplicate. */
+		if (tcp_sk(sk)->rcv_nxt == tcp_sk(sk)->last_ack_sent)
+			tcp_sk(sk)->dup_acks_sent++;
+		/* Record the total number of acks sent on this connection. */
+		tcp_sk(sk)->acks_sent++;
+
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tcp_sk(sk)->last_ack_sent = tcp_sk(sk)->rcv_nxt;
 	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
 }








^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2012-07-21 10:34 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-12 20:55 resurrecting tcphealth Piotr Sawuk
2012-07-12 21:35 ` Stephen Hemminger
2012-07-12 22:29 ` Randy Dunlap
2012-07-14 21:48 ` Stephen Hemminger
2012-07-14 23:43   ` Piotr Sawuk
2012-07-15  7:16     ` Eric Dumazet
2012-07-15  9:17       ` Piotr Sawuk
2012-07-15  9:53         ` Eric Dumazet
2012-07-15 22:17           ` Piotr Sawuk
2012-07-13  7:33 Piotr Sawuk
2012-07-13 23:55 ` Stephen Hemminger
2012-07-14  1:31   ` valdis.kletnieks
2012-07-16 11:33   ` Piotr Sawuk
2012-07-16 11:46     ` Eric Dumazet
2012-07-16 13:03       ` Piotr Sawuk
2012-07-20 14:06         ` Yuchung Cheng
2012-07-21 10:34           ` Piotr Sawuk
2012-07-14  7:56 Piotr Sawuk
2012-07-14  8:27 ` Eric Dumazet
2012-07-14 19:29   ` David Miller
2012-07-16 13:32 ` Ben Hutchings
2012-07-16 15:12   ` Piotr Sawuk
2012-07-16 15:24     ` Christoph Paasch
2012-07-19 10:37       ` Piotr Sawuk

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.