* [PATCH] /proc/net/tcp, overhead removed
@ 2009-09-26 21:31 ` Yakov Lerner
  0 siblings, 0 replies; 13+ messages in thread
From: Yakov Lerner @ 2009-09-26 21:31 UTC (permalink / raw)
  To: linux-kernel, netdev, davem, kuznet, pekkas, jmorris, yoshfuji,
	kaber, torvalds
  Cc: Yakov Lerner

With this patch, /proc/net/tcp handles 20,000 sockets in 60-80 milliseconds.

The overhead was in tcp_seq_start(); see analysis (3) below.
The patch is against Linus' git tree (1). The patch is small.

------------  -----------   ------------------------------------
Before patch  After patch   20,000 sockets (10,000 tw + 10,000 estab)(2)
------------  -----------   ------------------------------------
6 sec          0.06 sec     dd bs=1k if=/proc/net/tcp >/dev/null 
1.5 sec        0.06 sec     dd bs=4k if=/proc/net/tcp >/dev/null

1.9 sec        0.16 sec     netstat -4ant >/dev/null
------------  -----------   ------------------------------------

This is roughly a 25x improvement.
The new time does not depend on the read block size.
Netstat speed naturally improves too, for both -4 and -6.
/proc/net/tcp6 handles 20,000 sockets in 100 milliseconds.

(1) against git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git

(2) The 'manysock' utility was used to stress the system with a large number of sockets:
  "manysock 10000 10000"    - 10,000 tw + 10,000 estab ip4 sockets.
  "manysock -6 10000 10000" - 10,000 tw + 10,000 estab ip6 sockets.
It can be found at http://ilerner.3b1.org/manysock/manysock.c

(3) Algorithmic analysis. 
    Old algorithm.

During 'cat </proc/net/tcp', tcp_seq_start() is called O(numsockets) times (4).
On average, every call to tcp_seq_start() scans half of the whole hashtable. Ouch.
This is O(numsockets * hashsize). 95-99% of the time of 'cat </proc/net/tcp' is spent
in tcp_seq_start()->tcp_get_idx(). This overhead is eliminated by the new algorithm,
which is O(numsockets + hashsize).
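
For context, this is roughly how the seq_file core drives the iterator on
every read() of /proc/net/tcp (a simplified illustration, not the exact
fs/seq_file.c code; buffer_has_room() stands in for the real buffer check):

        p = op->start(m, &m->index);   /* old code: walks from bucket 0 up to *pos */
        while (p && buffer_has_room(m)) {
                op->show(m, p);                 /* format one socket line */
                p = op->next(m, p, &m->index);  /* cheap: step to the next socket */
        }
        op->stop(m, p);
        /* m->index is kept; the next read() calls ->start() with it again */

Because ->start() runs once per read() and the old tcp_get_idx() counts its
way up from the very first bucket every time, the cost over a whole 'cat'
adds up to O(numsockets * hashsize).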

    New algorithm.

The new algorithm is O(numsockets + hashsize). We jump to the right
hash bucket in tcp_seq_start(), without scanning half of the hash.
To jump straight to the hash bucket corresponding to *pos in tcp_seq_start(),
we reuse three pieces of state (st->num, st->bucket, st->sbucket)
as follows (see the sketch below):
 - we check that the requested pos is >= the last seen pos (st->num), the typical case;
 - if so, we jump to bucket st->bucket;
 - to arrive at the right item after the beginning of st->bucket, we keep in
   st->sbucket the position corresponding to the beginning of that bucket.
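
In pseudocode, the fast path added to tcp_seq_start() looks roughly like this
(simplified from the diff below; in_established_or_tw() is only shorthand for
the two-state check in the actual patch):

        if (*pos && *pos >= st->sbucket && in_established_or_tw(st)) {
                st->num = st->sbucket;  /* pos at the start of st->bucket */
                cur = established_get_first_after(seq, st->bucket);
                /* step over the entries of this bucket already reported */
                for (nskip = *pos - st->sbucket; nskip > 0 && cur; --nskip)
                        cur = established_get_next(seq, cur);
                return cur;
        }
        /* otherwise fall back to the full rescan from the listening table */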

(4) Explanation of the O(numsockets * hashsize) cost of the old algorithm.

tcp_seq_start() is called once for every ~7 lines of netstat output
if the read size is 1 KB, or once for every ~28 lines if the read size is >= 4 KB.
Since each /proc/net/tcp record is about 150 bytes long, the number of calls
to tcp_seq_start() is approximately
            numsockets * 150 / min(4096, readsize)
Netstat uses a 4 KB read size (newer versions) or 1 KB (older versions).
Note that the speed of the old algorithm does not improve above a 4 KB block size.
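
For the 20,000-socket test above this gives, roughly (the 150-byte record
size is the estimate used here, so these are only ballpark figures):

            20000 * 150 / 1024 ~= 2930 calls to tcp_seq_start()   (1 KB reads)
            20000 * 150 / 4096 ~=  730 calls to tcp_seq_start()   (4 KB reads)

and with the old code each of those calls rescans, on average, half of the
hash table, which is where the seconds in the table above went.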

The speed of the new algorithm does not depend on the block size.

The speed of the new algorithm does not perceptibly depend on hashsize (which
depends on RAM size). The speed of the old algorithm drops with a bigger hashsize.

(5) Reporting order.

The reporting order is exactly the same as before if the hash does not change
underfoot. When hash elements come and go during the report, the reporting
order will be the same as that of tcpdiag.

Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
---
 net/ipv4/tcp_ipv4.c |   26 ++++++++++++++++++++++++--
 1 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b..7d9421a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
 }
 
-static void *established_get_first(struct seq_file *seq)
+static void *established_get_first_after(struct seq_file *seq, int bucket)
 {
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	void *rc = NULL;
 
-	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
+	for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
+	     ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
 		struct inet_timewait_sock *tw;
@@ -2036,6 +2037,11 @@ out:
 	return rc;
 }
 
+static void *established_get_first(struct seq_file *seq)
+{
+	return established_get_first_after(seq, 0);
+}
+
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
 	struct sock *sk = cur;
@@ -2045,6 +2051,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	struct net *net = seq_file_net(seq);
 
 	++st->num;
+	st->sbucket = st->num;
 
 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
 		tw = cur;
@@ -2116,6 +2123,21 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct tcp_iter_state *st = seq->private;
+
+	if (*pos && *pos >= st->sbucket &&
+	    (st->state == TCP_SEQ_STATE_ESTABLISHED ||
+	     st->state == TCP_SEQ_STATE_TIME_WAIT)) {
+		int nskip;
+		void *cur;
+
+		st->num = st->sbucket;
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		cur = established_get_first_after(seq, st->bucket);
+		for (nskip = *pos - st->sbucket; nskip > 0 && cur; --nskip)
+			cur = established_get_next(seq, cur);
+		return cur;
+	}
+
 	st->state = TCP_SEQ_STATE_LISTENING;
 	st->num = 0;
 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
-- 
1.6.5.rc2


* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-26 21:31 ` Yakov Lerner
@ 2009-09-27  9:53 ` Eric Dumazet
  2009-09-28 22:10   ` Yakov Lerner
  -1 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2009-09-27  9:53 UTC (permalink / raw)
  To: Yakov Lerner
  Cc: linux-kernel, netdev, davem, kuznet, pekkas, jmorris, yoshfuji,
	kaber, torvalds

Yakov Lerner wrote:
> /proc/net/tcp does 20,000 sockets in 60-80 milliseconds, with this patch.
> 
> The overhead was in tcp_seq_start(). See analysis (3) below.
> The patch is against Linus git tree (1). The patch is small.
> 
> ------------  -----------   ------------------------------------
> Before patch  After patch   20,000 sockets (10,000 tw + 10,000 estab)(2)
> ------------  -----------   ------------------------------------
> 6 sec          0.06 sec     dd bs=1k if=/proc/net/tcp >/dev/null 
> 1.5 sec        0.06 sec     dd bs=4k if=/proc/net/tcp >/dev/null
> 
> 1.9 sec        0.16 sec     netstat -4ant >/dev/null
> ------------  -----------   ------------------------------------
> 
> This is ~ x25 improvement.
> The new time is not dependent on read blockize.
> Speed of netstat, naturally, improves, too; both -4 and -6.
> /proc/net/tcp6 does 20,000 sockets in 100 millisec.
> 
> (1) against git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
> 
> (2) Used 'manysock' utility to stress system with large number of sockets:
>   "manysock 10000 10000"    - 10,000 tw + 10,000 estab ip4 sockets.
>   "manysock -6 10000 10000" - 10,000 tw + 10,000 estab ip6 sockets.
> Found at http://ilerner.3b1.org/manysock/manysock.c
> 
> (3) Algorithmic analysis. 
>     Old algorithm.
> 
> During 'cat </proc/net/tcp', tcp_seq_start() is called O(numsockets) times (4).
> On average, every call to tcp_seq_start() scans half the whole hashtable. Ouch.
> This is O(numsockets * hashsize). 95-99% of 'cat </proc/net/tcp' is spent in
> tcp_seq_start()->tcp_get_idx. This overhead is eliminated by new algorithm,
> which is O(numsockets + hashsize).
> 
>     New algorithm.
> 
> New algorithms is O(numsockets + hashsize). We jump to the right
> hash bucket in tcp_seq_start(), without scanning half the hash.
> To jump right to the hash bucket corresponding to *pos in tcp_seq_start(),
> we reuse three pieces of state (st->num, st->bucket, st->sbucket)
> as follows:
>  - we check that requested pos >= last seen pos (st->num), the typical case. 
>  - if so, we jump to bucket st->bucket
>  - to arrive to the right item after beginning of st->bucket, we
> keep in st->sbucket the position corresponding to the beginning of
> bucket.
> 
> (4) Explanation of O( numsockets * hashsize) of old algorithm.
> 
> tcp_seq_start() is called once for every ~7 lines of netstat output 
> if readsize is 1kb, or once for every ~28 lines if readsize >= 4kb.
> Since record length of /proc/net/tcp records is 150 bytes, formula for
> number of calls to tcp_seq_start() is
>             (numsockets * 150 / min(4096,readsize)).
> Netstat uses 4kb readsize (newer versions), or 1kb (older versions).
> Note that speed of old algorithm does not improve above 4kb blocksize.
> 
> Speed of the new algorithm does not depend on blocksize.
> 
> Speed of the new algorithm does not perceptibly depend on hashsize (which
> depends on ramsize). Speed of old algorithm drops with bigger hashsize.
> 
> (5) Reporting order.
> 
> Reporting order is exactly same as before if hash does not change underfoot.
> When hash elements come and go during report, reporting order will be
> same as that of tcpdiag.
> 
> Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
> ---
>  net/ipv4/tcp_ipv4.c |   26 ++++++++++++++++++++++++--
>  1 files changed, 24 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 7cda24b..7d9421a 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
>  		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
>  }
>  
> -static void *established_get_first(struct seq_file *seq)
> +static void *established_get_first_after(struct seq_file *seq, int bucket)
>  {
>  	struct tcp_iter_state *st = seq->private;
>  	struct net *net = seq_file_net(seq);
>  	void *rc = NULL;
>  
> -	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
> +	for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
> +	     ++st->bucket) {
>  		struct sock *sk;
>  		struct hlist_nulls_node *node;
>  		struct inet_timewait_sock *tw;
> @@ -2036,6 +2037,11 @@ out:
>  	return rc;
>  }
>  
> +static void *established_get_first(struct seq_file *seq)
> +{
> +	return established_get_first_after(seq, 0);
> +}
> +
>  static void *established_get_next(struct seq_file *seq, void *cur)
>  {
>  	struct sock *sk = cur;
> @@ -2045,6 +2051,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
>  	struct net *net = seq_file_net(seq);
>  
>  	++st->num;
> +	st->sbucket = st->num;

Hello Yakov

The intention of your patch is very good, but it is not currently working.

It seems you believe there is at most one entry per hash slot, or something like that.

Please reboot your test machine with "thash_entries=4096" so that the tcp hash
size is 4096, and try to fill 20,000 tcp sockets with a test program.

Then:

# ss | wc -l
20001
(ok)

# cat /proc/net/tcp | wc -l
22160
(not quite correct ...)

# netstat -tn | wc -l
<never ends>


# dd if=/proc/net/tcp ibs=1024 | wc -l
<never ends>


Please send your next patch to netdev@vger.kernel.org and DaveM only, where netdev
people review netdev patches; there is no need to include other people for first
submissions.

Thank you


#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int fdlisten;

/* Fill the TCP established hash: a listener on port 2222, a forked child
 * that accepts everything, and 10,000 client sockets connecting to it,
 * giving roughly 20,000 established sockets in total.
 */
int main(void)
{
        int i;
        struct sockaddr_in sockaddr;

        fdlisten = socket(AF_INET, SOCK_STREAM, 0);
        memset(&sockaddr, 0, sizeof(sockaddr));
        sockaddr.sin_family = AF_INET;
        sockaddr.sin_port = htons(2222);
        if (bind(fdlisten, (struct sockaddr *)&sockaddr, sizeof(sockaddr)) == -1) {
                perror("bind");
                return 1;
        }
        if (listen(fdlisten, 10) == -1) {
                perror("listen");
                return 1;
        }
        if (fork() == 0) {
                /* child: accept incoming connections forever */
                while (1) {
                        socklen_t len = sizeof(sockaddr);
                        accept(fdlisten, (struct sockaddr *)&sockaddr, &len);
                }
        }
        /* parent: open 10,000 client connections to the listener */
        for (i = 0; i < 10000; i++) {
                int fd = socket(AF_INET, SOCK_STREAM, 0);
                if (fd == -1) {
                        perror("socket");
                        break;
                }
                connect(fd, (struct sockaddr *)&sockaddr, sizeof(sockaddr));
        }
        pause();        /* keep the sockets alive */
}
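
A possible way to build and drive the reproducer (the gcc invocation and the
file name are assumptions; thash_entries is the boot parameter mentioned above):

        # boot the test kernel with:  thash_entries=4096
        $ gcc -O2 -o tcpfill tcpfill.c
        $ ./tcpfill &
        $ ss | wc -l                  # ~20001, as in the example above
        $ cat /proc/net/tcp | wc -l   # should be in the same ballpark once the patch is correct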


* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-27  9:53 ` Eric Dumazet
@ 2009-09-28 22:10   ` Yakov Lerner
  2009-09-28 22:20     ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Yakov Lerner @ 2009-09-28 22:10 UTC (permalink / raw)
  To: netdev, Eric Dumazet, David Miller

On Sun, Sep 27, 2009 at 12:53, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Yakov Lerner a écrit :
>> /proc/net/tcp does 20,000 sockets in 60-80 milliseconds, with this patch.
>>
>> The overhead was in tcp_seq_start(). See analysis (3) below.
>> The patch is against Linus git tree (1). The patch is small.
>>
>> ------------  -----------   ------------------------------------
>> Before patch  After patch   20,000 sockets (10,000 tw + 10,000 estab)(2)
>> ------------  -----------   ------------------------------------
>> 6 sec          0.06 sec     dd bs=1k if=/proc/net/tcp >/dev/null
>> 1.5 sec        0.06 sec     dd bs=4k if=/proc/net/tcp >/dev/null
>>
>> 1.9 sec        0.16 sec     netstat -4ant >/dev/null
>> ------------  -----------   ------------------------------------
>>
>> This is ~ x25 improvement.
>> The new time is not dependent on read blockize.
>> Speed of netstat, naturally, improves, too; both -4 and -6.
>> /proc/net/tcp6 does 20,000 sockets in 100 millisec.
>>
>> (1) against git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
>>
>> (2) Used 'manysock' utility to stress system with large number of sockets:
>>   "manysock 10000 10000"    - 10,000 tw + 10,000 estab ip4 sockets.
>>   "manysock -6 10000 10000" - 10,000 tw + 10,000 estab ip6 sockets.
>> Found at http://ilerner.3b1.org/manysock/manysock.c
>>
>> (3) Algorithmic analysis.
>>     Old algorithm.
>>
>> During 'cat </proc/net/tcp', tcp_seq_start() is called O(numsockets) times (4).
>> On average, every call to tcp_seq_start() scans half the whole hashtable. Ouch.
>> This is O(numsockets * hashsize). 95-99% of 'cat </proc/net/tcp' is spent in
>> tcp_seq_start()->tcp_get_idx. This overhead is eliminated by new algorithm,
>> which is O(numsockets + hashsize).
>>
>>     New algorithm.
>>
>> New algorithms is O(numsockets + hashsize). We jump to the right
>> hash bucket in tcp_seq_start(), without scanning half the hash.
>> To jump right to the hash bucket corresponding to *pos in tcp_seq_start(),
>> we reuse three pieces of state (st->num, st->bucket, st->sbucket)
>> as follows:
>>  - we check that requested pos >= last seen pos (st->num), the typical case.
>>  - if so, we jump to bucket st->bucket
>>  - to arrive to the right item after beginning of st->bucket, we
>> keep in st->sbucket the position corresponding to the beginning of
>> bucket.
>>
>> (4) Explanation of O( numsockets * hashsize) of old algorithm.
>>
>> tcp_seq_start() is called once for every ~7 lines of netstat output
>> if readsize is 1kb, or once for every ~28 lines if readsize >= 4kb.
>> Since record length of /proc/net/tcp records is 150 bytes, formula for
>> number of calls to tcp_seq_start() is
>>             (numsockets * 150 / min(4096,readsize)).
>> Netstat uses 4kb readsize (newer versions), or 1kb (older versions).
>> Note that speed of old algorithm does not improve above 4kb blocksize.
>>
>> Speed of the new algorithm does not depend on blocksize.
>>
>> Speed of the new algorithm does not perceptibly depend on hashsize (which
>> depends on ramsize). Speed of old algorithm drops with bigger hashsize.
>>
>> (5) Reporting order.
>>
>> Reporting order is exactly same as before if hash does not change underfoot.
>> When hash elements come and go during report, reporting order will be
>> same as that of tcpdiag.
>>
>> Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
>> ---
>>  net/ipv4/tcp_ipv4.c |   26 ++++++++++++++++++++++++--
>>  1 files changed, 24 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index 7cda24b..7d9421a 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
>>               hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
>>  }
>>
>> -static void *established_get_first(struct seq_file *seq)
>> +static void *established_get_first_after(struct seq_file *seq, int bucket)
>>  {
>>       struct tcp_iter_state *st = seq->private;
>>       struct net *net = seq_file_net(seq);
>>       void *rc = NULL;
>>
>> -     for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
>> +     for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
>> +          ++st->bucket) {
>>               struct sock *sk;
>>               struct hlist_nulls_node *node;
>>               struct inet_timewait_sock *tw;
>> @@ -2036,6 +2037,11 @@ out:
>>       return rc;
>>  }
>>
>> +static void *established_get_first(struct seq_file *seq)
>> +{
>> +     return established_get_first_after(seq, 0);
>> +}
>> +
>>  static void *established_get_next(struct seq_file *seq, void *cur)
>>  {
>>       struct sock *sk = cur;
>> @@ -2045,6 +2051,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
>>       struct net *net = seq_file_net(seq);
>>
>>       ++st->num;
>> +     st->sbucket = st->num;
>
> Hello Yakov
>
> Intention of your patch is very good, but not currently working.
>
> It seems you believe there is at most one entry per hash slot or something like that
>
> Please reboot your test machine with "thash_entries=4096" so that tcp hash
> size is 4096, and try to fill 20000 tcp sockets with a test program.
>
> then :
>
> # ss | wc -l
> 20001
> (ok)
>
> # cat /proc/net/tcp | wc -l
> 22160
> (not quite correct ...)
>
> # netstat -tn | wc -l
> <never ends>
>
>
> # dd if=/proc/net/tcp ibs=1024 | wc -l
> <never ends>
>
>
> Please send your next patch on netdev@vger.kernel.org , DaveM only , were netdev people
> are reviewing netdev patches, there is no need include other people for first submissions.
>
> Thank you
>
>
> #include <sys/types.h>
> #include <sys/socket.h>
> #include <netinet/in.h>
> #include <string.h>
> int fdlisten;
> main()
> {
>        int i;
>        struct sockaddr_in sockaddr;
>
>        fdlisten = socket(AF_INET, SOCK_STREAM, 0);
>        memset(&sockaddr, 0, sizeof(sockaddr));
>        sockaddr.sin_family = AF_INET;
>        sockaddr.sin_port = htons(2222);
>        if (bind(fdlisten, (struct sockaddr *)&sockaddr, sizeof(sockaddr))== -1) {
>                perror("bind");
>                return 1;
>        }
>        if (listen(fdlisten, 10)== -1) {
>                perror("listen");
>                return 1;
>        }
>        if (fork() == 0) {
>                while (1) {
>                        socklen_t len = sizeof(sockaddr);
>                        int newfd = accept(fdlisten, (struct sockaddr *)&sockaddr, &len);
>                }
>        }
>        for (i = 0 ; i < 10000; i++) {
>                int fd = socket(AF_INET, SOCK_STREAM, 0);
>                if (fd == -1) {
>                        perror("socket");
>                        break;
>                        }
>                connect(fd, (struct sockaddr *)&sockaddr, sizeof(sockaddr));
>        }
>        pause();
> }
>

Hello Eric,

I found the problem, thanks. I'll re-send after testing.

In the meantime, I'd like to ask you whether it makes sense to
add a /proc/net entry to switch between the "old way" and the "new way".
The switch would allow a quick compare/test between the new way and the
old way, not only by line count but by full contents, without a reboot.

Yakov


* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-28 22:10   ` Yakov Lerner
@ 2009-09-28 22:20     ` Eric Dumazet
  2009-09-28 23:24       ` Stephen Hemminger
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2009-09-28 22:20 UTC (permalink / raw)
  To: Yakov Lerner; +Cc: netdev, Eric Dumazet, David Miller

Yakov Lerner wrote:
> On Sun, Sep 27, 2009 at 12:53, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> Yakov Lerner a écrit :
>>> /proc/net/tcp does 20,000 sockets in 60-80 milliseconds, with this patch.
>>>
>>> The overhead was in tcp_seq_start(). See analysis (3) below.
>>> The patch is against Linus git tree (1). The patch is small.
>>>
>>> ------------  -----------   ------------------------------------
>>> Before patch  After patch   20,000 sockets (10,000 tw + 10,000 estab)(2)
>>> ------------  -----------   ------------------------------------
>>> 6 sec          0.06 sec     dd bs=1k if=/proc/net/tcp >/dev/null
>>> 1.5 sec        0.06 sec     dd bs=4k if=/proc/net/tcp >/dev/null
>>>
>>> 1.9 sec        0.16 sec     netstat -4ant >/dev/null
>>> ------------  -----------   ------------------------------------
>>>
>>> This is ~ x25 improvement.
>>> The new time is not dependent on read blockize.
>>> Speed of netstat, naturally, improves, too; both -4 and -6.
>>> /proc/net/tcp6 does 20,000 sockets in 100 millisec.
>>>
>>> (1) against git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
>>>
>>> (2) Used 'manysock' utility to stress system with large number of sockets:
>>>   "manysock 10000 10000"    - 10,000 tw + 10,000 estab ip4 sockets.
>>>   "manysock -6 10000 10000" - 10,000 tw + 10,000 estab ip6 sockets.
>>> Found at http://ilerner.3b1.org/manysock/manysock.c
>>>
>>> (3) Algorithmic analysis.
>>>     Old algorithm.
>>>
>>> During 'cat </proc/net/tcp', tcp_seq_start() is called O(numsockets) times (4).
>>> On average, every call to tcp_seq_start() scans half the whole hashtable. Ouch.
>>> This is O(numsockets * hashsize). 95-99% of 'cat </proc/net/tcp' is spent in
>>> tcp_seq_start()->tcp_get_idx. This overhead is eliminated by new algorithm,
>>> which is O(numsockets + hashsize).
>>>
>>>     New algorithm.
>>>
>>> New algorithms is O(numsockets + hashsize). We jump to the right
>>> hash bucket in tcp_seq_start(), without scanning half the hash.
>>> To jump right to the hash bucket corresponding to *pos in tcp_seq_start(),
>>> we reuse three pieces of state (st->num, st->bucket, st->sbucket)
>>> as follows:
>>>  - we check that requested pos >= last seen pos (st->num), the typical case.
>>>  - if so, we jump to bucket st->bucket
>>>  - to arrive to the right item after beginning of st->bucket, we
>>> keep in st->sbucket the position corresponding to the beginning of
>>> bucket.
>>>
>>> (4) Explanation of O( numsockets * hashsize) of old algorithm.
>>>
>>> tcp_seq_start() is called once for every ~7 lines of netstat output
>>> if readsize is 1kb, or once for every ~28 lines if readsize >= 4kb.
>>> Since record length of /proc/net/tcp records is 150 bytes, formula for
>>> number of calls to tcp_seq_start() is
>>>             (numsockets * 150 / min(4096,readsize)).
>>> Netstat uses 4kb readsize (newer versions), or 1kb (older versions).
>>> Note that speed of old algorithm does not improve above 4kb blocksize.
>>>
>>> Speed of the new algorithm does not depend on blocksize.
>>>
>>> Speed of the new algorithm does not perceptibly depend on hashsize (which
>>> depends on ramsize). Speed of old algorithm drops with bigger hashsize.
>>>
>>> (5) Reporting order.
>>>
>>> Reporting order is exactly same as before if hash does not change underfoot.
>>> When hash elements come and go during report, reporting order will be
>>> same as that of tcpdiag.
>>>
>>> Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
>>> ---
>>>  net/ipv4/tcp_ipv4.c |   26 ++++++++++++++++++++++++--
>>>  1 files changed, 24 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>> index 7cda24b..7d9421a 100644
>>> --- a/net/ipv4/tcp_ipv4.c
>>> +++ b/net/ipv4/tcp_ipv4.c
>>> @@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
>>>               hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
>>>  }
>>>
>>> -static void *established_get_first(struct seq_file *seq)
>>> +static void *established_get_first_after(struct seq_file *seq, int bucket)
>>>  {
>>>       struct tcp_iter_state *st = seq->private;
>>>       struct net *net = seq_file_net(seq);
>>>       void *rc = NULL;
>>>
>>> -     for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
>>> +     for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
>>> +          ++st->bucket) {
>>>               struct sock *sk;
>>>               struct hlist_nulls_node *node;
>>>               struct inet_timewait_sock *tw;
>>> @@ -2036,6 +2037,11 @@ out:
>>>       return rc;
>>>  }
>>>
>>> +static void *established_get_first(struct seq_file *seq)
>>> +{
>>> +     return established_get_first_after(seq, 0);
>>> +}
>>> +
>>>  static void *established_get_next(struct seq_file *seq, void *cur)
>>>  {
>>>       struct sock *sk = cur;
>>> @@ -2045,6 +2051,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
>>>       struct net *net = seq_file_net(seq);
>>>
>>>       ++st->num;
>>> +     st->sbucket = st->num;
>> Hello Yakov
>>
>> Intention of your patch is very good, but not currently working.
>>
>> It seems you believe there is at most one entry per hash slot or something like that
>>
>> Please reboot your test machine with "thash_entries=4096" so that tcp hash
>> size is 4096, and try to fill 20000 tcp sockets with a test program.
>>
>> then :
>>
>> # ss | wc -l
>> 20001
>> (ok)
>>
>> # cat /proc/net/tcp | wc -l
>> 22160
>> (not quite correct ...)
>>
>> # netstat -tn | wc -l
>> <never ends>
>>
>>
>> # dd if=/proc/net/tcp ibs=1024 | wc -l
>> <never ends>
>>
>>
>> Please send your next patch on netdev@vger.kernel.org , DaveM only , were netdev people
>> are reviewing netdev patches, there is no need include other people for first submissions.
>>
>> Thank you
>>
>>
>> #include <sys/types.h>
>> #include <sys/socket.h>
>> #include <netinet/in.h>
>> #include <string.h>
>> int fdlisten;
>> main()
>> {
>>        int i;
>>        struct sockaddr_in sockaddr;
>>
>>        fdlisten = socket(AF_INET, SOCK_STREAM, 0);
>>        memset(&sockaddr, 0, sizeof(sockaddr));
>>        sockaddr.sin_family = AF_INET;
>>        sockaddr.sin_port = htons(2222);
>>        if (bind(fdlisten, (struct sockaddr *)&sockaddr, sizeof(sockaddr))== -1) {
>>                perror("bind");
>>                return 1;
>>        }
>>        if (listen(fdlisten, 10)== -1) {
>>                perror("listen");
>>                return 1;
>>        }
>>        if (fork() == 0) {
>>                while (1) {
>>                        socklen_t len = sizeof(sockaddr);
>>                        int newfd = accept(fdlisten, (struct sockaddr *)&sockaddr, &len);
>>                }
>>        }
>>        for (i = 0 ; i < 10000; i++) {
>>                int fd = socket(AF_INET, SOCK_STREAM, 0);
>>                if (fd == -1) {
>>                        perror("socket");
>>                        break;
>>                        }
>>                connect(fd, (struct sockaddr *)&sockaddr, sizeof(sockaddr));
>>        }
>>        pause();
>> }
>>
> 
> Hello Eric,
> 
> I found the problem, thanks. I'll re-send after testing.

OK, good!

> 
> In the meantime, I'd like to ask you whether it makes sense to
> add the /proc/net entry, to switch between "old way" and "new way".
> The switch would allow quick compare/test between new way and
> old way not only by line count, but by full contents, without reboot.
> 

Well, this switch won't be needed for patch validation, but it might help
you to test your patch, of course.

Actually, I found the error by reading your patch, and I made a quick test to
confirm my understanding :)

See you tomorrow, it's rather late here :)


* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-28 22:20     ` Eric Dumazet
@ 2009-09-28 23:24       ` Stephen Hemminger
  2009-09-29  7:43         ` Yakov Lerner
  0 siblings, 1 reply; 13+ messages in thread
From: Stephen Hemminger @ 2009-09-28 23:24 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Yakov Lerner, netdev, Eric Dumazet, David Miller

On Tue, 29 Sep 2009 00:20:07 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Yakov Lerner a écrit :
> > On Sun, Sep 27, 2009 at 12:53, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> Yakov Lerner a écrit :
> >>> /proc/net/tcp does 20,000 sockets in 60-80 milliseconds, with this patch.
> >>>
> >>> The overhead was in tcp_seq_start(). See analysis (3) below.
> >>> The patch is against Linus git tree (1). The patch is small.
> >>>
> >>> ------------  -----------   ------------------------------------
> >>> Before patch  After patch   20,000 sockets (10,000 tw + 10,000 estab)(2)
> >>> ------------  -----------   ------------------------------------
> >>> 6 sec          0.06 sec     dd bs=1k if=/proc/net/tcp >/dev/null
> >>> 1.5 sec        0.06 sec     dd bs=4k if=/proc/net/tcp >/dev/null
> >>>
> >>> 1.9 sec        0.16 sec     netstat -4ant >/dev/null
> >>> ------------  -----------   ------------------------------------
> >>>
> >>> This is ~ x25 improvement.
> >>> The new time is not dependent on read blockize.
> >>> Speed of netstat, naturally, improves, too; both -4 and -6.
> >>> /proc/net/tcp6 does 20,000 sockets in 100 millisec.
> >>>
> >>> (1) against git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
> >>>
> >>> (2) Used 'manysock' utility to stress system with large number of sockets:
> >>>   "manysock 10000 10000"    - 10,000 tw + 10,000 estab ip4 sockets.
> >>>   "manysock -6 10000 10000" - 10,000 tw + 10,000 estab ip6 sockets.
> >>> Found at http://ilerner.3b1.org/manysock/manysock.c
> >>>
> >>> (3) Algorithmic analysis.
> >>>     Old algorithm.
> >>>
> >>> During 'cat </proc/net/tcp', tcp_seq_start() is called O(numsockets) times (4).
> >>> On average, every call to tcp_seq_start() scans half the whole hashtable. Ouch.
> >>> This is O(numsockets * hashsize). 95-99% of 'cat </proc/net/tcp' is spent in
> >>> tcp_seq_start()->tcp_get_idx. This overhead is eliminated by new algorithm,
> >>> which is O(numsockets + hashsize).
> >>>
> >>>     New algorithm.
> >>>
> >>> New algorithms is O(numsockets + hashsize). We jump to the right
> >>> hash bucket in tcp_seq_start(), without scanning half the hash.
> >>> To jump right to the hash bucket corresponding to *pos in tcp_seq_start(),
> >>> we reuse three pieces of state (st->num, st->bucket, st->sbucket)
> >>> as follows:
> >>>  - we check that requested pos >= last seen pos (st->num), the typical case.
> >>>  - if so, we jump to bucket st->bucket
> >>>  - to arrive to the right item after beginning of st->bucket, we
> >>> keep in st->sbucket the position corresponding to the beginning of
> >>> bucket.
> >>>
> >>> (4) Explanation of O( numsockets * hashsize) of old algorithm.
> >>>
> >>> tcp_seq_start() is called once for every ~7 lines of netstat output
> >>> if readsize is 1kb, or once for every ~28 lines if readsize >= 4kb.
> >>> Since record length of /proc/net/tcp records is 150 bytes, formula for
> >>> number of calls to tcp_seq_start() is
> >>>             (numsockets * 150 / min(4096,readsize)).
> >>> Netstat uses 4kb readsize (newer versions), or 1kb (older versions).
> >>> Note that speed of old algorithm does not improve above 4kb blocksize.
> >>>
> >>> Speed of the new algorithm does not depend on blocksize.
> >>>
> >>> Speed of the new algorithm does not perceptibly depend on hashsize (which
> >>> depends on ramsize). Speed of old algorithm drops with bigger hashsize.
> >>>
> >>> (5) Reporting order.
> >>>
> >>> Reporting order is exactly same as before if hash does not change underfoot.
> >>> When hash elements come and go during report, reporting order will be
> >>> same as that of tcpdiag.
> >>>
> >>> Signed-off-by: Yakov Lerner <iler.ml@gmail.com>

Does the netlink interface used by the ss command have the problem?

-- 


* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-28 23:24       ` Stephen Hemminger
@ 2009-09-29  7:43         ` Yakov Lerner
  0 siblings, 0 replies; 13+ messages in thread
From: Yakov Lerner @ 2009-09-29  7:43 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, netdev, David Miller

On Tue, Sep 29, 2009 at 02:24, Stephen Hemminger <shemminger@vyatta.com> wrote:
> On Tue, 29 Sep 2009 00:20:07 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>> Yakov Lerner a écrit :
>> > On Sun, Sep 27, 2009 at 12:53, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> >> Yakov Lerner a écrit :
>> >>> /proc/net/tcp does 20,000 sockets in 60-80 milliseconds, with this patch.
>> >>>
>> >>> The overhead was in tcp_seq_start(). See analysis (3) below.
>> >>> The patch is against Linus git tree (1). The patch is small.
>> >>>
>> >>> ------------  -----------   ------------------------------------
>> >>> Before patch  After patch   20,000 sockets (10,000 tw + 10,000 estab)(2)
>> >>> ------------  -----------   ------------------------------------
>> >>> 6 sec          0.06 sec     dd bs=1k if=/proc/net/tcp >/dev/null
>> >>> 1.5 sec        0.06 sec     dd bs=4k if=/proc/net/tcp >/dev/null
>> >>>
>> >>> 1.9 sec        0.16 sec     netstat -4ant >/dev/null
>> >>> ------------  -----------   ------------------------------------
>> >>>
>> >>> This is ~ x25 improvement.
>> >>> The new time is not dependent on read blockize.
>> >>> Speed of netstat, naturally, improves, too; both -4 and -6.
>> >>> /proc/net/tcp6 does 20,000 sockets in 100 millisec.
>> >>>
>> >>> (1) against git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
>> >>>
>> >>> (2) Used 'manysock' utility to stress system with large number of sockets:
>> >>>   "manysock 10000 10000"    - 10,000 tw + 10,000 estab ip4 sockets.
>> >>>   "manysock -6 10000 10000" - 10,000 tw + 10,000 estab ip6 sockets.
>> >>> Found at http://ilerner.3b1.org/manysock/manysock.c
>> >>>
>> >>> (3) Algorithmic analysis.
>> >>>     Old algorithm.
>> >>>
>> >>> During 'cat </proc/net/tcp', tcp_seq_start() is called O(numsockets) times (4).
>> >>> On average, every call to tcp_seq_start() scans half the whole hashtable. Ouch.
>> >>> This is O(numsockets * hashsize). 95-99% of 'cat </proc/net/tcp' is spent in
>> >>> tcp_seq_start()->tcp_get_idx. This overhead is eliminated by new algorithm,
>> >>> which is O(numsockets + hashsize).
>> >>>
>> >>>     New algorithm.
>> >>>
>> >>> New algorithms is O(numsockets + hashsize). We jump to the right
>> >>> hash bucket in tcp_seq_start(), without scanning half the hash.
>> >>> To jump right to the hash bucket corresponding to *pos in tcp_seq_start(),
>> >>> we reuse three pieces of state (st->num, st->bucket, st->sbucket)
>> >>> as follows:
>> >>>  - we check that requested pos >= last seen pos (st->num), the typical case.
>> >>>  - if so, we jump to bucket st->bucket
>> >>>  - to arrive to the right item after beginning of st->bucket, we
>> >>> keep in st->sbucket the position corresponding to the beginning of
>> >>> bucket.
>> >>>
>> >>> (4) Explanation of O( numsockets * hashsize) of old algorithm.
>> >>>
>> >>> tcp_seq_start() is called once for every ~7 lines of netstat output
>> >>> if readsize is 1kb, or once for every ~28 lines if readsize >= 4kb.
>> >>> Since record length of /proc/net/tcp records is 150 bytes, formula for
>> >>> number of calls to tcp_seq_start() is
>> >>>             (numsockets * 150 / min(4096,readsize)).
>> >>> Netstat uses 4kb readsize (newer versions), or 1kb (older versions).
>> >>> Note that speed of old algorithm does not improve above 4kb blocksize.
>> >>>
>> >>> Speed of the new algorithm does not depend on blocksize.
>> >>>
>> >>> Speed of the new algorithm does not perceptibly depend on hashsize (which
>> >>> depends on ramsize). Speed of old algorithm drops with bigger hashsize.
>> >>>
>> >>> (5) Reporting order.
>> >>>
>> >>> Reporting order is exactly same as before if hash does not change underfoot.
>> >>> When hash elements come and go during report, reporting order will be
>> >>> same as that of tcpdiag.
>> >>>
>> >>> Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
>
> Does the netlink interface used by ss command have the problem?

No. It's /proc/net/tcp that has the fixable problem.

Yakov


* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-29 15:45     ` Stephen Hemminger
@ 2009-09-29 17:34       ` Yakov Lerner
  0 siblings, 0 replies; 13+ messages in thread
From: Yakov Lerner @ 2009-09-29 17:34 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, netdev, davem

On Tue, Sep 29, 2009 at 18:45, Stephen Hemminger <shemminger@vyatta.com> wrote:
> On Tue, 29 Sep 2009 11:55:18 +0300
> Yakov Lerner <iler.ml@gmail.com> wrote:
>
>> On Tue, Sep 29, 2009 at 10:56, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> >
>> > Yakov Lerner a écrit :
>> > > Take 2.
>> > >
>> > > "Sharp improvement in performance of /proc/net/tcp when number of
>> > > sockets is large and hashsize is large.
>> > > O(numsock * hashsize) time becomes O(numsock + hashsize). On slow
>> > > processors, speed difference can be x100 and more."
>> > >
>> > > I must say that I'm not fully satisfied with my choice of "st->sbucket"
>> > > for the new preserved index. The better name would be "st->snum".
>> > > Re-using "st->sbucket" saves 4 bytes, and keeps the patch to one sourcefile.
>> > > But "st->sbucket" has different meaning in OPENREQ and LISTEN states;
>> > > this can be confusing.
>> > > Maybe better add "snum" member to struct tcp_iter_state ?
>> > >
>> > > Shall I change subject when sending "take N+1", or keep the old subject ?
>> > >
>> > > Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
>> > > ---
>> > >  net/ipv4/tcp_ipv4.c |   35 +++++++++++++++++++++++++++++++++--
>> > >  1 files changed, 33 insertions(+), 2 deletions(-)
>> > >
>> > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> > > index 7cda24b..e4c4f19 100644
>> > > --- a/net/ipv4/tcp_ipv4.c
>> > > +++ b/net/ipv4/tcp_ipv4.c
>> > > @@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
>> > >               hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
>> > >  }
>> > >
>> > > -static void *established_get_first(struct seq_file *seq)
>> > > +static void *established_get_first_after(struct seq_file *seq, int bucket)
>> > >  {
>> > >       struct tcp_iter_state *st = seq->private;
>> > >       struct net *net = seq_file_net(seq);
>> > >       void *rc = NULL;
>> > >
>> > > -     for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
>> > > +     for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
>> > > +          ++st->bucket) {
>> > >               struct sock *sk;
>> > >               struct hlist_nulls_node *node;
>> > >               struct inet_timewait_sock *tw;
>> > > @@ -2010,6 +2011,8 @@ static void *established_get_first(struct seq_file *seq)
>> > >               if (empty_bucket(st))
>> > >                       continue;
>> > >
>> > > +             st->sbucket = st->num;
>> > > +
>> > >               spin_lock_bh(lock);
>> > >               sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
>> > >                       if (sk->sk_family != st->family ||
>> > > @@ -2036,6 +2039,11 @@ out:
>> > >       return rc;
>> > >  }
>> > >
>> > > +static void *established_get_first(struct seq_file *seq)
>> > > +{
>> > > +     return established_get_first_after(seq, 0);
>> > > +}
>> > > +
>> > >  static void *established_get_next(struct seq_file *seq, void *cur)
>> > >  {
>> > >       struct sock *sk = cur;
>> > > @@ -2064,6 +2072,9 @@ get_tw:
>> > >               while (++st->bucket < tcp_hashinfo.ehash_size &&
>> > >                               empty_bucket(st))
>> > >                       ;
>> > > +
>> > > +             st->sbucket = st->num;
>> > > +
>> > >               if (st->bucket >= tcp_hashinfo.ehash_size)
>> > >                       return NULL;
>> > >
>> > > @@ -2107,6 +2118,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
>> > >
>> > >       if (!rc) {
>> > >               st->state = TCP_SEQ_STATE_ESTABLISHED;
>> > > +             st->sbucket = 0;
>> > >               rc        = established_get_idx(seq, pos);
>> > >       }
>> > >
>> > > @@ -2116,6 +2128,25 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
>> > >  static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
>> > >  {
>> > >       struct tcp_iter_state *st = seq->private;
>> > > +
>> > > +     if (*pos && *pos >= st->sbucket &&
>> > > +         (st->state == TCP_SEQ_STATE_ESTABLISHED ||
>> > > +          st->state == TCP_SEQ_STATE_TIME_WAIT)) {
>> > > +             void *cur;
>> > > +             int nskip;
>> > > +
>> > > +             /* for states estab and tw, st->sbucket is index (*pos) */
>> > > +             /* corresponding to the beginning of bucket st->bucket */
>> > > +
>> > > +             st->num = st->sbucket;
>> > > +             /* jump to st->bucket, then skip (*pos - st->sbucket) items */
>> > > +             st->state = TCP_SEQ_STATE_ESTABLISHED;
>> > > +             cur = established_get_first_after(seq, st->bucket);
>> > > +             for (nskip = *pos - st->num; cur && nskip > 0; --nskip)
>> > > +                     cur = established_get_next(seq, cur);
>> > > +             return cur;
>> > > +     }
>> > > +
>> > >       st->state = TCP_SEQ_STATE_LISTENING;
>> > >       st->num = 0;
>> > >       return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
>> >
>> > Just in case you are working on "take 3" of the patch, there is a fondamental problem.
>> >
>> > All the scalability problems come from the fact that tcp_seq_start()
>> > *has* to rescan all the tables from the begining, because of lseek() capability
>> > on /proc/net/tcp file
>> >
>> > We probably could disable llseek() (on other positions than start of the file),
>> > and rely only on internal state (listening/established hashtable, hash bucket, position in chain)
>> >
>> > I cannot imagine how an application could rely on lseek() on >0 position in this file.
>>
>>
>> I thought  /proc/net/tcp  can  both  be fast and allow lseek;
>> (1) when no lseek was issued since last read
>> (we can detect this), /proc/net/tcp can jump to the
>> last known bucket (common case), vs
>> (2) switch to slow mode (scan from the beginning of hash)
>> when lseek was used , no ?
>
> If you look at fib_hash and fib_trie, they already do the same thing.
>  * fib_hash records last hash chain to avoid overhead of rescan.
>  * fib_trie records last route and does fast lookup to restart from there.

Thanks for the pointer.


* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-29  8:55   ` Yakov Lerner
@ 2009-09-29 15:45     ` Stephen Hemminger
  2009-09-29 17:34       ` Yakov Lerner
  0 siblings, 1 reply; 13+ messages in thread
From: Stephen Hemminger @ 2009-09-29 15:45 UTC (permalink / raw)
  To: Yakov Lerner; +Cc: Eric Dumazet, netdev, davem

On Tue, 29 Sep 2009 11:55:18 +0300
Yakov Lerner <iler.ml@gmail.com> wrote:

> On Tue, Sep 29, 2009 at 10:56, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > Yakov Lerner a écrit :
> > > Take 2.
> > >
> > > "Sharp improvement in performance of /proc/net/tcp when number of
> > > sockets is large and hashsize is large.
> > > O(numsock * hashsize) time becomes O(numsock + hashsize). On slow
> > > processors, speed difference can be x100 and more."
> > >
> > > I must say that I'm not fully satisfied with my choice of "st->sbucket"
> > > for the new preserved index. The better name would be "st->snum".
> > > Re-using "st->sbucket" saves 4 bytes, and keeps the patch to one sourcefile.
> > > But "st->sbucket" has different meaning in OPENREQ and LISTEN states;
> > > this can be confusing.
> > > Maybe better add "snum" member to struct tcp_iter_state ?
> > >
> > > Shall I change subject when sending "take N+1", or keep the old subject ?
> > >
> > > Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
> > > ---
> > >  net/ipv4/tcp_ipv4.c |   35 +++++++++++++++++++++++++++++++++--
> > >  1 files changed, 33 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > > index 7cda24b..e4c4f19 100644
> > > --- a/net/ipv4/tcp_ipv4.c
> > > +++ b/net/ipv4/tcp_ipv4.c
> > > @@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
> > >               hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
> > >  }
> > >
> > > -static void *established_get_first(struct seq_file *seq)
> > > +static void *established_get_first_after(struct seq_file *seq, int bucket)
> > >  {
> > >       struct tcp_iter_state *st = seq->private;
> > >       struct net *net = seq_file_net(seq);
> > >       void *rc = NULL;
> > >
> > > -     for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
> > > +     for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
> > > +          ++st->bucket) {
> > >               struct sock *sk;
> > >               struct hlist_nulls_node *node;
> > >               struct inet_timewait_sock *tw;
> > > @@ -2010,6 +2011,8 @@ static void *established_get_first(struct seq_file *seq)
> > >               if (empty_bucket(st))
> > >                       continue;
> > >
> > > +             st->sbucket = st->num;
> > > +
> > >               spin_lock_bh(lock);
> > >               sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
> > >                       if (sk->sk_family != st->family ||
> > > @@ -2036,6 +2039,11 @@ out:
> > >       return rc;
> > >  }
> > >
> > > +static void *established_get_first(struct seq_file *seq)
> > > +{
> > > +     return established_get_first_after(seq, 0);
> > > +}
> > > +
> > >  static void *established_get_next(struct seq_file *seq, void *cur)
> > >  {
> > >       struct sock *sk = cur;
> > > @@ -2064,6 +2072,9 @@ get_tw:
> > >               while (++st->bucket < tcp_hashinfo.ehash_size &&
> > >                               empty_bucket(st))
> > >                       ;
> > > +
> > > +             st->sbucket = st->num;
> > > +
> > >               if (st->bucket >= tcp_hashinfo.ehash_size)
> > >                       return NULL;
> > >
> > > @@ -2107,6 +2118,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
> > >
> > >       if (!rc) {
> > >               st->state = TCP_SEQ_STATE_ESTABLISHED;
> > > +             st->sbucket = 0;
> > >               rc        = established_get_idx(seq, pos);
> > >       }
> > >
> > > @@ -2116,6 +2128,25 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
> > >  static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
> > >  {
> > >       struct tcp_iter_state *st = seq->private;
> > > +
> > > +     if (*pos && *pos >= st->sbucket &&
> > > +         (st->state == TCP_SEQ_STATE_ESTABLISHED ||
> > > +          st->state == TCP_SEQ_STATE_TIME_WAIT)) {
> > > +             void *cur;
> > > +             int nskip;
> > > +
> > > +             /* for states estab and tw, st->sbucket is index (*pos) */
> > > +             /* corresponding to the beginning of bucket st->bucket */
> > > +
> > > +             st->num = st->sbucket;
> > > +             /* jump to st->bucket, then skip (*pos - st->sbucket) items */
> > > +             st->state = TCP_SEQ_STATE_ESTABLISHED;
> > > +             cur = established_get_first_after(seq, st->bucket);
> > > +             for (nskip = *pos - st->num; cur && nskip > 0; --nskip)
> > > +                     cur = established_get_next(seq, cur);
> > > +             return cur;
> > > +     }
> > > +
> > >       st->state = TCP_SEQ_STATE_LISTENING;
> > >       st->num = 0;
> > >       return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
> >
> > Just in case you are working on "take 3" of the patch, there is a fondamental problem.
> >
> > All the scalability problems come from the fact that tcp_seq_start()
> > *has* to rescan all the tables from the begining, because of lseek() capability
> > on /proc/net/tcp file
> >
> > We probably could disable llseek() (on other positions than start of the file),
> > and rely only on internal state (listening/established hashtable, hash bucket, position in chain)
> >
> > I cannot imagine how an application could rely on lseek() on >0 position in this file.
> 
> 
> I thought  /proc/net/tcp  can  both  be fast and allow lseek;
> (1) when no lseek was issued since last read
> (we can detect this), /proc/net/tcp can jump to the
> last known bucket (common case), vs
> (2) switch to slow mode (scan from the beginning of hash)
> when lseek was used , no ?

If you look at fib_hash and fib_trie, they already do the same thing:
  * fib_hash records the last hash chain to avoid the overhead of a rescan.
  * fib_trie records the last route and does a fast lookup to restart from there.


-- 


* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-29  7:56 ` Eric Dumazet
@ 2009-09-29  8:55   ` Yakov Lerner
  2009-09-29 15:45     ` Stephen Hemminger
  0 siblings, 1 reply; 13+ messages in thread
From: Yakov Lerner @ 2009-09-29  8:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, davem

On Tue, Sep 29, 2009 at 10:56, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Yakov Lerner a écrit :
> > Take 2.
> >
> > "Sharp improvement in performance of /proc/net/tcp when number of
> > sockets is large and hashsize is large.
> > O(numsock * hashsize) time becomes O(numsock + hashsize). On slow
> > processors, speed difference can be x100 and more."
> >
> > I must say that I'm not fully satisfied with my choice of "st->sbucket"
> > for the new preserved index. The better name would be "st->snum".
> > Re-using "st->sbucket" saves 4 bytes, and keeps the patch to one sourcefile.
> > But "st->sbucket" has different meaning in OPENREQ and LISTEN states;
> > this can be confusing.
> > Maybe better add "snum" member to struct tcp_iter_state ?
> >
> > Shall I change subject when sending "take N+1", or keep the old subject ?
> >
> > Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
> > ---
> >  net/ipv4/tcp_ipv4.c |   35 +++++++++++++++++++++++++++++++++--
> >  1 files changed, 33 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > index 7cda24b..e4c4f19 100644
> > --- a/net/ipv4/tcp_ipv4.c
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
> >               hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
> >  }
> >
> > -static void *established_get_first(struct seq_file *seq)
> > +static void *established_get_first_after(struct seq_file *seq, int bucket)
> >  {
> >       struct tcp_iter_state *st = seq->private;
> >       struct net *net = seq_file_net(seq);
> >       void *rc = NULL;
> >
> > -     for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
> > +     for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
> > +          ++st->bucket) {
> >               struct sock *sk;
> >               struct hlist_nulls_node *node;
> >               struct inet_timewait_sock *tw;
> > @@ -2010,6 +2011,8 @@ static void *established_get_first(struct seq_file *seq)
> >               if (empty_bucket(st))
> >                       continue;
> >
> > +             st->sbucket = st->num;
> > +
> >               spin_lock_bh(lock);
> >               sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
> >                       if (sk->sk_family != st->family ||
> > @@ -2036,6 +2039,11 @@ out:
> >       return rc;
> >  }
> >
> > +static void *established_get_first(struct seq_file *seq)
> > +{
> > +     return established_get_first_after(seq, 0);
> > +}
> > +
> >  static void *established_get_next(struct seq_file *seq, void *cur)
> >  {
> >       struct sock *sk = cur;
> > @@ -2064,6 +2072,9 @@ get_tw:
> >               while (++st->bucket < tcp_hashinfo.ehash_size &&
> >                               empty_bucket(st))
> >                       ;
> > +
> > +             st->sbucket = st->num;
> > +
> >               if (st->bucket >= tcp_hashinfo.ehash_size)
> >                       return NULL;
> >
> > @@ -2107,6 +2118,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
> >
> >       if (!rc) {
> >               st->state = TCP_SEQ_STATE_ESTABLISHED;
> > +             st->sbucket = 0;
> >               rc        = established_get_idx(seq, pos);
> >       }
> >
> > @@ -2116,6 +2128,25 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
> >  static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
> >  {
> >       struct tcp_iter_state *st = seq->private;
> > +
> > +     if (*pos && *pos >= st->sbucket &&
> > +         (st->state == TCP_SEQ_STATE_ESTABLISHED ||
> > +          st->state == TCP_SEQ_STATE_TIME_WAIT)) {
> > +             void *cur;
> > +             int nskip;
> > +
> > +             /* for states estab and tw, st->sbucket is index (*pos) */
> > +             /* corresponding to the beginning of bucket st->bucket */
> > +
> > +             st->num = st->sbucket;
> > +             /* jump to st->bucket, then skip (*pos - st->sbucket) items */
> > +             st->state = TCP_SEQ_STATE_ESTABLISHED;
> > +             cur = established_get_first_after(seq, st->bucket);
> > +             for (nskip = *pos - st->num; cur && nskip > 0; --nskip)
> > +                     cur = established_get_next(seq, cur);
> > +             return cur;
> > +     }
> > +
> >       st->state = TCP_SEQ_STATE_LISTENING;
> >       st->num = 0;
> >       return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
>
> Just in case you are working on "take 3" of the patch, there is a fundamental problem.
>
> All the scalability problems come from the fact that tcp_seq_start()
> *has* to rescan all the tables from the beginning, because of the lseek() capability
> on the /proc/net/tcp file.
>
> We probably could disable llseek() (at positions other than the start of the file),
> and rely only on internal state (listening/established hashtable, hash bucket, position in chain).
>
> I cannot imagine how an application could rely on lseek() to a >0 position in this file.


I thought /proc/net/tcp can both be fast and allow lseek:
(1) when no lseek was issued since the last read
(we can detect this), /proc/net/tcp can jump to the
last known bucket (the common case), vs
(2) switch to slow mode (scan from the beginning of the hash)
when lseek was used, no?
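
A minimal sketch of what that fast/slow split could look like (this is not the posted
patch; last_pos and offset are hypothetical tcp_iter_state fields, and the bookkeeping
that would keep them current in tcp_seq_next()/established_get_next() is omitted):

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos &&
	    (st->state == TCP_SEQ_STATE_ESTABLISHED ||
	     st->state == TCP_SEQ_STATE_TIME_WAIT)) {
		int offset = st->offset;

		/* fast path: no lseek() since the last read, so resume at the
		 * saved bucket and walk forward to the saved offset within it */
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_first_after(seq, st->bucket);
		while (rc && offset-- > 0)
			rc = established_get_next(seq, rc);
	} else {
		/* slow path: lseek() moved the file position, rescan from 0 */
		st->state = TCP_SEQ_STATE_LISTENING;
		st->num = 0;
		rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
	}
	st->last_pos = *pos;
	return rc;
}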

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-28 23:01 Yakov Lerner
  2009-09-29  4:39 ` Eric Dumazet
@ 2009-09-29  7:56 ` Eric Dumazet
  2009-09-29  8:55   ` Yakov Lerner
  1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2009-09-29  7:56 UTC (permalink / raw)
  To: Yakov Lerner; +Cc: netdev, davem

Yakov Lerner wrote:
> Take 2. 
> 
> "Sharp improvement in performance of /proc/net/tcp when number of 
> sockets is large and hashsize is large. 
> O(numsock * hashsize) time becomes O(numsock + hashsize). On slow
> processors, speed difference can be x100 and more."
> 
> I must say that I'm not fully satisfied with my choice of "st->sbucket"
> for the new preserved index. A better name would be "st->snum".
> Re-using "st->sbucket" saves 4 bytes and keeps the patch to one source file.
> But "st->sbucket" has a different meaning in the OPENREQ and LISTEN states;
> this can be confusing.
> Maybe it would be better to add an "snum" member to struct tcp_iter_state?
> 
> Shall I change the subject when sending "take N+1", or keep the old subject?
> 
> Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
> ---
>  net/ipv4/tcp_ipv4.c |   35 +++++++++++++++++++++++++++++++++--
>  1 files changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 7cda24b..e4c4f19 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
>  		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
>  }
>  
> -static void *established_get_first(struct seq_file *seq)
> +static void *established_get_first_after(struct seq_file *seq, int bucket)
>  {
>  	struct tcp_iter_state *st = seq->private;
>  	struct net *net = seq_file_net(seq);
>  	void *rc = NULL;
>  
> -	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
> +	for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
> +	     ++st->bucket) {
>  		struct sock *sk;
>  		struct hlist_nulls_node *node;
>  		struct inet_timewait_sock *tw;
> @@ -2010,6 +2011,8 @@ static void *established_get_first(struct seq_file *seq)
>  		if (empty_bucket(st))
>  			continue;
>  
> +		st->sbucket = st->num;
> +
>  		spin_lock_bh(lock);
>  		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
>  			if (sk->sk_family != st->family ||
> @@ -2036,6 +2039,11 @@ out:
>  	return rc;
>  }
>  
> +static void *established_get_first(struct seq_file *seq)
> +{
> +	return established_get_first_after(seq, 0);
> +}
> +
>  static void *established_get_next(struct seq_file *seq, void *cur)
>  {
>  	struct sock *sk = cur;
> @@ -2064,6 +2072,9 @@ get_tw:
>  		while (++st->bucket < tcp_hashinfo.ehash_size &&
>  				empty_bucket(st))
>  			;
> +
> +		st->sbucket = st->num;
> +
>  		if (st->bucket >= tcp_hashinfo.ehash_size)
>  			return NULL;
>  
> @@ -2107,6 +2118,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
>  
>  	if (!rc) {
>  		st->state = TCP_SEQ_STATE_ESTABLISHED;
> +		st->sbucket = 0;
>  		rc	  = established_get_idx(seq, pos);
>  	}
>  
> @@ -2116,6 +2128,25 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
>  static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
>  {
>  	struct tcp_iter_state *st = seq->private;
> +
> +	if (*pos && *pos >= st->sbucket &&
> +	    (st->state == TCP_SEQ_STATE_ESTABLISHED ||
> +	     st->state == TCP_SEQ_STATE_TIME_WAIT)) {
> +		void *cur;
> +		int nskip;
> +
> +		/* for states estab and tw, st->sbucket is index (*pos) */
> +		/* corresponding to the beginning of bucket st->bucket */
> +
> +		st->num = st->sbucket;
> +		/* jump to st->bucket, then skip (*pos - st->sbucket) items */
> +		st->state = TCP_SEQ_STATE_ESTABLISHED;
> +		cur = established_get_first_after(seq, st->bucket);
> +		for (nskip = *pos - st->num; cur && nskip > 0; --nskip)
> +			cur = established_get_next(seq, cur);
> +		return cur;
> +	}
> +
>  	st->state = TCP_SEQ_STATE_LISTENING;
>  	st->num = 0;
>  	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

Just in case you are working on "take 3" of the patch, there is a fundamental problem.

All the scalability problems come from the fact that tcp_seq_start()
*has* to rescan all the tables from the beginning, because of the lseek() capability
on the /proc/net/tcp file.

We probably could disable llseek() (at positions other than the start of the file),
and rely only on internal state (listening/established hashtable, hash bucket, position in chain).

I cannot imagine how an application could rely on lseek() to a >0 position in this file.
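
As an illustration of that restriction (a hypothetical sketch, not code from this
thread), the file's llseek method could accept only a rewind to offset 0 and refuse
everything else, so every read resumes from the iterator's internal state:

static loff_t tcp_seq_llseek(struct file *file, loff_t offset, int whence)
{
	/* allow rewinding to the start and the usual "where am I" query */
	if (offset == 0 && (whence == SEEK_SET || whence == SEEK_CUR))
		return seq_lseek(file, offset, whence);
	/* any other seek target: behave like a pipe */
	return -ESPIPE;
}

seq_lseek() is the stock seq_file helper this would wrap; the sketch would be wired
into the /proc/net/tcp file_operations in place of the default.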



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] /proc/net/tcp, overhead removed
  2009-09-28 23:01 Yakov Lerner
@ 2009-09-29  4:39 ` Eric Dumazet
  2009-09-29  7:56 ` Eric Dumazet
  1 sibling, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2009-09-29  4:39 UTC (permalink / raw)
  To: Yakov Lerner; +Cc: netdev, davem

Yakov Lerner wrote:
> Take 2. 
> 
> "Sharp improvement in performance of /proc/net/tcp when number of 
> sockets is large and hashsize is large. 
> O(numsock * hashsize) time becomes O(numsock + hashsize). On slow
> processors, speed difference can be x100 and more."
> 
> I must say that I'm not fully satisfied with my choice of "st->sbucket"
> for the new preserved index. A better name would be "st->snum".
> Re-using "st->sbucket" saves 4 bytes and keeps the patch to one source file.
> But "st->sbucket" has a different meaning in the OPENREQ and LISTEN states;
> this can be confusing.
> Maybe it would be better to add an "snum" member to struct tcp_iter_state?

You can add more fields to tcp_iter_state if it makes the code easier to read
and faster.

This structure is allocated once at open("/proc/net/tcp") time and could
be any reasonable size. You can add 10 longs to it; it is not a big deal.
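
For instance (a sketch of the suggestion only; the existing members are merely
indicative of the 2.6.31-era layout, and "snum" is the proposed new name, not an
existing field):

struct tcp_iter_state {
	struct seq_net_private	p;
	sa_family_t		family;
	enum tcp_seq_states	state;
	struct sock		*syn_wait_sk;
	int			bucket, sbucket, num, uid;
	loff_t			snum;	/* *pos at the start of st->bucket */
};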

> 
> Shall I change the subject when sending "take N+1", or keep the old subject?

Not a big deal, but keeping the old subject is probably the common way.

[PATCH v2] tcp: Remove /proc/net/tcp O(N*H) overhead

> 
> Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
> ---
>  net/ipv4/tcp_ipv4.c |   35 +++++++++++++++++++++++++++++++++--
>  1 files changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 7cda24b..e4c4f19 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
>  		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
>  }
>  
> -static void *established_get_first(struct seq_file *seq)
> +static void *established_get_first_after(struct seq_file *seq, int bucket)
>  {
>  	struct tcp_iter_state *st = seq->private;
>  	struct net *net = seq_file_net(seq);
>  	void *rc = NULL;
>  
> -	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
> +	for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
> +	     ++st->bucket) {
>  		struct sock *sk;
>  		struct hlist_nulls_node *node;
>  		struct inet_timewait_sock *tw;
> @@ -2010,6 +2011,8 @@ static void *established_get_first(struct seq_file *seq)
>  		if (empty_bucket(st))
>  			continue;
>  

> +		st->sbucket = st->num;
> +

oh this is ugly...

Check tcp_seq_stop() to see why st->sbucket should not change after getting the
lock. Any reader of this will have a heart attack :)

>  		spin_lock_bh(lock);
>  		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
>  			if (sk->sk_family != st->family ||
> @@ -2036,6 +2039,11 @@ out:
>  	return rc;
>  }
>  
> +static void *established_get_first(struct seq_file *seq)
> +{
> +	return established_get_first_after(seq, 0);
> +}
> +
>  static void *established_get_next(struct seq_file *seq, void *cur)
>  {
>  	struct sock *sk = cur;
> @@ -2064,6 +2072,9 @@ get_tw:
>  		while (++st->bucket < tcp_hashinfo.ehash_size &&
>  				empty_bucket(st))
>  			;
> +
> +		st->sbucket = st->num;

same here, this is ugly, even if it happens to work.

> +
>  		if (st->bucket >= tcp_hashinfo.ehash_size)
>  			return NULL;
>  
> @@ -2107,6 +2118,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
>  
>  	if (!rc) {
>  		st->state = TCP_SEQ_STATE_ESTABLISHED;
> +		st->sbucket = 0;
>  		rc	  = established_get_idx(seq, pos);
>  	}
>  
> @@ -2116,6 +2128,25 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
>  static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
>  {
>  	struct tcp_iter_state *st = seq->private;
> +
> +	if (*pos && *pos >= st->sbucket &&
> +	    (st->state == TCP_SEQ_STATE_ESTABLISHED ||
> +	     st->state == TCP_SEQ_STATE_TIME_WAIT)) {
> +		void *cur;
> +		int nskip;
> +
> +		/* for states estab and tw, st->sbucket is index (*pos) */
> +		/* corresponding to the beginning of bucket st->bucket */
> +
> +		st->num = st->sbucket;
ugly...
> +		/* jump to st->bucket, then skip (*pos - st->sbucket) items */
> +		st->state = TCP_SEQ_STATE_ESTABLISHED;
> +		cur = established_get_first_after(seq, st->bucket);
> +		for (nskip = *pos - st->num; cur && nskip > 0; --nskip)
> +			cur = established_get_next(seq, cur);
> +		return cur;
> +	}
> +

I don't think you need this chunk in tcp_seq_start(), and it's also probably buggy,
even if that's hard to prove; we'd need some program that creates TIME_WAIT sockets
in a reproducible form.

Jumping to the right hash slot is more than enough to avoid the O(N*H) problem.

You should try to optimize both the established and listening algorithms, so that
the code is readable and maintainable. In pathological cases, we can also have 10000
sockets in the LISTENING/OPENREQ state.

Maybe we need a first patch to clean up the code, since it's a really complex one,
and then a patch to optimize it?

IMHO the /proc/net/tcp file suffers from bugs before it suffers from a performance problem.

Currently, we can fail to output some live sockets in the dump, if:

Thread A gets a block from /proc/net/tcp and stops in hash slot N, at socket X.
Thread B deletes socket X (which sits before socket Y in the hash chain), or any socket
in a previous hash slot.
Thread A gets the 'next block', missing socket Y and possibly Y+1, Y+2....

-> Thread A doesn't see socket Y as an established/timewait socket.

So I believe being able to store the hash slot could really help performance and also
avoid skipping lots of sockets when thread B destroys sockets 'before our cursor'.

The remaining window would be small, as only deleting sockets in our hash slot could
make us skip live sockets. (Closing this hole is really tricky; inet_diag has the
same problem, I believe.)

The following program establishes 10000 sockets in the listening state and 2*10000 in
the established state. It uses non-random ports so that we can compare before/after patches.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>

int fdlisten[10000];
#define PORT 2222
int main(int argc, char *argv[])
{
        int i;
        struct sockaddr_in sockaddr, locaddr;

        /* create 10000 listening sockets, one per 127.x loopback address */
        for (i = 0; i < 10000; i++) {
                fdlisten[i] = socket(AF_INET, SOCK_STREAM, 0);
                memset(&sockaddr, 0, sizeof(sockaddr));
                sockaddr.sin_family = AF_INET;
                sockaddr.sin_port = htons(PORT);
                sockaddr.sin_addr.s_addr = htonl(0x7f000001 + i);
                if (bind(fdlisten[i], (struct sockaddr *)&sockaddr, sizeof(sockaddr))== -1) {
                        perror("bind");
                        return 1;
                }
                if (listen(fdlisten[i], 1)== -1) {
                        perror("listen");
                        return 1;
                }
        }
        /* child: accept connections on the listening sockets forever */
        if (fork() == 0) {
                i = 0;
                while (1) {
                        socklen_t len = sizeof(sockaddr);
                        int newfd = accept(fdlisten[i++], (struct sockaddr *)&sockaddr, &len);

                        if (newfd == -1)
                                perror("accept");
                        if (i == 10000)
                                i = 0;
                }
        }
        /* parent: close its copy of each listen fd, then connect a client to it */
        for (i = 0; i < 10000; i++) {
                int fd;

                close(fdlisten[i]);
                fd = socket(AF_INET, SOCK_STREAM, 0);
                if (fd == -1) {
                        perror("socket");
                        break;
                }
                memset(&locaddr, 0, sizeof(locaddr));
                locaddr.sin_family = AF_INET;
                locaddr.sin_port = htons(i + 20000);
                locaddr.sin_addr.s_addr = htonl(0x7f000001 + i);
                bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));

                memset(&sockaddr, 0, sizeof(sockaddr));
                sockaddr.sin_family = AF_INET;
                sockaddr.sin_port = htons(PORT);
                sockaddr.sin_addr.s_addr = htonl(0x7f000001 + i);
                connect(fd, (struct sockaddr *)&sockaddr, sizeof(sockaddr));
        }
        pause();
        return 0;
}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] /proc/net/tcp, overhead removed
@ 2009-09-28 23:01 Yakov Lerner
  2009-09-29  4:39 ` Eric Dumazet
  2009-09-29  7:56 ` Eric Dumazet
  0 siblings, 2 replies; 13+ messages in thread
From: Yakov Lerner @ 2009-09-28 23:01 UTC (permalink / raw)
  To: netdev, eric.dumazet, davem; +Cc: Yakov Lerner

Take 2. 

"Sharp improvement in performance of /proc/net/tcp when number of 
sockets is large and hashsize is large. 
O(numsock * hashsize) time becomes O(numsock + hashsize). On slow
processors, speed difference can be x100 and more."

I must say that I'm not fully satisfied with my choice of "st->sbucket"
for the new preserved index. A better name would be "st->snum".
Re-using "st->sbucket" saves 4 bytes and keeps the patch to one source file.
But "st->sbucket" has a different meaning in the OPENREQ and LISTEN states;
this can be confusing.
Maybe it would be better to add an "snum" member to struct tcp_iter_state?

Shall I change the subject when sending "take N+1", or keep the old subject?

Signed-off-by: Yakov Lerner <iler.ml@gmail.com>
---
 net/ipv4/tcp_ipv4.c |   35 +++++++++++++++++++++++++++++++++--
 1 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b..e4c4f19 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1994,13 +1994,14 @@ static inline int empty_bucket(struct tcp_iter_state *st)
 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
 }
 
-static void *established_get_first(struct seq_file *seq)
+static void *established_get_first_after(struct seq_file *seq, int bucket)
 {
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	void *rc = NULL;
 
-	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
+	for (st->bucket = bucket; st->bucket < tcp_hashinfo.ehash_size;
+	     ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
 		struct inet_timewait_sock *tw;
@@ -2010,6 +2011,8 @@ static void *established_get_first(struct seq_file *seq)
 		if (empty_bucket(st))
 			continue;
 
+		st->sbucket = st->num;
+
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family ||
@@ -2036,6 +2039,11 @@ out:
 	return rc;
 }
 
+static void *established_get_first(struct seq_file *seq)
+{
+	return established_get_first_after(seq, 0);
+}
+
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
 	struct sock *sk = cur;
@@ -2064,6 +2072,9 @@ get_tw:
 		while (++st->bucket < tcp_hashinfo.ehash_size &&
 				empty_bucket(st))
 			;
+
+		st->sbucket = st->num;
+
 		if (st->bucket >= tcp_hashinfo.ehash_size)
 			return NULL;
 
@@ -2107,6 +2118,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
 
 	if (!rc) {
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		st->sbucket = 0;
 		rc	  = established_get_idx(seq, pos);
 	}
 
@@ -2116,6 +2128,25 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct tcp_iter_state *st = seq->private;
+
+	if (*pos && *pos >= st->sbucket &&
+	    (st->state == TCP_SEQ_STATE_ESTABLISHED ||
+	     st->state == TCP_SEQ_STATE_TIME_WAIT)) {
+		void *cur;
+		int nskip;
+
+		/* for states estab and tw, st->sbucket is index (*pos) */
+		/* corresponding to the beginning of bucket st->bucket */
+
+		st->num = st->sbucket;
+		/* jump to st->bucket, then skip (*pos - st->sbucket) items */
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		cur = established_get_first_after(seq, st->bucket);
+		for (nskip = *pos - st->num; cur && nskip > 0; --nskip)
+			cur = established_get_next(seq, cur);
+		return cur;
+	}
+
 	st->state = TCP_SEQ_STATE_LISTENING;
 	st->num = 0;
 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
-- 
1.6.5.rc2
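
As a hypothetical worked example of the resume path above (numbers invented for
illustration): suppose the previous read chunk ended at *pos == 5000 with st->bucket
== 123 and st->sbucket == 4980, i.e. 4980 was the *pos of the first entry in bucket
123. The next tcp_seq_start() then jumps straight to bucket 123 via
established_get_first_after() and skips 5000 - 4980 = 20 entries, instead of walking
all 5000 preceding entries from bucket 0.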


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2009-09-29 17:34 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-26 21:31 [PATCH] /proc/net/tcp, overhead removed Yakov Lerner
2009-09-26 21:31 ` Yakov Lerner
2009-09-27  9:53 ` Eric Dumazet
2009-09-28 22:10   ` Yakov Lerner
2009-09-28 22:20     ` Eric Dumazet
2009-09-28 23:24       ` Stephen Hemminger
2009-09-29  7:43         ` Yakov Lerner
2009-09-28 23:01 Yakov Lerner
2009-09-29  4:39 ` Eric Dumazet
2009-09-29  7:56 ` Eric Dumazet
2009-09-29  8:55   ` Yakov Lerner
2009-09-29 15:45     ` Stephen Hemminger
2009-09-29 17:34       ` Yakov Lerner
