All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
@ 2010-04-23  8:12 Changli Gao
  2010-04-23  9:27 ` Eric Dumazet
  2010-04-23 10:26 ` Eric Dumazet
  0 siblings, 2 replies; 108+ messages in thread
From: Changli Gao @ 2010-04-23  8:12 UTC (permalink / raw)
  To: David S. Miller
  Cc: jamal, Tom Herbert, Eric Dumazet, Stephen Hemminger, netdev, Changli Gao

batch skb dequeueing from softnet input_pkt_queue.

batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
contention when RPS is enabled.

Note: in the worst case, the number of packets in a softnet_data may be double
of netdev_max_backlog.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 include/linux/netdevice.h |    6 +++--
 net/core/dev.c            |   50 +++++++++++++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c5ed5f..6ae9f2b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1387,6 +1387,7 @@ struct softnet_data {
 	struct Qdisc		*output_queue;
 	struct list_head	poll_list;
 	struct sk_buff		*completion_queue;
+	struct sk_buff_head	process_queue;
 
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
@@ -1401,10 +1402,11 @@ struct softnet_data {
 	struct napi_struct	backlog;
 };
 
-static inline void input_queue_head_incr(struct softnet_data *sd)
+static inline void input_queue_head_add(struct softnet_data *sd,
+					unsigned int len)
 {
 #ifdef CONFIG_RPS
-	sd->input_queue_head++;
+	sd->input_queue_head += len;
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index a4a7c36..c1585f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	__get_cpu_var(netdev_rx_stat).total++;
 
 	rps_lock(sd);
-	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (sd->input_pkt_queue.qlen) {
+	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
 #ifdef CONFIG_RPS
-			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
+			*qtail = sd->input_queue_head +
+					skb_queue_len(&sd->input_pkt_queue);
 #endif
 			rps_unlock(sd);
 			local_irq_restore(flags);
@@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg)
 	struct sk_buff *skb, *tmp;
 
 	rps_lock(sd);
-	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
+	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
 			kfree_skb(skb);
-			input_queue_head_incr(sd);
+			input_queue_head_add(sd, 1);
 		}
+	}
 	rps_unlock(sd);
+
+	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+		if (skb->dev == dev) {
+			__skb_unlink(skb, &sd->process_queue);
+			kfree_skb(skb);
+		}
+	}
 }
 
 static int napi_gro_complete(struct sk_buff *skb)
@@ -3286,24 +3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	}
 #endif
 	napi->weight = weight_p;
-	do {
+	local_irq_disable();
+	while (1) {
 		struct sk_buff *skb;
 
-		local_irq_disable();
+		while ((skb = __skb_dequeue(&sd->process_queue))) {
+			local_irq_enable();
+			__netif_receive_skb(skb);
+			if (++work >= quota)
+				return work;
+			local_irq_disable();
+		}
+
 		rps_lock(sd);
-		skb = __skb_dequeue(&sd->input_pkt_queue);
-		if (!skb) {
+		input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
+		skb_queue_splice_tail_init(&sd->input_pkt_queue,
+					   &sd->process_queue);
+		if (skb_queue_empty(&sd->process_queue)) {
 			__napi_complete(napi);
 			rps_unlock(sd);
-			local_irq_enable();
 			break;
 		}
-		input_queue_head_incr(sd);
 		rps_unlock(sd);
-		local_irq_enable();
-
-		__netif_receive_skb(skb);
-	} while (++work < quota);
+	}
+	local_irq_enable();
 
 	return work;
 }
@@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		input_queue_head_add(oldsd, 1);
 	}
+	while ((skb = __skb_dequeue(&oldsd->process_queue)))
+		netif_rx(skb);
 
 	return NOTIFY_OK;
 }
@@ -5851,6 +5868,7 @@ static int __init net_dev_init(void)
 		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
 		skb_queue_head_init(&sd->input_pkt_queue);
+		skb_queue_head_init(&sd->process_queue);
 		sd->completion_queue = NULL;
 		INIT_LIST_HEAD(&sd->poll_list);
 

^ permalink raw reply related	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-23  8:12 [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue Changli Gao
@ 2010-04-23  9:27 ` Eric Dumazet
  2010-04-23 22:02   ` jamal
  2010-04-23 10:26 ` Eric Dumazet
  1 sibling, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-23  9:27 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, jamal, Tom Herbert, Stephen Hemminger, netdev

Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit :
> batch skb dequeueing from softnet input_pkt_queue.
> 
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention when RPS is enabled.
> 
> Note: in the worst case, the number of packets in a softnet_data may be double
> of netdev_max_backlog.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>

Very good patch Changli, thanks !

Let's see how it improves things for Jamal's benchmarks ;)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

> ----
>  include/linux/netdevice.h |    6 +++--
>  net/core/dev.c            |   50 +++++++++++++++++++++++++++++++---------------
>  2 files changed, 38 insertions(+), 18 deletions(-)
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3c5ed5f..6ae9f2b 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1387,6 +1387,7 @@ struct softnet_data {
>  	struct Qdisc		*output_queue;
>  	struct list_head	poll_list;
>  	struct sk_buff		*completion_queue;
> +	struct sk_buff_head	process_queue;
>  
>  #ifdef CONFIG_RPS
>  	struct softnet_data	*rps_ipi_list;
> @@ -1401,10 +1402,11 @@ struct softnet_data {
>  	struct napi_struct	backlog;
>  };
>  
> -static inline void input_queue_head_incr(struct softnet_data *sd)
> +static inline void input_queue_head_add(struct softnet_data *sd,
> +					unsigned int len)
>  {
>  #ifdef CONFIG_RPS
> -	sd->input_queue_head++;
> +	sd->input_queue_head += len;
>  #endif
>  }
>  
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a4a7c36..c1585f9 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
>  	__get_cpu_var(netdev_rx_stat).total++;
>  
>  	rps_lock(sd);
> -	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
> -		if (sd->input_pkt_queue.qlen) {
> +	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
> +		if (skb_queue_len(&sd->input_pkt_queue)) {
>  enqueue:
>  			__skb_queue_tail(&sd->input_pkt_queue, skb);
>  #ifdef CONFIG_RPS
> -			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
> +			*qtail = sd->input_queue_head +
> +					skb_queue_len(&sd->input_pkt_queue);
>  #endif
>  			rps_unlock(sd);
>  			local_irq_restore(flags);
> @@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg)
>  	struct sk_buff *skb, *tmp;
>  
>  	rps_lock(sd);
> -	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
> +	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
>  		if (skb->dev == dev) {
>  			__skb_unlink(skb, &sd->input_pkt_queue);
>  			kfree_skb(skb);
> -			input_queue_head_incr(sd);
> +			input_queue_head_add(sd, 1);
>  		}
> +	}
>  	rps_unlock(sd);
> +
> +	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
> +		if (skb->dev == dev) {
> +			__skb_unlink(skb, &sd->process_queue);
> +			kfree_skb(skb);
> +		}
> +	}
>  }
>  
>  static int napi_gro_complete(struct sk_buff *skb)
> @@ -3286,24 +3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota)
>  	}
>  #endif
>  	napi->weight = weight_p;
> -	do {
> +	local_irq_disable();
> +	while (1) {
>  		struct sk_buff *skb;
>  
> -		local_irq_disable();
> +		while ((skb = __skb_dequeue(&sd->process_queue))) {
> +			local_irq_enable();
> +			__netif_receive_skb(skb);
> +			if (++work >= quota)
> +				return work;
> +			local_irq_disable();
> +		}
> +
>  		rps_lock(sd);
> -		skb = __skb_dequeue(&sd->input_pkt_queue);
> -		if (!skb) {
> +		input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
> +		skb_queue_splice_tail_init(&sd->input_pkt_queue,
> +					   &sd->process_queue);
> +		if (skb_queue_empty(&sd->process_queue)) {
>  			__napi_complete(napi);
>  			rps_unlock(sd);
> -			local_irq_enable();
>  			break;
>  		}
> -		input_queue_head_incr(sd);
>  		rps_unlock(sd);
> -		local_irq_enable();
> -
> -		__netif_receive_skb(skb);
> -	} while (++work < quota);
> +	}
> +	local_irq_enable();
>  
>  	return work;
>  }
> @@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
>  	/* Process offline CPU's input_pkt_queue */
>  	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
>  		netif_rx(skb);
> -		input_queue_head_incr(oldsd);
> +		input_queue_head_add(oldsd, 1);
>  	}
> +	while ((skb = __skb_dequeue(&oldsd->process_queue)))
> +		netif_rx(skb);
>  
>  	return NOTIFY_OK;
>  }
> @@ -5851,6 +5868,7 @@ static int __init net_dev_init(void)
>  		struct softnet_data *sd = &per_cpu(softnet_data, i);
>  
>  		skb_queue_head_init(&sd->input_pkt_queue);
> +		skb_queue_head_init(&sd->process_queue);
>  		sd->completion_queue = NULL;
>  		INIT_LIST_HEAD(&sd->poll_list);
>  
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-23  8:12 [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue Changli Gao
  2010-04-23  9:27 ` Eric Dumazet
@ 2010-04-23 10:26 ` Eric Dumazet
  2010-04-27 22:08   ` David Miller
  1 sibling, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-23 10:26 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, jamal, Tom Herbert, Stephen Hemminger, netdev

Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit :
> batch skb dequeueing from softnet input_pkt_queue.
> 
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention when RPS is enabled.
> 
> Note: in the worst case, the number of packets in a softnet_data may be double
> of netdev_max_backlog.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----

Oops, reading it again, I found process_backlog() was still taking the
lock twice, if only one packet is waiting in input_pkt_queue.

Possible fix, on top of your patch :

diff --git a/net/core/dev.c b/net/core/dev.c
index 0eddd23..0569be7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3296,8 +3296,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
 #endif
 	napi->weight = weight_p;
 	local_irq_disable();
-	while (1) {
+	while (work < quota) {
 		struct sk_buff *skb;
+		unsigned int qlen;
 
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
 			local_irq_enable();
@@ -3308,13 +3309,15 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		}
 
 		rps_lock(sd);
-		input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
-		skb_queue_splice_tail_init(&sd->input_pkt_queue,
-					   &sd->process_queue);
-		if (skb_queue_empty(&sd->process_queue)) {
+		qlen = skb_queue_len(&sd->input_pkt_queue);
+		if (qlen) {
+			input_queue_head_add(sd, qlen);
+			skb_queue_splice_tail_init(&sd->input_pkt_queue,
+						   &sd->process_queue);
+		}
+		if (qlen < quota - work) {
 			__napi_complete(napi);
-			rps_unlock(sd);
-			break;
+			quota = work + qlen;
 		}
 		rps_unlock(sd);
 	}



^ permalink raw reply related	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-23  9:27 ` Eric Dumazet
@ 2010-04-23 22:02   ` jamal
  2010-04-24 14:10     ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-23 22:02 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev

On Fri, 2010-04-23 at 11:27 +0200, Eric Dumazet wrote:

> 
> Let's see how it improves things for Jamal's benchmarks ;)


I've done a setup with the last patch from Changli + net-next - I will
post test results tomorrow AM.

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-23 22:02   ` jamal
@ 2010-04-24 14:10     ` jamal
  2010-04-26 14:03       ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-24 14:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev

[-- Attachment #1: Type: text/plain, Size: 203 bytes --]

On Fri, 2010-04-23 at 18:02 -0400, jamal wrote:

> I've done a setup with the last patch from Changli + net-next - I will
> post test results tomorrow AM.

ok, annotated results attached. 

cheers,
jamal

[-- Attachment #2: summary-apr23.txt --]
[-- Type: text/plain, Size: 45513 bytes --]

		sink    cpu all     cpuint       cpuapp
nn-standalone 	93.95%   84.5%        99.8%        79.8%
nn-rps          96.41%   85.4%        95.5%        82.5%
nn-cl           97.29%   84.0%        99.9%        79.6%
nn-cl-rps       97.76%   86.5%        96.5%        84.8%

nn-standalone: Basic net-next from Apr23
nn-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0
nn-cl: Basic net-next from Apr23 + Changli patch
nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee,irq aff cpu0
sink: the amount of traffic the system was able to sink in.
cpu all: avg % system cpu consumed in test
cpuint: avg %cpu consumed by the cpu where interrupts happened
cpuapp: avg %cpu consumed by a sample cpu which did app processing

Testing was as previously explained..
I repeated each test 4-5 times and took averages..

It seems the non-rps case has improved dramatically since the last 
net-next I tested. The rps case has also improved but the gap between 
rps and non-rps is smaller.
[There are just too many variables for me to pinpoint
to one item as being the contributor. For example sky2 driver may
have become worse (consumes more cycles) but I can't quantify it yet
(I just see sky2_rx_submit showing up higher in profiles than before).
Also call_function_single_interrupt shows up prominently on application
processing CPUs but improved by Changli's changes].
After doing the math, I don't trust my results after applying Changli's patch. 
It seems both the rps and non-rps case have gotten better (and I don't 
see Changli's contribution to non-rps). It also seems that the gap between 
rps and non-rps is non-existent now. In other words, there is no benefit to
using rps (it consumes more cpu for the same throughput). So it is likely 
that I need to repeat these tests; maybe I did something wrong in my setup...

And here are the profiles:
--------------------------

cpu0 always received all the interrupts regardless of the tests.
cpu1, 7 etc were processing apps..
I could not spot much difference between before and after Changli's


I: Test setup : nn-standalone: Basic net-next from Apr23

All cpus

-------------------------------------------------------------------------------
   PerfTop:    3784 irqs/sec  kernel:84.2% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             3254.00 10.3% sky2_poll                   [sky2]  
             1853.00  5.9% _raw_spin_lock_irqsave      [kernel]
              872.00  2.8% fget                        [kernel]
              870.00  2.8% copy_user_generic_string    [kernel]
              819.00  2.6% _raw_spin_unlock_irqrestore [kernel]
              729.00  2.3% sys_epoll_ctl               [kernel]
              701.00  2.2% datagram_poll               [kernel]
              615.00  2.0% udp_recvmsg                 [kernel]
              602.00  1.9% _raw_spin_lock_bh           [kernel]
              595.00  1.9% system_call                 [kernel]
              592.00  1.9% kmem_cache_free             [kernel]
              574.00  1.8% schedule                    [kernel]
              568.00  1.8% _raw_spin_lock              [kernel]


-------------------------------------------------------------------------------
   PerfTop:    3574 irqs/sec  kernel:85.1% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             5023.00 10.9% sky2_poll                   [sky2]  
             2762.00  6.0% _raw_spin_lock_irqsave      [kernel]
             1319.00  2.9% copy_user_generic_string    [kernel]
             1306.00  2.8% fget                        [kernel]
             1198.00  2.6% _raw_spin_unlock_irqrestore [kernel]
             1071.00  2.3% datagram_poll               [kernel]
             1061.00  2.3% sys_epoll_ctl               [kernel]
              927.00  2.0% _raw_spin_lock_bh           [kernel]
              917.00  2.0% system_call                 [kernel]
              901.00  1.9% udp_recvmsg                 [kernel]
              895.00  1.9% kmem_cache_free             [kernel]
              819.00  1.8% _raw_spin_lock              [kernel]
              802.00  1.7% schedule                    [kernel]
              774.00  1.7% sys_epoll_wait              [kernel]
              720.00  1.6% kmem_cache_alloc            [kernel]


-------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

              751.00 36.1% sky2_poll              [sky2]  
              108.00  5.2% __udp4_lib_lookup      [kernel]
               95.00  4.6% ip_route_input         [kernel]
               83.00  4.0% _raw_spin_lock         [kernel]
               79.00  3.8% _raw_spin_lock_irqsave [kernel]
               77.00  3.7% __netif_receive_skb    [kernel]
               77.00  3.7% __alloc_skb            [kernel]
               66.00  3.2% ip_rcv                 [kernel]
               60.00  2.9% __udp4_lib_rcv         [kernel]
               54.00  2.6% sock_queue_rcv_skb     [kernel]
               45.00  2.2% sky2_rx_submit         [sky2]  
               42.00  2.0% __wake_up_common       [kernel]
               40.00  1.9% __kmalloc              [kernel]
               39.00  1.9% sock_def_readable      [kernel]
               30.00  1.4% ep_poll_callback       [kernel]


-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:99.8% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

             3511.00 36.7% sky2_poll              [sky2]  
              519.00  5.4% __udp4_lib_lookup      [kernel]
              431.00  4.5% ip_route_input         [kernel]
              353.00  3.7% _raw_spin_lock_irqsave [kernel]
              351.00  3.7% __alloc_skb            [kernel]
              338.00  3.5% __netif_receive_skb    [kernel]
              337.00  3.5% _raw_spin_lock         [kernel]
              307.00  3.2% ip_rcv                 [kernel]
              264.00  2.8% sky2_rx_submit         [sky2]  
              254.00  2.7% sock_queue_rcv_skb     [kernel]
              246.00  2.6% __udp4_lib_rcv         [kernel]
              206.00  2.2% sock_def_readable      [kernel]
              177.00  1.9% __wake_up_common       [kernel]
              168.00  1.8% __kmalloc              [kernel]


-------------------------------------------------------------------------------
   PerfTop:     908 irqs/sec  kernel:80.0% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

              177.00  6.7% _raw_spin_lock_irqsave      [kernel]
              120.00  4.5% copy_user_generic_string    [kernel]
              110.00  4.2% fget                        [kernel]
              108.00  4.1% datagram_poll               [kernel]
               98.00  3.7% _raw_spin_lock_bh           [kernel]
               91.00  3.4% sys_epoll_ctl               [kernel]
               89.00  3.4% kmem_cache_free             [kernel]
               77.00  2.9% system_call                 [kernel]
               76.00  2.9% schedule                    [kernel]
               76.00  2.9% _raw_spin_unlock_irqrestore [kernel]
               63.00  2.4% fput                        [kernel]
               61.00  2.3% sys_epoll_wait              [kernel]
               61.00  2.3% udp_recvmsg                 [kernel]
               49.00  1.8% process_recv                mcpudp  


-------------------------------------------------------------------------------
   PerfTop:     815 irqs/sec  kernel:79.8% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ _________________

              491.00  8.0% _raw_spin_lock_irqsave      [kernel.kallsyms]
              285.00  4.7% copy_user_generic_string    [kernel.kallsyms]
              252.00  4.1% fget                        [kernel.kallsyms]
              215.00  3.5% datagram_poll               [kernel.kallsyms]
              206.00  3.4% _raw_spin_unlock_irqrestore [kernel.kallsyms]
              204.00  3.3% sys_epoll_ctl               [kernel.kallsyms]
              196.00  3.2% _raw_spin_lock_bh           [kernel.kallsyms]
              184.00  3.0% udp_recvmsg                 [kernel.kallsyms]
              184.00  3.0% kmem_cache_free             [kernel.kallsyms]
              180.00  2.9% system_call                 [kernel.kallsyms]
              168.00  2.7% sys_epoll_wait              [kernel.kallsyms]
              159.00  2.6% schedule                    [kernel.kallsyms]
              144.00  2.4% fput                        [kernel.kallsyms]


II: Test setup 
nn-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0

-------------------------------------------------------------------------------
   PerfTop:    3558 irqs/sec  kernel:85.0% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ________

             3519.00 15.9% sky2_poll                      [sky2]  
              865.00  3.9% _raw_spin_lock_irqsave         [kernel]
              568.00  2.6% _raw_spin_unlock_irqrestore    [kernel]
              526.00  2.4% sky2_intr                      [sky2]  
              493.00  2.2% __netif_receive_skb            [kernel]
              477.00  2.2% _raw_spin_lock                 [kernel]
              470.00  2.1% ip_rcv                         [kernel]
              456.00  2.1% fget                           [kernel]
              447.00  2.0% sys_epoll_ctl                  [kernel]
              420.00  1.9% copy_user_generic_string       [kernel]
              387.00  1.8% ip_route_input                 [kernel]
              359.00  1.6% system_call                    [kernel]
              334.00  1.5% kmem_cache_free                [kernel]
              310.00  1.4% kmem_cache_alloc               [kernel]
              302.00  1.4% call_function_single_interrupt [kernel]


-------------------------------------------------------------------------------
   PerfTop:    3546 irqs/sec  kernel:85.8% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ________

             6592.00 16.2% sky2_poll                      [sky2]  
             1540.00  3.8% _raw_spin_lock_irqsave         [kernel]
             1014.00  2.5% _raw_spin_unlock_irqrestore    [kernel]
              885.00  2.2% fget                           [kernel]
              881.00  2.2% _raw_spin_lock                 [kernel]
              880.00  2.2% sky2_intr                      [sky2]  
              872.00  2.1% __netif_receive_skb            [kernel]
              858.00  2.1% ip_rcv                         [kernel]
              802.00  2.0% sys_epoll_ctl                  [kernel]
              710.00  1.7% copy_user_generic_string       [kernel]
              696.00  1.7% system_call                    [kernel]
              692.00  1.7% ip_route_input                 [kernel]
              634.00  1.6% schedule                       [kernel]
              618.00  1.5% kmem_cache_free                [kernel]
              605.00  1.5% call_function_single_interrupt [kernel]


cpu0

-------------------------------------------------------------------------------
   PerfTop:     971 irqs/sec  kernel:96.5% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             4222.00 58.2% sky2_poll                   [sky2]  
              668.00  9.2% sky2_intr                   [sky2]  
              228.00  3.1% __alloc_skb                 [kernel]
              183.00  2.5% get_rps_cpu                 [kernel]
              138.00  1.9% sky2_rx_submit              [sky2]  
              124.00  1.7% enqueue_to_backlog          [kernel]
              119.00  1.6% __kmalloc                   [kernel]
              103.00  1.4% kmem_cache_alloc            [kernel]
               91.00  1.3% _raw_spin_lock              [kernel]
               90.00  1.2% _raw_spin_lock_irqsave      [kernel]
               73.00  1.0% swiotlb_sync_single         [kernel]
               72.00  1.0% irq_entries_start           [kernel]
               55.00  0.8% copy_user_generic_string    [kernel]
               53.00  0.7% _raw_spin_unlock_irqrestore [kernel]
               48.00  0.7% fget                        [kernel]


-------------------------------------------------------------------------------
   PerfTop:     998 irqs/sec  kernel:94.8% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             6745.00 58.5% sky2_poll                   [sky2]  
              831.00  7.2% sky2_intr                   [sky2]  
              352.00  3.1% __alloc_skb                 [kernel]
              281.00  2.4% get_rps_cpu                 [kernel]
              226.00  2.0% sky2_rx_submit              [sky2]  
              186.00  1.6% __kmalloc                   [kernel]
              181.00  1.6% enqueue_to_backlog          [kernel]
              173.00  1.5% _raw_spin_lock_irqsave      [kernel]
              166.00  1.4% kmem_cache_alloc            [kernel]
              162.00  1.4% _raw_spin_lock              [kernel]
               99.00  0.9% swiotlb_sync_single         [kernel]
               98.00  0.9% irq_entries_start           [kernel]
               94.00  0.8% fget                        [kernel]
               92.00  0.8% _raw_spin_unlock_irqrestore [kernel]
               80.00  0.7% system_call                 [kernel]


cpu1


-------------------------------------------------------------------------------
   PerfTop:     724 irqs/sec  kernel:82.0% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _________________

              204.00  5.3% _raw_spin_lock_irqsave         [kernel.kallsyms]
              153.00  4.0% _raw_spin_unlock_irqrestore    [kernel.kallsyms]
              147.00  3.8% call_function_single_interrupt [kernel.kallsyms]
              139.00  3.6% __netif_receive_skb            [kernel.kallsyms]
              135.00  3.5% sys_epoll_ctl                  [kernel.kallsyms]
              132.00  3.4% ip_rcv                         [kernel.kallsyms]
              129.00  3.3% fget                           [kernel.kallsyms]
              128.00  3.3% _raw_spin_lock                 [kernel.kallsyms]
              122.00  3.2% system_call                    [kernel.kallsyms]
              118.00  3.1% ip_route_input                 [kernel.kallsyms]
              109.00  2.8% kmem_cache_free                [kernel.kallsyms]
              108.00  2.8% copy_user_generic_string       [kernel.kallsyms]
               90.00  2.3% schedule                       [kernel.kallsyms]
               85.00  2.2% fput                           [kernel.kallsyms]



-------------------------------------------------------------------------------
   PerfTop:     763 irqs/sec  kernel:83.0% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _________________

              428.00  6.2% _raw_spin_lock_irqsave         [kernel.kallsyms]
              302.00  4.4% _raw_spin_unlock_irqrestore    [kernel.kallsyms]
              269.00  3.9% __netif_receive_skb            [kernel.kallsyms]
              258.00  3.7% call_function_single_interrupt [kernel.kallsyms]
              254.00  3.7% fget                           [kernel.kallsyms]
              238.00  3.4% ip_rcv                         [kernel.kallsyms]
              230.00  3.3% sys_epoll_ctl                  [kernel.kallsyms]
              222.00  3.2% _raw_spin_lock                 [kernel.kallsyms]
              220.00  3.2% ip_route_input                 [kernel.kallsyms]
              197.00  2.9% system_call                    [kernel.kallsyms]
              189.00  2.7% kmem_cache_free                [kernel.kallsyms]
              184.00  2.7% copy_user_generic_string       [kernel.kallsyms]
              144.00  2.1% ep_remove                      [kernel.kallsyms]
              140.00  2.0% schedule                       [kernel.kallsyms]


-------------------------------------------------------------------------------
   PerfTop:     546 irqs/sec  kernel:83.3% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _________________

              346.00  5.7% _raw_spin_lock_irqsave         [kernel.kallsyms]
              275.00  4.6% _raw_spin_unlock_irqrestore    [kernel.kallsyms]
              238.00  3.9% call_function_single_interrupt [kernel.kallsyms]
              228.00  3.8% fget                           [kernel.kallsyms]
              222.00  3.7% __netif_receive_skb            [kernel.kallsyms]
              219.00  3.6% sys_epoll_ctl                  [kernel.kallsyms]
              209.00  3.5% _raw_spin_lock                 [kernel.kallsyms]
              205.00  3.4% ip_rcv                         [kernel.kallsyms]
              199.00  3.3% ip_route_input                 [kernel.kallsyms]
              173.00  2.9% system_call                    [kernel.kallsyms]
              170.00  2.8% copy_user_generic_string       [kernel.kallsyms]
              167.00  2.8% kmem_cache_free                [kernel.kallsyms]
              127.00  2.1% ep_remove                      [kernel.kallsyms]
               123.00  2.0% dst_release                    [kernel.kallsyms]



III: Test setup 
nn-cl: Basic net-next from Apr23 + Changli patch

-------------------------------------------------------------------------------
   PerfTop:    3789 irqs/sec  kernel:84.1% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

             3514.00 10.2% sky2_poll                   [sky2]              
             1862.00  5.4% _raw_spin_lock_irqsave      [kernel]            
             1274.00  3.7% system_call                 [kernel]            
              926.00  2.7% fget                        [kernel]            
              872.00  2.5% _raw_spin_unlock_irqrestore [kernel]            
              862.00  2.5% copy_user_generic_string    [kernel]            
              766.00  2.2% sys_epoll_ctl               [kernel]            
              765.00  2.2% datagram_poll               [kernel]            
              671.00  2.0% _raw_spin_lock_bh           [kernel]            
              668.00  1.9% kmem_cache_free             [kernel]            
              602.00  1.8% udp_recvmsg                 [kernel]            
              586.00  1.7% _raw_spin_lock              [kernel]            
              585.00  1.7% vread_tsc                   [kernel].vsyscall_fn



-------------------------------------------------------------------------------
   PerfTop:    3794 irqs/sec  kernel:83.6% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

             4756.00  9.8% sky2_poll                   [sky2]              
             2742.00  5.7% _raw_spin_lock_irqsave      [kernel]            
             1826.00  3.8% system_call                 [kernel]            
             1285.00  2.7% fget                        [kernel]            
             1284.00  2.7% copy_user_generic_string    [kernel]            
             1235.00  2.6% _raw_spin_unlock_irqrestore [kernel]            
             1096.00  2.3% sys_epoll_ctl               [kernel]            
             1071.00  2.2% datagram_poll               [kernel]            
              954.00  2.0% kmem_cache_free             [kernel]            
              925.00  1.9% _raw_spin_lock_bh           [kernel]            
              888.00  1.8% vread_tsc                   [kernel].vsyscall_fn
              880.00  1.8% udp_recvmsg                 [kernel]            
              793.00  1.6% _raw_spin_lock              [kernel]            
              790.00  1.6% schedule                    [kernel]   

-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:99.9% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

              675.00 32.6% sky2_poll              [sky2]  
              116.00  5.6% __udp4_lib_lookup      [kernel]
              111.00  5.4% ip_route_input         [kernel]
               81.00  3.9% _raw_spin_lock_irqsave [kernel]
               81.00  3.9% _raw_spin_lock         [kernel]
               70.00  3.4% __alloc_skb            [kernel]
               67.00  3.2% ip_rcv                 [kernel]
               66.00  3.2% __netif_receive_skb    [kernel]
               61.00  2.9% __udp4_lib_rcv         [kernel]
               57.00  2.8% sock_queue_rcv_skb     [kernel]
               47.00  2.3% sock_def_readable      [kernel]
               42.00  2.0% __kmalloc              [kernel]
               42.00  2.0% __wake_up_common       [kernel]
               38.00  1.8% sky2_rx_submit         [sky2]  

-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

             2526.00 32.8% sky2_poll              [sky2]  
              406.00  5.3% ip_route_input         [kernel]
              399.00  5.2% __udp4_lib_lookup      [kernel]
              328.00  4.3% _raw_spin_lock_irqsave [kernel]
              307.00  4.0% _raw_spin_lock         [kernel]
              296.00  3.8% ip_rcv                 [kernel]
              287.00  3.7% __alloc_skb            [kernel]
              272.00  3.5% sock_queue_rcv_skb     [kernel]
              224.00  2.9% __udp4_lib_rcv         [kernel]
              224.00  2.9% __netif_receive_skb    [kernel]
              182.00  2.4% sock_def_readable      [kernel]
              163.00  2.1% __wake_up_common       [kernel]
              140.00  1.8% sky2_rx_submit         [sky2]  

-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

             4445.00 33.4% sky2_poll              [sky2]  
              707.00  5.3% __udp4_lib_lookup      [kernel]
              662.00  5.0% ip_route_input         [kernel]
              567.00  4.3% _raw_spin_lock_irqsave [kernel]
              512.00  3.8% __alloc_skb            [kernel]
              506.00  3.8% ip_rcv                 [kernel]
              476.00  3.6% sock_queue_rcv_skb     [kernel]
              473.00  3.6% _raw_spin_lock         [kernel]
              415.00  3.1% __udp4_lib_rcv         [kernel]
              408.00  3.1% __netif_receive_skb    [kernel]
              306.00  2.3% sock_def_readable      [kernel]
              272.00  2.0% __wake_up_common       [kernel]
              260.00  2.0% __kmalloc              [kernel]
              216.00  1.6% _raw_read_lock         [kernel]
              214.00  1.6% sky2_rx_submit         [sky2]  


-------------------------------------------------------------------------------
   PerfTop:     748 irqs/sec  kernel:80.9% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

              244.00  7.4% _raw_spin_lock_irqsave      [kernel]            
              207.00  6.2% system_call                 [kernel]            
              127.00  3.8% _raw_spin_unlock_irqrestore [kernel]            
              124.00  3.7% copy_user_generic_string    [kernel]            
              122.00  3.7% sys_epoll_ctl               [kernel]            
              120.00  3.6% fget                        [kernel]            
              118.00  3.6% datagram_poll               [kernel]            
               96.00  2.9% schedule                    [kernel]            
               94.00  2.8% _raw_spin_lock_bh           [kernel]            
               86.00  2.6% vread_tsc                   [kernel].vsyscall_fn
               82.00  2.5% udp_recvmsg                 [kernel]            
               76.00  2.3% fput                        [kernel]            
               73.00  2.2% kmem_cache_free             [kernel]            
               67.00  2.0% sys_epoll_wait              [kernel]         

-------------------------------------------------------------------------------
   PerfTop:     625 irqs/sec  kernel:78.6% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

              488.00  7.5% _raw_spin_lock_irqsave      [kernel]            
              380.00  5.9% system_call                 [kernel]            
              274.00  4.2% copy_user_generic_string    [kernel]            
              252.00  3.9% fget                        [kernel]            
              244.00  3.8% datagram_poll               [kernel]            
              217.00  3.3% _raw_spin_unlock_irqrestore [kernel]            
              211.00  3.3% sys_epoll_ctl               [kernel]            
              186.00  2.9% schedule                    [kernel]            
              185.00  2.9% _raw_spin_lock_bh           [kernel]            
              173.00  2.7% udp_recvmsg                 [kernel]            
              169.00  2.6% vread_tsc                   [kernel].vsyscall_fn
              164.00  2.5% kmem_cache_free             [kernel]            
              143.00  2.2% fput                        [kernel]            
              133.00  2.1% sys_epoll_wait              [kernel]        


IV: Test setup 
nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee,irq aff

--------------------------------------------------------------------------
   PerfTop:    3043 irqs/sec  kernel:87.5% [1000Hz cycles],  (all, 8 CPUs)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

             2240.00 20.4% sky2_poll                  [sky2]              
              375.00  3.4% _raw_spin_lock_irqsave     [kernel]            
              335.00  3.0% sky2_intr                  [sky2]              
              326.00  3.0% system_call                [kernel]            
              239.00  2.2% _raw_spin_unlock_irqrestor [kernel]            
              224.00  2.0% ip_rcv                     [kernel]            
              201.00  1.8% __netif_receive_skb        [kernel]            
              198.00  1.8% sys_epoll_ctl              [kernel]            
              190.00  1.7% _raw_spin_lock             [kernel]            
              182.00  1.7% fget                       [kernel]            
              169.00  1.5% copy_user_generic_string   [kernel]            
              165.00  1.5% kmem_cache_free            [kernel]            
              149.00  1.4% load_balance               [kernel]            
              146.00  1.3% ip_route_input             [kernel]           


--------------------------------------------------------------------------
   PerfTop:    3210 irqs/sec  kernel:85.8% [1000Hz cycles],  (all, 8 CPUs)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

             6539.00 20.4% sky2_poll                  [sky2]              
             1106.00  3.4% _raw_spin_lock_irqsave     [kernel]            
             1014.00  3.2% sky2_intr                  [sky2]              
              976.00  3.0% system_call                [kernel]            
              684.00  2.1% _raw_spin_unlock_irqrestor [kernel]            
              611.00  1.9% ip_rcv                     [kernel]            
              601.00  1.9% fget                       [kernel]            
              593.00  1.8% _raw_spin_lock             [kernel]            
              592.00  1.8% sys_epoll_ctl              [kernel]            
              574.00  1.8% __netif_receive_skb        [kernel]            
              526.00  1.6% copy_user_generic_string   [kernel]            
              482.00  1.5% kmem_cache_free            [kernel]            
              480.00  1.5% ip_route_input             [kernel]            
              425.00  1.3% vread_tsc                  [kernel].vsyscall_fn
              410.00  1.3% kmem_cache_alloc           [kernel]            


--------------------------------------------------------------------------
   PerfTop:     999 irqs/sec  kernel:97.2% [1000Hz cycles],  (all, cpu: 0)
--------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             2035.00 60.5% sky2_poll                   [sky2]  
              302.00  9.0% sky2_intr                   [sky2]  
              109.00  3.2% __alloc_skb                 [kernel]
               57.00  1.7% _raw_spin_lock              [kernel]
               57.00  1.7% get_rps_cpu                 [kernel]
               52.00  1.5% __kmalloc                   [kernel]
               51.00  1.5% enqueue_to_backlog          [kernel]
               49.00  1.5% _raw_spin_lock_irqsave      [kernel]
               44.00  1.3% kmem_cache_alloc            [kernel]
               34.00  1.0% sky2_rx_submit              [sky2]  
               33.00  1.0% swiotlb_sync_single         [kernel]
               31.00  0.9% system_call                 [kernel]
               28.00  0.8% irq_entries_start           [kernel]
               22.00  0.7% _raw_spin_unlock_irqrestore [kernel]
               21.00  0.6% sky2_remove                 [sky2]  

--------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:96.2% [1000Hz cycles],  (all, cpu: 0)
--------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             5493.00 60.1% sky2_poll                   [sky2]  
              803.00  8.8% sky2_intr                   [sky2]  
              281.00  3.1% __alloc_skb                 [kernel]
              233.00  2.6% get_rps_cpu                 [kernel]
              136.00  1.5% enqueue_to_backlog          [kernel]
              132.00  1.4% __kmalloc                   [kernel]
              126.00  1.4% _raw_spin_lock              [kernel]
              122.00  1.3% kmem_cache_alloc            [kernel]
              122.00  1.3% _raw_spin_lock_irqsave      [kernel]
              102.00  1.1% swiotlb_sync_single         [kernel]
               88.00  1.0% sky2_rx_submit              [sky2]  
               77.00  0.8% system_call                 [kernel]
               69.00  0.8% irq_entries_start           [kernel]
               55.00  0.6% _raw_spin_unlock_irqrestore [kernel]
               54.00  0.6% copy_user_generic_string    [kernel]

--------------------------------------------------------------------------
   PerfTop:     999 irqs/sec  kernel:97.5% [1000Hz cycles],  (all, cpu: 0)
--------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             6699.00 60.1% sky2_poll                   [sky2]  
              988.00  8.9% sky2_intr                   [sky2]  
              327.00  2.9% __alloc_skb                 [kernel]
              261.00  2.3% get_rps_cpu                 [kernel]
              168.00  1.5% __kmalloc                   [kernel]
              161.00  1.4% kmem_cache_alloc            [kernel]
              160.00  1.4% enqueue_to_backlog          [kernel]
              157.00  1.4% _raw_spin_lock              [kernel]
              125.00  1.1% _raw_spin_lock_irqsave      [kernel]
              122.00  1.1% swiotlb_sync_single         [kernel]
              114.00  1.0% sky2_rx_submit              [sky2]  
               96.00  0.9% system_call                 [kernel]
               85.00  0.8% irq_entries_start           [kernel]
               66.00  0.6% sky2_remove                 [sky2]  
               64.00  0.6% _raw_spin_unlock_irqrestore [kernel]

--------------------------------------------------------------------------
   PerfTop:     420 irqs/sec  kernel:84.8% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              188.00  4.8% _raw_spin_lock_irqsave     [kernel]            
              175.00  4.5% system_call                [kernel]            
              155.00  4.0% _raw_spin_unlock_irqrestor [kernel]            
              143.00  3.7% __netif_receive_skb        [kernel]            
              124.00  3.2% ip_route_input             [kernel]            
              122.00  3.1% fget                       [kernel]            
              118.00  3.0% ip_rcv                     [kernel]            
              115.00  2.9% sys_epoll_ctl              [kernel]            
              107.00  2.7% call_function_single_inter [kernel]            
               98.00  2.5% vread_tsc                  [kernel].vsyscall_fn
               97.00  2.5% _raw_spin_lock             [kernel]            
               89.00  2.3% copy_user_generic_string   [kernel]        

--------------------------------------------------------------------------
   PerfTop:     372 irqs/sec  kernel:87.9% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              212.00  4.6% _raw_spin_lock_irqsave     [kernel]            
              192.00  4.2% system_call                [kernel]            
              187.00  4.1% __netif_receive_skb        [kernel]            
              184.00  4.0% ip_rcv                     [kernel]            
              174.00  3.8% ip_route_input             [kernel]            
              165.00  3.6% _raw_spin_unlock_irqrestor [kernel]            
              143.00  3.1% call_function_single_inter [kernel]            
              135.00  3.0% fget                       [kernel]            
              133.00  2.9% sys_epoll_ctl              [kernel]            
              122.00  2.7% _raw_spin_lock             [kernel]            
              112.00  2.5% __udp4_lib_lookup          [kernel]            
               99.00  2.2% copy_user_generic_string   [kernel]            
               93.00  2.0% vread_tsc                  [kernel].vsyscall_fn
               90.00  2.0% kmem_cache_free            [kernel]            
               89.00  1.9% ep_remove                  [kernel]        
--------------------------------------------------------------------------
   PerfTop:     269 irqs/sec  kernel:85.1% [1000Hz cycles],  (all, cpu: 7)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

               23.00  4.6% _raw_spin_lock_irqsave     [kernel]            
               21.00  4.2% system_call                [kernel]            
               19.00  3.8% _raw_spin_unlock_irqrestor [kernel]            
               17.00  3.4% fget                       [kernel]            
               15.00  3.0% __netif_receive_skb        [kernel]            
               14.00  2.8% dst_release                [kernel]            
               13.00  2.6% call_function_single_inter [kernel]            
               11.00  2.2% kmem_cache_free            [kernel]            
               10.00  2.0% vread_tsc                  [kernel].vsyscall_fn
               10.00  2.0% copy_user_generic_string   [kernel]            
               10.00  2.0% ktime_get                  [kernel]            
               10.00  2.0% ip_route_input             [kernel]            
               10.00  2.0% schedule                   [kernel]            


--------------------------------------------------------------------------
   PerfTop:     253 irqs/sec  kernel:84.6% [1000Hz cycles],  (all, cpu: 7)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              109.00  4.9% system_call                [kernel]            
              104.00  4.6% _raw_spin_lock_irqsave     [kernel]            
               79.00  3.5% ip_rcv                     [kernel]            
               74.00  3.3% _raw_spin_unlock_irqrestor [kernel]            
               71.00  3.2% fget                       [kernel]            
               68.00  3.0% sys_epoll_ctl              [kernel]            
               66.00  2.9% ip_route_input             [kernel]            
               58.00  2.6% call_function_single_inter [kernel]            
               55.00  2.4% _raw_spin_lock             [kernel]            
               54.00  2.4% copy_user_generic_string   [kernel]            
               53.00  2.4% __netif_receive_skb        [kernel]            
               51.00  2.3% schedule                   [kernel]            
               51.00  2.3% kmem_cache_free            [kernel]            
               43.00  1.9% vread_tsc                  [kernel].vsyscall_fn
               38.00  1.7% __udp4_lib_lookup          [kernel]  

--------------------------------------------------------------------------
   PerfTop:     236 irqs/sec  kernel:84.3% [1000Hz cycles],  (all, cpu: 7)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              131.00  4.9% _raw_spin_lock_irqsave     [kernel]            
              128.00  4.8% system_call                [kernel]            
              101.00  3.8% _raw_spin_unlock_irqrestor [kernel]            
               89.00  3.3% fget                       [kernel]            
               85.00  3.2% sys_epoll_ctl              [kernel]            
               81.00  3.0% ip_rcv                     [kernel]            
               76.00  2.8% ip_route_input             [kernel]            
               66.00  2.5% call_function_single_inter [kernel]            
               65.00  2.4% _raw_spin_lock             [kernel]            
               65.00  2.4% kmem_cache_free            [kernel]            
               64.00  2.4% copy_user_generic_string   [kernel]            
               57.00  2.1% __netif_receive_skb        [kernel]            
               47.00  1.8% schedule                   [kernel]            
               45.00  1.7% vread_tsc                  [kernel].vsyscall_fn


--------------------------------------------------------------------------
   PerfTop:     478 irqs/sec  kernel:82.2% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              319.00  5.2% _raw_spin_lock_irqsave     [kernel]            
              289.00  4.7% system_call                [kernel]            
              246.00  4.0% _raw_spin_unlock_irqrestor [kernel]            
              199.00  3.2% ip_route_input             [kernel]            
              198.00  3.2% __netif_receive_skb        [kernel]            
              197.00  3.2% sys_epoll_ctl              [kernel]            
              183.00  3.0% ip_rcv                     [kernel]            
              182.00  2.9% fget                       [kernel]            
              166.00  2.7% call_function_single_inter [kernel]            
              157.00  2.5% copy_user_generic_string   [kernel]            
              149.00  2.4% kmem_cache_free            [kernel]            
              146.00  2.4% vread_tsc                  [kernel].vsyscall_fn
              133.00  2.1% _raw_spin_lock             [kernel]            
              118.00  1.9% schedule                   [kernel]            
              112.00  1.8% __udp4_lib_lookup          [kernel]            



--------------------------------------------------------------------------
   PerfTop:     535 irqs/sec  kernel:83.0% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------

             samples  pcnt function                   DSO
             _______ _____ __________________________ ____________________

              345.00  5.2% _raw_spin_lock_irqsave     [kernel]            
              291.00  4.4% system_call                [kernel]            
              255.00  3.9% _raw_spin_unlock_irqrestor [kernel]            
              218.00  3.3% fget                       [kernel]            
              201.00  3.0% ip_route_input             [kernel]            
              193.00  2.9% __netif_receive_skb        [kernel]            
              193.00  2.9% sys_epoll_ctl              [kernel]            
              180.00  2.7% ip_rcv                     [kernel]            
              173.00  2.6% call_function_single_inter [kernel]            
              163.00  2.5% copy_user_generic_string   [kernel]            
              152.00  2.3% kmem_cache_free            [kernel]            
              151.00  2.3% vread_tsc                  [kernel].vsyscall_fn
              142.00  2.1% _raw_spin_lock             [kernel]            
              131.00  2.0% schedule                   [kernel]            



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-24 14:10     ` jamal
@ 2010-04-26 14:03       ` Eric Dumazet
  2010-04-26 14:55         ` Eric Dumazet
  2010-04-26 21:03         ` jamal
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-26 14:03 UTC (permalink / raw)
  To: hadi; +Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev

Le samedi 24 avril 2010 à 10:10 -0400, jamal a écrit :
> On Fri, 2010-04-23 at 18:02 -0400, jamal wrote:
> 
> > Ive done a setup with the last patch from Changli + net-next - I will
> > post test results tomorrow AM.
> 
> ok, annotated results attached. 
> 
> cheers,
> jamal

Jamal, I have a Nehalem setup now, and I can see
_raw_spin_lock_irqsave() abuse is not coming from network tree, but from
clockevents_notify()

My pktgen sends 1040989pps :

# Samples: 389707198131
#
# Overhead         Command                 Shared Object  Symbol
# ........  ..............  ............................  ......
#
    23.52%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
                      |
                      --- _raw_spin_lock_irqsave
                         |          
                         |--94.74%-- clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--4.10%-- tick_broadcast_oneshot_control
                         |          tick_notify
                         |          notifier_call_chain
                         |          __raw_notifier_call_chain
                         |          raw_notifier_call_chain
                         |          clockevents_do_notify
                         |          clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--0.58%-- lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                          --0.58%-- [...]

     8.94%            init  [kernel.kallsyms]             [k] acpi_os_read_port
                      |
                      --- acpi_os_read_port
                         |          
                         |--99.55%-- acpi_hw_read_port
                         |          acpi_hw_read
                         |          acpi_hw_read_multiple
                         |          acpi_hw_register_read
                         |          acpi_read_bit_register



# Samples: 389233082962
#
# Overhead         Command                 Shared Object  Symbol
# ........  ..............  ............................  ......
#
    23.25%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
     8.90%            init  [kernel.kallsyms]             [k] acpi_os_read_port
     2.93%            init  [kernel.kallsyms]             [k] mwait_idle_with_hints
     1.99%            init  [kernel.kallsyms]             [k] schedule
     1.94%         udpsink  [kernel.kallsyms]             [k] schedule
     1.73%         swapper  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
     1.48%            init  [kernel.kallsyms]             [k] bnx2x_rx_int
     1.47%            init  [kernel.kallsyms]             [k] _raw_spin_unlock_irqrestore
     1.44%            init  [kernel.kallsyms]             [k] _raw_spin_lock
     1.36%         udpsink  [kernel.kallsyms]             [k] udp_recvmsg
     1.05%         udpsink  [kernel.kallsyms]             [k] __skb_recv_datagram
     1.05%            init  [kernel.kallsyms]             [k] __udp4_lib_lookup
     1.04%         udpsink  [kernel.kallsyms]             [k] copy_user_generic_string
     1.04%         udpsink  [kernel.kallsyms]             [k] __slab_free
     0.99%            init  [kernel.kallsyms]             [k] select_task_rq_fair
     0.99%            init  [kernel.kallsyms]             [k] try_to_wake_up
     0.98%            init  [kernel.kallsyms]             [k] task_rq_lock
     0.93%            init  [kernel.kallsyms]             [k] tick_broadcast_oneshot_control
     0.89%            init  [kernel.kallsyms]             [k] sock_queue_rcv_skb
     0.89%         udpsink  [kernel.kallsyms]             [k] sock_recv_ts_and_drops
     0.88%         udpsink  [kernel.kallsyms]             [k] kfree
     0.79%         swapper  [kernel.kallsyms]             [k] acpi_os_read_port
     0.76%         udpsink  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
     0.73%         udpsink  [kernel.kallsyms]             [k] inet_recvmsg
     0.71%         udpsink  [vdso]                        [.] 0x000000ffffe431
     0.65%         udpsink  [kernel.kallsyms]             [k] sock_recvmsg
     0.62%            init  [kernel.kallsyms]             [k] gs_change
     0.61%            init  [kernel.kallsyms]             [k] enqueue_task_fair
     0.61%            init  [kernel.kallsyms]             [k] eth_type_trans
     0.61%            init  [kernel.kallsyms]             [k] sock_def_readable
     0.60%         udpsink  [kernel.kallsyms]             [k] _raw_spin_lock_bh
     0.59%            init  [kernel.kallsyms]             [k] ip_route_input
     0.59%         udpsink  libpthread-2.3.4.so           [.] __pthread_disable_asynccancel
     0.56%            init  [kernel.kallsyms]             [k] bnx2x_poll
     0.56%         udpsink  [kernel.kallsyms]             [k] __get_user_4



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-26 14:03       ` Eric Dumazet
@ 2010-04-26 14:55         ` Eric Dumazet
  2010-04-26 21:06           ` jamal
       [not found]           ` <20100429174056.GA8044@gargoyle.fritz.box>
  2010-04-26 21:03         ` jamal
  1 sibling, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-26 14:55 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger,
	netdev, Andi Kleen

Le lundi 26 avril 2010 à 16:03 +0200, Eric Dumazet a écrit :
> Le samedi 24 avril 2010 à 10:10 -0400, jamal a écrit :
> > On Fri, 2010-04-23 at 18:02 -0400, jamal wrote:
> > 
> > > Ive done a setup with the last patch from Changli + net-next - I will
> > > post test results tomorrow AM.
> > 
> > ok, annotated results attached. 
> > 
> > cheers,
> > jamal
> 
> Jamal, I have a Nehalem setup now, and I can see
> _raw_spin_lock_irqsave() abuse is not coming from network tree, but from
> clockevents_notify()
> 

Another interesting finding:

- if all packets are received on a single queue, max speed seems to be
1.200.000 packets per second on my machine :-(

And on profile of receiving cpu (RPS enabled, pakets sent to 15 other
cpus), we can see default_send_IPI_mask_sequence_phys() is the slow
thing...

Andi, what do you think of this one ?
Dont we have a function to send an IPI to an individual cpu instead ?

void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int
vector)
{
        unsigned long query_cpu;
        unsigned long flags;

        /*
         * Hack. The clustered APIC addressing mode doesn't allow us to
         * send to an arbitrary mask, so I do a unicast to each CPU instead.
         * - mbligh
         */
        local_irq_save(flags);
        for_each_cpu(query_cpu, mask) {
                __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
                                query_cpu), vector, APIC_DEST_PHYSICAL);
        }
        local_irq_restore(flags);
}


-----------------------------------------------------------------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu:
7)
-----------------------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function                            DSO
             _______ _____ ___________________________________ _______

              668.00 17.7% default_send_IPI_mask_sequence_phys vmlinux
              363.00  9.6% bnx2x_rx_int                        vmlinux
              354.00  9.4% eth_type_trans                      vmlinux
              332.00  8.8% kmem_cache_alloc_node               vmlinux
              285.00  7.6% __kmalloc_node_track_caller         vmlinux
              278.00  7.4% _raw_spin_lock                      vmlinux
              166.00  4.4% __slab_alloc                        vmlinux
              147.00  3.9% __memset                            vmlinux
              136.00  3.6% list_del                            vmlinux
              132.00  3.5% get_partial_node                    vmlinux
              131.00  3.5% get_rps_cpu                         vmlinux
              102.00  2.7% enqueue_to_backlog                  vmlinux
               95.00  2.5% unmap_single                        vmlinux
               94.00  2.5% __alloc_skb                         vmlinux
               74.00  2.0% vlan_gro_common                     vmlinux
               52.00  1.4% __phys_addr                         vmlinux
               48.00  1.3% dev_gro_receive                     vmlinux
               39.00  1.0% swiotlb_dma_mapping_error           vmlinux
               36.00  1.0% swiotlb_map_page                    vmlinux
               34.00  0.9% skb_put                             vmlinux
               27.00  0.7% is_swiotlb_buffer                   vmlinux
               23.00  0.6% deactivate_slab                     vmlinux
               20.00  0.5% vlan_gro_receive                    vmlinux
               17.00  0.5% __skb_bond_should_drop              vmlinux
               14.00  0.4% netif_receive_skb                   vmlinux
               14.00  0.4% __netdev_alloc_skb                  vmlinux
               12.00  0.3% skb_gro_reset_offset                vmlinux
               12.00  0.3% get_slab                            vmlinux
               11.00  0.3% napi_skb_finish                     vmlinux



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-26 14:03       ` Eric Dumazet
  2010-04-26 14:55         ` Eric Dumazet
@ 2010-04-26 21:03         ` jamal
  1 sibling, 0 replies; 108+ messages in thread
From: jamal @ 2010-04-26 21:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger, netdev

On Mon, 2010-04-26 at 16:03 +0200, Eric Dumazet wrote:

> 
> Jamal, I have a Nehalem setup now, and I can see
> _raw_spin_lock_irqsave() abuse is not coming from network tree, but from
> clockevents_notify()

yikes. Thanks Eric - I should've been able to figure that one out. But
why is this thing expensive? I will run the test tomorrow and see if i
see the same thing. 

cheers,
jamal




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-26 14:55         ` Eric Dumazet
@ 2010-04-26 21:06           ` jamal
       [not found]           ` <20100429174056.GA8044@gargoyle.fritz.box>
  1 sibling, 0 replies; 108+ messages in thread
From: jamal @ 2010-04-26 21:06 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger,
	netdev, Andi Kleen

On Mon, 2010-04-26 at 16:55 +0200, Eric Dumazet wrote:

> Another interesting finding:
> 
> - if all packets are received on a single queue, max speed seems to be
> 1.200.000 packets per second on my machine :-(

Well, if any consolation, it is not as bad as sky2 hardware;-> I cant do
more than 750Kpps.
Also, it seems you use VLANS - max pps will be lower than without VLANs
by probably 6-70Kpps (doesn't explain the 1.2Mpps of course).

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-23 10:26 ` Eric Dumazet
@ 2010-04-27 22:08   ` David Miller
  2010-04-27 22:18     ` [PATCH net-next-2.6] bnx2x: Remove two prefetch() Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: David Miller @ 2010-04-27 22:08 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiaosuo, hadi, therbert, shemminger, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 23 Apr 2010 12:26:06 +0200

> Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit :
>> batch skb dequeueing from softnet input_pkt_queue.
>> 
>> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
>> contention when RPS is enabled.
>> 
>> Note: in the worst case, the number of packets in a softnet_data may be double
>> of netdev_max_backlog.
>> 
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>> ----
> 
> Oops, reading it again, I found process_backlog() was still taking the
> lock twice, if only one packet is waiting in input_pkt_queue.
> 
> Possible fix, on top of your patch :

I've applied Changli's patch with this fixup added to it.

If there are any follow-on changes necessary after further analysis,
please send patches on top of this work.

Thanks.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* [PATCH net-next-2.6] bnx2x: Remove two prefetch()
  2010-04-27 22:08   ` David Miller
@ 2010-04-27 22:18     ` Eric Dumazet
  2010-04-27 22:19       ` David Miller
  2010-04-28 11:33       ` jamal
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-27 22:18 UTC (permalink / raw)
  To: David Miller
  Cc: xiaosuo, hadi, therbert, shemminger, netdev, Eilon Greenstein

Le mardi 27 avril 2010 à 15:08 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Fri, 23 Apr 2010 12:26:06 +0200
> 
> > Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit :
> >> batch skb dequeueing from softnet input_pkt_queue.
> >> 
> >> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> >> contention when RPS is enabled.
> >> 
> >> Note: in the worst case, the number of packets in a softnet_data may be double
> >> of netdev_max_backlog.
> >> 
> >> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> >> ----
> > 
> > Oops, reading it again, I found process_backlog() was still taking the
> > lock twice, if only one packet is waiting in input_pkt_queue.
> > 
> > Possible fix, on top of your patch :
> 
> I've applied Changli's patch with this fixup added to it.
> 
> If there are any follow-on changes necessary after further analysis,
> please send patches on top of this work.
> 

Thanks David, I was about to resubmit the cumulative patch ;)

On my 'old' dev machine (two quad core), RPS is able to get a 300%
increase on udpsink test on 20 flows.

I yet have to make routing/firewalling tests as well.

I also noticed bnx2x driver has some strange prefetch() calls.

[PATCH net-next-2.6] bnx2x: Remove two prefetch()

1) Even on 64bit arches, sizeof(struct sk_buff) < 256
2) No need to prefetch same pointer twice.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
---

diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 613f727..f706ed1 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -1617,7 +1617,6 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 			rx_buf = &fp->rx_buf_ring[bd_cons];
 			skb = rx_buf->skb;
 			prefetch(skb);
-			prefetch((u8 *)skb + 256);
 			len = le16_to_cpu(cqe->fast_path_cqe.pkt_len);
 			pad = cqe->fast_path_cqe.placement_offset;
 
@@ -1668,7 +1667,6 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 					dma_unmap_addr(rx_buf, mapping),
 						   pad + RX_COPY_THRESH,
 						   DMA_FROM_DEVICE);
-			prefetch(skb);
 			prefetch(((char *)(skb)) + 128);
 
 			/* is this an error packet? */




^ permalink raw reply related	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch()
  2010-04-27 22:18     ` [PATCH net-next-2.6] bnx2x: Remove two prefetch() Eric Dumazet
@ 2010-04-27 22:19       ` David Miller
  2010-04-28 13:14         ` Eilon Greenstein
  2010-04-28 11:33       ` jamal
  1 sibling, 1 reply; 108+ messages in thread
From: David Miller @ 2010-04-27 22:19 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiaosuo, hadi, therbert, shemminger, netdev, eilong

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 28 Apr 2010 00:18:13 +0200

> [PATCH net-next-2.6] bnx2x: Remove two prefetch()
> 
> 1) Even on 64bit arches, sizeof(struct sk_buff) < 256
> 2) No need to prefetch same pointer twice.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Eilon Greenstein <eilong@broadcom.com>

Eilon please review and ACK/NACK

Thanks.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch()
  2010-04-27 22:18     ` [PATCH net-next-2.6] bnx2x: Remove two prefetch() Eric Dumazet
  2010-04-27 22:19       ` David Miller
@ 2010-04-28 11:33       ` jamal
  2010-04-28 12:33         ` Eric Dumazet
  1 sibling, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-28 11:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein

On Wed, 2010-04-28 at 00:18 +0200, Eric Dumazet wrote:

> Thanks David, I was about to resubmit the cumulative patch ;)

Hrm, i never got the email with your patch on top of Changli's
(the fscking ISP has creative ways of reordering, delaying and also
occasionally losing my emails). So all my tests from last
week did not include the extra patch. I will try to make time today
to test with latest net-next which seems to have some extra goodies.
If there is any other patch you want me to try let me know...

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch()
  2010-04-28 11:33       ` jamal
@ 2010-04-28 12:33         ` Eric Dumazet
  2010-04-28 12:36           ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-28 12:33 UTC (permalink / raw)
  To: hadi
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein

Le mercredi 28 avril 2010 à 07:33 -0400, jamal a écrit :
> On Wed, 2010-04-28 at 00:18 +0200, Eric Dumazet wrote:
> 
> > Thanks David, I was about to resubmit the cumulative patch ;)
> 
> Hrm, i never got the email with your patch on top of Changlis
> (the fscking ISP has creative ways of reordering, delaying and also
> occassionaly loosing my emails). So all my tests from last
> week did not include the extra patch. I will try to make time today
> to test with latest net-next which seems to have some extra goodies.
> If there is any other patch you want me to try let me know...
> 
> cheers,
> jamal

If you wait a bit, I have another patch to speedup udp receive path ;)



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch()
  2010-04-28 12:33         ` Eric Dumazet
@ 2010-04-28 12:36           ` jamal
  2010-04-28 14:06             ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-28 12:36 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev, Eilon Greenstein

On Wed, 2010-04-28 at 14:33 +0200, Eric Dumazet wrote:

> If you wait a bit, I have another patch to speedup udp receive path ;)

Shoot whenever you are ready ;-> I will test with and without your
patch..

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch()
  2010-04-27 22:19       ` David Miller
@ 2010-04-28 13:14         ` Eilon Greenstein
  2010-04-28 15:44           ` Eliezer Tamir
                             ` (2 more replies)
  0 siblings, 3 replies; 108+ messages in thread
From: Eilon Greenstein @ 2010-04-28 13:14 UTC (permalink / raw)
  To: David Miller
  Cc: vladz, eliezert, eric.dumazet, xiaosuo, hadi, therbert,
	shemminger, netdev

On Tue, 2010-04-27 at 15:19 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 28 Apr 2010 00:18:13 +0200
> 
> > [PATCH net-next-2.6] bnx2x: Remove two prefetch()
> > 
> > 1) Even on 64bit arches, sizeof(struct sk_buff) < 256
> > 2) No need to prefetch same pointer twice.
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> > CC: Eilon Greenstein <eilong@broadcom.com>
> 
> Eilon please review and ACK/NACK

Vlad ran few benchmarks, and we couldn't find any justification for
those prefetch calls. After consulting with Eliezer Tamir (the original
author) we are glad to Ack this patch.

Thanks Eric!
Acked-by: <eilong@broadcom.com>




^ permalink raw reply	[flat|nested] 108+ messages in thread

* [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 12:36           ` jamal
@ 2010-04-28 14:06             ` Eric Dumazet
  2010-04-28 14:19               ` Eric Dumazet
                                 ` (2 more replies)
  0 siblings, 3 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-28 14:06 UTC (permalink / raw)
  To: hadi
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le mercredi 28 avril 2010 à 08:36 -0400, jamal a écrit :
> On Wed, 2010-04-28 at 14:33 +0200, Eric Dumazet wrote:
> 
> > If you wait a bit, I have another patch to speedup udp receive path ;)
> 
> Shoot whenever you are ready ;-> I will test with and without your
> patch..
> 

Here it is ;)

Thanks

[PATCH net-next-2.6] net: speedup udp receive path

Since commit 95766fff ([UDP]: Add memory accounting.), 
each received packet needs one extra sock_lock()/sock_release() pair.

This added latency because of possible backlog handling. Then later,
ticket spinlocks added yet another latency source in case of DDOS.

This patch introduces lock_sock_bh() and unlock_sock_bh()
synchronization primitives, avoiding one atomic operation and backlog
processing.

skb_free_datagram_locked() uses them instead of full blown
lock_sock()/release_sock(). skb is orphaned inside locked section for
proper socket memory reclaim, and finally freed outside of it.

UDP receive path now take the socket spinlock only once.

Signed-off-by: Eric DUmazet <eric.dumazet@gmail.com>
---
 include/net/sock.h  |   10 ++++++++++
 net/core/datagram.c |   10 +++++++---
 net/ipv4/udp.c      |   12 ++++++------
 net/ipv6/udp.c      |    4 ++--
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index cf12b1e..d361c77 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1021,6 +1021,16 @@ extern void release_sock(struct sock *sk);
 				SINGLE_DEPTH_NESTING)
 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
 
+static inline void lock_sock_bh(struct sock *sk)
+{
+	spin_lock_bh(&sk->sk_lock.slock);
+}
+
+static inline void unlock_sock_bh(struct sock *sk)
+{
+	spin_unlock_bh(&sk->sk_lock.slock);
+}
+
 extern struct sock		*sk_alloc(struct net *net, int family,
 					  gfp_t priority,
 					  struct proto *prot);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 5574a5d..95b851f 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -229,9 +229,13 @@ EXPORT_SYMBOL(skb_free_datagram);
 
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 {
-	lock_sock(sk);
-	skb_free_datagram(sk, skb);
-	release_sock(sk);
+	lock_sock_bh(sk);
+	skb_orphan(skb);
+	sk_mem_reclaim_partial(sk);
+	unlock_sock_bh(sk);
+
+	/* skb is now orphaned, might be freed outside of locked section */
+	consume_skb(skb);
 }
 EXPORT_SYMBOL(skb_free_datagram_locked);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 63eb56b..1f86965 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1062,10 +1062,10 @@ static unsigned int first_packet_length(struct sock *sk)
 	spin_unlock_bh(&rcvq->lock);
 
 	if (!skb_queue_empty(&list_kill)) {
-		lock_sock(sk);
+		lock_sock_bh(sk);
 		__skb_queue_purge(&list_kill);
 		sk_mem_reclaim_partial(sk);
-		release_sock(sk);
+		unlock_sock_bh(sk);
 	}
 	return res;
 }
@@ -1196,10 +1196,10 @@ out:
 	return err;
 
 csum_copy_err:
-	lock_sock(sk);
+	lock_sock_bh(sk);
 	if (!skb_kill_datagram(sk, skb, flags))
 		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-	release_sock(sk);
+	unlock_sock_bh(sk);
 
 	if (noblock)
 		return -EAGAIN;
@@ -1624,9 +1624,9 @@ int udp_rcv(struct sk_buff *skb)
 
 void udp_destroy_sock(struct sock *sk)
 {
-	lock_sock(sk);
+	lock_sock_bh(sk);
 	udp_flush_pending_frames(sk);
-	release_sock(sk);
+	unlock_sock_bh(sk);
 }
 
 /*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3ead20a..91c60f0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -424,7 +424,7 @@ out:
 	return err;
 
 csum_copy_err:
-	lock_sock(sk);
+	lock_sock_bh(sk);
 	if (!skb_kill_datagram(sk, skb, flags)) {
 		if (is_udp4)
 			UDP_INC_STATS_USER(sock_net(sk),
@@ -433,7 +433,7 @@ csum_copy_err:
 			UDP6_INC_STATS_USER(sock_net(sk),
 					UDP_MIB_INERRORS, is_udplite);
 	}
-	release_sock(sk);
+	unlock_sock_bh(sk);
 
 	if (flags & MSG_DONTWAIT)
 		return -EAGAIN;



^ permalink raw reply related	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 14:06             ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet
@ 2010-04-28 14:19               ` Eric Dumazet
  2010-04-28 14:34                 ` Eric Dumazet
  2010-04-28 21:36               ` David Miller
  2010-04-28 23:44               ` [PATCH net-next-2.6] net: speedup udp receive path jamal
  2 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-28 14:19 UTC (permalink / raw)
  To: hadi
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le mercredi 28 avril 2010 à 16:06 +0200, Eric Dumazet a écrit :
> Le mercredi 28 avril 2010 à 08:36 -0400, jamal a écrit :
> > On Wed, 2010-04-28 at 14:33 +0200, Eric Dumazet wrote:
> > 
> > > If you wait a bit, I have another patch to speedup udp receive path ;)
> > 
> > Shoot whenever you are ready ;-> I will test with and without your
> > patch..
> > 
> 
> Here it is ;)
> 
> Thanks

I forgot to say that with my previous DDOS test/bench (16 cpus trying to
feed one udp socket), my receiver can now process 420.000 pps instead of
200.000 ;)




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 14:19               ` Eric Dumazet
@ 2010-04-28 14:34                 ` Eric Dumazet
  0 siblings, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-28 14:34 UTC (permalink / raw)
  To: hadi
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le mercredi 28 avril 2010 à 16:19 +0200, Eric Dumazet a écrit :

> I forgot to say that with my previous DDOS test/bench (16 cpus trying to
> feed one udp socket), my receiver can now process 420.000 pps instead of
> 200.000 ;)

And perf top of the cpu dedicated to the thread doing the recvmsg() is :
(after patch)

----------------------------------------------------------------------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:98.0% [1000Hz cycles],  (all, cpu: 1)
----------------------------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ____________________________

             5463.00 45.5% _raw_spin_lock_bh             vmlinux                     
              761.00  6.3% copy_user_generic_string      vmlinux                     
              662.00  5.5% sock_recv_ts_and_drops        vmlinux                     
              645.00  5.4% kfree                         vmlinux                     
              568.00  4.7% _raw_spin_lock                vmlinux                     
              494.00  4.1% __skb_recv_datagram           vmlinux                     
              488.00  4.1% skb_copy_datagram_iovec       vmlinux                     
              467.00  3.9% __slab_free                   vmlinux                     
              176.00  1.5% udp_recvmsg                   vmlinux                     
              168.00  1.4% ia32_sysenter_target          vmlinux                     
              161.00  1.3% kmem_cache_free               vmlinux                     
              161.00  1.3% _raw_spin_lock_irqsave        vmlinux                     
              151.00  1.3% memcpy_toiovec                vmlinux                     
              131.00  1.1% fget_light                    vmlinux                     
              130.00  1.1% sock_rfree                    vmlinux                     
              104.00  0.9% inet_recvmsg                  vmlinux                     
               99.00  0.8% dst_release                   vmlinux                     
               98.00  0.8% skb_release_head_state        vmlinux                     
               83.00  0.7% __sk_mem_reclaim              vmlinux                     
               75.00  0.6% sys_recvfrom                  vmlinux                     
               61.00  0.5% sysexit_from_sys_call         vmlinux                     
               59.00  0.5% fput                          vmlinux                     
               56.00  0.5% schedule                      vmlinux                     
               56.00  0.5% sock_recvmsg                  vmlinux                     
               54.00  0.4% move_addr_to_user             vmlinux                     
               51.00  0.4% compat_sys_socketcall         vmlinux                     
               48.00  0.4% _raw_spin_unlock_bh           vmlinux                    



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch()
  2010-04-28 13:14         ` Eilon Greenstein
@ 2010-04-28 15:44           ` Eliezer Tamir
  2010-04-28 16:53           ` David Miller
       [not found]           ` <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com>
  2 siblings, 0 replies; 108+ messages in thread
From: Eliezer Tamir @ 2010-04-28 15:44 UTC (permalink / raw)
  To: eilong
  Cc: David Miller, vladz, eric.dumazet, xiaosuo, hadi, therbert,
	shemminger, netdev

On Wed, Apr 28, 2010 at 4:14 PM, Eilon Greenstein <eilong@broadcom.com> wrote:
>
> On Tue, 2010-04-27 at 15:19 -0700, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 28 Apr 2010 00:18:13 +0200
> >
> > > [PATCH net-next-2.6] bnx2x: Remove two prefetch()
> > >
> > > 1) Even on 64bit arches, sizeof(struct sk_buff) < 256
> > > 2) No need to prefetch same pointer twice.
> > >
> > > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> > > CC: Eilon Greenstein <eilong@broadcom.com>
> >
> > Eilon please review and ACK/NACK
>
> Vlad ran few benchmarks, and we couldn't find any justification for
> those prefetch calls. After consulting with Eliezer Tamir (the original
> author) we are glad to Ack this patch.
>
> Thanks Eric!
> Acked-by: <eilong@broadcom.com>
>
>
Normally, I would not have said anything but since Eilon asked.
Acked-by: <eliezer@tamir.org.il>
(this time in plain text)

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch()
  2010-04-28 13:14         ` Eilon Greenstein
  2010-04-28 15:44           ` Eliezer Tamir
@ 2010-04-28 16:53           ` David Miller
       [not found]           ` <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com>
  2 siblings, 0 replies; 108+ messages in thread
From: David Miller @ 2010-04-28 16:53 UTC (permalink / raw)
  To: eilong
  Cc: vladz, eliezert, eric.dumazet, xiaosuo, hadi, therbert,
	shemminger, netdev

From: "Eilon Greenstein" <eilong@broadcom.com>
Date: Wed, 28 Apr 2010 16:14:15 +0300

> On Tue, 2010-04-27 at 15:19 -0700, David Miller wrote:
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Wed, 28 Apr 2010 00:18:13 +0200
>> 
>> > [PATCH net-next-2.6] bnx2x: Remove two prefetch()
>> > 
>> > 1) Even on 64bit arches, sizeof(struct sk_buff) < 256
>> > 2) No need to prefetch same pointer twice.
>> > 
>> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>> > CC: Eilon Greenstein <eilong@broadcom.com>
>> 
>> Eilon please review and ACK/NACK
> 
> Vlad ran few benchmarks, and we couldn't find any justification for
> those prefetch calls. After consulting with Eliezer Tamir (the original
> author) we are glad to Ack this patch.
> 
> Thanks Eric!
> Acked-by: <eilong@broadcom.com>

Thanks, applied.

Please put your full name as well as your email address in Acked-by:
tags, just like you do for Signed-off-by: tags.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] bnx2x: Remove two prefetch()
       [not found]           ` <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com>
@ 2010-04-28 16:55             ` David Miller
  0 siblings, 0 replies; 108+ messages in thread
From: David Miller @ 2010-04-28 16:55 UTC (permalink / raw)
  To: eliezer
  Cc: eilong, vladz, eric.dumazet, xiaosuo, hadi, therbert, shemminger, netdev

From: Eliezer Tamir <eliezer@tamir.org.il>
Date: Wed, 28 Apr 2010 18:42:37 +0300

> Acked-by: <eliezer@tamir.org.il>

Like I told Eilon, please specify your full name in future Acked-by: tags,
just as you would for a Signed-off-by: tag.

Thanks.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 14:06             ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet
  2010-04-28 14:19               ` Eric Dumazet
@ 2010-04-28 21:36               ` David Miller
  2010-04-28 22:22                 ` [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper Eric Dumazet
  2010-04-28 23:44               ` [PATCH net-next-2.6] net: speedup udp receive path jamal
  2 siblings, 1 reply; 108+ messages in thread
From: David Miller @ 2010-04-28 21:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 28 Apr 2010 16:06:45 +0200

> [PATCH net-next-2.6] net: speedup udp receive path
> 
> Since commit 95766fff ([UDP]: Add memory accounting.), 
> each received packet needs one extra sock_lock()/sock_release() pair.
> 
> This added latency because of possible backlog handling. Then later,
> ticket spinlocks added yet another latency source in case of DDOS.
> 
> This patch introduces lock_sock_bh() and unlock_sock_bh()
> synchronization primitives, avoiding one atomic operation and backlog
> processing.
> 
> skb_free_datagram_locked() uses them instead of full blown
> lock_sock()/release_sock(). skb is orphaned inside locked section for
> proper socket memory reclaim, and finally freed outside of it.
> 
> UDP receive path now take the socket spinlock only once.
> 
> Signed-off-by: Eric DUmazet <eric.dumazet@gmail.com>

Clever, let's see what this breaks :-)

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper
  2010-04-28 21:36               ` David Miller
@ 2010-04-28 22:22                 ` Eric Dumazet
  2010-04-28 22:39                   ` David Miller
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-28 22:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Le mercredi 28 avril 2010 à 14:36 -0700, David Miller a écrit :

> 
> Clever, let's see what this breaks :-)
> 
> Applied, thanks Eric.

Thanks ;)

Let's respin an old work about dst, with a first small work unit :

Next patch will try to not touch dst refcount in input path (previously
attempted in July 2009)
Ref : http://kerneltrap.org/mailarchive/linux-netdev/2009/7/22/6248753


[PATCH net-next-2.6] net: ip_queue_rcv_skb() helper

When queueing a skb to socket, we can immediately release its dst if
target socket do not use IP_CMSG_PKTINFO.

tcp_data_queue() can drop dst too.

This to benefit from a hot cache line and avoid the receiver, possibly
on another cpu, to dirty this cache line himself.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/ip.h       |    1 +
 net/ipv4/ip_sockglue.c |   16 ++++++++++++++++
 net/ipv4/raw.c         |    2 +-
 net/ipv4/tcp_input.c   |    1 +
 net/ipv4/udp.c         |    2 +-
 net/ipv6/raw.c         |    2 +-
 net/ipv6/udp.c         |    2 +-
 7 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index a84ceb6..8149b77 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -393,6 +393,7 @@ extern int ip_options_rcv_srr(struct sk_buff *skb);
  *	Functions provided by ip_sockglue.c
  */
 
+extern int	ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 extern void	ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb);
 extern int	ip_cmsg_send(struct net *net,
 			     struct msghdr *msg, struct ipcm_cookie *ipc);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b0aa054..ce23178 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -954,6 +954,22 @@ e_inval:
 	return -EINVAL;
 }
 
+/**
+ * ip_queue_rcv_skb - Queue an skb into sock receive queue
+ * @sk: socket
+ * @skb: buffer
+ *
+ * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
+ * is not set, we drop skb dst entry now, while dst cache line is hot.
+ */
+int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
+		skb_dst_drop(skb);
+	return sock_queue_rcv_skb(sk, skb);
+}
+EXPORT_SYMBOL(ip_queue_rcv_skb);
+
 int ip_setsockopt(struct sock *sk, int level,
 		int optname, char __user *optval, unsigned int optlen)
 {
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index cc6f097..52ef5af 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -290,7 +290,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
 {
 	/* Charge it to the socket. */
 
-	if (sock_queue_rcv_skb(sk, skb) < 0) {
+	if (ip_queue_rcv_skb(sk, skb) < 0) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ae3ec15..e82162c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4367,6 +4367,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
 		goto drop;
 
+	skb_dst_drop(skb);
 	__skb_pull(skb, th->doff * 4);
 
 	TCP_ECN_accept_cwr(tp, skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 63eb56b..8591398 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1264,7 +1264,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (inet_sk(sk)->inet_daddr)
 		sock_rps_save_rxhash(sk, skb->rxhash);
 
-	rc = sock_queue_rcv_skb(sk, skb);
+	rc = ip_queue_rcv_skb(sk, skb);
 	if (rc < 0) {
 		int is_udplite = IS_UDPLITE(sk);
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8562738..0e3d2dd 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -381,7 +381,7 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
 	}
 
 	/* Charge it to the socket. */
-	if (sock_queue_rcv_skb(sk, skb) < 0) {
+	if (ip_queue_rcv_skb(sk, skb) < 0) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3ead20a..aa0e47a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -514,7 +514,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 			goto drop;
 	}
 
-	if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
+	if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) {
 		/* Note that an ENOMEM error is charged twice */
 		if (rc == -ENOMEM)
 			UDP6_INC_STATS_BH(sock_net(sk),



^ permalink raw reply related	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper
  2010-04-28 22:22                 ` [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper Eric Dumazet
@ 2010-04-28 22:39                   ` David Miller
  0 siblings, 0 replies; 108+ messages in thread
From: David Miller @ 2010-04-28 22:39 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 29 Apr 2010 00:22:44 +0200

> Next patch will try to not touch dst refcount in input path (previously
> attempted in July 2009)
> Ref : http://kerneltrap.org/mailarchive/linux-netdev/2009/7/22/6248753

Yes, I remember this.

> [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper
> 
> When queueing an skb to a socket, we can immediately release its dst if
> the target socket does not use IP_CMSG_PKTINFO.
> 
> tcp_data_queue() can drop dst too.
> 
> This is to benefit from a hot cache line and to avoid having the receiver,
> possibly on another cpu, dirty this cache line itself.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Pretty soon the whole receive path will be "read mostly" :-)

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 14:06             ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet
  2010-04-28 14:19               ` Eric Dumazet
  2010-04-28 21:36               ` David Miller
@ 2010-04-28 23:44               ` jamal
  2010-04-29  0:00                 ` jamal
  2010-04-29  4:09                 ` Eric Dumazet
  2 siblings, 2 replies; 108+ messages in thread
From: jamal @ 2010-04-28 23:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

[-- Attachment #1: Type: text/plain, Size: 1188 bytes --]

On Wed, 2010-04-28 at 16:06 +0200, Eric Dumazet wrote:

> Here it is ;)

Sorry - things got a little hectic with TheMan.

I am afraid I don't have good news.
Actually, I should say I don't have good news in regards to RPS.
For my sample app, two things seem to be happening:
a) The overall performance has gotten better for both rps
and non-rps.
b) non-rps is now performing relatively better

This is just what i see in net-next not related to your patch.
It seems the kernels i tested prior to April 23 showed rps better.
The one i tested on Apr23 showed rps being about the same as non-rps.
As I stated in my last result posting, I thought I didn't test properly,
but I did it again today and saw the same thing. And now non-rps is
_consistently_ better.
So some regression is going on...

Your patch has improved the performance of rps relative to what is in
net-next very slightly; but it has also improved the performance of
non-rps;->
My traces look different for the app cpu than yours - likely because of
the apps being different.

At the moment i dont have time to dig deeper into code, but i could
test as cycles show up.

I am attaching the profile traces and results.

cheers,
jamal

[-- Attachment #2: sum-apr23and28.txt --]
[-- Type: text/plain, Size: 1469 bytes --]


April 23 net-next

kernel           sink    cpu all     cpuint       cpuapp
---------------------------------------------------------
nn              93.95%   84.5%        99.8%        79.8%
nn-rps          96.41%   85.4%        95.5%        82.5%
nn-cl           97.29%   84.0%        99.9%        79.6%
nn-cl-rps       97.76%   86.5%        96.5%        84.8%

nn: Basic net-next from Apr23
nn-rps: Basic net-next from Apr23 with rps mask ee and irq affinity to cpu0
nn-cl: Basic net-next from Apr23 + Changli patch
nn-cl-rps: Basic net-next from Apr23 + Changli patch + rps mask ee,irq aff cpu0
sink: the amount of traffic the system was able to sink in.
cpu all: avg % system cpu consumed in test
cpuint: avg %cpu consumed by the cpu where interrupts happened
cpuapp: avg %cpu consumed by a sample cpu which did app processing

Now repeat with Erics changes and kernel from Apr-28

kernel         sink      cpu all     cpuint       cpuapp
---------------------------------------------------------
nn2              98.78%   83.6%        100.0%       82.8%
nn2-rps          94.43%   84.2%        98.1%        82.0%
nn2-ed           98.74%   83.2%        99.9%        81.6%
nn2-ed-rps       95.15%   84.5%        97.3%        82.1%


nn2: Basic net-next from Apr28
nn2-rps: Basic net-next from Apr28 with rps mask ee and irq affinity to cpu0
nn2-ed: Basic net-next from Apr28 + Eric patch
nn2-ed-rps: Basic net-next from Apr28 + Eric patch + rps mask ee,irq aff cpu0

[-- Attachment #3: nn-apr28-summary.txt --]
[-- Type: text/plain, Size: 78977 bytes --]


I: net-next

Average udp sink: 98.78%

--------------------------------------------------------------------------------------------------
   PerfTop:    3632 irqs/sec  kernel:83.7% [1000Hz cycles],  (all, 8 CPUs)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

             2738.00  9.8% sky2_poll                   [sky2]              
             1543.00  5.5% _raw_spin_lock_irqsave      [kernel]            
             1019.00  3.7% system_call                 [kernel]            
              740.00  2.7% copy_user_generic_string    [kernel]            
              687.00  2.5% fget                        [kernel]            
              640.00  2.3% _raw_spin_unlock_irqrestore [kernel]            
              634.00  2.3% sys_epoll_ctl               [kernel]            
              613.00  2.2% datagram_poll               [kernel]            
              553.00  2.0% _raw_spin_lock_bh           [kernel]            
              530.00  1.9% kmem_cache_free             [kernel]            
              522.00  1.9% schedule                    [kernel]            
              487.00  1.7% vread_tsc                   [kernel].vsyscall_fn
              467.00  1.7% _raw_spin_lock              [kernel]            
              432.00  1.5% udp_recvmsg                 [kernel]            
              426.00  1.5% kmem_cache_alloc            [kernel]            
              418.00  1.5% __udp4_lib_lookup           [kernel]            
              417.00  1.5% sys_epoll_wait              [kernel]            
              376.00  1.3% fput                        [kernel]            
              361.00  1.3% ip_route_input              [kernel]            
              344.00  1.2% local_bh_enable_ip          [kernel]            
              326.00  1.2% ip_rcv                      [kernel]            
              321.00  1.2% first_packet_length         [kernel]            
              307.00  1.1% ep_remove                   [kernel]            
              303.00  1.1% dst_release                 [kernel]            
              301.00  1.1% skb_copy_datagram_iovec     [kernel]            
              297.00  1.1% mutex_lock                  [kernel]            



--------------------------------------------------------------------------------------------------
   PerfTop:    4018 irqs/sec  kernel:83.3% [1000Hz cycles],  (all, 8 CPUs)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ______________________

             4274.00  9.7% sky2_poll                   [sky2]                
             2473.00  5.6% _raw_spin_lock_irqsave      [kernel]              
             1585.00  3.6% system_call                 [kernel]              
             1179.00  2.7% copy_user_generic_string    [kernel]              
             1089.00  2.5% fget                        [kernel]              
             1019.00  2.3% _raw_spin_unlock_irqrestore [kernel]              
             1011.00  2.3% sys_epoll_ctl               [kernel]              
              965.00  2.2% datagram_poll               [kernel]              
              902.00  2.0% kmem_cache_free             [kernel]              
              841.00  1.9% _raw_spin_lock_bh           [kernel]              
              837.00  1.9% schedule                    [kernel]              
              735.00  1.7% vread_tsc                   [kernel].vsyscall_fn  
              730.00  1.7% udp_recvmsg                 [kernel]              
              729.00  1.7% _raw_spin_lock              [kernel]              
              678.00  1.5% kmem_cache_alloc            [kernel]              
              651.00  1.5% sys_epoll_wait              [kernel]              
              635.00  1.4% __udp4_lib_lookup           [kernel]              
              595.00  1.3% fput                        [kernel]              
              568.00  1.3% local_bh_enable_ip          [kernel]              
              562.00  1.3% ip_route_input              [kernel]              
              516.00  1.2% dst_release                 [kernel]              
              502.00  1.1% ep_remove                   [kernel]              
              485.00  1.1% skb_copy_datagram_iovec     [kernel]              
              484.00  1.1% first_packet_length         [kernel]              
              476.00  1.1% ip_rcv                      [kernel]              
              470.00  1.1% __alloc_skb                 [kernel]              
              459.00  1.0% epoll_ctl                   /lib/libc-2.7.so      
              458.00  1.0% mutex_lock                  [kernel]              


--------------------------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             3534.00 34.7% sky2_poll                   [sky2]  
              545.00  5.3% __udp4_lib_lookup           [kernel]
              537.00  5.3% ip_route_input              [kernel]
              427.00  4.2% _raw_spin_lock_irqsave      [kernel]
              401.00  3.9% __alloc_skb                 [kernel]
              360.00  3.5% ip_rcv                      [kernel]
              332.00  3.3% _raw_spin_lock              [kernel]
              292.00  2.9% sock_queue_rcv_skb          [kernel]
              291.00  2.9% __udp4_lib_rcv              [kernel]
              273.00  2.7% sock_def_readable           [kernel]
              269.00  2.6% __netif_receive_skb         [kernel]
              209.00  2.1% __wake_up_common            [kernel]
              196.00  1.9% __kmalloc                   [kernel]
              164.00  1.6% _raw_read_lock              [kernel]
              157.00  1.5% kmem_cache_alloc            [kernel]
              157.00  1.5% ep_poll_callback            [kernel]
              133.00  1.3% resched_task                [kernel]
              128.00  1.3% task_rq_lock                [kernel]
              120.00  1.2% swiotlb_sync_single         [kernel]
              120.00  1.2% sky2_rx_submit              [sky2]  
              117.00  1.1% udp_queue_rcv_skb           [kernel]
              108.00  1.1% ip_local_deliver            [kernel]
              104.00  1.0% try_to_wake_up              [kernel]
              102.00  1.0% _raw_spin_unlock_irqrestore [kernel]
               98.00  1.0% select_task_rq_fair         [kernel]



--------------------------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             4601.00 34.0% sky2_poll                   [sky2]  
              732.00  5.4% __udp4_lib_lookup           [kernel]
              724.00  5.3% ip_route_input              [kernel]
              527.00  3.9% _raw_spin_lock_irqsave      [kernel]
              520.00  3.8% __alloc_skb                 [kernel]
              483.00  3.6% ip_rcv                      [kernel]
              441.00  3.3% _raw_spin_lock              [kernel]
              401.00  3.0% sock_queue_rcv_skb          [kernel]
              373.00  2.8% __udp4_lib_rcv              [kernel]
              365.00  2.7% sock_def_readable           [kernel]
              353.00  2.6% __netif_receive_skb         [kernel]
              285.00  2.1% __wake_up_common            [kernel]
              273.00  2.0% __kmalloc                   [kernel]
              230.00  1.7% _raw_read_lock              [kernel]
              208.00  1.5% ep_poll_callback            [kernel]
              199.00  1.5% kmem_cache_alloc            [kernel]
              180.00  1.3% task_rq_lock                [kernel]
              172.00  1.3% sky2_rx_submit              [sky2]  
              171.00  1.3% resched_task                [kernel]
              165.00  1.2% ip_local_deliver            [kernel]
              162.00  1.2% udp_queue_rcv_skb           [kernel]
              158.00  1.2% _raw_spin_unlock_irqrestore [kernel]
              148.00  1.1% select_task_rq_fair         [kernel]
              144.00  1.1% try_to_wake_up              [kernel]
              142.00  1.0% sky2_remove                 [sky2]  
              140.00  1.0% swiotlb_sync_single         [kernel]
               95.00  0.7% cache_alloc_refill          [kernel]
               92.00  0.7% dev_gro_receive             [kernel]
               82.00  0.6% is_swiotlb_buffer           [kernel]


--------------------------------------------------------------------------------------------------
   PerfTop:     622 irqs/sec  kernel:74.9% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ _____________________________________

              113.00  6.5% _raw_spin_lock_irqsave      /lib/modules/2.6.34-rc5/build/vmlinux
              105.00  6.0% system_call                 /lib/modules/2.6.34-rc5/build/vmlinux
               69.00  3.9% fget                        /lib/modules/2.6.34-rc5/build/vmlinux
               64.00  3.7% datagram_poll               /lib/modules/2.6.34-rc5/build/vmlinux
               56.00  3.2% copy_user_generic_string    /lib/modules/2.6.34-rc5/build/vmlinux
               55.00  3.1% sys_epoll_ctl               /lib/modules/2.6.34-rc5/build/vmlinux
               53.00  3.0% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux
               46.00  2.6% _raw_spin_lock_bh           /lib/modules/2.6.34-rc5/build/vmlinux
               42.00  2.4% kmem_cache_free             /lib/modules/2.6.34-rc5/build/vmlinux
               37.00  2.1% dst_release                 /lib/modules/2.6.34-rc5/build/vmlinux
               37.00  2.1% schedule                    /lib/modules/2.6.34-rc5/build/vmlinux
               35.00  2.0% mutex_lock                  /lib/modules/2.6.34-rc5/build/vmlinux
               35.00  2.0% vread_tsc                   [kernel].vsyscall_fn                 
               35.00  2.0% udp_recvmsg                 /lib/modules/2.6.34-rc5/build/vmlinux
               34.00  1.9% sys_epoll_wait              /lib/modules/2.6.34-rc5/build/vmlinux
               31.00  1.8% local_bh_enable_ip          /lib/modules/2.6.34-rc5/build/vmlinux
               29.00  1.7% ep_remove                   /lib/modules/2.6.34-rc5/build/vmlinux
               28.00  1.6% kmem_cache_alloc            /lib/modules/2.6.34-rc5/build/vmlinux
               27.00  1.5% process_recv                /home/hadi/udp_sink/mcpudp           
               25.00  1.4% mutex_unlock                /lib/modules/2.6.34-rc5/build/vmlinux
               24.00  1.4% ep_send_events_proc         /lib/modules/2.6.34-rc5/build/vmlinux
               24.00  1.4% clock_gettime               /lib/librt-2.7.so                    
               23.00  1.3% fput                        /lib/modules/2.6.34-rc5/build/vmlinux
               23.00  1.3% skb_copy_datagram_iovec     /lib/modules/2.6.34-rc5/build/vmlinux
               20.00  1.1% sock_recv_ts_and_drops      /lib/modules/2.6.34-rc5/build/vmlinux
               20.00  1.1% inet_recvmsg                /lib/modules/2.6.34-rc5/build/vmlinux
               19.00  1.1% epoll_dispatch              /usr/lib/libevent-1.3e.so.1.0.3      
               19.00  1.1% first_packet_length         /lib/modules/2.6.34-rc5/build/vmlinux



--------------------------------------------------------------------------------------------------
   PerfTop:     625 irqs/sec  kernel:83.0% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ _____________________________________

              315.00  6.8% _raw_spin_lock_irqsave      /lib/modules/2.6.34-rc5/build/vmlinux
              232.00  5.0% system_call                 /lib/modules/2.6.34-rc5/build/vmlinux
              175.00  3.8% fget                        /lib/modules/2.6.34-rc5/build/vmlinux
              174.00  3.8% datagram_poll               /lib/modules/2.6.34-rc5/build/vmlinux
              168.00  3.6% sys_epoll_ctl               /lib/modules/2.6.34-rc5/build/vmlinux
              155.00  3.4% copy_user_generic_string    /lib/modules/2.6.34-rc5/build/vmlinux
              144.00  3.1% kmem_cache_free             /lib/modules/2.6.34-rc5/build/vmlinux
              133.00  2.9% _raw_spin_lock_bh           /lib/modules/2.6.34-rc5/build/vmlinux
              126.00  2.7% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux
              113.00  2.4% vread_tsc                   [kernel].vsyscall_fn                 
              110.00  2.4% _raw_spin_unlock_bh         /lib/modules/2.6.34-rc5/build/vmlinux
              106.00  2.3% schedule                    /lib/modules/2.6.34-rc5/build/vmlinux
              103.00  2.2% local_bh_enable_ip          /lib/modules/2.6.34-rc5/build/vmlinux
              101.00  2.2% udp_recvmsg                 /lib/modules/2.6.34-rc5/build/vmlinux
               97.00  2.1% sys_epoll_wait              /lib/modules/2.6.34-rc5/build/vmlinux
               84.00  1.8% dst_release                 /lib/modules/2.6.34-rc5/build/vmlinux
               78.00  1.7% fput                        /lib/modules/2.6.34-rc5/build/vmlinux
               75.00  1.6% first_packet_length         /lib/modules/2.6.34-rc5/build/vmlinux
               74.00  1.6% kmem_cache_alloc            /lib/modules/2.6.34-rc5/build/vmlinux
               71.00  1.5% ep_remove                   /lib/modules/2.6.34-rc5/build/vmlinux
               69.00  1.5% epoll_ctl                   /lib/libc-2.7.so                     
               67.00  1.5% mutex_lock                  /lib/modules/2.6.34-rc5/build/vmlinux
               65.00  1.4% sock_recv_ts_and_drops      /lib/modules/2.6.34-rc5/build/vmlinux
               65.00  1.4% inet_recvmsg                /lib/modules/2.6.34-rc5/build/vmlinux
               64.00  1.4% process_recv                /home/hadi/udp_sink/mcpudp           
               62.00  1.3% skb_copy_datagram_iovec     /lib/modules/2.6.34-rc5/build/vmlinux
               60.00  1.3% clock_gettime               /lib/librt-2.7.so                    


--------------------------------------------------------------------------------------------------
   PerfTop:     700 irqs/sec  kernel:84.3% [1000Hz cycles],  (all, cpu: 2)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ _____________________________________

              489.00  6.4% _raw_spin_lock_irqsave      /lib/modules/2.6.34-rc5/build/vmlinux
              376.00  4.9% system_call                 /lib/modules/2.6.34-rc5/build/vmlinux
              308.00  4.0% fget                        /lib/modules/2.6.34-rc5/build/vmlinux
              302.00  3.9% copy_user_generic_string    /lib/modules/2.6.34-rc5/build/vmlinux
              280.00  3.6% sys_epoll_ctl               /lib/modules/2.6.34-rc5/build/vmlinux
              274.00  3.6% datagram_poll               /lib/modules/2.6.34-rc5/build/vmlinux
              249.00  3.2% kmem_cache_free             /lib/modules/2.6.34-rc5/build/vmlinux
              223.00  2.9% _raw_spin_unlock_irqrestore /lib/modules/2.6.34-rc5/build/vmlinux
              221.00  2.9% _raw_spin_unlock_bh         /lib/modules/2.6.34-rc5/build/vmlinux
              221.00  2.9% local_bh_enable_ip          /lib/modules/2.6.34-rc5/build/vmlinux
              208.00  2.7% vread_tsc                   [kernel].vsyscall_fn                 
              200.00  2.6% _raw_spin_lock_bh           /lib/modules/2.6.34-rc5/build/vmlinux
              191.00  2.5% schedule                    /lib/modules/2.6.34-rc5/build/vmlinux
              188.00  2.4% sys_epoll_wait              /lib/modules/2.6.34-rc5/build/vmlinux
              177.00  2.3% udp_recvmsg                 /lib/modules/2.6.34-rc5/build/vmlinux
              141.00  1.8% fput                        /lib/modules/2.6.34-rc5/build/vmlinux
              140.00  1.8% first_packet_length         /lib/modules/2.6.34-rc5/build/vmlinux
              128.00  1.7% kmem_cache_alloc            /lib/modules/2.6.34-rc5/build/vmlinux
              119.00  1.5% dst_release                 /lib/modules/2.6.34-rc5/build/vmlinux
              105.00  1.4% ep_remove                   /lib/modules/2.6.34-rc5/build/vmlinux
              104.00  1.4% epoll_ctl                   /lib/libc-2.7.so                     
              102.00  1.3% skb_copy_datagram_iovec     /lib/modules/2.6.34-rc5/build/vmlinux
              100.00  1.3% mutex_lock                  /lib/modules/2.6.34-rc5/build/vmlinux
               95.00  1.2% mutex_unlock                /lib/modules/2.6.34-rc5/build/vmlinux
               94.00  1.2% sock_recv_ts_and_drops      /lib/modules/2.6.34-rc5/build/vmlinux
               92.00  1.2% ep_send_events_proc         /lib/modules/2.6.34-rc5/build/vmlinux
               92.00  1.2% clock_gettime               /lib/librt-2.7.so                    
               92.00  1.2% __skb_recv_datagram         /lib/modules/2.6.34-rc5/build/vmlinux
               91.00  1.2% process_recv                /home/hadi/udp_sink/mcpudp           
               88.00  1.1% kfree                       /lib/modules/2.6.34-rc5/build/vmlinux
               86.00  1.1% _raw_spin_lock              /lib/modules/2.6.34-rc5/build/vmlinux



II: net-next with rps = ee

94.43%
--------------



--------------------------------------------------------------------------------------------------
   PerfTop:    4328 irqs/sec  kernel:84.0% [1000Hz cycles],  (all, 8 CPUs)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ______________________

             3908.00 17.1% sky2_poll                      [sky2]                
              694.00  3.0% _raw_spin_lock_irqsave         [kernel]              
              584.00  2.6% sky2_intr                      [sky2]                
              557.00  2.4% system_call                    [kernel]              
              490.00  2.1% _raw_spin_unlock_irqrestore    [kernel]              
              488.00  2.1% fget                           [kernel]              
              425.00  1.9% ip_rcv                         [kernel]              
              405.00  1.8% sys_epoll_ctl                  [kernel]              
              398.00  1.7% __netif_receive_skb            [kernel]              
              375.00  1.6% _raw_spin_lock                 [kernel]              
              365.00  1.6% copy_user_generic_string       [kernel]              
              363.00  1.6% ip_route_input                 [kernel]              
              350.00  1.5% kmem_cache_free                [kernel]              
              346.00  1.5% schedule                       [kernel]              
              319.00  1.4% call_function_single_interrupt [kernel]              
              295.00  1.3% vread_tsc                      [kernel].vsyscall_fn  
              270.00  1.2% __udp4_lib_lookup              [kernel]              
              264.00  1.2% kmem_cache_alloc               [kernel]              
              235.00  1.0% fput                           [kernel]              
              219.00  1.0% datagram_poll                  [kernel]              


--------------------------------------------------------------------------------------------------
   PerfTop:    3791 irqs/sec  kernel:84.4% [1000Hz cycles],  (all, 8 CPUs)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ______________________

             6274.00 17.2% sky2_poll                      [sky2]                
             1139.00  3.1% _raw_spin_lock_irqsave         [kernel]              
              953.00  2.6% system_call                    [kernel]              
              942.00  2.6% sky2_intr                      [sky2]                
              785.00  2.2% _raw_spin_unlock_irqrestore    [kernel]              
              745.00  2.0% fget                           [kernel]              
              695.00  1.9% ip_rcv                         [kernel]              
              653.00  1.8% sys_epoll_ctl                  [kernel]              
              609.00  1.7% ip_route_input                 [kernel]              
              606.00  1.7% __netif_receive_skb            [kernel]              
              583.00  1.6% _raw_spin_lock                 [kernel]              
              569.00  1.6% kmem_cache_free                [kernel]              
              564.00  1.5% copy_user_generic_string       [kernel]              
              554.00  1.5% schedule                       [kernel]              
              510.00  1.4% call_function_single_interrupt [kernel]              
              488.00  1.3% vread_tsc                      [kernel].vsyscall_fn  
              459.00  1.3% kmem_cache_alloc               [kernel]              
              417.00  1.1% __udp4_lib_lookup              [kernel]              
              387.00  1.1% fput                           [kernel]              
              358.00  1.0% __udp4_lib_rcv                 [kernel]              
              347.00  1.0% event_base_loop                libevent-1.3e.so.1.0.3

-----------------------------------------------------------------------------------------------
   PerfTop:     997 irqs/sec  kernel:98.2% [1000Hz cycles],  (all, cpu: 0)
-----------------------------------------------------------------------------------------------

             samples  pcnt function                            DSO
             _______ _____ ___________________________________ ________

             3926.00 61.0% sky2_poll                           [sky2]  
              671.00 10.4% sky2_intr                           [sky2]  
              192.00  3.0% __alloc_skb                         [kernel]
              126.00  2.0% get_rps_cpu                         [kernel]
              111.00  1.7% __kmalloc                           [kernel]
               97.00  1.5% enqueue_to_backlog                  [kernel]
               95.00  1.5% _raw_spin_lock_irqsave              [kernel]
               93.00  1.4% _raw_spin_lock                      [kernel]
               79.00  1.2% kmem_cache_alloc                    [kernel]
               63.00  1.0% sky2_rx_submit                      [sky2]  

-----------------------------------------------------------------------------------------------
   PerfTop:     980 irqs/sec  kernel:98.0% [1000Hz cycles],  (all, cpu: 0)
-----------------------------------------------------------------------------------------------

             samples  pcnt function                            DSO
             _______ _____ ___________________________________ ____________________

             6945.00 61.4% sky2_poll                           [sky2]              
             1219.00 10.8% sky2_intr                           [sky2]              
              323.00  2.9% __alloc_skb                         [kernel]            
              243.00  2.1% get_rps_cpu                         [kernel]            
              195.00  1.7% __kmalloc                           [kernel]            
              161.00  1.4% _raw_spin_lock_irqsave              [kernel]            
              149.00  1.3% enqueue_to_backlog                  [kernel]            
              139.00  1.2% _raw_spin_lock                      [kernel]            
              136.00  1.2% kmem_cache_alloc                    [kernel]            
              135.00  1.2% irq_entries_start                   [kernel]            
              108.00  1.0% sky2_rx_submit                      [sky2]              


-----------------------------------------------------------------------------------------------
   PerfTop:     458 irqs/sec  kernel:80.8% [1000Hz cycles],  (all, cpu: 2)
-----------------------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _____________________________________

              130.00  4.7% _raw_spin_lock_irqsave         /lib/modules/2.6.34-rc5/build/vmlinux
              114.00  4.1% system_call                    /lib/modules/2.6.34-rc5/build/vmlinux
               91.00  3.3% ip_rcv                         /lib/modules/2.6.34-rc5/build/vmlinux
               82.00  3.0% _raw_spin_unlock_irqrestore    /lib/modules/2.6.34-rc5/build/vmlinux
               74.00  2.7% call_function_single_interrupt /lib/modules/2.6.34-rc5/build/vmlinux
               74.00  2.7% fget                           /lib/modules/2.6.34-rc5/build/vmlinux
               71.00  2.6% __netif_receive_skb            /lib/modules/2.6.34-rc5/build/vmlinux
               69.00  2.5% ip_route_input                 /lib/modules/2.6.34-rc5/build/vmlinux
               66.00  2.4% schedule                       /lib/modules/2.6.34-rc5/build/vmlinux
               63.00  2.3% kmem_cache_free                /lib/modules/2.6.34-rc5/build/vmlinux
               61.00  2.2% sys_epoll_ctl                  /lib/modules/2.6.34-rc5/build/vmlinux
               61.00  2.2% __udp4_lib_lookup              /lib/modules/2.6.34-rc5/build/vmlinux
               57.00  2.1% copy_user_generic_string       /lib/modules/2.6.34-rc5/build/vmlinux
               49.00  1.8% vread_tsc                      [kernel].vsyscall_fn                 
               49.00  1.8% _raw_spin_lock                 /lib/modules/2.6.34-rc5/build/vmlinux
               47.00  1.7% ep_remove                      /lib/modules/2.6.34-rc5/build/vmlinux
               45.00  1.6% fput                           /lib/modules/2.6.34-rc5/build/vmlinux
               44.00  1.6% sys_epoll_wait                 /lib/modules/2.6.34-rc5/build/vmlinux
               40.00  1.4% kmem_cache_alloc               /lib/modules/2.6.34-rc5/build/vmlinux
               40.00  1.4% local_bh_enable_ip             /lib/modules/2.6.34-rc5/build/vmlinux
               38.00  1.4% sock_recv_ts_and_drops         /lib/modules/2.6.34-rc5/build/vmlinux
               35.00  1.3% process_recv                   /home/hadi/udp_sink/mcpudp           
               34.00  1.2% mutex_unlock                   /lib/modules/2.6.34-rc5/build/vmlinux
               31.00  1.1% _raw_spin_unlock_bh            /lib/modules/2.6.34-rc5/build/vmlinux
               31.00  1.1% event_base_loop                /usr/lib/libevent-1.3e.so.1.0.3      


-----------------------------------------------------------------------------------------------
   PerfTop:     552 irqs/sec  kernel:82.4% [1000Hz cycles],  (all, cpu: 2)
-----------------------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _____________________________________

              204.00  4.7% _raw_spin_lock_irqsave         /lib/modules/2.6.34-rc5/build/vmlinux
              169.00  3.9% system_call                    /lib/modules/2.6.34-rc5/build/vmlinux
              151.00  3.5% _raw_spin_unlock_irqrestore    /lib/modules/2.6.34-rc5/build/vmlinux
              132.00  3.0% ip_rcv                         /lib/modules/2.6.34-rc5/build/vmlinux
              129.00  3.0% fget                           /lib/modules/2.6.34-rc5/build/vmlinux
              123.00  2.8% __netif_receive_skb            /lib/modules/2.6.34-rc5/build/vmlinux
              115.00  2.6% ip_route_input                 /lib/modules/2.6.34-rc5/build/vmlinux
              112.00  2.6% call_function_single_interrupt /lib/modules/2.6.34-rc5/build/vmlinux
              112.00  2.6% sys_epoll_ctl                  /lib/modules/2.6.34-rc5/build/vmlinux
              103.00  2.4% schedule                       /lib/modules/2.6.34-rc5/build/vmlinux
               94.00  2.2% kmem_cache_free                /lib/modules/2.6.34-rc5/build/vmlinux
               89.00  2.0% copy_user_generic_string       /lib/modules/2.6.34-rc5/build/vmlinux
               86.00  2.0% _raw_spin_lock                 /lib/modules/2.6.34-rc5/build/vmlinux
               83.00  1.9% __udp4_lib_lookup              /lib/modules/2.6.34-rc5/build/vmlinux
               76.00  1.7% vread_tsc                      [kernel].vsyscall_fn                 
               68.00  1.6% ep_remove                      /lib/modules/2.6.34-rc5/build/vmlinux
               67.00  1.5% fput                           /lib/modules/2.6.34-rc5/build/vmlinux
               64.00  1.5% kmem_cache_alloc               /lib/modules/2.6.34-rc5/build/vmlinux
               62.00  1.4% sys_epoll_wait                 /lib/modules/2.6.34-rc5/build/vmlinux
               60.00  1.4% dst_release                    /lib/modules/2.6.34-rc5/build/vmlinux
               60.00  1.4% sock_recv_ts_and_drops         /lib/modules/2.6.34-rc5/build/vmlinux
               56.00  1.3% _raw_spin_lock_bh              /lib/modules/2.6.34-rc5/build/vmlinux
               53.00  1.2% event_base_loop                /usr/lib/libevent-1.3e.so.1.0.3      
               51.00  1.2% datagram_poll                  /lib/modules/2.6.34-rc5/build/vmlinux
               48.00  1.1% epoll_ctl                      /lib/libc-2.7.so                     
               48.00  1.1% kfree                          /lib/modules/2.6.34-rc5/build/vmlinux
               47.00  1.1% _raw_spin_unlock_bh            /lib/modules/2.6.34-rc5/build/vmlinux
               47.00  1.1% mutex_unlock                   /lib/modules/2.6.34-rc5/build/vmlinux
               45.00  1.0% __udp4_lib_rcv                 /lib/modules/2.6.34-rc5/build/vmlinux
               45.00  1.0% tick_nohz_stop_sched_tick      /lib/modules/2.6.34-rc5/build/vmlinux

-----------------------------------------------------------------------------------------------
   PerfTop:     408 irqs/sec  kernel:82.1% [1000Hz cycles],  (all, cpu: 2)
-----------------------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _____________________________________

              240.00  4.8% _raw_spin_lock_irqsave         /lib/modules/2.6.34-rc5/build/vmlinux
              200.00  4.0% system_call                    /lib/modules/2.6.34-rc5/build/vmlinux
              165.00  3.3% _raw_spin_unlock_irqrestore    /lib/modules/2.6.34-rc5/build/vmlinux
              161.00  3.2% ip_rcv                         /lib/modules/2.6.34-rc5/build/vmlinux
              158.00  3.1% fget                           /lib/modules/2.6.34-rc5/build/vmlinux
              150.00  3.0% sys_epoll_ctl                  /lib/modules/2.6.34-rc5/build/vmlinux
              135.00  2.7% __netif_receive_skb            /lib/modules/2.6.34-rc5/build/vmlinux
              122.00  2.4% ip_route_input                 /lib/modules/2.6.34-rc5/build/vmlinux
              117.00  2.3% call_function_single_interrupt /lib/modules/2.6.34-rc5/build/vmlinux
              114.00  2.3% schedule                       /lib/modules/2.6.34-rc5/build/vmlinux
              110.00  2.2% _raw_spin_lock                 /lib/modules/2.6.34-rc5/build/vmlinux
              108.00  2.1% copy_user_generic_string       /lib/modules/2.6.34-rc5/build/vmlinux
              101.00  2.0% kmem_cache_free                /lib/modules/2.6.34-rc5/build/vmlinux
               94.00  1.9% vread_tsc                      [kernel].vsyscall_fn                 
               90.00  1.8% __udp4_lib_lookup              /lib/modules/2.6.34-rc5/build/vmlinux
               85.00  1.7% fput                           /lib/modules/2.6.34-rc5/build/vmlinux
               78.00  1.5% dst_release                    /lib/modules/2.6.34-rc5/build/vmlinux
               77.00  1.5% ep_remove                      /lib/modules/2.6.34-rc5/build/vmlinux
               75.00  1.5% kmem_cache_alloc               /lib/modules/2.6.34-rc5/build/vmlinux
               74.00  1.5% _raw_spin_lock_bh              /lib/modules/2.6.34-rc5/build/vmlinux
               69.00  1.4% sys_epoll_wait                 /lib/modules/2.6.34-rc5/build/vmlinux
               68.00  1.3% event_base_loop                /usr/lib/libevent-1.3e.so.1.0.3      
               68.00  1.3% sock_recv_ts_and_drops         /lib/modules/2.6.34-rc5/build/vmlinux
               62.00  1.2% _raw_spin_unlock_bh            /lib/modules/2.6.34-rc5/build/vmlinux
               62.00  1.2% datagram_poll                  /lib/modules/2.6.34-rc5/build/vmlinux
               55.00  1.1% epoll_ctl                      /lib/libc-2.7.so                     
               53.00  1.1% local_bh_enable_ip             /lib/modules/2.6.34-rc5/build/vmlinux
               53.00  1.1% tick_nohz_stop_sched_tick      /lib/modules/2.6.34-rc5/build/vmlinux
               52.00  1.0% mutex_unlock                   /lib/modules/2.6.34-rc5/build/vmlinux

-----------------------------------------------------------------------------------------------
   PerfTop:     440 irqs/sec  kernel:85.0% [1000Hz cycles],  (all, cpu: 2)
-----------------------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _____________________________________

              226.00  4.6% _raw_spin_lock_irqsave         /lib/modules/2.6.34-rc5/build/vmlinux
              213.00  4.3% system_call                    /lib/modules/2.6.34-rc5/build/vmlinux
              154.00  3.1% _raw_spin_unlock_irqrestore    /lib/modules/2.6.34-rc5/build/vmlinux
              148.00  3.0% ip_rcv                         /lib/modules/2.6.34-rc5/build/vmlinux
              143.00  2.9% fget                           /lib/modules/2.6.34-rc5/build/vmlinux
              143.00  2.9% ip_route_input                 /lib/modules/2.6.34-rc5/build/vmlinux
              140.00  2.8% __netif_receive_skb            /lib/modules/2.6.34-rc5/build/vmlinux
              124.00  2.5% call_function_single_interrupt /lib/modules/2.6.34-rc5/build/vmlinux
              124.00  2.5% sys_epoll_ctl                  /lib/modules/2.6.34-rc5/build/vmlinux
              104.00  2.1% copy_user_generic_string       /lib/modules/2.6.34-rc5/build/vmlinux
              103.00  2.1% vread_tsc                      [kernel].vsyscall_fn                 
              101.00  2.0% schedule                       /lib/modules/2.6.34-rc5/build/vmlinux
              100.00  2.0% kmem_cache_free                /lib/modules/2.6.34-rc5/build/vmlinux
               99.00  2.0% _raw_spin_lock                 /lib/modules/2.6.34-rc5/build/vmlinux
               93.00  1.9% __udp4_lib_lookup              /lib/modules/2.6.34-rc5/build/vmlinux
               80.00  1.6% fput                           /lib/modules/2.6.34-rc5/build/vmlinux
               76.00  1.5% kmem_cache_alloc               /lib/modules/2.6.34-rc5/build/vmlinux
               75.00  1.5% sock_recv_ts_and_drops         /lib/modules/2.6.34-rc5/build/vmlinux
               73.00  1.5% dst_release                    /lib/modules/2.6.34-rc5/build/vmlinux
               70.00  1.4% sys_epoll_wait                 /lib/modules/2.6.34-rc5/build/vmlinux
               69.00  1.4% datagram_poll                  /lib/modules/2.6.34-rc5/build/vmlinux
               65.00  1.3% event_base_loop                /usr/lib/libevent-1.3e.so.1.0.3      
               65.00  1.3% ep_remove                      /lib/modules/2.6.34-rc5/build/vmlinux



III: Kernel compiled with Eric's patch, rps mask 00 (RPS disabled)

Avg udp packets sunk: 98.74%

-------------------------------------------------------------------------------
   PerfTop:    4202 irqs/sec  kernel:82.5% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ______________________

             1639.00  9.0% sky2_poll                   [sky2]                
             1051.00  5.8% _raw_spin_lock_irqsave      [kernel]              
              665.00  3.7% system_call                 [kernel]              
              578.00  3.2% fget                        [kernel]              
              476.00  2.6% _raw_spin_unlock_irqrestore [kernel]              
              457.00  2.5% copy_user_generic_string    [kernel]              
              427.00  2.4% sys_epoll_ctl               [kernel]              
              401.00  2.2% datagram_poll               [kernel]              
              391.00  2.2% kmem_cache_free             [kernel]              
              349.00  1.9% schedule                    [kernel]              
              339.00  1.9% vread_tsc                   [kernel].vsyscall_fn  
              323.00  1.8% udp_recvmsg                 [kernel]              
              292.00  1.6% kmem_cache_alloc            [kernel]              
              285.00  1.6% _raw_spin_lock              [kernel]              
              272.00  1.5% _raw_spin_lock_bh           [kernel]              
              268.00  1.5% sys_epoll_wait              [kernel]              
              260.00  1.4% fput                        [kernel]              
              234.00  1.3% ip_route_input              [kernel]              
              221.00  1.2% __udp4_lib_lookup           [kernel]              
              212.00  1.2% dst_release                 [kernel]              
              209.00  1.2% ip_rcv                      [kernel]              
              203.00  1.1% ep_remove                   [kernel]              
              202.00  1.1% first_packet_length         [kernel]              


-------------------------------------------------------------------------------
   PerfTop:    3999 irqs/sec  kernel:82.3% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ______________________

             3452.00  9.3% sky2_poll                   [sky2]                
             2212.00  5.9% _raw_spin_lock_irqsave      [kernel]              
             1350.00  3.6% system_call                 [kernel]              
             1187.00  3.2% fget                        [kernel]              
             1010.00  2.7% copy_user_generic_string    [kernel]              
              965.00  2.6% _raw_spin_unlock_irqrestore [kernel]              
              842.00  2.3% sys_epoll_ctl               [kernel]              
              833.00  2.2% datagram_poll               [kernel]              
              770.00  2.1% kmem_cache_free             [kernel]              
              710.00  1.9% vread_tsc                   [kernel].vsyscall_fn  
              688.00  1.8% schedule                    [kernel]              
              651.00  1.7% udp_recvmsg                 [kernel]              
              603.00  1.6% _raw_spin_lock_bh           [kernel]              
              599.00  1.6% _raw_spin_lock              [kernel]              
              597.00  1.6% sys_epoll_wait              [kernel]              
              594.00  1.6% kmem_cache_alloc            [kernel]              
              553.00  1.5% ip_route_input              [kernel]              
              528.00  1.4% fput                        [kernel]              
              496.00  1.3% __udp4_lib_lookup           [kernel]              
              444.00  1.2% dst_release                 [kernel]              
              433.00  1.2% ip_rcv                      [kernel]              
              408.00  1.1% first_packet_length         [kernel]              

-------------------------------------------------------------------------------
   PerfTop:    3765 irqs/sec  kernel:83.7% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ______________________

             4275.00  9.5% sky2_poll                   [sky2]                
             2684.00  6.0% _raw_spin_lock_irqsave      [kernel]              
             1654.00  3.7% system_call                 [kernel]              
             1447.00  3.2% fget                        [kernel]              
             1223.00  2.7% copy_user_generic_string    [kernel]              
             1146.00  2.5% _raw_spin_unlock_irqrestore [kernel]              
             1036.00  2.3% sys_epoll_ctl               [kernel]              
             1019.00  2.3% datagram_poll               [kernel]              
              974.00  2.2% kmem_cache_free             [kernel]              
              843.00  1.9% vread_tsc                   [kernel].vsyscall_fn  
              799.00  1.8% schedule                    [kernel]              
              761.00  1.7% udp_recvmsg                 [kernel]              
              736.00  1.6% kmem_cache_alloc            [kernel]              
              719.00  1.6% _raw_spin_lock_bh           [kernel]              
              716.00  1.6% _raw_spin_lock              [kernel]              
              696.00  1.5% sys_epoll_wait              [kernel]              
              680.00  1.5% ip_route_input              [kernel]              
              657.00  1.5% fput                        [kernel]              
              613.00  1.4% __udp4_lib_lookup           [kernel]              
              552.00  1.2% dst_release                 [kernel]              
              507.00  1.1% ip_rcv                      [kernel]            


-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:99.9% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

              669.00 32.2% sky2_poll                   [sky2]  
              128.00  6.2% ip_route_input              [kernel]
              106.00  5.1% ip_rcv                      [kernel]
              105.00  5.1% __udp4_lib_lookup           [kernel]
               86.00  4.1% _raw_spin_lock              [kernel]
               85.00  4.1% _raw_spin_lock_irqsave      [kernel]
               82.00  3.9% __alloc_skb                 [kernel]
               78.00  3.8% sock_queue_rcv_skb          [kernel]
               57.00  2.7% __netif_receive_skb         [kernel]
               53.00  2.6% __wake_up_common            [kernel]
               47.00  2.3% __udp4_lib_rcv              [kernel]
               42.00  2.0% sock_def_readable           [kernel]
               37.00  1.8% kmem_cache_alloc            [kernel]
               34.00  1.6% ep_poll_callback            [kernel]
               34.00  1.6% __kmalloc                   [kernel]
               34.00  1.6% select_task_rq_fair         [kernel]
               30.00  1.4% _raw_read_lock              [kernel]
               27.00  1.3% _raw_spin_unlock_irqrestore [kernel]
               24.00  1.2% sky2_rx_submit              [sky2]  
               22.00  1.1% udp_queue_rcv_skb           [kernel]
               21.00  1.0% try_to_wake_up              [kernel]


-------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             3061.00 31.9% sky2_poll                   [sky2]  
              529.00  5.5% ip_route_input              [kernel]
              518.00  5.4% __udp4_lib_lookup           [kernel]
              424.00  4.4% ip_rcv                      [kernel]
              390.00  4.1% _raw_spin_lock_irqsave      [kernel]
              389.00  4.1% __alloc_skb                 [kernel]
              365.00  3.8% _raw_spin_lock              [kernel]
              326.00  3.4% sock_queue_rcv_skb          [kernel]
              297.00  3.1% __netif_receive_skb         [kernel]
              273.00  2.8% __udp4_lib_rcv              [kernel]
              223.00  2.3% sock_def_readable           [kernel]
              205.00  2.1% __wake_up_common            [kernel]
              181.00  1.9% __kmalloc                   [kernel]
              151.00  1.6% kmem_cache_alloc            [kernel]
              147.00  1.5% _raw_read_lock              [kernel]
              143.00  1.5% ep_poll_callback            [kernel]
              136.00  1.4% sky2_rx_submit              [sky2]  
              123.00  1.3% task_rq_lock                [kernel]
              118.00  1.2% _raw_spin_unlock_irqrestore [kernel]
              114.00  1.2% select_task_rq_fair         [kernel]
              104.00  1.1% resched_task                [kernel]
              104.00  1.1% sky2_remove                 [sky2]  
              102.00  1.1% udp_queue_rcv_skb           [kernel]


-------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             3898.00 31.0% sky2_poll                   [sky2]  
              715.00  5.7% ip_route_input              [kernel]
              651.00  5.2% __udp4_lib_lookup           [kernel]
              576.00  4.6% ip_rcv                      [kernel]
              534.00  4.2% __alloc_skb                 [kernel]
              518.00  4.1% _raw_spin_lock_irqsave      [kernel]
              441.00  3.5% sock_queue_rcv_skb          [kernel]
              439.00  3.5% _raw_spin_lock              [kernel]
              396.00  3.1% __netif_receive_skb         [kernel]
              351.00  2.8% __udp4_lib_rcv              [kernel]
              300.00  2.4% sock_def_readable           [kernel]
              264.00  2.1% __wake_up_common            [kernel]
              260.00  2.1% __kmalloc                   [kernel]
              198.00  1.6% kmem_cache_alloc            [kernel]
              193.00  1.5% ep_poll_callback            [kernel]
              192.00  1.5% _raw_read_lock              [kernel]
              168.00  1.3% sky2_rx_submit              [sky2]  
              167.00  1.3% task_rq_lock                [kernel]
              153.00  1.2% udp_queue_rcv_skb           [kernel]
              149.00  1.2% _raw_spin_unlock_irqrestore [kernel]
              147.00  1.2% ip_local_deliver            [kernel]
              144.00  1.1% resched_task                [kernel]
              137.00  1.1% sky2_remove                 [sky2]  


-------------------------------------------------------------------------------
   PerfTop:     663 irqs/sec  kernel:81.9% [1000Hz cycles],  (all, cpu: 2)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

              129.00  7.0% _raw_spin_lock_irqsave      [kernel]            
               84.00  4.5% fget                        [kernel]            
               83.00  4.5% system_call                 [kernel]            
               82.00  4.4% copy_user_generic_string    [kernel]            
               67.00  3.6% _raw_spin_unlock_irqrestore [kernel]            
               63.00  3.4% datagram_poll               [kernel]            
               57.00  3.1% udp_recvmsg                 [kernel]            
               55.00  3.0% sys_epoll_ctl               [kernel]            
               55.00  3.0% vread_tsc                   [kernel].vsyscall_fn
               43.00  2.3% sys_epoll_wait              [kernel]            
               43.00  2.3% _raw_spin_lock_bh           [kernel]            
               41.00  2.2% first_packet_length         [kernel]            
               40.00  2.2% dst_release                 [kernel]            
               37.00  2.0% fput                        [kernel]            
               37.00  2.0% kmem_cache_free             [kernel]            
               36.00  1.9% mutex_unlock                [kernel]            
               35.00  1.9% schedule                    [kernel]            
               34.00  1.8% skb_copy_datagram_iovec     [kernel]            
               34.00  1.8% ep_remove                   [kernel]            
               29.00  1.6% mutex_lock                  [kernel]            
               29.00  1.6% _raw_spin_lock              [kernel]            
               28.00  1.5% __skb_recv_datagram         [kernel]            
               25.00  1.4% epoll_ctl                   /lib/libc-2.7.so    
               25.00  1.4% tick_nohz_stop_sched_tick   [kernel]            


-------------------------------------------------------------------------------
   PerfTop:     629 irqs/sec  kernel:81.1% [1000Hz cycles],  (all, cpu: 2)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ______________________

              351.00  7.9% _raw_spin_lock_irqsave      [kernel]              
              248.00  5.6% system_call                 [kernel]              
              219.00  5.0% fget                        [kernel]              
              194.00  4.4% copy_user_generic_string    [kernel]              
              184.00  4.2% datagram_poll               [kernel]              
              162.00  3.7% sys_epoll_ctl               [kernel]              
              159.00  3.6% _raw_spin_unlock_irqrestore [kernel]              
              129.00  2.9% udp_recvmsg                 [kernel]              
              129.00  2.9% kmem_cache_free             [kernel]              
              123.00  2.8% vread_tsc                   [kernel].vsyscall_fn  
              108.00  2.4% schedule                    [kernel]              
              107.00  2.4% _raw_spin_lock_bh           [kernel]              
              104.00  2.4% sys_epoll_wait              [kernel]              
              100.00  2.3% fput                        [kernel]              
               94.00  2.1% dst_release                 [kernel]              
               78.00  1.8% first_packet_length         [kernel]              
               73.00  1.7% ep_remove                   [kernel]              
               69.00  1.6% epoll_ctl                   /lib/libc-2.7.so      
               66.00  1.5% skb_copy_datagram_iovec     [kernel]              
               66.00  1.5% mutex_unlock                [kernel]              
               64.00  1.4% __skb_recv_datagram         [kernel]              
               64.00  1.4% mutex_lock                  [kernel]              
               57.00  1.3% sock_recv_ts_and_drops      [kernel]              
               51.00  1.2% kmem_cache_alloc            [kernel]              
               49.00  1.1% ep_send_events_proc         [kernel]              

-------------------------------------------------------------------------------
   PerfTop:     457 irqs/sec  kernel:72.0% [1000Hz cycles],  (all, cpu: 2)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ______________________

              411.00  7.8% _raw_spin_lock_irqsave      [kernel]              
              280.00  5.3% system_call                 [kernel]              
              269.00  5.1% fget                        [kernel]              
              239.00  4.5% copy_user_generic_string    [kernel]              
              232.00  4.4% datagram_poll               [kernel]              
              175.00  3.3% _raw_spin_unlock_irqrestore [kernel]              
              170.00  3.2% sys_epoll_ctl               [kernel]              
              169.00  3.2% kmem_cache_free             [kernel]              
              149.00  2.8% udp_recvmsg                 [kernel]              
              144.00  2.7% vread_tsc                   [kernel].vsyscall_fn  
              129.00  2.4% sys_epoll_wait              [kernel]              
              128.00  2.4% _raw_spin_lock_bh           [kernel]              
              115.00  2.2% fput                        [kernel]              
              112.00  2.1% schedule                    [kernel]              
              108.00  2.0% dst_release                 [kernel]              
               88.00  1.7% first_packet_length         [kernel]              
               86.00  1.6% ep_remove                   [kernel]              
               83.00  1.6% mutex_lock                  [kernel]              
               79.00  1.5% skb_copy_datagram_iovec     [kernel]              
               76.00  1.4% mutex_unlock                [kernel]              
               75.00  1.4% epoll_ctl                   /lib/libc-2.7.so      
               73.00  1.4% sock_recv_ts_and_drops      [kernel]              
               67.00  1.3% __skb_recv_datagram         [kernel]              
               65.00  1.2% tick_nohz_stop_sched_tick   [kernel]              


Interesting stuff; check the cache-miss contributions — wow, how low eth_type_trans is,
and yet we keep optimizing it!

-------------------------------------------------------------------------------
   PerfTop:    1021 irqs/sec  kernel:98.8% [1000Hz cache-misses],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                        DSO
             _______ _____ _______________________________ ________

             5271.00 77.8% sky2_poll                       [sky2]  
              706.00 10.4% kmem_cache_alloc                [kernel]
              154.00  2.3% dev_gro_receive                 [kernel]
              149.00  2.2% __napi_gro_receive              [kernel]
              128.00  1.9% napi_gro_receive                [kernel]
              106.00  1.6% __alloc_skb                     [kernel]
               57.00  0.8% eth_type_trans                  [kernel]
               45.00  0.7% skb_gro_reset_offset            [kernel]
               26.00  0.4% drain_array                     [kernel]
               23.00  0.3% perf_session__mmap_read_counter perf    
               10.00  0.1% cache_alloc_refill              [kernel]
                9.00  0.1% __netdev_alloc_skb              [kernel]
                9.00  0.1% event__preprocess_sample        perf    


-------------------------------------------------------------------------------
   PerfTop:     997 irqs/sec  kernel:100.0% [1000Hz cache-misses],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function             DSO
             _______ _____ ____________________ ________

             3019.00 79.4% sky2_poll            [sky2]  
              360.00  9.5% kmem_cache_alloc     [kernel]
               91.00  2.4% dev_gro_receive      [kernel]
               86.00  2.3% __alloc_skb          [kernel]
               83.00  2.2% __napi_gro_receive   [kernel]
               69.00  1.8% napi_gro_receive     [kernel]
               45.00  1.2% eth_type_trans       [kernel]
               25.00  0.7% skb_gro_reset_offset [kernel]
                9.00  0.2% __netdev_alloc_skb   [kernel]
                5.00  0.1% cache_alloc_refill   [kernel]
                5.00  0.1% skb_pull             [kernel]


-------------------------------------------------------------------------------
   PerfTop:     997 irqs/sec  kernel:100.0% [1000Hz cache-misses],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function             DSO
             _______ _____ ____________________ ________

             8887.00 79.8% sky2_poll            [sky2]  
             1138.00 10.2% kmem_cache_alloc     [kernel]
              273.00  2.5% __napi_gro_receive   [kernel]
              246.00  2.2% dev_gro_receive      [kernel]
              189.00  1.7% napi_gro_receive     [kernel]
              159.00  1.4% __alloc_skb          [kernel]
              119.00  1.1% eth_type_trans       [kernel]
               86.00  0.8% skb_gro_reset_offset [kernel]
               13.00  0.1% __netdev_alloc_skb   [kernel]
                8.00  0.1% skb_pull             [kernel]
                7.00  0.1% cache_alloc_refill   [kernel]


Not much going on in other cpus .. i.e hardly anything shows up in
the profile ..

IV: rps with ee and irq affinity to cpu0

Avg udp packets sunk: 95.15%


-------------------------------------------------------------------------------
   PerfTop:    3558 irqs/sec  kernel:84.6% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ______________________

             3096.00 17.1% sky2_poll                     [sky2]                
              645.00  3.6% _raw_spin_lock_irqsave        [kernel]              
              493.00  2.7% system_call                   [kernel]              
              462.00  2.6% sky2_intr                     [sky2]                
              416.00  2.3% _raw_spin_unlock_irqrestore   [kernel]              
              382.00  2.1% fget                          [kernel]              
              361.00  2.0% __netif_receive_skb           [kernel]              
              342.00  1.9% ip_rcv                        [kernel]              
              334.00  1.8% _raw_spin_lock                [kernel]              
              320.00  1.8% sys_epoll_ctl                 [kernel]              
              298.00  1.6% copy_user_generic_string      [kernel]              
              288.00  1.6% call_function_single_interrup [kernel]              
              277.00  1.5% load_balance                  [kernel]              
              271.00  1.5% ip_route_input                [kernel]              
              270.00  1.5% vread_tsc                     [kernel].vsyscall_fn  
              256.00  1.4% kmem_cache_free               [kernel]              
              222.00  1.2% __udp4_lib_lookup             [kernel]              
              222.00  1.2% schedule                      [kernel]              
              194.00  1.1% fput                          [kernel]              
              189.00  1.0% kmem_cache_alloc              [kernel]              
              171.00  0.9% sys_epoll_wait                [kernel]              
              164.00  0.9% ep_remove                     [kernel]          

-------------------------------------------------------------------------------
   PerfTop:    3452 irqs/sec  kernel:84.3% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ______________________

             5033.00 16.2% sky2_poll                     [sky2]                
             1147.00  3.7% _raw_spin_lock_irqsave        [kernel]              
              888.00  2.9% system_call                   [kernel]              
              774.00  2.5% sky2_intr                     [sky2]                
              757.00  2.4% _raw_spin_unlock_irqrestore   [kernel]              
              702.00  2.3% fget                          [kernel]              
              630.00  2.0% __netif_receive_skb           [kernel]              
              609.00  2.0% _raw_spin_lock                [kernel]              
              607.00  2.0% ip_rcv                        [kernel]              
              553.00  1.8% sys_epoll_ctl                 [kernel]              
              514.00  1.7% ip_route_input                [kernel]              
              508.00  1.6% call_function_single_interrup [kernel]              
              504.00  1.6% copy_user_generic_string      [kernel]              
              466.00  1.5% kmem_cache_free               [kernel]              
              452.00  1.5% schedule                      [kernel]              
              450.00  1.4% vread_tsc                     [kernel].vsyscall_fn  
              390.00  1.3% load_balance                  [kernel]              
              377.00  1.2% fput                          [kernel]              
              364.00  1.2% __udp4_lib_lookup             [kernel]              
              329.00  1.1% kmem_cache_alloc              [kernel]              
              314.00  1.0% ep_remove                     [kernel]              
              289.00  0.9% dst_release                   [kernel]              
              276.00  0.9% sys_epoll_wait                [kernel]              
              265.00  0.9% datagram_poll                 [kernel]              

-------------------------------------------------------------------------------
   PerfTop:    3328 irqs/sec  kernel:85.7% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ______________________

             6788.00 17.5% sky2_poll                     [sky2]                
             1413.00  3.6% _raw_spin_lock_irqsave        [kernel]              
             1042.00  2.7% system_call                   [kernel]              
              997.00  2.6% sky2_intr                     [sky2]                
              903.00  2.3% _raw_spin_unlock_irqrestore   [kernel]              
              837.00  2.2% fget                          [kernel]              
              740.00  1.9% _raw_spin_lock                [kernel]              
              725.00  1.9% __netif_receive_skb           [kernel]              
              722.00  1.9% ip_rcv                        [kernel]              
              651.00  1.7% sys_epoll_ctl                 [kernel]              
              609.00  1.6% call_function_single_interrup [kernel]              
              604.00  1.6% ip_route_input                [kernel]              
              601.00  1.5% copy_user_generic_string      [kernel]              
              573.00  1.5% schedule                      [kernel]              
              561.00  1.4% kmem_cache_free               [kernel]              
              538.00  1.4% load_balance                  [kernel]              
              515.00  1.3% vread_tsc                     [kernel].vsyscall_fn  
              480.00  1.2% fput                          [kernel]              
              421.00  1.1% kmem_cache_alloc              [kernel]              
              418.00  1.1% __udp4_lib_lookup             [kernel]              
              377.00  1.0% ep_remove                     [kernel]              
              347.00  0.9% datagram_poll                 [kernel]              
              335.00  0.9% dst_release                   [kernel]              

-------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:96.2% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ______________________

             2109.00 61.3% sky2_poll                     [sky2]                
              366.00 10.6% sky2_intr                     [sky2]                
               84.00  2.4% __alloc_skb                   [kernel]              
               57.00  1.7% _raw_spin_lock_irqsave        [kernel]              
               56.00  1.6% get_rps_cpu                   [kernel]              
               52.00  1.5% __kmalloc                     [kernel]              
               39.00  1.1% irq_entries_start             [kernel]              
               39.00  1.1% enqueue_to_backlog            [kernel]              
               34.00  1.0% kmem_cache_alloc              [kernel]              
               33.00  1.0% default_send_IPI_mask_sequenc [kernel]              
               32.00  0.9% sky2_rx_submit                [sky2]                
               30.00  0.9% swiotlb_sync_single           [kernel]              
               28.00  0.8% _raw_spin_lock                [kernel]              
               23.00  0.7% sky2_remove                   [sky2]                
               22.00  0.6% __smp_call_function_single    [kernel]              
               19.00  0.6% system_call                   [kernel]              
               18.00  0.5% sys_epoll_ctl                 [kernel]              
               18.00  0.5% fget                          [kernel]              
               17.00  0.5% cache_alloc_refill            [kernel]              
               16.00  0.5% copy_user_generic_string      [kernel]              
               16.00  0.5% _raw_spin_unlock_irqrestore   [kernel]              
               15.00  0.4% dev_gro_receive               [kernel]              
               14.00  0.4% net_rx_action                 [kernel]             

-------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:97.9% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                        DSO
             _______ _____ _______________________________ ____________________

             4479.00 60.9% sky2_poll                       [sky2]              
              849.00 11.5% sky2_intr                       [sky2]              
              163.00  2.2% __alloc_skb                     [kernel]            
              155.00  2.1% get_rps_cpu                     [kernel]            
              121.00  1.6% _raw_spin_lock_irqsave          [kernel]            
               92.00  1.3% __kmalloc                       [kernel]            
               89.00  1.2% _raw_spin_lock                  [kernel]            
               83.00  1.1% enqueue_to_backlog              [kernel]            
               79.00  1.1% irq_entries_start               [kernel]            
               78.00  1.1% kmem_cache_alloc                [kernel]            
               69.00  0.9% sky2_rx_submit                  [sky2]              
               65.00  0.9% swiotlb_sync_single             [kernel]            
               58.00  0.8% default_send_IPI_mask_sequence_ [kernel]            
               50.00  0.7% system_call                     [kernel]            
               45.00  0.6% fget                            [kernel]            
               40.00  0.5% sky2_remove                     [sky2]              
               37.00  0.5% __smp_call_function_single      [kernel]            
               36.00  0.5% datagram_poll                   [kernel]            
               36.00  0.5% _raw_spin_unlock_irqrestore     [kernel]            
               34.00  0.5% cache_alloc_refill              [kernel]            
               31.00  0.4% net_rx_action                   [kernel]            
               28.00  0.4% kmem_cache_free                 [kernel]            
               27.00  0.4% _raw_spin_lock_bh               [kernel]            
               27.00  0.4% copy_user_generic_string        [kernel]            
               25.00  0.3% dev_gro_receive                 [kernel]            


-------------------------------------------------------------------------------
   PerfTop:     980 irqs/sec  kernel:97.3% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                        DSO
             _______ _____ _______________________________ ____________________

             6544.00 61.6% sky2_poll                       [sky2]              
             1098.00 10.3% sky2_intr                       [sky2]              
              248.00  2.3% __alloc_skb                     [kernel]            
              198.00  1.9% get_rps_cpu                     [kernel]            
              182.00  1.7% _raw_spin_lock_irqsave          [kernel]            
              144.00  1.4% __kmalloc                       [kernel]            
              138.00  1.3% _raw_spin_lock                  [kernel]            
              127.00  1.2% kmem_cache_alloc                [kernel]            
              125.00  1.2% irq_entries_start               [kernel]            
              119.00  1.1% enqueue_to_backlog              [kernel]            
               93.00  0.9% sky2_rx_submit                  [sky2]              
               91.00  0.9% swiotlb_sync_single             [kernel]            
               83.00  0.8% default_send_IPI_mask_sequence_ [kernel]            
               82.00  0.8% system_call                     [kernel]            
               64.00  0.6% sky2_remove                     [sky2]              
               60.00  0.6% fget                            [kernel]            
               58.00  0.5% cache_alloc_refill              [kernel]            
               57.00  0.5% _raw_spin_unlock_irqrestore     [kernel]            
               51.00  0.5% datagram_poll                   [kernel]            
               47.00  0.4% copy_user_generic_string        [kernel]            


-------------------------------------------------------------------------------
   PerfTop:     315 irqs/sec  kernel:81.0% [1000Hz cycles],  (all, cpu: 2)
-------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ______________________

              114.00  4.5% system_call                   [kernel]              
               98.00  3.9% _raw_spin_lock_irqsave        [kernel]              
               89.00  3.5% _raw_spin_unlock_irqrestore   [kernel]              
               89.00  3.5% ip_rcv                        [kernel]              
               83.00  3.3% call_function_single_interrup [kernel]              
               76.00  3.0% __netif_receive_skb           [kernel]              
               67.00  2.6% fget                          [kernel]              
               62.00  2.4% ip_route_input                [kernel]              
               59.00  2.3% vread_tsc                     [kernel].vsyscall_fn  
               54.00  2.1% kmem_cache_free               [kernel]              
               54.00  2.1% sys_epoll_ctl                 [kernel]              
               51.00  2.0% schedule                      [kernel]              
               49.00  1.9% _raw_spin_lock                [kernel]              
               49.00  1.9% __udp4_lib_lookup             [kernel]              
               44.00  1.7% ep_remove                     [kernel]              
               44.00  1.7% copy_user_generic_string      [kernel]              
               41.00  1.6% fput                          [kernel]              
               38.00  1.5% sys_epoll_wait                [kernel]              
               37.00  1.5% tick_nohz_stop_sched_tick     [kernel]              
               36.00  1.4% kmem_cache_alloc              [kernel]              
               34.00  1.3% datagram_poll                 [kernel]              
               33.00  1.3% __udp4_lib_rcv                [kernel]              
               31.00  1.2% process_recv                  mcpudp               

-------------------------------------------------------------------------------
   PerfTop:     292 irqs/sec  kernel:82.9% [1000Hz cycles],  (all, cpu: 2)
-------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ______________________

              154.00  4.7% _raw_spin_lock_irqsave        [kernel]              
              140.00  4.2% system_call                   [kernel]              
              111.00  3.4% ip_rcv                        [kernel]              
              106.00  3.2% _raw_spin_unlock_irqrestore   [kernel]              
               96.00  2.9% call_function_single_interrup [kernel]              
               95.00  2.9% fget                          [kernel]              
               90.00  2.7% __netif_receive_skb           [kernel]              
               89.00  2.7% sys_epoll_ctl                 [kernel]              
               77.00  2.3% copy_user_generic_string      [kernel]              
               77.00  2.3% ip_route_input                [kernel]              
               76.00  2.3% kmem_cache_free               [kernel]              
               74.00  2.2% _raw_spin_lock                [kernel]              
               71.00  2.1% schedule                      [kernel]              
               69.00  2.1% vread_tsc                     [kernel].vsyscall_fn  
               58.00  1.8% __udp4_lib_lookup             [kernel]              
               52.00  1.6% __udp4_lib_rcv                [kernel]              
               51.00  1.5% fput                          [kernel]              
               47.00  1.4% ep_remove                     [kernel]              
               47.00  1.4% event_base_loop               libevent-1.3e.so.1.0.3
               39.00  1.2% process_recv                  mcpudp                
               39.00  1.2% sys_epoll_wait                [kernel]              
               38.00  1.2% udp_recvmsg                   [kernel]              
               38.00  1.2% sock_recv_ts_and_drops        [kernel]              
               37.00  1.1% __switch_to                   [kernel]              

-------------------------------------------------------------------------------
   PerfTop:     290 irqs/sec  kernel:82.1% [1000Hz cycles],  (all, cpu: 2)
-------------------------------------------------------------------------------

             samples  pcnt function                      DSO
             _______ _____ _____________________________ ______________________

              175.00  4.7% _raw_spin_lock_irqsave        [kernel]              
              153.00  4.2% system_call                   [kernel]              
              122.00  3.3% ip_rcv                        [kernel]              
              114.00  3.1% _raw_spin_unlock_irqrestore   [kernel]              
              114.00  3.1% fget                          [kernel]              
              105.00  2.8% __netif_receive_skb           [kernel]              
              101.00  2.7% sys_epoll_ctl                 [kernel]              
              100.00  2.7% call_function_single_interrup [kernel]              
               90.00  2.4% copy_user_generic_string      [kernel]              
               84.00  2.3% schedule                      [kernel]              
               76.00  2.1% kmem_cache_free               [kernel]              
               76.00  2.1% _raw_spin_lock                [kernel]              
               72.00  2.0% ip_route_input                [kernel]              
               70.00  1.9% vread_tsc                     [kernel].vsyscall_fn  
               68.00  1.8% __udp4_lib_lookup             [kernel]              
               68.00  1.8% __udp4_lib_rcv                [kernel]              
               57.00  1.5% ep_remove                     [kernel]              
               57.00  1.5% fput                          [kernel]              
               55.00  1.5% kmem_cache_alloc              [kernel]              
               51.00  1.4% process_recv                  mcpudp           




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 23:44               ` [PATCH net-next-2.6] net: speedup udp receive path jamal
@ 2010-04-29  0:00                 ` jamal
  2010-04-29  4:09                 ` Eric Dumazet
  1 sibling, 0 replies; 108+ messages in thread
From: jamal @ 2010-04-29  0:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Wed, 2010-04-28 at 19:45 -0400, jamal wrote:

> Your patch has improved the performance of rps relative to what is in
> net-next very lightly; but it has also improved the performance of
> non-rps;->

Correction: Last part of sentence not true (obvious if you look at
results i attached)

cheers,
jamal



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-28 23:44               ` [PATCH net-next-2.6] net: speedup udp receive path jamal
  2010-04-29  0:00                 ` jamal
@ 2010-04-29  4:09                 ` Eric Dumazet
  2010-04-29 11:35                   ` jamal
  1 sibling, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29  4:09 UTC (permalink / raw)
  To: hadi
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le mercredi 28 avril 2010 à 19:44 -0400, jamal a écrit :
> On Wed, 2010-04-28 at 16:06 +0200, Eric Dumazet wrote:
> 
> > Here it is ;)
> 
> Sorry - things got a little hectic with TheMan.
> 
> I am afraid i dont have good news.
> Actually, I should say i dont have good news in regards to rps.
> For my sample app, two things seem to be happening:
> a) The overall performance has gotten better for both rps
> and non-rps.
> b) non-rps is now performing relatively better
> 
> This is just what i see in net-next not related to your patch.
> It seems the kernels i tested prior to April 23 showed rps better.
> The one i tested on Apr23 showed rps being about the same as non-rps.
> As i stated in my last result posting, I thought i didnt test properly
> but i did again today and saw the same thing. And now non-rps is
> _consistently_ better.
> So some regression is going on...
> 
> Your patch has improved the performance of rps relative to what is in
> net-next very lightly; but it has also improved the performance of
> non-rps;->
> My traces look different for the app cpu than yours - likely because of
> the apps being different.
> 
> At the moment i dont have time to dig deeper into code, but i could
> test as cycles show up.
> 
> I am attaching the profile traces and results.
> 
> cheers,
> jamal

Hi Jamal

I dont see in your results the number of pps, number of udp ports,
number of flows.

In my latest results, I can handle more pps than before, regardless of
rps being on or off, and with various number of udp ports (one user
thread per port), number of flows (many src addr so that rps spread
packets on many cpus)

If/when contention windows are smaller, cpu can run uncontended, and can
consume more cycles to process more frames ?

With a non yet published patch, I even can reach 600.000 pps in DDOS
situations, instead of 400.000.

Thanks !



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29  4:09                 ` Eric Dumazet
@ 2010-04-29 11:35                   ` jamal
  2010-04-29 12:12                     ` Changli Gao
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-29 11:35 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, xiaosuo, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

[-- Attachment #1: Type: text/plain, Size: 2089 bytes --]


On Thu, 2010-04-29 at 06:09 +0200, Eric Dumazet wrote:


> I dont see in your results the number of pps, number of udp ports,
> number of flows.

My test scenario is still the same: send 1M packets of 8 flows
round-robin at 750Kpps. Repeat test 4-6 times and average out. 8 flows
map to 8 cpus. Any rate above 750Kpps and the driver starts dropping.
The flows are {Fixed dst IP, fixed src IP, fixed src port, 8 variable
dst port}. ip_rcv and friends show up in profile as we have already
discussed - but i dont want to change the test characteristic because i
cant do fair backward comparison. Also i use rps mask ee to use all the
cpus except the core doing demux (core 0).
In the results when i say "udp sink 90%" it means 90% of 750Kpps was
successfully received by the app (on the multiple cpus).

> In my latest results, I can handle more pps than before, regardless of
> rps being on or off, 

Same here - even in my worst case scenario 88.5% of 750Kpps > 600Kpps.
Attached is history results to make more sense of what i am saying:
we have net-next kernels from apr14, apr23, apr23 with changlis change,
apr28, apr28 with your change. What you'll see is non-rps (blue) gets
better and rps (Orange) gets better slowly then by apr28 it is worse.

> and with various number of udp ports (one user
> thread per port), number of flows (many src addr so that rps spread
> packets on many cpus)
> 

This is true for me except for non rps getting relatively better and rps
getting worse in plain net-next for Apr 28. Sorry, dont have time to
dissect where things changed but i figured if i reported it will point
to something obvious.

> If/when contention windows are smaller, cpu can run uncontended, and can
> consume more cycles to process more frames ?
> 
> With a non yet published patch, I even can reach 600.000 pps in DDOS
> situations, instead of 400.000.

So my tests are simpler. What i was hoping to see was at minimum rps
maintains its gap of 6-7% more capacity. I dont mind seeing
rps get better. If both rps and non-rps get better that even more
interesting.

cheers,
jamal

[-- Attachment #2: rps-hist.pdf --]
[-- Type: application/pdf, Size: 212033 bytes --]

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 11:35                   ` jamal
@ 2010-04-29 12:12                     ` Changli Gao
  2010-04-29 12:45                       ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: Changli Gao @ 2010-04-29 12:12 UTC (permalink / raw)
  To: hadi
  Cc: Eric Dumazet, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Thu, Apr 29, 2010 at 7:35 PM, jamal <hadi@cyberus.ca> wrote:
>
> Same here - even in my worst case scenario 88.5% of 750Kpps > 600Kpps.
> Attached is history results to make more sense of what i am saying:
> we have net-next kernels from apr14, apr23, apr23 with changlis change,
> apr28, apr28 with your change. What you'll see is non-rps (blue) gets
> better and rps (Orange) gets better slowly then by apr28 it is worse.

Did the number of IPIs increase in the apr28 test? The final patch
with Eric's change may introduce more IPIs. And I am wondering why
23rdcl-non-rps is better than before. Maybe it is the side effect of
my patch: enlarge the netdev_max_backlog.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 12:12                     ` Changli Gao
@ 2010-04-29 12:45                       ` Eric Dumazet
  2010-04-29 13:17                         ` jamal
  2010-04-29 23:07                         ` Changli Gao
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 12:45 UTC (permalink / raw)
  To: Changli Gao
  Cc: hadi, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le jeudi 29 avril 2010 à 20:12 +0800, Changli Gao a écrit :
> On Thu, Apr 29, 2010 at 7:35 PM, jamal <hadi@cyberus.ca> wrote:
> >
> > Same here - even in my worst case scenario 88.5% of 750Kpps > 600Kpps.
> > Attached is history results to make more sense of what i am saying:
> > we have net-next kernels from apr14, apr23, apr23 with changlis change,
> > apr28, apr28 with your change. What you'll see is non-rps (blue) gets
> > better and rps (Orange) gets better slowly then by apr28 it is worse.
> 
> Did the number of IPIs increase in the apr28 test? The finial patch
> with Eric's change may introduce more IPIs. And I am wondering why
> 23rdcl-non-rps is better than before. Maybe it is the side effect of
> my patch: enlarge the netdev_max_backlog.
> 
> 

Changli, I wonder how you can cook "performance" patches without testing
them at all for real... This cannot be true ?

When the cpu doing the device softirq is flooded, it handles 300 packets
per net_rx_action() round (netdev_budget), so sends at most 6 ipis per
300 packets, with or without my patch, with or without your patch as
well.

(At most because if remote cpus are flooded as well, they dont
napi_complete so no IPI needed at all)

(My patch had an effect only on normal load, ie one packet received in a
while... up to 50.000 pps I would say). And it also has a nice effect on
non RPS loads (mostly the more typical load for following years).
If a second packet comes 3us after the first one, and before 2nd CPU
handled it, we _can_ afford an extra IPI.

750.000/50 = 15.000 IPI per second.

Even with 200.000 IPI per second, 'perf top -C CPU_IPI_sender' shows
that sending IPI is very cheap (maybe ~1% of cpu cycles)

# Samples: 32033467127
#
# Overhead         Command      Shared Object  Symbol
# ........  ..............  .................  ......
#
    18.05%            init  [kernel.kallsyms]  [k] poll_idle
    10.91%            init  [kernel.kallsyms]  [k] bnx2x_rx_int
    10.42%            init  [kernel.kallsyms]  [k] eth_type_trans
     5.72%            init  [kernel.kallsyms]  [k] kmem_cache_alloc_node
     5.43%            init  [kernel.kallsyms]  [k] __memset
     5.20%            init  [kernel.kallsyms]  [k] get_rps_cpu
     4.82%            init  [kernel.kallsyms]  [k] __slab_alloc
     4.34%            init  [kernel.kallsyms]  [k] get_partial_node
     4.22%            init  [kernel.kallsyms]  [k] _raw_spin_lock
     3.41%            init  [kernel.kallsyms]  [k] __kmalloc_node_track_caller
     3.01%            init  [kernel.kallsyms]  [k] __alloc_skb
     2.22%            init  [kernel.kallsyms]  [k] enqueue_to_backlog
     2.10%            init  [kernel.kallsyms]  [k] vlan_gro_common
     1.34%            init  [kernel.kallsyms]  [k] swiotlb_map_page
     1.25%            init  [kernel.kallsyms]  [k] skb_put
     1.06%            init  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
     0.92%            init  [kernel.kallsyms]  [k] dev_gro_receive
     0.88%            init  [kernel.kallsyms]  [k] swiotlb_dma_mapping_error
     0.83%            init  [kernel.kallsyms]  [k] vlan_gro_receive
     0.83%            init  [kernel.kallsyms]  [k] __phys_addr
     0.83%            init  [kernel.kallsyms]  [k] __napi_complete
     0.83%            init  [kernel.kallsyms]  [k] default_send_IPI_mask_sequence_phys
     0.77%            init  [kernel.kallsyms]  [k] is_swiotlb_buffer
     0.76%            init  [kernel.kallsyms]  [k] __netdev_alloc_skb
     0.74%            init  [kernel.kallsyms]  [k] deactivate_slab
     0.73%            init  [kernel.kallsyms]  [k] netif_receive_skb
     0.72%            init  [kernel.kallsyms]  [k] unmap_single
     0.69%            init  [kernel.kallsyms]  [k] csd_lock
     0.63%            init  [kernel.kallsyms]  [k] bnx2x_poll
     0.61%            init  [kernel.kallsyms]  [k] bnx2x_msix_fp_int
     0.59%            init  [kernel.kallsyms]  [k] irq_entries_start
     0.59%            init  [kernel.kallsyms]  [k] swiotlb_sync_single
     0.54%            init  [kernel.kallsyms]  [k] get_slab
     0.46%            init  [kernel.kallsyms]  [k] napi_skb_finish




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 12:45                       ` Eric Dumazet
@ 2010-04-29 13:17                         ` jamal
  2010-04-29 13:21                           ` Eric Dumazet
  2010-04-29 23:07                         ` Changli Gao
  1 sibling, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-29 13:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Thu, 2010-04-29 at 14:45 +0200, Eric Dumazet wrote:

> 
> Changli, I wonder how you can cook "performance" patches without testing
> them at all for real... This cannot be true ?

Eric, I am with you, however you are in the minority of people who test
and produce numbers ;-> The system rewards people for sending patches
not much for anything else - so i cant blame Changli ;->

> When the cpu doing the device softirq is flooded, it handles 300 packets
> per net_rx_action() round (netdev_budget), so sends at most 6 ipis per
> 300 packets, with or without my patch, with or without your patch as
> well.
> 
> (At most because if remote cpus are flooded as well, they dont
> napi_complete so no IPI needed at all)
>
> (My patch had an effect only on normal load, ie one packet received in a
> while... up to 50.000 pps I would say). And it also has a nice effect on
> non RPS loads (mostly the more typical load for following years).
> If a second packet comes 3us after the first one, and before 2nd CPU
> handled it, we _can_ afford an extra IPI.
> 
> 750.000/50 = 15.000 IPI per second.

Could we have some stat in there that shows IPIs being produced? I think
it would help to at least observe any changes over variety of tests.
I did try to patch my system during the first few tests to record IPIs
but it seems to make more sense to have it as a perf stat.

> Even with 200.000 IPI per second, 'perf top -C CPU_IPI_sender' shows
> that sending IPI is very cheap (maybe ~1% of cpu cycles)
> 
> # Samples: 32033467127
> #

One thing i observed is our profiles seem different. Could you send me
your .config for a single nehalem and i will try to go as close as
possible to it? I have a sky2 instead of bnx - but i suspect everything
else will be very similar...
I apologize i dont have much time to look into details - but what i can
do is test at least.

cheers,
jamal




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:17                         ` jamal
@ 2010-04-29 13:21                           ` Eric Dumazet
  2010-04-29 13:37                             ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 13:21 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le jeudi 29 avril 2010 à 09:17 -0400, jamal a écrit :

> Could we have some stat in there that shows IPIs being produced? I think
> it would help to at least observe any changes over variety of tests.
> I did try to patch my system during the first few tests to record IPIs
> but it seems to make more sense to have it as a perf stat.
> 
> > Even with 200.000 IPI per second, 'perf top -C CPU_IPI_sender' shows
> > that sending IPI is very cheap (maybe ~1% of cpu cycles)
> > 
> > # Samples: 32033467127
> > #
> 
> One thing i observed is our profiles seem different. Could you send me
> your .config for a single nehalem and i will try to go as close as
> possible to it? I have a sky2 instead of bnx - but i suspect everything
> else will be very similar...
> I apologize i dont have much time to look into details - but what i can
> do is test at least.

I'am going to redo some test on my 'old machine', with tg3 driver.

You could try following program :


#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Double-buffered totals of the 10 columns of /proc/net/softnet_stat,
 * summed over all lines (one line per CPU).  tab[flip] holds the most
 * recent snapshot and tab[flip ^ 1] the previous one, so the caller can
 * print a delta between two reads.
 */
struct softnet_stat_vals {
	int flip;                /* index of the current snapshot (0 or 1) */
	unsigned int tab[2][10]; /* [snapshot][column] accumulated counters */
};

/*
 * Read /proc/net/softnet_stat and accumulate each of its 10 hex columns
 * (summed across all CPU lines) into v->tab[v->flip], after flipping the
 * active snapshot so the previous totals stay in tab[v->flip ^ 1].
 *
 * Returns 0 on success, -1 if the proc file cannot be opened.
 */
int read_file(struct softnet_stat_vals *v)
{
	char buffer[1024];
	FILE *F = fopen("/proc/net/softnet_stat", "r");

	if (!F)
		return -1;

	/* Flip only once we know we have data; the original toggled the
	 * snapshot even on open failure, corrupting later deltas. */
	v->flip ^= 1;
	memset(v->tab[v->flip], 0, 10 * sizeof(unsigned int));
	while (fgets(buffer, sizeof(buffer), F)) {
		int i, pos = 0;
		unsigned int val;

		for (i = 0; ; ) {
			/* each field is 8 hex digits followed by a separator */
			if (sscanf(buffer + pos, "%08x", &val) != 1)
				break;
			v->tab[v->flip][i] += val;
			pos += 9;
			if (++i == 10)
				break;
		}
	}
	fclose(F);
	return 0;	/* was missing: function is declared int */
}


/*
 * Once a second, print the per-second increase of column 9 of
 * /proc/net/softnet_stat summed over all CPUs (the RPS counter),
 * by diffing the two snapshots kept in struct softnet_stat_vals.
 */
int main(int argc, char *argv[])
{
	/* conventional calloc order: element count first, then size */
	struct softnet_stat_vals *v = calloc(1, sizeof(struct softnet_stat_vals));

	if (!v) {	/* was unchecked: NULL deref on allocation failure */
		perror("calloc");
		return 1;
	}
	read_file(v);	/* prime the first snapshot */
	for (;;) {
		sleep(1);
		read_file(v);
		/* column index 9: difference between current and previous read */
		printf("%u rps\n", v->tab[v->flip][9] - v->tab[v->flip ^ 1][9]);
	}
}



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:21                           ` Eric Dumazet
@ 2010-04-29 13:37                             ` jamal
  2010-04-29 13:49                               ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-29 13:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Thu, 2010-04-29 at 15:21 +0200, Eric Dumazet wrote:


> 
> You could try following program :
> 

Will do later today (test machine is not on the network and is about 20
minutes from here; so worst case i will get you results by end of day)
I guess this program is good enough since it tells me the system wide
ipi count - what my patch did was also to break it down by which cpu got
how many IPIs (served to check if there was uneven distribution)

> 
> Is your application mono threaded and receiving data to 8 sockets ?
> 

I fork one instance per detected cpu and bind to different ports each
time. Example bind to port 8200 on cpu0, 8201 on cpu1, etc.

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:37                             ` jamal
@ 2010-04-29 13:49                               ` Eric Dumazet
  2010-04-29 13:56                                 ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 13:49 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le jeudi 29 avril 2010 à 09:37 -0400, jamal a écrit :
> On Thu, 2010-04-29 at 15:21 +0200, Eric Dumazet wrote:
> 
> 
> > 
> > You could try following program :
> > 
> 
> Will do later today (test machine is not on the network and is about 20
> minutes from here; so worst case i will get you results by end of day)
> I guess this program is good enough since it tells me the system wide
> ipi count - what my patch did was also to break it down by which cpu got
> how many IPIs (served to check if there was uneven distribution)
> 
> > 
> > Is your application mono threaded and receiving data to 8 sockets ?
> > 
> 
> I fork one instance per detected cpu and bind to different ports each
> time. Example bind to port 8200 on cpu0, 8201 on cpu1, etc.
> 

I guess this is the problem ;)

With RPS, you should not bind your threads to cpu.
This is the rps hash who will decide for you.


I am using following program :

/*
 *  Usage: udpsink [ -p baseport] nbports
 *
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <pthread.h>	/* pthread_t / pthread_create were used without this */
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>

/*
 * Per-receiver-thread state and counters, sampled by main() once a
 * second without locking.
 */
struct worker_data {
	int fd;                      /* UDP socket this worker reads from */
	unsigned long pack_count;    /* datagrams received so far */
	unsigned long bytes_count;   /* payload bytes received so far */
	unsigned long _padd[16 - 3]; /* pads to 16 longs — presumably to keep
	                              * each worker on its own cache lines and
	                              * avoid false sharing; TODO confirm */
};

/* Print the command-line synopsis to stderr and terminate with `code`. */
void usage(int code)
{
	fputs("Usage: udpsink [-p baseport] nbports\n", stderr);
	exit(code);
}

/*
 * Thread body: drain datagrams from the worker's socket forever,
 * bumping the packet and byte counters that main() samples.
 * Never returns.
 */
void *worker_func(void *arg)
{
	struct worker_data *w = arg;
	char rxbuf[4096];
	struct sockaddr_in peer;

	for (;;) {
		socklen_t peerlen = sizeof(peer);
		int n = recvfrom(w->fd, rxbuf, sizeof(rxbuf), 0,
				 (struct sockaddr *)&peer, &peerlen);

		if (n > 0) {
			w->pack_count++;
			w->bytes_count += n;
		}
	}
}

/*
 * udpsink: spawn one receiver thread per UDP port starting at baseport
 * (or, with -c, nbports threads all sharing the first socket) and print
 * the aggregate packet rate once a second.
 *
 *   Usage: udpsink [-p baseport] [-c] [-v] nbports
 */
int main(int argc, char *argv[])
{
	int c;
	int baseport = 4000;	/* first UDP port to bind */
	int nbthreads;
	struct worker_data *wdata;
	unsigned long ototal = 0;	/* grand total at the previous sample */
	int concurrent = 0;	/* -c: all threads share socket 0 */
	int verbose = 0;	/* -v: also print per-thread totals */
	int i;

	while ((c = getopt(argc, argv, "cvp:")) != -1) {
		if (c == 'p')
			baseport = atoi(optarg);
		else if (c == 'c')
			concurrent = 1;
		else if (c == 'v')
			verbose++;
		else
			usage(1);
	}
	if (optind == argc)
		usage(1);
	nbthreads = atoi(argv[optind]);
	/* conventional calloc order: element count first, then size */
	wdata = calloc(nbthreads, sizeof(struct worker_data));
	if (!wdata) {
		perror("calloc");
		return 1;
	}
	for (i = 0; i < nbthreads; i++) {
		struct sockaddr_in addr;
		pthread_t tid;
		int err;

		if (i && concurrent) {
			/* every extra worker drains the first socket */
			wdata[i].fd = wdata[0].fd;
		} else {
			wdata[i].fd = socket(PF_INET, SOCK_DGRAM, 0);
			if (wdata[i].fd == -1) {
				perror("socket");
				return 1;
			}
			memset(&addr, 0, sizeof(addr));
			addr.sin_family = AF_INET;
			/* s_addr left zeroed => INADDR_ANY */
			addr.sin_port = htons(baseport + i);
			if (bind(wdata[i].fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
				perror("bind");
				return 1;
			}
		}
		/* was unchecked; pthread_create reports failure through its
		 * return value (an errno number), not through errno itself */
		err = pthread_create(&tid, NULL, worker_func, wdata + i);
		if (err) {
			fprintf(stderr, "pthread_create: %s\n", strerror(err));
			return 1;
		}
	}
	for (;;) {
		unsigned long total;
		long delta;

		sleep(1);
		total = 0;
		for (i = 0; i < nbthreads; i++)
			total += wdata[i].pack_count;
		delta = total - ototal;
		if (delta) {
			printf("%lu pps (%lu", delta, total);
			if (verbose) {
				for (i = 0; i < nbthreads; i++) {
					if (wdata[i].pack_count)
						printf(" %d:%lu", i, wdata[i].pack_count);
				}
			}
			printf(")\n");
		}
		ototal = total;
	}
}




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:49                               ` Eric Dumazet
@ 2010-04-29 13:56                                 ` jamal
  2010-04-29 20:36                                   ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-29 13:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Thu, 2010-04-29 at 15:49 +0200, Eric Dumazet wrote:

> > I fork one instance per detected cpu and bind to different ports each
> > time. Example bind to port 8200 on cpu0, 8201 on cpu1, etc.
> > 
> 
> I guess this is the problem ;)
> 
> With RPS, you should not bind your threads to cpu.
> This is the rps hash who will decide for you.
> 

Sorry - I was not clear; i have the option of binding to cpu
vs the setsched api; but what i meant in this case is:
- for each cpu detected, fork
-- open socket
---bind to udp port cpu# + 8200

I could also bind to a cpu in the last step and i did notice it
improved distribution - but all my tests since apr23 dont do that ;->

> 
> I am using following program :
> 

I will try your program instead so we can reduce the variables

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
       [not found]           ` <20100429174056.GA8044@gargoyle.fritz.box>
@ 2010-04-29 17:56             ` Eric Dumazet
  2010-04-29 18:10               ` OFT - reserving CPU's for networking Stephen Hemminger
       [not found]               ` <20100429182347.GA8512@gargoyle.fritz.box>
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 17:56 UTC (permalink / raw)
  To: Andi Kleen
  Cc: hadi, Changli Gao, David S. Miller, Tom Herbert,
	Stephen Hemminger, netdev, Andi Kleen

Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit :
> > Andi, what do you think of this one ?
> > Dont we have a function to send an IPI to an individual cpu instead ?
> 
> That's what this function already does. You only set a single CPU 
> in the target mask, right?
> 
> IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> which is a bit faster for this, but that's not available in the lower
> end Nehalems. But even then it's not exactly fast.
> 
> I don't think the IPI primitive can be optimized much. It's not a cheap 
> operation.
> 
> If it's a problem do it less often and batch IPIs.
> 
> It's essentially the same problem as interrupt mitigation or NAPI 
> are solving for NICs. I guess just need a suitable mitigation mechanism.
> 
> Of course that would move more work to the sending CPU again, but 
> perhaps there's no alternative. I guess you could make it cheaper it by
> minimizing access to packet data.
> 
> -Andi

Well, IPI are already batched, and rate is auto adaptative.

After various changes, it seems things are going better, maybe there is
something related to cache line thrashing.

I 'solved' it by using idle=poll, but you might take a look at
clockevents_notify (acpi_idle_enter_bm) abuse of a shared and highly
contended spinlock...




    23.52%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
                      |
                      --- _raw_spin_lock_irqsave
                         |          
                         |--94.74%-- clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--4.10%-- tick_broadcast_oneshot_control
                         |          tick_notify
                         |          notifier_call_chain
                         |          __raw_notifier_call_chain
                         |          raw_notifier_call_chain
                         |          clockevents_do_notify
                         |          clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          


^ permalink raw reply	[flat|nested] 108+ messages in thread

* OFT - reserving CPU's for networking
  2010-04-29 17:56             ` Eric Dumazet
@ 2010-04-29 18:10               ` Stephen Hemminger
  2010-04-29 19:19                 ` Thomas Gleixner
       [not found]               ` <20100429182347.GA8512@gargoyle.fritz.box>
  1 sibling, 1 reply; 108+ messages in thread
From: Stephen Hemminger @ 2010-04-29 18:10 UTC (permalink / raw)
  To: Eric Dumazet, Thomas Gleixner; +Cc: Andi Kleen, netdev, Andi Kleen

> Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit :
> > > Andi, what do you think of this one ?
> > > Dont we have a function to send an IPI to an individual cpu instead ?  
> > 
> > That's what this function already does. You only set a single CPU 
> > in the target mask, right?
> > 
> > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> > which is a bit faster for this, but that's not available in the lower
> > end Nehalems. But even then it's not exactly fast.
> > 
> > I don't think the IPI primitive can be optimized much. It's not a cheap 
> > operation.
> > 
> > If it's a problem do it less often and batch IPIs.
> > 
> > It's essentially the same problem as interrupt mitigation or NAPI 
> > are solving for NICs. I guess just need a suitable mitigation mechanism.
> > 
> > Of course that would move more work to the sending CPU again, but 
> > perhaps there's no alternative. I guess you could make it cheaper it by
> > minimizing access to packet data.
> > 
> > -Andi  
> 
> Well, IPI are already batched, and rate is auto adaptative.
> 
> After various changes, it seems things are going better, maybe there is
> something related to cache line trashing.
> 
> I 'solved' it by using idle=poll, but you might take a look at
> clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly
> contended spinlock...
> 
> 
> 
> 
>     23.52%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
>                       |
>                       --- _raw_spin_lock_irqsave
>                          |          
>                          |--94.74%-- clockevents_notify
>                          |          lapic_timer_state_broadcast
>                          |          acpi_idle_enter_bm
>                          |          cpuidle_idle_call
>                          |          cpu_idle
>                          |          start_secondary
>                          |          
>                          |--4.10%-- tick_broadcast_oneshot_control
>                          |          tick_notify
>                          |          notifier_call_chain
>                          |          __raw_notifier_call_chain
>                          |          raw_notifier_call_chain
>                          |          clockevents_do_notify
>                          |          clockevents_notify
>                          |          lapic_timer_state_broadcast
>                          |          acpi_idle_enter_bm
>                          |          cpuidle_idle_call
>                          |          cpu_idle
>                          |          start_secondary
>                          |          
> 


I keep getting asked about taking some core's away from clock and scheduler
to be reserved just for network processing. Seeing this kind of stuff
makes me wonder if maybe that isn't a half bad idea.


-- 

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
       [not found]               ` <20100429182347.GA8512@gargoyle.fritz.box>
@ 2010-04-29 19:12                 ` Eric Dumazet
       [not found]                   ` <20100429214144.GA10663@gargoyle.fritz.box>
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 19:12 UTC (permalink / raw)
  To: Andi Kleen, Andi Kleen
  Cc: hadi, Changli Gao, David S. Miller, Tom Herbert,
	Stephen Hemminger, netdev, Andi Kleen, lenb, arjan

Le jeudi 29 avril 2010 à 20:23 +0200, Andi Kleen a écrit :
> On Thu, Apr 29, 2010 at 07:56:12PM +0200, Eric Dumazet wrote:
> > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit :
> > > > Andi, what do you think of this one ?
> > > > Dont we have a function to send an IPI to an individual cpu instead ?
> > > 
> > > That's what this function already does. You only set a single CPU 
> > > in the target mask, right?
> > > 
> > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> > > which is a bit faster for this, but that's not available in the lower
> > > end Nehalems. But even then it's not exactly fast.
> > > 
> > > I don't think the IPI primitive can be optimized much. It's not a cheap 
> > > operation.
> > > 
> > > If it's a problem do it less often and batch IPIs.
> > > 
> > > It's essentially the same problem as interrupt mitigation or NAPI 
> > > are solving for NICs. I guess just need a suitable mitigation mechanism.
> > > 
> > > Of course that would move more work to the sending CPU again, but 
> > > perhaps there's no alternative. I guess you could make it cheaper it by
> > > minimizing access to packet data.
> > > 
> > > -Andi
> > 
> > Well, IPI are already batched, and rate is auto adaptative.
> > 
> > After various changes, it seems things are going better, maybe there is
> > something related to cache line trashing.
> > 
> > I 'solved' it by using idle=poll, but you might take a look at
> > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly
> > contended spinlock...
> 
> acpi_idle_enter_bm should not be executed on a Nehalem, it's obsolete.
> If it does on your system something is wrong.
> 
> Ahh, that triggers a bell. There's one issue that if the remote CPU is in a very
> deep idle state it could take a long time to wake it up. Nehalem has deeper
> sleep states than earlier CPUs. When this happens the IPI sender will be slow
> too I believe.
> 
> Are the target CPUs idle? 
> 

Yes, mostly, but about 200.000 wakeups per second I would say...

If a cpu in deep state receives an IPI, process a softirq, should it
come back to deep state immediately, or should it wait for some
milliseconds ?

> Perhaps need to feed some information to cpuidle's governour to prevent this problem.
> 
> idle=poll is very drastic, better to limit to C1 
> 

How can I do this ?

Thanks !



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-29 18:10               ` OFT - reserving CPU's for networking Stephen Hemminger
@ 2010-04-29 19:19                 ` Thomas Gleixner
  2010-04-29 20:02                   ` Eric Dumazet
  2010-04-30 18:57                   ` David Miller
  0 siblings, 2 replies; 108+ messages in thread
From: Thomas Gleixner @ 2010-04-29 19:19 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Andi Kleen, netdev, Andi Kleen, Peter Zijlstra

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2800 bytes --]

On Thu, 29 Apr 2010, Stephen Hemminger wrote:
> > Le jeudi 29 avril 2010 à 19:42 +0200, Andi Kleen a écrit :
> > > > Andi, what do you think of this one ?
> > > > Dont we have a function to send an IPI to an individual cpu instead ?  
> > > 
> > > That's what this function already does. You only set a single CPU 
> > > in the target mask, right?
> > > 
> > > IPIs are unfortunately always a bit slow. Nehalem-EX systems have X2APIC
> > > which is a bit faster for this, but that's not available in the lower
> > > end Nehalems. But even then it's not exactly fast.
> > > 
> > > I don't think the IPI primitive can be optimized much. It's not a cheap 
> > > operation.
> > > 
> > > If it's a problem do it less often and batch IPIs.
> > > 
> > > It's essentially the same problem as interrupt mitigation or NAPI 
> > > are solving for NICs. I guess just need a suitable mitigation mechanism.
> > > 
> > > Of course that would move more work to the sending CPU again, but 
> > > perhaps there's no alternative. I guess you could make it cheaper it by
> > > minimizing access to packet data.
> > > 
> > > -Andi  
> > 
> > Well, IPI are already batched, and rate is auto adaptative.
> > 
> > After various changes, it seems things are going better, maybe there is
> > something related to cache line trashing.
> > 
> > I 'solved' it by using idle=poll, but you might take a look at
> > clockevents_notify (acpi_idle_enter_bm) abuse of a shared and higly
> > contended spinlock...

Say thanks to Intel/AMD for providing us timers which stop in lower
c-states.

Not much we can do about the broadcast lock when several cores are
going idle and we need to setup a global timer to work around the
lapic timer stops in C2/C3 issue.

Simply the C-state timer broadcasting does not scale. And it was never
meant to scale. It's a workaround for laptops to have functional NOHZ.

There are several ways to work around that on larger machines:

 - Restrict c-states
 - Disable NOHZ and highres timers
 - idle=poll is definitely the worst of all possible solutions

> I keep getting asked about taking some core's away from clock and scheduler
> to be reserved just for network processing. Seeing this kind of stuff
> makes me wonder if maybe that isn't a half bad idea.

This comes up every few month and we pointed out several times what
needs to be done to make this work w/o these weird hacks which put a
core offline and then start some magic undebugable binary blob on it.
We have not seen anyone working on this, but the "set cores aside and
let them do X" idea seems to stick in peoples heads.

Seriously, that's not a solution. It's going to be some hacked up
nightmare which is completely unmaintainable.

Aside of that I seriously doubt that you can do networking w/o time
and timers.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-29 19:19                 ` Thomas Gleixner
@ 2010-04-29 20:02                   ` Eric Dumazet
  2010-04-30 18:15                     ` Brian Bloniarz
  2010-04-30 18:57                   ` David Miller
  1 sibling, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 20:02 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Stephen Hemminger, Andi Kleen, netdev, Andi Kleen, Peter Zijlstra

Le jeudi 29 avril 2010 à 21:19 +0200, Thomas Gleixner a écrit :

> Say thanks to Intel/AMD for providing us timers which stop in lower
> c-states.
> 
> Not much we can do about the broadcast lock when several cores are
> going idle and we need to setup a global timer to work around the
> lapic timer stops in C2/C3 issue.
> 
> Simply the C-state timer broadcasting does not scale. And it was never
> meant to scale. It's a workaround for laptops to have functional NOHZ.
> 
> There are several ways to work around that on larger machines:
> 
>  - Restrict c-states
>  - Disable NOHZ and highres timers
>  - idle=poll is definitely the worst of all possible solutions
> 
> > I keep getting asked about taking some core's away from clock and scheduler
> > to be reserved just for network processing. Seeing this kind of stuff
> > makes me wonder if maybe that isn't a half bad idea.
> 
> This comes up every few month and we pointed out several times what
> needs to be done to make this work w/o these weird hacks which put a
> core offline and then start some magic undebugable binary blob on it.
> We have not seen anyone working on this, but the "set cores aside and
> let them do X" idea seems to stick in peoples heads.
> 
> Seriously, that's not a solution. It's going to be some hacked up
> nightmare which is completely unmaintainable.
> 
> Aside of that I seriously doubt that you can do networking w/o time
> and timers.
> 

Thanks a lot !

booting with processor.max_cstate=1 solves the problem

(I already had a CONFIG_NO_HZ=no conf, but highres timer enabled)

Even with _carefully_ chosen crazy configuration (receiving a packet on a
cpu, then transferring it to another cpu, with a full 16x16 matrix
involved), generating 700.000 IPI per second on the machine seems fine
now.




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 13:56                                 ` jamal
@ 2010-04-29 20:36                                   ` jamal
  2010-04-29 21:01                                     ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet
  2010-04-30 19:30                                     ` [PATCH net-next-2.6] net: speedup udp receive path jamal
  0 siblings, 2 replies; 108+ messages in thread
From: jamal @ 2010-04-29 20:36 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

[-- Attachment #1: Type: text/plain, Size: 738 bytes --]

On Thu, 2010-04-29 at 09:56 -0400, jamal wrote:

> 
> I will try your program instead so we can reduce the variables

Results attached.
With your app rps does a hell of a lot better and non-rps worse ;->
With my proggie, non-rps does much better than yours and rps does
a lot worse for the same setup. I see the scheduler kicking in quite a bit
in non-rps for you...

The main difference between us as i see it is:
a) i use epoll - actually linked to libevent (1.0.something)
b) I fork processes and you use pthreads.

I don't have time to chase it today, but 1) I am either going to change
yours to use libevent or make mine get rid of it, then 2) move towards
pthreads or have yours fork,
then observe if that makes any difference..


cheers,
jamal

[-- Attachment #2: apr29-res.txt --]
[-- Type: text/plain, Size: 29074 bytes --]


No RPS; same kernel as yesterday with Eric's changes

-------------------------------------------------------------------------------
   PerfTop:    2572 irqs/sec  kernel:94.7% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             2901.00 17.4% sky2_poll                   [sky2]  
              781.00  4.7% schedule                    [kernel]
              574.00  3.4% __skb_recv_datagram         [kernel]
              518.00  3.1% _raw_spin_lock_irqsave      [kernel]
              460.00  2.8% udp_recvmsg                 [kernel]
              457.00  2.7% copy_user_generic_string    [kernel]
              397.00  2.4% _raw_spin_lock_bh           [kernel]
              340.00  2.0% __udp4_lib_lookup           [kernel]
              320.00  1.9% ip_route_input              [kernel]
              295.00  1.8% _raw_spin_lock              [kernel]
              293.00  1.8% dst_release                 [kernel]
              282.00  1.7% ip_rcv                      [kernel]
              275.00  1.6% skb_copy_datagram_iovec     [kernel]
              263.00  1.6% __switch_to                 [kernel]
              257.00  1.5% __alloc_skb                 [kernel]
              256.00  1.5% system_call                 [kernel]
              243.00  1.5% sock_recv_ts_and_drops      [kernel]
              227.00  1.4% sock_queue_rcv_skb          [kernel]
              225.00  1.3% _raw_spin_unlock_irqrestore [kernel]
              220.00  1.3% fget_light                  [kernel]
              218.00  1.3% pick_next_task_fair         [kernel]

-------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:100.0% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

             1508.00 37.9% sky2_poll                   [sky2]  
              198.00  5.0% ip_route_input              [kernel]
              184.00  4.6% __udp4_lib_lookup           [kernel]
              172.00  4.3% ip_rcv                      [kernel]
              139.00  3.5% _raw_spin_lock              [kernel]
              131.00  3.3% __alloc_skb                 [kernel]
              130.00  3.3% sock_queue_rcv_skb          [kernel]
              111.00  2.8% __udp4_lib_rcv              [kernel]
              101.00  2.5% __netif_receive_skb         [kernel]
               78.00  2.0% select_task_rq_fair         [kernel]
               74.00  1.9% try_to_wake_up              [kernel]
               73.00  1.8% sock_def_readable           [kernel]
               72.00  1.8% _raw_spin_lock_irqsave      [kernel]
               67.00  1.7% task_rq_lock                [kernel]
               66.00  1.7% _raw_read_lock              [kernel]
               64.00  1.6% __kmalloc                   [kernel]
               62.00  1.6% resched_task                [kernel]
               61.00  1.5% sky2_rx_submit              [sky2]  
               52.00  1.3% ip_local_deliver            [kernel]
               51.00  1.3% kmem_cache_alloc            [kernel]
               51.00  1.3% swiotlb_sync_single         [kernel]
               43.00  1.1% sky2_remove                 [sky2]  
               41.00  1.0% udp_queue_rcv_skb           [kernel]
               39.00  1.0% __wake_up_common            [kernel]


-------------------------------------------------------------------------------
   PerfTop:     368 irqs/sec  kernel:95.9% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ________

              279.00  8.2% schedule                    [kernel]
              260.00  7.7% __skb_recv_datagram         [kernel]
              196.00  5.8% _raw_spin_lock_bh           [kernel]
              180.00  5.3% copy_user_generic_string    [kernel]
              176.00  5.2% udp_recvmsg                 [kernel]
              150.00  4.4% _raw_spin_lock_irqsave      [kernel]
              142.00  4.2% dst_release                 [kernel]
              106.00  3.1% skb_copy_datagram_iovec     [kernel]
               97.00  2.9% sock_recv_ts_and_drops      [kernel]
               93.00  2.7% tick_nohz_stop_sched_tick   [kernel]
               89.00  2.6% sys_recvfrom                [kernel]
               89.00  2.6% __switch_to                 [kernel]
               86.00  2.5% pick_next_task_fair         [kernel]
               82.00  2.4% sock_rfree                  [kernel]
               75.00  2.2% system_call                 [kernel]
               73.00  2.2% fget_light                  [kernel]
               70.00  2.1% _raw_spin_lock_irq          [kernel]
               63.00  1.9% kmem_cache_free             [kernel]
               61.00  1.8% _raw_spin_unlock_irqrestore [kernel]
               60.00  1.8% kfree                       [kernel]
               56.00  1.7% select_nohz_load_balancer   [kernel]
               55.00  1.6% finish_task_switch          [kernel]
               48.00  1.4% inet_recvmsg                [kernel]
               41.00  1.2% security_socket_recvmsg     [kernel]


-------------------------------------------------------------------------------
   PerfTop:      97 irqs/sec  kernel:81.4% [1000Hz cycles],  (all, cpu: 7)
-------------------------------------------------------------------------------

             samples  pcnt function                     DSO
             _______ _____ ____________________________ ________

               55.00 10.8% schedule                     [kernel]
               38.00  7.5% __skb_recv_datagram          [kernel]
               36.00  7.1% udp_recvmsg                  [kernel]
               32.00  6.3% _raw_spin_lock_irqsave       [kernel]
               31.00  6.1% _raw_spin_lock_bh            [kernel]
               30.00  5.9% copy_user_generic_string     [kernel]
               29.00  5.7% sock_recv_ts_and_drops       [kernel]
               27.00  5.3% skb_copy_datagram_iovec      [kernel]
               17.00  3.3% system_call                  [kernel]
               17.00  3.3% dst_release                  [kernel]
               14.00  2.7% _raw_spin_unlock_irqrestore  [kernel]
               12.00  2.4% __switch_to                  [kernel]
               12.00  2.4% pick_next_task_fair          [kernel]
               11.00  2.2% inet_recvmsg                 [kernel]
               11.00  2.2% sys_recvfrom                 [kernel]
               10.00  2.0% finish_task_switch           [kernel]
               10.00  2.0% sock_rfree                   [kernel]
               10.00  2.0% select_nohz_load_balancer    [kernel]
                7.00  1.4% rcu_enter_nohz               [kernel]
                7.00  1.4% tick_nohz_stop_sched_tick    [kernel]
                7.00  1.4% tick_nohz_restart_sched_tick [kernel]
                5.00  1.0% ktime_get                    [kernel]

Run1
----
557257 pps (557257 0:69750 1:69417 2:69063 3:68818 4:70139 5:69824 6:70135 7:70113)
737468 pps (1294725 0:162765 1:162430 2:162075 3:155770 4:163150 5:162838 6:163150 7:162549)
744238 pps (2038963 0:255795 1:255460 2:255105 3:248800 4:256180 5:255867 6:256180 7:255579)
719343 pps (2758306 0:348825 1:348202 2:348135 3:338166 4:349210 5:333030 6:349210 7:343528)
741830 pps (3500136 0:440870 1:440933 2:441165 3:430162 4:442240 5:425970 6:442240 7:436558)
686289 pps (4186425 0:533900 1:533749 2:515637 3:511486 4:531997 5:504717 6:525536 7:529406)
681708 pps (4868133 0:613701 1:617409 2:608667 3:599774 4:607480 5:589487 6:609802 7:621817)
697577 pps (5565710 0:704183 1:710439 2:688904 3:681696 4:689120 5:673932 6:702448 7:714988)
729284 pps (6294994 0:797213 1:803469 2:775863 3:770959 4:781160 5:766105 6:792207 7:808018)
734160 pps (7029154 0:886389 1:896504 2:868898 3:863506 4:868426 5:859138 6:885242 7:901053)
728541 pps (7757695 0:978789 1:989534 2:961928 3:946834 4:961458 5:952170 6:978272 7:988714)
709578 pps (8467273 0:1071819 1:1079000 2:1041101 3:1038974 4:1047215 5:1037254 6:1070168 7:1081744)
684154 pps (9151427 0:1160855 1:1158471 2:1122874 3:1129012 4:1136563 5:1120258 6:1153624 7:1169773)
498291 pps (9649718 0:1224303 1:1214178 2:1185737 3:1191467 4:1200058 5:1183753 6:1217121 7:1233101)

Essentially sink in about 96.5% of 10M packets

run2
---
402553 pps (402553 0:51530 1:53289 2:53625 3:45748 4:53625 5:49484 6:42292 7:52960)
711539 pps (1114092 0:144028 1:146426 2:144237 3:124551 4:146760 5:142619 6:119376 7:146095)
692319 pps (1806411 0:208285 1:239557 2:220103 3:211096 4:239890 5:235749 6:212506 7:239225)
731896 pps (2538307 0:301450 1:332723 2:308718 3:304264 4:333055 5:320036 6:305671 7:332390)
712869 pps (3251176 0:393270 1:418806 2:397578 3:396844 4:426245 5:406943 6:398861 7:412629)
681513 pps (3932689 0:486300 1:501926 2:490613 3:489874 4:466455 5:499973 6:491891 7:505659)
697308 pps (4629997 0:567969 1:585032 2:583643 3:576712 4:548243 5:589399 6:581080 7:597922)
712903 pps (5342900 0:657579 1:660221 2:676673 3:669744 4:641273 5:682222 6:674110 7:681082)
687765 pps (6030665 0:744421 1:752470 2:764631 3:751445 4:722250 5:771799 6:761224 7:762426)
695799 pps (6726464 0:832438 1:842797 2:853337 3:844470 4:804427 5:857412 6:846918 7:844668)
720011 pps (7446475 0:925210 1:934696 2:934883 3:937280 4:894644 5:949883 6:932740 7:937142)
712021 pps (8158496 0:1017246 1:1027726 2:1016841 3:1024712 4:978513 5:1042913 6:1023516 7:1027031)
709810 pps (8868306 0:1098522 1:1111823 2:1109871 3:1117444 4:1070124 5:1131774 6:1109841 7:1118909)
591817 pps (9460123 0:1178005 1:1185698 2:1189381 3:1196367 4:1143880 5:1198406 6:1176121 7:1192265)

94.6%

run3
---
682714 pps (682714 0:83336 1:86683 2:86895 3:86243 4:84616 5:81152 6:86895 7:86895)
691212 pps (1373926 0:164602 1:179240 2:171897 3:174162 4:176509 5:158115 6:174083 7:175321)
661913 pps (2035839 0:243004 1:263829 2:259312 3:267160 4:268875 5:231009 6:253411 7:249239)
715612 pps (2751451 0:336034 1:350220 2:346461 3:360190 4:359219 5:317625 6:346441 7:335265)
655354 pps (3406805 0:419339 1:434934 2:432010 3:442138 4:437837 5:394805 6:427064 7:418679)
592126 pps (3998931 0:494253 1:511454 2:508829 3:511992 4:508978 5:474866 6:496884 7:491679)
697177 pps (4696108 0:584474 1:601703 2:589111 3:602252 4:598767 5:565114 6:582153 7:572539)
681004 pps (5377112 0:662864 1:684427 2:678825 3:688402 4:685441 5:651962 6:673697 7:651495)
669622 pps (6046734 0:740275 1:765126 2:762764 3:773772 4:772144 5:731330 6:762339 7:738987)
645906 pps (6692640 0:825606 1:850550 2:846793 3:858243 4:850408 5:812402 6:838248 7:810391)
705873 pps (7398513 0:916877 1:937693 2:929956 3:950433 4:938179 5:894913 6:928125 7:902337)
735460 pps (8133973 0:1009907 1:1030722 2:1022986 3:1037959 4:1031209 5:987943 6:1021155 7:992092)
707605 pps (8841578 0:1102933 1:1122367 2:1101160 3:1129212 4:1124239 5:1063617 6:1112929 7:1085122)
347807 pps (9189385 0:1149677 1:1168026 2:1147905 3:1170556 4:1158858 5:1110362 6:1152134 7:1131867)

91.9%

run4
----
552606 pps (552606 0:72743 1:75411 2:67732 3:70204 4:63741 5:64934 6:66096 7:71746)
684450 pps (1237056 0:162839 1:165064 2:148974 3:160417 4:153919 5:135895 6:156238 7:153710)
696799 pps (1933855 0:254440 1:252304 2:240107 3:249399 4:246028 5:228009 6:247409 7:216161)
676546 pps (2610401 0:341132 1:336959 2:325332 3:330438 4:336250 5:305238 6:336208 7:298848)
712251 pps (3322652 0:432976 1:428990 2:413228 3:419977 4:425918 5:386917 6:426275 7:388371)
615680 pps (3938332 0:515679 1:497421 2:491618 3:505449 4:489452 5:462820 6:505336 7:470561)
635467 pps (4573799 0:597340 1:582917 2:555389 3:582751 4:573273 5:545378 6:584378 7:552373)
725581 pps (5299380 0:690038 1:675870 2:636347 3:676029 4:666231 5:632208 6:677337 7:645324)
699015 pps (5998395 0:783068 1:763654 2:725184 3:762784 4:752559 5:709123 6:764439 7:737586)
674472 pps (6672867 0:872645 1:847669 2:808333 3:827766 4:842267 5:798997 6:853779 7:821412)
680913 pps (7353780 0:961487 1:926760 2:887273 3:919158 4:925165 5:891082 6:929793 7:913064)
666279 pps (8020059 0:1050823 1:1012028 2:972691 3:988738 4:1009904 5:974127 6:1017940 7:993808)
680615 pps (8700674 0:1124223 1:1087779 2:1057541 3:1080546 4:1094373 5:1066880 6:1102496 7:1086838)
420306 pps (9120980 0:1177541 1:1130287 2:1111621 3:1134624 4:1148453 5:1120960 6:1156576 7:1140918)

91.2%

run5
------
294229 pps (294229 0:38805 1:30946 2:32655 3:36613 4:38805 5:38805 6:38800 7:38801)
694748 pps (988977 0:124394 1:123976 2:114107 3:128079 4:111317 5:131835 6:131835 7:123434)
690185 pps (1679162 0:217405 1:216988 2:194192 3:204091 4:195948 5:224678 6:220924 7:204937)
726561 pps (2405723 0:307828 1:309671 2:278163 3:296811 4:286642 5:317346 6:311296 7:297967)
695974 pps (3101697 0:391228 1:395256 2:371056 3:388790 4:379533 5:410242 6:393051 7:372541)
665395 pps (3767092 0:473134 1:484367 2:447394 3:462837 4:471026 5:491170 6:473947 7:463219)
671483 pps (4438575 0:562883 1:574014 2:534258 3:544512 4:534064 5:581420 6:560073 7:547353)
679400 pps (5117975 0:641135 1:663809 2:618019 3:633448 4:605085 5:674433 6:649865 7:632183)
696263 pps (5814238 0:734516 1:743715 2:711049 3:717481 4:693193 5:758493 6:740374 7:715417)
681791 pps (6496029 0:823596 1:836004 2:795579 3:809104 4:783457 5:820061 6:820219 7:808010)
670672 pps (7166701 0:911202 1:927618 2:888127 3:875504 4:874363 5:889342 6:911838 7:888707)
743444 pps (7910145 0:1004233 1:1020652 2:981157 3:968534 4:967393 5:982078 6:1004362 7:981737)
725623 pps (8635768 0:1096546 1:1113682 2:1059978 3:1061564 4:1060423 5:1072761 6:1097392 7:1073423)
662504 pps (9298272 0:1171688 1:1197579 2:1137559 3:1154595 4:1146405 5:1161670 6:1176001 7:1152776)
12979 pps (9311251 0:1173488 1:1199379 2:1137914 3:1156399 4:1148209 5:1163475 6:1177806 7:1154581)

93.1%

Average for no-rps 93.5% of 10M incoming at ~ 750Kpps.


# echo 1 >  /proc/irq/55/smp_affinity 
# echo ee  > /sys/class/net/eth0/queues/rx-0/rps_cpus


-------------------------------------------------------------------------------
   PerfTop:    2273 irqs/sec  kernel:93.7% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ________

              922.00 10.3% sky2_poll                      [sky2]  
              402.00  4.5% __netif_receive_skb            [kernel]
              400.00  4.4% ip_rcv                         [kernel]
              356.00  4.0% call_function_single_interrupt [kernel]
              339.00  3.8% ip_route_input                 [kernel]
              282.00  3.1% schedule                       [kernel]
              194.00  2.2% _raw_spin_lock_irqsave         [kernel]
              180.00  2.0% sock_recv_ts_and_drops         [kernel]
              178.00  2.0% _raw_spin_lock                 [kernel]
              173.00  1.9% __udp4_lib_lookup              [kernel]
              171.00  1.9% __udp4_lib_rcv                 [kernel]
              162.00  1.8% system_call                    [kernel]
              154.00  1.7% kfree                          [kernel]
              147.00  1.6% __skb_recv_datagram            [kernel]
              146.00  1.6% copy_user_generic_string       [kernel]
              136.00  1.5% dst_release                    [kernel]
              136.00  1.5% _raw_spin_unlock_irqrestore    [kernel]
              126.00  1.4% fget_light                     [kernel]
              126.00  1.4% sky2_intr                      [sky2]  
              122.00  1.4% udp_recvmsg                    [kernel]
              111.00  1.2% sock_queue_rcv_skb             [kernel]



-------------------------------------------------------------------------------
   PerfTop:     325 irqs/sec  kernel:93.2% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                            DSO
             _______ _____ ___________________________________ ________

             1033.00 62.9% sky2_poll                           [sky2]  
              159.00  9.7% sky2_intr                           [sky2]  
              119.00  7.3% irq_entries_start                   [kernel]
               51.00  3.1% __alloc_skb                         [kernel]
               48.00  2.9% get_rps_cpu                         [kernel]
               24.00  1.5% __kmalloc                           [kernel]
               23.00  1.4% swiotlb_sync_single                 [kernel]
               20.00  1.2% _raw_spin_lock                      [kernel]
               17.00  1.0% sky2_rx_submit                      [sky2]  
               15.00  0.9% enqueue_to_backlog                  [kernel]
               14.00  0.9% kmem_cache_alloc                    [kernel]
               11.00  0.7% default_send_IPI_mask_sequence_phys [kernel]
               10.00  0.6% sky2_remove                         [sky2]  
               10.00  0.6% cache_alloc_refill                  [kernel]
                8.00  0.5% _raw_spin_lock_irqsave              [kernel]
                7.00  0.4% dev_gro_receive                     [kernel]
                6.00  0.4% net_rx_action                       [kernel]
                6.00  0.4% __netdev_alloc_skb                  [kernel]
                6.00  0.4% load_balance                        [kernel]
                5.00  0.3% __smp_call_function_single          [kernel]


-------------------------------------------------------------------------------
   PerfTop:     347 irqs/sec  kernel:96.3% [1000Hz cycles],  (all, cpu: 1)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ________

              104.00  6.7% call_function_single_interrupt [kernel]
              104.00  6.7% __netif_receive_skb            [kernel]
               95.00  6.1% ip_rcv                         [kernel]
               93.00  6.0% ip_route_input                 [kernel]
               62.00  4.0% schedule                       [kernel]
               49.00  3.2% sock_recv_ts_and_drops         [kernel]
               46.00  3.0% system_call                    [kernel]
               46.00  3.0% dst_release                    [kernel]
               45.00  2.9% _raw_spin_lock                 [kernel]
               41.00  2.7% _raw_spin_lock_irqsave         [kernel]
               40.00  2.6% _raw_spin_unlock_irqrestore    [kernel]
               36.00  2.3% copy_user_generic_string       [kernel]
               34.00  2.2% __udp4_lib_rcv                 [kernel]
               30.00  1.9% fget_light                     [kernel]
               30.00  1.9% sock_queue_rcv_skb             [kernel]
               28.00  1.8% udp_recvmsg                    [kernel]
               28.00  1.8% __udp4_lib_lookup              [kernel]
               26.00  1.7% select_task_rq_fair            [kernel]
               25.00  1.6% tick_nohz_stop_sched_tick      [kernel]
               23.00  1.5% __napi_complete                [kernel]
               20.00  1.3% __switch_to                    [kernel]
               20.00  1.3% finish_task_switch             [kernel]
               20.00  1.3% kmem_cache_free                [kernel]
               20.00  1.3% sys_recvfrom                   [kernel]
               19.00  1.2% kfree                          [kernel]
               19.00  1.2% __skb_recv_datagram            [kernel]

-------------------------------------------------------------------------------
   PerfTop:     243 irqs/sec  kernel:95.5% [1000Hz cycles],  (all, cpu: 7)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ________

               92.00  7.3% ip_rcv                         [kernel]
               74.00  5.9% __netif_receive_skb            [kernel]
               57.00  4.6% ip_route_input                 [kernel]
               49.00  3.9% sock_recv_ts_and_drops         [kernel]
               49.00  3.9% system_call                    [kernel]
               47.00  3.8% schedule                       [kernel]
               39.00  3.1% _raw_spin_lock_irqsave         [kernel]
               36.00  2.9% call_function_single_interrupt [kernel]
               34.00  2.7% udp_recvmsg                    [kernel]
               32.00  2.6% __udp4_lib_rcv                 [kernel]
               31.00  2.5% copy_user_generic_string       [kernel]
               31.00  2.5% fget_light                     [kernel]
               30.00  2.4% __udp4_lib_lookup              [kernel]
               26.00  2.1% kfree                          [kernel]
               25.00  2.0% __skb_recv_datagram            [kernel]
               25.00  2.0% sock_queue_rcv_skb             [kernel]
               23.00  1.8% __switch_to                    [kernel]
               22.00  1.8% sock_recvmsg                   [kernel]
               22.00  1.8% _raw_spin_unlock_irqrestore    [kernel]
               21.00  1.7% select_task_rq_fair            [kernel]
               18.00  1.4% _raw_spin_lock                 [kernel]
               17.00  1.4% process_backlog                [kernel]
               17.00  1.4% sys_recvfrom                   [kernel]
               17.00  1.4% _raw_spin_lock_bh              [kernel]

run1
----
590479 pps (590479 0:73820 1:73817 2:73820 3:73819 4:73815 5:73815 6:73815 7:73815)
744641 pps (1335120 0:166895 1:166895 2:166895 3:166895 4:166895 5:166895 6:166895 7:166895)
744374 pps (2079494 0:259940 1:259940 2:259940 3:259940 4:259940 5:259940 6:259940 7:259940)
744340 pps (2823834 0:352985 1:352985 2:352985 3:352985 4:352985 5:352985 6:352980 7:352985)
744390 pps (3568224 0:446035 1:446035 2:446035 3:446035 4:446035 5:446035 6:446032 7:446030)
744404 pps (4312628 0:539085 1:539085 2:539085 3:539081 4:539085 5:539085 6:539085 7:539085)
744369 pps (5056997 0:632130 1:632130 2:632130 3:632130 4:632130 5:632130 6:632130 7:632130)
744394 pps (5801391 0:725180 1:725180 2:725180 3:725180 4:725180 5:725180 6:725180 7:725180)
744399 pps (6545790 0:818230 1:818230 2:818229 3:818230 4:818230 5:818226 6:818225 7:818225)
744354 pps (7290144 0:911275 1:911275 2:911275 3:911275 4:911270 5:911270 6:911270 7:911270)
744363 pps (8034507 0:1004320 1:1004320 2:1004320 3:1004320 4:1004320 5:1004306 6:1004320 7:1004317)
744379 pps (8778886 0:1097370 1:1097368 2:1097370 3:1097370 4:1097370 5:1097356 6:1097367 7:1097365)
744449 pps (9523335 0:1190425 1:1190425 2:1190425 3:1190421 4:1190425 5:1190411 6:1190425 7:1190425)
476651 pps (9999986 0:1250000 1:1250000 2:1250000 3:1250000 4:1250000 5:1249986 6:1250000 7:1250000)

99.9% !

rps counter..
865721 rps
1067721 rps

run2
----
573759 pps (573759 0:71720 1:71720 2:71720 3:71723 4:71721 5:71720 6:71720 7:71719)
744249 pps (1318008 0:164755 1:164753 2:164750 3:164750 4:164750 5:164750 6:164750 7:164750)
744260 pps (2062268 0:257785 1:257785 2:257785 3:257785 4:257785 5:257783 6:257780 7:257780)
744238 pps (2806506 0:350815 1:350815 2:350815 3:350815 4:350815 5:350811 6:350810 7:350810)
744233 pps (3550739 0:443845 1:443845 2:443845 3:443845 4:443844 5:443841 6:443841 7:443840)
744236 pps (4294975 0:536875 1:536875 2:536875 3:536870 4:536870 5:536870 6:536870 7:536870)
744244 pps (5039219 0:629905 1:629905 2:629905 3:629905 4:629905 5:629901 6:629901 7:629900)
744240 pps (5783459 0:722935 1:722935 2:722935 3:722934 4:722930 5:722930 6:722930 7:722930)
744214 pps (6527673 0:815962 1:815960 2:815965 3:815963 4:815962 5:815960 6:815955 7:815955)
744268 pps (7271941 0:908995 1:908995 2:908995 3:908995 4:908991 5:908990 6:908990 7:908990)
744239 pps (8016180 0:1002025 1:1002025 2:1002025 3:1002025 4:1002020 5:1002020 6:1002020 7:1002020)
744241 pps (8760421 0:1095055 1:1095055 2:1095052 3:1095055 4:1095055 5:1095050 6:1095050 7:1095050)
744234 pps (9504655 0:1188085 1:1188085 2:1188084 3:1188085 4:1188085 5:1188081 6:1188080 7:1188080)
495345 pps (10000000 0:1250000 1:1250000 2:1250000 3:1250000 4:1250000 5:1250000 6:1250000 7:1250000)

100.0% !!!

rps count ..
3651 rps
1455997 rps
498777 rps

run3
----
72947 pps (72947 0:9120 1:9120 2:9120 3:9120 4:9120 5:9117 6:9115 7:9115)
744616 pps (817563 0:102198 1:102195 2:102195 3:102195 4:102195 5:102195 6:102195 7:102195)
744710 pps (1562273 0:195285 1:195285 2:195285 3:195285 4:195285 5:195285 6:195285 7:195283)
744478 pps (2306751 0:288345 1:288345 2:288345 3:288345 4:288345 5:288345 6:288341 7:288340)
744603 pps (3051354 0:381422 1:381420 2:381420 3:381414 4:381420 5:381420 6:381420 7:381420)
744475 pps (3795829 0:474480 1:474480 2:474480 3:474472 4:474480 5:474480 6:474480 7:474477)
744740 pps (4540569 0:567575 1:567575 2:567575 3:567564 4:567570 5:567570 6:567570 7:567570)
744641 pps (5285210 0:660655 1:660655 2:660655 3:660646 4:660650 5:660650 6:660650 7:660650)
744300 pps (6029510 0:753695 1:753690 2:753690 3:753682 4:753690 5:753690 6:753690 7:753690)
744249 pps (6773759 0:846725 1:846725 2:846725 3:846712 4:846720 5:846720 6:846720 7:846720)
744709 pps (7518468 0:939814 1:939810 2:939810 3:939802 4:939810 5:939810 6:939810 7:939810)
744647 pps (8263115 0:1032893 1:1032890 2:1032890 3:1032882 4:1032890 5:1032890 6:1032890 7:1032890)
744672 pps (9007787 0:1125976 1:1125975 2:1125975 3:1125967 4:1125975 5:1125975 6:1125975 7:1125970)
744692 pps (9752479 0:1219065 1:1219065 2:1219062 3:1219056 4:1219060 5:1219060 6:1219060 7:1219060)
247513 pps (9999992 0:1250000 1:1250000 2:1250000 3:1249992 4:1250000 5:1250000 6:1250000 7:1250000)

99.9%!
rps count ...
1118484 rps
842940 rps

run4
----
288558 pps (288558 0:36070 1:36070 2:36070 3:36070 4:36070 5:36070 6:36070 7:36068)
744237 pps (1032795 0:129103 1:129100 2:129105 3:129100 4:129100 5:129100 6:129095 7:129095)
742988 pps (1775783 0:222135 1:222135 2:222135 3:222135 4:220853 5:222130 6:222130 7:222130)
744210 pps (2519993 0:315160 1:315160 2:315160 3:315160 4:313883 5:315160 6:315155 7:315155)
744214 pps (3264207 0:408189 1:408185 2:408185 3:408185 4:406908 5:408185 6:408185 7:408185)
744278 pps (4008485 0:501223 1:501220 2:501220 3:501220 4:499943 5:501220 6:501220 7:501220)
743699 pps (4752184 0:594252 1:594250 2:593718 3:594250 4:592973 5:594250 6:594248 7:594245)
744243 pps (5496427 0:687280 1:687280 2:686748 3:687280 4:686003 5:687280 6:687280 7:687276)
744231 pps (6240658 0:780310 1:780310 2:779778 3:780310 4:779033 5:780300 6:780310 7:780307)
743958 pps (6984616 0:873342 1:873340 2:872808 3:873340 4:872063 5:873043 6:873340 7:873340)
744241 pps (7728857 0:966373 1:966370 2:965838 3:966370 4:965093 5:966073 6:966370 7:966370)
744232 pps (8473089 0:1059400 1:1059400 2:1058868 3:1059400 4:1058123 5:1059103 6:1059397 7:1059398)
743660 pps (9216749 0:1152434 1:1152430 2:1151898 3:1152430 4:1151153 5:1151556 6:1152427 7:1152430)
744251 pps (9961000 0:1245463 1:1245460 2:1244928 3:1245460 4:1244183 5:1244586 6:1245460 7:1245460)
36317 pps (9997317 0:1250000 1:1250000 2:1249468 3:1250000 4:1248723 5:1249126 6:1250000 7:1250000)

99.9%!
rps count
818552 rps
1146570 rps

run 5
----
686211 pps (686211 0:85780 1:85780 2:85775 3:85779 4:85780 5:85780 6:85775 7:85775)
744260 pps (1430471 0:178810 1:178810 2:178810 3:178810 4:178810 5:178810 6:178806 7:178805)
744242 pps (2174713 0:271840 1:271840 2:271840 3:271840 4:271840 5:271840 6:271838 7:271835)
744241 pps (2918954 0:364870 1:364870 2:364870 3:364870 4:364870 5:364870 6:364869 7:364865)
744238 pps (3663192 0:457900 1:457900 2:457900 3:457900 4:457900 5:457900 6:457900 7:457899)
744240 pps (4407432 0:550930 1:550930 2:550930 3:550930 4:550930 5:550930 6:550927 7:550925)
744244 pps (5151676 0:643960 1:643960 2:643960 3:643960 4:643960 5:643960 6:643960 7:643956)
744236 pps (5895912 0:736990 1:736990 2:736990 3:736990 4:736990 5:736990 6:736987 7:736985)
744241 pps (6640153 0:830020 1:830020 2:830020 3:830020 4:830020 5:830020 6:830018 7:830015)
744235 pps (7384388 0:923050 1:923050 2:923050 3:923050 4:923050 5:923049 6:923045 7:923047)
744244 pps (8128632 0:1016080 1:1016080 2:1016080 3:1016080 4:1016080 5:1016080 6:1016079 7:1016075)
744231 pps (8872863 0:1109110 1:1109110 2:1109110 3:1109110 4:1109108 5:1109105 6:1109105 7:1109105)
744258 pps (9617121 0:1202141 1:1202140 2:1202140 3:1202140 4:1202140 5:1202140 6:1202140 7:1202140)
382879 pps (10000000 0:1250000 1:1250000 2:1250000 3:1250000 4:1250000 5:1250000 6:1250000 7:1250000)

100%
rps ipi count ..
768383 rps
1178132 rps

^ permalink raw reply	[flat|nested] 108+ messages in thread

* [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
  2010-04-29 20:36                                   ` jamal
@ 2010-04-29 21:01                                     ` Eric Dumazet
  2010-04-30 13:55                                       ` Brian Bloniarz
  2010-04-30 23:35                                       ` David Miller
  2010-04-30 19:30                                     ` [PATCH net-next-2.6] net: speedup udp receive path jamal
  1 sibling, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-29 21:01 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le jeudi 29 avril 2010 à 16:36 -0400, jamal a écrit :

> Results attached.
> With your app rps does a hell lot better and non-rps worse ;->
> With my proggie, non-rps does much better than yours and rps does
> a lot worse for same setup. I see the scheduler kicking quite a bit in
> non-rps for you...
> 
> The main difference between us as i see it is:
> a) i use epoll - actually linked to libevent (1.0.something)
> b) I fork processes and you use pthreads.
> 
> I dont have time to chase it today, but 1) I am either going to change
> yours to use libevent or make mine get rid of it then 2) move towards
> pthreads or have yours fork..
> then observe if that makes any difference..
> 

Thanks !

Here is last 'patch of the day' for me ;)

Next one will be able to coalesce wakeup calls (they'll be delayed at
the end of net_rx_action(), like a patch I did last year to help
multicast reception)

vger seems to be down, I suspect I'll have to resend it later.

[PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion

sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we
need two atomic operations (and associated dirtying) per incoming
packet.

RCU conversion is pretty much needed :

1) Add a new structure, called "struct socket_wq" to hold all fields
that will need rcu_read_lock() protection (currently: a
wait_queue_head_t and a struct fasync_struct pointer).

[Future patch will add a list anchor for wakeup coalescing]

2) Attach one of such structure to each "struct socket" created in
sock_alloc_inode().

3) Respect RCU grace period when freeing a "struct socket_wq"

4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct
socket_wq"

5) Change sk_sleep() function to use new sk->sk_wq instead of
sk->sk_sleep

6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside
a rcu_read_lock() section.

7) Change all sk_has_sleeper() callers to :
  - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock)
  - Use wq_has_sleeper() to eventually wakeup tasks.
  - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock)

8) sock_wake_async() is modified to use rcu protection as well.

9) Exceptions :
  macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq"
instead of dynamically allocated ones. They don't need rcu freeing.

Some cleanups or followups are probably needed, (possible
sk_callback_lock conversion to a spinlock for example...).

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
--- 
 drivers/net/macvtap.c |   13 +++++++---
 drivers/net/tun.c     |   21 +++++++++-------
 include/linux/net.h   |   14 +++++++----
 include/net/af_unix.h |   20 ++++++++--------
 include/net/sock.h    |   40 ++++++++++++++++----------------
 net/atm/common.c      |   22 +++++++++++------
 net/core/sock.c       |   50 ++++++++++++++++++++++++----------------
 net/core/stream.c     |   10 +++++---
 net/dccp/output.c     |   10 ++++----
 net/iucv/af_iucv.c    |   11 +++++---
 net/phonet/pep.c      |    8 +++---
 net/phonet/socket.c   |    2 -
 net/rxrpc/af_rxrpc.c  |   10 ++++----
 net/sctp/socket.c     |    2 -
 net/socket.c          |   47 ++++++++++++++++++++++++++++---------
 net/unix/af_unix.c    |   17 ++++++-------
 16 files changed, 182 insertions(+), 115 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index d97e1fd..1c4110d 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -37,6 +37,7 @@
 struct macvtap_queue {
 	struct sock sk;
 	struct socket sock;
+	struct socket_wq wq;
 	struct macvlan_dev *vlan;
 	struct file *file;
 	unsigned int flags;
@@ -242,12 +243,15 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
 
 static void macvtap_sock_write_space(struct sock *sk)
 {
+	wait_queue_head_t *wqueue;
+
 	if (!sock_writeable(sk) ||
 	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
-	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-		wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND);
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
 }
 
 static int macvtap_open(struct inode *inode, struct file *file)
@@ -272,7 +276,8 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	if (!q)
 		goto out;
 
-	init_waitqueue_head(&q->sock.wait);
+	q->sock.wq = &q->wq;
+	init_waitqueue_head(&q->wq.wait);
 	q->sock.type = SOCK_RAW;
 	q->sock.state = SS_CONNECTED;
 	q->sock.file = file;
@@ -308,7 +313,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
 		goto out;
 
 	mask = 0;
-	poll_wait(file, &q->sock.wait, wait);
+	poll_wait(file, &q->wq.wait, wait);
 
 	if (!skb_queue_empty(&q->sk.sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 20a1793..e525a6c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -109,7 +109,7 @@ struct tun_struct {
 
 	struct tap_filter       txflt;
 	struct socket		socket;
-
+	struct socket_wq	wq;
 #ifdef TUN_DEBUG
 	int debug;
 #endif
@@ -323,7 +323,7 @@ static void tun_net_uninit(struct net_device *dev)
 	/* Inform the methods they need to stop using the dev.
 	 */
 	if (tfile) {
-		wake_up_all(&tun->socket.wait);
+		wake_up_all(&tun->wq.wait);
 		if (atomic_dec_and_test(&tfile->count))
 			__tun_detach(tun);
 	}
@@ -398,7 +398,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Notify and wake up reader process */
 	if (tun->flags & TUN_FASYNC)
 		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
-	wake_up_interruptible_poll(&tun->socket.wait, POLLIN |
+	wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
 				   POLLRDNORM | POLLRDBAND);
 	return NETDEV_TX_OK;
 
@@ -498,7 +498,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
 
 	DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);
 
-	poll_wait(file, &tun->socket.wait, wait);
+	poll_wait(file, &tun->wq.wait, wait);
 
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
@@ -773,7 +773,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,
 
 	DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);
 
-	add_wait_queue(&tun->socket.wait, &wait);
+	add_wait_queue(&tun->wq.wait, &wait);
 	while (len) {
 		current->state = TASK_INTERRUPTIBLE;
 
@@ -804,7 +804,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,
 	}
 
 	current->state = TASK_RUNNING;
-	remove_wait_queue(&tun->socket.wait, &wait);
+	remove_wait_queue(&tun->wq.wait, &wait);
 
 	return ret;
 }
@@ -861,6 +861,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
 static void tun_sock_write_space(struct sock *sk)
 {
 	struct tun_struct *tun;
+	wait_queue_head_t *wqueue;
 
 	if (!sock_writeable(sk))
 		return;
@@ -868,8 +869,9 @@ static void tun_sock_write_space(struct sock *sk)
 	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
-	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
 	tun = tun_sk(sk)->tun;
@@ -1039,7 +1041,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		if (!sk)
 			goto err_free_dev;
 
-		init_waitqueue_head(&tun->socket.wait);
+		tun->socket.wq = &tun->wq;
+		init_waitqueue_head(&tun->wq.wait);
 		tun->socket.ops = &tun_socket_ops;
 		sock_init_data(&tun->socket, sk);
 		sk->sk_write_space = tun_sock_write_space;
diff --git a/include/linux/net.h b/include/linux/net.h
index 4157b5d..2b4deee 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -59,6 +59,7 @@ typedef enum {
 #include <linux/wait.h>
 #include <linux/fcntl.h>	/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/kmemcheck.h>
+#include <linux/rcupdate.h>
 
 struct poll_table_struct;
 struct pipe_inode_info;
@@ -116,6 +117,12 @@ enum sock_shutdown_cmd {
 	SHUT_RDWR	= 2,
 };
 
+struct socket_wq {
+	wait_queue_head_t	wait;
+	struct fasync_struct	*fasync_list;
+	struct rcu_head		rcu;
+} ____cacheline_aligned_in_smp;
+
 /**
  *  struct socket - general BSD socket
  *  @state: socket state (%SS_CONNECTED, etc)
@@ -135,11 +142,8 @@ struct socket {
 	kmemcheck_bitfield_end(type);
 
 	unsigned long		flags;
-	/*
-	 * Please keep fasync_list & wait fields in the same cache line
-	 */
-	struct fasync_struct	*fasync_list;
-	wait_queue_head_t	wait;
+
+	struct socket_wq	*wq;
 
 	struct file		*file;
 	struct sock		*sk;
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 1614d78..20725e2 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -30,7 +30,7 @@ struct unix_skb_parms {
 #endif
 };
 
-#define UNIXCB(skb) 	(*(struct unix_skb_parms*)&((skb)->cb))
+#define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb))
 #define UNIXCREDS(skb)	(&UNIXCB((skb)).creds)
 #define UNIXSID(skb)	(&UNIXCB((skb)).secid)
 
@@ -45,21 +45,23 @@ struct unix_skb_parms {
 struct unix_sock {
 	/* WARNING: sk has to be the first member */
 	struct sock		sk;
-        struct unix_address     *addr;
-        struct dentry		*dentry;
-        struct vfsmount		*mnt;
+	struct unix_address     *addr;
+	struct dentry		*dentry;
+	struct vfsmount		*mnt;
 	struct mutex		readlock;
-        struct sock		*peer;
-        struct sock		*other;
+	struct sock		*peer;
+	struct sock		*other;
 	struct list_head	link;
-        atomic_long_t           inflight;
-        spinlock_t		lock;
+	atomic_long_t		inflight;
+	spinlock_t		lock;
 	unsigned int		gc_candidate : 1;
 	unsigned int		gc_maybe_cycle : 1;
-        wait_queue_head_t       peer_wait;
+	struct socket_wq	peer_wq;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 
+#define peer_wait peer_wq.wait
+
 #ifdef CONFIG_SYSCTL
 extern int unix_sysctl_register(struct net *net);
 extern void unix_sysctl_unregister(struct net *net);
diff --git a/include/net/sock.h b/include/net/sock.h
index d361c77..03d0046 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -159,7 +159,7 @@ struct sock_common {
   *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
   *	@sk_lock:	synchronizer
   *	@sk_rcvbuf: size of receive buffer in bytes
-  *	@sk_sleep: sock wait queue
+  *	@sk_wq: sock wait queue and async head
   *	@sk_dst_cache: destination cache
   *	@sk_dst_lock: destination cache lock
   *	@sk_policy: flow policy
@@ -257,7 +257,7 @@ struct sock {
 		struct sk_buff *tail;
 		int len;
 	} sk_backlog;
-	wait_queue_head_t	*sk_sleep;
+	struct socket_wq	*sk_wq;
 	struct dst_entry	*sk_dst_cache;
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
@@ -1219,7 +1219,7 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 
 static inline wait_queue_head_t *sk_sleep(struct sock *sk)
 {
-	return sk->sk_sleep;
+	return &sk->sk_wq->wait;
 }
 /* Detach socket from process context.
  * Announce socket dead, detach it from wait queue and inode.
@@ -1233,14 +1233,14 @@ static inline void sock_orphan(struct sock *sk)
 	write_lock_bh(&sk->sk_callback_lock);
 	sock_set_flag(sk, SOCK_DEAD);
 	sk_set_socket(sk, NULL);
-	sk->sk_sleep  = NULL;
+	sk->sk_wq  = NULL;
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
 static inline void sock_graft(struct sock *sk, struct socket *parent)
 {
 	write_lock_bh(&sk->sk_callback_lock);
-	sk->sk_sleep = &parent->wait;
+	rcu_assign_pointer(sk->sk_wq, parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
 	security_sock_graft(sk, parent);
@@ -1392,12 +1392,12 @@ static inline int sk_has_allocations(const struct sock *sk)
 }
 
 /**
- * sk_has_sleeper - check if there are any waiting processes
- * @sk: socket
+ * wq_has_sleeper - check if there are any waiting processes
+ * @sk: struct socket_wq
  *
- * Returns true if socket has waiting processes
+ * Returns true if socket_wq has waiting processes
  *
- * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory
+ * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
  * barrier call. They were added due to the race found within the tcp code.
  *
  * Consider following tcp code paths:
@@ -1410,9 +1410,10 @@ static inline int sk_has_allocations(const struct sock *sk)
  *   ...                 ...
  *   tp->rcv_nxt check   sock_def_readable
  *   ...                 {
- *   schedule               ...
- *                          if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- *                              wake_up_interruptible(sk_sleep(sk))
+ *   schedule               rcu_read_lock();
+ *                          wq = rcu_dereference(sk->sk_wq);
+ *                          if (wq && waitqueue_active(&wq->wait))
+ *                              wake_up_interruptible(&wq->wait)
  *                          ...
  *                       }
  *
@@ -1421,28 +1422,27 @@ static inline int sk_has_allocations(const struct sock *sk)
  * could then endup calling schedule and sleep forever if there are no more
  * data on the socket.
  *
- * The sk_has_sleeper is always called right after a call to read_lock, so we
- * can use smp_mb__after_lock barrier.
  */
-static inline int sk_has_sleeper(struct sock *sk)
+static inline bool wq_has_sleeper(struct socket_wq *wq)
 {
+
 	/*
 	 * We need to be sure we are in sync with the
 	 * add_wait_queue modifications to the wait queue.
 	 *
 	 * This memory barrier is paired in the sock_poll_wait.
 	 */
-	smp_mb__after_lock();
-	return sk_sleep(sk) && waitqueue_active(sk_sleep(sk));
+	smp_mb();
+	return wq && waitqueue_active(&wq->wait);
 }
-
+ 
 /**
  * sock_poll_wait - place memory barrier behind the poll_wait call.
  * @filp:           file
  * @wait_address:   socket wait queue
  * @p:              poll_table
  *
- * See the comments in the sk_has_sleeper function.
+ * See the comments in the wq_has_sleeper function.
  */
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
@@ -1453,7 +1453,7 @@ static inline void sock_poll_wait(struct file *filp,
 		 * We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
-		 * This memory barrier is paired in the sk_has_sleeper.
+		 * This memory barrier is paired in the wq_has_sleeper.
 		*/
 		smp_mb();
 	}
diff --git a/net/atm/common.c b/net/atm/common.c
index e3e10e6..b43feb1 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -90,10 +90,13 @@ static void vcc_sock_destruct(struct sock *sk)
 
 static void vcc_def_wakeup(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up(sk_sleep(sk));
-	read_unlock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up(&wq->wait);
+	rcu_read_unlock();
 }
 
 static inline int vcc_writable(struct sock *sk)
@@ -106,16 +109,19 @@ static inline int vcc_writable(struct sock *sk)
 
 static void vcc_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 
 	if (vcc_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible(sk_sleep(sk));
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible(&wq->wait);
 
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static struct proto vcc_proto = {
diff --git a/net/core/sock.c b/net/core/sock.c
index 5104175..94c4aff 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1211,7 +1211,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		 */
 		sk_refcnt_debug_inc(newsk);
 		sk_set_socket(newsk, NULL);
-		newsk->sk_sleep	 = NULL;
+		newsk->sk_wq = NULL;
 
 		if (newsk->sk_prot->sockets_allocated)
 			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
@@ -1800,41 +1800,53 @@ EXPORT_SYMBOL(sock_no_sendpage);
 
 static void sock_def_wakeup(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_all(sk_sleep(sk));
-	read_unlock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_all(&wq->wait);
+	rcu_read_unlock();
 }
 
 static void sock_def_error_report(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_poll(sk_sleep(sk), POLLERR);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_poll(&wq->wait, POLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_readable(struct sock *sk, int len)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLIN |
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
 						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 
 	/* Do not wake up a writer until he can make "significant"
 	 * progress.  --DaveM
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
@@ -1842,7 +1854,7 @@ static void sock_def_write_space(struct sock *sk)
 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1896,10 +1908,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	if (sock) {
 		sk->sk_type	=	sock->type;
-		sk->sk_sleep	=	&sock->wait;
+		sk->sk_wq	=	sock->wq;
 		sock->sk	=	sk;
 	} else
-		sk->sk_sleep	=	NULL;
+		sk->sk_wq	=	NULL;
 
 	spin_lock_init(&sk->sk_dst_lock);
 	rwlock_init(&sk->sk_callback_lock);
diff --git a/net/core/stream.c b/net/core/stream.c
index 7b3c3f3..cc196f4 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -28,15 +28,19 @@
 void sk_stream_write_space(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
+	struct socket_wq *wq;
 
 	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
 		clear_bit(SOCK_NOSPACE, &sock->flags);
 
-		if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-			wake_up_interruptible_poll(sk_sleep(sk), POLLOUT |
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
-		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+		rcu_read_unlock();
 	}
 }
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 2d3dcb3..aadbdb5 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -195,15 +195,17 @@ EXPORT_SYMBOL_GPL(dccp_sync_mss);
 
 void dccp_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
 
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible(sk_sleep(sk));
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible(&wq->wait);
 	/* Should agree with poll, otherwise some programs break */
 	if (sock_writeable(sk))
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /**
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 9636b7d..8be324f 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -305,11 +305,14 @@ static inline int iucv_below_msglim(struct sock *sk)
  */
 static void iucv_sock_wake_msglim(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_all(sk_sleep(sk));
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_all(&wq->wait);
 	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /* Timers */
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index e2a9576..af4d38b 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -664,12 +664,12 @@ static int pep_wait_connreq(struct sock *sk, int noblock)
 		if (signal_pending(tsk))
 			return sock_intr_errno(timeo);
 
-		prepare_to_wait_exclusive(&sk->sk_socket->wait, &wait,
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
 						TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		timeo = schedule_timeout(timeo);
 		lock_sock(sk);
-		finish_wait(&sk->sk_socket->wait, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 	}
 
 	return 0;
@@ -910,10 +910,10 @@ disabled:
 			goto out;
 		}
 
-		prepare_to_wait(&sk->sk_socket->wait, &wait,
+		prepare_to_wait(sk_sleep(sk), &wait,
 				TASK_INTERRUPTIBLE);
 		done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits));
-		finish_wait(&sk->sk_socket->wait, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 
 		if (sk->sk_state != TCP_ESTABLISHED)
 			goto disabled;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index c785bfd..6e9848b 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -265,7 +265,7 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
 	struct pep_sock *pn = pep_sk(sk);
 	unsigned int mask = 0;
 
-	poll_wait(file, &sock->wait, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	switch (sk->sk_state) {
 	case TCP_LISTEN:
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index c432d76..0b9bb20 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -62,13 +62,15 @@ static inline int rxrpc_writable(struct sock *sk)
 static void rxrpc_write_space(struct sock *sk)
 {
 	_enter("%p", sk);
-	read_lock(&sk->sk_callback_lock);
+	rcu_read_lock();
 	if (rxrpc_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible(sk_sleep(sk));
+		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible(&wq->wait);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /*
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 13d8229..d54700a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6065,7 +6065,7 @@ static void __sctp_write_space(struct sctp_association *asoc)
 			 * here by modeling from the current TCP/UDP code.
 			 * We have not tested with it yet.
 			 */
-			if (sock->fasync_list &&
+			if (sock->wq->fasync_list &&
 			    !(sk->sk_shutdown & SEND_SHUTDOWN))
 				sock_wake_async(sock,
 						SOCK_WAKE_SPACE, POLL_OUT);
diff --git a/net/socket.c b/net/socket.c
index 9822081..a0a59cb 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -252,9 +252,14 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-	init_waitqueue_head(&ei->socket.wait);
+	ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
+	if (!ei->socket.wq) {
+		kmem_cache_free(sock_inode_cachep, ei);
+		return NULL;
+	}
+	init_waitqueue_head(&ei->socket.wq->wait);
+	ei->socket.wq->fasync_list = NULL;
 
-	ei->socket.fasync_list = NULL;
 	ei->socket.state = SS_UNCONNECTED;
 	ei->socket.flags = 0;
 	ei->socket.ops = NULL;
@@ -264,10 +269,21 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
+
+static void wq_free_rcu(struct rcu_head *head)
+{
+	struct socket_wq *wq = container_of(head, struct socket_wq, rcu);
+
+	kfree(wq);
+}
+
 static void sock_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(sock_inode_cachep,
-			container_of(inode, struct socket_alloc, vfs_inode));
+	struct socket_alloc *ei;
+
+	ei = container_of(inode, struct socket_alloc, vfs_inode);
+	call_rcu(&ei->socket.wq->rcu, wq_free_rcu);
+	kmem_cache_free(sock_inode_cachep, ei);
 }
 
 static void init_once(void *foo)
@@ -513,7 +529,7 @@ void sock_release(struct socket *sock)
 		module_put(owner);
 	}
 
-	if (sock->fasync_list)
+	if (sock->wq->fasync_list)
 		printk(KERN_ERR "sock_release: fasync list not empty!\n");
 
 	percpu_sub(sockets_in_use, 1);
@@ -1080,9 +1096,9 @@ static int sock_fasync(int fd, struct file *filp, int on)
 
 	lock_sock(sk);
 
-	fasync_helper(fd, filp, on, &sock->fasync_list);
+	fasync_helper(fd, filp, on, &sock->wq->fasync_list);
 
-	if (!sock->fasync_list)
+	if (!sock->wq->fasync_list)
 		sock_reset_flag(sk, SOCK_FASYNC);
 	else
 		sock_set_flag(sk, SOCK_FASYNC);
@@ -1091,12 +1107,20 @@ static int sock_fasync(int fd, struct file *filp, int on)
 	return 0;
 }
 
-/* This function may be called only under socket lock or callback_lock */
+/* This function may be called only under socket lock or callback_lock or rcu_lock */
 
 int sock_wake_async(struct socket *sock, int how, int band)
 {
-	if (!sock || !sock->fasync_list)
+	struct socket_wq *wq;
+
+	if (!sock)
 		return -1;
+	rcu_read_lock();
+	wq = rcu_dereference(sock->wq);
+	if (!wq || !wq->fasync_list) {
+		rcu_read_unlock();
+		return -1;
+	}
 	switch (how) {
 	case SOCK_WAKE_WAITD:
 		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
@@ -1108,11 +1132,12 @@ int sock_wake_async(struct socket *sock, int how, int band)
 		/* fall through */
 	case SOCK_WAKE_IO:
 call_kill:
-		kill_fasync(&sock->fasync_list, SIGIO, band);
+		kill_fasync(&wq->fasync_list, SIGIO, band);
 		break;
 	case SOCK_WAKE_URG:
-		kill_fasync(&sock->fasync_list, SIGURG, band);
+		kill_fasync(&wq->fasync_list, SIGURG, band);
 	}
+	rcu_read_unlock();
 	return 0;
 }
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 87c0360..fef2cc5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -313,13 +313,16 @@ static inline int unix_writable(struct sock *sk)
 
 static void unix_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 	if (unix_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync(sk_sleep(sk));
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_sync(&wq->wait);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /* When dgram socket disconnects (or changes its peer), we clear its receive
@@ -406,9 +409,7 @@ static int unix_release_sock(struct sock *sk, int embrion)
 				skpair->sk_err = ECONNRESET;
 			unix_state_unlock(skpair);
 			skpair->sk_state_change(skpair);
-			read_lock(&skpair->sk_callback_lock);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
-			read_unlock(&skpair->sk_callback_lock);
 		}
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
@@ -1142,7 +1143,7 @@ restart:
 	newsk->sk_peercred.pid	= task_tgid_vnr(current);
 	current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid);
 	newu = unix_sk(newsk);
-	newsk->sk_sleep		= &newu->peer_wait;
+	newsk->sk_wq		= &newu->peer_wq;
 	otheru = unix_sk(other);
 
 	/* copy address information from listening to new sock*/
@@ -1931,12 +1932,10 @@ static int unix_shutdown(struct socket *sock, int mode)
 			other->sk_shutdown |= peer_mode;
 			unix_state_unlock(other);
 			other->sk_state_change(other);
-			read_lock(&other->sk_callback_lock);
 			if (peer_mode == SHUTDOWN_MASK)
 				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
 			else if (peer_mode & RCV_SHUTDOWN)
 				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
-			read_unlock(&other->sk_callback_lock);
 		}
 		if (other)
 			sock_put(other);



^ permalink raw reply related	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 12:45                       ` Eric Dumazet
  2010-04-29 13:17                         ` jamal
@ 2010-04-29 23:07                         ` Changli Gao
  1 sibling, 0 replies; 108+ messages in thread
From: Changli Gao @ 2010-04-29 23:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: hadi, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Thu, Apr 29, 2010 at 8:45 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Changli, I wonder how you can cook "performance" patches without testing
> them at all for real... This cannot be true ?
>

I am sorry. But I wasn't against your patch, and I just wanted to
understand the test result from jamal. It is my fault submitting a
performance patch without testing them. I should not rely on code
inspection for the performance patch.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
       [not found]                   ` <20100429214144.GA10663@gargoyle.fritz.box>
@ 2010-04-30  5:25                     ` Eric Dumazet
  2010-04-30 23:38                     ` David Miller
  1 sibling, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-30  5:25 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andi Kleen, hadi, Changli Gao, David S. Miller, Tom Herbert,
	Stephen Hemminger, netdev, lenb, arjan

Le jeudi 29 avril 2010 à 23:41 +0200, Andi Kleen a écrit :
> On Thu, Apr 29, 2010 at 09:12:27PM +0200, Eric Dumazet wrote:
> > Yes, mostly, but about 200.000 wakeups per second I would say...
> > 
> > If a cpu in deep state receives an IPI, processes a softirq, should it
> > come back to deep state immediately, or should it wait for some
> > milliseconds ?
> 
> In principle the cpuidle governor should detect this and not put the target into
> the slow deep C states. One change that was done recently to fix a similar 
> problem for disk IO was to take processes that wait for IO into account 
> (see 69d25870). But it doesn't work for networking.
> 
> Here's an untested patch that might help: tell the cpuidle governor 
> networking is waiting for IO. This will tell it to not go into the deep states.
> 
> I might have missed some schedule() paths, feel free to add more.
> 
> Actually it's probably too aggressive because it will avoid C states even for
> a closed window on the other side which might be hours. Better would
> be some heuristic to only do this when you're really expecting IO shortly.
> 
> Also does your workload even sleep at all? If not we would need to increase
> the iowait counters in recvmsg() itself.
> 

My workload yes, uses blocking recvmsg() calls, but Jamal one uses
epoll() so I guess problem is more generic than that. We should have an
estimate of the number of wakeups (IO or not...) per second (or
sub-second) so that cpuidle can avoid these deep states ?

> Anyways might be still worth a try.
> 
> For routing we probably need some other solution though, there are no 
> schedules there.
> 
> > 
> > > Perhaps need to feed some information to cpuidle's governor to prevent this problem.
> > > 
> > > idle=poll is very drastic, better to limit to C1 
> > > 
> > 
> > How can I do this ?
> 
> processor.max_cstate=1 or using /dev/network_latency 
> (see Documentation/power/pm_qos_interface.txt)
> 
> -Andi
> 

Thanks, I'll play with this today !

> 
> 
> commit 810227a7c24ecae2bb4aac320490a7115ac33be8
> Author: Andi Kleen <ak@linux.intel.com>
> Date:   Thu Apr 29 23:33:18 2010 +0200
> 
>     Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies
> 
>     XXX: probably too aggressive, some of these sleeps are not under high load.
> 
>     Based on a bug report from Eric Dumazet.
>     
>     Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> diff --git a/net/core/sock.c b/net/core/sock.c
> index c5812bb..c246d6c 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1402,7 +1402,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
>  			break;
>  		if (sk->sk_err)
>  			break;
> -		timeo = schedule_timeout(timeo);
> +		timeo = io_schedule_timeout(timeo);
>  	}
>  	finish_wait(sk->sk_sleep, &wait);
>  	return timeo;
> @@ -1512,7 +1512,7 @@ static void __lock_sock(struct sock *sk)
>  		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
>  					TASK_UNINTERRUPTIBLE);
>  		spin_unlock_bh(&sk->sk_lock.slock);
> -		schedule();
> +		io_schedule();
>  		spin_lock_bh(&sk->sk_lock.slock);
>  		if (!sock_owned_by_user(sk))
>  			break;
> 
> > 
> > Thanks !
> > 
> > 



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
  2010-04-29 21:01                                     ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet
@ 2010-04-30 13:55                                       ` Brian Bloniarz
  2010-04-30 17:26                                         ` Eric Dumazet
  2010-04-30 23:35                                       ` David Miller
  1 sibling, 1 reply; 108+ messages in thread
From: Brian Bloniarz @ 2010-04-30 13:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: hadi, Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein

Eric Dumazet wrote:
> Here is last 'patch of the day' for me ;)
> Next one will be able to coalesce wakeup calls (they'll be delayed at
> the end of net_rx_action(), like a patch I did last year to help
> multicast reception)
>
> vger seems to be down, I suspect I'll have to resend it later.
>
> [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
>
> sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we
> need two atomic operations (and associated dirtying) per incoming
> packet.
>   

This patch boots for me, I haven't noticed any strangeness yet.

I ran a few benchmarks (the multicast fan-out mcasttest.c
from last year, a few other things we have lying around).
I think I see a modest improvement from this and your other
2 patches. Presumably the big wins are where multiple cores
perform bh for the same socket, that's not the case in
these benchmarks. If it's appropriate:

Tested-by: Brian Bloniarz <bmb@athenacr.com>

> Next one will be able to coalesce wakeup calls (they'll be delayed at
> the end of net_rx_action(), like a patch I did last year to help
> multicast reception)

Keep em coming :)

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
  2010-04-30 13:55                                       ` Brian Bloniarz
@ 2010-04-30 17:26                                         ` Eric Dumazet
  0 siblings, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-04-30 17:26 UTC (permalink / raw)
  To: Brian Bloniarz
  Cc: hadi, Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein

Le vendredi 30 avril 2010 à 09:55 -0400, Brian Bloniarz a écrit :

> 
> This patch boots for me, I haven't noticed any strangeness yet.
> 
> I ran a few benchmarks (the multicast fan-out mcasttest.c
> from last year, a few other things we have lying around).
> I think I see a modest improvement from this and your other
> 2 patches. Presumably the big wins are where multiple cores
> perform bh for the same socket, that's not the case in
> these benchmarks. If it's appropriate:
> 
> Tested-by: Brian Bloniarz <bmb@athenacr.com>
> 
> > Next one will be able to coalesce wakeup calls (they'll be delayed at
> > the end of net_rx_action(), like a patch I did last year to help
> > multicast reception)
> 
> Keep em coming :)

Thanks for testing !

Here is a respin of "net: relax dst refcnt in input path"
patch for net-next-2.6

Not ready for inclusion, but seems to work quite well on multicast
load : I get about 20% more packets on mcasttest

(Avoid atomic ops on dst entries on input path, and partly on forwarding
path). On mcasttest, all sockets share same dst, so producer/consumers
all fight on a single cache line.

Old ref (for informations) :
http://kerneltrap.org/mailarchive/linux-netdev/2009/7/22/6248753

Not-Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

 include/linux/skbuff.h    |   45 +++++++++++++++++++++++++++++++++-
 include/net/dst.h         |   47 +++++++++++++++++++++++++++++++++---
 include/net/route.h       |    2 -
 include/net/sock.h        |    2 +
 net/bridge/br_netfilter.c |    2 -
 net/core/dev.c            |    3 ++
 net/core/skbuff.c         |    3 +-
 net/core/sock.c           |    6 ++++
 net/ipv4/arp.c            |    2 -
 net/ipv4/icmp.c           |    8 +++---
 net/ipv4/ip_forward.c     |    1 
 net/ipv4/ip_fragment.c    |    2 -
 net/ipv4/ip_input.c       |    2 -
 net/ipv4/ip_options.c     |   11 ++++----
 net/ipv4/netfilter.c      |    8 +++---
 net/ipv4/route.c          |   15 +++++++----
 net/ipv4/xfrm4_input.c    |    2 -
 net/ipv6/ip6_tunnel.c     |    2 -
 net/netfilter/nf_queue.c  |    2 +
 net/sched/sch_generic.c   |    2 -
 20 files changed, 136 insertions(+), 31 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 82f5116..6195bcf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -414,16 +414,59 @@ struct sk_buff {
 
 #include <asm/system.h>
 
+/*
+ * skb might have a dst pointer attached, refcounted or not
+ * _skb_dst low order bit is set if refcount was taken
+ */
+#define SKB_DST_NOREF	1UL
+#define SKB_DST_PTRMASK	~(SKB_DST_NOREF)
+
+/**
+ * skb_dst - returns skb dst_entry
+ * @skb: buffer
+ *
+ * Returns skb dst_entry, regardless of reference taken or not.
+ */
 static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
 {
-	return (struct dst_entry *)skb->_skb_dst;
+	return (struct dst_entry *)(skb->_skb_dst & SKB_DST_PTRMASK);
 }
 
+/**
+ * skb_dst_set - sets skb dst
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was taken on dst and should
+ * be released by skb_dst_drop()
+ */
 static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 {
 	skb->_skb_dst = (unsigned long)dst;
 }
 
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was _not_ taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+	skb->_skb_dst = (unsigned long)dst | SKB_DST_NOREF;
+}
+
+/**
+ * skb_dst_is_noref - Test if skb dst isn't refcounted
+ * @skb: buffer
+ */
+static inline bool skb_dst_is_noref(const struct sk_buff *skb)
+{
+	return (skb->_skb_dst & SKB_DST_NOREF) && skb_dst(skb);
+}
+
 static inline struct rtable *skb_rtable(const struct sk_buff *skb)
 {
 	return (struct rtable *)skb_dst(skb);
diff --git a/include/net/dst.h b/include/net/dst.h
index aac5a5f..ad6ea9e 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -168,6 +168,12 @@ static inline void dst_use(struct dst_entry *dst, unsigned long time)
 	dst->lastuse = time;
 }
 
+static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
+{
+	dst->__use++;
+	dst->lastuse = time;
+}
+
 static inline
 struct dst_entry * dst_clone(struct dst_entry * dst)
 {
@@ -177,11 +183,46 @@ struct dst_entry * dst_clone(struct dst_entry * dst)
 }
 
 extern void dst_release(struct dst_entry *dst);
+
+static inline void __skb_dst_drop(unsigned long _skb_dst)
+{
+	if (!(_skb_dst & SKB_DST_NOREF))
+		dst_release((struct dst_entry *)(_skb_dst & SKB_DST_PTRMASK));
+}
+
+/**
+ * skb_dst_drop - drops skb dst
+ * @skb: buffer
+ *
+ * Drops dst reference count if a reference was taken.
+ */
 static inline void skb_dst_drop(struct sk_buff *skb)
 {
-	if (skb->_skb_dst)
-		dst_release(skb_dst(skb));
-	skb->_skb_dst = 0UL;
+	if (skb->_skb_dst) {
+		__skb_dst_drop(skb->_skb_dst);
+		skb->_skb_dst = 0UL;
+	}
+}
+
+static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
+{
+	nskb->_skb_dst = oskb->_skb_dst;
+	if (!(nskb->_skb_dst & SKB_DST_NOREF))
+		dst_clone(skb_dst(nskb));
+}
+
+/**
+ * skb_dst_force - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted, let's do it
+ */
+static inline void skb_dst_force(struct sk_buff *skb)
+{
+	if (skb->_skb_dst & SKB_DST_NOREF) {
+		skb->_skb_dst &= ~SKB_DST_NOREF;
+		dst_clone(skb_dst(skb));
+	}
 }
 
 /* Children define the path of the packet through the
diff --git a/include/net/route.h b/include/net/route.h
index 2c9fba7..443f6d4 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -112,7 +112,7 @@ extern void		rt_cache_flush_batch(void);
 extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
-extern int		ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin);
+extern int		ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin, bool noref);
 extern unsigned short	ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev);
 extern void		ip_rt_send_redirect(struct sk_buff *skb);
 
diff --git a/include/net/sock.h b/include/net/sock.h
index d361c77..0a0f14d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -598,6 +598,8 @@ static inline int sk_stream_memory_free(struct sock *sk)
 /* OOB backlog add */
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
+	/* don't leave skb dst unreferenced, we are going to leave rcu lock */
+	skb_dst_force(skb);
 	if (!sk->sk_backlog.tail) {
 		sk->sk_backlog.head = sk->sk_backlog.tail = skb;
 	} else {
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 4c4977d..c943ad4 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -350,7 +350,7 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
 	}
 	nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
 	if (dnat_took_place(skb)) {
-		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev, false))) {
 			struct flowi fl = {
 				.nl_u = {
 					.ip4_u = {
diff --git a/net/core/dev.c b/net/core/dev.c
index 100dcbd..c331b0e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2047,6 +2047,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		 * waiting to be sent out; and the qdisc is not running -
 		 * xmit the skb directly.
 		 */
+		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+			skb_dst_force(skb);
 		__qdisc_update_bstats(q, skb->len);
 		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
 			__qdisc_run(q);
@@ -2055,6 +2057,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 		rc = NET_XMIT_SUCCESS;
 	} else {
+		skb_dst_force(skb);
 		rc = qdisc_enqueue_root(skb, q);
 		qdisc_run(q);
 	}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4218ff4..f400196 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -531,7 +531,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->transport_header	= old->transport_header;
 	new->network_header	= old->network_header;
 	new->mac_header		= old->mac_header;
-	skb_dst_set(new, dst_clone(skb_dst(old)));
+
+	skb_dst_copy(new, old);
 	new->rxhash		= old->rxhash;
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
diff --git a/net/core/sock.c b/net/core/sock.c
index 5104175..894bed6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_len = skb->len;
 
+	/* we escape from rcu protected region, make sure we dont leak
+	 * a norefcounted dst
+	 */
+	skb_dst_force(skb);
+
 	spin_lock_irqsave(&list->lock, flags);
 	skb->dropcount = atomic_read(&sk->sk_drops);
 	__skb_queue_tail(list, skb);
@@ -1535,6 +1540,7 @@ static void __release_sock(struct sock *sk)
 		do {
 			struct sk_buff *next = skb->next;
 
+			WARN_ON_ONCE(skb_dst_is_noref(skb));
 			skb->next = NULL;
 			sk_backlog_rcv(sk, skb);
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 6e74706..502ac9f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -854,7 +854,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input(skb, tip, sip, 0, dev, true) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f3d339f..a113c08 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -587,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 			err = __ip_route_output_key(net, &rt2, &fl);
 		else {
 			struct flowi fl2 = {};
-			struct dst_entry *odst;
+			unsigned long odst;
 
 			fl2.fl4_dst = fl.fl4_src;
 			if (ip_route_output_key(net, &rt2, &fl2))
 				goto relookup_failed;
 
 			/* Ugh! */
-			odst = skb_dst(skb_in);
+			odst = skb_in->_skb_dst; /* save old dst */
 			err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
-					     RT_TOS(tos), rt2->u.dst.dev);
+					     RT_TOS(tos), rt2->u.dst.dev, false);
 
 			dst_release(&rt2->u.dst);
 			rt2 = skb_rtable(skb_in);
-			skb_dst_set(skb_in, odst);
+			skb_in->_skb_dst = odst; /* restore old dst */
 		}
 
 		if (err)
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index af10942..0f58609 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -57,6 +57,7 @@ int ip_forward(struct sk_buff *skb)
 	struct rtable *rt;	/* Route we use */
 	struct ip_options * opt	= &(IPCB(skb)->opt);
 
+/*	pr_err("ip_forward() skb->dst=%lx\n", skb->_skb_dst);*/
 	if (skb_warn_if_lro(skb))
 		goto drop;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 75347ea..cbcde7a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -220,7 +220,7 @@ static void ip_expire(unsigned long arg)
 		if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) {
 			const struct iphdr *iph = ip_hdr(head);
 			int err = ip_route_input(head, iph->daddr, iph->saddr,
-						 iph->tos, head->dev);
+						 iph->tos, head->dev, false);
 			if (unlikely(err))
 				goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f8ab7a3..5d365e8 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -332,7 +332,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 */
 	if (skb_dst(skb) == NULL) {
 		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
-					 skb->dev);
+					 skb->dev, true);
 		if (unlikely(err)) {
 			if (err == -EHOSTUNREACH)
 				IP_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4c09a31..1b65d68 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -601,6 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	unsigned char *optptr = skb_network_header(skb) + opt->srr;
 	struct rtable *rt = skb_rtable(skb);
 	struct rtable *rt2;
+	unsigned long odst;
 	int err;
 
 	if (!opt->srr)
@@ -624,16 +625,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 		}
 		memcpy(&nexthop, &optptr[srrptr-1], 4);
 
-		rt = skb_rtable(skb);
+		odst = skb->_skb_dst;
 		skb_dst_set(skb, NULL);
-		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev, false);
 		rt2 = skb_rtable(skb);
 		if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
-			ip_rt_put(rt2);
-			skb_dst_set(skb, &rt->u.dst);
+			skb_dst_drop(skb);
+			skb->_skb_dst = odst;
 			return -EINVAL;
 		}
-		ip_rt_put(rt);
+		__skb_dst_drop(odst);
 		if (rt2->rt_type != RTN_LOCAL)
 			break;
 		/* Superfast 8) loopback forward */
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 82fb43c..e505007 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 	struct flowi fl = {};
-	struct dst_entry *odst;
+	unsigned long odst;
 	unsigned int hh_len;
 	unsigned int type;
 
@@ -51,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 		if (ip_route_output_key(net, &rt, &fl) != 0)
 			return -1;
 
-		odst = skb_dst(skb);
+		odst = skb->_skb_dst;
 		if (ip_route_input(skb, iph->daddr, iph->saddr,
-				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+				   RT_TOS(iph->tos), rt->u.dst.dev, false) != 0) {
 			dst_release(&rt->u.dst);
 			return -1;
 		}
 		dst_release(&rt->u.dst);
-		dst_release(odst);
+		__skb_dst_drop(odst);
 	}
 
 	if (skb_dst(skb)->error)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a947428..4f169ce 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2300,7 +2300,7 @@ martian_source:
 }
 
 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-		   u8 tos, struct net_device *dev)
+		   u8 tos, struct net_device *dev, bool noref)
 {
 	struct rtable * rth;
 	unsigned	hash;
@@ -2326,10 +2326,15 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		    rth->fl.mark == skb->mark &&
 		    net_eq(dev_net(rth->u.dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			dst_use(&rth->u.dst, jiffies);
+			if (noref) {
+				dst_use_noref(&rth->u.dst, jiffies);
+				skb_dst_set_noref(skb, &rth->u.dst);
+			} else {
+				dst_use(&rth->u.dst, jiffies);
+				skb_dst_set(skb, &rth->u.dst);
+			}
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb_dst_set(skb, &rth->u.dst);
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
@@ -2991,7 +2996,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 		skb->protocol	= htons(ETH_P_IP);
 		skb->dev	= dev;
 		local_bh_disable();
-		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev, false);
 		local_bh_enable();
 
 		rt = skb_rtable(skb);
@@ -3055,7 +3060,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 				continue;
 			if (rt_is_expired(rt))
 				continue;
-			skb_dst_set(skb, dst_clone(&rt->u.dst));
+			skb_dst_set_noref(skb, dst_clone(&rt->u.dst));
 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index c791bb6..0366cbc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
-				   skb->dev))
+				   skb->dev, true))
 			goto drop;
 	}
 	return dst_input(skb);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 2599870..7ae0fa5 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -570,7 +570,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	} else {
 		ip_rt_put(rt);
 		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
-				   skb2->dev) ||
+				   skb2->dev, false) ||
 		    skb_dst(skb2)->dev->type != ARPHRD_TUNNEL)
 			goto out;
 	}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index c49ef21..cb3cde4 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -9,6 +9,7 @@
 #include <linux/rcupdate.h>
 #include <net/protocol.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/dst.h>
 
 #include "nf_internals.h"
 
@@ -170,6 +171,7 @@ static int __nf_queue(struct sk_buff *skb,
 			dev_hold(physoutdev);
 	}
 #endif
+	skb_dst_force(skb);
 	afinfo->saveroute(skb, entry);
 	status = qh->outfn(entry, queuenum);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index aeddabf..21e3976 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -179,7 +179,7 @@ static inline int qdisc_restart(struct Qdisc *q)
 	skb = dequeue_skb(q);
 	if (unlikely(!skb))
 		return 0;
-
+	WARN_ON_ONCE(skb_dst_is_noref(skb));
 	root_lock = qdisc_lock(q);
 	dev = qdisc_dev(q);
 	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));



^ permalink raw reply related	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-29 20:02                   ` Eric Dumazet
@ 2010-04-30 18:15                     ` Brian Bloniarz
  0 siblings, 0 replies; 108+ messages in thread
From: Brian Bloniarz @ 2010-04-30 18:15 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Thomas Gleixner, Stephen Hemminger, netdev, Andi Kleen, Peter Zijlstra

Eric Dumazet wrote:
> Le jeudi 29 avril 2010 à 21:19 +0200, Thomas Gleixner a écrit :
> 
>> Say thanks to Intel/AMD for providing us timers which stop in lower
>> c-states.
>>
>> Not much we can do about the broadcast lock when several cores are
>> going idle and we need to setup a global timer to work around the
>> lapic timer stops in C2/C3 issue.
>>
>> Simply the C-state timer broadcasting does not scale. And it was never
>> meant to scale. It's a workaround for laptops to have functional NOHZ.
>>
>> There are several ways to work around that on larger machines:
>>
>>  - Restrict c-states
>>  - Disable NOHZ and highres timers
>>  - idle=poll is definitely the worst of all possible solutions
>>
>>> I keep getting asked about taking some core's away from clock and scheduler
>>> to be reserved just for network processing. Seeing this kind of stuff
>>> makes me wonder if maybe that isn't a half bad idea.
>> This comes up every few month and we pointed out several times what
>> needs to be done to make this work w/o these weird hacks which put a
>> core offline and then start some magic undebugable binary blob on it.
>> We have not seen anyone working on this, but the "set cores aside and
>> let them do X" idea seems to stick in peoples heads.
>>
>> Seriously, that's not a solution. It's going to be some hacked up
>> nightmare which is completely unmaintainable.
>>
>> Aside of that I seriously doubt that you can do networking w/o time
>> and timers.
>>
> 
> Thanks a lot !
> 
> booting with processor.max_cstate=1 solves the problem
> 
> (I already had a CONFIG_NO_HZ=no conf, but highres timer enabled)
> 
> Even with _carefuly_ chosen crazy configuration (receiving a packet on a
> cpu, then transfert it to another cpu, with a full 16x16 matrix
> involved), generating 700.000 IPI per second on the machine seems fine
> now.

FYI you can also restrict c=states at runtime with PM QoS:
Documentation/power/pm_qos_interface.txt

On my machine, /sys/devices/system/cpu/cpu0/cpuidle/state2/latency
is 205usec, so configuring a PM QoS request for <= 205usec latency
should prevent it being entered:

#!/usr/bin/python
import os;
import struct;
import signal;

latency_rec_usec = 100
f = os.open("/dev/cpu_dma_latency", os.O_WRONLY);
os.write(f, struct.pack("=i", latency_rec_usec));
signal.pause();

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-29 19:19                 ` Thomas Gleixner
  2010-04-29 20:02                   ` Eric Dumazet
@ 2010-04-30 18:57                   ` David Miller
  2010-04-30 19:58                     ` Thomas Gleixner
                                       ` (2 more replies)
  1 sibling, 3 replies; 108+ messages in thread
From: David Miller @ 2010-04-30 18:57 UTC (permalink / raw)
  To: tglx; +Cc: shemminger, eric.dumazet, ak, netdev, andi, peterz

From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Apr 2010 21:19:36 +0200 (CEST)

> Aside of that I seriously doubt that you can do networking w/o time
> and timers.

You're right that we need timestamps and the like.

But only if we actually process the packets on these restricted cpus :-)

If we use RPS and farm out all packets to other cpus, ie. just doing
the driver work and the remote cpu dispatch on these "offline" cpus,
it is doable.

Then we can do cool tricks like having the cpu spin on a mwait() on the
network device's status descriptor in memory.

In any event I agree with you, it's a cool idea at best, and likely
not really practical.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-29 20:36                                   ` jamal
  2010-04-29 21:01                                     ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet
@ 2010-04-30 19:30                                     ` jamal
  2010-04-30 20:40                                       ` Eric Dumazet
  1 sibling, 1 reply; 108+ messages in thread
From: jamal @ 2010-04-30 19:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

[-- Attachment #1: Type: text/plain, Size: 1322 bytes --]

Eric!

I managed to mod your program to look conceptually similar to mine
and i reproduced the results with same test kernel from yesterday. 
So it is likely the issue is in using epoll vs not using any async as
in your case.
Results attached as well as modified program.

Note: the key things to remember:
rps with this program gets worse over time and different net-next
kernels since Apr14 (look at graph i supplied). Sorry, I am really
busy-ed out to dig any further.

cheers,
jamal



On Thu, 2010-04-29 at 16:36 -0400, jamal wrote:
> On Thu, 2010-04-29 at 09:56 -0400, jamal wrote:
> 
> > 
> > I will try your program instead so we can reduce the variables
> 
> Results attached.
> With your app rps does a hell lot better and non-rps worse ;->
> With my proggie, non-rps does much better than yours and rps does
> a lot worse for same setup. I see the scheduler kicking quite a bit in
> non-rps for you...
> 
> The main difference between us as i see it is:
> a) i use epoll - actually linked to libevent (1.0.something)
> b) I fork processes and you use pthreads.
> 
> I dont have time to chase it today, but 1) I am either going to change
> yours to use libevent or make mine get rid of it then 2) move towards
> pthreads or have yours fork..
> then observe if that makes any difference..
> 
> 
> cheers,
> jamal

[-- Attachment #2: apr30-ericmod --]
[-- Type: text/plain, Size: 8919 bytes --]


First a few runs with Eric's code + epoll/libevent

-------------------------------------------------------------------------------
   PerfTop:    4009 irqs/sec  kernel:83.4% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

             2097.00  8.6% sky2_poll                   [sky2]              
             1742.00  7.2% _raw_spin_lock_irqsave      [kernel]            
              831.00  3.4% system_call                 [kernel]            
              654.00  2.7% copy_user_generic_string    [kernel]            
              654.00  2.7% datagram_poll               [kernel]            
              647.00  2.7% fget                        [kernel]            
              623.00  2.6% _raw_spin_unlock_irqrestore [kernel]            
              547.00  2.3% _raw_spin_lock_bh           [kernel]            
              506.00  2.1% sys_epoll_ctl               [kernel]            
              475.00  2.0% kmem_cache_free             [kernel]            
              466.00  1.9% schedule                    [kernel]            
              436.00  1.8% vread_tsc                   [kernel].vsyscall_fn
              417.00  1.7% fput                        [kernel]            
              415.00  1.7% sys_epoll_wait              [kernel]            
              402.00  1.7% _raw_spin_lock              [kernel]            


-------------------------------------------------------------------------------
   PerfTop:     616 irqs/sec  kernel:98.7% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ ________

             2534.00 28.6% sky2_poll              [sky2]  
              503.00  5.7% ip_route_input         [kernel]
              438.00  4.9% _raw_spin_lock_irqsave [kernel]
              418.00  4.7% __udp4_lib_lookup      [kernel]
              378.00  4.3% __alloc_skb            [kernel]
              364.00  4.1% ip_rcv                 [kernel]
              323.00  3.6% _raw_spin_lock         [kernel]
              315.00  3.5% sock_queue_rcv_skb     [kernel]
              284.00  3.2% __netif_receive_skb    [kernel]
              281.00  3.2% __udp4_lib_rcv         [kernel]
              266.00  3.0% __wake_up_common       [kernel]
              238.00  2.7% sock_def_readable      [kernel]
              181.00  2.0% __kmalloc              [kernel]
              163.00  1.8% kmem_cache_alloc       [kernel]
              150.00  1.7% ep_poll_callback       [kernel]


-------------------------------------------------------------------------------
   PerfTop:     854 irqs/sec  kernel:80.2% [1000Hz cycles],  (all, cpu: 2)
-------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ____________________

              341.00  8.0% _raw_spin_lock_irqsave      [kernel]            
              235.00  5.5% system_call                 [kernel]            
              174.00  4.1% datagram_poll               [kernel]            
              174.00  4.1% fget                        [kernel]            
              173.00  4.1% copy_user_generic_string    [kernel]            
              135.00  3.2% _raw_spin_unlock_irqrestore [kernel]            
              125.00  2.9% _raw_spin_lock_bh           [kernel]            
              122.00  2.9% schedule                    [kernel]            
              113.00  2.6% sys_epoll_ctl               [kernel]            
              113.00  2.6% kmem_cache_free             [kernel]            
              108.00  2.5% vread_tsc                   [kernel].vsyscall_fn
              105.00  2.5% sys_epoll_wait              [kernel]            
              102.00  2.4% udp_recvmsg                 [kernel]            
               95.00  2.2% mutex_lock                  [kernel]            

Average 97.55% of 10M packets at 750Kpps

Turn on rps mask ee and irq affinity to cpu0

-------------------------------------------------------------------------------
   PerfTop:    3885 irqs/sec  kernel:83.6% [1000Hz cycles],  (all, 8 CPUs)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ________

             2945.00 16.7% sky2_poll                      [sky2]  
              653.00  3.7% _raw_spin_lock_irqsave         [kernel]
              460.00  2.6% system_call                    [kernel]
              420.00  2.4% _raw_spin_unlock_irqrestore    [kernel]
              414.00  2.3% sky2_intr                      [sky2]  
              392.00  2.2% fget                           [kernel]
              360.00  2.0% ip_rcv                         [kernel]
              324.00  1.8% sys_epoll_ctl                  [kernel]
              323.00  1.8% __netif_receive_skb            [kernel]
              310.00  1.8% schedule                       [kernel]
              292.00  1.7% ip_route_input                 [kernel]
              292.00  1.7% _raw_spin_lock                 [kernel]
              291.00  1.7% copy_user_generic_string       [kernel]
              284.00  1.6% kmem_cache_free                [kernel]
              262.00  1.5% call_function_single_interrupt [kernel]

-------------------------------------------------------------------------------
   PerfTop:    1000 irqs/sec  kernel:98.1% [1000Hz cycles],  (all, cpu: 0)
-------------------------------------------------------------------------------

             samples  pcnt function                            DSO
             _______ _____ ___________________________________ ________

             4170.00 61.9% sky2_poll                           [sky2]  
              723.00 10.7% sky2_intr                           [sky2]  
              159.00  2.4% __alloc_skb                         [kernel]
              140.00  2.1% get_rps_cpu                         [kernel]
              106.00  1.6% __kmalloc                           [kernel]
               95.00  1.4% enqueue_to_backlog                  [kernel]
               86.00  1.3% kmem_cache_alloc                    [kernel]
               85.00  1.3% irq_entries_start                   [kernel]
               85.00  1.3% _raw_spin_lock_irqsave              [kernel]
               82.00  1.2% _raw_spin_lock                      [kernel]
               66.00  1.0% swiotlb_sync_single                 [kernel]
               58.00  0.9% sky2_remove                         [sky2]  
               49.00  0.7% default_send_IPI_mask_sequence_phys [kernel]
               47.00  0.7% sky2_rx_submit                      [sky2]  
               36.00  0.5% _raw_spin_unlock_irqrestore         [kernel]

-------------------------------------------------------------------------------
   PerfTop:     344 irqs/sec  kernel:84.3% [1000Hz cycles],  (all, cpu: 2)
-------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ ____________________

              114.00  5.2% _raw_spin_lock_irqsave         [kernel]            
               79.00  3.6% fget                           [kernel]            
               78.00  3.6% ip_rcv                         [kernel]            
               78.00  3.6% system_call                    [kernel]            
               75.00  3.4% _raw_spin_unlock_irqrestore    [kernel]            
               67.00  3.1% sys_epoll_ctl                  [kernel]            
               65.00  3.0% schedule                       [kernel]            
               61.00  2.8% ip_route_input                 [kernel]            
               48.00  2.2% vread_tsc                      [kernel].vsyscall_fn
               48.00  2.2% call_function_single_interrupt [kernel]            
               46.00  2.1% kmem_cache_free                [kernel]            
               45.00  2.1% __netif_receive_skb            [kernel]            
               41.00  1.9% process_recv                   snkudp              
               40.00  1.8% kfree                          [kernel]            
               39.00  1.8% _raw_spin_lock                 [kernel]            

92.97% of 10M packets at 750Kpps


Ok, so this is exactly what i saw with my app. non-rps is better.
To summarize: It used to be the opposite on net-next before around
Apr14. rps has gotten worse.

[-- Attachment #3: udpsnkfrk.c --]
[-- Type: text/x-csrc, Size: 3650 bytes --]

/*
 *  Usage: udpsink [ -p baseport] nbports
*/
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <event.h>

/* Per-worker state: one instance per thread/port. */
struct worker_data {
	struct event *snk_ev;	/* heap-allocated read/timeout event (one-shot) */
	struct event_base *base;	/* private libevent base for this worker */
	struct timeval t;	/* re-arm timeout: 1s + randomized usec */
	unsigned long pack_count;	/* datagrams received */
	unsigned long bytes_count;	/* payload bytes received */
	unsigned long tout;	/* timeout callbacks observed */
	int fd;			/* move to avoid hole on 64-bit */
	int pad1;		/*64B - let Eric figure the math;-> */
	//unsigned long _padd[16 - 3]; /* alignment */ 
};

/* Print the command-line synopsis to stderr and terminate with `code`. */
void usage(int code)
{
	fputs("Usage: udpsink [-p baseport] nbports\n", stderr);
	exit(code);
}

/*
 * libevent callback (non-persistent event): re-arm the event first,
 * then either count a timeout or read one datagram from the socket.
 * Note: re-adding the event on every invocation costs one epoll_ctl()
 * per packet.
 */
void process_recv(int fd, short ev, void *arg)
{
	struct worker_data *wdata = (struct worker_data *)arg;
	struct sockaddr_in peer;
	socklen_t peerlen = sizeof(peer);
	char buffer[4096];
	int nread;

	/* One-shot event: it must be re-armed before anything else. */
	if (event_add(wdata->snk_ev, &wdata->t) < 0) {
		perror("cb event_add");
		return;
	}

	if (ev == EV_TIMEOUT) {
		wdata->tout++;
		return;
	}

	nread = recvfrom(wdata->fd, buffer, sizeof(buffer), 0,
			 (struct sockaddr *)&peer, &peerlen);
	if (nread > 0) {
		wdata->pack_count++;
		wdata->bytes_count += nread;
	}
}

/*
 * Set up one worker's libevent state: a private event base plus a
 * one-shot EV_READ event on wdata->fd with a randomized ~1s timeout.
 * Returns 0 on success, -1 if the event could not be armed.
 */
int prep_thread(struct worker_data *wdata)
{
	int rc;

	/* Randomize the sub-second part so worker timeouts do not align. */
	wdata->t.tv_sec = 1;
	wdata->t.tv_usec = random() % 50000L;

	wdata->base = event_init();
	event_set(wdata->snk_ev, wdata->fd, EV_READ, process_recv, wdata);
	event_base_set(wdata->base, wdata->snk_ev);

	rc = event_add(wdata->snk_ev, &wdata->t);
	if (rc < 0) {
		perror("event_add");
		return -1;
	}
	return 0;
}

/*
 * Thread entry point: run this worker's event loop until it exits.
 * The int result of event_base_loop() is returned through the pthread
 * return value; cast via long so the widening to pointer size is
 * well-defined — a direct int -> void* cast of different width is a
 * diagnosable (implementation-defined) conversion.
 */
void *worker_func(void *arg)
{
	struct worker_data *wdata = (struct worker_data *)arg;

	return (void *)(long)event_base_loop(wdata->base, 0);
}

/*
 * udpsink: one worker (thread + private libevent base) per UDP port,
 * bound to consecutive ports starting at baseport (-p, default 4000).
 * With -c, all workers share worker 0's socket.  The main thread prints
 * the aggregate packets-per-second once a second, forever.
 */
int main(int argc, char *argv[])
{
	int c;
	int baseport = 4000;
	int nbthreads;
	struct worker_data *wdata;
	unsigned long ototal = 0;
	int concurrent = 0;
	int verbose = 0;
	int i;

	while ((c = getopt(argc, argv, "cvp:")) != -1) {
		if (c == 'p')
			baseport = atoi(optarg);
		else if (c == 'c')
			concurrent = 1;
		else if (c == 'v')
			verbose++;
		else
			usage(1);
	}
	if (optind == argc)
		usage(1);
	nbthreads = atoi(argv[optind]);
	if (nbthreads <= 0)	/* atoi() yields 0 on non-numeric input */
		usage(1);
	/* calloc(nmemb, size): element count first, then element size. */
	wdata = calloc(nbthreads, sizeof(struct worker_data));
	if (!wdata) {
		perror("calloc");
		return 1;
	}

	for (i = 0; i < nbthreads; i++) {
		struct sockaddr_in addr;
		pthread_t tid;

		/*
		 * Every worker needs its own struct event, even when the
		 * socket is shared (-c): prep_thread()'s event_set() writes
		 * through it, so leaving it NULL for i > 0 would crash.
		 */
		wdata[i].snk_ev = malloc(sizeof(struct event));
		if (!wdata[i].snk_ev)
			return 1;
		memset(wdata[i].snk_ev, 0, sizeof(struct event));

		if (i && concurrent) {
			wdata[i].fd = wdata[0].fd;	/* share worker 0's socket */
		} else {
			wdata[i].fd = socket(PF_INET, SOCK_DGRAM, 0);
			if (wdata[i].fd == -1) {
				free(wdata[i].snk_ev);
				perror("socket");
				return 1;
			}
			memset(&addr, 0, sizeof(addr));
			addr.sin_family = AF_INET;
//                      addr.sin_addr.s_addr = inet_addr(argv[optind]);
			addr.sin_port = htons(baseport + i);
			if (bind
			    (wdata[i].fd, (struct sockaddr *)&addr,
			     sizeof(addr)) < 0) {
				free(wdata[i].snk_ev);
				perror("bind");
				return 1;
			}
//                      fcntl(wdata[i].fd, F_SETFL, O_NDELAY);
		}
		if (prep_thread(wdata + i)) {
			printf("failed to allocate thread %d, exit\n", i);
			exit(0);
		}
		if (pthread_create(&tid, NULL, worker_func, wdata + i)) {
			perror("pthread_create");
			return 1;
		}
	}

	/* Reporting loop: print the one-second delta of total packets. */
	for (;;) {
		unsigned long total;
		long delta;

		sleep(1);
		total = 0;
		for (i = 0; i < nbthreads; i++)
			total += wdata[i].pack_count;
		delta = total - ototal;
		if (delta) {
			printf("%lu pps (%lu", delta, total);
			if (verbose) {
				for (i = 0; i < nbthreads; i++) {
					if (wdata[i].pack_count)
						printf(" %d:%lu", i,
						       wdata[i].pack_count);
				}
			}
			printf(")\n");
		}
		ototal = total;
	}
}

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-30 18:57                   ` David Miller
@ 2010-04-30 19:58                     ` Thomas Gleixner
  2010-04-30 21:01                     ` Andi Kleen
  2010-05-01 20:31                     ` Martin Josefsson
  2 siblings, 0 replies; 108+ messages in thread
From: Thomas Gleixner @ 2010-04-30 19:58 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, eric.dumazet, ak, netdev, andi, peterz

Dave,

On Fri, 30 Apr 2010, David Miller wrote:

> From: Thomas Gleixner <tglx@linutronix.de>
> Date: Thu, 29 Apr 2010 21:19:36 +0200 (CEST)
> 
> > Aside of that I seriously doubt that you can do networking w/o time
> > and timers.
> 
> You're right that we need timestamps and the like.
> 
> But only if we actually process the packets on these restricted cpus :-)
> 
> If we use RPS and farm out all packets to other cpus, ie. just doing
> the driver work and the remote cpu dispatch on these "offline" cpus,
> it is doable.
> 
> Then we can do cool tricks like having the cpu spin on a mwait() on the
> network device's status descriptor in memory.
> 
> In any event I agree with you, it's a cool idea at best, and likely
> not really practical.

Well, it might be worth to experiment with that once we get the basic
infrastructure in place to "isolate" cores under full kernel control. 

It's not too hard to solve the problems, but it seems nobody has a
free time slot to tackle them.

Thanks

	tglx

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-30 19:30                                     ` [PATCH net-next-2.6] net: speedup udp receive path jamal
@ 2010-04-30 20:40                                       ` Eric Dumazet
  2010-05-01  0:06                                         ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-04-30 20:40 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le vendredi 30 avril 2010 à 15:30 -0400, jamal a écrit :
> Eric!
> 
> I managed to mod your program to look conceptually similar to mine
> and i reproduced the results with same test kernel from yesterday. 
> So it is likely the issue is in using epoll vs not using any async as
> in your case.
> Results attached as well as modified program.
> 
> Note: the key things to remember:
> rps with this program gets worse over time and different net-next
> kernels since Apr14 (look at graph i supplied). Sorry, I am really
> busy-ed out to dig any further.
> 
> cheers,
> jamal
> 

I am lost.

I used your program, and with RPS off, I can get at most 220.000 pps
with my "old" hardware. I dont understand how you can reach 700.000 pps
with RPS off. Or is it with your Nehalem ?




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-30 18:57                   ` David Miller
  2010-04-30 19:58                     ` Thomas Gleixner
@ 2010-04-30 21:01                     ` Andi Kleen
  2010-04-30 22:30                       ` David Miller
  2010-05-01 20:31                     ` Martin Josefsson
  2 siblings, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-04-30 21:01 UTC (permalink / raw)
  To: David Miller; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz

> Then we can do cool tricks like having the cpu spin on a mwait() on the
> network device's status descriptor in memory.

When you specify a deep C state in that mwait then it will also have the long 
wakeup latency in the idle case.  When you don't then you just killed higher
Turbo mode on that socket and give away a lot of performance on the other
cores.

So you have to solve the idle state governor issue anyways, and then
you likely don't need it anymore.

Besides it seems to me that dispatching is something the NIC should
just do directly. "RPS only CPU" would be essentially just an 
interrupt mitigation/flow redirection scheme that a lot of NICs
do anyways.

> In any event I agree with you, it's a cool idea at best, and likely
> not really practical.

s/cool//

-Andi

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-30 21:01                     ` Andi Kleen
@ 2010-04-30 22:30                       ` David Miller
  2010-05-01 10:53                         ` Andi Kleen
  0 siblings, 1 reply; 108+ messages in thread
From: David Miller @ 2010-04-30 22:30 UTC (permalink / raw)
  To: andi; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz

From: Andi Kleen <andi@firstfloor.org>
Date: Fri, 30 Apr 2010 23:01:31 +0200

> Besides it seems to me that dispatching is something the NIC should
> just do directly. "RPS only CPU" would be essentially just an 
> interrupt mitigation/flow redirection scheme that a lot of NICs
> do anyways.

We've already established that the NIC can't do a complete job in all
important cases, that's why we've integrated the RPS/RFS patches in
the first place.

And we don't want it to, because the decision mechanisms for steering
that we using now are starting to get into the stateful territory and
that's verboten for NIC offload as far as we're concerned.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
  2010-04-29 21:01                                     ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet
  2010-04-30 13:55                                       ` Brian Bloniarz
@ 2010-04-30 23:35                                       ` David Miller
  2010-05-01  4:56                                         ` Eric Dumazet
  2010-05-01  7:02                                         ` Eric Dumazet
  1 sibling, 2 replies; 108+ messages in thread
From: David Miller @ 2010-04-30 23:35 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 29 Apr 2010 23:01:49 +0200

> [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion

So what's the difference between call_rcu() freeing this little waitqueue
struct and doing it for the entire socket?

We'll still be doing an RCU call every socket destroy, and now we also have
a new memory allocation/free per connection.

This has to show up in things like 'lat_connect' and friends, does it not?

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
       [not found]                   ` <20100429214144.GA10663@gargoyle.fritz.box>
  2010-04-30  5:25                     ` Eric Dumazet
@ 2010-04-30 23:38                     ` David Miller
  2010-05-01 11:00                       ` Andi Kleen
  1 sibling, 1 reply; 108+ messages in thread
From: David Miller @ 2010-04-30 23:38 UTC (permalink / raw)
  To: ak
  Cc: eric.dumazet, andi, hadi, xiaosuo, therbert, shemminger, netdev,
	lenb, arjan

From: Andi Kleen <ak@gargoyle.fritz.box>
Date: Thu, 29 Apr 2010 23:41:44 +0200

>     Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies
> 
>     XXX: probably too aggressive, some of these sleeps are not under high load.
> 
>     Based on a bug report from Eric Dumazet.
>     
>     Signed-off-by: Andi Kleen <ak@linux.intel.com>

I like this, except that we probably don't want the delayacct_blkio_*() calls
these things do.

Probably the rest of what these things do should remain in the io_schedule*()
functions and the block layer can call it's own versions which add in the
delayacct_blkio_*() bits.

Or, if the delacct stuff is useful for socket I/O too, then it's interfaces
names should have the "blk" stripped from them :-)

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-04-30 20:40                                       ` Eric Dumazet
@ 2010-05-01  0:06                                         ` jamal
  2010-05-01  5:57                                           ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: jamal @ 2010-05-01  0:06 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Fri, 2010-04-30 at 22:40 +0200, Eric Dumazet wrote:

> 
> I used your program, and with RPS off, I can get at most 220.000 pps
> with my "old" hardware. I dont understand how you can reach 700.000 pps
> with RPS off. Or is it with your Nehalem ?

Yes, Nehalem. 
RPS off is better (~700Kpp) than RPS on(~650kpps). Are you seeing the
same trend on the old hardware?

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
  2010-04-30 23:35                                       ` David Miller
@ 2010-05-01  4:56                                         ` Eric Dumazet
  2010-05-01  7:02                                         ` Eric Dumazet
  1 sibling, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-01  4:56 UTC (permalink / raw)
  To: David Miller; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

Le vendredi 30 avril 2010 à 16:35 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 29 Apr 2010 23:01:49 +0200
> 
> > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
> 
> So what's the difference between call_rcu() freeing this little waitqueue
> struct and doing it for the entire socket?
> 
> We'll still be doing an RCU call every socket destroy, and now we also have
> a new memory allocation/free per connection.
> 
> This has to show up in things like 'lat_connect' and friends, does it not?

Difference is this structure is small, one cache line at most.

So the cost of call_rcu() on this structure, with the well known cache
miss is very much reduced.

The thing that might cost is the smp_mb(), because it translate to a
"mfence" instruction, and it appears to cost more than a a regular
"lock ..."

Unfortunately, oprofile doesn't work anymore on my bl460c machine after
last BIOS upgrade... Oh well...




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01  0:06                                         ` jamal
@ 2010-05-01  5:57                                           ` Eric Dumazet
  2010-05-01  6:14                                             ` Eric Dumazet
  2010-05-01 11:23                                             ` jamal
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-01  5:57 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le vendredi 30 avril 2010 à 20:06 -0400, jamal a écrit :

> Yes, Nehalem. 
> RPS off is better (~700Kpp) than RPS on(~650kpps). Are you seeing the
> same trend on the old hardware?
> 

Of course not ! Or else RPS would be useless :(

I changed your program a bit to use EV_PERSIST, (to avoid epoll_ctl()
overhead for each packet...)

RPS off : 220.000 pps 

RPS on (ee mask) : 700.000 pps  (with a slightly modified tg3 driver)
96% of delivered packets

This is on tg3 adapter, and tg3 has copybreak feature : small packets
are copied into skb of the right size.

define TG3_RX_COPY_THRESHOLD       256 -> 40 ...

We really should disable this feature for RPS workload,
unfortunately ethtool cannot tweak this.

So profile of cpu 0 (RPS ON) looks like :

------------------------------------------------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:99.7% [1000Hz cycles],  (all, cpu: 0)
------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ _______

              819.00 12.6% __alloc_skb            vmlinux
              592.00  9.1% eth_type_trans         vmlinux
              509.00  7.8% _raw_spin_lock         vmlinux
              475.00  7.3% __kmalloc_track_caller vmlinux
              358.00  5.5% tg3_read32             vmlinux
              345.00  5.3% __netdev_alloc_skb     vmlinux
              329.00  5.0% kmem_cache_alloc       vmlinux
              307.00  4.7% _raw_spin_lock_irqsave vmlinux
              284.00  4.4% bnx2_interrupt         vmlinux
              277.00  4.2% skb_pull               vmlinux
              248.00  3.8% tg3_poll_work          vmlinux
              202.00  3.1% __slab_alloc           vmlinux
              197.00  3.0% get_rps_cpu            vmlinux
              106.00  1.6% enqueue_to_backlog     vmlinux
               87.00  1.3% _raw_spin_lock_bh      vmlinux
               80.00  1.2% __copy_to_user_ll      vmlinux
               77.00  1.2% nommu_map_page         vmlinux
               77.00  1.2% __napi_gro_receive     vmlinux
               65.00  1.0% tg3_alloc_rx_skb       vmlinux
               60.00  0.9% skb_gro_reset_offset   vmlinux
               57.00  0.9% skb_put                vmlinux
               57.00  0.9% __slab_free            vmlinux


/*
 *  Usage: udpsnkfrk [ -p baseport] nbports
*/
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <event.h>

/* Per-worker state, padded so workers land on separate cache lines. */
struct worker_data {
	struct event *snk_ev;	/* heap-allocated persistent read event */
	struct event_base *base;	/* private libevent base for this worker */
	struct timeval t;	/* timeout used when (re)arming the event */
	unsigned long pack_count;	/* datagrams received */
	unsigned long bytes_count;	/* payload bytes received */
	unsigned long tout;	/* timeout callbacks observed */
	int fd;			/* move to avoid hole on 64-bit */
	int pad1;	
	unsigned long _padd[99]; /* avoid false sharing */
};

/* Emit the usage synopsis on stderr, then exit with the given status. */
void usage(int code)
{
	fputs("Usage: udpsink [-p baseport] nbports\n", stderr);
	exit(code);
}

/*
 * Persistent-event callback: on timeout, bump the counter and re-arm;
 * on readability, drain the non-blocking socket until recvfrom() stops
 * returning data.  EV_PERSIST keeps the event armed, so the read path
 * needs no per-packet event_add().
 */
void process_recv(int fd, short ev, void *arg)
{
	struct worker_data *wdata = (struct worker_data *)arg;
	struct sockaddr_in peer;
	socklen_t peerlen = sizeof(peer);
	char buffer[4096];
	int nread;

	if (ev == EV_TIMEOUT) {
		wdata->tout++;
		if (event_add(wdata->snk_ev, &wdata->t) < 0)
			perror("cb event_add");
		return;
	}

	for (;;) {
		nread = recvfrom(wdata->fd, buffer, sizeof(buffer), 0,
				 (struct sockaddr *)&peer, &peerlen);
		if (nread <= 0)
			break;
		wdata->pack_count++;
		wdata->bytes_count += nread;
	}
}

/*
 * Initialize one worker's libevent state: a private base plus a
 * persistent EV_READ event on wdata->fd with a randomized ~1s timeout.
 * Returns 0 on success, -1 if arming the event fails.
 */
int prep_thread(struct worker_data *wdata)
{
	int rc;

	/* Randomize the sub-second part so worker timeouts do not align. */
	wdata->t.tv_sec = 1;
	wdata->t.tv_usec = random() % 50000L;

	wdata->base = event_init();
	event_set(wdata->snk_ev, wdata->fd, EV_READ|EV_PERSIST, process_recv, wdata);
	event_base_set(wdata->base, wdata->snk_ev);

	rc = event_add(wdata->snk_ev, &wdata->t);
	if (rc < 0) {
		perror("event_add");
		return -1;
	}
	return 0;
}

/*
 * Thread entry point: run this worker's event loop to completion and
 * hand event_base_loop()'s int result back through the pthread return
 * value.  Cast via long so the int -> pointer widening is well-defined
 * rather than an implementation-defined narrow-to-wide cast.
 */
void *worker_func(void *arg)
{
	struct worker_data *wdata = (struct worker_data *)arg;

	return (void *)(long)event_base_loop(wdata->base, 0);
}

/*
 * udpsink: one worker (thread + private libevent base) per UDP port,
 * bound to consecutive ports starting at baseport (-p, default 4000).
 * With -c, all workers share worker 0's non-blocking socket.  The main
 * thread prints aggregate packets-per-second once a second, forever.
 *
 * NOTE(review): this listing uses pthread_t/pthread_create but its
 * include list lacks <pthread.h>; add it for a clean build.
 */
int main(int argc, char *argv[])
{
	int c;
	int baseport = 4000;
	int nbthreads;
	struct worker_data *wdata;
	unsigned long ototal = 0;
	int concurrent = 0;
	int verbose = 0;
	int i;

	while ((c = getopt(argc, argv, "cvp:")) != -1) {
		if (c == 'p')
			baseport = atoi(optarg);
		else if (c == 'c')
			concurrent = 1;
		else if (c == 'v')
			verbose++;
		else
			usage(1);
	}
	if (optind == argc)
		usage(1);
	nbthreads = atoi(argv[optind]);
	if (nbthreads <= 0)	/* atoi() yields 0 on non-numeric input */
		usage(1);
	/* calloc(nmemb, size): element count first, then element size. */
	wdata = calloc(nbthreads, sizeof(struct worker_data));
	if (!wdata) {
		perror("calloc");
		return 1;
	}

	for (i = 0; i < nbthreads; i++) {
		struct sockaddr_in addr;
		pthread_t tid;

		/*
		 * Every worker needs its own struct event, even when the
		 * socket is shared (-c): prep_thread()'s event_set() writes
		 * through it, so leaving it NULL for i > 0 would crash.
		 */
		wdata[i].snk_ev = malloc(sizeof(struct event));
		if (!wdata[i].snk_ev)
			return 1;
		memset(wdata[i].snk_ev, 0, sizeof(struct event));

		if (i && concurrent) {
			wdata[i].fd = wdata[0].fd;	/* share worker 0's socket */
		} else {
			wdata[i].fd = socket(PF_INET, SOCK_DGRAM, 0);
			if (wdata[i].fd == -1) {
				free(wdata[i].snk_ev);
				perror("socket");
				return 1;
			}
			memset(&addr, 0, sizeof(addr));
			addr.sin_family = AF_INET;
//                      addr.sin_addr.s_addr = inet_addr(argv[optind]);
			addr.sin_port = htons(baseport + i);
			if (bind
			    (wdata[i].fd, (struct sockaddr *)&addr,
			     sizeof(addr)) < 0) {
				free(wdata[i].snk_ev);
				perror("bind");
				return 1;
			}
			/* Non-blocking so process_recv() can drain in a loop. */
			fcntl(wdata[i].fd, F_SETFL, O_NDELAY);
		}
		if (prep_thread(wdata + i)) {
			printf("failed to allocate thread %d, exit\n", i);
			exit(0);
		}
		if (pthread_create(&tid, NULL, worker_func, wdata + i)) {
			perror("pthread_create");
			return 1;
		}
	}

	/* Reporting loop: print the one-second delta of total packets. */
	for (;;) {
		unsigned long total;
		long delta;

		sleep(1);
		total = 0;
		for (i = 0; i < nbthreads; i++)
			total += wdata[i].pack_count;
		delta = total - ototal;
		if (delta) {
			printf("%lu pps (%lu", delta, total);
			if (verbose) {
				for (i = 0; i < nbthreads; i++) {
					if (wdata[i].pack_count)
						printf(" %d:%lu", i,
						       wdata[i].pack_count);
				}
			}
			printf(")\n");
		}
		ototal = total;
	}
}




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01  5:57                                           ` Eric Dumazet
@ 2010-05-01  6:14                                             ` Eric Dumazet
  2010-05-01 10:24                                               ` Changli Gao
  2010-05-01 11:29                                               ` jamal
  2010-05-01 11:23                                             ` jamal
  1 sibling, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-01  6:14 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le samedi 01 mai 2010 à 07:57 +0200, Eric Dumazet a écrit :
> Le vendredi 30 avril 2010 à 20:06 -0400, jamal a écrit :
> 
> > Yes, Nehalem. 
> > RPS off is better (~700Kpp) than RPS on(~650kpps). Are you seeing the
> > same trend on the old hardware?
> > 
> 
> Of course not ! Or else RPS would be useless :(
> 
> I changed your program a bit to use EV_PERSIST, (to avoid epoll_ctl()
> overhead for each packet...)
> 
> RPS off : 220.000 pps 
> 
> RPS on (ee mask) : 700.000 pps  (with a slightly modified tg3 driver)
> 96% of delivered packets

BTW, using ee mask, cpu4 is not used at _all_, even for the user
threads. Scheduler does a bad job IMHO.

Using fe mask, I get all packets (sent at 733311pps by my pktgen
machine), and my CPU0 even has idle time !!!

Limit seems to be around 800.000 pps

------------------------------------------------------------------------------------------------------------------------
   PerfTop:    5616 irqs/sec  kernel:93.9% [1000Hz cycles],  (all, 8 CPUs)
------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ _______

             3492.00  6.2% __slab_free                 vmlinux
             2334.00  4.2% _raw_spin_lock              vmlinux
             2314.00  4.1% _raw_spin_lock_irqsave      vmlinux
             1807.00  3.2% ip_rcv                      vmlinux
             1605.00  2.9% schedule                    vmlinux
             1474.00  2.6% __netif_receive_skb         vmlinux
             1464.00  2.6% kfree                       vmlinux
             1405.00  2.5% ip_route_input              vmlinux
             1318.00  2.4% __copy_to_user_ll           vmlinux
             1214.00  2.2% __alloc_skb                 vmlinux
             1160.00  2.1% nf_hook_slow                vmlinux
             1020.00  1.8% eth_type_trans              vmlinux
              860.00  1.5% sched_clock_local           vmlinux
              775.00  1.4% read_tsc                    vmlinux
              773.00  1.4% ipt_do_table                vmlinux
              766.00  1.4% _raw_spin_unlock_irqrestore vmlinux
              748.00  1.3% sock_recv_ts_and_drops      vmlinux
              747.00  1.3% ia32_sysenter_target        vmlinux
              740.00  1.3% select_nohz_load_balancer   vmlinux
              644.00  1.2% __kmalloc_track_caller      vmlinux
              596.00  1.1% tg3_read32                  vmlinux
              566.00  1.0% __udp4_lib_lookup           vmlinux





^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
  2010-04-30 23:35                                       ` David Miller
  2010-05-01  4:56                                         ` Eric Dumazet
@ 2010-05-01  7:02                                         ` Eric Dumazet
  2010-05-01  8:03                                           ` Eric Dumazet
  1 sibling, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-01  7:02 UTC (permalink / raw)
  To: David Miller; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

Le vendredi 30 avril 2010 à 16:35 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 29 Apr 2010 23:01:49 +0200
> 
> > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
> 
> So what's the difference between call_rcu() freeing this little waitqueue
> struct and doing it for the entire socket?
> 
> We'll still be doing an RCU call every socket destroy, and now we also have
> a new memory allocation/free per connection.
> 
> This has to show up in things like 'lat_connect' and friends, does it not?

Before patch :

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 27.8872 microseconds

After :

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 20.7681 microseconds

Strange isnt it ?

(special care should be taken with this bench, as it leave many sockets
in TIME_WAIT state, so to get consistent numbers we have to wait a while
before restarting it)




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
  2010-05-01  7:02                                         ` Eric Dumazet
@ 2010-05-01  8:03                                           ` Eric Dumazet
  2010-05-01 22:00                                             ` David Miller
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-01  8:03 UTC (permalink / raw)
  To: David Miller; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

Le samedi 01 mai 2010 à 09:02 +0200, Eric Dumazet a écrit :
> Le vendredi 30 avril 2010 à 16:35 -0700, David Miller a écrit :
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Thu, 29 Apr 2010 23:01:49 +0200
> > 
> > > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
> > 
> > So what's the difference between call_rcu() freeing this little waitqueue
> > struct and doing it for the entire socket?
> > 
> > We'll still be doing an RCU call every socket destroy, and now we also have
> > a new memory allocation/free per connection.
> > 
> > This has to show up in things like 'lat_connect' and friends, does it not?
> 
> Before patch :
> 
> lat_connect -N 10 127.0.0.1
> TCP/IP connection cost to 127.0.0.1: 27.8872 microseconds
> 
> After :
> 
> lat_connect -N 10 127.0.0.1
> TCP/IP connection cost to 127.0.0.1: 20.7681 microseconds
> 
> Strange isnt it ?
> 
> (special care should be taken with this bench, as it leave many sockets
> in TIME_WAIT state, so to get consistent numbers we have to wait a while
> before restarting it)


Oops, this was with the other patch (about dst no_refcounting in input
path), sorry.

With the "sock_def_readable() and friends RCU conversion" patch I got :

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 27.6244 microseconds


Anyway, this lat_connect seems very unreliable (lot of variance)

with linux-2.6.31, ~33 us
with linux-2.6.33, ~30 us

David, I also need this RCU thing in order to be able to group all
wakeups at the end of net_rx_action().

Plan was to use RCU, so that I dont need to increase sk_refcnt when
queueing a "wakeup" (and decrease sk_refcnt a long time after)

Previous attempt was a bit hacky,
http://patchwork.ozlabs.org/patch/24179/

I expect 2010 one will be cleaner :)



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01  6:14                                             ` Eric Dumazet
@ 2010-05-01 10:24                                               ` Changli Gao
  2010-05-01 10:47                                                 ` Eric Dumazet
  2010-05-01 11:29                                               ` jamal
  1 sibling, 1 reply; 108+ messages in thread
From: Changli Gao @ 2010-05-01 10:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: hadi, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Sat, May 1, 2010 at 2:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> BTW, using ee mask, cpu4 is not used at _all_, even for the user
> threads. Scheduler does a bad job IMHO.
>
> Using fe mask, I get all packets (sent at 733311pps by my pktgen
> machine), and my CPU0 even has idle time !!!
>
> Limit seems to be around 800.000 pps
>
> ------------------------------------------------------------------------------------------------------------------------
>   PerfTop:    5616 irqs/sec  kernel:93.9% [1000Hz cycles],  (all, 8 CPUs)
> ------------------------------------------------------------------------------------------------------------------------
>

Oh, cpu0 usage is about 100-(100-93.9)*8 = 51.2%(Am I right?). If we
can do weighted packet distributing: cpu0's weight is 1, and other
cpus are 2. maybe we can utilize all the cpu power.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01 10:24                                               ` Changli Gao
@ 2010-05-01 10:47                                                 ` Eric Dumazet
  0 siblings, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-01 10:47 UTC (permalink / raw)
  To: Changli Gao
  Cc: hadi, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le samedi 01 mai 2010 à 18:24 +0800, Changli Gao a écrit :
> On Sat, May 1, 2010 at 2:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > BTW, using ee mask, cpu4 is not used at _all_, even for the user
> > threads. Scheduler does a bad job IMHO.
> >
> > Using fe mask, I get all packets (sent at 733311pps by my pktgen
> > machine), and my CPU0 even has idle time !!!
> >
> > Limit seems to be around 800.000 pps
> >
> > ------------------------------------------------------------------------------------------------------------------------
> >   PerfTop:    5616 irqs/sec  kernel:93.9% [1000Hz cycles],  (all, 8 CPUs)
> > ------------------------------------------------------------------------------------------------------------------------
> >
> 
> Oh, cpu0 usage is about 100-(100-93.9)*8 = 51.2%(Am I right?). If we
> can do weighted packet distributing: cpu0's weight is 1, and other
> cpus are 2. maybe we can utilize all the cpu power.
> 

Nope, cpu0 was at 100% in this test, other cpus were about at 50% each.

weigthed would be ok if I wanted to use cpu0 in the 'slave' cpus (RPS
targets). But I know the workload I am interested to, and ability to
resist to DDOS, want to keep cpu0 outside of IP/TCP/UDP stack.


Later, skb_pull() inline in eth_type_trans() permitted to reach 840.000
pps.

top - 12:42:55 up  3:00,  2 users,  load average: 0.44, 0.11, 0.03
Tasks: 126 total,   1 running, 125 sleeping,   0 stopped,   0 zombie
Cpu(s):  2.2%us, 16.5%sy,  0.0%ni, 46.5%id, 11.4%wa,  0.9%hi, 22.5%si,
0.0%st
Mem:   4148112k total,   211152k used,  3936960k free,    15228k buffers
Swap:  4192928k total,        0k used,  4192928k free,   121804k cached

You can see average idle of 46%
So there is probably more optimizations to do to reach maybe 1.300.000
pps ;)




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-30 22:30                       ` David Miller
@ 2010-05-01 10:53                         ` Andi Kleen
  2010-05-01 22:03                           ` David Miller
  0 siblings, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-01 10:53 UTC (permalink / raw)
  To: David Miller; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz

> And we don't want it to, because the decision mechanisms for steering
> that we using now are starting to get into the stateful territory and
> that's verbotton for NIC offload as far as we're concerned.

Huh? I thought full TCP offload was forbidden?[1] Statefull as in NIC 
(or someone else like netfilter) tracking flows is quite common and very far 
from full offload. AFAIK it doesn't have near all the problems full
offload has.

-Andi

[1] although it seems to leak in more and more through the RDMA backdoor.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-04-30 23:38                     ` David Miller
@ 2010-05-01 11:00                       ` Andi Kleen
  2010-05-02  6:56                         ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-01 11:00 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

On Fri, Apr 30, 2010 at 04:38:57PM -0700, David Miller wrote:
> From: Andi Kleen <ak@gargoyle.fritz.box>
> Date: Thu, 29 Apr 2010 23:41:44 +0200
> 
> >     Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies
> > 
> >     XXX: probably too aggressive, some of these sleeps are not under high load.
> > 
> >     Based on a bug report from Eric Dumazet.
> >     
> >     Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> I like this, except that we probably don't want the delayacct_blkio_*() calls
> these things do.

Yes.

It needs more work, please don't apply it yet, to handle the "long sleep" case.

Still curious if it fixes Eric's test case.

> 
> Probably the rest of what these things do should remain in the io_schedule*()
> functions and the block layer can call it's own versions which add in the
> delayacct_blkio_*() bits.

Good point.

> 
> Or, if the delacct stuff is useful for socket I/O too, then it's interfaces
> names should have the "blk" stripped from them :-)

Good question. I suspect it's actually useful for some cases, but just adding
sockets might confuse some users.

-Andi

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01  5:57                                           ` Eric Dumazet
  2010-05-01  6:14                                             ` Eric Dumazet
@ 2010-05-01 11:23                                             ` jamal
  2010-05-01 11:42                                               ` Eric Dumazet
  1 sibling, 1 reply; 108+ messages in thread
From: jamal @ 2010-05-01 11:23 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 07:57 +0200, Eric Dumazet wrote:

> I changed your program a bit to use EV_PERSIST, (to avoid epoll_ctl()
> overhead for each packet...)

Thats a different test case then ;-> You can also get rid of the timer
(I doubt it will show much difference in results) - I have it in there
because it i am trying to replicate what i saw causing the regression.

> RPS off : 220.000 pps 
> 
> RPS on (ee mask) : 700.000 pps  (with a slightly modified tg3 driver)
> 96% of delivered packets
> 

That's a very very huge gap. What were the numbers before you changed to
EV_PERSIST?
Note: i did not add any of your other patches for dst refcnt, sockets
etc. Were you running with those patches in these tests? I will try the
next opportunity i get to have latest kernel + those patches. 

> This is on tg3 adapter, and tg3 has copybreak feature : small packets
> are copied into skb of the right size.

Ok, so the driver tuning is also important then (and it shows in the
profile).

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01  6:14                                             ` Eric Dumazet
  2010-05-01 10:24                                               ` Changli Gao
@ 2010-05-01 11:29                                               ` jamal
  1 sibling, 0 replies; 108+ messages in thread
From: jamal @ 2010-05-01 11:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 08:14 +0200, Eric Dumazet wrote:

> BTW, using ee mask, cpu4 is not used at _all_, even for the user
> threads. Scheduler does a bad job IMHO.

I have the opposite frustration ;->
I did notice it got used. My goal was to totally avoid using it, for
simple reason it is an SMT thread that shares same core as cpu0.
In retrospect i should probably set irq affinity then to cpu0 and 4.

> Using fe mask, I get all packets (sent at 733311pps by my pktgen
> machine), and my CPU0 even has idle time !!!

I will try this next time i get the chance.

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01 11:23                                             ` jamal
@ 2010-05-01 11:42                                               ` Eric Dumazet
  2010-05-01 11:56                                                 ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-01 11:42 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le samedi 01 mai 2010 à 07:23 -0400, jamal a écrit :
> On Sat, 2010-05-01 at 07:57 +0200, Eric Dumazet wrote:
> 
> > I changed your program a bit to use EV_PERSIST, (to avoid epoll_ctl()
> > overhead for each packet...)
> 
> Thats a different test case then ;-> You can also get rid of the timer
> (I doubt it will show much difference in results) - I have it in there
> because it i am trying to replicate what i saw causing the regression.
> 
> > RPS off : 220.000 pps 
> > 
> > RPS on (ee mask) : 700.000 pps  (with a slightly modified tg3 driver)
> > 96% of delivered packets
> > 
> 
> That's a very very huge gap. What were the numbers before you changed to
> EV_PERSIST?

But, whole point of epoll is to not change interest each time you get an
event.

Without EV_PERSIST, you need two more syscalls per recvfrom()

epoll_wait()
 epoll_ctl(REMOVE)
 epoll_ctl(ADD)
 recvfrom()

Even poll() would be faster in your case

poll(one fd)
recvfrom()



> Note: i did not add any of your other patches for dst refcnt, sockets
> etc. Were you running with those patches in these tests? I will try the
> next opportunity i get to have latest kernel + those patches. 
> 
> > This is on tg3 adapter, and tg3 has copybreak feature : small packets
> > are copied into skb of the right size.
> 
> Ok, so the driver tuning is also important then (and it shows in the
> profile).

I always thought copybreak was borderline...

It can help to reduce memory footprint (allocating 128 bytes instead of
2048/4096 bytes per frame), but with RPS, it would make sense to perform
copybreak after RPS, not before.

Reducing memory footprint also means less changes on
udp_memory_allocated /tcp_memory_allocate (memory reclaim logic)




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01 11:42                                               ` Eric Dumazet
@ 2010-05-01 11:56                                                 ` jamal
  2010-05-01 13:22                                                   ` Eric Dumazet
  2010-05-03 20:10                                                   ` jamal
  0 siblings, 2 replies; 108+ messages in thread
From: jamal @ 2010-05-01 11:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 13:42 +0200, Eric Dumazet wrote:

> But, whole point of epoll is to not change interest each time you get an
> event.
> 
> Without EV_PERSIST, you need two more syscalls per recvfrom()
> 
> epoll_wait()
>  epoll_ctl(REMOVE)
>  epoll_ctl(ADD)
>  recvfrom()
> 
> Even poll() would be faster in your case
> 
> poll(one fd)
> recvfrom()
> 

This is true - but my goal was/is to replicate the regression i was
seeing[1]. 
I will try with PERSIST next opportunity. If it gets better
then it is something that needs documentation in the doc Tom
promised ;->

> I always thought copybreak was borderline...
> It can help to reduce memory footprint (allocating 128 bytes instead of
> 2048/4096 bytes per frame), but with RPS, it would make sense to perform
> copybreak after RPS, not before.
> 
> Reducing memory footprint also means less changes on
> udp_memory_allocated /tcp_memory_allocate (memory reclaim logic)

Indeed, something that didnt cross my mind in the rush to test - it is
one of those things that need to be mentioned in some doc somewhere.
Tom, are you listening? ;->

cheers,
jamal

[1]i.e with this program rps was getting worse (it was much better
before say net-next of apr14) and that non-rps has been getting better
numbers since. The regression is real - but it is likely in another
subsystem.


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01 11:56                                                 ` jamal
@ 2010-05-01 13:22                                                   ` Eric Dumazet
  2010-05-01 13:49                                                     ` jamal
  2010-05-03 20:10                                                   ` jamal
  1 sibling, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-01 13:22 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

Le samedi 01 mai 2010 à 07:56 -0400, jamal a écrit :

> 
> [1]i.e with this program rps was getting worse (it was much better
> before say net-next of apr14) and that non-rps has been getting better
> numbers since. The regression is real - but it is likely in another
> subsystem.
> 

You must understand that the whole 'bench' is mostly governed by
scheduler artifacts. The regression you mention is probably a side
effect.

By slowing down one part, its possible to zap all calls to scheduler and
go maybe 300% faster (Because consumer threads can avoid 3/4 of the time
to schedule)

Reciprocally, optimizing one part of the network stack might make
threads hitting an empty queue, and need to call more often the
scheduler.

This is why some higly specialized programs never block/schedule and
perform busy loops instead.




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01 13:22                                                   ` Eric Dumazet
@ 2010-05-01 13:49                                                     ` jamal
  0 siblings, 0 replies; 108+ messages in thread
From: jamal @ 2010-05-01 13:49 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 15:22 +0200, Eric Dumazet wrote:

> You must understand that the whole 'bench' is mostly governed by
> scheduler artifacts. The regression you mention is probably a side
> effect.

likely.

> By slowing down one part, its possible to zap all calls to scheduler and
> go maybe 300% faster (Because consumer threads can avoid 3/4 of the time
> to schedule)
> 
> Reciprocally, optimizing one part of the network stack might make
> threads hitting an empty queue, and need to call more often the
> scheduler.

It is fair to say that what i am seeing is _not_ fatal because it is rps
that is regressing; non-rps is fine. I would consider non-rps to be the
common use scenario and if that was doing badly then it is a problem.
The good news is it is getting better - likely because of some changes
made on behalf of rps ;->
With rps, one could follow some instructions on how to make it better.
I am hoping that some of the system "magic" is documented as Tom
mentioned he will.

> This is why some higly specialized programs never block/schedule and
> perform busy loops instead.

Agreed. My brain cells should learn to accept this fact ;->

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-04-30 18:57                   ` David Miller
  2010-04-30 19:58                     ` Thomas Gleixner
  2010-04-30 21:01                     ` Andi Kleen
@ 2010-05-01 20:31                     ` Martin Josefsson
  2010-05-01 22:13                       ` David Miller
  2 siblings, 1 reply; 108+ messages in thread
From: Martin Josefsson @ 2010-05-01 20:31 UTC (permalink / raw)
  To: David Miller; +Cc: tglx, shemminger, eric.dumazet, ak, netdev, andi, peterz

On Fri, 30 Apr 2010, David Miller wrote:

> Then we can do cool tricks like having the cpu spin on a mwait() on the
> network device's status descriptor in memory.

Can you have mwait monitor multiple cachelines for stores? If not then it 
might be hard to do that when you have multiple nics and you actually 
need to use the status descriptors, otherwise you could possibly have them 
all written to the same cacheline. 
Or if the nic doesn't support updating a status descriptor in memory.

If you just want to wake up quickly without using interrupts it might be 
possible to abuse MSI to wake up without actually using interrupts, set 
the address to the cacheline that is being monitored.

/Martin

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
  2010-05-01  8:03                                           ` Eric Dumazet
@ 2010-05-01 22:00                                             ` David Miller
  0 siblings, 0 replies; 108+ messages in thread
From: David Miller @ 2010-05-01 22:00 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 01 May 2010 10:03:31 +0200

> David, I also need this RCU thing in order to be able to group all
> wakeups at the end of net_rx_action().
> 
> Plan was to use RCU, so that I dont need to increase sk_refcnt when
> queueing a "wakeup" (and decrease sk_refcnt a long time after)
> 
> Previous attempt was a bit hacky,
> http://patchwork.ozlabs.org/patch/24179/
> 
> I expect 2010 one will be cleaner :)

Fair enough, I'm convinced now, applied thanks!

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-05-01 10:53                         ` Andi Kleen
@ 2010-05-01 22:03                           ` David Miller
  2010-05-01 22:58                             ` Andi Kleen
  2010-05-01 23:44                             ` Ben Hutchings
  0 siblings, 2 replies; 108+ messages in thread
From: David Miller @ 2010-05-01 22:03 UTC (permalink / raw)
  To: andi; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz

From: Andi Kleen <andi@firstfloor.org>
Date: Sat, 1 May 2010 12:53:04 +0200

>> And we don't want it to, because the decision mechanisms for steering
>> that we using now are starting to get into the stateful territory and
>> that's verbotton for NIC offload as far as we're concerned.
> 
> Huh? I thought full TCP offload was forbidden?[1] Statefull as in NIC 
> (or someone else like netfilter) tracking flows is quite common and very far 
> from full offload. AFAIK it doesn't have near all the problems full
> offload has.

We're tracking flow cpu location state at the socket operations, like
recvmsg() and sendmsg(), where it belongs.

Would you like us to call into the card drivers and firmware at these
spots instead?

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-05-01 20:31                     ` Martin Josefsson
@ 2010-05-01 22:13                       ` David Miller
  0 siblings, 0 replies; 108+ messages in thread
From: David Miller @ 2010-05-01 22:13 UTC (permalink / raw)
  To: gandalf; +Cc: tglx, shemminger, eric.dumazet, ak, netdev, andi, peterz

From: Martin Josefsson <gandalf@mjufs.se>
Date: Sat, 1 May 2010 22:31:05 +0200 (CEST)

> On Fri, 30 Apr 2010, David Miller wrote:
> 
>> Then we can do cool tricks like having the cpu spin on a mwait() on
>> the
>> network device's status descriptor in memory.
> 
> Can you have mwait monitor multiple cachelines for stores?

The idea is that if you have hundreds of cpus threads (several of my
machines do, and it's not too long before these kinds of boxes will be
common) in your machine you can spare one for each NIC.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-05-01 22:03                           ` David Miller
@ 2010-05-01 22:58                             ` Andi Kleen
  2010-05-01 23:29                               ` David Miller
  2010-05-01 23:44                             ` Ben Hutchings
  1 sibling, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-01 22:58 UTC (permalink / raw)
  To: David Miller; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz

> We're tracking flow cpu location state at the socket operations, like
> recvmsg() and sendmsg(), where it belongs.
> 
> Would you like us to call into the card drivers and firmware at these
> spots instead?

No, that's not needed for lazy flow tracking like in netfilter or 
some NICs, it doesn't need exact updates. It just works with seen network 
packets. 

-Andi

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-05-01 22:58                             ` Andi Kleen
@ 2010-05-01 23:29                               ` David Miller
  0 siblings, 0 replies; 108+ messages in thread
From: David Miller @ 2010-05-01 23:29 UTC (permalink / raw)
  To: andi; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz

From: Andi Kleen <andi@firstfloor.org>
Date: Sun, 2 May 2010 00:58:15 +0200

>> We're tracking flow cpu location state at the socket operations, like
>> recvmsg() and sendmsg(), where it belongs.
>> 
>> Would you like us to call into the card drivers and firmware at these
>> spots instead?
> 
> No, that's not needed for lazy flow tracking like in netfilter or 
> some NICs, it doesn't need exact updates. It just works with seen network 
> packets. 

Well what we need is exact flow updates so that we steer packets
to where the applications actually are.

Andi, this discussion is going in circles, can I just say "yeah you're
right Andi" and this will satisfy your desire to be correct and we can
be done with this?

Thanks.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: OFT - reserving CPU's for networking
  2010-05-01 22:03                           ` David Miller
  2010-05-01 22:58                             ` Andi Kleen
@ 2010-05-01 23:44                             ` Ben Hutchings
  1 sibling, 0 replies; 108+ messages in thread
From: Ben Hutchings @ 2010-05-01 23:44 UTC (permalink / raw)
  To: David Miller; +Cc: andi, tglx, shemminger, eric.dumazet, netdev, peterz

On Sat, 2010-05-01 at 15:03 -0700, David Miller wrote:
> From: Andi Kleen <andi@firstfloor.org>
> Date: Sat, 1 May 2010 12:53:04 +0200
> 
> >> And we don't want it to, because the decision mechanisms for steering
> >> that we using now are starting to get into the stateful territory and
> >> that's verbotton for NIC offload as far as we're concerned.
> > 
> > Huh? I thought full TCP offload was forbidden?[1] Statefull as in NIC 
> > (or someone else like netfilter) tracking flows is quite common and very far 
> > from full offload. AFAIK it doesn't have near all the problems full
> > offload has.
> 
> We're tracking flow cpu location state at the socket operations, like
> recvmsg() and sendmsg(), where it belongs.
> 
> Would you like us to call into the card drivers and firmware at these
> spots instead?

I'm interested in experimenting with this at some point, since our
hardware supports a fairly large number of filters that could be used
for it.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-01 11:00                       ` Andi Kleen
@ 2010-05-02  6:56                         ` Eric Dumazet
  2010-05-02  9:20                           ` Andi Kleen
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02  6:56 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

Le samedi 01 mai 2010 à 13:00 +0200, Andi Kleen a écrit :
> On Fri, Apr 30, 2010 at 04:38:57PM -0700, David Miller wrote:
> > From: Andi Kleen <ak@gargoyle.fritz.box>
> > Date: Thu, 29 Apr 2010 23:41:44 +0200
> > 
> > >     Use io_schedule() in network stack to tell cpuidle governor to guarantee lower latencies
> > > 
> > >     XXX: probably too aggressive, some of these sleeps are not under high load.
> > > 
> > >     Based on a bug report from Eric Dumazet.
> > >     
> > >     Signed-off-by: Andi Kleen <ak@linux.intel.com>
> > 
> > I like this, except that we probably don't want the delayacct_blkio_*() calls
> > these things do.
> 
> Yes.
> 
> It needs more work, please don't apply it yet, to handle the "long sleep" case.
> 
> Still curious if it fixes Eric's test case.
> 

I tried it on the right spot (since my bench was only doing recvmsg()
calls, I had to patch wait_for_packet() in net/core/datagram.c)

udp_recvmsg -> __skb_recv_datagram -> wait_for_packet ->
schedule_timeout

Unfortunately, using io_schedule_timeout() did not solve the problem.

Tell me if you need some traces or something.

Thanks !

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 95b851f..051fd5b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -113,7 +113,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
 		goto interrupted;
 
 	error = 0;
-	*timeo_p = schedule_timeout(*timeo_p);
+	*timeo_p = io_schedule_timeout(*timeo_p);
 out:
 	finish_wait(sk_sleep(sk), &wait);
 	return error;



^ permalink raw reply related	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02  6:56                         ` Eric Dumazet
@ 2010-05-02  9:20                           ` Andi Kleen
  2010-05-02 10:54                             ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-02  9:20 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

> I tried it on the right spot (since my bench was only doing recvmsg()
> calls, I had to patch wait_for_packet() in net/core/datagram.c
> 
> udp_recvmsg -> __skb_recv_datagram -> wait_for_packet ->
> schedule_timeout
> 
> Unfortunately, using io_schedule_timeout() did not solve the problem.

Hmm, too bad. Weird.

> 
> Tell me if you need some traces or something.

I'll try to reproduce it and see what I can do.

-Andi


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02  9:20                           ` Andi Kleen
@ 2010-05-02 10:54                             ` Eric Dumazet
  2010-05-02 14:13                               ` Arjan van de Ven
  2010-05-02 15:46                               ` Andi Kleen
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 10:54 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

Le dimanche 02 mai 2010 à 11:20 +0200, Andi Kleen a écrit :
> > I tried it on the right spot (since my bench was only doing recvmsg()
> > calls, I had to patch wait_for_packet() in net/core/datagram.c
> > 
> > udp_recvmsg -> __skb_recv_datagram -> wait_for_packet ->
> > schedule_timeout
> > 
> > Unfortunately, using io_schedule_timeout() did not solve the problem.
> 
> Hmm, too bad. Weird.
> 
> > 
> > Tell me if you need some traces or something.
> 
> I'll try to reproduce it and see what I can do.
> 

Here the perf report on the latest test done, I confirm I am using
io_schedule_timeout() in this kernel.

In this test, all 16 queues of one BCM57711E NIC (1Gb link) delivers
 packets at about 1.300.000 pps to 16 cpus (one cpu per queue) and these
packets are then redistributed by RPS to same 16 cpus, generating about
650.000 IPI per second.

top says :
Cpu(s):  3.0%us, 17.3%sy,  0.0%ni, 22.4%id, 28.2%wa,  0.0%hi, 29.1%si,
0.0%st


# Samples: 321362570767
#
# Overhead         Command                 Shared Object  Symbol
# ........  ..............  ............................  ......
#
    25.08%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
                      |
                      --- _raw_spin_lock_irqsave
                         |          
                         |--93.47%-- clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--4.70%-- tick_broadcast_oneshot_control
                         |          tick_notify
                         |          notifier_call_chain
                         |          __raw_notifier_call_chain
                         |          raw_notifier_call_chain
                         |          clockevents_do_notify
                         |          clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--0.64%-- generic_exec_single
                         |          __smp_call_function_single
                         |          net_rps_action_and_irq_enable
...
     9.72%            init  [kernel.kallsyms]             [k] acpi_os_read_port
                      |
                      --- acpi_os_read_port
                         |          
                         |--99.45%-- acpi_hw_read_port
                         |          acpi_hw_read
                         |          acpi_hw_read_multiple
                         |          acpi_hw_register_read
                         |          acpi_read_bit_register
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                          --0.55%-- acpi_hw_read
                                    acpi_hw_read_multiple

powertop says :
     PowerTOP version 1.11      (C) 2007 Intel Corporation

Cn                Avg residency       P-states (frequencies)
C0 (cpu running)        (68.9%)         2.93 Ghz    46.5%
polling           0.0ms ( 0.0%)         2.80 Ghz     5.1%
C1 mwait          0.0ms ( 0.0%)         2.53 Ghz     3.0%
C2 mwait          0.0ms (31.1%)         2.13 Ghz     2.8%
                                        1.60 Ghz    38.2%

Wakeups-from-idle per second : 45177.8  interval: 5.0s
no ACPI power usage estimate available

Top causes for wakeups:
   9.9% (40863.0)       <interrupt> : eth1-fp-7 
   9.9% (40861.0)       <interrupt> : eth1-fp-8 
   9.9% (40858.0)       <interrupt> : eth1-fp-5 
   9.9% (40855.2)       <interrupt> : eth1-fp-10 
   9.9% (40847.6)       <interrupt> : eth1-fp-14 
   9.9% (40847.2)       <interrupt> : eth1-fp-12 
   9.9% (40835.0)       <interrupt> : eth1-fp-1 
   9.9% (40834.2)       <interrupt> : eth1-fp-3 
   9.9% (40834.0)       <interrupt> : eth1-fp-6 
   9.9% (40829.6)       <interrupt> : eth1-fp-4 
   1.0% (4002.0)     <kernel core> : hrtimer_start_range_ns (tick_sched_timer) 
   0.4% (1725.6)       <interrupt> : extra timer interrupt 
   0.0% (  4.0)     <kernel core> : usb_hcd_poll_rh_status (rh_timer_func)
   0.0% (  2.0)     <kernel core> : clocksource_watchdog (clocksource_watchdog)
   0.0% (  2.0)             snmpd : hrtimer_start_range_ns (hrtimer_wakeup)



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 10:54                             ` Eric Dumazet
@ 2010-05-02 14:13                               ` Arjan van de Ven
  2010-05-02 14:27                                 ` Eric Dumazet
  2010-05-02 15:46                               ` Andi Kleen
  1 sibling, 1 reply; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-02 14:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

> 
> Cn                Avg residency       P-states (frequencies)
> C0 (cpu running)        (68.9%)         2.93 Ghz    46.5%
> polling           0.0ms ( 0.0%)         2.80 Ghz     5.1%
> C1 mwait          0.0ms ( 0.0%)         2.53 Ghz     3.0%
> C2 mwait          0.0ms (31.1%)         2.13 Ghz     2.8%
>                                         1.60 Ghz    38.2%

I bet your system advertises C2 with the same latency as C1,
but with lower power... which means Linux will pretty much never
pick C1.... no matter how much you take Andi's patch.

this is a bios thing... and until we put in the patch to override the
bios values (I can dust it off but it might need a bit of tweaking
since it was against .31) Andi's patch alone won't cut it... you also
need a non-lying bios ;)



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 14:13                               ` Arjan van de Ven
@ 2010-05-02 14:27                                 ` Eric Dumazet
  2010-05-02 15:32                                   ` Eric Dumazet
  2010-05-02 17:54                                   ` Arjan van de Ven
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 14:27 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

Le dimanche 02 mai 2010 à 07:13 -0700, Arjan van de Ven a écrit :
> > 
> > Cn                Avg residency       P-states (frequencies)
> > C0 (cpu running)        (68.9%)         2.93 Ghz    46.5%
> > polling           0.0ms ( 0.0%)         2.80 Ghz     5.1%
> > C1 mwait          0.0ms ( 0.0%)         2.53 Ghz     3.0%
> > C2 mwait          0.0ms (31.1%)         2.13 Ghz     2.8%
> >                                         1.60 Ghz    38.2%
> 
> I bet your system advertizes C2 with the same latency as C1,
> but with lower power... which means Linux will pretty much never
> pick C1.... no matter how much you take Andi's patch.
> 
> this is a bios thing... and until we put in the patch to override the
> bios values (I can dust it off but it might need a bit of tweaking
> since it was against .31) Andi's patch alone won't cut it... you also
> need a non-lying bios ;)
> 
> 
> 
# pwd
/sys/devices/system/cpu/cpu15/cpuidle
# grep . */*
state0/desc:CPUIDLE CORE POLL IDLE
state0/latency:0
state0/name:C0
state0/power:4294967295
state0/time:0
state0/usage:0
state1/desc:ACPI FFH INTEL MWAIT 0x0
state1/latency:1
state1/name:C1
state1/power:1000
state1/time:433855186
state1/usage:126869
state2/desc:ACPI FFH INTEL MWAIT 0x10
state2/latency:64
state2/name:C2
state2/power:500
state2/time:198095020416
state2/usage:76287744

C2 latency seems to be 64  (us ?), while C1 seems to be 1

BIOS Information
	Vendor: HP
	Version: I24
	Release Date: 10/01/2009

# powertop
PowerTOP 1.11   (C) 2007, 2008 Intel Corporation 

Collecting data for 5 seconds 


Your CPU supports the following C-states : C1 C2 C3 
Your BIOS reports the following C-states : C1 C2 

C3 seems to be disabled in BIOS



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 14:27                                 ` Eric Dumazet
@ 2010-05-02 15:32                                   ` Eric Dumazet
  2010-05-02 17:54                                   ` Arjan van de Ven
  1 sibling, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 15:32 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

Le dimanche 02 mai 2010 à 16:27 +0200, Eric Dumazet a écrit :
> Le dimanche 02 mai 2010 à 07:13 -0700, Arjan van de Ven a écrit :
> > > 
> > > Cn                Avg residency       P-states (frequencies)
> > > C0 (cpu running)        (68.9%)         2.93 Ghz    46.5%
> > > polling           0.0ms ( 0.0%)         2.80 Ghz     5.1%
> > > C1 mwait          0.0ms ( 0.0%)         2.53 Ghz     3.0%
> > > C2 mwait          0.0ms (31.1%)         2.13 Ghz     2.8%
> > >                                         1.60 Ghz    38.2%
> > 
> > I bet your system advertizes C2 with the same latency as C1,
> > but with lower power... which means Linux will pretty much never
> > pick C1.... no matter how much you take Andi's patch.
> > 
> > this is a bios thing... and until we put in the patch to override the
> > bios values (I can dust it off but it might need a bit of tweaking
> > since it was against .31) Andi's patch alone won't cut it... you also
> > need a non-lying bios ;)
> > 
> > 
> > 
> # pwd
> /sys/devices/system/cpu/cpu15/cpuidle
> # grep . */*
> state0/desc:CPUIDLE CORE POLL IDLE
> state0/latency:0
> state0/name:C0
> state0/power:4294967295
> state0/time:0
> state0/usage:0
> state1/desc:ACPI FFH INTEL MWAIT 0x0
> state1/latency:1
> state1/name:C1
> state1/power:1000
> state1/time:433855186
> state1/usage:126869
> state2/desc:ACPI FFH INTEL MWAIT 0x10
> state2/latency:64
> state2/name:C2
> state2/power:500
> state2/time:198095020416
> state2/usage:76287744
> 
> C2 latency seems to be 64  (us ?), while C1 seems to be 1
> 
> BIOS Information
> 	Vendor: HP
> 	Version: I24
> 	Release Date: 10/01/2009
> 
> # powertop
> PowerTOP 1.11   (C) 2007, 2008 Intel Corporation 
> 
> Collecting data for 5 seconds 
> 
> 
> Your CPU supports the following C-states : C1 C2 C3 
> Your BIOS reports the following C-states : C1 C2 
> 
> C3 seems to be disabled in BIOS
> 

I took a look at BIOS settings and enabled the minimum sleep state to be
C6 (instead of C3, the default). Now we see C3 being available...

No changes, only more IPI delivered during the test, and more overhead
in clockevents_notify()

# grep . */*
state0/desc:CPUIDLE CORE POLL IDLE
state0/latency:0
state0/name:C0
state0/power:4294967295
state0/time:0
state0/usage:0
state1/desc:ACPI FFH INTEL MWAIT 0x0
state1/latency:1
state1/name:C1
state1/power:1000
state1/time:39432
state1/usage:119
state2/desc:ACPI FFH INTEL MWAIT 0x10
state2/latency:64
state2/name:C2
state2/power:500
state2/time:3170745
state2/usage:11177
state3/desc:ACPI FFH INTEL MWAIT 0x20
state3/latency:96
state3/name:C3
state3/power:350
state3/time:1030987453
state3/usage:14047019

---------------------------------------------------------------------------------------------------------------------------
   PerfTop:   15984 irqs/sec  kernel:98.5% [1000Hz cycles],  (all, 16 CPUs)
---------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function                       DSO
             _______ _____ ______________________________ _______

            23822.00 40.2% _raw_spin_lock_irqsave         vmlinux
             4413.00  7.4% acpi_os_read_port              vmlinux
             1426.00  2.4% _raw_spin_lock                 vmlinux
             1284.00  2.2% _raw_spin_unlock_irqrestore    vmlinux
             1247.00  2.1% schedule                       vmlinux
             1137.00  1.9% bnx2x_rx_int                   vmlinux
              643.00  1.1% tick_broadcast_oneshot_control vmlinux
              597.00  1.0% copy_user_generic_string       vmlinux
              595.00  1.0% __napi_complete                vmlinux
              550.00  0.9% call_function_single_interrupt vmlinux
              548.00  0.9% bnx2x_msix_fp_int              vmlinux
              486.00  0.8% __netif_receive_skb            vmlinux
              461.00  0.8% bnx2x_poll                     vmlinux
              433.00  0.7% eth_type_trans                 vmlinux
              428.00  0.7% acpi_idle_enter_bm             vmlinux
              422.00  0.7% sock_recv_ts_and_drops         vmlinux
              382.00  0.6% __udp4_lib_lookup              vmlinux
              369.00  0.6% __slab_free                    vmlinux
              357.00  0.6% ip_route_input                 vmlinux
              341.00  0.6% kfree                          vmlinux
              335.00  0.6% ipt_do_table                   vmlinux
              334.00  0.6% ip_rcv                         vmlinux
              332.00  0.6% udp_recvmsg                    vmlinux
              317.00  0.5% __kmalloc_node_track_caller    vmlinux

    37.46%            init  [kernel.kallsyms]             [k] _raw_spin_lock_irqsave
                      |
                      --- _raw_spin_lock_irqsave
                         |          
                         |--95.58%-- clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm
                         |          cpuidle_idle_call
                         |          cpu_idle
                         |          start_secondary
                         |          
                         |--3.27%-- tick_broadcast_oneshot_control
                         |          tick_notify
                         |          notifier_call_chain
                         |          __raw_notifier_call_chain
                         |          raw_notifier_call_chain
                         |          clockevents_do_notify
                         |          clockevents_notify
                         |          lapic_timer_state_broadcast
                         |          acpi_idle_enter_bm



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 10:54                             ` Eric Dumazet
  2010-05-02 14:13                               ` Arjan van de Ven
@ 2010-05-02 15:46                               ` Andi Kleen
  2010-05-02 16:35                                 ` Eric Dumazet
  1 sibling, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-02 15:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

> In this test, all 16 queues of one BCM57711E NIC (1Gb link) delivers
>  packets at about 1.300.000 pps to 16 cpus (one cpu per queue) and these
> packets are then redistributed by RPS to same 16 cpus, generating about
> 650.000 IPI per second.

BTW if rps was SMT aware it could avoid a lot of the IPIs in the first place.

-Andi


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 15:46                               ` Andi Kleen
@ 2010-05-02 16:35                                 ` Eric Dumazet
  2010-05-02 17:43                                   ` Arjan van de Ven
  2010-05-02 21:25                                   ` Andi Kleen
  0 siblings, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 16:35 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

Le dimanche 02 mai 2010 à 17:46 +0200, Andi Kleen a écrit :
> > In this test, all 16 queues of one BCM57711E NIC (1Gb link) delivers
> >  packets at about 1.300.000 pps to 16 cpus (one cpu per queue) and these
> > packets are then redistributed by RPS to same 16 cpus, generating about
> > 650.000 IPI per second.
> 
> BTW if rps was SMT aware it could avoid a lot of the IPIs in the first place.

RPS does what you want, just stick in a good cpumask, not an unaware one :)

In my test, I specifically do something 'stupid' like :

echo fffe >/sys/class/net/bond0.2240/queues/rx-0/rps_cpus
echo fffd >/sys/class/net/bond0.2240/queues/rx-1/rps_cpus
echo fffb >/sys/class/net/bond0.2240/queues/rx-2/rps_cpus
echo fff7 >/sys/class/net/bond0.2240/queues/rx-3/rps_cpus

echo ffef >/sys/class/net/bond0.2240/queues/rx-4/rps_cpus
echo ffdf >/sys/class/net/bond0.2240/queues/rx-5/rps_cpus
echo ffbf >/sys/class/net/bond0.2240/queues/rx-6/rps_cpus
echo ff7f >/sys/class/net/bond0.2240/queues/rx-7/rps_cpus

echo feff >/sys/class/net/bond0.2240/queues/rx-8/rps_cpus
echo fdff >/sys/class/net/bond0.2240/queues/rx-9/rps_cpus
echo fbff >/sys/class/net/bond0.2240/queues/rx-10/rps_cpus
echo f7ff >/sys/class/net/bond0.2240/queues/rx-11/rps_cpus

echo efff >/sys/class/net/bond0.2240/queues/rx-12/rps_cpus
echo dfff >/sys/class/net/bond0.2240/queues/rx-13/rps_cpus
echo bfff >/sys/class/net/bond0.2240/queues/rx-14/rps_cpus
echo 7fff >/sys/class/net/bond0.2240/queues/rx-15/rps_cpus

echo 0001 >/proc/irq/*/eth1-fp-0/../smp_affinity
echo 0002 >/proc/irq/*/eth1-fp-1/../smp_affinity
echo 0004 >/proc/irq/*/eth1-fp-2/../smp_affinity
echo 0008 >/proc/irq/*/eth1-fp-3/../smp_affinity
echo 0010 >/proc/irq/*/eth1-fp-4/../smp_affinity
echo 0020 >/proc/irq/*/eth1-fp-5/../smp_affinity
echo 0040 >/proc/irq/*/eth1-fp-6/../smp_affinity
echo 0080 >/proc/irq/*/eth1-fp-7/../smp_affinity
echo 0100 >/proc/irq/*/eth1-fp-8/../smp_affinity
echo 0200 >/proc/irq/*/eth1-fp-9/../smp_affinity
echo 0400 >/proc/irq/*/eth1-fp-10/../smp_affinity
echo 0800 >/proc/irq/*/eth1-fp-11/../smp_affinity
echo 1000 >/proc/irq/*/eth1-fp-12/../smp_affinity
echo 2000 >/proc/irq/*/eth1-fp-13/../smp_affinity
echo 4000 >/proc/irq/*/eth1-fp-14/../smp_affinity
echo 8000 >/proc/irq/*/eth1-fp-15/../smp_affinity


You mean we can wakeup a thread with something else than an IPI ?




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 16:35                                 ` Eric Dumazet
@ 2010-05-02 17:43                                   ` Arjan van de Ven
  2010-05-02 17:47                                     ` Eric Dumazet
  2010-05-02 21:25                                   ` Andi Kleen
  1 sibling, 1 reply; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-02 17:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

On Sun, 02 May 2010 18:35:31 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote
> 
> 
> You mean we can wakeup a thread with something else than an IPI ?
> 

actually we can.

mwait is not only "go idle", it is "go idle until someone writes to
<THIS> cacheline". where <THIS> is set up with a "monitor" instruction.
We don't need to send an ipi per se.. all we need is to write to the
right cacheline that we're monitoring.


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 17:43                                   ` Arjan van de Ven
@ 2010-05-02 17:47                                     ` Eric Dumazet
  0 siblings, 0 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 17:47 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

Le dimanche 02 mai 2010 à 10:43 -0700, Arjan van de Ven a écrit :
> On Sun, 02 May 2010 18:35:31 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote
> > 
> > 
> > You mean we can wakeup a thread with something else than an IPI ?
> > 
> 
> actually we can.
> 
> mwait is not only "go idle", it is "go idle until someone writes to
> <THIS> cacheline". where <THIS> is set up with a "monitor" instruction.
> We don't need to send an ipi per se.. all we need is to write to the
> right cacheline that we're monitoring.
> 
> 

Thats a bit x86 specific, isnt it ?

But we want to eventually send a 'signal' to a cpu, even if not blocked
in idle, so that it can do following action :

/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
        struct softnet_data *sd = data;

        __napi_schedule(&sd->backlog);
        __get_cpu_var(netdev_rx_stat).received_rps++;
}

And it also should be portable ;)

If something else than an IPI is available, please let us know !

Thanks



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 14:27                                 ` Eric Dumazet
  2010-05-02 15:32                                   ` Eric Dumazet
@ 2010-05-02 17:54                                   ` Arjan van de Ven
  2010-05-02 19:22                                     ` Eric Dumazet
  2010-05-02 21:30                                     ` Andi Kleen
  1 sibling, 2 replies; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-02 17:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

On Sun, 02 May 2010 16:27:28 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> C2 latency seems to be 64  (us ?), while C1 seems to be 1

the processor_idle module has a "latency_factor" module parameter.
The default is 2, but sometimes people think 6 is a better value...
.. any chance you can try that value ?

Also, I'm starting to wonder if Andi's patch to use io_schedule() needs
to be replaced with a net_schedule() kind of thing. The cpuidle code
currently has a weight factor for IO (based on measuring/experiments),
and maybe networking really needs another factor... so just having a
parallel concept with a different weight could be the right answer for
that.



> 
> Your CPU supports the following C-states : C1 C2 C3 
> Your BIOS reports the following C-states : C1 C2 
> 
> C3 seems to be disabled in BIOS

btw this C2 == marketing name C3, and C3 == marketing name C6

(too many translations ;-)

we'll fix powertop to report the marketing name soon.


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 17:54                                   ` Arjan van de Ven
@ 2010-05-02 19:22                                     ` Eric Dumazet
  2010-05-02 22:06                                       ` Andi Kleen
  2010-05-03  3:50                                       ` Arjan van de Ven
  2010-05-02 21:30                                     ` Andi Kleen
  1 sibling, 2 replies; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 19:22 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

Le dimanche 02 mai 2010 à 10:54 -0700, Arjan van de Ven a écrit :
> On Sun, 02 May 2010 16:27:28 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > C2 latency seems to be 64  (us ?), while C1 seems to be 1
> 
> the processor_idle module has a "latency_factor" module parameter.
> The default is 2, but sometimes people think 6 is a better value...
> .. any chance you can try that value ?
> 

I tried 6 and 20, nothing changed ;(

> Also, I'm starting to wonder if Andi's patch to use io_schedule() needs
> to be replaced with a net_schedule() kind of thing. The cpuidle code
> currently has a weight factor for IO (based on measuring/experiments),
> and maybe networking really needs another factor... so just having a
> parallel concept with a different weight could be the right answer for
> that.
> 

But a task blocked on disk IO is probably blocked for a small amount of
time, while on network, it can be for a long time. I am not sure it's the
right metric.

I was expecting something based on recent history.
Say if we have 20.000 wakeups per second, most likely we should not
enter C2/C3 states...

> 
> we'll fix powertop to report the marketing name soon.
> 
> 

Ah, I see, thanks :)



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 16:35                                 ` Eric Dumazet
  2010-05-02 17:43                                   ` Arjan van de Ven
@ 2010-05-02 21:25                                   ` Andi Kleen
  2010-05-02 21:45                                     ` Eric Dumazet
  1 sibling, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-02 21:25 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

> You mean we can wakeup a thread with something else than an IPI ?

It's pointless to send an IPI to your thread sibling for this. 
Everything it could do you can do yourself too with the same performance.

-Andi

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 17:54                                   ` Arjan van de Ven
  2010-05-02 19:22                                     ` Eric Dumazet
@ 2010-05-02 21:30                                     ` Andi Kleen
  1 sibling, 0 replies; 108+ messages in thread
From: Andi Kleen @ 2010-05-02 21:30 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

On Sun, May 02, 2010 at 10:54:18AM -0700, Arjan van de Ven wrote:
> On Sun, 02 May 2010 16:27:28 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > C2 latency seems to be 64  (us ?), while C1 seems to be 1
> 
> the processor_idle module has a "latency_factor" module parameter.
> The default is 2, but sometimes people think 6 is a better value...
> .. any chance you can try that value ?
> 
> Also, I'm starting to wonder if Andi's patch to use io_schedule() needs
> to be replaced with a net_schedule() kind of thing. The cpuidle code
> currently has a weight factor for IO (based on measuring/experiments),
> and maybe networking really needs another factor... so just having a
> parallel concept with a different weight could be the right answer for
> that.

We definitely need a net_schedule() for other reasons too: to avoid the blkio 
wait code and then also because networking needs a short "fast idle" timeout 
because the delays are not bounded.  

Otherwise a sender that suddenly stops sending could break all your power 
saving.

I think the reference count used in io_schedule is not the right model for 
this; it probably needs a per cpu timeout ("be fast until this time"). Possibly 
a dynamic one fed by the measured input rate.

-Andi

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 21:25                                   ` Andi Kleen
@ 2010-05-02 21:45                                     ` Eric Dumazet
  2010-05-02 21:54                                       ` Andi Kleen
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 21:45 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

Le dimanche 02 mai 2010 à 23:25 +0200, Andi Kleen a écrit :

> It's pointless to send an IPI to your thread sibling for this. 
> Everything it could do you can do yourself too with the same performance.
> 
> -Andi

Amen

Tests just prove the reverse.

I have some collegues that disable HyperThreading for exact same
reasons. I wonder why Intel designed HT. Should be marketing I guess.




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 21:45                                     ` Eric Dumazet
@ 2010-05-02 21:54                                       ` Andi Kleen
  2010-05-02 22:08                                         ` Eric Dumazet
  0 siblings, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-02 21:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

On Sun, May 02, 2010 at 11:45:55PM +0200, Eric Dumazet wrote:
> Le dimanche 02 mai 2010 à 23:25 +0200, Andi Kleen a écrit :
> 
> > It's pointless to send an IPI to your thread sibling for this. 
> > Everything it could do you can do yourself too with the same performance.
> > 
> > -Andi
> 
> Amen

That is in terms of cache locality.

> 
> Tests just prove the reverse.

What do you mean? 

> 
> I have some colleagues that disable HyperThreading for exact same
> reasons. I wonder why Intel designed HT. Should be marketing I guess.

HT (especially Nehalem HT) is useful for a wide range of workloads.
Just handling network interrupts for its thread sibling is not one of them.

-Andi


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 19:22                                     ` Eric Dumazet
@ 2010-05-02 22:06                                       ` Andi Kleen
  2010-05-03  3:50                                       ` Arjan van de Ven
  1 sibling, 0 replies; 108+ messages in thread
From: Andi Kleen @ 2010-05-02 22:06 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Arjan van de Ven, David Miller, hadi, xiaosuo, therbert,
	shemminger, netdev, lenb

> But a task blocked on disk IO is probably blocked for a small amount of
> time, while on network, it can be for a long time. I am not sure its the
> right metric.

I think it needs a dynamic timeout.

I agree the reference count as is will not work well for networking.

> 
> I was expecting something based on recent history.
> Say if we have 20.000 wakeups per second, most likely we should not
> enter C2/C3 states...

That's what the menu governor already does, it just doesn't work
in some cases :/

-Andi


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 21:54                                       ` Andi Kleen
@ 2010-05-02 22:08                                         ` Eric Dumazet
  2010-05-03 20:15                                           ` jamal
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-02 22:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, hadi, xiaosuo, therbert, shemminger, netdev, lenb, arjan

Le dimanche 02 mai 2010 à 23:54 +0200, Andi Kleen a écrit :
> On Sun, May 02, 2010 at 11:45:55PM +0200, Eric Dumazet wrote:

> > Tests just prove the reverse.
> 
> What do you mean? 
> 

Test I did this week with Jamal.

We first set a "ee" rps mask, because all NIC interrupts were handled by
CPU0, and Jamal thought like you, that not using cpu4 would give better
performance.

But using "fe" mask gave me a bonus, from ~700.000 pps to ~800.000 pps

CPU : E5450  @3.00GHz
Two quad-core cpus in the machine, tg3 NIC.

With RPS, CPU0 does not do a lot of things, just talk with the NIC, bring a
few cache lines per packet and dispatch it to a slave cpu.



> HT (especially Nehalem HT) is useful for a wide range of workloads.
> Just handling network interrupts for its thread sibling is not one of them.
> 

That's the theory, now in practice I see different results.

Of course, this might be related to hash distribution being different
and more uniform.

I should redo the test with many more flows.




^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 19:22                                     ` Eric Dumazet
  2010-05-02 22:06                                       ` Andi Kleen
@ 2010-05-03  3:50                                       ` Arjan van de Ven
  2010-05-03  5:17                                         ` Eric Dumazet
  1 sibling, 1 reply; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-03  3:50 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

> > Also, I'm starting to wonder if Andi's patch to use io_schedule()
> > needs to be replaced with a net_schedule() kind of thing. The
> > cpuidle code currently has a weight factor for IO (based on
> > measuring/experiments), and maybe networking really needs another
> > factor... so just having a parallel concept with a different weight
> > could be the right answer for that.
> > 
> 
> But a task blocked on disk IO is probably blocked for a small amount
> of time, while on network, it can be for a long time. I am not sure
> its the right metric.

it's not so much about the duration, as it is about the performance
sensitivity....

 
> I was expecting something based on recent history.
> Say if we have 20.000 wakeups per second, most likely we should not
> enter C2/C3 states...

we effectively do that. The thing is that C2 is so low cost normally
that it's still worth it even at 20k wakeups...

this is where the bios tells us how "heavy" the states are....
and 64 usec... is just not very much.



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-03  3:50                                       ` Arjan van de Ven
@ 2010-05-03  5:17                                         ` Eric Dumazet
  2010-05-03 10:22                                           ` Arjan van de Ven
  0 siblings, 1 reply; 108+ messages in thread
From: Eric Dumazet @ 2010-05-03  5:17 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

Le dimanche 02 mai 2010 à 20:50 -0700, Arjan van de Ven a écrit :

> we effectively do that. The thing is that C2 is so low cost normally
> that it's still worth it even at 20k wakeups...
> 
> this is where the bios tells us how "heavy" the states are....
> and 64 usec... is just not very much.

Maybe its low cost, (apparently, it is, since I can reach ~900.000 ipis
on my 16 cores machine) but multiply this by 16 or 32 or 64 cpus, and
clockevents_notify() cost appears to be a killer, all cpus compete on a
single lock.

Maybe this notifier could use RCU ?



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-03  5:17                                         ` Eric Dumazet
@ 2010-05-03 10:22                                           ` Arjan van de Ven
  2010-05-03 10:34                                             ` Andi Kleen
  0 siblings, 1 reply; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-03 10:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

On Mon, 03 May 2010 07:17:14 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le dimanche 02 mai 2010 à 20:50 -0700, Arjan van de Ven a écrit :
> 
> > we effectively do that. The thing is that C2 is so low cost normally
> > that it's still worth it even at 20k wakeups...
> > 
> > this is where the bios tells us how "heavy" the states are....
> > and 64 usec... is just not very much.
> 
> Maybe its low cost, (apparently, it is, since I can reach ~900.000
> ipis on my 16 cores machine) but multiply this by 16 or 32 or 64
> cpus, and clockevents_notify() cost appears to be a killer, all cpus
> compete on a single lock.
> 
> Maybe this notifier could use RCU ?

could this be an artifact of the local apic stopping in deeper C states?
(which is finally fixed in the Westmere generation)



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-03 10:22                                           ` Arjan van de Ven
@ 2010-05-03 10:34                                             ` Andi Kleen
  2010-05-03 14:09                                               ` Arjan van de Ven
  0 siblings, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-03 10:34 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Eric Dumazet, Andi Kleen, David Miller, hadi, xiaosuo, therbert,
	shemminger, netdev, lenb

> > Maybe its low cost, (apparently, it is, since I can reach ~900.000
> > ipis on my 16 cores machine) but multiply this by 16 or 32 or 64
> > cpus, and clockevents_notify() cost appears to be a killer, all cpus
> > compete on a single lock.
> > 
> > Maybe this notifier could use RCU ?
> 
> could this be an artifact of the local apic stopping in deeper C states?
> (which is finally fixed in the Westmere generation)

Yes it is I think.

But I suspect Eric wants a solution for Nehalem.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-03 10:34                                             ` Andi Kleen
@ 2010-05-03 14:09                                               ` Arjan van de Ven
  2010-05-03 14:45                                                 ` Brian Bloniarz
  2010-05-03 15:52                                                 ` Andi Kleen
  0 siblings, 2 replies; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-03 14:09 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

On Mon, 3 May 2010 12:34:26 +0200
Andi Kleen <andi@firstfloor.org> wrote:

> > > Maybe its low cost, (apparently, it is, since I can reach ~900.000
> > > ipis on my 16 cores machine) but multiply this by 16 or 32 or 64
> > > cpus, and clockevents_notify() cost appears to be a killer, all
> > > cpus compete on a single lock.
> > > 
> > > Maybe this notifier could use RCU ?
> > 
> > could this be an artifact of the local apic stopping in deeper C
> > states? (which is finally fixed in the Westmere generation)
> 
> Yes it is I think.
> 
> But I suspect Eric wants a solution for Nehalem.

sure ;-)


so the hard problem is that on going idle, the local timers need to be
funneled to the external HPET. Afaik right now we use one channel of
the hpet, with the result that we have one global lock for this.

HPETs have more than one channel (2 or 3 historically, newer chipsets
iirc have a few more), so in principle we can split this lock at least
a little bit... if we can get to one hpet channel per level 3 cache
domain we'd already make huge progress in terms of cost of the
contention....



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-03 14:09                                               ` Arjan van de Ven
@ 2010-05-03 14:45                                                 ` Brian Bloniarz
  2010-05-04  1:10                                                   ` Arjan van de Ven
  2010-05-03 15:52                                                 ` Andi Kleen
  1 sibling, 1 reply; 108+ messages in thread
From: Brian Bloniarz @ 2010-05-03 14:45 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, Eric Dumazet, David Miller, hadi, xiaosuo, therbert,
	shemminger, netdev, lenb

Arjan van de Ven wrote:
> On Mon, 3 May 2010 12:34:26 +0200
> Andi Kleen <andi@firstfloor.org> wrote:
> 
>>>> Maybe its low cost, (apparently, it is, since I can reach ~900.000
>>>> ipis on my 16 cores machine) but multiply this by 16 or 32 or 64
>>>> cpus, and clockevents_notify() cost appears to be a killer, all
>>>> cpus compete on a single lock.
>>>>
>>>> Maybe this notifier could use RCU ?
>>> could this be an artifact of the local apic stopping in deeper C
>>> states? (which is finally fixed in the Westmere generation)
>> Yes it is I think.
>>
>> But I suspect Eric wants a solution for Nehalem.
> 
> sure ;-)
> 
> 
> so the hard problem is that on going idle, the local timers need to be
> funneled to the external HPET. Afaik right now we use one channel of
> the hpet, with the result that we have one global lock for this.

Does the HPET only need to be programmed when going idle?
That could mean that this isn't a big performance issue.
Who cares if you spin for a while when you're about to sleep for
at least 60usec?

> HPETs have more than one channel (2 or 3 historically, newer chipsets
> iirc have a few more), so in principle we can split this lock at least
> a little bit... if we can get to one hpet channel per level 3 cache
> domain we'd already make huge progress in terms of cost of the
> contention....

Another possible approach: if a core needs the HPET and finds it
locked, it could queue up its request to a backlog which the
locking core will service.

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-03 14:09                                               ` Arjan van de Ven
  2010-05-03 14:45                                                 ` Brian Bloniarz
@ 2010-05-03 15:52                                                 ` Andi Kleen
  2010-05-04  1:11                                                   ` Arjan van de Ven
  1 sibling, 1 reply; 108+ messages in thread
From: Andi Kleen @ 2010-05-03 15:52 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

> so the hard problem is that on going idle, the local timers need to be
> funneled to the external HPET. Afaik right now we use one channel of
> the hpet, with the result that we have one global lock for this.
> 
> HPETs have more than one channel (2 or 3 historically, newer chipsets
> iirc have a few more), so in principle we can split this lock at least
> a little bit... if we can get to one hpet channel per level 3 cache
> domain we'd already make huge progress in terms of cost of the
> contention....

I suggested the same thing a few emails up @) (great minds think 
alike etc.etc. @) . 

I'm not sure how difficult it would be to implement though.

Potential issues:

Some user applications use the hpet channels directly through
the character device interface so there would be a potential
compatibility issue (but maybe that should be just moved
to be emulated with a hrtimer ?)

And if multiple broadcast controllers are elected this might
make it harder to become idle.

-Andi

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH net-next-2.6] net: speedup udp receive path
  2010-05-01 11:56                                                 ` jamal
  2010-05-01 13:22                                                   ` Eric Dumazet
@ 2010-05-03 20:10                                                   ` jamal
  1 sibling, 0 replies; 108+ messages in thread
From: jamal @ 2010-05-03 20:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz

On Sat, 2010-05-01 at 07:56 -0400, jamal wrote:
> On Sat, 2010-05-01 at 13:42 +0200, Eric Dumazet wrote:
> 
> > But, whole point of epoll is to not change interest each time you get an
> > event.
> > 
> > Without EV_PERSIST, you need two more syscalls per recvfrom()
> > 
> > epoll_wait()
> >  epoll_ctl(REMOVE)
> >  epoll_ctl(ADD)
> >  recvfrom()
> > 
> > Even poll() would be faster in your case
> > 
> > poll(one fd)
> > recvfrom()
> > 
> 
> This is true - but my goal was/is to replicate the regression i was
> seeing[1]. 
> I will try with PERSIST next opportunity. If it gets better
> then it is something that needs documentation in the doc Tom
> promised ;->

I tried it with PERSIST and today's net-next and you are right:
rps was better compared with (99.4% vs 98.1% of 750Kpps).
If however i removed the PERSIST i.e both rps and non-rps
have two extra syscalls, again rps performed worse (93.2% vs 97.8%
of 750Kpps). Eric, I know the answer is not to do the non-PERSIST mode
for rps ;-> But lets just ignore that for a sec:
what the heck is going on? I would expect the degradation to be the same
for both non-rps. 
I also wanna do the broken record reminder that kernels before net-next
of Apr14 were doing about 97% (as opposed to 93% currently for same
test).

cheers,
jamal


^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-02 22:08                                         ` Eric Dumazet
@ 2010-05-03 20:15                                           ` jamal
  0 siblings, 0 replies; 108+ messages in thread
From: jamal @ 2010-05-03 20:15 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, David Miller, xiaosuo, therbert, shemminger, netdev,
	lenb, arjan

On Mon, 2010-05-03 at 00:08 +0200, Eric Dumazet wrote:

> 
> Test I did this week with Jamal.
> 
> We first set a "ee" rps mask, because all NIC interrupts were handled by
> CPU0, and Jamal thought like you, that not using cpu4 would give better
> performance.
> 
> But using "fe" mask gave me a bonus, from ~700.000 pps to ~800.000 pps
> 

I am seeing the opposite with my machine (Nehalem):
with ee i get 99.4% and fe i get 94.2% whereas non-rps
is about 98.1%.


cheers,
jamal

PS:- sorry don't have time to collect a lot more data - tomorrow i could
do more.



^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-03 14:45                                                 ` Brian Bloniarz
@ 2010-05-04  1:10                                                   ` Arjan van de Ven
  0 siblings, 0 replies; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-04  1:10 UTC (permalink / raw)
  To: Brian Bloniarz
  Cc: Andi Kleen, Eric Dumazet, David Miller, hadi, xiaosuo, therbert,
	shemminger, netdev, lenb

On Mon, 03 May 2010 10:45:07 -0400
Brian Bloniarz <bmb@athenacr.com> wrote:

> > so the hard problem is that on going idle, the local timers need to
> > be funneled to the external HPET. Afaik right now we use one
> > channel of the hpet, with the result that we have one global lock
> > for this.
> 
> Does the HPET only need to be programmed when going idle?

correct; when going idle the per logical CPU timer value needs
to be put in the global HPET (assuming 1 channel is in use).
This "global" is where the lock comes in.

> That could mean that this isn't a big performance issue.
> cares if you spin for a while when you're about to sleep for
> at least 60usec?

depends on how long the sleep is ;-)


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 108+ messages in thread

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
  2010-05-03 15:52                                                 ` Andi Kleen
@ 2010-05-04  1:11                                                   ` Arjan van de Ven
  0 siblings, 0 replies; 108+ messages in thread
From: Arjan van de Ven @ 2010-05-04  1:11 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Eric Dumazet, David Miller, hadi, xiaosuo, therbert, shemminger,
	netdev, lenb

On Mon, 3 May 2010 17:52:04 +0200
Andi Kleen <andi@firstfloor.org> wrote:
> > HPETs have more than one channel (2 or 3 historically, newer
> > chipsets iirc have a few more), so in principle we can split this
> > lock at least a little bit... if we can get to one hpet channel per
> > level 3 cache domain we'd already make huge progress in terms of
> > cost of the contention....
> 
> I suggested the same thing a few emails up @) (great minds think 
> alike etc.etc. @) . 
> 
> I'm not sure how difficult it would be to implement though.

the hardest part will be cases where the SMM code borrows higher HPET
channels or something.. not sure if they do, but.. color me a bit afraid
we'll find cases.


> 
> Potential issues:
> 
> Some user applications use the hpet channels directly through
> the character device interface so there would be a potential
> compatibility issue (but maybe that should be just moved
> to be emulated with a hrtimer ?)

we can and should just emulate this. Same for the rtc device I suspect.

 
> And if multiple broadcast controllers are elected this might
> make it harder to become idle.

not quite, as long as you do a directed broadcast. As long as there's a
predictable mapping for which cores group to which hpet channel.. won't
be that bad since you only need to wake up your own local subset.



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 108+ messages in thread

end of thread, other threads:[~2010-05-04  1:09 UTC | newest]

Thread overview: 108+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-23  8:12 [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue Changli Gao
2010-04-23  9:27 ` Eric Dumazet
2010-04-23 22:02   ` jamal
2010-04-24 14:10     ` jamal
2010-04-26 14:03       ` Eric Dumazet
2010-04-26 14:55         ` Eric Dumazet
2010-04-26 21:06           ` jamal
     [not found]           ` <20100429174056.GA8044@gargoyle.fritz.box>
2010-04-29 17:56             ` Eric Dumazet
2010-04-29 18:10               ` OFT - reserving CPU's for networking Stephen Hemminger
2010-04-29 19:19                 ` Thomas Gleixner
2010-04-29 20:02                   ` Eric Dumazet
2010-04-30 18:15                     ` Brian Bloniarz
2010-04-30 18:57                   ` David Miller
2010-04-30 19:58                     ` Thomas Gleixner
2010-04-30 21:01                     ` Andi Kleen
2010-04-30 22:30                       ` David Miller
2010-05-01 10:53                         ` Andi Kleen
2010-05-01 22:03                           ` David Miller
2010-05-01 22:58                             ` Andi Kleen
2010-05-01 23:29                               ` David Miller
2010-05-01 23:44                             ` Ben Hutchings
2010-05-01 20:31                     ` Martin Josefsson
2010-05-01 22:13                       ` David Miller
     [not found]               ` <20100429182347.GA8512@gargoyle.fritz.box>
2010-04-29 19:12                 ` [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue Eric Dumazet
     [not found]                   ` <20100429214144.GA10663@gargoyle.fritz.box>
2010-04-30  5:25                     ` Eric Dumazet
2010-04-30 23:38                     ` David Miller
2010-05-01 11:00                       ` Andi Kleen
2010-05-02  6:56                         ` Eric Dumazet
2010-05-02  9:20                           ` Andi Kleen
2010-05-02 10:54                             ` Eric Dumazet
2010-05-02 14:13                               ` Arjan van de Ven
2010-05-02 14:27                                 ` Eric Dumazet
2010-05-02 15:32                                   ` Eric Dumazet
2010-05-02 17:54                                   ` Arjan van de Ven
2010-05-02 19:22                                     ` Eric Dumazet
2010-05-02 22:06                                       ` Andi Kleen
2010-05-03  3:50                                       ` Arjan van de Ven
2010-05-03  5:17                                         ` Eric Dumazet
2010-05-03 10:22                                           ` Arjan van de Ven
2010-05-03 10:34                                             ` Andi Kleen
2010-05-03 14:09                                               ` Arjan van de Ven
2010-05-03 14:45                                                 ` Brian Bloniarz
2010-05-04  1:10                                                   ` Arjan van de Ven
2010-05-03 15:52                                                 ` Andi Kleen
2010-05-04  1:11                                                   ` Arjan van de Ven
2010-05-02 21:30                                     ` Andi Kleen
2010-05-02 15:46                               ` Andi Kleen
2010-05-02 16:35                                 ` Eric Dumazet
2010-05-02 17:43                                   ` Arjan van de Ven
2010-05-02 17:47                                     ` Eric Dumazet
2010-05-02 21:25                                   ` Andi Kleen
2010-05-02 21:45                                     ` Eric Dumazet
2010-05-02 21:54                                       ` Andi Kleen
2010-05-02 22:08                                         ` Eric Dumazet
2010-05-03 20:15                                           ` jamal
2010-04-26 21:03         ` jamal
2010-04-23 10:26 ` Eric Dumazet
2010-04-27 22:08   ` David Miller
2010-04-27 22:18     ` [PATCH net-next-2.6] bnx2x: Remove two prefetch() Eric Dumazet
2010-04-27 22:19       ` David Miller
2010-04-28 13:14         ` Eilon Greenstein
2010-04-28 15:44           ` Eliezer Tamir
2010-04-28 16:53           ` David Miller
     [not found]           ` <w2ue8f3c3211004280842r9f2589e8qb8fd4b7933cd9756@mail.gmail.com>
2010-04-28 16:55             ` David Miller
2010-04-28 11:33       ` jamal
2010-04-28 12:33         ` Eric Dumazet
2010-04-28 12:36           ` jamal
2010-04-28 14:06             ` [PATCH net-next-2.6] net: speedup udp receive path Eric Dumazet
2010-04-28 14:19               ` Eric Dumazet
2010-04-28 14:34                 ` Eric Dumazet
2010-04-28 21:36               ` David Miller
2010-04-28 22:22                 ` [PATCH net-next-2.6] net: ip_queue_rcv_skb() helper Eric Dumazet
2010-04-28 22:39                   ` David Miller
2010-04-28 23:44               ` [PATCH net-next-2.6] net: speedup udp receive path jamal
2010-04-29  0:00                 ` jamal
2010-04-29  4:09                 ` Eric Dumazet
2010-04-29 11:35                   ` jamal
2010-04-29 12:12                     ` Changli Gao
2010-04-29 12:45                       ` Eric Dumazet
2010-04-29 13:17                         ` jamal
2010-04-29 13:21                           ` Eric Dumazet
2010-04-29 13:37                             ` jamal
2010-04-29 13:49                               ` Eric Dumazet
2010-04-29 13:56                                 ` jamal
2010-04-29 20:36                                   ` jamal
2010-04-29 21:01                                     ` [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion Eric Dumazet
2010-04-30 13:55                                       ` Brian Bloniarz
2010-04-30 17:26                                         ` Eric Dumazet
2010-04-30 23:35                                       ` David Miller
2010-05-01  4:56                                         ` Eric Dumazet
2010-05-01  7:02                                         ` Eric Dumazet
2010-05-01  8:03                                           ` Eric Dumazet
2010-05-01 22:00                                             ` David Miller
2010-04-30 19:30                                     ` [PATCH net-next-2.6] net: speedup udp receive path jamal
2010-04-30 20:40                                       ` Eric Dumazet
2010-05-01  0:06                                         ` jamal
2010-05-01  5:57                                           ` Eric Dumazet
2010-05-01  6:14                                             ` Eric Dumazet
2010-05-01 10:24                                               ` Changli Gao
2010-05-01 10:47                                                 ` Eric Dumazet
2010-05-01 11:29                                               ` jamal
2010-05-01 11:23                                             ` jamal
2010-05-01 11:42                                               ` Eric Dumazet
2010-05-01 11:56                                                 ` jamal
2010-05-01 13:22                                                   ` Eric Dumazet
2010-05-01 13:49                                                     ` jamal
2010-05-03 20:10                                                   ` jamal
2010-04-29 23:07                         ` Changli Gao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.