* [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
@ 2012-05-31 21:56 Eric Dumazet
  2012-05-31 23:03 ` David Miller
                   ` (3 more replies)
  0 siblings, 4 replies; 42+ messages in thread
From: Eric Dumazet @ 2012-05-31 21:56 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: netdev, Neal Cardwell, Tom Herbert, Jesper Dangaard Brouer

From: Eric Dumazet <edumazet@google.com>

pfifo_fast being the default Qdisc, it's pretty easy to fill it with
small SYNACK packets while the host is under a SYNFLOOD attack.

Packets of established TCP sessions are dropped and the host appears
almost dead.

Avoid this problem by assigning TC_PRIO_FILLER priority to SYNACKs
generated in SYNCOOKIE mode, so that these packets are enqueued into
pfifo_fast band 2.

Other packets, queued to band 0 or 1, are dequeued before any SYNACK
packets waiting in band 2.
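
For reference (an editorial sketch, not part of the patch), the
pfifo_fast priority-to-band mapping in net/sched/sch_generic.c looks
roughly like this; TC_PRIO_FILLER (1) maps to band 2, the last band
served, while the default TC_PRIO_BESTEFFORT (0) maps to band 1:

	/* skb->priority (0..15) -> pfifo_fast band (0 served first, 2 last) */
	static const u8 prio2band[TC_PRIO_MAX + 1] = {
		1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
	};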

Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
---
 net/dccp/ipv4.c                  |    3 +++
 net/ipv4/ip_output.c             |    2 +-
 net/ipv4/tcp_ipv4.c              |   13 +++++++++----
 net/ipv6/inet6_connection_sock.c |    1 +
 net/ipv6/ip6_output.c            |    2 +-
 net/ipv6/tcp_ipv6.c              |   10 +++++++---
 6 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 07f5579..d8a3d87 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -515,6 +515,8 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
 
 		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
 							      ireq->rmt_addr);
+		
+		skb->priority = sk->sk_priority;
 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 					    ireq->rmt_addr,
 					    ireq->opt);
@@ -556,6 +558,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
 	skb_dst_set(skb, dst_clone(dst));
 
 	bh_lock_sock(ctl_sk);
+	skb->priority = ctl_sk->sk_priority;
 	err = ip_build_and_send_pkt(skb, ctl_sk,
 				    rxiph->daddr, rxiph->saddr, NULL);
 	bh_unlock_sock(ctl_sk);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 451f97c..407e2fc 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -168,7 +168,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 		ip_options_build(skb, &opt->opt, daddr, rt, 0);
 	}
 
-	skb->priority = sk->sk_priority;
+	/* skb->priority is set by the caller */
 	skb->mark = sk->sk_mark;
 
 	/* Send it out. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a43b87d..613e713 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -81,7 +81,7 @@
 #include <linux/stddef.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-
+#include <linux/pkt_sched.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
 
@@ -824,7 +824,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 			      struct request_sock *req,
-			      struct request_values *rvp)
+			      struct request_values *rvp,
+			      bool syncookie)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct flowi4 fl4;
@@ -840,6 +841,9 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 
+		/* SYNACK sent in SYNCOOKIE mode have low priority */
+		skb->priority = syncookie ? TC_PRIO_FILLER : sk->sk_priority;
+
 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 					    ireq->rmt_addr,
 					    ireq->opt);
@@ -854,7 +858,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 			      struct request_values *rvp)
 {
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-	return tcp_v4_send_synack(sk, NULL, req, rvp);
+	return tcp_v4_send_synack(sk, NULL, req, rvp, false);
 }
 
 /*
@@ -1422,7 +1426,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
 	if (tcp_v4_send_synack(sk, dst, req,
-			       (struct request_values *)&tmp_ext) ||
+			       (struct request_values *)&tmp_ext,
+			       want_cookie) ||
 	    want_cookie)
 		goto drop_and_free;
 
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e6cee52..5812a74 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -248,6 +248,7 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
 	/* Restore final destination back after routing done */
 	fl6.daddr = np->daddr;
 
+	skb->priority = sk->sk_priority;
 	res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 	rcu_read_unlock();
 	return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 17b8c67..61c0ea8 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -241,7 +241,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	hdr->saddr = fl6->saddr;
 	hdr->daddr = *first_hop;
 
-	skb->priority = sk->sk_priority;
+	/* skb->priority is set by the caller */
 	skb->mark = sk->sk_mark;
 
 	mtu = dst_mtu(dst);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 554d599..b618413 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -43,6 +43,7 @@
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
 #include <linux/random.h>
+#include <linux/pkt_sched.h>
 
 #include <net/tcp.h>
 #include <net/ndisc.h>
@@ -476,7 +477,7 @@ out:
 
 
 static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
-			      struct request_values *rvp)
+			      struct request_values *rvp, bool syncookie)
 {
 	struct inet6_request_sock *treq = inet6_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -512,6 +513,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	if (skb) {
 		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
 
+		skb->priority = syncookie ? TC_PRIO_FILLER : sk->sk_priority;
 		fl6.daddr = treq->rmt_addr;
 		err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
 		err = net_xmit_eval(err);
@@ -528,7 +530,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
 			     struct request_values *rvp)
 {
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-	return tcp_v6_send_synack(sk, req, rvp);
+	return tcp_v6_send_synack(sk, req, rvp, false);
 }
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
@@ -906,6 +908,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
 	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(buff, dst);
+		skb->priority = ctl_sk->sk_priority;
 		ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
 		TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 		if (rst)
@@ -1213,7 +1216,8 @@ have_isn:
 	security_inet_conn_request(sk, skb, req);
 
 	if (tcp_v6_send_synack(sk, req,
-			       (struct request_values *)&tmp_ext) ||
+			       (struct request_values *)&tmp_ext,
+			       want_cookie) ||
 	    want_cookie)
 		goto drop_and_free;
 


* Re: [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
  2012-05-31 21:56 [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Eric Dumazet
@ 2012-05-31 23:03 ` David Miller
  2012-06-01  4:48   ` Eric Dumazet
  2012-06-01  7:00 ` [PATCH] tcp: do not create inetpeer on SYNACK message Eric Dumazet
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 42+ messages in thread
From: David Miller @ 2012-05-31 23:03 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hans.schillstrom, netdev, ncardwell, therbert, brouer


Is the net-next tree open yet?


* Re: [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
  2012-05-31 23:03 ` David Miller
@ 2012-06-01  4:48   ` Eric Dumazet
  2012-06-01 17:46     ` David Miller
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Dumazet @ 2012-06-01  4:48 UTC (permalink / raw)
  To: David Miller; +Cc: hans.schillstrom, netdev, ncardwell, therbert, brouer

On Thu, 2012-05-31 at 19:03 -0400, David Miller wrote:
> Is the net-next tree open yet?

David

Hans asked me to send a patch for testing, I sent it, and I made clear
it was not a fix for the current net tree.

RFC is like 'I throw a patch, I am not even 50% confident of it, please
comment and fix my bugs'. Most busy people just ignore it.

In this case, I know it really fixes a problem, but since it's a day-0
one, it's not meant for the net tree (linux-3.5).

I don't see what's wrong with ignoring patches and considering them
later, when net-next is open and the various Acked-by or Tested-by
signatures have been added, as other maintainers do.

Just make it clear that if a patch is no longer listed on
http://patchwork.ozlabs.org/project/netdev/list/ , the author is
responsible for resending it with all the added signatures?

We need to exchange ideas (aka patches), even in the merge window, or
when the subtree maintainer is busy doing his own job.

Thanks


* [PATCH] tcp: do not create inetpeer on SYNACK message
  2012-05-31 21:56 [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Eric Dumazet
  2012-05-31 23:03 ` David Miller
@ 2012-06-01  7:00 ` Eric Dumazet
  2012-06-01 18:24   ` David Miller
  2012-06-01 21:34   ` Hans Schillström
  2012-06-01  7:36 ` [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Hans Schillstrom
  2012-06-02  1:28 ` Dave Taht
  3 siblings, 2 replies; 42+ messages in thread
From: Eric Dumazet @ 2012-06-01  7:00 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: netdev, Jesper Dangaard Brouer, David Miller, Neal Cardwell, Tom Herbert

From: Eric Dumazet <edumazet@google.com>

Another problem during a SYNFLOOD/DDOS attack is the inetpeer cache
getting larger and larger, using lots of memory and cpu time.

tcp_v4_send_synack()
->inet_csk_route_req()
 ->ip_route_output_flow()
  ->rt_set_nexthop()
   ->rt_init_metrics()
    ->inet_getpeer( create = true)

This is a side effect of commit a4daad6b09230 (net: Pre-COW metrics for
TCP), added in 2.6.39.

Possible solution:

Instruct inet_csk_route_req() to remove FLOWI_FLAG_PRECOW_METRICS.

Before patch :

# grep peer /proc/slabinfo 
inet_peer_cache   4175430 4175430    192   42    2 : tunables    0    0    0 : slabdata  99415  99415      0

Samples: 41K of event 'cycles', Event count (approx.): 30716565122                                          
+  20,24%      ksoftirqd/0  [kernel.kallsyms]           [k] inet_getpeer
+   8,19%      ksoftirqd/0  [kernel.kallsyms]           [k] peer_avl_rebalance.isra.1
+   4,81%      ksoftirqd/0  [kernel.kallsyms]           [k] sha_transform
+   3,64%      ksoftirqd/0  [kernel.kallsyms]           [k] fib_table_lookup
+   2,36%      ksoftirqd/0  [ixgbe]                     [k] ixgbe_poll
+   2,16%      ksoftirqd/0  [kernel.kallsyms]           [k] __ip_route_output_key
+   2,11%      ksoftirqd/0  [kernel.kallsyms]           [k] kernel_map_pages
+   2,11%      ksoftirqd/0  [kernel.kallsyms]           [k] ip_route_input_common
+   2,01%      ksoftirqd/0  [kernel.kallsyms]           [k] __inet_lookup_established
+   1,83%      ksoftirqd/0  [kernel.kallsyms]           [k] md5_transform
+   1,75%      ksoftirqd/0  [kernel.kallsyms]           [k] check_leaf.isra.9
+   1,49%      ksoftirqd/0  [kernel.kallsyms]           [k] ipt_do_table
+   1,46%      ksoftirqd/0  [kernel.kallsyms]           [k] hrtimer_interrupt
+   1,45%      ksoftirqd/0  [kernel.kallsyms]           [k] kmem_cache_alloc
+   1,29%      ksoftirqd/0  [kernel.kallsyms]           [k] inet_csk_search_req
+   1,29%      ksoftirqd/0  [kernel.kallsyms]           [k] __netif_receive_skb
+   1,16%      ksoftirqd/0  [kernel.kallsyms]           [k] copy_user_generic_string
+   1,15%      ksoftirqd/0  [kernel.kallsyms]           [k] kmem_cache_free
+   1,02%      ksoftirqd/0  [kernel.kallsyms]           [k] tcp_make_synack
+   0,93%      ksoftirqd/0  [kernel.kallsyms]           [k] _raw_spin_lock_bh
+   0,87%      ksoftirqd/0  [kernel.kallsyms]           [k] __call_rcu
+   0,84%      ksoftirqd/0  [kernel.kallsyms]           [k] rt_garbage_collect
+   0,84%      ksoftirqd/0  [kernel.kallsyms]           [k] fib_rules_lookup

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Hans Schillstrom <hans.schillstrom@ericsson.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
---
 net/ipv4/inet_connection_sock.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 95e61596..f9ee741 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -377,7 +377,8 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 
 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
-			   sk->sk_protocol, inet_sk_flowi_flags(sk),
+			   sk->sk_protocol,
+			   inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS,
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
 			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));


* Re: [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
  2012-05-31 21:56 [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Eric Dumazet
  2012-05-31 23:03 ` David Miller
  2012-06-01  7:00 ` [PATCH] tcp: do not create inetpeer on SYNACK message Eric Dumazet
@ 2012-06-01  7:36 ` Hans Schillstrom
  2012-06-01  9:34   ` Eric Dumazet
  2012-06-02  1:28 ` Dave Taht
  3 siblings, 1 reply; 42+ messages in thread
From: Hans Schillstrom @ 2012-06-01  7:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Neal Cardwell, Tom Herbert, Jesper Dangaard Brouer

On Thursday 31 May 2012 23:56:37 Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> pfifo_fast being the default Qdisc, its pretty easy to fill it with
> SYNACK (small) packets while host is under SYNFLOOD attack.
> 
> Packets of established TCP sessions are dropped and host appears almost
> dead.
> 
> Avoid this problem assigning TC_PRIO_FILLER priority to SYNACK
> generated in SYNCOOKIE mode, so that these packets are enqueued into
> pfifo_fast band 2.
> 
> Other packets, queued to band 0 or 1 are dequeued before any SYNACK
> packets waiting in band 2.
> 
Thanks Eric,
the patch is under test now.

-- 
Regards
Hans Schillstrom


* Re: [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-01  7:36 ` [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Hans Schillstrom
@ 2012-06-01  9:34   ` Eric Dumazet
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Dumazet @ 2012-06-01  9:34 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: netdev, Neal Cardwell, Tom Herbert, Jesper Dangaard Brouer

On Fri, 2012-06-01 at 09:36 +0200, Hans Schillstrom wrote:
> On Thursday 31 May 2012 23:56:37 Eric Dumazet wrote:
> > From: Eric Dumazet <edumazet@google.com>
> > 
> > pfifo_fast being the default Qdisc, its pretty easy to fill it with
> > SYNACK (small) packets while host is under SYNFLOOD attack.
> > 
> > Packets of established TCP sessions are dropped and host appears almost
> > dead.
> > 
> > Avoid this problem assigning TC_PRIO_FILLER priority to SYNACK
> > generated in SYNCOOKIE mode, so that these packets are enqueued into
> > pfifo_fast band 2.
> > 
> > Other packets, queued to band 0 or 1 are dequeued before any SYNACK
> > packets waiting in band 2.
> > 
> Thanks Eric,
> the patch is in under test now.
> 

Thanks Hans

By the way, I found that we have another problem: __qdisc_run()
(called from net_tx_action()) only pushes 64 frames per invocation, and
in fact fewer if need_resched() breaks the loop.

That is not fair compared to net_rx_action(), which is allowed to
receive 64 frames per NAPI device regardless of need_resched().

So if our cpu is flooded by incoming frames, our output is muted.

Basically we need to make SYNACK frames aware of multiqueue devices,
since they currently all end up on one single queue.

The obvious choice is to reflect the incoming SYN packet's
@queue_mapping into the SYNACK packet.

I am testing a patch right now.
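
A minimal sketch of the idea (synack_skb/syn_skb are hypothetical names
for illustration; the real change later shows up as the queue_mapping
argument of tcp_v4_send_synack()/tcp_v6_send_synack() in the v2 patch
below):

	/* send the SYNACK on the same tx queue index the SYN arrived on,
	 * so SYNACKs spread across tx queues the way SYNs spread across
	 * rx queues of a multiqueue NIC
	 */
	skb_set_queue_mapping(synack_skb, skb_get_queue_mapping(syn_skb));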


* Re: [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-01  4:48   ` Eric Dumazet
@ 2012-06-01 17:46     ` David Miller
  0 siblings, 0 replies; 42+ messages in thread
From: David Miller @ 2012-06-01 17:46 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hans.schillstrom, netdev, ncardwell, therbert, brouer

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 01 Jun 2012 06:48:57 +0200

> RFC is like 'I throw a patch, I am not even 50% confident of it, please
> comment and fix my bugs'. Most busy people just ignore it.

And I will ignore it too if you put RFC in the subject line :)


* Re: [PATCH] tcp: do not create inetpeer on SYNACK message
  2012-06-01  7:00 ` [PATCH] tcp: do not create inetpeer on SYNACK message Eric Dumazet
@ 2012-06-01 18:24   ` David Miller
  2012-06-01 21:34   ` Hans Schillström
  1 sibling, 0 replies; 42+ messages in thread
From: David Miller @ 2012-06-01 18:24 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hans.schillstrom, netdev, brouer, ncardwell, therbert

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 01 Jun 2012 09:00:26 +0200

> From: Eric Dumazet <edumazet@google.com>
> 
> Another problem on SYNFLOOD/DDOS attack is the inetpeer cache getting
> larger and larger, using lots of memory and cpu time.
> 
> tcp_v4_send_synack()
> ->inet_csk_route_req()
>  ->ip_route_output_flow()
>   ->rt_set_nexthop()
>    ->rt_init_metrics()
>     ->inet_getpeer( create = true)
> 
> This is a side effect of commit a4daad6b09230 (net: Pre-COW metrics for
> TCP) added in 2.6.39
> 
> Possible solution : 
> 
> Instruct inet_csk_route_req() to remove FLOWI_FLAG_PRECOW_METRICS 
 ...
> Signed-off-by: Eric Dumazet <edumazet@google.com>

This is definitely the right thing to do.

Applied, thanks Eric.


* RE: [PATCH] tcp: do not create inetpeer on SYNACK message
  2012-06-01  7:00 ` [PATCH] tcp: do not create inetpeer on SYNACK message Eric Dumazet
  2012-06-01 18:24   ` David Miller
@ 2012-06-01 21:34   ` Hans Schillström
  2012-06-02  6:56     ` Eric Dumazet
  1 sibling, 1 reply; 42+ messages in thread
From: Hans Schillström @ 2012-06-01 21:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, Jesper Dangaard Brouer, David Miller, Neal Cardwell, Tom Herbert

Hi Eric
>Another problem on SYNFLOOD/DDOS attack is the inetpeer cache getting
>larger and larger, using lots of memory and cpu time.
>
>>tcp_v4_send_synack()
>->inet_csk_route_req()
> ->ip_route_output_flow()
>  ->rt_set_nexthop()
>   ->rt_init_metrics()
>    ->inet_getpeer( create = true)
>
>This is a side effect of commit a4daad6b09230 (net: Pre-COW metrics for
>TCP) added in 2.6.39
>
>Possible solution :
>
>Instruct inet_csk_route_req() to remove FLOWI_FLAG_PRECOW_METRICS
>

I think we are on the right track now.

Some results from one of our testers:
before applying "reflect SYN queue_mapping into SYNACK"

"(The latest one from Eric is not included. I am building with
that one right now.)
Results were that with the same number of SYN/s, load went down
30% on each of the three Cpus that were handling the SYNs.
Great !!!"

I'm looking forward to seeing the results of the latest patch.

Then I think conntrack needs a little shape-up, like a "mini-conntrack";
it is way too expensive to alloc a full conntrack for every SYN.

I have a bunch of patches and ideas for that...

Thanks Eric for a great job

/Hans
 


* Re: [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
  2012-05-31 21:56 [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Eric Dumazet
                   ` (2 preceding siblings ...)
  2012-06-01  7:36 ` [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Hans Schillstrom
@ 2012-06-02  1:28 ` Dave Taht
  2012-06-02  5:46   ` Eric Dumazet
  3 siblings, 1 reply; 42+ messages in thread
From: Dave Taht @ 2012-06-02  1:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Hans Schillstrom, netdev, Neal Cardwell, Tom Herbert,
	Jesper Dangaard Brouer

On Thu, May 31, 2012 at 2:56 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> pfifo_fast being the default Qdisc, its pretty easy to fill it with
> SYNACK (small) packets while host is under SYNFLOOD attack.
>
> Packets of established TCP sessions are dropped and host appears almost
> dead.
>
> Avoid this problem assigning TC_PRIO_FILLER priority to SYNACK
> generated in SYNCOOKIE mode, so that these packets are enqueued into
> pfifo_fast band 2.
>
> Other packets, queued to band 0 or 1 are dequeued before any SYNACK
> packets waiting in band 2.

I am curious as to how well fq_codel survives an attack like this, without aid.


* Re: [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-02  1:28 ` Dave Taht
@ 2012-06-02  5:46   ` Eric Dumazet
  2012-06-23  7:34     ` Vijay Subramanian
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Dumazet @ 2012-06-02  5:46 UTC (permalink / raw)
  To: Dave Taht
  Cc: Hans Schillstrom, netdev, Neal Cardwell, Tom Herbert,
	Jesper Dangaard Brouer

On Fri, 2012-06-01 at 18:28 -0700, Dave Taht wrote:
> On Thu, May 31, 2012 at 2:56 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > From: Eric Dumazet <edumazet@google.com>
> >
> > pfifo_fast being the default Qdisc, its pretty easy to fill it with
> > SYNACK (small) packets while host is under SYNFLOOD attack.
> >
> > Packets of established TCP sessions are dropped and host appears almost
> > dead.
> >
> > Avoid this problem assigning TC_PRIO_FILLER priority to SYNACK
> > generated in SYNCOOKIE mode, so that these packets are enqueued into
> > pfifo_fast band 2.
> >
> > Other packets, queued to band 0 or 1 are dequeued before any SYNACK
> > packets waiting in band 2.
> 
> I am curious as to how well fq_codel survives an attack like this, without aid.

codel or fq_codel do not do priority classification.

SYNACKs will spread over all hash buckets and the global queue limit
can be hit.

fq_codel won't protect you by itself, unless you use a hierarchy with
one "prio" and two or three "fq_codel" children.


* RE: [PATCH] tcp: do not create inetpeer on SYNACK message
  2012-06-01 21:34   ` Hans Schillström
@ 2012-06-02  6:56     ` Eric Dumazet
  0 siblings, 0 replies; 42+ messages in thread
From: Eric Dumazet @ 2012-06-02  6:56 UTC (permalink / raw)
  To: Hans Schillström
  Cc: netdev, Jesper Dangaard Brouer, David Miller, Neal Cardwell, Tom Herbert

On Fri, 2012-06-01 at 23:34 +0200, Hans Schillström wrote:

> It think we are on the right way now,
> 
> Some results from one of our testers:
> before applying "reflect SYN queue_mapping into SYNACK"
> 
> "(The latest one from Eric is not included. I am building with
> that one right now.)
> Results were that with the same number of SYN/s, load went down
> 30% on each of the three Cpus that were handling the SYNs.
> Great !!!"
> 

I am not sure reflecting queue_mapping will help your workload, since
you specifically asked your NIC to queue all SYN packets on one single
queue.

Maybe we should rely on skb->rxhash instead of skb->queue_mapping to
choose an outgoing queue for the SYNACKs, so that they do not hammer a
single tx queue?

Then again, it might not be needed if the queue is dedicated to SYN and
SYNACK packets, since net_rx_action/net_tx_action should both dequeue
64 packets each round, in a round-robin fashion.

(I had problems in a standard setup, where you can have a single cpu
(CPU0 in my case) servicing all NAPI interrupts, so with 16 queues the
rx_action/tx_action ratio is 16/1 if all SYNACKs go to a single queue,
while SYNs are distributed to all 16 rx queues.)
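
A rough sketch of that rxhash-based alternative (hypothetical helper,
not a posted patch; assumes <linux/skbuff.h> and <linux/netdevice.h>):

	/* pick the SYNACK tx queue from the SYN's rx hash rather than its
	 * rx queue index, so a single dedicated SYN rx queue does not pin
	 * every SYNACK onto one tx queue
	 */
	static inline void synack_pick_txq(struct sk_buff *synack,
					   struct sk_buff *syn,
					   const struct net_device *dev)
	{
		skb_set_queue_mapping(synack,
				      skb_get_rxhash(syn) %
				      dev->real_num_tx_queues);
	}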


> I'm looking forward to see the results of the latests patch.
> 
> Then I think conntrack need a little shape up, like a "mini-conntrack"
> it is way to expensive to alloc a full "coontack for every SYN.
> 
> I have a bunch of patches and ideas for that...
> 

Cool ! the conntrack issue is a real one for sure.


Given conntrack's current requirement (being protected by a central
lock), I guess your best bet would be the following setup:

One single CPU to handle all SYN packets.

Then rely on skb->rxhash instead of skb->queue_mapping to choose an
outgoing queue for the SYNACKs, so they do not hammer a single tx
queue.

> Thanks Eric for a great job
> 

Thanks for giving testing results and ideas !


* Re: [PATCH net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-02  5:46   ` Eric Dumazet
@ 2012-06-23  7:34     ` Vijay Subramanian
  2012-06-23  8:42       ` [PATCH v2 " Eric Dumazet
  0 siblings, 1 reply; 42+ messages in thread
From: Vijay Subramanian @ 2012-06-23  7:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Dave Taht, Hans Schillstrom, netdev, Neal Cardwell, Tom Herbert,
	Jesper Dangaard Brouer

On 1 June 2012 22:46, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2012-06-01 at 18:28 -0700, Dave Taht wrote:
>> On Thu, May 31, 2012 at 2:56 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > From: Eric Dumazet <edumazet@google.com>
>> >
>> > pfifo_fast being the default Qdisc, its pretty easy to fill it with
>> > SYNACK (small) packets while host is under SYNFLOOD attack.
>> >
>> > Packets of established TCP sessions are dropped and host appears almost
>> > dead.
>> >
>> > Avoid this problem assigning TC_PRIO_FILLER priority to SYNACK
>> > generated in SYNCOOKIE mode, so that these packets are enqueued into
>> > pfifo_fast band 2.
>> >
>> > Other packets, queued to band 0 or 1 are dequeued before any SYNACK
>> > packets waiting in band 2.
>>

This  patch ([PATCH net-next] tcp: avoid tx starvation by SYNACK
packets)  is neither in net/net-next trees nor on patchwork. Maybe it
was missed since it was sent during the merge window. Is this not
needed anymore or is it being tested currently?

Thanks,
Vijay


* [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-23  7:34     ` Vijay Subramanian
@ 2012-06-23  8:42       ` Eric Dumazet
  2012-06-25  6:24         ` Hans Schillstrom
  2012-06-25 22:43         ` David Miller
  0 siblings, 2 replies; 42+ messages in thread
From: Eric Dumazet @ 2012-06-23  8:42 UTC (permalink / raw)
  To: Vijay Subramanian
  Cc: Dave Taht, Hans Schillstrom, netdev, Neal Cardwell, Tom Herbert,
	Jesper Dangaard Brouer

From: Eric Dumazet <edumazet@google.com>

On Sat, 2012-06-23 at 00:34 -0700, Vijay Subramanian wrote:

> This  patch ([PATCH net-next] tcp: avoid tx starvation by SYNACK
> packets)  is neither in net/net-next trees nor on patchwork. Maybe it
> was missed since it was sent during the merge window. Is this not
> needed anymore or is it being tested currently?

You're right, thanks for the reminder !

[PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets

pfifo_fast being the default Qdisc, it's pretty easy to fill it with
small SYNACK packets while the host is under a synflood attack.

Packets of established TCP sessions are dropped at the Qdisc layer and
the host appears almost dead.

Avoid this problem by assigning TC_PRIO_FILLER priority to SYNACKs
generated in SYNCOOKIE mode, so that these packets are enqueued into
pfifo_fast's lowest priority band (band 2).

Other packets, queued to band 0 or 1, are dequeued before any SYNACK
packets waiting in band 2.

If not under synflood, SYNACK priority is as requested by the listener's
sk_priority policy.

Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
---
 net/dccp/ipv4.c                  |    2 ++
 net/ipv4/ip_output.c             |    2 +-
 net/ipv4/tcp_ipv4.c              |    7 ++++++-
 net/ipv6/inet6_connection_sock.c |    1 +
 net/ipv6/ip6_output.c            |    2 +-
 net/ipv6/tcp_ipv6.c              |   11 ++++++++---
 6 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 3eb76b5..045176f 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -515,6 +515,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
 
 		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
 							      ireq->rmt_addr);
+		skb->priority = sk->sk_priority;
 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 					    ireq->rmt_addr,
 					    ireq->opt);
@@ -556,6 +557,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
 	skb_dst_set(skb, dst_clone(dst));
 
 	bh_lock_sock(ctl_sk);
+	skb->priority = ctl_sk->sk_priority;
 	err = ip_build_and_send_pkt(skb, ctl_sk,
 				    rxiph->daddr, rxiph->saddr, NULL);
 	bh_unlock_sock(ctl_sk);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0f3185a..71c6c20 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -155,7 +155,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 		ip_options_build(skb, &opt->opt, daddr, rt, 0);
 	}
 
-	skb->priority = sk->sk_priority;
+	/* skb->priority is set by the caller */
 	skb->mark = sk->sk_mark;
 
 	/* Send it out. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b52934f..5ef4131 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -81,7 +81,7 @@
 #include <linux/stddef.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-
+#include <linux/pkt_sched.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
 
@@ -821,6 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  *	Send a SYN-ACK after having received a SYN.
  *	This still operates on a request_sock only, not on a big
  *	socket.
+ *	nocache is set for SYN-ACK sent in SYNCOOKIE mode
  */
 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 			      struct request_sock *req,
@@ -843,6 +844,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 
 		skb_set_queue_mapping(skb, queue_mapping);
+
+		/* SYNACK sent in SYNCOOKIE mode have low priority */
+		skb->priority = nocache ? TC_PRIO_FILLER : sk->sk_priority;
+
 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 					    ireq->rmt_addr,
 					    ireq->opt);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e6cee52..5812a74 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -248,6 +248,7 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
 	/* Restore final destination back after routing done */
 	fl6.daddr = np->daddr;
 
+	skb->priority = sk->sk_priority;
 	res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 	rcu_read_unlock();
 	return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a233a7c..a93378a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -228,7 +228,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	hdr->saddr = fl6->saddr;
 	hdr->daddr = *first_hop;
 
-	skb->priority = sk->sk_priority;
+	/* skb->priority is set by the caller */
 	skb->mark = sk->sk_mark;
 
 	mtu = dst_mtu(dst);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 26a8862..f664452 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -43,6 +43,7 @@
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
 #include <linux/random.h>
+#include <linux/pkt_sched.h>
 
 #include <net/tcp.h>
 #include <net/ndisc.h>
@@ -479,7 +480,8 @@ out:
 
 static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 			      struct request_values *rvp,
-			      u16 queue_mapping)
+			      u16 queue_mapping,
+			      bool syncookie)
 {
 	struct inet6_request_sock *treq = inet6_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -515,6 +517,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	if (skb) {
 		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
 
+		skb->priority = syncookie ? TC_PRIO_FILLER : sk->sk_priority;
 		fl6.daddr = treq->rmt_addr;
 		skb_set_queue_mapping(skb, queue_mapping);
 		err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
@@ -531,7 +534,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
 			     struct request_values *rvp)
 {
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-	return tcp_v6_send_synack(sk, req, rvp, 0);
+	return tcp_v6_send_synack(sk, req, rvp, 0, false);
 }
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
@@ -909,6 +912,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
 	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(buff, dst);
+		skb->priority = ctl_sk->sk_priority;
 		ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
 		TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 		if (rst)
@@ -1217,7 +1221,8 @@ have_isn:
 
 	if (tcp_v6_send_synack(sk, req,
 			       (struct request_values *)&tmp_ext,
-			       skb_get_queue_mapping(skb)) ||
+			       skb_get_queue_mapping(skb),
+			       want_cookie) ||
 	    want_cookie)
 		goto drop_and_free;
 


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-23  8:42       ` [PATCH v2 " Eric Dumazet
@ 2012-06-25  6:24         ` Hans Schillstrom
  2012-06-25 22:43         ` David Miller
  1 sibling, 0 replies; 42+ messages in thread
From: Hans Schillstrom @ 2012-06-25  6:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Vijay Subramanian, Dave Taht, netdev, Neal Cardwell, Tom Herbert,
	Jesper Dangaard Brouer

Hi Eric
On Saturday 23 June 2012 10:42:42 Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> On Sat, 2012-06-23 at 00:34 -0700, Vijay Subramanian wrote:
> 
> > This  patch ([PATCH net-next] tcp: avoid tx starvation by SYNACK
> > packets)  is neither in net/net-next trees nor on patchwork. Maybe it
> > was missed since it was sent during the merge window. Is this not
> > needed anymore or is it being tested currently?
> 
> You're right, thanks for the reminder !

We have been running this patch for a while now,
so I added a "Tested-by:"

> 
> [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
> 
> pfifo_fast being the default Qdisc, its pretty easy to fill it with
> SYNACK (small) packets while host is under synflood attack.
> 
> Packets of established TCP sessions are dropped at Qdisc layer and
> host appears almost dead.
> 
> Avoid this problem assigning TC_PRIO_FILLER priority to SYNACK
> generated in SYNCOOKIE mode, so that these packets are enqueued into
> pfifo_fast lowest priority (band 2).
> 
> Other packets, queued to band 0 or 1 are dequeued before any SYNACK
> packets waiting in band 2.
> 
> If not under synflood, SYNACK priority is as requested by listener
> sk_priority policy.
> 
> Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>


> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> Cc: Tom Herbert <therbert@google.com>
> Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
> ---
>  net/dccp/ipv4.c                  |    2 ++
>  net/ipv4/ip_output.c             |    2 +-
>  net/ipv4/tcp_ipv4.c              |    7 ++++++-
>  net/ipv6/inet6_connection_sock.c |    1 +
>  net/ipv6/ip6_output.c            |    2 +-
>  net/ipv6/tcp_ipv6.c              |   11 ++++++++---
>  6 files changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
> index 3eb76b5..045176f 100644
> --- a/net/dccp/ipv4.c
> +++ b/net/dccp/ipv4.c
> @@ -515,6 +515,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
>  
>  		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
>  							      ireq->rmt_addr);
> +		skb->priority = sk->sk_priority;
>  		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
>  					    ireq->rmt_addr,
>  					    ireq->opt);
> @@ -556,6 +557,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
>  	skb_dst_set(skb, dst_clone(dst));
>  
>  	bh_lock_sock(ctl_sk);
> +	skb->priority = ctl_sk->sk_priority;
>  	err = ip_build_and_send_pkt(skb, ctl_sk,
>  				    rxiph->daddr, rxiph->saddr, NULL);
>  	bh_unlock_sock(ctl_sk);
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 0f3185a..71c6c20 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -155,7 +155,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
>  		ip_options_build(skb, &opt->opt, daddr, rt, 0);
>  	}
>  
> -	skb->priority = sk->sk_priority;
> +	/* skb->priority is set by the caller */
>  	skb->mark = sk->sk_mark;
>  
>  	/* Send it out. */
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index b52934f..5ef4131 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -81,7 +81,7 @@
>  #include <linux/stddef.h>
>  #include <linux/proc_fs.h>
>  #include <linux/seq_file.h>
> -
> +#include <linux/pkt_sched.h>
>  #include <linux/crypto.h>
>  #include <linux/scatterlist.h>
>  
> @@ -821,6 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
>   *	Send a SYN-ACK after having received a SYN.
>   *	This still operates on a request_sock only, not on a big
>   *	socket.
> + *	nocache is set for SYN-ACK sent in SYNCOOKIE mode
>   */
>  static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
>  			      struct request_sock *req,
> @@ -843,6 +844,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
>  		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
>  
>  		skb_set_queue_mapping(skb, queue_mapping);
> +
> +		/* SYNACK sent in SYNCOOKIE mode have low priority */
> +		skb->priority = nocache ? TC_PRIO_FILLER : sk->sk_priority;
> +
>  		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
>  					    ireq->rmt_addr,
>  					    ireq->opt);
> diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> index e6cee52..5812a74 100644
> --- a/net/ipv6/inet6_connection_sock.c
> +++ b/net/ipv6/inet6_connection_sock.c
> @@ -248,6 +248,7 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
>  	/* Restore final destination back after routing done */
>  	fl6.daddr = np->daddr;
>  
> +	skb->priority = sk->sk_priority;
>  	res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
>  	rcu_read_unlock();
>  	return res;
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index a233a7c..a93378a 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -228,7 +228,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
>  	hdr->saddr = fl6->saddr;
>  	hdr->daddr = *first_hop;
>  
> -	skb->priority = sk->sk_priority;
> +	/* skb->priority is set by the caller */
>  	skb->mark = sk->sk_mark;
>  
>  	mtu = dst_mtu(dst);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 26a8862..f664452 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -43,6 +43,7 @@
>  #include <linux/ipv6.h>
>  #include <linux/icmpv6.h>
>  #include <linux/random.h>
> +#include <linux/pkt_sched.h>
>  
>  #include <net/tcp.h>
>  #include <net/ndisc.h>
> @@ -479,7 +480,8 @@ out:
>  
>  static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
>  			      struct request_values *rvp,
> -			      u16 queue_mapping)
> +			      u16 queue_mapping,
> +			      bool syncookie)
>  {
>  	struct inet6_request_sock *treq = inet6_rsk(req);
>  	struct ipv6_pinfo *np = inet6_sk(sk);
> @@ -515,6 +517,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
>  	if (skb) {
>  		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
>  
> +		skb->priority = syncookie ? TC_PRIO_FILLER : sk->sk_priority;
>  		fl6.daddr = treq->rmt_addr;
>  		skb_set_queue_mapping(skb, queue_mapping);
>  		err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
> @@ -531,7 +534,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
>  			     struct request_values *rvp)
>  {
>  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
> -	return tcp_v6_send_synack(sk, req, rvp, 0);
> +	return tcp_v6_send_synack(sk, req, rvp, 0, false);
>  }
>  
>  static void tcp_v6_reqsk_destructor(struct request_sock *req)
> @@ -909,6 +912,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
>  	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
>  	if (!IS_ERR(dst)) {
>  		skb_dst_set(buff, dst);
> +		skb->priority = ctl_sk->sk_priority;
>  		ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
>  		TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
>  		if (rst)
> @@ -1217,7 +1221,8 @@ have_isn:
>  
>  	if (tcp_v6_send_synack(sk, req,
>  			       (struct request_values *)&tmp_ext,
> -			       skb_get_queue_mapping(skb)) ||
> +			       skb_get_queue_mapping(skb),
> +			       want_cookie) ||
>  	    want_cookie)
>  		goto drop_and_free;
>  
> 
> 
> 

-- 
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-23  8:42       ` [PATCH v2 " Eric Dumazet
  2012-06-25  6:24         ` Hans Schillstrom
@ 2012-06-25 22:43         ` David Miller
  2012-06-26  4:51           ` Eric Dumazet
  1 sibling, 1 reply; 42+ messages in thread
From: David Miller @ 2012-06-25 22:43 UTC (permalink / raw)
  To: eric.dumazet
  Cc: subramanian.vijay, dave.taht, hans.schillstrom, netdev,
	ncardwell, therbert, brouer

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 23 Jun 2012 10:42:42 +0200

> From: Eric Dumazet <edumazet@google.com>
> 
> On Sat, 2012-06-23 at 00:34 -0700, Vijay Subramanian wrote:
> 
>> This  patch ([PATCH net-next] tcp: avoid tx starvation by SYNACK
>> packets)  is neither in net/net-next trees nor on patchwork. Maybe it
>> was missed since it was sent during the merge window. Is this not
>> needed anymore or is it being tested currently?
> 
> You're right, thanks for the reminder !
> 
> [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets

I don't agree with this change.

What is the point in having real classification configuration if
arbitrary places in the network stack are going to override SKB
priority with a fixed priority setting?

I bet the person who set listening socket priority really meant it and
does not expect you to override it.


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-25 22:43         ` David Miller
@ 2012-06-26  4:51           ` Eric Dumazet
  2012-06-26  4:55             ` David Miller
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Dumazet @ 2012-06-26  4:51 UTC (permalink / raw)
  To: David Miller
  Cc: subramanian.vijay, dave.taht, hans.schillstrom, netdev,
	ncardwell, therbert, brouer

On Mon, 2012-06-25 at 15:43 -0700, David Miller wrote:

> I don't agree with this change.
> 
> What is the point in having real classification configuration if
> arbitrary places in the network stack are going to override SKB
> priority with a fixed priority setting?
> 
> I bet the person who set listening socket priority really meant it and
> does not expect you to override it.


If I add a test on listener_sk->sk_priority being 0, would you accept
the patch? If classification is done after the TCP stack, it won't be
hurt by the initial skb priority?

instead of:

	/* SYNACK sent in SYNCOOKIE mode have low priority */
	skb->priority = nocache ? TC_PRIO_FILLER : sk->sk_priority;

Having:

	/* SYNACK sent in SYNCOOKIE mode have low priority */
	skb->priority = (nocache && !sk->sk_priority) ?
			TC_PRIO_FILLER : sk->sk_priority;


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-26  4:51           ` Eric Dumazet
@ 2012-06-26  4:55             ` David Miller
  2012-06-26  5:34               ` Hans Schillstrom
  0 siblings, 1 reply; 42+ messages in thread
From: David Miller @ 2012-06-26  4:55 UTC (permalink / raw)
  To: eric.dumazet
  Cc: subramanian.vijay, dave.taht, hans.schillstrom, netdev,
	ncardwell, therbert, brouer

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 26 Jun 2012 06:51:36 +0200

> On Mon, 2012-06-25 at 15:43 -0700, David Miller wrote:
> 
>> I don't agree with this change.
>> 
>> What is the point in having real classification configuration if
>> arbitrary places in the network stack are going to override SKB
>> priority with a fixed priority setting?
>> 
>> I bet the person who set listening socket priority really meant it and
>> does not expect you to override it.
> 
> 
> If I add a test on listener_sk->sk_priority being 0, would you accept
> the patch ? If classification is done after tcp stack, it wont be hurt
> by initial skb priority ?

It's better than your original patch, but it suffers from the same
fundamental problem.

No user is going to expect that TCP on its own has chosen a
non-default priority, and only for some packet types.  It's completely
unexpected behavior.

A SYN flood consumes so much more RX work than the TX for the SYNACK's
ever can.

So whilst I understand your desire to handle all elements of this kind
of attack, this one is reaching too far.


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-26  4:55             ` David Miller
@ 2012-06-26  5:34               ` Hans Schillstrom
  2012-06-26  7:11                 ` David Miller
  2012-06-26 17:02                 ` Eric Dumazet
  0 siblings, 2 replies; 42+ messages in thread
From: Hans Schillstrom @ 2012-06-26  5:34 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, subramanian.vijay, dave.taht, netdev, ncardwell,
	therbert, brouer

On Tuesday 26 June 2012 06:55:37 David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 26 Jun 2012 06:51:36 +0200
> 
> > On Mon, 2012-06-25 at 15:43 -0700, David Miller wrote:
> > 
> >> I don't agree with this change.
> >> 
> >> What is the point in having real classification configuration if
> >> arbitrary places in the network stack are going to override SKB
> >> priority with a fixed priority setting?
> >> 
> >> I bet the person who set listening socket priority really meant it and
> >> does not expect you to override it.
> > 
> > 
> > If I add a test on listener_sk->sk_priority being 0, would you accept
> > the patch ? If classification is done after tcp stack, it wont be hurt
> > by initial skb priority ?
> 
> It's better than your original patch, but it suffers from the same
> fundamental problem.
> 
> No user is going to expect that TCP on it's own has choosen a
> non-default priority and only for some packet types.  It's completely
> unexpected behavior.
> 
> A SYN flood consumes so much more RX work than the TX for the SYNACK's
> ever can.
> 
> So whilst I understand your desire to handle all elements of this kind
> of attack, this one is reaching too far.
> 

This patch didn't give much gain actually.
The big cycle consumer during a syn attack is the SHA sum right now,
so from that perspective it's better to add AES crypto (by using AES-NI)
to the syn cookies instead of the SHA sum, even if only newer x86_64 can
use it.


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-26  5:34               ` Hans Schillstrom
@ 2012-06-26  7:11                 ` David Miller
  2012-06-26  7:27                   ` Hans Schillstrom
  2012-06-26 17:02                 ` Eric Dumazet
  1 sibling, 1 reply; 42+ messages in thread
From: David Miller @ 2012-06-26  7:11 UTC (permalink / raw)
  To: hans.schillstrom
  Cc: eric.dumazet, subramanian.vijay, dave.taht, netdev, ncardwell,
	therbert, brouer

From: Hans Schillstrom <hans.schillstrom@ericsson.com>
Date: Tue, 26 Jun 2012 07:34:31 +0200

> The big cycle consumer during a syn attack is SHA sum right now, 
> so from that perspective it's better to add aes crypto (by using AES-NI) 
> to the syn cookies instead of SHA sum. Even if only newer x86_64 can use it.

I'm surprised that x86 lacks an SHA1 instruction, even shitty sparcs
have one now.

SHA1 seems overkill for what the syncookie code is trying to do; could
you give the following a try?

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6660ffc..b280bf4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -435,7 +435,6 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
 
 /* From syncookies.c */
-extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
 extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 
 				    struct ip_options *opt);
 #ifdef CONFIG_SYN_COOKIES
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eab2a7f..60116dc 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,9 +13,9 @@
 #include <linux/tcp.h>
 #include <linux/slab.h>
 #include <linux/random.h>
-#include <linux/cryptohash.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
+#include <linux/jhash.h>
 #include <net/tcp.h>
 #include <net/route.h>
 
@@ -25,7 +25,7 @@
 
 extern int sysctl_tcp_syncookies;
 
-__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
+__u32 syncookie_secret[2];
 EXPORT_SYMBOL(syncookie_secret);
 
 static __init int init_syncookies(void)
@@ -38,22 +38,14 @@ __initcall(init_syncookies);
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
-		      ipv4_cookie_scratch);
-
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
 {
-	__u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
-
-	memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
-	tmp[0] = (__force u32)saddr;
-	tmp[1] = (__force u32)daddr;
-	tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
-	tmp[3] = count;
-	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
-	return tmp[17];
+	return jhash_3words((__force __u32) saddr + count,
+			    (__force __u32) daddr,
+			    (((__force __u32) sport) << 16 |
+			     (__force __u32) dport),
+			    syncookie_secret[c]);
 }
 
 


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-26  7:11                 ` David Miller
@ 2012-06-26  7:27                   ` Hans Schillstrom
  0 siblings, 0 replies; 42+ messages in thread
From: Hans Schillstrom @ 2012-06-26  7:27 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, subramanian.vijay, dave.taht, netdev, ncardwell,
	therbert, brouer

On Tuesday 26 June 2012 09:11:24 David Miller wrote:
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Tue, 26 Jun 2012 07:34:31 +0200
> 
> > The big cycle consumer during a syn attack is SHA sum right now, 
> > so from that perspective it's better to add aes crypto (by using AES-NI) 
> > to the syn cookies instead of SHA sum. Even if only newer x86_64 can use it.
> 
> I'm surprised that x86 lacks an SHA1 instruction, even shitty sparcs
> have one now.
> 
> SHA1 seems overkill for what the syncookie code is trying to do, could
> you give the following a try?
> 

Sure, I'll give it a try later today 

Thanks


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-26  5:34               ` Hans Schillstrom
  2012-06-26  7:11                 ` David Miller
@ 2012-06-26 17:02                 ` Eric Dumazet
  2012-06-27  5:23                   ` Hans Schillstrom
  2012-06-27  6:32                   ` Jesper Dangaard Brouer
  1 sibling, 2 replies; 42+ messages in thread
From: Eric Dumazet @ 2012-06-26 17:02 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: David Miller, subramanian.vijay, dave.taht, netdev, ncardwell,
	therbert, brouer

On Tue, 2012-06-26 at 07:34 +0200, Hans Schillstrom wrote:

> This patch didn't give much in gain actually.

With a 100Mbps link it does.

With a 1Gbps link we are cpu bound for sure.

> The big cycle consumer during a syn attack is SHA sum right now, 
> so from that perspective it's better to add aes crypto (by using AES-NI) 
> to the syn cookies instead of SHA sum. Even if only newer x86_64 can use it.

My dev machine is able to process ~280.000 SYN (and SYNACK) per second
(tg3, single queue), and sha_transform() takes ~10% of the time
according to perf.

With David's patch using jhash instead of SHA, I reach ~315.000 SYN per
second.


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-26 17:02                 ` Eric Dumazet
@ 2012-06-27  5:23                   ` Hans Schillstrom
  2012-06-27  8:22                     ` David Miller
  2012-06-27  6:32                   ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 42+ messages in thread
From: Hans Schillstrom @ 2012-06-27  5:23 UTC (permalink / raw)
  To: Eric Dumazet, David Miller
  Cc: subramanian.vijay, dave.taht, netdev, ncardwell, therbert, brouer

On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> On Tue, 2012-06-26 at 07:34 +0200, Hans Schillstrom wrote:
> 
> > This patch didn't give much in gain actually.
> 
> With a 100Mbps link it does.
 
I was testing with a patched igb driver with TCP SYN IRQs on one core
only; there was some fault in the previous setup (RPS was also
involved), because now it gives a boost of ~15%.

> With a 1Gbps link we are cpu bounded for sure.

True.

> 
> > The big cycle consumer during a syn attack is SHA sum right now, 
> > so from that perspective it's better to add aes crypto (by using AES-NI) 
> > to the syn cookies instead of SHA sum. Even if only newer x86_64 can use it.
> 
> My dev machine is able to process ~280.000 SYN (and synack) per second
> (tg3, mono queue), and sha_transform() takes ~10 % of the time according
> to perf.

My test machine is not that fast :-(
I have only 170.000 syn/synack per sec, and sha_transform() takes ~9.6%;
I have seen peaks of 16% (during 10 sec samples).

> 
> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> second.

I have similar results from ~170k to ~199k synack/sec.

BTW, cookie_hash() did not show up in the perf results (< 0.08%).


* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-26 17:02                 ` Eric Dumazet
  2012-06-27  5:23                   ` Hans Schillstrom
@ 2012-06-27  6:32                   ` Jesper Dangaard Brouer
  2012-06-27  6:54                     ` David Miller
  1 sibling, 1 reply; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2012-06-27  6:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Hans Schillstrom, David Miller, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, Martin Topholm

On Tue, 2012-06-26 at 19:02 +0200, Eric Dumazet wrote:
> On Tue, 2012-06-26 at 07:34 +0200, Hans Schillstrom wrote:
> 
> > This patch didn't give much in gain actually.
> 
> With a 100Mbps link it does.
> 
> With a 1Gbps link we are cpu bounded for sure.

I'm using a 10G link

> > The big cycle consumer during a syn attack is SHA sum right now, 
> > so from that perspective it's better to add aes crypto (by using AES-NI) 
> > to the syn cookies instead of SHA sum. Even if only newer x86_64 can use it.

How are you avoiding the lock bh_lock_sock_nested(sk) in tcp_v4_rcv()?


> My dev machine is able to process ~280.000 SYN (and synack) per second
> (tg3, mono queue), and sha_transform() takes ~10 % of the time according
> to perf.

With my parallel SYN cookie/brownies patches, I could easily process 750
Kpps (limited by the generator; I think the owners of the big machine did
a test where they reached 1400 Kpps).

I also had ~10% CPU usage from sha_transform() but across all cores...


> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> second.

IMHO a faster hash is not the answer... parallel processing of SYN
packets is a better answer.  But I do think adding this faster hash as
a sysctl switch might be a good idea for people with smaller embedded
hardware.  Using it as default, might be "dangerous" and open an attack
vector on SYN cookies in Linux.
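
Such a switch would only be one more entry in the ipv4 sysctl table.  A
minimal sketch (the knob name and the sysctl_tcp_syncookie_fast_hash
variable below are made up, this is just the shape):

int sysctl_tcp_syncookie_fast_hash __read_mostly;	/* hypothetical */

/* entry added to the ipv4 table in net/ipv4/sysctl_net_ipv4.c */
{
	.procname	= "tcp_syncookie_fast_hash",
	.data		= &sysctl_tcp_syncookie_fast_hash,
	.maxlen		= sizeof(int),
	.mode		= 0644,
	.proc_handler	= proc_dointvec,
},

cookie_hash() could then pick jhash or sha_transform() depending on the
value.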


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  6:32                   ` Jesper Dangaard Brouer
@ 2012-06-27  6:54                     ` David Miller
  2012-06-27  7:24                       ` Jesper Dangaard Brouer
  2012-06-27 19:50                       ` Florian Westphal
  0 siblings, 2 replies; 42+ messages in thread
From: David Miller @ 2012-06-27  6:54 UTC (permalink / raw)
  To: brouer
  Cc: eric.dumazet, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 27 Jun 2012 08:32:13 +0200

> Using it as default, might be "dangerous" and open an attack vector
> on SYN cookies in Linux.

If it's dangerous for syncookies then it's just as dangerous for
the routing hash and the socket hashes where we use it already.

Therefore, this sounds like a baseless claim to me.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  6:54                     ` David Miller
@ 2012-06-27  7:24                       ` Jesper Dangaard Brouer
  2012-06-27  7:30                         ` Eric Dumazet
  2012-06-27 19:50                       ` Florian Westphal
  1 sibling, 1 reply; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2012-06-27  7:24 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

On Tue, 2012-06-26 at 23:54 -0700, David Miller wrote:
> From: Jesper Dangaard Brouer <brouer@redhat.com>
> Date: Wed, 27 Jun 2012 08:32:13 +0200
> 
> > Using it as default, might be "dangerous" and open an attack vector
> > on SYN cookies in Linux.
> 
> If it's dangerous for syncookies then it's just as dangerous for
> the routing hash and the socket hashes where we use it already.
> 
> Therefore, this sounds like a baseless claim to me.

Yes, you are right. Looking at your patch again, you also use
syncookie_secret[c] as the initval.  So it should be safe.

But I still believe that we need to solve these SYN issues by parallel
processing of packets. (It seems Eric and Hans are looking at a single
core SYN processing scheme, but I might have missed their point).

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  7:24                       ` Jesper Dangaard Brouer
@ 2012-06-27  7:30                         ` Eric Dumazet
  2012-06-27  7:54                           ` Jesper Dangaard Brouer
  2012-06-27  8:13                           ` David Miller
  0 siblings, 2 replies; 42+ messages in thread
From: Eric Dumazet @ 2012-06-27  7:30 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Miller, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

On Wed, 2012-06-27 at 09:24 +0200, Jesper Dangaard Brouer wrote:

> But, I still believe that we need, to solve this SYN issues by parallel
> processing of packets. (It seems Eric and Hans are looking at a single
> core SYN processing scheme, but I might have missed their point).

Yep

Parallel processing will only benefit multiqueue setups.

Many linux servers in colocations are still using a mono queue NIC, and
default linux configuration is to use a single cpu to handle all
incoming frames (no RPS/RFS).

Sometimes the hw IRQ itself is distributed among several cpus, but at any
single moment, only one cpu is serving the NAPI poll.

As long as the LISTEN processing is locking the socket, there is no
point distributing SYN packets to multiple cpus; this only adds
contention and poor performance because of false sharing.

My plan is to get rid of the socket lock for LISTEN and use RCU instead.
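
For context, the serialization point today is the per-socket lock taken
in tcp_v4_rcv(), roughly (heavily simplified from net/ipv4/tcp_ipv4.c):

	bh_lock_sock_nested(sk);
	if (!sock_owned_by_user(sk))
		ret = tcp_v4_do_rcv(sk, skb);	/* LISTEN: SYN / syncookie work runs here */
	else
		sk_add_backlog(sk, skb);	/* error handling omitted */
	bh_unlock_sock(sk);

There is no patch yet, so take this only as a sketch of intent: the
LISTEN path would run under rcu_read_lock()/rcu_read_unlock() instead,
with the request_sock state protected by finer-grained locks.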

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  7:30                         ` Eric Dumazet
@ 2012-06-27  7:54                           ` Jesper Dangaard Brouer
  2012-06-27  8:02                             ` Eric Dumazet
  2012-06-27  8:13                           ` David Miller
  1 sibling, 1 reply; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2012-06-27  7:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

On Wed, 2012-06-27 at 09:30 +0200, Eric Dumazet wrote:
> On Wed, 2012-06-27 at 09:24 +0200, Jesper Dangaard Brouer wrote:
> 
> > But, I still believe that we need, to solve this SYN issues by parallel
> > processing of packets. (It seems Eric and Hans are looking at a single
> > core SYN processing scheme, but I might have missed their point).
> 
> Yep
> 
> Parallel processing will only benefit multiqueue setups.
> 
> Many linux servers in colocations are still using a mono queue NIC, and
> default linux configuration is to use a single cpu to handle all
> incoming frames (no RPS/RFS).

I see, your target is different from mine (now I understand your
motivation).  It's good, as optimizing the single-queue case would also
be a benefit once we implement parallel processing / take advantage of
the multi-queue devices.


> Sometime the hw IRQ itself is distributed among several cpus, but at one
> single moment, only one cpu is serving the NAPI poll.
> 
> As long as the LISTEN processing is locking the socket, there is no
> point distributing SYN packets to multiple cpus, this only adds
> contention and poor performance because of false sharing.
> 
> My plan is to get rid of the socket lock for LISTEN and use RCU instead.

Well, that would lead to parallel SYN processing, wouldn't it?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  7:54                           ` Jesper Dangaard Brouer
@ 2012-06-27  8:02                             ` Eric Dumazet
  2012-06-27  8:21                               ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Dumazet @ 2012-06-27  8:02 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Miller, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

On Wed, 2012-06-27 at 09:54 +0200, Jesper Dangaard Brouer wrote:

> Well, that would lead to parallel SYN processing, wouldn't it?

I think we already discussed the current issues of the current code.

Telling people to spread SYN to several cpus is a good way to have a
freeze in case of synflood, because 15 cpus are busy looping while one
is making progress.

That's why Intel felt the need for a hardware filter to direct all SYN
packets to a single queue.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  7:30                         ` Eric Dumazet
  2012-06-27  7:54                           ` Jesper Dangaard Brouer
@ 2012-06-27  8:13                           ` David Miller
  1 sibling, 0 replies; 42+ messages in thread
From: David Miller @ 2012-06-27  8:13 UTC (permalink / raw)
  To: eric.dumazet
  Cc: brouer, hans.schillstrom, subramanian.vijay, dave.taht, netdev,
	ncardwell, therbert, mph

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 27 Jun 2012 09:30:16 +0200

> Many linux servers in colocations are still using a mono queue NIC, and
> default linux configuration is to use a single cpu to handle all
> incoming frames (no RPS/RFS).

Even worse, many are virtualized guests with a single virtual netdev
queue :-)

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  8:02                             ` Eric Dumazet
@ 2012-06-27  8:21                               ` Jesper Dangaard Brouer
  2012-06-27  8:45                                 ` Eric Dumazet
  0 siblings, 1 reply; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2012-06-27  8:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

On Wed, 2012-06-27 at 10:02 +0200, Eric Dumazet wrote:
> On Wed, 2012-06-27 at 09:54 +0200, Jesper Dangaard Brouer wrote:
> 
> > Well, that would lead to parallel SYN processing, wouldn't it?
> 
> I think we already discussed of the current issues of current code.
> 
> Telling people to spread SYN to several cpus is a good way to have a
> freeze in case of synflood, because 15 cpus are busy looping while one
> is doing progress.

Yes, that was also what I experienced (contention on the spinlock), and I then
tried to address it with my parallel SYN cookie patches, and it worked
amazingly well...


> Thats why Intel felt the need of a hardware filter to direct all SYN
> packets on a single queue.

It works because we have a spinlock problem in the code... Perhaps they
did it because we have a locking/contention problem, not the other
way around ;-)  How about fixing the code instead? ;-)))

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  5:23                   ` Hans Schillstrom
@ 2012-06-27  8:22                     ` David Miller
  2012-06-27  8:25                       ` Jesper Dangaard Brouer
                                         ` (2 more replies)
  0 siblings, 3 replies; 42+ messages in thread
From: David Miller @ 2012-06-27  8:22 UTC (permalink / raw)
  To: hans.schillstrom
  Cc: eric.dumazet, subramanian.vijay, dave.taht, netdev, ncardwell,
	therbert, brouer

From: Hans Schillstrom <hans.schillstrom@ericsson.com>
Date: Wed, 27 Jun 2012 07:23:03 +0200

> On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
>> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
>> second.
> 
> I have similar results from ~170k to ~199k synack/sec.

Eric and Hans, I'm going to add Tested-by: tags for both of you
when I commit this patch if you don't mind. :-)

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  8:22                     ` David Miller
@ 2012-06-27  8:25                       ` Jesper Dangaard Brouer
  2012-06-27  8:30                       ` Hans Schillstrom
  2012-06-27  8:40                       ` Eric Dumazet
  2 siblings, 0 replies; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2012-06-27  8:25 UTC (permalink / raw)
  To: David Miller
  Cc: hans.schillstrom, eric.dumazet, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert

On Wed, 2012-06-27 at 01:22 -0700, David Miller wrote:
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Wed, 27 Jun 2012 07:23:03 +0200
> 
> > On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> >> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> >> second.
> > 
> > I have similar results from ~170k to ~199k synack/sec.
> 
> Eric and Hans, I'm going to add Tested-by: tags for both of you
> when I commit this patch if you don't mind. :-)

You can also add my

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>

(I agree with your patch, and it does not have an attack vector...)

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  8:22                     ` David Miller
  2012-06-27  8:25                       ` Jesper Dangaard Brouer
@ 2012-06-27  8:30                       ` Hans Schillstrom
  2012-06-27  8:40                       ` Eric Dumazet
  2 siblings, 0 replies; 42+ messages in thread
From: Hans Schillstrom @ 2012-06-27  8:30 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, subramanian.vijay, dave.taht, netdev, ncardwell,
	therbert, brouer

On Wednesday 27 June 2012 10:22:04 David Miller wrote:
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Wed, 27 Jun 2012 07:23:03 +0200
> 
> > On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> >> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> >> second.
> > 
> > I have similar results from ~170k to ~199k synack/sec.
> 
> Eric and Hans, I'm going to add Tested-by: tags for both of you
> when I commit this patch if you don't mind. :-)
> 

No problems, go ahead

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  8:22                     ` David Miller
  2012-06-27  8:25                       ` Jesper Dangaard Brouer
  2012-06-27  8:30                       ` Hans Schillstrom
@ 2012-06-27  8:40                       ` Eric Dumazet
  2012-06-27  8:48                         ` David Miller
  2 siblings, 1 reply; 42+ messages in thread
From: Eric Dumazet @ 2012-06-27  8:40 UTC (permalink / raw)
  To: David Miller
  Cc: hans.schillstrom, subramanian.vijay, dave.taht, netdev,
	ncardwell, therbert, brouer

On Wed, 2012-06-27 at 01:22 -0700, David Miller wrote:
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Wed, 27 Jun 2012 07:23:03 +0200
> 
> > On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> >> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> >> second.
> > 
> > I have similar results from ~170k to ~199k synack/sec.
> 
> Eric and Hans, I'm going to add Tested-by: tags for both of you
> when I commit this patch if you don't mind. :-)

Well, please send your complete patch first (with the IPv6 part)?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  8:21                               ` Jesper Dangaard Brouer
@ 2012-06-27  8:45                                 ` Eric Dumazet
  2012-06-27  9:23                                   ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Dumazet @ 2012-06-27  8:45 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Miller, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

On Wed, 2012-06-27 at 10:21 +0200, Jesper Dangaard Brouer wrote:

> It works because we have a spinlock problem in the code... Perhaps, they
> did it, because we have have locking/contention problem, not the other
> way around ;-)  How about fixing the code instead? ;-)))

The socket lock is currently mandatory.

It's really _hard_ to remove it; your attempts added a lot of races.

I want to do it properly, adding the needed RCU and arrays of spinlocks where
appropriate.

Hopefully it's easier than the RCU conversion I did for the lookups of
ESTABLISHED/TIMEWAIT sockets.
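
To make "array of spinlocks" concrete, the shape would be something like
the sketch below (the names are invented, this is not the actual series):
hash the 4-tuple into a small lock array, so two SYNs only contend when
they land in the same bucket.

#include <linux/jhash.h>
#include <linux/spinlock.h>

#define SYNQ_LOCK_BITS	5
#define SYNQ_LOCKS	(1 << SYNQ_LOCK_BITS)

static spinlock_t synq_locks[SYNQ_LOCKS];	/* spin_lock_init() them at boot */

static spinlock_t *synq_lock(__be32 saddr, __be32 daddr,
			     __be16 sport, __be16 dport)
{
	u32 h = jhash_3words((__force u32)saddr, (__force u32)daddr,
			     ((u32)(__force u16)sport << 16) |
			     (__force u16)dport, 0);

	return &synq_locks[h & (SYNQ_LOCKS - 1)];
}

/* in the (hypothetical) lockless LISTEN path:
 *
 *	spinlock_t *lock = synq_lock(saddr, daddr, sport, dport);
 *
 *	spin_lock_bh(lock);
 *	... search/insert the request_sock for this 4-tuple ...
 *	spin_unlock_bh(lock);
 */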

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  8:40                       ` Eric Dumazet
@ 2012-06-27  8:48                         ` David Miller
  0 siblings, 0 replies; 42+ messages in thread
From: David Miller @ 2012-06-27  8:48 UTC (permalink / raw)
  To: eric.dumazet
  Cc: hans.schillstrom, subramanian.vijay, dave.taht, netdev,
	ncardwell, therbert, brouer

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 27 Jun 2012 10:40:00 +0200

> On Wed, 2012-06-27 at 01:22 -0700, David Miller wrote:
>> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
>> Date: Wed, 27 Jun 2012 07:23:03 +0200
>> 
>> > On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
>> >> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
>> >> second.
>> > 
>> > I have similar results from ~170k to ~199k synack/sec.
>> 
>> Eric and Hans, I'm going to add Tested-by: tags for both of you
>> when I commit this patch if you don't mind. :-)
> 
> Well, please send your complete patch before (with IPv6 part) ?

Indeed, I'll do that soon, thanks Eric.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  8:45                                 ` Eric Dumazet
@ 2012-06-27  9:23                                   ` Jesper Dangaard Brouer
  0 siblings, 0 replies; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2012-06-27  9:23 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

On Wed, 2012-06-27 at 10:45 +0200, Eric Dumazet wrote:
> On Wed, 2012-06-27 at 10:21 +0200, Jesper Dangaard Brouer wrote:
> 
> > It works because we have a spinlock problem in the code... Perhaps, they
> > did it, because we have have locking/contention problem, not the other
> > way around ;-)  How about fixing the code instead? ;-)))
> 
> The socket lock is currently mandatory.
>
> It's really _hard_ to remove it, your attempts added a lot of races.

Yes, it's really hard to remove completely.  That's why I chose _only_
to handle the SYN cookie "overload" case and leave the rest locked, and
I also introduced extra locking in the latest patches.  I know it was
not perfect, hence the RFC tag, but I hope I didn't add that many races.


> I want to do it properly, adding needed RCU and array of spinlocks were
> appropriate.

I really appreciate that you will attempt to fix this properly.  Like a
real network ninja ;-).

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27  6:54                     ` David Miller
  2012-06-27  7:24                       ` Jesper Dangaard Brouer
@ 2012-06-27 19:50                       ` Florian Westphal
  2012-06-27 21:39                         ` Eric Dumazet
  2012-06-27 22:23                         ` David Miller
  1 sibling, 2 replies; 42+ messages in thread
From: Florian Westphal @ 2012-06-27 19:50 UTC (permalink / raw)
  To: David Miller
  Cc: brouer, eric.dumazet, hans.schillstrom, subramanian.vijay,
	dave.taht, netdev, ncardwell, therbert, mph

David Miller <davem@davemloft.net> wrote:
> From: Jesper Dangaard Brouer <brouer@redhat.com>
> Date: Wed, 27 Jun 2012 08:32:13 +0200
> 
> > Using it as default, might be "dangerous" and open an attack vector
> > on SYN cookies in Linux.
> 
> If it's dangerous for syncookies then it's just as dangerous for
> the routing hash and the socket hashes where we use it already.
> Therefore, this sounds like a baseless claim to me.

I doubt using jhash is safe for syncookies.

There are several differences from other uses in the kernel:
- all hash input except u32 cookie_secret[2] is known
- we transmit the hash result (i.e., it's visible to a 3rd party)
- we do not re-seed the secret, ever

It should be quite easy to recompute cookie_secret[] from known syncookie
values?
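
To make the first point concrete: the patch itself is not quoted here,
but assuming the jhash-based cookie hash looks anything like the sketch
below, every input except secret[] is either visible in the SYN or echoed
back in the SYNACK sequence number.

#include <linux/jhash.h>
#include <linux/types.h>

/* Assumption-only sketch, not the real cookie_hash(). */
static u32 cookie_hash_sketch(__be32 saddr, __be32 daddr,
			      __be16 sport, __be16 dport,
			      u32 count, const u32 secret[2])
{
	/* saddr/daddr/sport/dport: visible in the SYN
	 * count: coarse timestamp, easily guessed
	 * secret[]: the only unknown, and it is never re-seeded */
	return jhash_3words((__force u32)saddr,
			    (__force u32)daddr ^ secret[0],
			    ((u32)(__force u16)sport << 16) |
			    (__force u16)dport,
			    count + secret[1]);
}

With enough observed (input, cookie) pairs and a non-cryptographic mixer,
trying to recover secret[] offline looks at least plausible, which is the
concern above.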

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27 19:50                       ` Florian Westphal
@ 2012-06-27 21:39                         ` Eric Dumazet
  2012-06-27 22:23                           ` David Miller
  2012-06-27 22:23                         ` David Miller
  1 sibling, 1 reply; 42+ messages in thread
From: Eric Dumazet @ 2012-06-27 21:39 UTC (permalink / raw)
  To: Florian Westphal
  Cc: David Miller, brouer, hans.schillstrom, subramanian.vijay,
	dave.taht, netdev, ncardwell, therbert, mph

On Wed, 2012-06-27 at 21:50 +0200, Florian Westphal wrote:

> I doubt using jhash is safe for syncookies.
> 
> There a several differences to other uses in kernel:
> - all hash input except u32 cookie_secret[2] is known
> - we transmit hash result (i.e, its visible to 3rd party)
> - we do not re-seed the secret, ever
> 
> it should be quite easy to recompute cookie_secret[] from known syncookie
> values?

We could re-seed the secrets every MSL seconds, a bit like in
tcp_cookie_generator().

This would require check_tcp_syn_cookie() doing two checks (the most
recent seed, and the previous one if the first check fails).
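
A rough sketch of that idea (the names, the 60 second period and the
two-slot layout are assumptions; locking around the rotation is omitted):

#include <linux/jiffies.h>
#include <linux/random.h>

#define COOKIE_RESEED_PERIOD	(60 * HZ)	/* "every MSL seconds" */

static u32 cookie_secrets[2][2];	/* two generations, hypothetical layout */
static unsigned long last_reseed;
static int cur_gen;

static void cookie_maybe_reseed(void)
{
	if (time_after(jiffies, last_reseed + COOKIE_RESEED_PERIOD)) {
		cur_gen ^= 1;	/* old current becomes the "previous" seed */
		get_random_bytes(cookie_secrets[cur_gen],
				 sizeof(cookie_secrets[cur_gen]));
		last_reseed = jiffies;
	}
}

/* check_tcp_syn_cookie() would then validate against
 * cookie_secrets[cur_gen] first, and fall back to
 * cookie_secrets[cur_gen ^ 1] if that check fails. */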

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27 19:50                       ` Florian Westphal
  2012-06-27 21:39                         ` Eric Dumazet
@ 2012-06-27 22:23                         ` David Miller
  1 sibling, 0 replies; 42+ messages in thread
From: David Miller @ 2012-06-27 22:23 UTC (permalink / raw)
  To: fw
  Cc: brouer, eric.dumazet, hans.schillstrom, subramanian.vijay,
	dave.taht, netdev, ncardwell, therbert, mph

From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Jun 2012 21:50:32 +0200

> - we transmit hash result (i.e, its visible to 3rd party)

Indeed, that's right.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
  2012-06-27 21:39                         ` Eric Dumazet
@ 2012-06-27 22:23                           ` David Miller
  0 siblings, 0 replies; 42+ messages in thread
From: David Miller @ 2012-06-27 22:23 UTC (permalink / raw)
  To: eric.dumazet
  Cc: fw, brouer, hans.schillstrom, subramanian.vijay, dave.taht,
	netdev, ncardwell, therbert, mph

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 27 Jun 2012 23:39:20 +0200

> On Wed, 2012-06-27 at 21:50 +0200, Florian Westphal wrote:
> 
>> I doubt using jhash is safe for syncookies.
>> 
>> There a several differences to other uses in kernel:
>> - all hash input except u32 cookie_secret[2] is known
>> - we transmit hash result (i.e, its visible to 3rd party)
>> - we do not re-seed the secret, ever
>> 
>> it should be quite easy to recompute cookie_secret[] from known syncookie
>> values?
> 
> We could re-seed the secrets every MSL seconds a bit like in
> tcp_cookie_generator()
> 
> This would require check_tcp_syn_cookie() doing two checks (most recent
> seed, and previous one if first check failed)

That could help, but I'm leaning towards not doing this at all.  As with
normal sequence number generation, we really can't do this.

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2012-06-27 22:23 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-31 21:56 [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Eric Dumazet
2012-05-31 23:03 ` David Miller
2012-06-01  4:48   ` Eric Dumazet
2012-06-01 17:46     ` David Miller
2012-06-01  7:00 ` [PATCH] tcp: do not create inetpeer on SYNACK message Eric Dumazet
2012-06-01 18:24   ` David Miller
2012-06-01 21:34   ` Hans Schillström
2012-06-02  6:56     ` Eric Dumazet
2012-06-01  7:36 ` [PATCH net-next] tcp: avoid tx starvation by SYNACK packets Hans Schillstrom
2012-06-01  9:34   ` Eric Dumazet
2012-06-02  1:28 ` Dave Taht
2012-06-02  5:46   ` Eric Dumazet
2012-06-23  7:34     ` Vijay Subramanian
2012-06-23  8:42       ` [PATCH v2 " Eric Dumazet
2012-06-25  6:24         ` Hans Schillstrom
2012-06-25 22:43         ` David Miller
2012-06-26  4:51           ` Eric Dumazet
2012-06-26  4:55             ` David Miller
2012-06-26  5:34               ` Hans Schillstrom
2012-06-26  7:11                 ` David Miller
2012-06-26  7:27                   ` Hans Schillstrom
2012-06-26 17:02                 ` Eric Dumazet
2012-06-27  5:23                   ` Hans Schillstrom
2012-06-27  8:22                     ` David Miller
2012-06-27  8:25                       ` Jesper Dangaard Brouer
2012-06-27  8:30                       ` Hans Schillstrom
2012-06-27  8:40                       ` Eric Dumazet
2012-06-27  8:48                         ` David Miller
2012-06-27  6:32                   ` Jesper Dangaard Brouer
2012-06-27  6:54                     ` David Miller
2012-06-27  7:24                       ` Jesper Dangaard Brouer
2012-06-27  7:30                         ` Eric Dumazet
2012-06-27  7:54                           ` Jesper Dangaard Brouer
2012-06-27  8:02                             ` Eric Dumazet
2012-06-27  8:21                               ` Jesper Dangaard Brouer
2012-06-27  8:45                                 ` Eric Dumazet
2012-06-27  9:23                                   ` Jesper Dangaard Brouer
2012-06-27  8:13                           ` David Miller
2012-06-27 19:50                       ` Florian Westphal
2012-06-27 21:39                         ` Eric Dumazet
2012-06-27 22:23                           ` David Miller
2012-06-27 22:23                         ` David Miller
