* [PATCH] rps: selective flow shedding during softnet overflow
@ 2013-04-19 17:46 Willem de Bruijn
  2013-04-19 17:58 ` Eric Dumazet
  2013-04-19 19:03 ` [PATCH] " Stephen Hemminger
  0 siblings, 2 replies; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-19 17:46 UTC (permalink / raw)
  To: netdev, davem, edumazet; +Cc: Willem de Bruijn

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high rate flow (such as a
spoofed source DoS) can exceed a single cpu processing rate and will
degrade throughput of other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog
is above a threshold, IRQ cpus track the ratio of total traffic of
each flow (using 1024 buckets, configurable). The ratio is measured
by counting the number of packets per flow over the last 256 packets
from the source cpu. Any flow that occupies a large fraction of this
(set at 50%) will see packet drop while above the threshold.
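
For a concrete picture of the accounting, here is a rough user-space
sketch (an illustration only; the actual kernel implementation is in the
diff below). The 256-packet window and 1024-bucket table mirror the
values described above; the per-bucket counters are widened to plain
unsigned ints for simplicity.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HISTORY  256	/* sliding window of recent packets (power of 2) */
#define BUCKETS  1024	/* flow hash table length (power of 2, configurable) */

struct flow_limit {
	unsigned int	head;
	uint16_t	history[HISTORY];	/* bucket index of each recent packet */
	unsigned int	buckets[BUCKETS];	/* packets per bucket in the window */
};

/* Returns true if this packet's flow occupies more than half of the
 * window, i.e. the packet would be dropped while above the threshold. */
static bool flow_over_limit(struct flow_limit *fl, uint32_t rxhash)
{
	unsigned int new_flow = rxhash & (BUCKETS - 1);
	unsigned int old_flow = fl->history[fl->head];

	/* slide the window: forget the oldest packet, record the newest */
	fl->history[fl->head] = new_flow;
	fl->head = (fl->head + 1) & (HISTORY - 1);
	if (fl->buckets[old_flow])
		fl->buckets[old_flow]--;

	return ++fl->buckets[new_flow] > HISTORY / 2;
}

int main(void)
{
	static struct flow_limit fl;
	unsigned int i, flagged = 0;

	/* one dominant flow (hash 42) makes up 3 of every 4 packets */
	for (i = 0; i < 100000; i++) {
		uint32_t hash = (i % 4) ? 42 : (uint32_t)rand();

		if (flow_over_limit(&fl, hash))
			flagged++;
	}
	printf("flagged %u of 100000 packets for dropping\n", flagged);
	return 0;
}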

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is an extreme case, but the same is
reached with RFS and sufficient kernel processing (iptables, packet
socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/netdevice.h  |  16 ++++++++
 net/Kconfig                |  10 +++++
 net/core/dev.c             |  49 +++++++++++++++++++++-
 net/core/net-procfs.c      |  16 +++++++-
 net/core/sysctl_net_core.c | 100 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 188 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 623b57b..d70afcc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1779,6 +1779,18 @@ static inline int unregister_gifconf(unsigned int family)
 	return register_gifconf(family, NULL);
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
+struct sd_flow_limit {
+	u64			count;
+	unsigned int		history_head;
+	u16			history[FLOW_LIMIT_HISTORY];
+	u8			buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 /*
  * Incoming packets are placed on per-cpu queues
  */
@@ -1808,6 +1820,10 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit	*flow_limit;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc904..ff66a4f 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -259,6 +259,16 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	bool "Flow shedding under load"
+	---help---
+	  The network stack has to drop packets when a receive processing CPU's
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/dev.c b/net/core/dev.c
index 3655ff9..67a4ae0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3054,6 +3054,47 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &per_cpu(softnet_data, smp_processor_id());
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) &
+			   (netdev_flow_limit_table_len - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3063,13 +3104,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6256,6 +6299,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355..2bf8329 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46a..78d90d9 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,92 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	static DEFINE_MUTEX(flow_limit_update_mutex);
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		mutex_lock(&flow_limit_update_mutex);
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				      lockdep_is_held(flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		synchronize_rcu();
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +266,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
-- 
1.8.2.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread
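
As a usage sketch (not taken from the thread), the feature would be
enabled per cpu through the two new sysctls and its effect observed in
the extra column the patch appends to /proc/net/softnet_stat. The paths
follow the ctl_table entries registered above; the table length (8192)
and the cpu mask ("1", i.e. cpu0) are example values, and write_proc()
is a hypothetical helper.

#include <stdio.h>

/* hypothetical helper: write a string to a procfs file */
static int write_proc(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f || fputs(val, f) == EOF) {
		perror(path);
		if (f)
			fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	char line[256];
	int cpu = 0;
	FILE *f;

	/* set the table length first; the handler rejects values that are
	 * not a power of two */
	write_proc("/proc/sys/net/core/flow_limit_table_len", "8192\n");

	/* hex cpumask of cpus whose backlog should track flows; "1" = cpu0 */
	write_proc("/proc/sys/net/core/flow_limit_cpu_bitmap", "1\n");

	/* softnet_stat prints one line per cpu; with this patch the 11th
	 * hex field is the per-cpu flow_limit_count */
	f = fopen("/proc/net/softnet_stat", "r");
	if (!f) {
		perror("softnet_stat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int v[11] = { 0 };

		if (sscanf(line, "%x %x %x %x %x %x %x %x %x %x %x",
			   &v[0], &v[1], &v[2], &v[3], &v[4], &v[5],
			   &v[6], &v[7], &v[8], &v[9], &v[10]) == 11)
			printf("cpu%d flow_limit_count=%u\n", cpu, v[10]);
		cpu++;
	}
	fclose(f);
	return 0;
}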

* Re: [PATCH] rps: selective flow shedding during softnet overflow
  2013-04-19 17:46 [PATCH] rps: selective flow shedding during softnet overflow Willem de Bruijn
@ 2013-04-19 17:58 ` Eric Dumazet
  2013-04-22 20:40   ` Willem de Bruijn
  2013-04-19 19:03 ` [PATCH] " Stephen Hemminger
  1 sibling, 1 reply; 41+ messages in thread
From: Eric Dumazet @ 2013-04-19 17:58 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, davem, edumazet

On Fri, 2013-04-19 at 13:46 -0400, Willem de Bruijn wrote:
> A cpu executing the network receive path sheds packets when its input
> queue grows to netdev_max_backlog. A single high rate flow (such as a
> spoofed source DoS) can exceed a single cpu processing rate and will
> degrade throughput of other flows hashed onto the same cpu.
> 
> This patch adds a more fine grained hashtable. If the netdev backlog
> is above a threshold, IRQ cpus track the ratio of total traffic of
> each flow (using 1024 buckets, configurable). The ratio is measured
> by counting the number of packets per flow over the last 256 packets
> from the source cpu. Any flow that occupies a large fraction of this
> (set at 50%) will see packet drop while above the threshold.
> 
> Tested:
> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
> each handling 20k req/s. Throughput halves when hit with a 400 kpps
> antagonist storm. With this patch applied, antagonist overload is
> dropped and the server processes its complete load.
> 
> The patch is effective when kernel receive processing is the
> bottleneck. The above RPS scenario is a extreme, but the same is
> reached with RFS and sufficient kernel processing (iptables, packet
> socket tap, ..).
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---

> +#ifdef CONFIG_NET_FLOW_LIMIT
> +#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
> +struct sd_flow_limit {
> +	u64			count;
> +	unsigned int		history_head;
> +	u16			history[FLOW_LIMIT_HISTORY];
> +	u8			buckets[];
> +};
> +
> +extern int netdev_flow_limit_table_len;
> +#endif /* CONFIG_NET_FLOW_LIMIT */
> +
>  /*
>   * Incoming packets are placed on per-cpu queues
>   */
> @@ -1808,6 +1820,10 @@ struct softnet_data {
>  	unsigned int		dropped;
>  	struct sk_buff_head	input_pkt_queue;
>  	struct napi_struct	backlog;
> +
> +#ifdef CONFIG_NET_FLOW_LIMIT
> +	struct sd_flow_limit	*flow_limit;
> +#endif
>  };
>  
>  static inline void input_queue_head_incr(struct softnet_data *sd)
> diff --git a/net/Kconfig b/net/Kconfig
> index 2ddc904..ff66a4f 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -259,6 +259,16 @@ config BPF_JIT
>  	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
>  	  this feature changing /proc/sys/net/core/bpf_jit_enable
>  
> +config NET_FLOW_LIMIT
> +	bool "Flow shedding under load"
> +	---help---
> +	  The network stack has to drop packets when a receive processing CPUs
> +	  backlog reaches netdev_max_backlog. If a few out of many active flows
> +	  generate the vast majority of load, drop their traffic earlier to
> +	  maintain capacity for the other flows. This feature provides servers
> +	  with many clients some protection against DoS by a single (spoofed)
> +	  flow that greatly exceeds average workload.
> +
>  menu "Network testing"
>  
>  config NET_PKTGEN
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 3655ff9..67a4ae0 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3054,6 +3054,47 @@ static int rps_ipi_queued(struct softnet_data *sd)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_NET_FLOW_LIMIT
> +int netdev_flow_limit_table_len __read_mostly = (1 << 12);
> +#endif
> +
> +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
> +{
> +#ifdef CONFIG_NET_FLOW_LIMIT
> +	struct sd_flow_limit *fl;
> +	struct softnet_data *sd;
> +	unsigned int old_flow, new_flow;
> +
> +	if (qlen < (netdev_max_backlog >> 1))
> +		return false;
> +
> +	sd = &per_cpu(softnet_data, smp_processor_id());
> +
> +	rcu_read_lock();
> +	fl = rcu_dereference(sd->flow_limit);
> +	if (fl) {
> +		new_flow = skb_get_rxhash(skb) &
> +			   (netdev_flow_limit_table_len - 1);

There is a race accessing netdev_flow_limit_table_len

(the admin might change the value, and we might do an out-of-bounds
access)

This should be a field in fl, aka fl->mask, so that it's safe.


> +		old_flow = fl->history[fl->history_head];
> +		fl->history[fl->history_head] = new_flow;
> +
> +		fl->history_head++;
> +		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
> +
> +		if (likely(fl->buckets[old_flow]))
> +			fl->buckets[old_flow]--;
> +
> +		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
> +			fl->count++;
> +			rcu_read_unlock();
> +			return true;
> +		}
> +	}
> +	rcu_read_unlock();
> +#endif
> +	return false;
> +}
> +

Very nice work by the way !

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] rps: selective flow shedding during softnet overflow
  2013-04-19 17:46 [PATCH] rps: selective flow shedding during softnet overflow Willem de Bruijn
  2013-04-19 17:58 ` Eric Dumazet
@ 2013-04-19 19:03 ` Stephen Hemminger
  2013-04-19 19:21   ` Eric Dumazet
  2013-04-19 20:11   ` Willem de Bruijn
  1 sibling, 2 replies; 41+ messages in thread
From: Stephen Hemminger @ 2013-04-19 19:03 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, davem, edumazet

On Fri, 19 Apr 2013 13:46:52 -0400
Willem de Bruijn <willemb@google.com> wrote:

> A cpu executing the network receive path sheds packets when its input
> queue grows to netdev_max_backlog. A single high rate flow (such as a
> spoofed source DoS) can exceed a single cpu processing rate and will
> degrade throughput of other flows hashed onto the same cpu.
> 
> This patch adds a more fine grained hashtable. If the netdev backlog
> is above a threshold, IRQ cpus track the ratio of total traffic of
> each flow (using 1024 buckets, configurable). The ratio is measured
> by counting the number of packets per flow over the last 256 packets
> from the source cpu. Any flow that occupies a large fraction of this
> (set at 50%) will see packet drop while above the threshold.
> 
> Tested:
> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
> each handling 20k req/s. Throughput halves when hit with a 400 kpps
> antagonist storm. With this patch applied, antagonist overload is
> dropped and the server processes its complete load.
> 
> The patch is effective when kernel receive processing is the
> bottleneck. The above RPS scenario is a extreme, but the same is
> reached with RFS and sufficient kernel processing (iptables, packet
> socket tap, ..).
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>

The netdev_backlog only applies to RPS and non-NAPI devices.
So this won't help if receive packet steering is not enabled.
Seems like a deficiency in the receive steering design rather
than the netdev_backlog.

Can't you do this with existing ingress stuff?
The trend seems to be to put in more fixed infrastructure to deal with
performance and server problems rather than building general-purpose
solutions.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] rps: selective flow shedding during softnet overflow
  2013-04-19 19:03 ` [PATCH] " Stephen Hemminger
@ 2013-04-19 19:21   ` Eric Dumazet
  2013-04-19 20:11   ` Willem de Bruijn
  1 sibling, 0 replies; 41+ messages in thread
From: Eric Dumazet @ 2013-04-19 19:21 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Willem de Bruijn, netdev, davem, edumazet

On Fri, 2013-04-19 at 12:03 -0700, Stephen Hemminger wrote:

> The netdev_backlog only applies for RPS and non-NAPI devices.
> So this won't help if receive packet steering is not enabled.

Yes, the point is exactly to use RPS as a proxy to control
the behavior and better distribute the load.

> Seems like a deficiency in the receive steering design rather
> than the netdev_backlog.

Well, RPS works well, even for multiqueue NICs. But nobody said it was
perfect.

Back to NAPI (without RPS):

If one RX queue (multiqueue device or not) is hit by a single flow, how
do you plan to do anything, since without RPS we process packets one
after another?

There is no queue building up in our stack (the only queue is in the NIC
RX ring). Eventually the NIC drops packets.

> 
> Can't you do this with existing ingress stuff?

ingress is not yet multiqueue enabled, AFAIK.

Expect very poor performance from it.

> The trend seems to be put in more fixed infrastructure to deal with
> performance and server problems rather than building general purpose
> solutions.

OK, I see you want to push netmap. Let's talk about general-purpose
solutions.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] rps: selective flow shedding during softnet overflow
  2013-04-19 19:03 ` [PATCH] " Stephen Hemminger
  2013-04-19 19:21   ` Eric Dumazet
@ 2013-04-19 20:11   ` Willem de Bruijn
  1 sibling, 0 replies; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-19 20:11 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, David Miller, Eric Dumazet

On Fri, Apr 19, 2013 at 3:03 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Fri, 19 Apr 2013 13:46:52 -0400
> Willem de Bruijn <willemb@google.com> wrote:
>
>> A cpu executing the network receive path sheds packets when its input
>> queue grows to netdev_max_backlog. A single high rate flow (such as a
>> spoofed source DoS) can exceed a single cpu processing rate and will
>> degrade throughput of other flows hashed onto the same cpu.
>>
>> This patch adds a more fine grained hashtable. If the netdev backlog
>> is above a threshold, IRQ cpus track the ratio of total traffic of
>> each flow (using 1024 buckets, configurable). The ratio is measured
>> by counting the number of packets per flow over the last 256 packets
>> from the source cpu. Any flow that occupies a large fraction of this
>> (set at 50%) will see packet drop while above the threshold.
>>
>> Tested:
>> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
>> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
>> each handling 20k req/s. Throughput halves when hit with a 400 kpps
>> antagonist storm. With this patch applied, antagonist overload is
>> dropped and the server processes its complete load.
>>
>> The patch is effective when kernel receive processing is the
>> bottleneck. The above RPS scenario is a extreme, but the same is
>> reached with RFS and sufficient kernel processing (iptables, packet
>> socket tap, ..).
>>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>
> The netdev_backlog only applies for RPS and non-NAPI devices.
> So this won't help if receive packet steering is not enabled.
> Seems like a deficiency in the receive steering design rather
> than the netdev_backlog.

The patch specifically intends to address a consequence of
perfect flow-hashing: that unbalanced input translates into cpu
load imbalance. It is less relevant to servers that do not use
flow hashing to spread traffic (i.e., no rps/rfs).

In normal server workloads, hashing works well, but it makes
machine state subject to external influence, in particular to
local resource exhaustion (partial DoS). This patch hardens
against these extreme input patterns that should not occur in
normal workloads. The netdev backlog is the clearest indicator of
unsustainable load due to imbalance.

> Can't you do this with existing ingress stuff?
> The trend seems to be put in more fixed infrastructure to deal with
> performance and server problems rather than building general purpose
> solutions.

This isn't necessarily mutually exclusive with iptables/policing/..
mechanisms to filter out bad flows, of course. The earlier in the
pipeline packets are dropped, the fewer cycles are spent, so this
is another layer of (early) defense.

For instance, I recently sent a patch to handle load imbalance in
packet sockets. Those socket queues fill up if the application
threads are the bottleneck instead of the kernel receive path, so
this rps fix would not be relevant.

>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] rps: selective flow shedding during softnet overflow
  2013-04-19 17:58 ` Eric Dumazet
@ 2013-04-22 20:40   ` Willem de Bruijn
  2013-04-22 20:46     ` [PATCH net-next v2] " Willem de Bruijn
  0 siblings, 1 reply; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-22 20:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David Miller

On Fri, Apr 19, 2013 at 1:58 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2013-04-19 at 13:46 -0400, Willem de Bruijn wrote:
>> A cpu executing the network receive path sheds packets when its input
>> queue grows to netdev_max_backlog. A single high rate flow (such as a
>> spoofed source DoS) can exceed a single cpu processing rate and will
>> degrade throughput of other flows hashed onto the same cpu.
>>
>> This patch adds a more fine grained hashtable. If the netdev backlog
>> is above a threshold, IRQ cpus track the ratio of total traffic of
>> each flow (using 1024 buckets, configurable). The ratio is measured
>> by counting the number of packets per flow over the last 256 packets
>> from the source cpu. Any flow that occupies a large fraction of this
>> (set at 50%) will see packet drop while above the threshold.
>>
>> Tested:
>> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
>> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
>> each handling 20k req/s. Throughput halves when hit with a 400 kpps
>> antagonist storm. With this patch applied, antagonist overload is
>> dropped and the server processes its complete load.
>>
>> The patch is effective when kernel receive processing is the
>> bottleneck. The above RPS scenario is a extreme, but the same is
>> reached with RFS and sufficient kernel processing (iptables, packet
>> socket tap, ..).
>>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>> ---
>
>> +#ifdef CONFIG_NET_FLOW_LIMIT
>> +#define FLOW_LIMIT_HISTORY   (1 << 8)        /* must be ^2 */
>> +struct sd_flow_limit {
>> +     u64                     count;
>> +     unsigned int            history_head;
>> +     u16                     history[FLOW_LIMIT_HISTORY];
>> +     u8                      buckets[];
>> +};
>> +
>> +extern int netdev_flow_limit_table_len;
>> +#endif /* CONFIG_NET_FLOW_LIMIT */
>> +
>>  /*
>>   * Incoming packets are placed on per-cpu queues
>>   */
>> @@ -1808,6 +1820,10 @@ struct softnet_data {
>>       unsigned int            dropped;
>>       struct sk_buff_head     input_pkt_queue;
>>       struct napi_struct      backlog;
>> +
>> +#ifdef CONFIG_NET_FLOW_LIMIT
>> +     struct sd_flow_limit    *flow_limit;
>> +#endif
>>  };
>>
>>  static inline void input_queue_head_incr(struct softnet_data *sd)
>> diff --git a/net/Kconfig b/net/Kconfig
>> index 2ddc904..ff66a4f 100644
>> --- a/net/Kconfig
>> +++ b/net/Kconfig
>> @@ -259,6 +259,16 @@ config BPF_JIT
>>         packet sniffing (libpcap/tcpdump). Note : Admin should enable
>>         this feature changing /proc/sys/net/core/bpf_jit_enable
>>
>> +config NET_FLOW_LIMIT
>> +     bool "Flow shedding under load"
>> +     ---help---
>> +       The network stack has to drop packets when a receive processing CPUs
>> +       backlog reaches netdev_max_backlog. If a few out of many active flows
>> +       generate the vast majority of load, drop their traffic earlier to
>> +       maintain capacity for the other flows. This feature provides servers
>> +       with many clients some protection against DoS by a single (spoofed)
>> +       flow that greatly exceeds average workload.
>> +
>>  menu "Network testing"
>>
>>  config NET_PKTGEN
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 3655ff9..67a4ae0 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3054,6 +3054,47 @@ static int rps_ipi_queued(struct softnet_data *sd)
>>       return 0;
>>  }
>>
>> +#ifdef CONFIG_NET_FLOW_LIMIT
>> +int netdev_flow_limit_table_len __read_mostly = (1 << 12);
>> +#endif
>> +
>> +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
>> +{
>> +#ifdef CONFIG_NET_FLOW_LIMIT
>> +     struct sd_flow_limit *fl;
>> +     struct softnet_data *sd;
>> +     unsigned int old_flow, new_flow;
>> +
>> +     if (qlen < (netdev_max_backlog >> 1))
>> +             return false;
>> +
>> +     sd = &per_cpu(softnet_data, smp_processor_id());
>> +
>> +     rcu_read_lock();
>> +     fl = rcu_dereference(sd->flow_limit);
>> +     if (fl) {
>> +             new_flow = skb_get_rxhash(skb) &
>> +                        (netdev_flow_limit_table_len - 1);
>
> There is a race accessing netdev_flow_limit_table_len
>
> (the admin might change the value, and we might do an out of bound
> access)
>
> This should be a field in fl, aka fl->mask, so thats its safe

Ah, of course. Thanks, Eric!

I held off a new patch for a few days to wait for comments. Just
updated it with this change and will send it as v2.

>
>
>> +             old_flow = fl->history[fl->history_head];
>> +             fl->history[fl->history_head] = new_flow;
>> +
>> +             fl->history_head++;
>> +             fl->history_head &= FLOW_LIMIT_HISTORY - 1;
>> +
>> +             if (likely(fl->buckets[old_flow]))
>> +                     fl->buckets[old_flow]--;
>> +
>> +             if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
>> +                     fl->count++;
>> +                     rcu_read_unlock();
>> +                     return true;
>> +             }
>> +     }
>> +     rcu_read_unlock();
>> +#endif
>> +     return false;
>> +}
>> +
>
> Very nice work by the way !
>
>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH net-next v2] rps: selective flow shedding during softnet overflow
  2013-04-22 20:40   ` Willem de Bruijn
@ 2013-04-22 20:46     ` Willem de Bruijn
  2013-04-22 22:30       ` Eric Dumazet
  0 siblings, 1 reply; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-22 20:46 UTC (permalink / raw)
  To: eric.dumazet, netdev, davem, stephen; +Cc: Willem de Bruijn

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high rate flow (such as a
spoofed source DoS) can exceed a single cpu processing rate and will
degrade throughput of other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog
is above a threshold, IRQ cpus track the ratio of total traffic of
each flow (using 4096 buckets, configurable). The ratio is measured
by counting the number of packets per flow over the last 256 packets
from the source cpu. Any flow that occupies a large fraction of this
(set at 50%) will see packet drop while above the threshold.

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is an extreme case, but the same is
reached with RFS and sufficient kernel processing (iptables, packet
socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>

---

Changed v1->v2
- add fl->num_buckets element to use the actual allocated table length.
- disable the kconfig option by default, as it is workload specific.
---
 include/linux/netdevice.h  |  17 ++++++++
 net/Kconfig                |  11 +++++
 net/core/dev.c             |  48 ++++++++++++++++++++-
 net/core/net-procfs.c      |  16 ++++++-
 net/core/sysctl_net_core.c | 101 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 190 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f8898a4..d781cf1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1779,6 +1779,19 @@ static inline int unregister_gifconf(unsigned int family)
 	return register_gifconf(family, NULL);
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
+struct sd_flow_limit {
+	u64			count;
+	unsigned int		num_buckets;
+	unsigned int		history_head;
+	u16			history[FLOW_LIMIT_HISTORY];
+	u8			buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 /*
  * Incoming packets are placed on per-cpu queues
  */
@@ -1808,6 +1821,10 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit	*flow_limit;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/net/Kconfig b/net/Kconfig
index 1a22216..a0cbd3b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -268,6 +268,17 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	bool "Flow shedding under load"
+	default n
+	---help---
+	  The network stack has to drop packets when a receive processing CPU's
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/dev.c b/net/core/dev.c
index fad4c38..90190c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3057,6 +3057,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &per_cpu(softnet_data, smp_processor_id());
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3066,13 +3106,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6262,6 +6304,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355..2bf8329 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46a..297df31 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,93 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	static DEFINE_MUTEX(flow_limit_update_mutex);
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		mutex_lock(&flow_limit_update_mutex);
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				      lockdep_is_held(flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				cur->num_buckets = netdev_flow_limit_table_len;
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		synchronize_rcu();
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +267,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
-- 
1.8.2.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v2] rps: selective flow shedding during softnet overflow
  2013-04-22 20:46     ` [PATCH net-next v2] " Willem de Bruijn
@ 2013-04-22 22:30       ` Eric Dumazet
  2013-04-23 18:45         ` Willem de Bruijn
  0 siblings, 1 reply; 41+ messages in thread
From: Eric Dumazet @ 2013-04-22 22:30 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, davem, stephen

On Mon, 2013-04-22 at 16:46 -0400, Willem de Bruijn wrote:

> +		len = sizeof(*cur) + netdev_flow_limit_table_len;
[1]
> +		mutex_lock(&flow_limit_update_mutex);
[2]
> +		for_each_possible_cpu(i) {
> +			sd = &per_cpu(softnet_data, i);
> +			cur = rcu_dereference_protected(sd->flow_limit,
> +				      lockdep_is_held(flow_limit_update_mutex));
> +			if (cur && !cpumask_test_cpu(i, mask)) {
> +				RCU_INIT_POINTER(sd->flow_limit, NULL);
> +				synchronize_rcu();
> +				kfree(cur);
> +			} else if (!cur && cpumask_test_cpu(i, mask)) {
> +				cur = kzalloc(len, GFP_KERNEL);
> +				cur->num_buckets = netdev_flow_limit_table_len;
[3]

It's a bit tricky, but the value of netdev_flow_limit_table_len could
change between [1] and [3]

So you should read its value once, or protect the whole thing using
mutex_lock(&flow_limit_update_mutex) in the sysctl code (and move [1] after
[2]).

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v2] rps: selective flow shedding during softnet overflow
  2013-04-22 22:30       ` Eric Dumazet
@ 2013-04-23 18:45         ` Willem de Bruijn
  2013-04-23 18:46           ` [PATCH net-next v3] " Willem de Bruijn
  0 siblings, 1 reply; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-23 18:45 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David Miller, Stephen Hemminger

On Mon, Apr 22, 2013 at 6:30 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2013-04-22 at 16:46 -0400, Willem de Bruijn wrote:
>
>> +             len = sizeof(*cur) + netdev_flow_limit_table_len;
> [1]
>> +             mutex_lock(&flow_limit_update_mutex);
> [2]
>> +             for_each_possible_cpu(i) {
>> +                     sd = &per_cpu(softnet_data, i);
>> +                     cur = rcu_dereference_protected(sd->flow_limit,
>> +                                   lockdep_is_held(flow_limit_update_mutex));
>> +                     if (cur && !cpumask_test_cpu(i, mask)) {
>> +                             RCU_INIT_POINTER(sd->flow_limit, NULL);
>> +                             synchronize_rcu();
>> +                             kfree(cur);
>> +                     } else if (!cur && cpumask_test_cpu(i, mask)) {
>> +                             cur = kzalloc(len, GFP_KERNEL);
>> +                             cur->num_buckets = netdev_flow_limit_table_len;
> [3]
>
> Its a bit tricky, but the value of netdev_flow_limit_table_len could
> change between [1] and [3]
>
> So you should read its value once, or protect the whole thing using
> mutex_lock(&flow_limit_update_mutex) in sysctl code ( and move [1] after
> [2])

Thanks for the detailed explanation. I implemented the second
solution: make writes to the two sysctls mutually exclusive.
>
>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH net-next v3] rps: selective flow shedding during softnet overflow
  2013-04-23 18:45         ` Willem de Bruijn
@ 2013-04-23 18:46           ` Willem de Bruijn
  2013-04-23 19:18             ` Eric Dumazet
  0 siblings, 1 reply; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-23 18:46 UTC (permalink / raw)
  To: eric.dumazet, netdev, davem, stephen; +Cc: Willem de Bruijn

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high rate flow (such as a
spoofed source DoS) can exceed a single cpu processing rate and will
degrade throughput of other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog
is above a threshold, IRQ cpus track the ratio of total traffic of
each flow (using 4096 buckets, configurable). The ratio is measured
by counting the number of packets per flow over the last 256 packets
from the source cpu. Any flow that occupies a large fraction of this
(set at 50%) will see packet drop while above the threshold.

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is an extreme case, but the same is
reached with RFS and sufficient kernel processing (iptables, packet
socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>

---

Changes
v3
- fix race between updates to table_len sysctl during bitmap sysctl.
- fix NULL pointer dereference on alloc failure.
v2
- add fl->num_buckets element to use the actual allocated table length.
- disable the kconfig option by default, as it is workload specific.
---
 include/linux/netdevice.h  |  17 ++++++++
 net/Kconfig                |  11 +++++
 net/core/dev.c             |  48 ++++++++++++++++++++-
 net/core/net-procfs.c      |  16 ++++++-
 net/core/sysctl_net_core.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 194 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f8898a4..d781cf1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1779,6 +1779,19 @@ static inline int unregister_gifconf(unsigned int family)
 	return register_gifconf(family, NULL);
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
+struct sd_flow_limit {
+	u64			count;
+	unsigned int		num_buckets;
+	unsigned int		history_head;
+	u16			history[FLOW_LIMIT_HISTORY];
+	u8			buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 /*
  * Incoming packets are placed on per-cpu queues
  */
@@ -1808,6 +1821,10 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit	*flow_limit;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/net/Kconfig b/net/Kconfig
index 1a22216..a0cbd3b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -268,6 +268,17 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	bool "Flow shedding under load"
+	default n
+	---help---
+	  The network stack has to drop packets when a receive processing CPU's
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/dev.c b/net/core/dev.c
index fad4c38..90190c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3057,6 +3057,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &per_cpu(softnet_data, smp_processor_id());
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3066,13 +3106,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6262,6 +6304,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355..2bf8329 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46a..1a4a5dd 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,97 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				      lockdep_is_held(flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		synchronize_rcu();
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +271,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
-- 
1.8.2.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v3] rps: selective flow shedding during softnet overflow
  2013-04-23 18:46           ` [PATCH net-next v3] " Willem de Bruijn
@ 2013-04-23 19:18             ` Eric Dumazet
  2013-04-23 20:30               ` Willem de Bruijn
  0 siblings, 1 reply; 41+ messages in thread
From: Eric Dumazet @ 2013-04-23 19:18 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, davem, stephen

Hi Willem

On Tue, 2013-04-23 at 14:46 -0400, Willem de Bruijn wrote:

> +
> +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
> +{
> +#ifdef CONFIG_NET_FLOW_LIMIT
> +	struct sd_flow_limit *fl;
> +	struct softnet_data *sd;
> +	unsigned int old_flow, new_flow;
> +
> +	if (qlen < (netdev_max_backlog >> 1))
> +		return false;
> +
> +	sd = &per_cpu(softnet_data, smp_processor_id());

sd = __get_cpu_var(softnet_data);

> +
> +	rcu_read_lock();
> +	fl = rcu_dereference(sd->flow_limit);
> +	if (fl) {
> +		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
> +		old_flow = fl->history[fl->history_head];
> +		fl->history[fl->history_head] = new_flow;
> +
> +		fl->history_head++;
> +		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
> +
> +		if (likely(fl->buckets[old_flow]))
> +			fl->buckets[old_flow]--;
> +
> +		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
> +			fl->count++;
> +			rcu_read_unlock();
> +			return true;
> +		}
> +	}
> +	rcu_read_unlock();
> +#endif
> +	return false;
> +}
> +

...

 
> +#ifdef CONFIG_NET_FLOW_LIMIT
> +static DEFINE_MUTEX(flow_limit_update_mutex);
> +
> +static int flow_limit_cpu_sysctl(ctl_table *table, int write,
> +				 void __user *buffer, size_t *lenp,
> +				 loff_t *ppos)
> +{
> +	struct sd_flow_limit *cur;
> +	struct softnet_data *sd;
> +	cpumask_var_t mask;
> +	int i, len, ret = 0;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	if (write) {
> +		ret = cpumask_parse_user(buffer, *lenp, mask);
> +		if (ret)
> +			goto done;
> +
> +		mutex_lock(&flow_limit_update_mutex);
> +		len = sizeof(*cur) + netdev_flow_limit_table_len;
> +		for_each_possible_cpu(i) {
> +			sd = &per_cpu(softnet_data, i);
> +			cur = rcu_dereference_protected(sd->flow_limit,
> +				      lockdep_is_held(flow_limit_update_mutex));
> +			if (cur && !cpumask_test_cpu(i, mask)) {
> +				RCU_INIT_POINTER(sd->flow_limit, NULL);
> +				synchronize_rcu();
> +				kfree(cur);
> +			} else if (!cur && cpumask_test_cpu(i, mask)) {
> +				cur = kzalloc(len, GFP_KERNEL);
> +				if (!cur) {
> +					/* not unwinding previous changes */
> +					ret = -ENOMEM;
> +					goto write_unlock;
> +				}
> +				cur->num_buckets = netdev_flow_limit_table_len;
> +				rcu_assign_pointer(sd->flow_limit, cur);
> +			}
> +		}
> +write_unlock:
> +		synchronize_rcu();

I believe you do not need this synchronize_rcu() call.

> +		mutex_unlock(&flow_limit_update_mutex);
> +	} else {
> +		if (*ppos || !*lenp) {
> +			*lenp = 0;
> +			goto done;
> +		}
> +
> +		cpumask_clear(mask);
> +		rcu_read_lock();
> +		for_each_possible_cpu(i) {
> +			sd = &per_cpu(softnet_data, i);
> +			if (rcu_dereference(sd->flow_limit))
> +				cpumask_set_cpu(i, mask);
> +		}
> +		rcu_read_unlock();
> +
> +		len = cpumask_scnprintf(buffer, *lenp, mask);
> +		*lenp = len + 1;
> +		*ppos += len + 1;
> +	}
> +
> +done:
> +	free_cpumask_var(mask);
> +	return ret;
> +}
> +

Thanks !

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v3] rps: selective flow shedding during softnet overflow
  2013-04-23 19:18             ` Eric Dumazet
@ 2013-04-23 20:30               ` Willem de Bruijn
  2013-04-23 20:31                 ` [PATCH net-next v4] " Willem de Bruijn
  2013-04-23 20:46                 ` [PATCH net-next v3] " Eric Dumazet
  0 siblings, 2 replies; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-23 20:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David Miller, Stephen Hemminger

On Tue, Apr 23, 2013 at 3:18 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Hi Willem
>
> On Tue, 2013-04-23 at 14:46 -0400, Willem de Bruijn wrote:
>
>> +
>> +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
>> +{
>> +#ifdef CONFIG_NET_FLOW_LIMIT
>> +     struct sd_flow_limit *fl;
>> +     struct softnet_data *sd;
>> +     unsigned int old_flow, new_flow;
>> +
>> +     if (qlen < (netdev_max_backlog >> 1))
>> +             return false;
>> +
>> +     sd = &per_cpu(softnet_data, smp_processor_id());
>
> sd = __get_cpu_var(softnet_data);
>
>> +
>> +     rcu_read_lock();
>> +     fl = rcu_dereference(sd->flow_limit);
>> +     if (fl) {
>> +             new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
>> +             old_flow = fl->history[fl->history_head];
>> +             fl->history[fl->history_head] = new_flow;
>> +
>> +             fl->history_head++;
>> +             fl->history_head &= FLOW_LIMIT_HISTORY - 1;
>> +
>> +             if (likely(fl->buckets[old_flow]))
>> +                     fl->buckets[old_flow]--;
>> +
>> +             if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
>> +                     fl->count++;
>> +                     rcu_read_unlock();
>> +                     return true;
>> +             }
>> +     }
>> +     rcu_read_unlock();
>> +#endif
>> +     return false;
>> +}
>> +
>
> ...
>
>
>> +#ifdef CONFIG_NET_FLOW_LIMIT
>> +static DEFINE_MUTEX(flow_limit_update_mutex);
>> +
>> +static int flow_limit_cpu_sysctl(ctl_table *table, int write,
>> +                              void __user *buffer, size_t *lenp,
>> +                              loff_t *ppos)
>> +{
>> +     struct sd_flow_limit *cur;
>> +     struct softnet_data *sd;
>> +     cpumask_var_t mask;
>> +     int i, len, ret = 0;
>> +
>> +     if (!alloc_cpumask_var(&mask, GFP_KERNEL))
>> +             return -ENOMEM;
>> +
>> +     if (write) {
>> +             ret = cpumask_parse_user(buffer, *lenp, mask);
>> +             if (ret)
>> +                     goto done;
>> +
>> +             mutex_lock(&flow_limit_update_mutex);
>> +             len = sizeof(*cur) + netdev_flow_limit_table_len;
>> +             for_each_possible_cpu(i) {
>> +                     sd = &per_cpu(softnet_data, i);
>> +                     cur = rcu_dereference_protected(sd->flow_limit,
>> +                                   lockdep_is_held(flow_limit_update_mutex));
>> +                     if (cur && !cpumask_test_cpu(i, mask)) {
>> +                             RCU_INIT_POINTER(sd->flow_limit, NULL);
>> +                             synchronize_rcu();
>> +                             kfree(cur);
>> +                     } else if (!cur && cpumask_test_cpu(i, mask)) {
>> +                             cur = kzalloc(len, GFP_KERNEL);
>> +                             if (!cur) {
>> +                                     /* not unwinding previous changes */
>> +                                     ret = -ENOMEM;
>> +                                     goto write_unlock;
>> +                             }
>> +                             cur->num_buckets = netdev_flow_limit_table_len;
>> +                             rcu_assign_pointer(sd->flow_limit, cur);
>> +                     }
>> +             }
>> +write_unlock:
>> +             synchronize_rcu();
>
> I believe you do not need this synchronize_rcu() call.

Because in this special case rcu_assign_pointer always replaces a
NULL value, correct? Thanks again for the feedback! I rebased, reran
the tests and will send v4 with these two changes (only).

>
>> +             mutex_unlock(&flow_limit_update_mutex);
>> +     } else {
>> +             if (*ppos || !*lenp) {
>> +                     *lenp = 0;
>> +                     goto done;
>> +             }
>> +
>> +             cpumask_clear(mask);
>> +             rcu_read_lock();
>> +             for_each_possible_cpu(i) {
>> +                     sd = &per_cpu(softnet_data, i);
>> +                     if (rcu_dereference(sd->flow_limit))
>> +                             cpumask_set_cpu(i, mask);
>> +             }
>> +             rcu_read_unlock();
>> +
>> +             len = cpumask_scnprintf(buffer, *lenp, mask);
>> +             *lenp = len + 1;
>> +             *ppos += len + 1;
>> +     }
>> +
>> +done:
>> +     free_cpumask_var(mask);
>> +     return ret;
>> +}
>> +
>
> Thanks !
>
>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 20:30               ` Willem de Bruijn
@ 2013-04-23 20:31                 ` Willem de Bruijn
  2013-04-23 21:23                   ` Stephen Hemminger
                                     ` (2 more replies)
  2013-04-23 20:46                 ` [PATCH net-next v3] " Eric Dumazet
  1 sibling, 3 replies; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-23 20:31 UTC (permalink / raw)
  To: eric.dumazet, netdev, davem, stephen; +Cc: Willem de Bruijn

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high rate flow (such as a
spoofed source DoS) can exceed a single cpu processing rate and will
degrade throughput of other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog
is above a threshold, IRQ cpus track the ratio of total traffic of
each flow (using 4096 buckets, configurable). The ratio is measured
by counting the number of packets per flow over the last 256 packets
from the source cpu. Any flow that occupies a large fraction of this
(set at 50%) will see packet drop while above the threshold.

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is an extreme case, but the same is
reached with RFS and sufficient kernel processing (iptables, packet
socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>

---

Changes
v4
- remove unnecessary synchronize_rcu after rcu_assign_pointer to NULL ptr
- simplify lookup of current cpu's softnet
v3
- fix race between updates to table_len sysctl during bitmap sysctl.
- fix NULL pointer dereference on alloc failure.
v2
- add fl->num_buckets element to use the actual allocated table length.
- disable the kconfig option by default, as it is workload specific.
---
 include/linux/netdevice.h  |  17 ++++++++
 net/Kconfig                |  11 +++++
 net/core/dev.c             |  48 ++++++++++++++++++++-
 net/core/net-procfs.c      |  16 ++++++-
 net/core/sysctl_net_core.c | 104 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 193 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f8898a4..d781cf1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1779,6 +1779,19 @@ static inline int unregister_gifconf(unsigned int family)
 	return register_gifconf(family, NULL);
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
+struct sd_flow_limit {
+	u64			count;
+	unsigned int		num_buckets;
+	unsigned int		history_head;
+	u16			history[FLOW_LIMIT_HISTORY];
+	u8			buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 /*
  * Incoming packets are placed on per-cpu queues
  */
@@ -1808,6 +1821,10 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit	*flow_limit;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/net/Kconfig b/net/Kconfig
index 1a22216..a0cbd3b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -268,6 +268,17 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	bool "Flow shedding under load"
+	default n
+	---help---
+	  The network stack has to drop packets when a receive processing CPUs
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/dev.c b/net/core/dev.c
index 9e26b8d..c9b7106 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3058,6 +3058,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3067,13 +3107,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6263,6 +6305,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355..2bf8329 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46a..9e3e644 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				      lockdep_is_held(flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
-- 
1.8.2.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v3] rps: selective flow shedding during softnet overflow
  2013-04-23 20:30               ` Willem de Bruijn
  2013-04-23 20:31                 ` [PATCH net-next v4] " Willem de Bruijn
@ 2013-04-23 20:46                 ` Eric Dumazet
  1 sibling, 0 replies; 41+ messages in thread
From: Eric Dumazet @ 2013-04-23 20:46 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, David Miller, Stephen Hemminger

On Tue, 2013-04-23 at 16:30 -0400, Willem de Bruijn wrote:
> On Tue, Apr 23, 2013 at 3:18 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:

> >> +write_unlock:
> >> +             synchronize_rcu();
> >
> > I believe you do not need this synchronize_rcu() call.
> 
> Because in this special case rcu_assign_pointer always replaces a
> NULL value, correct? Thanks again for the feedback! I rebased, reran
> the tests and will send v4 with these two changes (only).

Well, there is no assignment or freeing after the synchronize_rcu();

(mask is a local var only)

This looks like a leftover.
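
For reference, the two update paths being discussed differ only in whether
memory is reclaimed. A minimal sketch (names such as tbl_enable/tbl_disable
are invented here; the authoritative code is the sysctl handler in the patch):

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>

struct tbl {
	unsigned int	num_buckets;
	u8		buckets[];
};

static struct tbl __rcu *active_tbl;
static DEFINE_MUTEX(tbl_update_mutex);

/* Unpublish: readers may still hold the old table, so a grace period
 * must elapse before the memory is reclaimed.
 */
static void tbl_disable(void)
{
	struct tbl *old;

	mutex_lock(&tbl_update_mutex);
	old = rcu_dereference_protected(active_tbl,
					lockdep_is_held(&tbl_update_mutex));
	RCU_INIT_POINTER(active_tbl, NULL);
	mutex_unlock(&tbl_update_mutex);

	synchronize_rcu();	/* needed: memory is about to be freed */
	kfree(old);
}

/* Publish: the pointer goes from NULL to a fully initialised table.
 * rcu_assign_pointer() orders the initialisation before publication,
 * so no synchronize_rcu() is needed afterwards.
 */
static int tbl_enable(unsigned int len)
{
	struct tbl *new;

	new = kzalloc(sizeof(*new) + len, GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->num_buckets = len;

	mutex_lock(&tbl_update_mutex);
	rcu_assign_pointer(active_tbl, new);
	mutex_unlock(&tbl_update_mutex);
	return 0;
}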

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 20:31                 ` [PATCH net-next v4] " Willem de Bruijn
@ 2013-04-23 21:23                   ` Stephen Hemminger
  2013-04-23 21:37                     ` Willem de Bruijn
                                       ` (2 more replies)
  2013-04-23 21:34                   ` Eric Dumazet
  2013-04-23 22:41                   ` David Miller
  2 siblings, 3 replies; 41+ messages in thread
From: Stephen Hemminger @ 2013-04-23 21:23 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: eric.dumazet, netdev, davem

On Tue, 23 Apr 2013 16:31:34 -0400
Willem de Bruijn <willemb@google.com> wrote:

> A cpu executing the network receive path sheds packets when its input
> queue grows to netdev_max_backlog. A single high rate flow (such as a
> spoofed source DoS) can exceed a single cpu processing rate and will
> degrade throughput of other flows hashed onto the same cpu.
> 
> This patch adds a more fine grained hashtable. If the netdev backlog
> is above a threshold, IRQ cpus track the ratio of total traffic of
> each flow (using 4096 buckets, configurable). The ratio is measured
> by counting the number of packets per flow over the last 256 packets
> from the source cpu. Any flow that occupies a large fraction of this
> (set at 50%) will see packet drop while above the threshold.
> 
> Tested:
> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
> each handling 20k req/s. Throughput halves when hit with a 400 kpps
> antagonist storm. With this patch applied, antagonist overload is
> dropped and the server processes its complete load.
> 
> The patch is effective when kernel receive processing is the
> bottleneck. The above RPS scenario is a extreme, but the same is
> reached with RFS and sufficient kernel processing (iptables, packet
> socket tap, ..).
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>

What about just having a smarter ingress qdisc?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 20:31                 ` [PATCH net-next v4] " Willem de Bruijn
  2013-04-23 21:23                   ` Stephen Hemminger
@ 2013-04-23 21:34                   ` Eric Dumazet
  2013-04-23 22:41                   ` David Miller
  2 siblings, 0 replies; 41+ messages in thread
From: Eric Dumazet @ 2013-04-23 21:34 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, davem, stephen

On Tue, 2013-04-23 at 16:31 -0400, Willem de Bruijn wrote:
> A cpu executing the network receive path sheds packets when its input
> queue grows to netdev_max_backlog. A single high rate flow (such as a
> spoofed source DoS) can exceed a single cpu processing rate and will
> degrade throughput of other flows hashed onto the same cpu.
> 
> This patch adds a more fine grained hashtable. If the netdev backlog
> is above a threshold, IRQ cpus track the ratio of total traffic of
> each flow (using 4096 buckets, configurable). The ratio is measured
> by counting the number of packets per flow over the last 256 packets
> from the source cpu. Any flow that occupies a large fraction of this
> (set at 50%) will see packet drop while above the threshold.
> 
> Tested:
> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
> each handling 20k req/s. Throughput halves when hit with a 400 kpps
> antagonist storm. With this patch applied, antagonist overload is
> dropped and the server processes its complete load.
> 
> The patch is effective when kernel receive processing is the
> bottleneck. The above RPS scenario is a extreme, but the same is
> reached with RFS and sufficient kernel processing (iptables, packet
> socket tap, ..).
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> 
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 21:23                   ` Stephen Hemminger
@ 2013-04-23 21:37                     ` Willem de Bruijn
  2013-04-23 21:37                     ` Eric Dumazet
  2013-04-23 22:33                     ` David Miller
  2 siblings, 0 replies; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-23 21:37 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, netdev, David Miller

On Tue, Apr 23, 2013 at 5:23 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 23 Apr 2013 16:31:34 -0400
> Willem de Bruijn <willemb@google.com> wrote:
>
>> A cpu executing the network receive path sheds packets when its input
>> queue grows to netdev_max_backlog. A single high rate flow (such as a
>> spoofed source DoS) can exceed a single cpu processing rate and will
>> degrade throughput of other flows hashed onto the same cpu.
>>
>> This patch adds a more fine grained hashtable. If the netdev backlog
>> is above a threshold, IRQ cpus track the ratio of total traffic of
>> each flow (using 4096 buckets, configurable). The ratio is measured
>> by counting the number of packets per flow over the last 256 packets
>> from the source cpu. Any flow that occupies a large fraction of this
>> (set at 50%) will see packet drop while above the threshold.
>>
>> Tested:
>> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
>> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
>> each handling 20k req/s. Throughput halves when hit with a 400 kpps
>> antagonist storm. With this patch applied, antagonist overload is
>> dropped and the server processes its complete load.
>>
>> The patch is effective when kernel receive processing is the
>> bottleneck. The above RPS scenario is a extreme, but the same is
>> reached with RFS and sufficient kernel processing (iptables, packet
>> socket tap, ..).
>>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>
> What about just having a smarter ingress qdisc?

For filtering, which is what this patch does, that is an interesting approach.
Similar to fanout rollover, I plan to evaluate redistributing overload
instead of filtering. That is not acceptable for TCP connections due
to reordering, but may help protect against TCP SYN floods, where most
packets will not be part of a connection and can be processed by any
cpu with cycles to spare. Moreover, all this processing takes place in
the kernel receive path, so this is the type of workload that is most
likely to overflow the input_pkt_queue. Frankly, that part requires
much more evaluation to see if it makes sense, which is why I had not
made this context explicit: filtering by itself is already useful. An
ingress qdisc is worth evaluating in that regard.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 21:23                   ` Stephen Hemminger
  2013-04-23 21:37                     ` Willem de Bruijn
@ 2013-04-23 21:37                     ` Eric Dumazet
  2013-04-23 21:52                       ` Stephen Hemminger
  2013-04-23 22:33                     ` David Miller
  2 siblings, 1 reply; 41+ messages in thread
From: Eric Dumazet @ 2013-04-23 21:37 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Willem de Bruijn, netdev, davem

On Tue, 2013-04-23 at 14:23 -0700, Stephen Hemminger wrote:

> What about just having a smarter ingress qdisc?

What are your ideas ?

Setting up an ingress qdisc on Linux is no fun, and it is not scalable.

It's ok for playing with netem and low bandwidth.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 21:37                     ` Eric Dumazet
@ 2013-04-23 21:52                       ` Stephen Hemminger
  2013-04-23 22:34                         ` David Miller
  2013-04-24  0:09                         ` Eric Dumazet
  0 siblings, 2 replies; 41+ messages in thread
From: Stephen Hemminger @ 2013-04-23 21:52 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Willem de Bruijn, netdev, davem

On Tue, 23 Apr 2013 14:37:43 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Tue, 2013-04-23 at 14:23 -0700, Stephen Hemminger wrote:
> 
> > What about just having a smarter ingress qdisc?
> 
> What are your ideas ?
> 
> Setting ingress qdisc on linux is no fun, and not scalable.
> 
> Its ok for playing with netem and low bandwidth.
> 
> 

I just don't want to get tied down to one hard-coded policy.
Users seem to have different ideas about what constitutes a flow and what the drop policy should be.
The existing ingress qdisc is inflexible, and ifb is a pain to set up and adds
another queue transition.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 21:23                   ` Stephen Hemminger
  2013-04-23 21:37                     ` Willem de Bruijn
  2013-04-23 21:37                     ` Eric Dumazet
@ 2013-04-23 22:33                     ` David Miller
  2 siblings, 0 replies; 41+ messages in thread
From: David Miller @ 2013-04-23 22:33 UTC (permalink / raw)
  To: stephen; +Cc: willemb, eric.dumazet, netdev

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 23 Apr 2013 14:23:33 -0700

> What about just having a smarter ingress qdisc?

Ingress qdiscs go through a single lock, and that is not likely to
change any time soon, nor do I find it reasonable to be required to
use ingress qdiscs to handle this problem.

I find Willem's changes extremely reasonable, and a good improvement
we should integrate now rather than pretending ingress qdiscs are an
acceptable alternative.  They aren't.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 21:52                       ` Stephen Hemminger
@ 2013-04-23 22:34                         ` David Miller
  2013-04-24  0:09                         ` Eric Dumazet
  1 sibling, 0 replies; 41+ messages in thread
From: David Miller @ 2013-04-23 22:34 UTC (permalink / raw)
  To: stephen; +Cc: eric.dumazet, willemb, netdev

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 23 Apr 2013 14:52:22 -0700

> I just don't want to get tied down to one hard coded policy.  User
> seem have different ideas about what constitutes a flow and what
> policy for drop should be.  Existing ingress qdisc is inflexible and
> ifb is a pain to setup and adds another queue transistion.

There is no hard-coded policy.

The user defines his policy by which cpu he executes his socket reads and
writes on, and RFS reacts in kind.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 20:31                 ` [PATCH net-next v4] " Willem de Bruijn
  2013-04-23 21:23                   ` Stephen Hemminger
  2013-04-23 21:34                   ` Eric Dumazet
@ 2013-04-23 22:41                   ` David Miller
  2013-04-23 23:11                     ` Eric Dumazet
  2013-04-24  0:00                     ` Willem de Bruijn
  2 siblings, 2 replies; 41+ messages in thread
From: David Miller @ 2013-04-23 22:41 UTC (permalink / raw)
  To: willemb; +Cc: eric.dumazet, netdev, stephen

From: Willem de Bruijn <willemb@google.com>
Date: Tue, 23 Apr 2013 16:31:34 -0400

> - disable the kconfig option by default, as it is workload specific.

If this logic only kicks in during overload, I see no reason why we
shouldn't have this protection enabled unconditionally, all the time.

Does it hurt performance under normal workloads?

If the new logic is controlled by overload thresholds then it should
have no impact whatsoever in non-overload scenarios.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 22:41                   ` David Miller
@ 2013-04-23 23:11                     ` Eric Dumazet
  2013-04-23 23:15                       ` David Miller
  2013-04-24  0:00                     ` Willem de Bruijn
  1 sibling, 1 reply; 41+ messages in thread
From: Eric Dumazet @ 2013-04-23 23:11 UTC (permalink / raw)
  To: David Miller; +Cc: willemb, netdev, stephen

On Tue, 2013-04-23 at 18:41 -0400, David Miller wrote:
> From: Willem de Bruijn <willemb@google.com>
> Date: Tue, 23 Apr 2013 16:31:34 -0400
> 
> > - disable the kconfig option by default, as it is workload specific.
> 
> If this logic only kicks in during overload, I see no reason why we
> shouldn't have this protection enabled unconditionally, all the time.
> 
> Does it hurt performance under normal workloads?
> 
> If the new logic is controlled by overload thresholds then it should
> have no impact whatsoever in non-overload scenerios.

Exactly.

I guess we should at least not include this code on !SMP builds.

This adds some code, but as long as you don't write into
flow_limit_cpu_bitmap/flow_limit_table_len we won't allocate the memory,
so the feature stays disabled.

Then, if enabled, it only kicks in if the number of queued packets is
above half the max backlog (netdev_max_backlog >> 1).

So in the non-overload scenario, the cost is 0.

In the overload scenario, the loaded cpu keeps a 512-byte array hot in
its cache (fl->history[]).
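
For readers following the mechanism under discussion, here is a stand-alone
user-space model of that sliding window (a sketch with invented names; the
authoritative code is skb_flow_limit() in the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLOW_LIMIT_HISTORY	(1 << 8)	/* 256 samples, power of 2 */
#define NUM_BUCKETS		(1 << 12)	/* 4096 buckets, power of 2 */

struct flow_limit_model {
	unsigned int	history_head;
	uint16_t	history[FLOW_LIMIT_HISTORY];
	uint8_t		buckets[NUM_BUCKETS];
};

/* Returns true if the packet with this rxhash should be dropped. */
static bool flow_limit_model_drop(struct flow_limit_model *fl, uint32_t rxhash)
{
	unsigned int new_flow = rxhash & (NUM_BUCKETS - 1);
	unsigned int old_flow = fl->history[fl->history_head];

	/* Overwrite the oldest sample in the ring with the newest one. */
	fl->history[fl->history_head] = new_flow;
	fl->history_head = (fl->history_head + 1) & (FLOW_LIMIT_HISTORY - 1);

	if (fl->buckets[old_flow])
		fl->buckets[old_flow]--;

	/* Shed while this flow holds more than 50% of the window. */
	return ++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1);
}

int main(void)
{
	static struct flow_limit_model fl;
	unsigned int i, dropped = 0;

	/* 90% of packets belong to one flow (hash 42), the rest are spread. */
	for (i = 0; i < 10000; i++) {
		uint32_t hash = (i % 10) ? 42 : i;

		if (flow_limit_model_drop(&fl, hash))
			dropped++;
	}
	printf("dropped %u of 10000 packets\n", dropped);
	return 0;
}

The fixed-size history ring is what keeps the per-packet cost to a handful
of array updates once the backlog threshold is exceeded.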

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 23:11                     ` Eric Dumazet
@ 2013-04-23 23:15                       ` David Miller
  2013-04-23 23:26                         ` Eric Dumazet
  2013-04-24  0:03                         ` Stephen Hemminger
  0 siblings, 2 replies; 41+ messages in thread
From: David Miller @ 2013-04-23 23:15 UTC (permalink / raw)
  To: eric.dumazet; +Cc: willemb, netdev, stephen

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 23 Apr 2013 16:11:23 -0700

> I guess we should at least not include this code on !SMP builds

I think CONFIG_RPS is already an appropriate guard for something like
this.  And it depends upon SMP et al.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 23:15                       ` David Miller
@ 2013-04-23 23:26                         ` Eric Dumazet
  2013-04-24  0:03                         ` Stephen Hemminger
  1 sibling, 0 replies; 41+ messages in thread
From: Eric Dumazet @ 2013-04-23 23:26 UTC (permalink / raw)
  To: David Miller; +Cc: willemb, netdev, stephen

On Tue, 2013-04-23 at 19:15 -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 23 Apr 2013 16:11:23 -0700
> 
> > I guess we should at least not include this code on !SMP builds
> 
> I think CONFIG_RPS already an appropriate guard for something like
> this.  And it depends upon SMP et al.

Yes, something like :

config NET_FLOW_LIMIT
	bool "Flow shedding under load"
	depends on RPS
	default y
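
(With such an entry the code is only compiled in; per the patch, shedding
becomes active on a cpu only once its bit is written to
/proc/sys/net/core/flow_limit_cpu_bitmap, which is also when the per-cpu
table is allocated, sized by the then-current flow_limit_table_len.)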

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 22:41                   ` David Miller
  2013-04-23 23:11                     ` Eric Dumazet
@ 2013-04-24  0:00                     ` Willem de Bruijn
  1 sibling, 0 replies; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-24  0:00 UTC (permalink / raw)
  To: David Miller; +Cc: Eric Dumazet, netdev, Stephen Hemminger

On Tue, Apr 23, 2013 at 6:41 PM, David Miller <davem@davemloft.net> wrote:
> From: Willem de Bruijn <willemb@google.com>
> Date: Tue, 23 Apr 2013 16:31:34 -0400
>
>> - disable the kconfig option by default, as it is workload specific.
>
> If this logic only kicks in during overload, I see no reason why we
> shouldn't have this protection enabled unconditionally, all the time.
>
> Does it hurt performance under normal workloads?
>
> If the new logic is controlled by overload thresholds then it should
> have no impact whatsoever in non-overload scenerios.

It only adds one branch in that case. I was (probably too) conservative.

Do you want me to resubmit the entire patch with the Kconfig
dependency on RPS that Eric proposed?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 23:15                       ` David Miller
  2013-04-23 23:26                         ` Eric Dumazet
@ 2013-04-24  0:03                         ` Stephen Hemminger
  1 sibling, 0 replies; 41+ messages in thread
From: Stephen Hemminger @ 2013-04-24  0:03 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, willemb, netdev

On Tue, 23 Apr 2013 19:15:40 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 23 Apr 2013 16:11:23 -0700
> 
> > I guess we should at least not include this code on !SMP builds
> 
> I think CONFIG_RPS already an appropriate guard for something like
> this.  And it depends upon SMP et al.
> 

I didn't mean to say flow shedding is a bad idea; I would just love
to see a general solution. We are all concerned with DoS attacks.
If you haven't been hit yet, somebody you know already has been.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-23 21:52                       ` Stephen Hemminger
  2013-04-23 22:34                         ` David Miller
@ 2013-04-24  0:09                         ` Eric Dumazet
  2013-04-24  0:37                           ` [PATCH net-next v5] " Willem de Bruijn
  2013-04-24  1:25                           ` [PATCH net-next v4] " Jamal Hadi Salim
  1 sibling, 2 replies; 41+ messages in thread
From: Eric Dumazet @ 2013-04-24  0:09 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Willem de Bruijn, netdev, davem

On Tue, 2013-04-23 at 14:52 -0700, Stephen Hemminger wrote:

> I just don't want to get tied down to one hard coded policy.
> User seem have different ideas about what constitutes a flow and what policy for drop should be.
> Existing ingress qdisc is inflexible and ifb is a pain to setup and adds
> another queue transistion.

The qdisc code has a hardcoded dev_hard_start_xmit() call, which is why
the ifb hack is used. Not to mention device flow control.

It might be possible to use a q->xmit() method instead, so that a qdisc
can be used on ingress without ifb.

Then we would have to allow one qdisc per RX queue, and not use the
qdisc lock (assuming NAPI protects us from reentrancy).

So the napi device handler would queue skbs in the qdisc (q->enqueue()),
allowing a standing queue to build so that some clever qdisc can drop
selected packets.

It is not really clear how we would allow packets to be delivered to
another queue (RPS/RFS), nor how/when to do the qdisc_run() to dequeue
packets and deliver them to the stack.

I don't know, this looks like a lot of changes.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH net-next v5] rps: selective flow shedding during softnet overflow
  2013-04-24  0:09                         ` Eric Dumazet
@ 2013-04-24  0:37                           ` Willem de Bruijn
  2013-04-24  1:07                             ` Eric Dumazet
  2013-04-25  8:20                             ` David Miller
  2013-04-24  1:25                           ` [PATCH net-next v4] " Jamal Hadi Salim
  1 sibling, 2 replies; 41+ messages in thread
From: Willem de Bruijn @ 2013-04-24  0:37 UTC (permalink / raw)
  To: eric.dumazet, netdev, davem, stephen; +Cc: Willem de Bruijn

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high rate flow (such as a
spoofed source DoS) can exceed a single cpu processing rate and will
degrade throughput of other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog
is above a threshold, IRQ cpus track the ratio of total traffic of
each flow (using 4096 buckets, configurable). The ratio is measured
by counting the number of packets per flow over the last 256 packets
from the source cpu. Any flow that occupies a large fraction of this
(set at 50%) will see packet drop while above the threshold.

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is an extreme case, but the same is
reached with RFS and sufficient kernel processing (iptables, packet
socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>

---

Changes
v5
- depend on RPS, automatically build if RPS is enabled.
v4
- remove unnecessary synchronize_rcu after rcu_assign_pointer to NULL ptr
- simplify lookup of current cpu's softnet
v3
- fix race between updates to table_len sysctl during bitmap sysctl.
- fix NULL pointer dereference on alloc failure.
v2
- add fl->num_buckets element to use the actual allocated table length.
- disable the kconfig option by default, as it is workload specific.
---
 include/linux/netdevice.h  |  17 ++++++++
 net/Kconfig                |  12 ++++++
 net/core/dev.c             |  48 ++++++++++++++++++++-
 net/core/net-procfs.c      |  16 ++++++-
 net/core/sysctl_net_core.c | 104 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 194 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f8898a4..d781cf1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1779,6 +1779,19 @@ static inline int unregister_gifconf(unsigned int family)
 	return register_gifconf(family, NULL);
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
+struct sd_flow_limit {
+	u64			count;
+	unsigned int		num_buckets;
+	unsigned int		history_head;
+	u16			history[FLOW_LIMIT_HISTORY];
+	u8			buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 /*
  * Incoming packets are placed on per-cpu queues
  */
@@ -1808,6 +1821,10 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit	*flow_limit;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/net/Kconfig b/net/Kconfig
index 1a22216..02ebc71 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -268,6 +268,18 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	bool "Flow shedding under load"
+	depends on RPS
+	default y
+	---help---
+	  The network stack has to drop packets when a receive processing CPUs
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/dev.c b/net/core/dev.c
index 9e26b8d..c9b7106 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3058,6 +3058,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3067,13 +3107,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6263,6 +6305,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355..2bf8329 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46a..9e3e644 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				      lockdep_is_held(flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
-- 
1.8.2.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v5] rps: selective flow shedding during softnet overflow
  2013-04-24  0:37                           ` [PATCH net-next v5] " Willem de Bruijn
@ 2013-04-24  1:07                             ` Eric Dumazet
  2013-04-25  8:20                             ` David Miller
  1 sibling, 0 replies; 41+ messages in thread
From: Eric Dumazet @ 2013-04-24  1:07 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, davem, stephen

On Tue, 2013-04-23 at 20:37 -0400, Willem de Bruijn wrote:
> A cpu executing the network receive path sheds packets when its input
> queue grows to netdev_max_backlog. A single high rate flow (such as a
> spoofed source DoS) can exceed a single cpu processing rate and will
> degrade throughput of other flows hashed onto the same cpu.
> 
> This patch adds a more fine grained hashtable. If the netdev backlog
> is above a threshold, IRQ cpus track the ratio of total traffic of
> each flow (using 4096 buckets, configurable). The ratio is measured
> by counting the number of packets per flow over the last 256 packets
> from the source cpu. Any flow that occupies a large fraction of this
> (set at 50%) will see packet drop while above the threshold.
> 
> Tested:
> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
> each handling 20k req/s. Throughput halves when hit with a 400 kpps
> antagonist storm. With this patch applied, antagonist overload is
> dropped and the server processes its complete load.
> 
> The patch is effective when kernel receive processing is the
> bottleneck. The above RPS scenario is a extreme, but the same is
> reached with RFS and sufficient kernel processing (iptables, packet
> socket tap, ..).
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> 


>  /*
>   * Incoming packets are placed on per-cpu queues
>   */
> @@ -1808,6 +1821,10 @@ struct softnet_data {
>  	unsigned int		dropped;
>  	struct sk_buff_head	input_pkt_queue;
>  	struct napi_struct	backlog;
> +
> +#ifdef CONFIG_NET_FLOW_LIMIT
> +	struct sd_flow_limit	*flow_limit;
> +#endif
>  };


I guess flow_limit could be put before csd, as it's a read-only field
for its owner.
This needs more testing and is a minor detail.

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-24  0:09                         ` Eric Dumazet
  2013-04-24  0:37                           ` [PATCH net-next v5] " Willem de Bruijn
@ 2013-04-24  1:25                           ` Jamal Hadi Salim
  2013-04-24  1:32                             ` Eric Dumazet
  1 sibling, 1 reply; 41+ messages in thread
From: Jamal Hadi Salim @ 2013-04-24  1:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stephen Hemminger, Willem de Bruijn, netdev, davem

On 13-04-23 08:09 PM, Eric Dumazet wrote:

> qdisc code has a hardcoded dev_hard_start_xmit() call, thats why ifb
> hack is used. Not mentioning device flow control.
>
> It might be possible to use a q->xmit() method instead, so that it can
> be used on ingress without ifb.
>

If I understood correctly what you are trying to achieve:
I don't think one qdisc per rx queue/ring will work well here,
since the qdisc is attached per netdev.
That is, when packets are fanned out across cpu backlogs, as long
as they came in via the same netdev queue they are going to share
the same lock with all the other cpus those packets have been fanned
out to, the moment you attach an ingress qdisc to that netdev ring/queue.

One unorthodox approach is to have a qdisc per backlog queue,
since the backlog is per cpu; given that it is abstracted as a netdev,
it becomes a natural fit (sans the fact that the backlog queue is
unidirectional).

cheers,
jamal

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-24  1:25                           ` [PATCH net-next v4] " Jamal Hadi Salim
@ 2013-04-24  1:32                             ` Eric Dumazet
  2013-04-24  1:44                               ` Jamal Hadi Salim
  0 siblings, 1 reply; 41+ messages in thread
From: Eric Dumazet @ 2013-04-24  1:32 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: Stephen Hemminger, Willem de Bruijn, netdev, davem

On Tue, 2013-04-23 at 21:25 -0400, Jamal Hadi Salim wrote:
> On 13-04-23 08:09 PM, Eric Dumazet wrote:
> 
> > qdisc code has a hardcoded dev_hard_start_xmit() call, thats why ifb
> > hack is used. Not mentioning device flow control.
> >
> > It might be possible to use a q->xmit() method instead, so that it can
> > be used on ingress without ifb.
> >
> 
> If i understood correctly what you are trying to achieve:
> I dont think one qdisc per rx queue/ring will work well in
> presence of qdisc since the qdisc is attached per netdev.

MQ permits to have one qdisc per TX queue.

It would be the same concept in ingress.

> i.e when packets are fanned out across cpu backlogs, as long
> as they came in via same netdev queue, they are going to share
> the same lock with all other cpus such packets have been fanned out to
> the moment you attach an ingress qdisc to that netdev ring/queue.
> 

Not sure what you mean. The qdisc stuff would replace the 'cpu backlog',
not be added to it. Think of having the possibility to control the
backlog using standard qdiscs, like fq_codel ;)

> One unorthodox approach is to have a qdisc per backlog queue
> since the backlog is per cpu; given it is abstracted as a netdev,
> it becomes a natural fit (sans the fact backlog queue is
> unidirectional).

Yes, but the per-cpu backlog is shared by all devices. We probably want
a different qdisc for a gre tunnel, eth0, ...

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-24  1:32                             ` Eric Dumazet
@ 2013-04-24  1:44                               ` Jamal Hadi Salim
  2013-04-24  2:11                                 ` Eric Dumazet
  0 siblings, 1 reply; 41+ messages in thread
From: Jamal Hadi Salim @ 2013-04-24  1:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stephen Hemminger, Willem de Bruijn, netdev, davem

[-- Attachment #1: Type: text/plain, Size: 839 bytes --]

On 13-04-23 09:32 PM, Eric Dumazet wrote:
> On Tue, 2013-04-23 at 21:25 -0400, Jamal Hadi Salim wrote:


> Not sure what you mean. The qdisc stuff would replace the 'cpu backlog',

Aha ;->
So you would have many little backlogs, one per ring per cpu, correct?


> not be added to it. Think of having possibility to control backlog using
> standard qdiscs, like fq_codel ;)

Excellent. So this is not as big a surgery as it sounds, then.
The backloglets just need to be exposed as netdevs.

> Yes, but the per cpu backlog is shared for all devices. We probably want
> different qdisc for gre tunnel, eth0, ...

Makes sense.

BTW, looking at __skb_get_rxhash(), if I had a driver that sets
skb->rxhash (picking it off the dma descriptor), could I not use that
instead of computing the hash? Something like the attached patch.

cheers,
jamal

[-- Attachment #2: p1 --]
[-- Type: text/plain, Size: 412 bytes --]

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index e187bf0..a6abee0 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -159,8 +159,9 @@ void __skb_get_rxhash(struct sk_buff *skb)
 	struct flow_keys keys;
 	u32 hash;
 
-	if (!skb_flow_dissect(skb, &keys))
+	if (skb->rxhash || !skb_flow_dissect(skb, &keys)) {
 		return;
+	}
 
 	if (keys.ports)
 		skb->l4_rxhash = 1;

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-24  1:44                               ` Jamal Hadi Salim
@ 2013-04-24  2:11                                 ` Eric Dumazet
  2013-04-24 13:00                                   ` Jamal Hadi Salim
  0 siblings, 1 reply; 41+ messages in thread
From: Eric Dumazet @ 2013-04-24  2:11 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: Stephen Hemminger, Willem de Bruijn, netdev, davem

On Tue, 2013-04-23 at 21:44 -0400, Jamal Hadi Salim wrote:

> 
> BTW, looking at __skb_get_rxhash(), if i had a driver that sets either
> skb->rxhash (picks it off the dma descriptor), could i not use that 
> instead of computing the hash? something like attached patch.
> 

The caller does this already ;)

static inline __u32 skb_get_rxhash(struct sk_buff *skb)
{
        if (!skb->l4_rxhash)
                __skb_get_rxhash(skb);

        return skb->rxhash;
}

Rationale being : if l4 rxhash was already provided, use it.

AFAIK, only bnx2x provides this.

For other cases, we prefer trying a software rxhash, as it gives us more
capabilities than the standard Toeplitz hash (which is not l4 for UDP
flows, for example).

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-24  2:11                                 ` Eric Dumazet
@ 2013-04-24 13:00                                   ` Jamal Hadi Salim
  2013-04-24 14:41                                     ` Eric Dumazet
  0 siblings, 1 reply; 41+ messages in thread
From: Jamal Hadi Salim @ 2013-04-24 13:00 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stephen Hemminger, Willem de Bruijn, netdev, davem

On 13-04-23 10:11 PM, Eric Dumazet wrote:

>
> The caller does this already ;)

[..]
>
> Rationale being : if l4 rxhash was already provided, use it.
>
> AFAIK, only bnx2x provides this.
>
 > For other cases, we prefer trying a software rxhash, as it gives us
 > more
 > capabilities than the standard Toepliz hash (Not l4 for UDP flows for
 > example)
 >


I forgot about the Toeplitz hash connection. I can see it makes sense here.

Let me clarify:
In the scenario I am thinking of, I have clever hardware which is smart
enough to deal with the details of identifying flow state (including
fragmentation, etc.) and tagging it in a DMA descriptor with a 32-bit id.
I want to be able to take the tag produced by the hardware and use
that for rps cpu selection, i.e. assume the hardware has already done the
hashing and is giving me a 32-bit id. My initial thought was that skb->rxhash
is the right spot to store this; then make get_rps_cpu() do the
selection based on it. l4_rxhash is 1 bit, which is too small.

cheers,
jamal

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v4] rps: selective flow shedding during softnet overflow
  2013-04-24 13:00                                   ` Jamal Hadi Salim
@ 2013-04-24 14:41                                     ` Eric Dumazet
  0 siblings, 0 replies; 41+ messages in thread
From: Eric Dumazet @ 2013-04-24 14:41 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: Stephen Hemminger, Willem de Bruijn, netdev, davem

On Wed, 2013-04-24 at 09:00 -0400, Jamal Hadi Salim wrote:
> I forgot about the Toepliz hash connection. I can see it makes sense here.
> 
> Let me clarify:
> In the scenario i am thinking of, I have clever hardware which is smart 
> enough to deal with details of identifying flow state(including 
> fragementation etc) and tagging it in a DMA descriptor with 32 bit id.
> I want to be able to take the tag produced by the hardware and use
> that for rps cpu selection i.e assume the hardware has already done the
> hashing and is giving me a 32 bit id. My initial thought was skb->rxhash
> is the right spot to store this; then make get_rps_cpu() do the
> selection based on this. l4 rxhash is 1 bit which is too small.

Set skb->rxhash to the hash your hardware computed, and skb->l4_rxhash
to 1.

Then get_rps_cpu() will happily use skb->rxhash

(and other callers of skb_get_rxhash() as well).

Not clear what you mean by fragmentation: fragmented frames have no
flow information (only the first fragment does).
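
In driver terms that amounts to something like the sketch below; the
mydrv_* names and the descriptor layout are invented for illustration:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical descriptor and ring layout. */
struct mydrv_rx_desc {
	__le32			flow_id;	/* 32-bit flow tag from hardware */
	/* ... */
};

struct mydrv_ring {
	struct net_device	*netdev;
	struct napi_struct	napi;
};

static void mydrv_rx_skb(struct mydrv_ring *ring, struct mydrv_rx_desc *desc,
			 struct sk_buff *skb)
{
	/* Hand the hardware flow id to the stack as an l4 hash, so that
	 * get_rps_cpu() and skb_get_rxhash() use it as-is instead of
	 * falling back to software flow dissection.
	 */
	skb->rxhash = le32_to_cpu(desc->flow_id);
	skb->l4_rxhash = 1;

	skb->protocol = eth_type_trans(skb, ring->netdev);
	napi_gro_receive(&ring->napi, skb);
}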

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v5] rps: selective flow shedding during softnet overflow
  2013-04-24  0:37                           ` [PATCH net-next v5] " Willem de Bruijn
  2013-04-24  1:07                             ` Eric Dumazet
@ 2013-04-25  8:20                             ` David Miller
  2013-05-20 14:02                               ` [PATCH net-next v6] " Willem de Bruijn
  1 sibling, 1 reply; 41+ messages in thread
From: David Miller @ 2013-04-25  8:20 UTC (permalink / raw)
  To: willemb; +Cc: eric.dumazet, netdev, stephen

From: Willem de Bruijn <willemb@google.com>
Date: Tue, 23 Apr 2013 20:37:27 -0400

> A cpu executing the network receive path sheds packets when its input
> queue grows to netdev_max_backlog. A single high rate flow (such as a
> spoofed source DoS) can exceed a single cpu processing rate and will
> degrade throughput of other flows hashed onto the same cpu.
> 
> This patch adds a more fine grained hashtable. If the netdev backlog
> is above a threshold, IRQ cpus track the ratio of total traffic of
> each flow (using 4096 buckets, configurable). The ratio is measured
> by counting the number of packets per flow over the last 256 packets
> from the source cpu. Any flow that occupies a large fraction of this
> (set at 50%) will see packet drop while above the threshold.
> 
> Tested:
> Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
> each handling 20k req/s. Throughput halves when hit with a 400 kpps
> antagonist storm. With this patch applied, antagonist overload is
> dropped and the server processes its complete load.
> 
> The patch is effective when kernel receive processing is the
> bottleneck. The above RPS scenario is an extreme case, but the same is
> reached with RFS and sufficient kernel processing (iptables, packet
> socket tap, ..).
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>

This doesn't compile:

net/core/sysctl_net_core.c: In function ‘flow_limit_cpu_sysctl’:
net/core/sysctl_net_core.c:114:10: error: invalid type argument of ‘->’ (have ‘struct mutex’)

Also, please change the Kconfig entry to be:

config NET_FLOW_LIMIT
	boolean
	depends on RPS
	default y


like RPS et al. are.

Thanks.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH net-next v6] rps: selective flow shedding during softnet overflow
  2013-04-25  8:20                             ` David Miller
@ 2013-05-20 14:02                               ` Willem de Bruijn
  2013-05-20 16:00                                 ` Eric Dumazet
  0 siblings, 1 reply; 41+ messages in thread
From: Willem de Bruijn @ 2013-05-20 14:02 UTC (permalink / raw)
  To: davem, eric.dumazet, netdev; +Cc: Willem de Bruijn

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high rate flow (such as a
spoofed source DoS) can exceed a single cpu processing rate and will
degrade throughput of other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog
is above a threshold, IRQ cpus track the ratio of total traffic of
each flow (using 4096 buckets, configurable). The ratio is measured
by counting the number of packets per flow over the last 256 packets
from the source cpu. Any flow that occupies a large fraction of this
(set at 50%) will see packet drop while above the threshold.

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is an extreme case, but the same is
reached with RFS and sufficient kernel processing (iptables, packet
socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>

---

Changes
v6
- bugfix: lockdep_is_held bug with CONFIG_PROVE_RCU
  (now built with allyesconfig)
- kconfig: update to follow format of RPS
v5
- depend on RPS, automatically build if RPS is enabled.
v4
- remove unnecessary synchronize_rcu after rcu_assign_pointer to NULL ptr
- simplify lookup of current cpu's softnet
v3
- fix race between updates to table_len sysctl during bitmap sysctl.
- fix NULL pointer dereference on alloc failure.
v2
- add fl->num_buckets element to use the actual allocated table length.
- disable the kconfig option by default, as it is workload specific.
---
 include/linux/netdevice.h  |  17 ++++++++
 net/Kconfig                |  12 ++++++
 net/core/dev.c             |  48 ++++++++++++++++++++-
 net/core/net-procfs.c      |  16 ++++++-
 net/core/sysctl_net_core.c | 104 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 194 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a94a5a0..7dd535d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1778,6 +1778,19 @@ static inline int unregister_gifconf(unsigned int family)
 	return register_gifconf(family, NULL);
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+#define FLOW_LIMIT_HISTORY	(1 << 8)	/* must be ^2 */
+struct sd_flow_limit {
+	u64			count;
+	unsigned int		num_buckets;
+	unsigned int		history_head;
+	u16			history[FLOW_LIMIT_HISTORY];
+	u8			buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 /*
  * Incoming packets are placed on per-cpu queues
  */
@@ -1807,6 +1820,10 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit	*flow_limit;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc904..08de901 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -259,6 +259,18 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	boolean
+	depends on RPS
+	default y
+	---help---
+	  The network stack has to drop packets when a receive processing CPU's
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/dev.c b/net/core/dev.c
index 18e9730..7229bc3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3064,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3073,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6269,6 +6311,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355..2bf8329 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46a..741db5fc 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				     lockdep_is_held(&flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
-- 
1.8.2.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v6] rps: selective flow shedding during softnet overflow
  2013-05-20 14:02                               ` [PATCH net-next v6] " Willem de Bruijn
@ 2013-05-20 16:00                                 ` Eric Dumazet
  2013-05-20 16:08                                   ` Willem de Bruijn
  2013-05-20 20:48                                   ` David Miller
  0 siblings, 2 replies; 41+ messages in thread
From: Eric Dumazet @ 2013-05-20 16:00 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: davem, netdev

On Mon, 2013-05-20 at 10:02 -0400, Willem de Bruijn wrote:
> A cpu executing the network receive path sheds packets when its input
> queue grows to netdev_max_backlog. A single high rate flow (such as a
> spoofed source DoS) can exceed a single cpu processing rate and will
> degrade throughput of other flows hashed onto the same cpu.
> 
> This patch adds a more fine grained hashtable. If the netdev backlog
> is above a threshold, IRQ cpus track the ratio of total traffic of
> each flow (using 4096 buckets, configurable). The ratio is measured
> by counting the number of packets per flow over the last 256 packets
> from the source cpu. Any flow that occupies a large fraction of this
> (set at 50%) will see packet drop while above the threshold.
> 
> Tested:
> Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
> each handling 20k req/s. Throughput halves when hit with a 400 kpps
> antagonist storm. With this patch applied, antagonist overload is
> dropped and the server processes its complete load.
> 
> The patch is effective when kernel receive processing is the
> bottleneck. The above RPS scenario is an extreme case, but the same is
> reached with RFS and sufficient kernel processing (iptables, packet
> socket tap, ..).
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> 
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

Willemb, are you planning to add a section in
Documentation/networking/scaling.txt?

Thanks

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v6] rps: selective flow shedding during softnet overflow
  2013-05-20 16:00                                 ` Eric Dumazet
@ 2013-05-20 16:08                                   ` Willem de Bruijn
  2013-05-20 20:48                                   ` David Miller
  1 sibling, 0 replies; 41+ messages in thread
From: Willem de Bruijn @ 2013-05-20 16:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

> Willemb, are you planning to add a section in
> Documentation/networking/scaling.txt?

Good point, I should. I will send a follow-on patch documenting the
mechanism and sysctls.
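
For reference, a minimal sketch of driving the two sysctls this patch adds
(paths follow the net.core table in the patch; the mask value is only an
example):

	# optionally resize the per-softnet table first; must be a power of 2
	echo 8192 > /proc/sys/net/core/flow_limit_table_len

	# then enable flow limiting on the cpus that run receive processing
	echo f > /proc/sys/net/core/flow_limit_cpu_bitmap

Writing the bitmap is what allocates (or frees) the per-cpu tables, so the
table length should be set before the cpus are enabled in the mask.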

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH net-next v6] rps: selective flow shedding during softnet overflow
  2013-05-20 16:00                                 ` Eric Dumazet
  2013-05-20 16:08                                   ` Willem de Bruijn
@ 2013-05-20 20:48                                   ` David Miller
  1 sibling, 0 replies; 41+ messages in thread
From: David Miller @ 2013-05-20 20:48 UTC (permalink / raw)
  To: eric.dumazet; +Cc: willemb, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 20 May 2013 09:00:54 -0700

> On Mon, 2013-05-20 at 10:02 -0400, Willem de Bruijn wrote:
>> A cpu executing the network receive path sheds packets when its input
>> queue grows to netdev_max_backlog. A single high rate flow (such as a
>> spoofed source DoS) can exceed a single cpu processing rate and will
>> degrade throughput of other flows hashed onto the same cpu.
>> 
>> This patch adds a more fine grained hashtable. If the netdev backlog
>> is above a threshold, IRQ cpus track the ratio of total traffic of
>> each flow (using 4096 buckets, configurable). The ratio is measured
>> by counting the number of packets per flow over the last 256 packets
>> from the source cpu. Any flow that occupies a large fraction of this
>> (set at 50%) will see packet drop while above the threshold.
>> 
>> Tested:
>> Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
>> kernel receive (RPS) on cpu0 and application threads on cpus 2--7
>> each handling 20k req/s. Throughput halves when hit with a 400 kpps
>> antagonist storm. With this patch applied, antagonist overload is
>> dropped and the server processes its complete load.
>> 
>> The patch is effective when kernel receive processing is the
>> bottleneck. The above RPS scenario is an extreme case, but the same is
>> reached with RFS and sufficient kernel processing (iptables, packet
>> socket tap, ..).
>> 
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>> 
>> ---
> 
> Acked-by: Eric Dumazet <edumazet@google.com>

Applied, thanks guys.

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2013-05-20 20:48 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-04-19 17:46 [PATCH] rps: selective flow shedding during softnet overflow Willem de Bruijn
2013-04-19 17:58 ` Eric Dumazet
2013-04-22 20:40   ` Willem de Bruijn
2013-04-22 20:46     ` [PATCH net-next v2] " Willem de Bruijn
2013-04-22 22:30       ` Eric Dumazet
2013-04-23 18:45         ` Willem de Bruijn
2013-04-23 18:46           ` [PATCH net-next v3] " Willem de Bruijn
2013-04-23 19:18             ` Eric Dumazet
2013-04-23 20:30               ` Willem de Bruijn
2013-04-23 20:31                 ` [PATCH net-next v4] " Willem de Bruijn
2013-04-23 21:23                   ` Stephen Hemminger
2013-04-23 21:37                     ` Willem de Bruijn
2013-04-23 21:37                     ` Eric Dumazet
2013-04-23 21:52                       ` Stephen Hemminger
2013-04-23 22:34                         ` David Miller
2013-04-24  0:09                         ` Eric Dumazet
2013-04-24  0:37                           ` [PATCH net-next v5] " Willem de Bruijn
2013-04-24  1:07                             ` Eric Dumazet
2013-04-25  8:20                             ` David Miller
2013-05-20 14:02                               ` [PATCH net-next v6] " Willem de Bruijn
2013-05-20 16:00                                 ` Eric Dumazet
2013-05-20 16:08                                   ` Willem de Bruijn
2013-05-20 20:48                                   ` David Miller
2013-04-24  1:25                           ` [PATCH net-next v4] " Jamal Hadi Salim
2013-04-24  1:32                             ` Eric Dumazet
2013-04-24  1:44                               ` Jamal Hadi Salim
2013-04-24  2:11                                 ` Eric Dumazet
2013-04-24 13:00                                   ` Jamal Hadi Salim
2013-04-24 14:41                                     ` Eric Dumazet
2013-04-23 22:33                     ` David Miller
2013-04-23 21:34                   ` Eric Dumazet
2013-04-23 22:41                   ` David Miller
2013-04-23 23:11                     ` Eric Dumazet
2013-04-23 23:15                       ` David Miller
2013-04-23 23:26                         ` Eric Dumazet
2013-04-24  0:03                         ` Stephen Hemminger
2013-04-24  0:00                     ` Willem de Bruijn
2013-04-23 20:46                 ` [PATCH net-next v3] " Eric Dumazet
2013-04-19 19:03 ` [PATCH] " Stephen Hemminger
2013-04-19 19:21   ` Eric Dumazet
2013-04-19 20:11   ` Willem de Bruijn
