* [RFC PATCH] openvswitch: use percpu flow stats
@ 2016-08-19 19:47 Thadeu Lima de Souza Cascardo
[not found] ` <1471636046-9246-1-git-send-email-cascardo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Thadeu Lima de Souza Cascardo @ 2016-08-19 19:47 UTC (permalink / raw)
To: dev; +Cc: netdev
Instead of using flow stats per NUMA node, use it per CPU. When using
megaflows, the stats lock can be a bottleneck in scalability.
On a E5-2690 12-core system, usual throughput went from ~4Mpps to ~15Mpps
when forwarding between two 40GbE ports with a single flow configured on
the datapath.
---
net/openvswitch/flow.c | 39 +++++++++++++++++++++------------------
net/openvswitch/flow.h | 4 ++--
net/openvswitch/flow_table.c | 21 +++++++++++----------
3 files changed, 34 insertions(+), 30 deletions(-)
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 0ea128e..90ae248 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
+#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
{
struct flow_stats *stats;
int node = numa_node_id();
+ int cpu = get_cpu();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
- stats = rcu_dereference(flow->stats[node]);
+ stats = rcu_dereference(flow->stats[cpu]);
- /* Check if already have node-specific stats. */
+ /* Check if already have CPU-specific stats. */
if (likely(stats)) {
spin_lock(&stats->lock);
/* Mark if we write on the pre-allocated stats. */
- if (node == 0 && unlikely(flow->stats_last_writer != node))
- flow->stats_last_writer = node;
+ if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
+ flow->stats_last_writer = cpu;
} else {
stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
spin_lock(&stats->lock);
- /* If the current NUMA-node is the only writer on the
+ /* If the current CPU is the only writer on the
* pre-allocated stats keep using them.
*/
- if (unlikely(flow->stats_last_writer != node)) {
+ if (unlikely(flow->stats_last_writer != cpu)) {
/* A previous locker may have already allocated the
- * stats, so we need to check again. If node-specific
+ * stats, so we need to check again. If CPU-specific
* stats were already allocated, we update the pre-
* allocated stats as we have already locked them.
*/
- if (likely(flow->stats_last_writer != NUMA_NO_NODE)
- && likely(!rcu_access_pointer(flow->stats[node]))) {
- /* Try to allocate node-specific stats. */
+ if (likely(flow->stats_last_writer != -1)
+ && likely(!rcu_access_pointer(flow->stats[cpu]))) {
+ /* Try to allocate CPU-specific stats. */
struct flow_stats *new_stats;
new_stats =
@@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
new_stats->tcp_flags = tcp_flags;
spin_lock_init(&new_stats->lock);
- rcu_assign_pointer(flow->stats[node],
+ rcu_assign_pointer(flow->stats[cpu],
new_stats);
goto unlock;
}
}
- flow->stats_last_writer = node;
+ flow->stats_last_writer = cpu;
}
}
@@ -129,6 +131,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
stats->tcp_flags |= tcp_flags;
unlock:
spin_unlock(&stats->lock);
+ put_cpu();
}
/* Must be called with rcu_read_lock or ovs_mutex. */
@@ -136,14 +139,14 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
struct ovs_flow_stats *ovs_stats,
unsigned long *used, __be16 *tcp_flags)
{
- int node;
+ int cpu;
*used = 0;
*tcp_flags = 0;
memset(ovs_stats, 0, sizeof(*ovs_stats));
- for_each_node(node) {
- struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]);
+ for_each_possible_cpu(cpu) {
+ struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
if (stats) {
/* Local CPU may write on non-local stats, so we must
@@ -163,10 +166,10 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
/* Called with ovs_mutex. */
void ovs_flow_stats_clear(struct sw_flow *flow)
{
- int node;
+ int cpu;
- for_each_node(node) {
- struct flow_stats *stats = ovsl_dereference(flow->stats[node]);
+ for_each_possible_cpu(cpu) {
+ struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
if (stats) {
spin_lock_bh(&stats->lock);
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 03378e7..7725ffc 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -172,14 +172,14 @@ struct sw_flow {
struct hlist_node node[2];
u32 hash;
} flow_table, ufid_table;
- int stats_last_writer; /* NUMA-node id of the last writer on
+ int stats_last_writer; /* CPU id of the last writer on
* 'stats[0]'.
*/
struct sw_flow_key key;
struct sw_flow_id id;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
- struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one
+ struct flow_stats __rcu *stats[]; /* One for each CPU. First one
* is allocated at flow creation time,
* the rest are allocated on demand
* while holding the 'stats[0].lock'.
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d073fff..6c2ed41 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -32,6 +32,7 @@
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
+#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -79,7 +80,7 @@ struct sw_flow *ovs_flow_alloc(void)
{
struct sw_flow *flow;
struct flow_stats *stats;
- int node;
+ int cpu;
flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
if (!flow)
@@ -89,7 +90,7 @@ struct sw_flow *ovs_flow_alloc(void)
flow->mask = NULL;
flow->id.unmasked_key = NULL;
flow->id.ufid_len = 0;
- flow->stats_last_writer = NUMA_NO_NODE;
+ flow->stats_last_writer = -1;
/* Initialize the default stat node. */
stats = kmem_cache_alloc_node(flow_stats_cache,
@@ -102,9 +103,9 @@ struct sw_flow *ovs_flow_alloc(void)
RCU_INIT_POINTER(flow->stats[0], stats);
- for_each_node(node)
- if (node != 0)
- RCU_INIT_POINTER(flow->stats[node], NULL);
+ for_each_possible_cpu(cpu)
+ if (cpu != 0)
+ RCU_INIT_POINTER(flow->stats[cpu], NULL);
return flow;
err:
@@ -142,16 +143,16 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
static void flow_free(struct sw_flow *flow)
{
- int node;
+ int cpu;
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
- for_each_node(node)
- if (flow->stats[node])
+ for_each_possible_cpu(cpu)
+ if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
- (struct flow_stats __force *)flow->stats[node]);
+ (struct flow_stats __force *)flow->stats[cpu]);
kmem_cache_free(flow_cache, flow);
}
@@ -756,7 +757,7 @@ int ovs_flow_init(void)
BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
- + (nr_node_ids
+ + (num_possible_cpus()
* sizeof(struct flow_stats *)),
0, 0, NULL);
if (flow_cache == NULL)
--
2.7.4
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [RFC PATCH] openvswitch: use percpu flow stats
[not found] ` <1471636046-9246-1-git-send-email-cascardo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2016-08-19 19:56 ` Eric Dumazet
[not found] ` <1471636616.29842.110.camel-XN9IlZ5yJG9HTL0Zs8A6p+yfmBU6pStAUsxypvmhUTTZJqsBc5GL+g@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Eric Dumazet @ 2016-08-19 19:56 UTC (permalink / raw)
To: Thadeu Lima de Souza Cascardo
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA
On Fri, 2016-08-19 at 16:47 -0300, Thadeu Lima de Souza Cascardo wrote:
> Instead of using flow stats per NUMA node, use it per CPU. When using
> megaflows, the stats lock can be a bottleneck in scalability.
...
>
> flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
> - + (nr_node_ids
> + + (num_possible_cpus()
> * sizeof(struct flow_stats *)),
> 0, 0, NULL);
> if (flow_cache == NULL)
This is bogus.
Please use nr_cpu_ids instead of num_possible_cpus().
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC PATCH] openvswitch: use percpu flow stats
[not found] ` <1471636616.29842.110.camel-XN9IlZ5yJG9HTL0Zs8A6p+yfmBU6pStAUsxypvmhUTTZJqsBc5GL+g@public.gmane.org>
@ 2016-08-20 1:09 ` David Miller
[not found] ` <20160819.180953.970774471141286228.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: David Miller @ 2016-08-20 1:09 UTC (permalink / raw)
To: eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 19 Aug 2016 12:56:56 -0700
> On Fri, 2016-08-19 at 16:47 -0300, Thadeu Lima de Souza Cascardo wrote:
>> Instead of using flow stats per NUMA node, use it per CPU. When using
>> megaflows, the stats lock can be a bottleneck in scalability.
>
> ...
>
>>
>> flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
>> - + (nr_node_ids
>> + + (num_possible_cpus()
>> * sizeof(struct flow_stats *)),
>> 0, 0, NULL);
>> if (flow_cache == NULL)
>
> This is bogus.
>
> Please use nr_cpu_ids instead of num_possible_cpus().
Then how would hot-plugged cpus be handled properly?
The only other way is you'd have to free all objects in the current
flow_cache, destroy it, then make a new kmem_cache and reallocate
all of the existing objects and hook them back up every time a cpu
up event occurs.
That doesn't make any sense at all.
Therefore, the size of the objects coming out of "sw_flow" have
to accomodate all possible cpu indexes.
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC PATCH] openvswitch: use percpu flow stats
[not found] ` <20160819.180953.970774471141286228.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
@ 2016-08-20 4:07 ` Eric Dumazet
2016-08-20 5:25 ` David Miller
0 siblings, 1 reply; 5+ messages in thread
From: Eric Dumazet @ 2016-08-20 4:07 UTC (permalink / raw)
To: David Miller; +Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA
On Fri, 2016-08-19 at 18:09 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Fri, 19 Aug 2016 12:56:56 -0700
>
> > On Fri, 2016-08-19 at 16:47 -0300, Thadeu Lima de Souza Cascardo wrote:
> >> Instead of using flow stats per NUMA node, use it per CPU. When using
> >> megaflows, the stats lock can be a bottleneck in scalability.
> >
> > ...
> >
> >>
> >> flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
> >> - + (nr_node_ids
> >> + + (num_possible_cpus()
> >> * sizeof(struct flow_stats *)),
> >> 0, 0, NULL);
> >> if (flow_cache == NULL)
> >
> > This is bogus.
> >
> > Please use nr_cpu_ids instead of num_possible_cpus().
>
> Then how would hot-plugged cpus be handled properly?
>
> The only other way is you'd have to free all objects in the current
> flow_cache, destroy it, then make a new kmem_cache and reallocate
> all of the existing objects and hook them back up every time a cpu
> up event occurs.
>
> That doesn't make any sense at all.
>
> Therefore, the size of the objects coming out of "sw_flow" have
> to accomodate all possible cpu indexes.
Here is an example of a system I had in the past.
8 cpus (num_possible_cpus() returns 8)
Cpus were numbered : 0, 1, 2, 3, 8, 9, 10, 11 : (nr_cpu_ids = 12)
I am pretty sure they are still alive.
Since you want an array indexed by cpu numbers, you need to size it by
nr_cpu_ids, otherwise array[11] will crash / access garbage.
This is why you find many nr_cpu_ids in the linux tree, not
num_possible_cpus() to size arrays.
# git grep -n nr_cpu_ids -- net
net/bridge/netfilter/ebtables.c:900: vmalloc(nr_cpu_ids * sizeof(*(newinfo->chainstack)));
net/bridge/netfilter/ebtables.c:1132: countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
net/bridge/netfilter/ebtables.c:1185: countersize = COUNTER_OFFSET(repl->nentries) * nr_cpu_ids;
net/bridge/netfilter/ebtables.c:2200: countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
net/core/dev.c:3483: if (next_cpu < nr_cpu_ids) {
net/core/dev.c:3588: * - Current CPU is unset (>= nr_cpu_ids).
net/core/dev.c:3596: (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
net/core/dev.c:3603: if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
net/core/dev.c:3651: if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
net/core/neighbour.c:2737: for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
net/core/neighbour.c:2751: for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
net/core/net-procfs.c:122: while (*pos < nr_cpu_ids)
net/core/sysctl_net_core.c:70: rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c:360: for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c:375: for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
net/ipv4/route.c:254: for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
net/ipv4/route.c:267: for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
net/ipv6/icmp.c:918: kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL);
net/netfilter/nf_conntrack_netlink.c:1265: for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
net/netfilter/nf_conntrack_netlink.c:2003: if (cb->args[0] == nr_cpu_ids)
net/netfilter/nf_conntrack_netlink.c:2006: for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
net/netfilter/nf_conntrack_netlink.c:3211: if (cb->args[0] == nr_cpu_ids)
net/netfilter/nf_conntrack_netlink.c:3214: for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
net/netfilter/nf_conntrack_standalone.c:309: for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
net/netfilter/nf_conntrack_standalone.c:324: for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
net/netfilter/nf_synproxy_core.c:255: for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) {
net/netfilter/nf_synproxy_core.c:270: for (cpu = *pos; cpu < nr_cpu_ids; cpu++) {
net/netfilter/x_tables.c:1064: size = sizeof(void **) * nr_cpu_ids;
net/sunrpc/svc.c:167: unsigned int maxpools = nr_cpu_ids;
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC PATCH] openvswitch: use percpu flow stats
2016-08-20 4:07 ` Eric Dumazet
@ 2016-08-20 5:25 ` David Miller
0 siblings, 0 replies; 5+ messages in thread
From: David Miller @ 2016-08-20 5:25 UTC (permalink / raw)
To: eric.dumazet; +Cc: cascardo, dev, netdev
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 19 Aug 2016 21:07:52 -0700
> Here is an example of a system I had in the past.
>
> 8 cpus (num_possible_cpus() returns 8)
>
> Cpus were numbered : 0, 1, 2, 3, 8, 9, 10, 11 : (nr_cpu_ids = 12)
>
> I am pretty sure they are still alive.
>
> Since you want an array indexed by cpu numbers, you need to size it by
> nr_cpu_ids, otherwise array[11] will crash / access garbage.
>
> This is why you find many nr_cpu_ids in the linux tree, not
> num_possible_cpus() to size arrays.
My bad, I misunderstood what nr_cpu_ids means. You are absolutely right.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2016-08-20 5:25 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-19 19:47 [RFC PATCH] openvswitch: use percpu flow stats Thadeu Lima de Souza Cascardo
[not found] ` <1471636046-9246-1-git-send-email-cascardo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2016-08-19 19:56 ` Eric Dumazet
[not found] ` <1471636616.29842.110.camel-XN9IlZ5yJG9HTL0Zs8A6p+yfmBU6pStAUsxypvmhUTTZJqsBc5GL+g@public.gmane.org>
2016-08-20 1:09 ` David Miller
[not found] ` <20160819.180953.970774471141286228.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2016-08-20 4:07 ` Eric Dumazet
2016-08-20 5:25 ` David Miller
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.