* [PATCH net-next] net: rfs: add hash collision detection
@ 2015-02-06 20:59 Eric Dumazet
2015-02-06 22:21 ` Tom Herbert
2015-02-09 0:54 ` David Miller
0 siblings, 2 replies; 4+ messages in thread
From: Eric Dumazet @ 2015-02-06 20:59 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Tom Herbert, Ying Cai, Willem de Bruijn
From: Eric Dumazet <edumazet@google.com>
Receive Flow Steering is a nice solution but suffers from
hash collisions when a mix of connected and unconnected traffic
is received on the host, when flow hash table is populated.
Also, clearing flow in inet_release() makes RFS not very good
for short lived flows, as many packets can follow close().
(FIN, ACK packets, ...)
This patch extends the information stored into global hash table
to not only include cpu number, but upper part of the hash value.
I use a 32bit value, and dynamically split it in two parts.
For hosts with 64 or fewer possible cpus, this gives 6 bits for the
cpu number, and 26 (32-6) bits for the upper part of the hash.
Since hash bucket selection uses the low-order bits of the hash, we have
a full hash match if /proc/sys/net/core/rps_sock_flow_entries is big
enough.
If the hash found in flow table does not match, we fallback to RPS (if
it is enabled for the rxqueue).
This means that a packet for a non-connected flow can avoid the
IPI through an unrelated/victim CPU.
This also means we no longer have to clear the table at socket
close time, and this helps short lived flows performance.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
drivers/net/tun.c | 5 ---
include/linux/netdevice.h | 34 ++++++++++++------------
include/net/sock.h | 24 -----------------
net/core/dev.c | 48 +++++++++++++++++++----------------
net/core/sysctl_net_core.c | 2 -
net/ipv4/af_inet.c | 2 -
6 files changed, 47 insertions(+), 68 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ad7d3d5f3ee5..857dca47bf80 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
{
tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
e->rxhash, e->queue_index);
- sock_rps_reset_flow_hash(e->rps_rxhash);
hlist_del_rcu(&e->hash_link);
kfree_rcu(e, rcu);
--tun->flow_count;
@@ -373,10 +372,8 @@ unlock:
*/
static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
{
- if (unlikely(e->rps_rxhash != hash)) {
- sock_rps_reset_flow_hash(e->rps_rxhash);
+ if (unlikely(e->rps_rxhash != hash))
e->rps_rxhash = hash;
- }
}
/* We try to identify a flow through its rxhash first. The reason that
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ce784d5018e0..ab3b7cef4638 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -644,39 +644,39 @@ struct rps_dev_flow_table {
/*
* The rps_sock_flow_table contains mappings of flows to the last CPU
* on which they were processed by the application (set in recvmsg).
+ * Each entry is a 32bit value. Upper part is the high order bits
+ * of flow hash, lower part is cpu number.
+ * rps_cpu_mask is used to partition the space, depending on number of
+ * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
+ * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f,
+ * meaning we use 32-6=26 bits for the hash.
*/
struct rps_sock_flow_table {
- unsigned int mask;
- u16 ents[0];
+ u32 mask;
+ u32 ents[0];
};
-#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
- ((_num) * sizeof(u16)))
+#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
#define RPS_NO_CPU 0xffff
+extern u32 rps_cpu_mask;
+extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
u32 hash)
{
if (table && hash) {
- unsigned int cpu, index = hash & table->mask;
+ unsigned int index = hash & table->mask;
+ u32 val = hash & ~rps_cpu_mask;
/* We only give a hint, preemption can change cpu under us */
- cpu = raw_smp_processor_id();
+ val |= raw_smp_processor_id();
- if (table->ents[index] != cpu)
- table->ents[index] = cpu;
+ if (table->ents[index] != val)
+ table->ents[index] = val;
}
}
-static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
- u32 hash)
-{
- if (table && hash)
- table->ents[hash & table->mask] = RPS_NO_CPU;
-}
-
-extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
-
#ifdef CONFIG_RFS_ACCEL
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
u16 filter_id);
diff --git a/include/net/sock.h b/include/net/sock.h
index d28b8fededd6..e13824570b0f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
#endif
}
-static inline void sock_rps_reset_flow_hash(__u32 hash)
-{
-#ifdef CONFIG_RPS
- struct rps_sock_flow_table *sock_flow_table;
-
- rcu_read_lock();
- sock_flow_table = rcu_dereference(rps_sock_flow_table);
- rps_reset_sock_flow(sock_flow_table, hash);
- rcu_read_unlock();
-#endif
-}
-
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
@@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk)
#endif
}
-static inline void sock_rps_reset_flow(const struct sock *sk)
-{
-#ifdef CONFIG_RPS
- sock_rps_reset_flow_hash(sk->sk_rxhash);
-#endif
-}
-
static inline void sock_rps_save_rxhash(struct sock *sk,
const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
- if (unlikely(sk->sk_rxhash != skb->hash)) {
- sock_rps_reset_flow(sk);
+ if (unlikely(sk->sk_rxhash != skb->hash))
sk->sk_rxhash = skb->hash;
- }
#endif
}
static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
- sock_rps_reset_flow(sk);
sk->sk_rxhash = 0;
#endif
}
diff --git a/net/core/dev.c b/net/core/dev.c
index a3a96ffc67f4..8be38675e1a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
@@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
- struct netdev_rx_queue *rxqueue;
- struct rps_map *map;
+ const struct rps_sock_flow_table *sock_flow_table;
+ struct netdev_rx_queue *rxqueue = dev->_rx;
struct rps_dev_flow_table *flow_table;
- struct rps_sock_flow_table *sock_flow_table;
+ struct rps_map *map;
int cpu = -1;
- u16 tcpu;
+ u32 tcpu;
u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
+
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1,
"%s received packet on queue %u, but number "
@@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
dev->name, index, dev->real_num_rx_queues);
goto done;
}
- rxqueue = dev->_rx + index;
- } else
- rxqueue = dev->_rx;
+ rxqueue += index;
+ }
+ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
map = rcu_dereference(rxqueue->rps_map);
- if (map) {
- if (map->len == 1 &&
- !rcu_access_pointer(rxqueue->rps_flow_table)) {
- tcpu = map->cpus[0];
- if (cpu_online(tcpu))
- cpu = tcpu;
- goto done;
- }
- } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
+ if (!flow_table && !map)
goto done;
- }
skb_reset_network_header(skb);
hash = skb_get_hash(skb);
if (!hash)
goto done;
- flow_table = rcu_dereference(rxqueue->rps_flow_table);
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
- u16 next_cpu;
struct rps_dev_flow *rflow;
+ u32 next_cpu;
+ u32 ident;
+
+ /* First check into global flow table if there is a match */
+ ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+ if ((ident ^ hash) & ~rps_cpu_mask)
+ goto try_rps;
+ next_cpu = ident & rps_cpu_mask;
+
+ /* OK, now we know there is a match,
+ * we can look at the local (per receive queue) flow table
+ */
rflow = &flow_table->flows[hash & flow_table->mask];
tcpu = rflow->cpu;
- next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
-
/*
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
@@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
}
+try_rps:
+
if (map) {
tcpu = map->cpus[reciprocal_scale(hash, map->len)];
if (cpu_online(tcpu)) {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index fde21d19e61b..7a31be5e361f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
mutex_unlock(&sock_flow_mutex);
return -ENOMEM;
}
-
+ rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
sock_table->mask = size - 1;
} else
sock_table = orig_sock_table;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a44773c8346c..d2e49baaff63 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -395,8 +395,6 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
- sock_rps_reset_flow(sk);
-
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH net-next] net: rfs: add hash collision detection
2015-02-06 20:59 [PATCH net-next] net: rfs: add hash collision detection Eric Dumazet
@ 2015-02-06 22:21 ` Tom Herbert
2015-02-07 2:24 ` Eric Dumazet
2015-02-09 0:54 ` David Miller
1 sibling, 1 reply; 4+ messages in thread
From: Tom Herbert @ 2015-02-06 22:21 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, Ying Cai, Willem de Bruijn
On Fri, Feb 6, 2015 at 12:59 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> Receive Flow Steering is a nice solution but suffers from
> hash collisions when a mix of connected and unconnected traffic
> is received on the host, when flow hash table is populated.
>
> Also, clearing flow in inet_release() makes RFS not very good
> for short lived flows, as many packets can follow close().
> (FIN , ACK packets, ...)
>
> This patch extends the information stored into global hash table
> to not only include cpu number, but upper part of the hash value.
>
> I use a 32bit value, and dynamically split it in two parts.
>
> For host with less than 64 possible cpus, this gives 6 bits for the
> cpu number, and 26 (32-6) bits for the upper part of the hash.
>
> Since hash bucket selection use low order bits of the hash, we have
> a full hash match, if /proc/sys/net/core/rps_sock_flow_entries is big
> enough.
>
> If the hash found in flow table does not match, we fallback to RPS (if
> it is enabled for the rxqueue).
>
> This means that a packet for an non connected flow can avoid the
> IPI through a unrelated/victim CPU.
>
> This also means we no longer have to clear the table at socket
> close time, and this helps short lived flows performance.
>
Acked-by: Tom Herbert <therbert@google.com>
Eric, looks awesome! Can you share any performance numbers?
Thanks,
Tom
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> drivers/net/tun.c | 5 ---
> include/linux/netdevice.h | 34 ++++++++++++------------
> include/net/sock.h | 24 -----------------
> net/core/dev.c | 48 +++++++++++++++++++----------------
> net/core/sysctl_net_core.c | 2 -
> net/ipv4/af_inet.c | 2 -
> 6 files changed, 47 insertions(+), 68 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ad7d3d5f3ee5..857dca47bf80 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
> {
> tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
> e->rxhash, e->queue_index);
> - sock_rps_reset_flow_hash(e->rps_rxhash);
> hlist_del_rcu(&e->hash_link);
> kfree_rcu(e, rcu);
> --tun->flow_count;
> @@ -373,10 +372,8 @@ unlock:
> */
> static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
> {
> - if (unlikely(e->rps_rxhash != hash)) {
> - sock_rps_reset_flow_hash(e->rps_rxhash);
> + if (unlikely(e->rps_rxhash != hash))
> e->rps_rxhash = hash;
> - }
> }
>
> /* We try to identify a flow through its rxhash first. The reason that
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ce784d5018e0..ab3b7cef4638 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -644,39 +644,39 @@ struct rps_dev_flow_table {
> /*
> * The rps_sock_flow_table contains mappings of flows to the last CPU
> * on which they were processed by the application (set in recvmsg).
> + * Each entry is a 32bit value. Upper part is the high order bits
> + * of flow hash, lower part is cpu number.
> + * rps_cpu_mask is used to partition the space, depending on number of
> + * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
> + * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f,
> + * meaning we use 32-6=26 bits for the hash.
> */
> struct rps_sock_flow_table {
> - unsigned int mask;
> - u16 ents[0];
> + u32 mask;
> + u32 ents[0];
> };
> -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
> - ((_num) * sizeof(u16)))
> +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
>
> #define RPS_NO_CPU 0xffff
>
> +extern u32 rps_cpu_mask;
> +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
> +
> static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
> u32 hash)
> {
> if (table && hash) {
> - unsigned int cpu, index = hash & table->mask;
> + unsigned int index = hash & table->mask;
> + u32 val = hash & ~rps_cpu_mask;
>
> /* We only give a hint, preemption can change cpu under us */
> - cpu = raw_smp_processor_id();
> + val |= raw_smp_processor_id();
>
> - if (table->ents[index] != cpu)
> - table->ents[index] = cpu;
> + if (table->ents[index] != val)
> + table->ents[index] = val;
> }
> }
>
> -static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
> - u32 hash)
> -{
> - if (table && hash)
> - table->ents[hash & table->mask] = RPS_NO_CPU;
> -}
> -
> -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
> -
> #ifdef CONFIG_RFS_ACCEL
> bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
> u16 filter_id);
> diff --git a/include/net/sock.h b/include/net/sock.h
> index d28b8fededd6..e13824570b0f 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
> #endif
> }
>
> -static inline void sock_rps_reset_flow_hash(__u32 hash)
> -{
> -#ifdef CONFIG_RPS
> - struct rps_sock_flow_table *sock_flow_table;
> -
> - rcu_read_lock();
> - sock_flow_table = rcu_dereference(rps_sock_flow_table);
> - rps_reset_sock_flow(sock_flow_table, hash);
> - rcu_read_unlock();
> -#endif
> -}
> -
> static inline void sock_rps_record_flow(const struct sock *sk)
> {
> #ifdef CONFIG_RPS
> @@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk)
> #endif
> }
>
> -static inline void sock_rps_reset_flow(const struct sock *sk)
> -{
> -#ifdef CONFIG_RPS
> - sock_rps_reset_flow_hash(sk->sk_rxhash);
> -#endif
> -}
> -
> static inline void sock_rps_save_rxhash(struct sock *sk,
> const struct sk_buff *skb)
> {
> #ifdef CONFIG_RPS
> - if (unlikely(sk->sk_rxhash != skb->hash)) {
> - sock_rps_reset_flow(sk);
> + if (unlikely(sk->sk_rxhash != skb->hash))
> sk->sk_rxhash = skb->hash;
> - }
> #endif
> }
>
> static inline void sock_rps_reset_rxhash(struct sock *sk)
> {
> #ifdef CONFIG_RPS
> - sock_rps_reset_flow(sk);
> sk->sk_rxhash = 0;
> #endif
> }
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a3a96ffc67f4..8be38675e1a8 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
> /* One global table that all flow-based protocols share. */
> struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
> EXPORT_SYMBOL(rps_sock_flow_table);
> +u32 rps_cpu_mask __read_mostly;
> +EXPORT_SYMBOL(rps_cpu_mask);
>
> struct static_key rps_needed __read_mostly;
>
> @@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> struct rps_dev_flow **rflowp)
> {
> - struct netdev_rx_queue *rxqueue;
> - struct rps_map *map;
> + const struct rps_sock_flow_table *sock_flow_table;
> + struct netdev_rx_queue *rxqueue = dev->_rx;
> struct rps_dev_flow_table *flow_table;
> - struct rps_sock_flow_table *sock_flow_table;
> + struct rps_map *map;
> int cpu = -1;
> - u16 tcpu;
> + u32 tcpu;
> u32 hash;
>
> if (skb_rx_queue_recorded(skb)) {
> u16 index = skb_get_rx_queue(skb);
> +
> if (unlikely(index >= dev->real_num_rx_queues)) {
> WARN_ONCE(dev->real_num_rx_queues > 1,
> "%s received packet on queue %u, but number "
> @@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> dev->name, index, dev->real_num_rx_queues);
> goto done;
> }
> - rxqueue = dev->_rx + index;
> - } else
> - rxqueue = dev->_rx;
> + rxqueue += index;
> + }
>
> + /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
> +
> + flow_table = rcu_dereference(rxqueue->rps_flow_table);
> map = rcu_dereference(rxqueue->rps_map);
> - if (map) {
> - if (map->len == 1 &&
> - !rcu_access_pointer(rxqueue->rps_flow_table)) {
> - tcpu = map->cpus[0];
> - if (cpu_online(tcpu))
> - cpu = tcpu;
> - goto done;
> - }
> - } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
> + if (!flow_table && !map)
> goto done;
> - }
>
> skb_reset_network_header(skb);
> hash = skb_get_hash(skb);
> if (!hash)
> goto done;
>
> - flow_table = rcu_dereference(rxqueue->rps_flow_table);
> sock_flow_table = rcu_dereference(rps_sock_flow_table);
> if (flow_table && sock_flow_table) {
> - u16 next_cpu;
> struct rps_dev_flow *rflow;
> + u32 next_cpu;
> + u32 ident;
> +
> + /* First check into global flow table if there is a match */
> + ident = sock_flow_table->ents[hash & sock_flow_table->mask];
> + if ((ident ^ hash) & ~rps_cpu_mask)
> + goto try_rps;
>
> + next_cpu = ident & rps_cpu_mask;
> +
> + /* OK, now we know there is a match,
> + * we can look at the local (per receive queue) flow table
> + */
> rflow = &flow_table->flows[hash & flow_table->mask];
> tcpu = rflow->cpu;
>
> - next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
> -
> /*
> * If the desired CPU (where last recvmsg was done) is
> * different from current CPU (one in the rx-queue flow
> @@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> }
> }
>
> +try_rps:
> +
> if (map) {
> tcpu = map->cpus[reciprocal_scale(hash, map->len)];
> if (cpu_online(tcpu)) {
> diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
> index fde21d19e61b..7a31be5e361f 100644
> --- a/net/core/sysctl_net_core.c
> +++ b/net/core/sysctl_net_core.c
> @@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
> mutex_unlock(&sock_flow_mutex);
> return -ENOMEM;
> }
> -
> + rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
> sock_table->mask = size - 1;
> } else
> sock_table = orig_sock_table;
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index a44773c8346c..d2e49baaff63 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -395,8 +395,6 @@ int inet_release(struct socket *sock)
> if (sk) {
> long timeout;
>
> - sock_rps_reset_flow(sk);
> -
> /* Applications forget to leave groups before exiting */
> ip_mc_drop_socket(sk);
>
>
>
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH net-next] net: rfs: add hash collision detection
2015-02-06 22:21 ` Tom Herbert
@ 2015-02-07 2:24 ` Eric Dumazet
0 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2015-02-07 2:24 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, netdev, Ying Cai, Willem de Bruijn
On Fri, 2015-02-06 at 14:21 -0800, Tom Herbert wrote:
> Acked-by: Tom Herbert <therbert@google.com>
>
> Eric, looks awesome! Can you share any performance numbers?
Right, numbers are awesome.
I flood one target with ~2.3 Mpps UDP packets coming from random IP
addresses.
UDP server uses SO_REUSEPORT with 8 sockets (I have 8 rx queues on the
host)
I force a small RFS table to show that hash collisions no longer
matter :
echo 512 >/proc/sys/net/core/rps_sock_flow_entries
softnettop tool (displaying /proc/net/softnet_stat in realtime)
shows that before starting the TCP flows, only 8 cpus are receiving and
process NIC irqs.
cpu: recv drop time rps
26: 586720 0 0 0
29: 586982 0 0 0
32: 588582 0 0 0
35: 589266 0 0 0
38: 587796 0 0 0
41: 588146 0 0 0
44: 588158 0 0 0
47: 587548 0 0 0
*: 4703282 0 0 5
Then I start 200 netperf -t TCP_RR
When the 200 TCP_RR flows start, we can see load nicely shifting,
but the UDP packets still do not use RFS.
cpu: recv drop time rps
0: 4254 0 0 1427
1: 4472 0 0 1460
2: 3070 0 0 1132
3: 4210 0 0 1417
4: 4472 0 0 1488
5: 2146 0 0 869
6: 4163 0 0 1456
7: 4354 0 0 1468
8: 3254 0 0 1170
9: 4468 0 0 1479
10: 4449 0 0 1521
11: 2788 0 0 1070
12: 5902 0 0 1665
13: 6160 0 0 1692
14: 2319 0 0 945
15: 5850 0 0 1686
16: 5716 0 0 1643
17: 3388 0 0 1224
18: 5936 0 0 1724
19: 6040 0 0 1691
20: 2962 0 0 1076
21: 5784 0 0 1696
22: 6094 0 0 1717
23: 2748 0 0 999
24: 1392 0 0 638
25: 1158 0 0 524
26: 577970 0 0 77
27: 1410 0 0 644
28: 1166 0 0 544
29: 575270 0 0 5
30: 1158 0 0 523
31: 972 0 0 444
32: 574306 0 0 181
33: 1248 0 0 575
34: 1094 0 0 502
35: 577116 0 0 69
36: 2142 0 0 893
37: 1384 0 0 587
38: 577470 0 0 7
39: 2029 0 0 856
40: 1858 0 0 780
41: 574744 0 0 46
42: 1946 0 0 806
43: 1711 0 0 719
44: 570190 0 0 81
45: 2210 0 0 872
46: 1670 0 0 702
47: 572758 0 0 68
*: 4729371 0 0 44858
cpu: recv drop time rps
0: 48770 0 0 13778
1: 49198 0 0 13979
2: 24580 0 0 8631
3: 48272 0 0 13642
4: 48578 0 0 13831
5: 23716 0 0 8631
6: 48034 0 0 13648
7: 49408 0 0 13848
8: 26147 0 0 9068
9: 48678 0 0 13843
10: 49515 0 0 13931
11: 29836 0 0 10079
12: 45828 0 0 13331
13: 46654 0 0 13553
14: 19850 0 0 7452
15: 44382 0 0 13083
16: 44667 0 0 13072
17: 27196 0 0 9429
18: 44574 0 0 13069
19: 45076 0 0 13193
20: 27352 0 0 9329
21: 45468 0 0 13183
22: 46264 0 0 13435
23: 23598 0 0 8485
24: 14248 0 0 6061
25: 13624 0 0 5851
26: 508738 0 0 828
27: 14516 0 0 6214
28: 13354 0 0 5700
29: 512006 0 0 604
30: 14686 0 0 6249
31: 13866 0 0 5944
32: 491190 0 0 1101
33: 14628 0 0 6237
34: 14164 0 0 6025
35: 499178 0 0 1513
36: 13138 0 0 5687
37: 11964 0 0 5220
38: 523120 0 0 413
39: 13565 0 0 5808
40: 12742 0 0 5500
41: 485162 0 0 1343
42: 13254 0 0 5686
43: 12096 0 0 5254
44: 464680 0 0 1615
45: 13450 0 0 5781
46: 12400 0 0 5346
47: 493070 0 0 935
*: 5148480 0 0 388438
cpu: recv drop time rps
0: 46596 0 0 13977
1: 48633 0 0 14331
2: 24006 0 0 8909
3: 47844 0 0 14080
4: 47742 0 0 14067
5: 26642 0 0 9605
6: 47796 0 0 14117
7: 48806 0 0 14432
8: 27936 0 0 9934
9: 48034 0 0 14233
10: 48892 0 0 14377
11: 30650 0 0 10512
12: 45651 0 0 13614
13: 45439 0 0 13536
14: 24337 0 0 8877
15: 45379 0 0 13666
16: 45695 0 0 13643
17: 26552 0 0 9452
18: 45513 0 0 13608
19: 46588 0 0 13930
20: 26242 0 0 9273
21: 45521 0 0 13670
22: 46255 0 0 13729
23: 28166 0 0 9842
24: 13998 0 0 6086
25: 12518 0 0 5512
26: 503796 0 0 621
27: 13732 0 0 6002
28: 12802 0 0 5611
29: 507766 0 0 689
30: 13968 0 0 6044
31: 13012 0 0 5648
32: 488760 0 0 938
33: 13969 0 0 6044
34: 12666 0 0 5545
35: 497482 0 0 1497
36: 13074 0 0 5715
37: 12187 0 0 5338
38: 520414 0 0 496
39: 13752 0 0 5988
40: 12046 0 0 5297
41: 480412 0 0 904
42: 13436 0 0 5845
43: 11978 0 0 5286
44: 461146 0 0 1288
45: 12982 0 0 5655
46: 12872 0 0 5643
47: 488788 0 0 1066
*: 5122471 0 0 398172
cpu: recv drop time rps
0: 46970 0 0 13467
1: 48129 0 0 13914
2: 25876 0 0 9206
3: 47672 0 0 13723
4: 48566 0 0 13875
5: 25575 0 0 9009
6: 47342 0 0 13673
7: 48636 0 0 13889
8: 28038 0 0 9722
9: 48298 0 0 13875
10: 48662 0 0 13911
11: 28412 0 0 9777
12: 45025 0 0 13323
13: 45620 0 0 13422
14: 20808 0 0 7800
15: 44481 0 0 13271
16: 45032 0 0 13324
17: 25698 0 0 9155
18: 45125 0 0 13483
19: 46371 0 0 13627
20: 27234 0 0 9437
21: 44899 0 0 13234
22: 46065 0 0 13530
23: 24118 0 0 8629
24: 14416 0 0 6137
25: 13690 0 0 5863
26: 508418 0 0 905
27: 14780 0 0 6295
28: 13288 0 0 5716
29: 509010 0 0 1136
30: 15080 0 0 6443
31: 13378 0 0 5771
32: 489950 0 0 1302
33: 14410 0 0 6157
34: 13712 0 0 5894
35: 496760 0 0 1153
36: 13906 0 0 5978
37: 12198 0 0 5326
38: 520846 0 0 444
39: 13206 0 0 5709
40: 13020 0 0 5636
41: 482136 0 0 1029
42: 13068 0 0 5616
43: 12982 0 0 5636
44: 462626 0 0 1378
45: 13342 0 0 5765
46: 12784 0 0 5608
47: 491690 0 0 717
*: 5131348 0 0 391890
cpu: recv drop time rps
0: 45472 0 0 13241
1: 46628 0 0 13471
2: 25556 0 0 8988
3: 46082 0 0 13369
4: 45573 0 0 13263
5: 28061 0 0 9506
6: 45806 0 0 13329
7: 46890 0 0 13631
8: 28321 0 0 9711
9: 45826 0 0 13339
10: 46522 0 0 13569
11: 27168 0 0 9467
12: 47750 0 0 13717
13: 47630 0 0 13619
14: 20660 0 0 7846
15: 47224 0 0 13707
16: 48439 0 0 13884
17: 23894 0 0 8646
18: 47019 0 0 13666
19: 47532 0 0 13811
20: 26957 0 0 9465
21: 47380 0 0 13703
22: 48520 0 0 14014
23: 24379 0 0 8775
24: 13100 0 0 5633
25: 13210 0 0 5696
26: 506252 0 0 887
27: 13564 0 0 5865
28: 12258 0 0 5294
29: 510674 0 0 1436
30: 12892 0 0 5551
31: 13112 0 0 5668
32: 492734 0 0 1633
33: 13480 0 0 5775
34: 12400 0 0 5367
35: 497366 0 0 1371
36: 14236 0 0 6109
37: 12728 0 0 5552
38: 519158 0 0 365
39: 14322 0 0 6096
40: 13432 0 0 5801
41: 481236 0 0 832
42: 14584 0 0 6242
43: 14200 0 0 6099
44: 463266 0 0 1242
45: 14130 0 0 6056
46: 13574 0 0 5824
47: 490142 0 0 778
*: 5131339 0 0 390909
cpu: recv drop time rps
0: 46704 0 0 13515
1: 46175 0 0 13421
2: 22761 0 0 8388
3: 47142 0 0 13758
4: 46908 0 0 13532
5: 27770 0 0 9681
6: 47210 0 0 13638
7: 47645 0 0 13657
8: 28668 0 0 9829
9: 47806 0 0 13763
10: 47638 0 0 13621
11: 30896 0 0 10260
12: 47778 0 0 13721
13: 48396 0 0 13753
14: 18130 0 0 7339
15: 46427 0 0 13522
16: 47578 0 0 13659
17: 25465 0 0 8996
18: 47052 0 0 13537
19: 48184 0 0 13839
20: 27105 0 0 9526
21: 48230 0 0 13697
22: 49176 0 0 13919
23: 20728 0 0 7863
24: 13626 0 0 5844
25: 11770 0 0 5112
26: 532288 0 0 549
27: 13960 0 0 6030
28: 13534 0 0 5863
29: 537130 0 0 1374
30: 14702 0 0 6259
31: 13776 0 0 5923
32: 520046 0 0 1242
33: 14292 0 0 6095
34: 13684 0 0 5868
35: 524026 0 0 1466
36: 14931 0 0 6375
37: 10918 0 0 4738
38: 545740 0 0 119
39: 15372 0 0 6580
40: 13984 0 0 5992
41: 507908 0 0 1185
42: 15540 0 0 6595
43: 14538 0 0 6243
44: 486684 0 0 1202
45: 15100 0 0 6417
46: 13423 0 0 5781
47: 515604 0 0 333
*: 5354148 0 0 393619
cpu: recv drop time rps
0: 49981 0 0 14007
1: 51769 0 0 14362
2: 21066 0 0 7982
3: 51477 0 0 14326
4: 50566 0 0 14090
5: 21334 0 0 8299
6: 50853 0 0 14238
7: 52091 0 0 14379
8: 22447 0 0 8502
9: 51318 0 0 14373
10: 52682 0 0 14605
11: 31434 0 0 10542
12: 45672 0 0 13385
13: 46592 0 0 13574
14: 19514 0 0 7595
15: 45765 0 0 13409
16: 45518 0 0 13260
17: 22627 0 0 8526
18: 45697 0 0 13382
19: 46958 0 0 13667
20: 25211 0 0 9039
21: 46581 0 0 13685
22: 46623 0 0 13493
23: 22625 0 0 8326
24: 15724 0 0 6612
25: 13978 0 0 5959
26: 555430 0 0 296
27: 16266 0 0 6831
28: 14319 0 0 6139
29: 557434 0 0 232
30: 17372 0 0 7232
31: 14600 0 0 6244
32: 541476 0 0 601
33: 17265 0 0 7225
34: 15755 0 0 6693
35: 550066 0 0 1764
36: 14554 0 0 6240
37: 12873 0 0 5575
38: 569992 0 0 323
39: 14685 0 0 6306
40: 13202 0 0 5755
41: 531414 0 0 571
42: 15002 0 0 6467
43: 13336 0 0 5746
44: 512364 0 0 1019
45: 14469 0 0 6158
46: 13390 0 0 5813
47: 541256 0 0 644
*: 5562623 0 0 397491
cpu: recv drop time rps
0: 49230 0 0 13561
1: 48356 0 0 13262
2: 25240 0 0 8872
3: 48772 0 0 13475
4: 49256 0 0 13579
5: 26330 0 0 9117
6: 49474 0 0 13638
7: 50230 0 0 13808
8: 27763 0 0 9450
9: 49538 0 0 13641
10: 50428 0 0 13888
11: 26006 0 0 9110
12: 46391 0 0 13273
13: 46546 0 0 13354
14: 18240 0 0 7144
15: 45833 0 0 13084
16: 46194 0 0 13148
17: 26255 0 0 9218
18: 46387 0 0 13280
19: 47606 0 0 13546
20: 22683 0 0 8411
21: 47091 0 0 13458
22: 47880 0 0 13577
23: 22779 0 0 8252
24: 15551 0 0 6458
25: 14146 0 0 5986
26: 558688 0 0 779
27: 15856 0 0 6559
28: 13980 0 0 5886
29: 562008 0 0 1002
30: 16178 0 0 6766
31: 14736 0 0 6213
32: 542428 0 0 1121
33: 15828 0 0 6541
34: 14944 0 0 6336
35: 547546 0 0 829
36: 13882 0 0 5942
37: 11804 0 0 5165
38: 571056 0 0 227
39: 15422 0 0 6529
40: 12786 0 0 5514
41: 532399 0 0 1073
42: 14810 0 0 6316
43: 13254 0 0 5721
44: 509890 0 0 649
45: 14717 0 0 6280
46: 12160 0 0 5294
47: 538686 0 0 547
*: 5557263 0 0 388879
cpu: recv drop time rps
0: 48494 0 0 13610
1: 48450 0 0 13667
2: 18908 0 0 7389
3: 47301 0 0 13496
4: 48212 0 0 13626
5: 24524 0 0 8817
6: 48055 0 0 13477
7: 50226 0 0 13908
8: 26093 0 0 9198
9: 49130 0 0 13734
10: 50252 0 0 14079
11: 30880 0 0 10275
12: 46759 0 0 13098
13: 48464 0 0 13494
14: 21192 0 0 7887
15: 47295 0 0 13352
16: 48224 0 0 13641
17: 27363 0 0 9500
18: 46990 0 0 13353
19: 48559 0 0 13695
20: 24237 0 0 8670
21: 48024 0 0 13583
22: 49013 0 0 13651
23: 22739 0 0 8365
24: 14664 0 0 6240
25: 10190 0 0 4505
26: 556422 0 0 301
27: 14702 0 0 6260
28: 13725 0 0 5924
29: 560214 0 0 826
30: 15144 0 0 6412
31: 14250 0 0 6075
32: 540102 0 0 972
33: 15122 0 0 6408
34: 14358 0 0 6157
35: 549364 0 0 1676
36: 14894 0 0 6302
37: 13023 0 0 5569
38: 571308 0 0 496
39: 15154 0 0 6391
40: 14430 0 0 6116
41: 530950 0 0 945
42: 15478 0 0 6571
43: 13633 0 0 5849
44: 511994 0 0 1052
45: 14892 0 0 6327
46: 13216 0 0 5686
47: 541362 0 0 559
*: 5557975 0 0 391184
cpu: recv drop time rps
0: 49414 0 0 13841
1: 50425 0 0 14087
2: 19888 0 0 7699
3: 49729 0 0 14059
4: 49847 0 0 13937
5: 20906 0 0 7860
6: 49566 0 0 13901
7: 49948 0 0 13943
8: 26602 0 0 9373
9: 50721 0 0 14104
10: 50977 0 0 14123
11: 31706 0 0 10511
12: 47948 0 0 13746
13: 48042 0 0 13843
14: 17106 0 0 7039
15: 47788 0 0 13749
16: 47675 0 0 13651
17: 22882 0 0 8337
18: 47063 0 0 13544
19: 48308 0 0 13791
20: 23578 0 0 8637
21: 48128 0 0 13847
22: 48846 0 0 13843
23: 20482 0 0 7744
24: 15436 0 0 6517
25: 12962 0 0 5592
26: 555592 0 0 216
27: 16100 0 0 6811
28: 13358 0 0 5712
29: 558520 0 0 572
30: 15792 0 0 6618
31: 14698 0 0 6277
32: 540146 0 0 1166
33: 15970 0 0 6711
34: 15034 0 0 6345
35: 548184 0 0 2099
36: 15194 0 0 6493
37: 12204 0 0 5338
38: 570088 0 0 77
39: 15828 0 0 6732
40: 13156 0 0 5694
41: 528860 0 0 906
42: 15874 0 0 6783
43: 13476 0 0 5795
44: 510034 0 0 973
45: 15476 0 0 6624
46: 13382 0 0 5816
47: 538046 0 0 551
*: 5550985 0 0 395627
cpu: recv drop time rps
0: 42896 0 0 12157
1: 43502 0 0 12338
2: 24072 0 0 8449
3: 42853 0 0 12258
4: 43146 0 0 12331
5: 21194 0 0 7741
6: 42003 0 0 12066
7: 43034 0 0 12182
8: 22830 0 0 7946
9: 43633 0 0 12347
10: 43731 0 0 12329
11: 24231 0 0 8449
12: 43542 0 0 12251
13: 44341 0 0 12409
14: 19231 0 0 7209
15: 43392 0 0 12284
16: 43544 0 0 12189
17: 24620 0 0 8463
18: 42781 0 0 12012
19: 44263 0 0 12439
20: 22462 0 0 8016
21: 43984 0 0 12512
22: 44554 0 0 12435
23: 19194 0 0 7287
24: 13028 0 0 5565
25: 11452 0 0 4959
26: 558724 0 0 850
27: 13408 0 0 5708
28: 11022 0 0 4745
29: 559438 0 0 645
30: 13428 0 0 5695
31: 12946 0 0 5524
32: 543112 0 0 1045
33: 13006 0 0 5520
34: 12110 0 0 5191
35: 547556 0 0 838
36: 13546 0 0 5723
37: 12582 0 0 5456
38: 571372 0 0 424
39: 14040 0 0 5898
40: 12936 0 0 5525
41: 536642 0 0 978
42: 14164 0 0 5957
43: 13158 0 0 5671
44: 518283 0 0 975
45: 13834 0 0 5860
46: 11914 0 0 5093
47: 542846 0 0 489
*: 5457580 0 0 354433
TCP_RR workload ends....
cpu: recv drop time rps
26: 588684 0 0 0
29: 589572 0 0 0
32: 589626 0 0 0
35: 588128 0 0 0
38: 589176 0 0 0
41: 589258 0 0 0
44: 589546 0 0 0
47: 587540 0 0 0
*: 4711633 0 0 12
Thanks
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH net-next] net: rfs: add hash collision detection
2015-02-06 20:59 [PATCH net-next] net: rfs: add hash collision detection Eric Dumazet
2015-02-06 22:21 ` Tom Herbert
@ 2015-02-09 0:54 ` David Miller
1 sibling, 0 replies; 4+ messages in thread
From: David Miller @ 2015-02-09 0:54 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev, therbert, ycai, willemb
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 06 Feb 2015 12:59:01 -0800
> From: Eric Dumazet <edumazet@google.com>
>
> Receive Flow Steering is a nice solution but suffers from
> hash collisions when a mix of connected and unconnected traffic
> is received on the host, when flow hash table is populated.
>
> Also, clearing flow in inet_release() makes RFS not very good
> for short lived flows, as many packets can follow close().
> (FIN , ACK packets, ...)
>
> This patch extends the information stored into global hash table
> to not only include cpu number, but upper part of the hash value.
>
> I use a 32bit value, and dynamically split it in two parts.
>
> For host with less than 64 possible cpus, this gives 6 bits for the
> cpu number, and 26 (32-6) bits for the upper part of the hash.
>
> Since hash bucket selection use low order bits of the hash, we have
> a full hash match, if /proc/sys/net/core/rps_sock_flow_entries is big
> enough.
>
> If the hash found in flow table does not match, we fallback to RPS (if
> it is enabled for the rxqueue).
>
> This means that a packet for an non connected flow can avoid the
> IPI through a unrelated/victim CPU.
>
> This also means we no longer have to clear the table at socket
> close time, and this helps short lived flows performance.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied, thanks Eric.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2015-02-09 0:54 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-02-06 20:59 [PATCH net-next] net: rfs: add hash collision detection Eric Dumazet
2015-02-06 22:21 ` Tom Herbert
2015-02-07 2:24 ` Eric Dumazet
2015-02-09 0:54 ` David Miller
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.