All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] packet: Add fanout support.
@ 2011-07-05  4:20 David Miller
  2011-07-05  4:33 ` Eric Dumazet
  2011-07-05  6:21 ` Eric Dumazet
  0 siblings, 2 replies; 28+ messages in thread
From: David Miller @ 2011-07-05  4:20 UTC (permalink / raw)
  To: victor; +Cc: netdev


Fanouts allow packet capturing to be demuxed to a set of AF_PACKET
sockets.  Two fanout policies are implemented:

1) Hashing based upon skb->rxhash

2) Pure round-robin

An AF_PACKET socket must be fully bound before it tries to add itself
to a fanout.  All AF_PACKET sockets trying to join the same fanout
must all have the same bind settings.

Fanouts are identified (within a network namespace) by a 16-bit ID.
The first socket to try to add itself to a fanout with a particular
ID, creates that fanout.  When the last socket leaves the fanout
(which happens only when the socket is closed), that fanout is
destroyed.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_packet.h |    4 +
 net/packet/af_packet.c    |  250 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 249 insertions(+), 5 deletions(-)

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 7b31863..1efa1cb 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -49,6 +49,10 @@ struct sockaddr_ll {
 #define PACKET_VNET_HDR			15
 #define PACKET_TX_TIMESTAMP		16
 #define PACKET_TIMESTAMP		17
+#define PACKET_FANOUT			18
+
+#define PACKET_FANOUT_HASH		0
+#define PACKET_FANOUT_LB		1
 
 struct tpacket_stats {
 	unsigned int	tp_packets;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bb281bf..7db1e12 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
 
 static void packet_flush_mclist(struct sock *sk);
 
+struct packet_fanout;
 struct packet_sock {
 	/* struct sock has to be the first member of packet_sock */
 	struct sock		sk;
+	struct packet_fanout	*fanout;
 	struct tpacket_stats	stats;
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
@@ -212,6 +214,24 @@ struct packet_sock {
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 };
 
+#define PACKET_FANOUT_MAX	2048
+
+struct packet_fanout {
+#ifdef CONFIG_NET_NS
+	struct net		*net;
+#endif
+	int			num_members;
+	u16			id;
+	u8			type;
+	u8			pad;
+	atomic_t		rr_cur;
+	struct list_head	list;
+	struct sock		*arr[PACKET_FANOUT_MAX];
+	spinlock_t		lock;
+	atomic_t		sk_ref;
+	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+};
+
 struct packet_skb_cb {
 	unsigned int origlen;
 	union {
@@ -227,6 +247,9 @@ static inline struct packet_sock *pkt_sk(struct sock *sk)
 	return (struct packet_sock *)sk;
 }
 
+static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
+static void __fanout_link(struct sock *sk, struct packet_sock *po);
+
 /* register_prot_hook must be invoked with the po->bind_lock held,
  * or from a context in which asynchronous accesses to the packet
  * socket is not possible (packet_create()).
@@ -235,7 +258,10 @@ static void register_prot_hook(struct sock *sk)
 {
 	struct packet_sock *po = pkt_sk(sk);
 	if (!po->running) {
-		dev_add_pack(&po->prot_hook);
+		if (po->fanout)
+			__fanout_link(sk, po);
+		else
+			dev_add_pack(&po->prot_hook);
 		sock_hold(sk);
 		po->running = 1;
 	}
@@ -253,7 +279,10 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
 	struct packet_sock *po = pkt_sk(sk);
 
 	po->running = 0;
-	__dev_remove_pack(&po->prot_hook);
+	if (po->fanout)
+		__fanout_unlink(sk, po);
+	else
+		__dev_remove_pack(&po->prot_hook);
 	__sock_put(sk);
 
 	if (sync) {
@@ -388,6 +417,195 @@ static void packet_sock_destruct(struct sock *sk)
 	sk_refcnt_debug_dec(sk);
 }
 
+static int fanout_rr_next(struct packet_fanout *f)
+{
+	int x = atomic_read(&f->rr_cur) + 1;
+
+	if (x >= f->num_members)
+		x = 0;
+
+	return x;
+}
+
+static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb)
+{
+	u32 idx, hash = skb->rxhash;
+
+	idx = ((u64)hash * f->num_members) >> 32;
+
+	return f->arr[idx];
+}
+
+static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb)
+{
+	int cur, old;
+
+	cur = atomic_read(&f->rr_cur);
+	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
+				     fanout_rr_next(f))) != cur)
+		cur = old;
+	return f->arr[cur];
+}
+
+static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
+				  struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct packet_fanout *f = pt->af_packet_priv;
+	struct packet_sock *po;
+	struct sock *sk;
+
+	if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	sk = fanout_demux_hash(f, skb);
+	po = pkt_sk(sk);
+
+	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
+				struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct packet_fanout *f = pt->af_packet_priv;
+	struct packet_sock *po;
+	struct sock *sk;
+
+	if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	sk = fanout_demux_lb(f, skb);
+	po = pkt_sk(sk);
+
+	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+static DEFINE_MUTEX(fanout_mutex);
+static LIST_HEAD(fanout_list);
+
+static void __fanout_link(struct sock *sk, struct packet_sock *po)
+{
+	struct packet_fanout *f = po->fanout;
+
+	spin_lock(&f->lock);
+	f->arr[f->num_members] = sk;
+	smp_wmb();
+	f->num_members++;
+	spin_unlock(&f->lock);
+}
+
+static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
+{
+	struct packet_fanout *f = po->fanout;
+	int i;
+
+	spin_unlock(&f->lock);
+	for (i = 0; i < f->num_members; i++) {
+		if (f->arr[i] == sk)
+			break;
+	}
+	BUG_ON(i >= f->num_members);
+	f->arr[i] = f->arr[f->num_members - 1];
+	f->num_members--;
+	spin_unlock(&f->lock);
+}
+
+static int fanout_add(struct sock *sk, u16 id, u8 type)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_fanout *f, *match;
+	int err;
+
+	switch (type) {
+	case PACKET_FANOUT_HASH:
+	case PACKET_FANOUT_LB:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!po->running)
+		return -EINVAL;
+
+	if (po->fanout)
+		return -EALREADY;
+
+	mutex_lock(&fanout_mutex);
+	match = NULL;
+	list_for_each_entry(f, &fanout_list, list) {
+		if (f->id == id &&
+		    read_pnet(&f->net) == sock_net(sk)) {
+			match = f;
+			break;
+		}
+	}
+	if (!match) {
+		match = kzalloc(sizeof(*match), GFP_KERNEL);
+		if (match) {
+			write_pnet(&match->net, sock_net(sk));
+			match->id = id;
+			match->type = type;
+			atomic_set(&match->rr_cur, 0);
+			INIT_LIST_HEAD(&match->list);
+			spin_lock_init(&match->lock);
+			atomic_set(&match->sk_ref, 0);
+			match->prot_hook.type = po->prot_hook.type;
+			match->prot_hook.dev = po->prot_hook.dev;
+			switch (type) {
+			case PACKET_FANOUT_HASH:
+				match->prot_hook.func = packet_rcv_fanout_hash;
+				break;
+			case PACKET_FANOUT_LB:
+				match->prot_hook.func = packet_rcv_fanout_lb;
+				break;
+			}
+			match->prot_hook.af_packet_priv = match;
+			dev_add_pack(&match->prot_hook);
+			list_add(&match->list, &fanout_list);
+		}
+	}
+	err = -ENOMEM;
+	if (match) {
+		err = -EINVAL;
+		if (match->type == type &&
+		    match->prot_hook.type == po->prot_hook.type &&
+		    match->prot_hook.dev == po->prot_hook.dev) {
+			err = -ENOSPC;
+			if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
+				__dev_remove_pack(&po->prot_hook);
+				po->fanout = match;
+				atomic_inc(&match->sk_ref);
+				__fanout_link(sk, po);
+				err = 0;
+			}
+		}
+	}
+	mutex_unlock(&fanout_mutex);
+	return err;
+}
+
+static void fanout_release(struct sock *sk)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_fanout *f;
+
+	f = po->fanout;
+	if (!f)
+		return;
+
+	po->fanout = NULL;
+
+	mutex_lock(&fanout_mutex);
+	if (atomic_dec_and_test(&f->sk_ref)) {
+		list_del(&f->list);
+		dev_remove_pack(&f->prot_hook);
+		kfree(f);
+	}
+	mutex_unlock(&fanout_mutex);
+}
 
 static const struct proto_ops packet_ops;
 
@@ -1398,6 +1616,8 @@ static int packet_release(struct socket *sock)
 	if (po->tx_ring.pg_vec)
 		packet_set_ring(sk, &req, 1, 1);
 
+	fanout_release(sk);
+
 	synchronize_net();
 	/*
 	 *	Now the socket is dead. No more input will appear.
@@ -1421,9 +1641,9 @@ static int packet_release(struct socket *sock)
 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
 {
 	struct packet_sock *po = pkt_sk(sk);
-	/*
-	 *	Detach an existing hook if present.
-	 */
+
+	if (po->fanout)
+		return -EINVAL;
 
 	lock_sock(sk);
 
@@ -2133,6 +2353,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->tp_tstamp = val;
 		return 0;
 	}
+	case PACKET_FANOUT:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		return fanout_add(sk, val & 0xffff, val >> 16);
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2231,6 +2462,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = po->tp_tstamp;
 		data = &val;
 		break;
+	case PACKET_FANOUT:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = (po->fanout ?
+		       ((u32)po->fanout->id |
+			((u32)po->fanout->type << 16)) :
+		       0);
+		data = &val;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
1.7.5.4


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  4:20 [PATCH 2/2] packet: Add fanout support David Miller
@ 2011-07-05  4:33 ` Eric Dumazet
  2011-07-05  4:36   ` David Miller
  2011-07-05  6:21 ` Eric Dumazet
  1 sibling, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2011-07-05  4:33 UTC (permalink / raw)
  To: David Miller; +Cc: victor, netdev

Le lundi 04 juillet 2011 à 21:20 -0700, David Miller a écrit :
> Fanouts allow packet capturing to be demuxed to a set of AF_PACKET
> sockets.  Two fanout policies are implemented:
> 
> 1) Hashing based upon skb->rxhash
> 
> 2) Pure round-robin
> 
> An AF_PACKET socket must be fully bound before it tries to add itself
> to a fanout.  All AF_PACKET sockets trying to join the same fanout
> must all have the same bind settings.
> 
> Fanouts are identified (within a network namespace) by a 16-bit ID.
> The first socket to try to add itself to a fanout with a particular
> ID, creates that fanout.  When the last socket leaves the fanout
> (which happens only when the socket is closed), that fanout is
> destroyed.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  include/linux/if_packet.h |    4 +
>  net/packet/af_packet.c    |  250 ++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 249 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> index 7b31863..1efa1cb 100644
> --- a/include/linux/if_packet.h
> +++ b/include/linux/if_packet.h
> @@ -49,6 +49,10 @@ struct sockaddr_ll {
>  #define PACKET_VNET_HDR			15
>  #define PACKET_TX_TIMESTAMP		16
>  #define PACKET_TIMESTAMP		17
> +#define PACKET_FANOUT			18
> +
> +#define PACKET_FANOUT_HASH		0
> +#define PACKET_FANOUT_LB		1
>  
>  struct tpacket_stats {
>  	unsigned int	tp_packets;
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index bb281bf..7db1e12 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
>  
>  static void packet_flush_mclist(struct sock *sk);
>  
> +struct packet_fanout;
>  struct packet_sock {
>  	/* struct sock has to be the first member of packet_sock */
>  	struct sock		sk;
> +	struct packet_fanout	*fanout;
>  	struct tpacket_stats	stats;
>  	struct packet_ring_buffer	rx_ring;
>  	struct packet_ring_buffer	tx_ring;
> @@ -212,6 +214,24 @@ struct packet_sock {
>  	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
>  };
>  
> +#define PACKET_FANOUT_MAX	2048
> +
> +struct packet_fanout {
> +#ifdef CONFIG_NET_NS
> +	struct net		*net;
> +#endif
> +	int			num_members;
> +	u16			id;
> +	u8			type;
> +	u8			pad;
> +	atomic_t		rr_cur;
> +	struct list_head	list;
> +	struct sock		*arr[PACKET_FANOUT_MAX];

Thats about 16Kbytes, yet you use kzalloc()

> +	spinlock_t		lock;
> +	atomic_t		sk_ref;
> +	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
> +};
> +

Maybe use a dynamic array ? I suspect most uses wont even reach 16
sockets anyway...


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  4:33 ` Eric Dumazet
@ 2011-07-05  4:36   ` David Miller
  0 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2011-07-05  4:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: victor, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 05 Jul 2011 06:33:49 +0200

> Le lundi 04 juillet 2011 à 21:20 -0700, David Miller a écrit :
>> +#define PACKET_FANOUT_MAX	2048
 ...
>> +	struct sock		*arr[PACKET_FANOUT_MAX];
> 
> Thats about 16Kbytes, yet you use kzalloc()
> 
>> +	spinlock_t		lock;
>> +	atomic_t		sk_ref;
>> +	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
>> +};
>> +
> 
> Maybe use a dynamic array ? I suspect most uses wont even reach 16
> sockets anyway...

True.  Another option, for now, is to just make PACKET_FANOUT_MAX more
reasonable.  I'll make it something like 256.

Thanks!


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  4:20 [PATCH 2/2] packet: Add fanout support David Miller
  2011-07-05  4:33 ` Eric Dumazet
@ 2011-07-05  6:21 ` Eric Dumazet
  2011-07-05  6:56   ` Victor Julien
  2011-07-05  7:46   ` David Miller
  1 sibling, 2 replies; 28+ messages in thread
From: Eric Dumazet @ 2011-07-05  6:21 UTC (permalink / raw)
  To: David Miller; +Cc: victor, netdev

Le lundi 04 juillet 2011 à 21:20 -0700, David Miller a écrit :
> Fanouts allow packet capturing to be demuxed to a set of AF_PACKET
> sockets.  Two fanout policies are implemented:
> 
> 1) Hashing based upon skb->rxhash

...

> +
> +static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb)
> +{
> +	u32 idx, hash = skb->rxhash;
> +
> +	idx = ((u64)hash * f->num_members) >> 32;
> +
> +	return f->arr[idx];
> +}
> +

rxhash is 0 unless skb_get_rxhash() was called, or some NIC set it in RX
path.




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  6:21 ` Eric Dumazet
@ 2011-07-05  6:56   ` Victor Julien
  2011-07-05  7:00     ` Eric Dumazet
  2011-07-05  7:48     ` David Miller
  2011-07-05  7:46   ` David Miller
  1 sibling, 2 replies; 28+ messages in thread
From: Victor Julien @ 2011-07-05  6:56 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On 07/05/2011 08:21 AM, Eric Dumazet wrote:
> Le lundi 04 juillet 2011 à 21:20 -0700, David Miller a écrit :
>> Fanouts allow packet capturing to be demuxed to a set of AF_PACKET
>> sockets.  Two fanout policies are implemented:
>>
>> 1) Hashing based upon skb->rxhash
> 
> ...
> 
>> +
>> +static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb)
>> +{
>> +	u32 idx, hash = skb->rxhash;
>> +
>> +	idx = ((u64)hash * f->num_members) >> 32;
>> +
>> +	return f->arr[idx];
>> +}
>> +
> 
> rxhash is 0 unless skb_get_rxhash() was called, or some NIC set it in RX
> path.
> 

Is this still also true for IP fragments?

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  6:56   ` Victor Julien
@ 2011-07-05  7:00     ` Eric Dumazet
  2011-07-05  7:06       ` Victor Julien
                         ` (2 more replies)
  2011-07-05  7:48     ` David Miller
  1 sibling, 3 replies; 28+ messages in thread
From: Eric Dumazet @ 2011-07-05  7:00 UTC (permalink / raw)
  To: Victor Julien; +Cc: David Miller, netdev

Le mardi 05 juillet 2011 à 08:56 +0200, Victor Julien a écrit :

> Is this still also true for IP fragments?
> 

This point was already raised. IP fragments have rxhash = 0, obviously,
since we dont have full information (source / destination ports for
example)




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  7:00     ` Eric Dumazet
@ 2011-07-05  7:06       ` Victor Julien
  2011-07-05  7:50       ` David Miller
  2011-07-05 16:03       ` Loke, Chetan
  2 siblings, 0 replies; 28+ messages in thread
From: Victor Julien @ 2011-07-05  7:06 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On 07/05/2011 09:00 AM, Eric Dumazet wrote:
> Le mardi 05 juillet 2011 à 08:56 +0200, Victor Julien a écrit :
> 
>> Is this still also true for IP fragments?
>>
> 
> This point was already raised. IP fragments have rxhash = 0, obviously,
> since we dont have full information (source / destination ports for
> example)

Sure, just seeing if something was changed here as that wasn't
immediately obvious to me from the code.

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  6:21 ` Eric Dumazet
  2011-07-05  6:56   ` Victor Julien
@ 2011-07-05  7:46   ` David Miller
  2011-07-05  9:50     ` Eric Dumazet
  1 sibling, 1 reply; 28+ messages in thread
From: David Miller @ 2011-07-05  7:46 UTC (permalink / raw)
  To: eric.dumazet; +Cc: victor, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 05 Jul 2011 08:21:15 +0200

> rxhash is 0 unless skb_get_rxhash() was called, or some NIC set it in RX
> path.

CONFIG_RPS is effectively on all the time for SMP builds.

If you want to make it a hard enable in that situation,
I fully support such a change. :-)


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  6:56   ` Victor Julien
  2011-07-05  7:00     ` Eric Dumazet
@ 2011-07-05  7:48     ` David Miller
  1 sibling, 0 replies; 28+ messages in thread
From: David Miller @ 2011-07-05  7:48 UTC (permalink / raw)
  To: victor; +Cc: eric.dumazet, netdev

From: Victor Julien <victor@inliniac.net>
Date: Tue, 05 Jul 2011 08:56:38 +0200

> On 07/05/2011 08:21 AM, Eric Dumazet wrote:
>> Le lundi 04 juillet 2011 à 21:20 -0700, David Miller a écrit :
>>> Fanouts allow packet capturing to be demuxed to a set of AF_PACKET
>>> sockets.  Two fanout policies are implemented:
>>>
>>> 1) Hashing based upon skb->rxhash
>> 
>> ...
>> 
>>> +
>>> +static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb)
>>> +{
>>> +	u32 idx, hash = skb->rxhash;
>>> +
>>> +	idx = ((u64)hash * f->num_members) >> 32;
>>> +
>>> +	return f->arr[idx];
>>> +}
>>> +
>> 
>> rxhash is 0 unless skb_get_rxhash() was called, or some NIC set it in RX
>> path.
>> 
> 
> Is this still also true for IP fragments?

I have a plan to fix this.  But what I've posted will work as you want
it to for everything else.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  7:00     ` Eric Dumazet
  2011-07-05  7:06       ` Victor Julien
@ 2011-07-05  7:50       ` David Miller
  2011-07-05  7:52         ` Victor Julien
  2011-07-05 16:03       ` Loke, Chetan
  2 siblings, 1 reply; 28+ messages in thread
From: David Miller @ 2011-07-05  7:50 UTC (permalink / raw)
  To: eric.dumazet; +Cc: victor, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 05 Jul 2011 09:00:14 +0200

> Le mardi 05 juillet 2011 à 08:56 +0200, Victor Julien a écrit :
> 
>> Is this still also true for IP fragments?
>> 
> 
> This point was already raised. IP fragments have rxhash = 0, obviously,
> since we dont have full information (source / destination ports for
> example)

I will fix this.

I will add an option to the fanout, as a flag bit to the setsockopt,
that will cause the fanout to act as a ip_defrag() agent and rehash
the packet once all the fragments arrive.

Since this will operate internally on the clones passed to AF_PACKET,
it will not have any effect on the original packets traversing through
the machine.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  7:50       ` David Miller
@ 2011-07-05  7:52         ` Victor Julien
  2011-07-05  8:02           ` David Miller
  0 siblings, 1 reply; 28+ messages in thread
From: Victor Julien @ 2011-07-05  7:52 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev

On 07/05/2011 09:50 AM, David Miller wrote:
> I will add an option to the fanout, as a flag bit to the setsockopt,
> that will cause the fanout to act as a ip_defrag() agent and rehash
> the packet once all the fragments arrive.
> 
> Since this will operate internally on the clones passed to AF_PACKET,
> it will not have any effect on the original packets traversing through
> the machine.
> 

Does this mean the user space app gets a reassembled packet from the fan
out?

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  7:52         ` Victor Julien
@ 2011-07-05  8:02           ` David Miller
  2011-07-05  8:47             ` Hagen Paul Pfeifer
  2011-07-05  8:48             ` Victor Julien
  0 siblings, 2 replies; 28+ messages in thread
From: David Miller @ 2011-07-05  8:02 UTC (permalink / raw)
  To: victor; +Cc: eric.dumazet, netdev

From: Victor Julien <victor@inliniac.net>
Date: Tue, 05 Jul 2011 09:52:18 +0200

> On 07/05/2011 09:50 AM, David Miller wrote:
>> I will add an option to the fanout, as a flag bit to the setsockopt,
>> that will cause the fanout to act as a ip_defrag() agent and rehash
>> the packet once all the fragments arrive.
>> 
>> Since this will operate internally on the clones passed to AF_PACKET,
>> it will not have any effect on the original packets traversing through
>> the machine.
>> 
> 
> Does this mean the user space app gets a reassembled packet from the fan
> out?

Yes.

Would you prefer to receive the frags individually?


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  8:02           ` David Miller
@ 2011-07-05  8:47             ` Hagen Paul Pfeifer
  2011-07-05  8:48             ` Victor Julien
  1 sibling, 0 replies; 28+ messages in thread
From: Hagen Paul Pfeifer @ 2011-07-05  8:47 UTC (permalink / raw)
  To: David Miller; +Cc: victor, eric.dumazet, netdev


On Tue, 05 Jul 2011 01:02:54 -0700 (PDT), David Miller wrote:



> Would you prefer to receive the frags individually?



I think both ways are favored. Often you just want to forward packets

unmodified after analysis, without any packet modification. For that the

packet should be received on the same socket (e.g. all with hash 0). On the

other hand there are use cases where packet analysis requires reassembled

packets.



Nice work David!



Hagen 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  8:02           ` David Miller
  2011-07-05  8:47             ` Hagen Paul Pfeifer
@ 2011-07-05  8:48             ` Victor Julien
  2011-07-05  9:01               ` David Miller
  1 sibling, 1 reply; 28+ messages in thread
From: Victor Julien @ 2011-07-05  8:48 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev

On 07/05/2011 10:02 AM, David Miller wrote:
> Would you prefer to receive the frags individually?

Ideally what we receive is the same as on the wire, but I think your
solution would be fine.

In addition to the reassembled packet, having an option to receive the
individual frags as well would be of use. Suricata (and Snort as well)
has it's own defrag code that does anomaly detection. I can see a use
case where just for that anomaly detection we'd still want to see the
frags. In that case it wouldn't be a problem that they would all go to
fanout 0.

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  8:48             ` Victor Julien
@ 2011-07-05  9:01               ` David Miller
  0 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2011-07-05  9:01 UTC (permalink / raw)
  To: victor; +Cc: eric.dumazet, netdev

From: Victor Julien <victor@inliniac.net>
Date: Tue, 05 Jul 2011 10:48:07 +0200

> On 07/05/2011 10:02 AM, David Miller wrote:
>> Would you prefer to receive the frags individually?
> 
> Ideally what we receive is the same as on the wire, but I think your
> solution would be fine.
> 
> In addition to the reassembled packet, having an option to receive the
> individual frags as well would be of use. Suricata (and Snort as well)
> has it's own defrag code that does anomaly detection. I can see a use
> case where just for that anomaly detection we'd still want to see the
> frags. In that case it wouldn't be a problem that they would all go to
> fanout 0.

Ok, I'm about to post a new set of patches that add the pre-defragmentation
support.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  7:46   ` David Miller
@ 2011-07-05  9:50     ` Eric Dumazet
  2011-07-05 10:07       ` David Miller
  0 siblings, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2011-07-05  9:50 UTC (permalink / raw)
  To: David Miller; +Cc: victor, netdev

Le mardi 05 juillet 2011 à 00:46 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 05 Jul 2011 08:21:15 +0200
> 
> > rxhash is 0 unless skb_get_rxhash() was called, or some NIC set it in RX
> > path.
> 
> CONFIG_RPS is effectively on all the time for SMP builds.
> 
> If you want to make it a hard enable in that situation,
> I fully support such a change. :-)
> 

CONFIG_RPS can be on, but skb->rxhash still 0 by default on tg3 for
example.

get_rps_cpu() wont call skb_get_rxhash() if RPS/RFS is not setup on
rxqueue (rps_map == NULL and rps_flow_table = NULL)

You really have to call skb_get_rxhash() yourself to make sure rxhash is
set (if not already done)

Just replace   skb->rxhash by skb_get_rxhash(skb)




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  9:50     ` Eric Dumazet
@ 2011-07-05 10:07       ` David Miller
  0 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2011-07-05 10:07 UTC (permalink / raw)
  To: eric.dumazet; +Cc: victor, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 05 Jul 2011 11:50:24 +0200

> You really have to call skb_get_rxhash() yourself to make sure rxhash is
> set (if not already done)
> 
> Just replace   skb->rxhash by skb_get_rxhash(skb)

Read v2 of my patch :-)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH 2/2] packet: Add fanout support.
  2011-07-05  7:00     ` Eric Dumazet
  2011-07-05  7:06       ` Victor Julien
  2011-07-05  7:50       ` David Miller
@ 2011-07-05 16:03       ` Loke, Chetan
  2011-07-05 16:08         ` David Miller
  2011-07-05 16:16         ` Eric Dumazet
  2 siblings, 2 replies; 28+ messages in thread
From: Loke, Chetan @ 2011-07-05 16:03 UTC (permalink / raw)
  To: Eric Dumazet, Victor Julien; +Cc: David Miller, netdev

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Eric Dumazet
> Sent: July 05, 2011 3:00 AM
> To: Victor Julien
> Cc: David Miller; netdev@vger.kernel.org
> Subject: Re: [PATCH 2/2] packet: Add fanout support.
> 
> Le mardi 05 juillet 2011 à 08:56 +0200, Victor Julien a écrit :
> 
> > Is this still also true for IP fragments?
> >
> 
> This point was already raised. IP fragments have rxhash = 0, obviously,
> since we dont have full information (source / destination ports for
> example)

Can we not do something like:

a = src_ip_addr;
b = dst_ip_addr;

if (ip_is_fragment(ip_hdr(skb)))
	c = ip_hdr->id;
else
	c = src_port | dest_port ; /* port_32 etc - Similar to what we have today */

/* swap a/b etc */
jhash3_words(a,b,c);



Chetan Loke

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 16:03       ` Loke, Chetan
@ 2011-07-05 16:08         ` David Miller
  2011-07-05 16:16         ` Eric Dumazet
  1 sibling, 0 replies; 28+ messages in thread
From: David Miller @ 2011-07-05 16:08 UTC (permalink / raw)
  To: Chetan.Loke; +Cc: eric.dumazet, victor, netdev

From: "Loke, Chetan" <Chetan.Loke@netscout.com>
Date: Tue, 5 Jul 2011 12:03:29 -0400

> Can we not do something like:
> 
> a = src_ip_addr;
> b = dst_ip_addr;
> 
> if (ip_is_fragment(ip_hdr(skb)))
> 	c = ip_hdr->id;
> else
> 	c = src_port | dest_port ; /* port_32 etc - Similar to what we have today 

A UDP flow can be composed of fragmented and non-fragmented
parts, we want all of the packets from that flow to land
on the same hash.

Your scheme does not provide that essential property.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 16:03       ` Loke, Chetan
  2011-07-05 16:08         ` David Miller
@ 2011-07-05 16:16         ` Eric Dumazet
  2011-07-05 16:21           ` Victor Julien
  2011-07-05 17:35           ` Loke, Chetan
  1 sibling, 2 replies; 28+ messages in thread
From: Eric Dumazet @ 2011-07-05 16:16 UTC (permalink / raw)
  To: Loke, Chetan; +Cc: Victor Julien, David Miller, netdev

Le mardi 05 juillet 2011 à 12:03 -0400, Loke, Chetan a écrit :
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org [mailto:netdev-
> > owner@vger.kernel.org] On Behalf Of Eric Dumazet
> > Sent: July 05, 2011 3:00 AM
> > To: Victor Julien
> > Cc: David Miller; netdev@vger.kernel.org
> > Subject: Re: [PATCH 2/2] packet: Add fanout support.
> > 
> > Le mardi 05 juillet 2011 à 08:56 +0200, Victor Julien a écrit :
> > 
> > > Is this still also true for IP fragments?
> > >
> > 
> > This point was already raised. IP fragments have rxhash = 0, obviously,
> > since we dont have full information (source / destination ports for
> > example)
> 
> Can we not do something like:
> 
> a = src_ip_addr;
> b = dst_ip_addr;
> 
> if (ip_is_fragment(ip_hdr(skb)))
> 	c = ip_hdr->id;
> else
> 	c = src_port | dest_port ; /* port_32 etc - Similar to what we have today */
> 
> /* swap a/b etc */
> jhash3_words(a,b,c);
> 
> 
> 

Sure, but non fragmented packets will then get a different rxhash.

Remember, goal is that _all_ packets of a given flow end in same queue.




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 16:16         ` Eric Dumazet
@ 2011-07-05 16:21           ` Victor Julien
  2011-07-05 16:23             ` Eric Dumazet
  2011-07-05 17:35           ` Loke, Chetan
  1 sibling, 1 reply; 28+ messages in thread
From: Victor Julien @ 2011-07-05 16:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Loke, Chetan, David Miller, netdev

On 07/05/2011 06:16 PM, Eric Dumazet wrote:
> Remember, goal is that _all_ packets of a given flow end in same queue.
> 

What about a hashing scheme based on just the ip addresses? Would make
rxhash useless for this purpose, but would be a lot simpler overall maybe...

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 16:21           ` Victor Julien
@ 2011-07-05 16:23             ` Eric Dumazet
  2011-07-05 17:15               ` Victor Julien
  0 siblings, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2011-07-05 16:23 UTC (permalink / raw)
  To: Victor Julien; +Cc: Loke, Chetan, David Miller, netdev

Le mardi 05 juillet 2011 à 18:21 +0200, Victor Julien a écrit :
> On 07/05/2011 06:16 PM, Eric Dumazet wrote:
> > Remember, goal is that _all_ packets of a given flow end in same queue.
> > 
> 
> What about a hashing scheme based on just the ip addresses? Would make
> rxhash useless for this purpose, but would be a lot simpler overall maybe...
> 

What about loads where a single IP address is used ?

I wonder what's the problem, since David added a defrag unit ;)




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 16:23             ` Eric Dumazet
@ 2011-07-05 17:15               ` Victor Julien
  2011-07-05 18:26                 ` Eric Dumazet
  0 siblings, 1 reply; 28+ messages in thread
From: Victor Julien @ 2011-07-05 17:15 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Loke, Chetan, David Miller, netdev

On 07/05/2011 06:23 PM, Eric Dumazet wrote:
> Le mardi 05 juillet 2011 à 18:21 +0200, Victor Julien a écrit :
>> On 07/05/2011 06:16 PM, Eric Dumazet wrote:
>>> Remember, goal is that _all_ packets of a given flow end in same queue.
>>>
>>
>> What about a hashing scheme based on just the ip addresses? Would make
>> rxhash useless for this purpose, but would be a lot simpler overall maybe...
>>
> 
> What about loads where a single IP address is used ?

How would that be a problem?

> I wonder what's the problem, since David added a defrag unit ;)

No problem, I just suggested it as Chetan Loke brought up the hashing
suggestion. David's solution would work fine for me.

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 16:16         ` Eric Dumazet
  2011-07-05 16:21           ` Victor Julien
@ 2011-07-05 17:35           ` Loke, Chetan
  2011-07-05 18:20             ` Eric Dumazet
  1 sibling, 1 reply; 28+ messages in thread
From: Loke, Chetan @ 2011-07-05 17:35 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Victor Julien, David Miller, netdev

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: July 05, 2011 12:16 PM
> To: Loke, Chetan
> Cc: Victor Julien; David Miller; netdev@vger.kernel.org
> Subject: RE: [PATCH 2/2] packet: Add fanout support.
> 
> Le mardi 05 juillet 2011 à 12:03 -0400, Loke, Chetan a écrit :
> > > -----Original Message-----
> > > From: netdev-owner@vger.kernel.org [mailto:netdev-
> > > owner@vger.kernel.org] On Behalf Of Eric Dumazet
> > > Sent: July 05, 2011 3:00 AM
> > > To: Victor Julien
> > > Cc: David Miller; netdev@vger.kernel.org
> > > Subject: Re: [PATCH 2/2] packet: Add fanout support.
> > >
> > > Le mardi 05 juillet 2011 à 08:56 +0200, Victor Julien a écrit :
> > >
> > > > Is this still also true for IP fragments?
> > > >
> > >
> > > This point was already raised. IP fragments have rxhash = 0,
> obviously,
> > > since we dont have full information (source / destination ports for
> > > example)
> >
> > Can we not do something like:
> >
> > a = src_ip_addr;
> > b = dst_ip_addr;
> >
> > if (ip_is_fragment(ip_hdr(skb)))
> > 	c = ip_hdr->id;
> > else
> > 	c = src_port | dest_port ; /* port_32 etc - Similar to what we
> have today */
> >
> > /* swap a/b etc */
> > jhash3_words(a,b,c);
> >
> >
> >
> 
> Sure, but non fragmented packets will then get a different rxhash.
> 
> Remember, goal is that _all_ packets of a given flow end in same queue.
> 
> 

Sure, a lookup is needed(to steer what I call - Hot/Cold flows) and was proposed by me on the oisf mailing list. Always, use the ip_id bit then? Another problem that needs to be solved is, what if some decoders are overloaded, then what? How will this scheme work? How will we utilize other CPUs? RPS is needed for sure.

If we maintain a i) per port lookup-table ii) 2^20 flows/table and iii) 16 bytes/flow(one can also squeeze it down to 8 bytes) then we will need around 32MB worth memory/port. It's not a huge memory pressure for folks who want to use linux for doing IPS/IDS sort of stuff.

User-space decoders end up copying the packet anyways. So fanout can be implemented in user-space to achieve effective CPU utilization.
As long as we don't bounce on different CPU-socket we could be ok.


Chetan Loke

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 17:35           ` Loke, Chetan
@ 2011-07-05 18:20             ` Eric Dumazet
  2011-07-05 21:10               ` Loke, Chetan
  0 siblings, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2011-07-05 18:20 UTC (permalink / raw)
  To: Loke, Chetan; +Cc: Victor Julien, David Miller, netdev

Le mardi 05 juillet 2011 à 13:35 -0400, Loke, Chetan a écrit :

> Sure, a lookup is needed(to steer what I call - Hot/Cold flows) and
> was proposed by me on the oisf mailing list. Always, use the ip_id bit
> then? Another problem that needs to be solved is, what if some
> decoders are overloaded, then what? How will this scheme work? How
> will we utilize other CPUs? RPS is needed for sure.
> 
> If we maintain a i) per port lookup-table ii) 2^20 flows/table and
> iii) 16 bytes/flow(one can also squeeze it down to 8 bytes) then we
> will need around 32MB worth memory/port. It's not a huge memory
> pressure for folks who want to use linux for doing IPS/IDS sort of
> stuff.
> 
> User-space decoders end up copying the packet anyways. So fanout can
> be implemented in user-space to achieve effective CPU utilization.
> As long as we don't bounce on different CPU-socket we could be ok.

This is the problem we want to address.

Going into user-space to perform the fanout is what you already have
today, with one socket, one thread doing the fanout to worker threads.

David patch is non adaptative : its a hash on N queue, with a fixed hash
function.

What you want is to add another 'control queue' where new flows are
directed. Then user application is able to reinject into kernel flow
director the "This flow should go to queue X" information.

Or, let the kernel do a mix of rxhash and loadbalance : Be able to
select a queue for a new flow without user land control, using a Flow
hash table.






^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 17:15               ` Victor Julien
@ 2011-07-05 18:26                 ` Eric Dumazet
  2011-07-05 18:34                   ` Victor Julien
  0 siblings, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2011-07-05 18:26 UTC (permalink / raw)
  To: Victor Julien; +Cc: Loke, Chetan, David Miller, netdev

Le mardi 05 juillet 2011 à 19:15 +0200, Victor Julien a écrit :
> On 07/05/2011 06:23 PM, Eric Dumazet wrote:
> > Le mardi 05 juillet 2011 à 18:21 +0200, Victor Julien a écrit :
> >> On 07/05/2011 06:16 PM, Eric Dumazet wrote:
> >>> Remember, goal is that _all_ packets of a given flow end in same queue.
> >>>
> >>
> >> What about a hashing scheme based on just the ip addresses? Would make
> >> rxhash useless for this purpose, but would be a lot simpler overall maybe...
> >>
> > 
> > What about loads where a single IP address is used ?
> 
> How would that be a problem?

Say I want my program able to process 2.000.000 packets per second on my
64 cpu machine, but one cpu is only able to process 100.000 pps.

All these packets are coming from IP1, and go to IP2, but thousand of
different flows are in flight.

We need hash function taking into account source port and destination
port.




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 18:26                 ` Eric Dumazet
@ 2011-07-05 18:34                   ` Victor Julien
  0 siblings, 0 replies; 28+ messages in thread
From: Victor Julien @ 2011-07-05 18:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Loke, Chetan, David Miller, netdev

On 07/05/2011 08:26 PM, Eric Dumazet wrote:
>>>> What about a hashing scheme based on just the ip addresses? Would make
>>>> rxhash useless for this purpose, but would be a lot simpler overall maybe...
>>>>
>>>
>>> What about loads where a single IP address is used ?
>>
>> How would that be a problem?
> 
> Say I want my program able to process 2.000.000 packets per second on my
> 64 cpu machine, but one cpu is only able to process 100.000 pps.
> 
> All these packets are coming from IP1, and go to IP2, but thousand of
> different flows are in flight.
> 
> We need hash function taking into account source port and destination
> port.

Point taken. I see no other way either.

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH 2/2] packet: Add fanout support.
  2011-07-05 18:20             ` Eric Dumazet
@ 2011-07-05 21:10               ` Loke, Chetan
  0 siblings, 0 replies; 28+ messages in thread
From: Loke, Chetan @ 2011-07-05 21:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Victor Julien, David Miller, netdev

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: July 05, 2011 2:21 PM
> To: Loke, Chetan
> Cc: Victor Julien; David Miller; netdev@vger.kernel.org
> Subject: RE: [PATCH 2/2] packet: Add fanout support.
> 
> Le mardi 05 juillet 2011 à 13:35 -0400, Loke, Chetan a écrit :
> 
> > Sure, a lookup is needed(to steer what I call - Hot/Cold flows) and
> > was proposed by me on the oisf mailing list. Always, use the ip_id
> bit
> > then? Another problem that needs to be solved is, what if some
> > decoders are overloaded, then what? How will this scheme work? How
> > will we utilize other CPUs? RPS is needed for sure.
> >
> > If we maintain a i) per port lookup-table ii) 2^20 flows/table and
> > iii) 16 bytes/flow(one can also squeeze it down to 8 bytes) then we
> > will need around 32MB worth memory/port. It's not a huge memory
> > pressure for folks who want to use linux for doing IPS/IDS sort of
> > stuff.
> >
> > User-space decoders end up copying the packet anyways. So fanout can
> > be implemented in user-space to achieve effective CPU utilization.
> > As long as we don't bounce on different CPU-socket we could be ok.
> 
> This is the problem we want to address.
> 
> Going into user-space to perform the fanout is what you already have
> today, with one socket, one thread doing the fanout to worker threads.
> 
> David patch is non adaptative : its a hash on N queue, with a fixed
> hash
> function.
> 


> What you want is to add another 'control queue' where new flows are
> directed. Then user application is able to reinject into kernel flow
> director the "This flow should go to queue X" information.
> 

I like the term - 'kernel-flow-director'. The problem with rebalancing from user-space is that we will need to have some 'idle' period before we inject flow-redirection event into the kernel. And for bursty workloads this may be problematic. We may have to rebalance often. And then we will have to export the 'rebalance-idle-interval' knob.
And users may have to tune it for their workloads etc.

> Or, let the kernel do a mix of rxhash and loadbalance : Be able to
> select a queue for a new flow without user land control, using a Flow
> hash table.
> 

This is exactly what I had proposed. Hash{== lookup of hot/cold flows}+LB{== kernel-flow-director} is what we need.

So something like:

hot_fanout_id = is_flow_active(rx_hash,lookup_table);

if (hot_fanout_id)
   /* This flow is Hot */
   steer_to(hot_fanout_id);
else {
  /* This flow is cold - Get next_rr fanout_id */
  
  fanout_rr_next(...);
  ...
  steer_to(cold_fanout_id);
}

And,

1)hash on <src_ip_addr,dst_ip_addr,src_port,dst_port>
2)store the ip_id from the first fragment in the flow_hash_table for matching subsequent ip_fragments.
  One corner case - If the first fragment arrives out-of-order(OOO).
  2.1) user has configured - assemble-fragments: In this case it doesn’t matter.
  2.2) user has configured - fwd-as-is: Redirect OOO-fragments to the next_rr_fanout_id.
	 2.2.1) 
       And so we may have to set a bit in hash_lookup_table for a flow indicating it arrived 'OOO'.
	 So, is_flow_active will have to lookup twice. First using ip_id as 3rd var in jhash. And second lookup using ports as 3rd var. Worst case there will be two lookups on 	 OOO.
	 OR
	 2.2.2) Effectively treat this fragment as 'assemble-fragments' case(?) as in 2.1).


So {hash+LB} together, should take care of the fragmented/non-fragmented flow.

We will have to purge the flow-entry at some point to avoid false routing. Don't know if the control-queue that you mentioned above can be used for purging flows?
Or the control-queue itself can be mmap'd so that user-space can clear 'a' flow-entry?


Chetan Loke

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2011-07-05 21:10 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-05  4:20 [PATCH 2/2] packet: Add fanout support David Miller
2011-07-05  4:33 ` Eric Dumazet
2011-07-05  4:36   ` David Miller
2011-07-05  6:21 ` Eric Dumazet
2011-07-05  6:56   ` Victor Julien
2011-07-05  7:00     ` Eric Dumazet
2011-07-05  7:06       ` Victor Julien
2011-07-05  7:50       ` David Miller
2011-07-05  7:52         ` Victor Julien
2011-07-05  8:02           ` David Miller
2011-07-05  8:47             ` Hagen Paul Pfeifer
2011-07-05  8:48             ` Victor Julien
2011-07-05  9:01               ` David Miller
2011-07-05 16:03       ` Loke, Chetan
2011-07-05 16:08         ` David Miller
2011-07-05 16:16         ` Eric Dumazet
2011-07-05 16:21           ` Victor Julien
2011-07-05 16:23             ` Eric Dumazet
2011-07-05 17:15               ` Victor Julien
2011-07-05 18:26                 ` Eric Dumazet
2011-07-05 18:34                   ` Victor Julien
2011-07-05 17:35           ` Loke, Chetan
2011-07-05 18:20             ` Eric Dumazet
2011-07-05 21:10               ` Loke, Chetan
2011-07-05  7:48     ` David Miller
2011-07-05  7:46   ` David Miller
2011-07-05  9:50     ` Eric Dumazet
2011-07-05 10:07       ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.