All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch net-next-2.6] net: convert bonding to use rx_handler
@ 2011-02-18 13:25 Jiri Pirko
  2011-02-18 13:29 ` Eric Dumazet
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-18 13:25 UTC (permalink / raw)
  To: netdev
  Cc: davem, shemminger, kaber, fubar, eric.dumazet, nicolas.2p.debian, andy

This patch converts bonding to use rx_handler. Results in cleaner
__netif_receive_skb() with much less exceptions needed. Also bond-specific
work is moved into bond code.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_main.c |   75 ++++++++++++++++++++-
 include/linux/skbuff.h          |    2 +
 net/core/dev.c                  |  144 +++++++++++---------------------------
 net/core/skbuff.c               |    1 +
 4 files changed, 119 insertions(+), 103 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 77e3c6a..a856a11 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1423,6 +1423,68 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 	bond->setup_by_slave = 1;
 }
 
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+static bool bond_should_deliver_exact_match(struct sk_buff *skb,
+					    struct net_device *slave_dev,
+					    struct net_device *bond_dev)
+{
+	if (slave_dev->priv_flags & IFF_SLAVE_INACTIVE) {
+		if (slave_dev->priv_flags & IFF_SLAVE_NEEDARP &&
+		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+			return false;
+
+		if (bond_dev->priv_flags & IFF_MASTER_ALB &&
+		    skb->pkt_type != PACKET_BROADCAST &&
+		    skb->pkt_type != PACKET_MULTICAST)
+				return false;
+
+		if (bond_dev->priv_flags & IFF_MASTER_8023AD &&
+		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+			return false;
+
+		return true;
+	}
+	return false;
+}
+
+static struct sk_buff *bond_handle_frame(struct sk_buff *skb)
+{
+	struct net_device *slave_dev;
+	struct net_device *bond_dev;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return NULL;
+	slave_dev = skb->dev;
+	bond_dev = ACCESS_ONCE(slave_dev->master);
+	if (unlikely(!bond_dev))
+		return skb;
+
+	if (bond_dev->priv_flags & IFF_MASTER_ARPMON)
+		slave_dev->last_rx = jiffies;
+
+	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
+		skb->deliver_no_wcard = 1;
+		return skb;
+	}
+
+	skb->dev = bond_dev;
+
+	if (bond_dev->priv_flags & IFF_MASTER_ALB &&
+	    bond_dev->priv_flags & IFF_BRIDGE_PORT &&
+	    skb->pkt_type == PACKET_HOST) {
+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+		memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+	}
+
+	netif_rx(skb);
+	return NULL;
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 {
@@ -1599,11 +1661,17 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		pr_debug("Error %d calling netdev_set_bond_master\n", res);
 		goto err_restore_mac;
 	}
+	res = netdev_rx_handler_register(slave_dev, bond_handle_frame, NULL);
+	if (res) {
+		pr_debug("Error %d calling netdev_rx_handler_register\n", res);
+		goto err_unset_master;
+	}
+
 	/* open the slave since the application closed it */
 	res = dev_open(slave_dev);
 	if (res) {
 		pr_debug("Opening slave %s failed\n", slave_dev->name);
-		goto err_unset_master;
+		goto err_unreg_rxhandler;
 	}
 
 	new_slave->dev = slave_dev;
@@ -1811,6 +1879,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 err_close:
 	dev_close(slave_dev);
 
+err_unreg_rxhandler:
+	netdev_rx_handler_unregister(slave_dev);
+
 err_unset_master:
 	netdev_set_bond_master(slave_dev, NULL);
 
@@ -1992,6 +2063,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 		netif_addr_unlock_bh(bond_dev);
 	}
 
+	netdev_rx_handler_unregister(slave_dev);
 	netdev_set_bond_master(slave_dev, NULL);
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -2114,6 +2186,7 @@ static int bond_release_all(struct net_device *bond_dev)
 			netif_addr_unlock_bh(bond_dev);
 		}
 
+		netdev_rx_handler_unregister(slave_dev);
 		netdev_set_bond_master(slave_dev, NULL);
 
 		/* close slave before restoring its mac address */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 31f02d0..9f3af5d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@sk: Socket we are owned by
  *	@tstamp: Time we arrived
  *	@dev: Device we arrived on/are leaving by
+ *	@input_dev: Original device on which we arrived
  *	@transport_header: Transport layer header
  *	@network_header: Network layer header
  *	@mac_header: Link layer header
@@ -325,6 +326,7 @@ struct sk_buff {
 
 	struct sock		*sk;
 	struct net_device	*dev;
+	struct net_device	*input_dev;
 
 	/*
 	 * This is the control buffer. It is free to use for every
diff --git a/net/core/dev.c b/net/core/dev.c
index 4f69439..9d2f485 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1530,12 +1530,17 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(dev_forward_skb);
 
+static inline int __deliver_skb(struct sk_buff *skb,
+				struct packet_type *pt_prev)
+{
+	return pt_prev->func(skb, skb->dev, pt_prev, skb->input_dev);
+}
+
 static inline int deliver_skb(struct sk_buff *skb,
-			      struct packet_type *pt_prev,
-			      struct net_device *orig_dev)
+			      struct packet_type *pt_prev)
 {
 	atomic_inc(&skb->users);
-	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+	return __deliver_skb(skb, pt_prev);
 }
 
 /*
@@ -1558,7 +1563,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 		    (ptype->af_packet_priv == NULL ||
 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
 			if (pt_prev) {
-				deliver_skb(skb2, pt_prev, skb->dev);
+				deliver_skb(skb2, pt_prev);
 				pt_prev = ptype;
 				continue;
 			}
@@ -1591,7 +1596,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 	if (pt_prev)
-		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
+		__deliver_skb(skb2, pt_prev);
 	rcu_read_unlock();
 }
 
@@ -3021,8 +3026,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 }
 
 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
-					 struct packet_type **pt_prev,
-					 int *ret, struct net_device *orig_dev)
+					 struct packet_type **pt_prev, int *ret)
 {
 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
 
@@ -3030,7 +3034,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 		goto out;
 
 	if (*pt_prev) {
-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*ret = deliver_skb(skb, *pt_prev);
 		*pt_prev = NULL;
 	}
 
@@ -3092,63 +3096,30 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
-					      struct net_device *master)
-{
-	if (skb->pkt_type == PACKET_HOST) {
-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
-
-		memcpy(dest, master->dev_addr, ETH_ALEN);
-	}
-}
-
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static int __skb_bond_should_drop(struct sk_buff *skb,
-				  struct net_device *master)
+static void vlan_on_bond_hook(struct sk_buff *skb)
 {
-	struct net_device *dev = skb->dev;
-
-	if (master->priv_flags & IFF_MASTER_ARPMON)
-		dev->last_rx = jiffies;
-
-	if ((master->priv_flags & IFF_MASTER_ALB) &&
-	    (master->priv_flags & IFF_BRIDGE_PORT)) {
-		/* Do address unmangle. The local destination address
-		 * will be always the one master has. Provides the right
-		 * functionality in a bridge.
-		 */
-		skb_bond_set_mac_by_master(skb, master);
-	}
-
-	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-			return 0;
-
-		if (master->priv_flags & IFF_MASTER_ALB) {
-			if (skb->pkt_type != PACKET_BROADCAST &&
-			    skb->pkt_type != PACKET_MULTICAST)
-				return 0;
-		}
-		if (master->priv_flags & IFF_MASTER_8023AD &&
-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-			return 0;
+	/*
+	 * Make sure ARP frames received on VLAN interfaces stacked on
+	 * bonding interfaces still make their way to any base bonding
+	 * device that may have registered for a specific ptype.
+	 */
+	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
+	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
+	    skb->protocol == htons(ETH_P_ARP)) {
+		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 
-		return 1;
+		if (!skb2)
+			return;
+		skb2->dev = vlan_dev_real_dev(skb->dev);
+		netif_rx(skb2);
 	}
-	return 0;
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
-	struct net_device *orig_dev;
-	struct net_device *null_or_orig;
-	struct net_device *orig_or_bond;
+	struct net_device *null_or_dev;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -3164,29 +3135,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!skb->skb_iif)
 		skb->skb_iif = skb->dev->ifindex;
 
-	/*
-	 * bonding note: skbs received on inactive slaves should only
-	 * be delivered to pkt handlers that are exact matches.  Also
-	 * the deliver_no_wcard flag will be set.  If packet handlers
-	 * are sensitive to duplicate packets these skbs will need to
-	 * be dropped at the handler.
-	 */
-	null_or_orig = NULL;
-	orig_dev = skb->dev;
-	if (skb->deliver_no_wcard)
-		null_or_orig = orig_dev;
-	else if (netif_is_bond_slave(orig_dev)) {
-		struct net_device *bond_master = ACCESS_ONCE(orig_dev->master);
-
-		if (likely(bond_master)) {
-			if (__skb_bond_should_drop(skb, bond_master)) {
-				skb->deliver_no_wcard = 1;
-				/* deliver only exact match */
-				null_or_orig = orig_dev;
-			} else
-				skb->dev = bond_master;
-		}
-	}
+	if (!skb->input_dev)
+		skb->input_dev = skb->dev;
 
 	__this_cpu_inc(softnet_data.processed);
 	skb_reset_network_header(skb);
@@ -3205,26 +3155,24 @@ static int __netif_receive_skb(struct sk_buff *skb)
 #endif
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
-		    ptype->dev == orig_dev) {
+		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev)
-				ret = deliver_skb(skb, pt_prev, orig_dev);
+				ret = deliver_skb(skb, pt_prev);
 			pt_prev = ptype;
 		}
 	}
 
 #ifdef CONFIG_NET_CLS_ACT
-	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+	skb = handle_ing(skb, &pt_prev, &ret);
 	if (!skb)
 		goto out;
 ncls:
 #endif
 
-	/* Handle special case of bridge or macvlan */
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
 		if (pt_prev) {
-			ret = deliver_skb(skb, pt_prev, orig_dev);
+			ret = deliver_skb(skb, pt_prev);
 			pt_prev = NULL;
 		}
 		skb = rx_handler(skb);
@@ -3234,7 +3182,7 @@ ncls:
 
 	if (vlan_tx_tag_present(skb)) {
 		if (pt_prev) {
-			ret = deliver_skb(skb, pt_prev, orig_dev);
+			ret = deliver_skb(skb, pt_prev);
 			pt_prev = NULL;
 		}
 		if (vlan_hwaccel_do_receive(&skb)) {
@@ -3244,32 +3192,24 @@ ncls:
 			goto out;
 	}
 
-	/*
-	 * Make sure frames received on VLAN interfaces stacked on
-	 * bonding interfaces still make their way to any base bonding
-	 * device that may have registered for a specific ptype.  The
-	 * handler may have to adjust skb->dev and orig_dev.
-	 */
-	orig_or_bond = orig_dev;
-	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
-	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
-		orig_or_bond = vlan_dev_real_dev(skb->dev);
-	}
+	vlan_on_bond_hook(skb);
+
+	/* deliver only exact match when indicated */
+	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
 
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type && (ptype->dev == null_or_orig ||
-		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
-		     ptype->dev == orig_or_bond)) {
+		if (ptype->type == type &&
+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
 			if (pt_prev)
-				ret = deliver_skb(skb, pt_prev, orig_dev);
+				ret = deliver_skb(skb, pt_prev);
 			pt_prev = ptype;
 		}
 	}
 
 	if (pt_prev) {
-		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+		ret = __deliver_skb(skb, pt_prev);
 	} else {
 		atomic_long_inc(&skb->dev->rx_dropped);
 		kfree_skb(skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 14cf560..e19eabe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -508,6 +508,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 {
 	new->tstamp		= old->tstamp;
 	new->dev		= old->dev;
+	new->input_dev		= old->input_dev;
 	new->transport_header	= old->transport_header;
 	new->network_header	= old->network_header;
 	new->mac_header		= old->mac_header;
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 13:25 [patch net-next-2.6] net: convert bonding to use rx_handler Jiri Pirko
@ 2011-02-18 13:29 ` Eric Dumazet
  2011-02-18 14:14   ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Eric Dumazet @ 2011-02-18 13:29 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, shemminger, kaber, fubar, nicolas.2p.debian, andy

Le vendredi 18 février 2011 à 14:25 +0100, Jiri Pirko a écrit :
> This patch converts bonding to use rx_handler. Results in cleaner
> __netif_receive_skb() with much less exceptions needed. Also bond-specific
> work is moved into bond code.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  drivers/net/bonding/bond_main.c |   75 ++++++++++++++++++++-
>  include/linux/skbuff.h          |    2 +
>  net/core/dev.c                  |  144 +++++++++++---------------------------
>  net/core/skbuff.c               |    1 +
>  4 files changed, 119 insertions(+), 103 deletions(-)
> 

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 31f02d0..9f3af5d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
>   *	@sk: Socket we are owned by
>   *	@tstamp: Time we arrived
>   *	@dev: Device we arrived on/are leaving by
> + *	@input_dev: Original device on which we arrived
>   *	@transport_header: Transport layer header
>   *	@network_header: Network layer header
>   *	@mac_header: Link layer header
> @@ -325,6 +326,7 @@ struct sk_buff {
>  
>  	struct sock		*sk;
>  	struct net_device	*dev;
> +	struct net_device	*input_dev;
>  

Your patch looks fine, but adding 8 bytes to sk_buff for a "cleanup" is
really a show stopper for me.




^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 13:29 ` Eric Dumazet
@ 2011-02-18 14:14   ` Jiri Pirko
  2011-02-18 14:27     ` Eric Dumazet
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-18 14:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, davem, shemminger, kaber, fubar, nicolas.2p.debian, andy

Fri, Feb 18, 2011 at 02:29:51PM CET, eric.dumazet@gmail.com wrote:
>Le vendredi 18 février 2011 à 14:25 +0100, Jiri Pirko a écrit :
>> This patch converts bonding to use rx_handler. Results in cleaner
>> __netif_receive_skb() with much less exceptions needed. Also bond-specific
>> work is moved into bond code.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> ---
>>  drivers/net/bonding/bond_main.c |   75 ++++++++++++++++++++-
>>  include/linux/skbuff.h          |    2 +
>>  net/core/dev.c                  |  144 +++++++++++---------------------------
>>  net/core/skbuff.c               |    1 +
>>  4 files changed, 119 insertions(+), 103 deletions(-)
>> 
>
>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>> index 31f02d0..9f3af5d 100644
>> --- a/include/linux/skbuff.h
>> +++ b/include/linux/skbuff.h
>> @@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
>>   *	@sk: Socket we are owned by
>>   *	@tstamp: Time we arrived
>>   *	@dev: Device we arrived on/are leaving by
>> + *	@input_dev: Original device on which we arrived
>>   *	@transport_header: Transport layer header
>>   *	@network_header: Network layer header
>>   *	@mac_header: Link layer header
>> @@ -325,6 +326,7 @@ struct sk_buff {
>>  
>>  	struct sock		*sk;
>>  	struct net_device	*dev;
>> +	struct net_device	*input_dev;
>>  
>
>Your patch looks fine, but adding 8 bytes to sk_buff for a "cleanup" is
>really a show stopper for me.

Do not know how to do it better. As for percpu variable, not only
origdev would have to be remembered but also probably skb pointer to
know if it's the first run on the skb or not. Can't really figure out a
better solution. Can you?

Thanks.

Jirka
>
>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 14:14   ` Jiri Pirko
@ 2011-02-18 14:27     ` Eric Dumazet
  2011-02-18 14:46       ` Patrick McHardy
  0 siblings, 1 reply; 52+ messages in thread
From: Eric Dumazet @ 2011-02-18 14:27 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, shemminger, kaber, fubar, nicolas.2p.debian, andy

Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :

> Do not know how to do it better. As for percpu variable, not only
> origdev would have to be remembered but also probably skb pointer to
> know if it's the first run on the skb or not. Can't really figure out a
> better solution. Can you?

I'll try and let you know.

Thanks



^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 14:27     ` Eric Dumazet
@ 2011-02-18 14:46       ` Patrick McHardy
  2011-02-18 14:58         ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Patrick McHardy @ 2011-02-18 14:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jiri Pirko, netdev, davem, shemminger, fubar, nicolas.2p.debian, andy

Am 18.02.2011 15:27, schrieb Eric Dumazet:
> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
> 
>> Do not know how to do it better. As for percpu variable, not only
>> origdev would have to be remembered but also probably skb pointer to
>> know if it's the first run on the skb or not. Can't really figure out a
>> better solution. Can you?
> 
> I'll try and let you know.

Why not simply do a lookup on skb->iif?

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 14:46       ` Patrick McHardy
@ 2011-02-18 14:58         ` Jiri Pirko
  2011-02-18 15:50           ` Patrick McHardy
  2011-02-18 20:06           ` David Miller
  0 siblings, 2 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-18 14:58 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric Dumazet, netdev, davem, shemminger, fubar, nicolas.2p.debian, andy

Fri, Feb 18, 2011 at 03:46:45PM CET, kaber@trash.net wrote:
>Am 18.02.2011 15:27, schrieb Eric Dumazet:
>> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
>> 
>>> Do not know how to do it better. As for percpu variable, not only
>>> origdev would have to be remembered but also probably skb pointer to
>>> know if it's the first run on the skb or not. Can't really figure out a
>>> better solution. Can you?
>> 
>> I'll try and let you know.
>
>Why not simply do a lookup on skb->iif?

Well I was trying to avoid iterating over list of devices for each
incoming frame.


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 14:58         ` Jiri Pirko
@ 2011-02-18 15:50           ` Patrick McHardy
  2011-02-18 16:14             ` Eric Dumazet
  2011-02-18 20:06           ` David Miller
  1 sibling, 1 reply; 52+ messages in thread
From: Patrick McHardy @ 2011-02-18 15:50 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Eric Dumazet, netdev, davem, shemminger, fubar, nicolas.2p.debian, andy

On 18.02.2011 15:58, Jiri Pirko wrote:
> Fri, Feb 18, 2011 at 03:46:45PM CET, kaber@trash.net wrote:
>> Am 18.02.2011 15:27, schrieb Eric Dumazet:
>>> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
>>>
>>>> Do not know how to do it better. As for percpu variable, not only
>>>> origdev would have to be remembered but also probably skb pointer to
>>>> know if it's the first run on the skb or not. Can't really figure out a
>>>> better solution. Can you?
>>>
>>> I'll try and let you know.
>>
>> Why not simply do a lookup on skb->iif?
> 
> Well I was trying to avoid iterating over list of devices for each
> incoming frame.
> 

Well, there are a couple of holes on 64 bit, perhaps you can rearrange
things and eliminate either iif or input_dev without increasing size
since they appear to be redundant.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 15:50           ` Patrick McHardy
@ 2011-02-18 16:14             ` Eric Dumazet
  2011-02-18 18:47               ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Eric Dumazet @ 2011-02-18 16:14 UTC (permalink / raw)
  To: Patrick McHardy, Jiri Pirko
  Cc: netdev, davem, shemminger, fubar, nicolas.2p.debian, andy

Le vendredi 18 février 2011 à 16:50 +0100, Patrick McHardy a écrit :
> On 18.02.2011 15:58, Jiri Pirko wrote:
> > Fri, Feb 18, 2011 at 03:46:45PM CET, kaber@trash.net wrote:
> >> Am 18.02.2011 15:27, schrieb Eric Dumazet:
> >>> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
> >>>
> >>>> Do not know how to do it better. As for percpu variable, not only
> >>>> origdev would have to be remembered but also probably skb pointer to
> >>>> know if it's the first run on the skb or not. Can't really figure out a
> >>>> better solution. Can you?
> >>>
> >>> I'll try and let you know.
> >>
> >> Why not simply do a lookup on skb->iif?
> > 
> > Well I was trying to avoid iterating over list of devices for each
> > incoming frame.
> > 
> 
> Well, there are a couple of holes on 64 bit, perhaps you can rearrange
> things and eliminate either iif or input_dev without increasing size
> since they appear to be redundant.

Jiri

I dont understand why netif_rx() is needed in your patch.

Can we stack 10 bond devices or so ???

If we avoid this stage and call the real thing (netif_receive_skb()),
then we dont need adding a field in each skb, since it can be carried by
a global variable (per cpu of course)

bond_handle_frame() being called from __netif_receive_skb() I believe it
can use netif_receive_skb() instead of netif_rx().

Same remark for vlan_on_bond_hook()




^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 16:14             ` Eric Dumazet
@ 2011-02-18 18:47               ` Jiri Pirko
  2011-02-18 19:17                 ` Eric Dumazet
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-18 18:47 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, netdev, davem, shemminger, fubar,
	nicolas.2p.debian, andy

Fri, Feb 18, 2011 at 05:14:30PM CET, eric.dumazet@gmail.com wrote:
>Le vendredi 18 février 2011 à 16:50 +0100, Patrick McHardy a écrit :
>> On 18.02.2011 15:58, Jiri Pirko wrote:
>> > Fri, Feb 18, 2011 at 03:46:45PM CET, kaber@trash.net wrote:
>> >> Am 18.02.2011 15:27, schrieb Eric Dumazet:
>> >>> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
>> >>>
>> >>>> Do not know how to do it better. As for percpu variable, not only
>> >>>> origdev would have to be remembered but also probably skb pointer to
>> >>>> know if it's the first run on the skb or not. Can't really figure out a
>> >>>> better solution. Can you?
>> >>>
>> >>> I'll try and let you know.
>> >>
>> >> Why not simply do a lookup on skb->iif?
>> > 
>> > Well I was trying to avoid iterating over list of devices for each
>> > incoming frame.
>> > 
>> 
>> Well, there are a couple of holes on 64 bit, perhaps you can rearrange
>> things and eliminate either iif or input_dev without increasing size
>> since they appear to be redundant.
>
>Jiri
>
>I dont understand why netif_rx() is needed in your patch.

I used netif_rx() because bridge and macvlan does that too. I did not see
a reason to not to do the same.

>
>Can we stack 10 bond devices or so ???
>
>If we avoid this stage and call the real thing (netif_receive_skb()),
>then we dont need adding a field in each skb, since it can be carried by
>a global variable (per cpu of course)
>
I'm probably missing something. How do netif_receive_skb() and
netif_rx() differ in this point of view, since both are calling:
"ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);"
?

Still I see a problem with the percpu global variable. We would have to
store skb pointer there as well and in each __netif_receive_skb() call it
would have to be checked if it's different from the current one.
In that case store new skb and orig_Dev.

Leaving aside that global variables are evil in general, I still think
this is not nicer solution then to add skb->input_dev (although I
understand your arguments).


>bond_handle_frame() being called from __netif_receive_skb() I believe it
>can use netif_receive_skb() instead of netif_rx().
>
>Same remark for vlan_on_bond_hook()
>
>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 18:47               ` Jiri Pirko
@ 2011-02-18 19:17                 ` Eric Dumazet
  2011-02-18 19:28                   ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Eric Dumazet @ 2011-02-18 19:17 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Patrick McHardy, netdev, davem, shemminger, fubar,
	nicolas.2p.debian, andy

Le vendredi 18 février 2011 à 19:47 +0100, Jiri Pirko a écrit :
> Fri, Feb 18, 2011 at 05:14:30PM CET, eric.dumazet@gmail.com wrote:
> >Le vendredi 18 février 2011 à 16:50 +0100, Patrick McHardy a écrit :
> >> On 18.02.2011 15:58, Jiri Pirko wrote:
> >> > Fri, Feb 18, 2011 at 03:46:45PM CET, kaber@trash.net wrote:
> >> >> Am 18.02.2011 15:27, schrieb Eric Dumazet:
> >> >>> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
> >> >>>
> >> >>>> Do not know how to do it better. As for percpu variable, not only
> >> >>>> origdev would have to be remembered but also probably skb pointer to
> >> >>>> know if it's the first run on the skb or not. Can't really figure out a
> >> >>>> better solution. Can you?
> >> >>>
> >> >>> I'll try and let you know.
> >> >>
> >> >> Why not simply do a lookup on skb->iif?
> >> > 
> >> > Well I was trying to avoid iterating over list of devices for each
> >> > incoming frame.
> >> > 
> >> 
> >> Well, there are a couple of holes on 64 bit, perhaps you can rearrange
> >> things and eliminate either iif or input_dev without increasing size
> >> since they appear to be redundant.
> >
> >Jiri
> >
> >I dont understand why netif_rx() is needed in your patch.
> 
> I used netif_rx() because bridge and macvlan does that too. I did not see
> a reason to not to do the same.
> 
> >
> >Can we stack 10 bond devices or so ???
> >
> >If we avoid this stage and call the real thing (netif_receive_skb()),
> >then we dont need adding a field in each skb, since it can be carried by
> >a global variable (per cpu of course)
> >
> I'm probably missing something. How do netif_receive_skb() and
> netif_rx() differ in this point of view, since both are calling:
> "ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);"
> ?
> 
> Still I see a problem with the percpu global variable. We would have to
> store skb pointer there as well and in each __netif_receive_skb() call it
> would have to be checked if it's different from the current one.
> In that case store new skb and orig_Dev.
> 
> Leaving aside that global variables are evil in general, I still think
> this is not nicer solution then to add skb->input_dev (although I
> understand your arguments).

Really I must miss something about "global variables" thing/fear.

Kernel is full of global variables, they are not evil if properly used.

Take a look at net/core/dev.c :

static DEFINE_PER_CPU(int, xmit_recursion);

For an example of what I have in mind.




^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 19:17                 ` Eric Dumazet
@ 2011-02-18 19:28                   ` Jiri Pirko
  2011-02-18 19:58                     ` Eric Dumazet
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-18 19:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, netdev, davem, shemminger, fubar,
	nicolas.2p.debian, andy

Fri, Feb 18, 2011 at 08:17:37PM CET, eric.dumazet@gmail.com wrote:
>Le vendredi 18 février 2011 à 19:47 +0100, Jiri Pirko a écrit :
>> Fri, Feb 18, 2011 at 05:14:30PM CET, eric.dumazet@gmail.com wrote:
>> >Le vendredi 18 février 2011 à 16:50 +0100, Patrick McHardy a écrit :
>> >> On 18.02.2011 15:58, Jiri Pirko wrote:
>> >> > Fri, Feb 18, 2011 at 03:46:45PM CET, kaber@trash.net wrote:
>> >> >> Am 18.02.2011 15:27, schrieb Eric Dumazet:
>> >> >>> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
>> >> >>>
>> >> >>>> Do not know how to do it better. As for percpu variable, not only
>> >> >>>> origdev would have to be remembered but also probably skb pointer to
>> >> >>>> know if it's the first run on the skb or not. Can't really figure out a
>> >> >>>> better solution. Can you?
>> >> >>>
>> >> >>> I'll try and let you know.
>> >> >>
>> >> >> Why not simply do a lookup on skb->iif?
>> >> > 
>> >> > Well I was trying to avoid iterating over list of devices for each
>> >> > incoming frame.
>> >> > 
>> >> 
>> >> Well, there are a couple of holes on 64 bit, perhaps you can rearrange
>> >> things and eliminate either iif or input_dev without increasing size
>> >> since they appear to be redundant.
>> >
>> >Jiri
>> >
>> >I dont understand why netif_rx() is needed in your patch.
>> 
>> I used netif_rx() because bridge and macvlan does that too. I did not see
>> a reason to not to do the same.
>> 
>> >
>> >Can we stack 10 bond devices or so ???
>> >
>> >If we avoid this stage and call the real thing (netif_receive_skb()),
>> >then we dont need adding a field in each skb, since it can be carried by
>> >a global variable (per cpu of course)
>> >
>> I'm probably missing something. How do netif_receive_skb() and
>> netif_rx() differ in this point of view, since both are calling:
>> "ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);"
>> ?
>> 
>> Still I see a problem with the percpu global variable. We would have to
>> store skb pointer there as well and in each __netif_receive_skb() call it
>> would have to be checked if it's different from the current one.
>> In that case store new skb and orig_Dev.
>> 
>> Leaving aside that global variables are evil in general, I still think
>> this is not nicer solution then to add skb->input_dev (although I
>> understand your arguments).
>
>Really I must miss something about "global variables" thing/fear.
>
>Kernel is full of global variables, they are not evil if properly used.

I know. But that doesn't mean it's ok. But I see your point.

>
>Take a look at net/core/dev.c :
>
>static DEFINE_PER_CPU(int, xmit_recursion);
>
>For an example of what I have in mind.

Yes I saw this. We would have to do something like:

struct skb_rx_context {
	struct sk_buff *skb;
	struct net_device *orig_dev;
};

static DEFINE_PER_CPU(struct skb_rx_context, skb_rx_context);

and then in __netif_receive_skb():

struct skb_rx_context *cont = __this_cpu_read(skb_rx_context);

if (cont->skb != skb) {
	cont->skb = skb;
	orig_dev = cont->orig_dev = skb->dev;
} else {
	orig_dev = cont->orig_dev;
}


Does this make sense?

>
>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 19:28                   ` Jiri Pirko
@ 2011-02-18 19:58                     ` Eric Dumazet
  2011-02-18 20:03                       ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Eric Dumazet @ 2011-02-18 19:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Patrick McHardy, netdev, davem, shemminger, fubar,
	nicolas.2p.debian, andy

Le vendredi 18 février 2011 à 20:28 +0100, Jiri Pirko a écrit :

> Yes I saw this. We would have to do something like:
> 
> struct skb_rx_context {
> 	struct sk_buff *skb;
> 	struct net_device *orig_dev;
> };
> 
> static DEFINE_PER_CPU(struct skb_rx_context, skb_rx_context);
> 
> and then in __netif_receive_skb():
> 
> struct skb_rx_context *cont = __this_cpu_read(skb_rx_context);
> 
> if (cont->skb != skb) {
> 	cont->skb = skb;
> 	orig_dev = cont->orig_dev = skb->dev;
> } else {
> 	orig_dev = cont->orig_dev;
> }
> 
> 
> Does this make sense?

Well, yes, something like that, but I think you dont need to keep a
pointer to current skb. (It would not work if one handled/freed, same
'skb pointer' is reused a bit later)

Sorry, I wont have time to look at this right now, its now 21h00 in
France, time to get some time with family ;)

See you !



^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 19:58                     ` Eric Dumazet
@ 2011-02-18 20:03                       ` Jiri Pirko
  0 siblings, 0 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-18 20:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, netdev, davem, shemminger, fubar,
	nicolas.2p.debian, andy

Fri, Feb 18, 2011 at 08:58:26PM CET, eric.dumazet@gmail.com wrote:
>Le vendredi 18 février 2011 à 20:28 +0100, Jiri Pirko a écrit :
>
>> Yes I saw this. We would have to do something like:
>> 
>> struct skb_rx_context {
>> 	struct sk_buff *skb;
>> 	struct net_device *orig_dev;
>> };
>> 
>> static DEFINE_PER_CPU(struct skb_rx_context, skb_rx_context);
>> 
>> and then in __netif_receive_skb():
>> 
>> struct skb_rx_context *cont = __this_cpu_read(skb_rx_context);
>> 
>> if (cont->skb != skb) {
>> 	cont->skb = skb;
>> 	orig_dev = cont->orig_dev = skb->dev;
>> } else {
>> 	orig_dev = cont->orig_dev;
>> }
>> 
>> 
>> Does this make sense?
>
>Well, yes, something like that, but I think you dont need to keep a
>pointer to current skb. (It would not work if one handled/freed, same
>'skb pointer' is reused a bit later)

Well I think I need. How else should I distinguish that new skb (first
time in __netif_receive_skb) is there and I need to remember orig_dev?

>
>Sorry, I wont have time to look at this right now, its now 21h00 in
>France, time to get some time with family ;)

Np, same time here in CZ :)


>
>See you !
>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 14:58         ` Jiri Pirko
  2011-02-18 15:50           ` Patrick McHardy
@ 2011-02-18 20:06           ` David Miller
  2011-02-18 20:13             ` Jiri Pirko
  2011-02-18 20:58             ` [patch net-next-2.6 V2] " Jiri Pirko
  1 sibling, 2 replies; 52+ messages in thread
From: David Miller @ 2011-02-18 20:06 UTC (permalink / raw)
  To: jpirko
  Cc: kaber, eric.dumazet, netdev, shemminger, fubar, nicolas.2p.debian, andy

From: Jiri Pirko <jpirko@redhat.com>
Date: Fri, 18 Feb 2011 15:58:51 +0100

> Fri, Feb 18, 2011 at 03:46:45PM CET, kaber@trash.net wrote:
>>Am 18.02.2011 15:27, schrieb Eric Dumazet:
>>> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
>>> 
>>>> Do not know how to do it better. As for percpu variable, not only
>>>> origdev would have to be remembered but also probably skb pointer to
>>>> know if it's the first run on the skb or not. Can't really figure out a
>>>> better solution. Can you?
>>> 
>>> I'll try and let you know.
>>
>>Why not simply do a lookup on skb->iif?
> 
> Well I was trying to avoid iterating over list of devices for each
> incoming frame.

It is not list, it is hash :-)

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6] net: convert bonding to use rx_handler
  2011-02-18 20:06           ` David Miller
@ 2011-02-18 20:13             ` Jiri Pirko
  2011-02-18 20:58             ` [patch net-next-2.6 V2] " Jiri Pirko
  1 sibling, 0 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-18 20:13 UTC (permalink / raw)
  To: David Miller
  Cc: kaber, eric.dumazet, netdev, shemminger, fubar, nicolas.2p.debian, andy

Fri, Feb 18, 2011 at 09:06:56PM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Fri, 18 Feb 2011 15:58:51 +0100
>
>> Fri, Feb 18, 2011 at 03:46:45PM CET, kaber@trash.net wrote:
>>>Am 18.02.2011 15:27, schrieb Eric Dumazet:
>>>> Le vendredi 18 février 2011 à 15:14 +0100, Jiri Pirko a écrit :
>>>> 
>>>>> Do not know how to do it better. As for percpu variable, not only
>>>>> origdev would have to be remembered but also probably skb pointer to
>>>>> know if it's the first run on the skb or not. Can't really figure out a
>>>>> better solution. Can you?
>>>> 
>>>> I'll try and let you know.
>>>
>>>Why not simply do a lookup on skb->iif?
>> 
>> Well I was trying to avoid iterating over list of devices for each
>> incoming frame.
>
>It is not list, it is hash :-)

Even if, Do you think that this would be ok?


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [patch net-next-2.6 V2] net: convert bonding to use rx_handler
  2011-02-18 20:06           ` David Miller
  2011-02-18 20:13             ` Jiri Pirko
@ 2011-02-18 20:58             ` Jiri Pirko
  2011-02-18 23:06               ` Jay Vosburgh
  2011-02-23 19:05               ` Jiri Pirko
  1 sibling, 2 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-18 20:58 UTC (permalink / raw)
  To: David Miller
  Cc: kaber, eric.dumazet, netdev, shemminger, fubar, nicolas.2p.debian, andy

This patch converts bonding to use rx_handler. Results in cleaner
__netif_receive_skb() with much less exceptions needed. Also bond-specific
work is moved into bond code.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

v1->v2:
	using skb_iif instead of new input_dev to remember original device

---
 drivers/net/bonding/bond_main.c |   75 ++++++++++++++++++++++++++-
 net/core/dev.c                  |  111 ++++++++-------------------------------
 2 files changed, 97 insertions(+), 89 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 77e3c6a..a856a11 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1423,6 +1423,68 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 	bond->setup_by_slave = 1;
 }
 
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+static bool bond_should_deliver_exact_match(struct sk_buff *skb,
+					    struct net_device *slave_dev,
+					    struct net_device *bond_dev)
+{
+	if (slave_dev->priv_flags & IFF_SLAVE_INACTIVE) {
+		if (slave_dev->priv_flags & IFF_SLAVE_NEEDARP &&
+		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+			return false;
+
+		if (bond_dev->priv_flags & IFF_MASTER_ALB &&
+		    skb->pkt_type != PACKET_BROADCAST &&
+		    skb->pkt_type != PACKET_MULTICAST)
+				return false;
+
+		if (bond_dev->priv_flags & IFF_MASTER_8023AD &&
+		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+			return false;
+
+		return true;
+	}
+	return false;
+}
+
+static struct sk_buff *bond_handle_frame(struct sk_buff *skb)
+{
+	struct net_device *slave_dev;
+	struct net_device *bond_dev;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return NULL;
+	slave_dev = skb->dev;
+	bond_dev = ACCESS_ONCE(slave_dev->master);
+	if (unlikely(!bond_dev))
+		return skb;
+
+	if (bond_dev->priv_flags & IFF_MASTER_ARPMON)
+		slave_dev->last_rx = jiffies;
+
+	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
+		skb->deliver_no_wcard = 1;
+		return skb;
+	}
+
+	skb->dev = bond_dev;
+
+	if (bond_dev->priv_flags & IFF_MASTER_ALB &&
+	    bond_dev->priv_flags & IFF_BRIDGE_PORT &&
+	    skb->pkt_type == PACKET_HOST) {
+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+		memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+	}
+
+	netif_rx(skb);
+	return NULL;
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 {
@@ -1599,11 +1661,17 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		pr_debug("Error %d calling netdev_set_bond_master\n", res);
 		goto err_restore_mac;
 	}
+	res = netdev_rx_handler_register(slave_dev, bond_handle_frame, NULL);
+	if (res) {
+		pr_debug("Error %d calling netdev_rx_handler_register\n", res);
+		goto err_unset_master;
+	}
+
 	/* open the slave since the application closed it */
 	res = dev_open(slave_dev);
 	if (res) {
 		pr_debug("Opening slave %s failed\n", slave_dev->name);
-		goto err_unset_master;
+		goto err_unreg_rxhandler;
 	}
 
 	new_slave->dev = slave_dev;
@@ -1811,6 +1879,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 err_close:
 	dev_close(slave_dev);
 
+err_unreg_rxhandler:
+	netdev_rx_handler_unregister(slave_dev);
+
 err_unset_master:
 	netdev_set_bond_master(slave_dev, NULL);
 
@@ -1992,6 +2063,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 		netif_addr_unlock_bh(bond_dev);
 	}
 
+	netdev_rx_handler_unregister(slave_dev);
 	netdev_set_bond_master(slave_dev, NULL);
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -2114,6 +2186,7 @@ static int bond_release_all(struct net_device *bond_dev)
 			netif_addr_unlock_bh(bond_dev);
 		}
 
+		netdev_rx_handler_unregister(slave_dev);
 		netdev_set_bond_master(slave_dev, NULL);
 
 		/* close slave before restoring its mac address */
diff --git a/net/core/dev.c b/net/core/dev.c
index 4f69439..580cff1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3092,63 +3092,31 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
-					      struct net_device *master)
+static void vlan_on_bond_hook(struct sk_buff *skb)
 {
-	if (skb->pkt_type == PACKET_HOST) {
-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
-
-		memcpy(dest, master->dev_addr, ETH_ALEN);
-	}
-}
-
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static int __skb_bond_should_drop(struct sk_buff *skb,
-				  struct net_device *master)
-{
-	struct net_device *dev = skb->dev;
-
-	if (master->priv_flags & IFF_MASTER_ARPMON)
-		dev->last_rx = jiffies;
-
-	if ((master->priv_flags & IFF_MASTER_ALB) &&
-	    (master->priv_flags & IFF_BRIDGE_PORT)) {
-		/* Do address unmangle. The local destination address
-		 * will be always the one master has. Provides the right
-		 * functionality in a bridge.
-		 */
-		skb_bond_set_mac_by_master(skb, master);
-	}
-
-	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-			return 0;
-
-		if (master->priv_flags & IFF_MASTER_ALB) {
-			if (skb->pkt_type != PACKET_BROADCAST &&
-			    skb->pkt_type != PACKET_MULTICAST)
-				return 0;
-		}
-		if (master->priv_flags & IFF_MASTER_8023AD &&
-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-			return 0;
+	/*
+	 * Make sure ARP frames received on VLAN interfaces stacked on
+	 * bonding interfaces still make their way to any base bonding
+	 * device that may have registered for a specific ptype.
+	 */
+	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
+	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
+	    skb->protocol == htons(ETH_P_ARP)) {
+		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 
-		return 1;
+		if (!skb2)
+			return;
+		skb2->dev = vlan_dev_real_dev(skb->dev);
+		netif_rx(skb2);
 	}
-	return 0;
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
+	struct net_device *null_or_dev;
 	struct net_device *orig_dev;
-	struct net_device *null_or_orig;
-	struct net_device *orig_or_bond;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -3164,30 +3132,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!skb->skb_iif)
 		skb->skb_iif = skb->dev->ifindex;
 
-	/*
-	 * bonding note: skbs received on inactive slaves should only
-	 * be delivered to pkt handlers that are exact matches.  Also
-	 * the deliver_no_wcard flag will be set.  If packet handlers
-	 * are sensitive to duplicate packets these skbs will need to
-	 * be dropped at the handler.
-	 */
-	null_or_orig = NULL;
-	orig_dev = skb->dev;
-	if (skb->deliver_no_wcard)
-		null_or_orig = orig_dev;
-	else if (netif_is_bond_slave(orig_dev)) {
-		struct net_device *bond_master = ACCESS_ONCE(orig_dev->master);
-
-		if (likely(bond_master)) {
-			if (__skb_bond_should_drop(skb, bond_master)) {
-				skb->deliver_no_wcard = 1;
-				/* deliver only exact match */
-				null_or_orig = orig_dev;
-			} else
-				skb->dev = bond_master;
-		}
-	}
-
 	__this_cpu_inc(softnet_data.processed);
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
@@ -3196,6 +3140,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	pt_prev = NULL;
 
 	rcu_read_lock();
+	orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
 
 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
@@ -3205,8 +3150,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 #endif
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
-		    ptype->dev == orig_dev) {
+		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
@@ -3220,7 +3164,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-	/* Handle special case of bridge or macvlan */
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
 		if (pt_prev) {
@@ -3244,24 +3187,16 @@ ncls:
 			goto out;
 	}
 
-	/*
-	 * Make sure frames received on VLAN interfaces stacked on
-	 * bonding interfaces still make their way to any base bonding
-	 * device that may have registered for a specific ptype.  The
-	 * handler may have to adjust skb->dev and orig_dev.
-	 */
-	orig_or_bond = orig_dev;
-	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
-	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
-		orig_or_bond = vlan_dev_real_dev(skb->dev);
-	}
+	vlan_on_bond_hook(skb);
+
+	/* deliver only exact match when indicated */
+	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
 
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type && (ptype->dev == null_or_orig ||
-		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
-		     ptype->dev == orig_or_bond)) {
+		if (ptype->type == type &&
+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V2] net: convert bonding to use rx_handler
  2011-02-18 20:58             ` [patch net-next-2.6 V2] " Jiri Pirko
@ 2011-02-18 23:06               ` Jay Vosburgh
  2011-02-19  7:44                 ` Jiri Pirko
  2011-02-19  8:05                 ` [patch net-next-2.6 V3] " Jiri Pirko
  2011-02-23 19:05               ` Jiri Pirko
  1 sibling, 2 replies; 52+ messages in thread
From: Jay Vosburgh @ 2011-02-18 23:06 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger,
	nicolas.2p.debian, andy

Jiri Pirko <jpirko@redhat.com> wrote:

>This patch converts bonding to use rx_handler. Results in cleaner
>__netif_receive_skb() with much less exceptions needed. Also bond-specific
>work is moved into bond code.
>
>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>v1->v2:
>	using skb_iif instead of new input_dev to remember original device
>
>---
> drivers/net/bonding/bond_main.c |   75 ++++++++++++++++++++++++++-
> net/core/dev.c                  |  111 ++++++++-------------------------------
> 2 files changed, 97 insertions(+), 89 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 77e3c6a..a856a11 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -1423,6 +1423,68 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
> 	bond->setup_by_slave = 1;
> }
>
>+/* On bonding slaves other than the currently active slave, suppress
>+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>+ * ARP on active-backup slaves with arp_validate enabled.
>+ */
>+static bool bond_should_deliver_exact_match(struct sk_buff *skb,
>+					    struct net_device *slave_dev,
>+					    struct net_device *bond_dev)
>+{
>+	if (slave_dev->priv_flags & IFF_SLAVE_INACTIVE) {
>+		if (slave_dev->priv_flags & IFF_SLAVE_NEEDARP &&
>+		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>+			return false;
>+
>+		if (bond_dev->priv_flags & IFF_MASTER_ALB &&
>+		    skb->pkt_type != PACKET_BROADCAST &&
>+		    skb->pkt_type != PACKET_MULTICAST)
>+				return false;
>+
>+		if (bond_dev->priv_flags & IFF_MASTER_8023AD &&
>+		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>+			return false;

	Since this is all in the bonding code now, it should be possible
to do away with using priv_flags for all (or at least most) of this.
Perhaps in a follow-on patch.

>+
>+		return true;
>+	}
>+	return false;
>+}
>+
>+static struct sk_buff *bond_handle_frame(struct sk_buff *skb)
>+{
>+	struct net_device *slave_dev;
>+	struct net_device *bond_dev;
>+
>+	skb = skb_share_check(skb, GFP_ATOMIC);
>+	if (unlikely(!skb))
>+		return NULL;
>+	slave_dev = skb->dev;
>+	bond_dev = ACCESS_ONCE(slave_dev->master);
>+	if (unlikely(!bond_dev))
>+		return skb;
>+
>+	if (bond_dev->priv_flags & IFF_MASTER_ARPMON)
>+		slave_dev->last_rx = jiffies;

	The last_rx field could probably move into bonding as well,
although it looks like there are a couple of drivers using last_rx for
something (more than just setting it).

>+	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
>+		skb->deliver_no_wcard = 1;
>+		return skb;
>+	}
>+
>+	skb->dev = bond_dev;
>+
>+	if (bond_dev->priv_flags & IFF_MASTER_ALB &&
>+	    bond_dev->priv_flags & IFF_BRIDGE_PORT &&
>+	    skb->pkt_type == PACKET_HOST) {
>+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
>+
>+		memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>+	}
>+
>+	netif_rx(skb);
>+	return NULL;
>+}
>+
> /* enslave device <slave> to bond device <master> */
> int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> {
>@@ -1599,11 +1661,17 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> 		pr_debug("Error %d calling netdev_set_bond_master\n", res);
> 		goto err_restore_mac;
> 	}
>+	res = netdev_rx_handler_register(slave_dev, bond_handle_frame, NULL);
>+	if (res) {
>+		pr_debug("Error %d calling netdev_rx_handler_register\n", res);
>+		goto err_unset_master;
>+	}
>+
> 	/* open the slave since the application closed it */
> 	res = dev_open(slave_dev);
> 	if (res) {
> 		pr_debug("Opening slave %s failed\n", slave_dev->name);
>-		goto err_unset_master;
>+		goto err_unreg_rxhandler;
> 	}
>
> 	new_slave->dev = slave_dev;
>@@ -1811,6 +1879,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> err_close:
> 	dev_close(slave_dev);
>
>+err_unreg_rxhandler:
>+	netdev_rx_handler_unregister(slave_dev);
>+
> err_unset_master:
> 	netdev_set_bond_master(slave_dev, NULL);
>
>@@ -1992,6 +2063,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
> 		netif_addr_unlock_bh(bond_dev);
> 	}
>
>+	netdev_rx_handler_unregister(slave_dev);
> 	netdev_set_bond_master(slave_dev, NULL);
>
> #ifdef CONFIG_NET_POLL_CONTROLLER
>@@ -2114,6 +2186,7 @@ static int bond_release_all(struct net_device *bond_dev)
> 			netif_addr_unlock_bh(bond_dev);
> 		}
>
>+		netdev_rx_handler_unregister(slave_dev);
> 		netdev_set_bond_master(slave_dev, NULL);
>
> 		/* close slave before restoring its mac address */
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 4f69439..580cff1 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -3092,63 +3092,31 @@ void netdev_rx_handler_unregister(struct net_device *dev)
> }
> EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
>
>-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>-					      struct net_device *master)
>+static void vlan_on_bond_hook(struct sk_buff *skb)
> {
>-	if (skb->pkt_type == PACKET_HOST) {
>-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
>-
>-		memcpy(dest, master->dev_addr, ETH_ALEN);
>-	}
>-}
>-
>-/* On bonding slaves other than the currently active slave, suppress
>- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>- * ARP on active-backup slaves with arp_validate enabled.
>- */
>-static int __skb_bond_should_drop(struct sk_buff *skb,
>-				  struct net_device *master)
>-{
>-	struct net_device *dev = skb->dev;
>-
>-	if (master->priv_flags & IFF_MASTER_ARPMON)
>-		dev->last_rx = jiffies;
>-
>-	if ((master->priv_flags & IFF_MASTER_ALB) &&
>-	    (master->priv_flags & IFF_BRIDGE_PORT)) {
>-		/* Do address unmangle. The local destination address
>-		 * will be always the one master has. Provides the right
>-		 * functionality in a bridge.
>-		 */
>-		skb_bond_set_mac_by_master(skb, master);
>-	}
>-
>-	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>-			return 0;
>-
>-		if (master->priv_flags & IFF_MASTER_ALB) {
>-			if (skb->pkt_type != PACKET_BROADCAST &&
>-			    skb->pkt_type != PACKET_MULTICAST)
>-				return 0;
>-		}
>-		if (master->priv_flags & IFF_MASTER_8023AD &&
>-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>-			return 0;
>+	/*
>+	 * Make sure ARP frames received on VLAN interfaces stacked on
>+	 * bonding interfaces still make their way to any base bonding
>+	 * device that may have registered for a specific ptype.
>+	 */
>+	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
>+	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
>+	    skb->protocol == htons(ETH_P_ARP)) {
>+		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
>
>-		return 1;
>+		if (!skb2)
>+			return;
>+		skb2->dev = vlan_dev_real_dev(skb->dev);
>+		netif_rx(skb2);
> 	}
>-	return 0;
> }
>
> static int __netif_receive_skb(struct sk_buff *skb)
> {
> 	struct packet_type *ptype, *pt_prev;
> 	rx_handler_func_t *rx_handler;
>+	struct net_device *null_or_dev;
> 	struct net_device *orig_dev;
>-	struct net_device *null_or_orig;
>-	struct net_device *orig_or_bond;
> 	int ret = NET_RX_DROP;
> 	__be16 type;
>
>@@ -3164,30 +3132,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
> 	if (!skb->skb_iif)
> 		skb->skb_iif = skb->dev->ifindex;
>
>-	/*
>-	 * bonding note: skbs received on inactive slaves should only
>-	 * be delivered to pkt handlers that are exact matches.  Also
>-	 * the deliver_no_wcard flag will be set.  If packet handlers
>-	 * are sensitive to duplicate packets these skbs will need to
>-	 * be dropped at the handler.
>-	 */
>-	null_or_orig = NULL;
>-	orig_dev = skb->dev;
>-	if (skb->deliver_no_wcard)
>-		null_or_orig = orig_dev;
>-	else if (netif_is_bond_slave(orig_dev)) {
>-		struct net_device *bond_master = ACCESS_ONCE(orig_dev->master);
>-
>-		if (likely(bond_master)) {
>-			if (__skb_bond_should_drop(skb, bond_master)) {
>-				skb->deliver_no_wcard = 1;
>-				/* deliver only exact match */
>-				null_or_orig = orig_dev;
>-			} else
>-				skb->dev = bond_master;
>-		}
>-	}
>-
> 	__this_cpu_inc(softnet_data.processed);
> 	skb_reset_network_header(skb);
> 	skb_reset_transport_header(skb);
>@@ -3196,6 +3140,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
> 	pt_prev = NULL;
>
> 	rcu_read_lock();
>+	orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);

	Aren't most packets going to have orig_dev == skb->dev at this
point?  Can this be combined with the skb_iif test a few lines above
this in __netif_receive_skb, looking something like:

	if (!skb->skb_iif) {
		skb->skb_iif = skb->dev->ifindex;
		orig_dev = skb->dev;
	else {
		orig_dev = dev_get_by_index_rcu(...);
	}

	Presumably moving the whole thing down inside the rcu_read_lock.

	VLAN packets should come through here twice, but the first time
through is before the call to vlan_hwaccel_do_receive, so skb->dev
hasn't been set to the VLAN's dev yet.

	Unless, of course, you find a place to store the orig_dev.

	-J

> #ifdef CONFIG_NET_CLS_ACT
> 	if (skb->tc_verd & TC_NCLS) {
>@@ -3205,8 +3150,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
> #endif
>
> 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
>-		    ptype->dev == orig_dev) {
>+		if (!ptype->dev || ptype->dev == skb->dev) {
> 			if (pt_prev)
> 				ret = deliver_skb(skb, pt_prev, orig_dev);
> 			pt_prev = ptype;
>@@ -3220,7 +3164,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
> ncls:
> #endif
>
>-	/* Handle special case of bridge or macvlan */
> 	rx_handler = rcu_dereference(skb->dev->rx_handler);
> 	if (rx_handler) {
> 		if (pt_prev) {
>@@ -3244,24 +3187,16 @@ ncls:
> 			goto out;
> 	}
>
>-	/*
>-	 * Make sure frames received on VLAN interfaces stacked on
>-	 * bonding interfaces still make their way to any base bonding
>-	 * device that may have registered for a specific ptype.  The
>-	 * handler may have to adjust skb->dev and orig_dev.
>-	 */
>-	orig_or_bond = orig_dev;
>-	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
>-	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
>-		orig_or_bond = vlan_dev_real_dev(skb->dev);
>-	}
>+	vlan_on_bond_hook(skb);
>+
>+	/* deliver only exact match when indicated */
>+	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
>
> 	type = skb->protocol;
> 	list_for_each_entry_rcu(ptype,
> 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
>-		if (ptype->type == type && (ptype->dev == null_or_orig ||
>-		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
>-		     ptype->dev == orig_or_bond)) {
>+		if (ptype->type == type &&
>+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
> 			if (pt_prev)
> 				ret = deliver_skb(skb, pt_prev, orig_dev);
> 			pt_prev = ptype;
>-- 
>1.7.3.4
>

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V2] net: convert bonding to use rx_handler
  2011-02-18 23:06               ` Jay Vosburgh
@ 2011-02-19  7:44                 ` Jiri Pirko
  2011-02-19  8:05                 ` [patch net-next-2.6 V3] " Jiri Pirko
  1 sibling, 0 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-19  7:44 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger,
	nicolas.2p.debian, andy

Sat, Feb 19, 2011 at 12:06:11AM CET, fubar@us.ibm.com wrote:
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>>This patch converts bonding to use rx_handler. Results in cleaner
>>__netif_receive_skb() with much less exceptions needed. Also bond-specific
>>work is moved into bond code.
>>
>>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>
>>v1->v2:
>>	using skb_iif instead of new input_dev to remember original device
>>
>>---
>> drivers/net/bonding/bond_main.c |   75 ++++++++++++++++++++++++++-
>> net/core/dev.c                  |  111 ++++++++-------------------------------
>> 2 files changed, 97 insertions(+), 89 deletions(-)
>>
>>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>>index 77e3c6a..a856a11 100644
>>--- a/drivers/net/bonding/bond_main.c
>>+++ b/drivers/net/bonding/bond_main.c
>>@@ -1423,6 +1423,68 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
>> 	bond->setup_by_slave = 1;
>> }
>>
>>+/* On bonding slaves other than the currently active slave, suppress
>>+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>>+ * ARP on active-backup slaves with arp_validate enabled.
>>+ */
>>+static bool bond_should_deliver_exact_match(struct sk_buff *skb,
>>+					    struct net_device *slave_dev,
>>+					    struct net_device *bond_dev)
>>+{
>>+	if (slave_dev->priv_flags & IFF_SLAVE_INACTIVE) {
>>+		if (slave_dev->priv_flags & IFF_SLAVE_NEEDARP &&
>>+		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>>+			return false;
>>+
>>+		if (bond_dev->priv_flags & IFF_MASTER_ALB &&
>>+		    skb->pkt_type != PACKET_BROADCAST &&
>>+		    skb->pkt_type != PACKET_MULTICAST)
>>+				return false;
>>+
>>+		if (bond_dev->priv_flags & IFF_MASTER_8023AD &&
>>+		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>>+			return false;
>
>	Since this is all in the bonding code now, it should be possible
>to do away with using priv_flags for all (or at least most) of this.
>Perhaps in a follow-on patch.

follow-on patch was exatly my intension to do this in.

>
>>+
>>+		return true;
>>+	}
>>+	return false;
>>+}
>>+
>>+static struct sk_buff *bond_handle_frame(struct sk_buff *skb)
>>+{
>>+	struct net_device *slave_dev;
>>+	struct net_device *bond_dev;
>>+
>>+	skb = skb_share_check(skb, GFP_ATOMIC);
>>+	if (unlikely(!skb))
>>+		return NULL;
>>+	slave_dev = skb->dev;
>>+	bond_dev = ACCESS_ONCE(slave_dev->master);
>>+	if (unlikely(!bond_dev))
>>+		return skb;
>>+
>>+	if (bond_dev->priv_flags & IFF_MASTER_ARPMON)
>>+		slave_dev->last_rx = jiffies;
>
>	The last_rx field could probably move into bonding as well,
>although it looks like there are a couple of drivers using last_rx for
>something (more than just setting it).

I'll leave this to follow-on patch also.

>
>>+	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
>>+		skb->deliver_no_wcard = 1;
>>+		return skb;
>>+	}
>>+
>>+	skb->dev = bond_dev;
>>+
>>+	if (bond_dev->priv_flags & IFF_MASTER_ALB &&
>>+	    bond_dev->priv_flags & IFF_BRIDGE_PORT &&
>>+	    skb->pkt_type == PACKET_HOST) {
>>+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
>>+
>>+		memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>>+	}
>>+
>>+	netif_rx(skb);
>>+	return NULL;
>>+}
>>+
>> /* enslave device <slave> to bond device <master> */
>> int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>> {
>>@@ -1599,11 +1661,17 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>> 		pr_debug("Error %d calling netdev_set_bond_master\n", res);
>> 		goto err_restore_mac;
>> 	}
>>+	res = netdev_rx_handler_register(slave_dev, bond_handle_frame, NULL);
>>+	if (res) {
>>+		pr_debug("Error %d calling netdev_rx_handler_register\n", res);
>>+		goto err_unset_master;
>>+	}
>>+
>> 	/* open the slave since the application closed it */
>> 	res = dev_open(slave_dev);
>> 	if (res) {
>> 		pr_debug("Opening slave %s failed\n", slave_dev->name);
>>-		goto err_unset_master;
>>+		goto err_unreg_rxhandler;
>> 	}
>>
>> 	new_slave->dev = slave_dev;
>>@@ -1811,6 +1879,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>> err_close:
>> 	dev_close(slave_dev);
>>
>>+err_unreg_rxhandler:
>>+	netdev_rx_handler_unregister(slave_dev);
>>+
>> err_unset_master:
>> 	netdev_set_bond_master(slave_dev, NULL);
>>
>>@@ -1992,6 +2063,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
>> 		netif_addr_unlock_bh(bond_dev);
>> 	}
>>
>>+	netdev_rx_handler_unregister(slave_dev);
>> 	netdev_set_bond_master(slave_dev, NULL);
>>
>> #ifdef CONFIG_NET_POLL_CONTROLLER
>>@@ -2114,6 +2186,7 @@ static int bond_release_all(struct net_device *bond_dev)
>> 			netif_addr_unlock_bh(bond_dev);
>> 		}
>>
>>+		netdev_rx_handler_unregister(slave_dev);
>> 		netdev_set_bond_master(slave_dev, NULL);
>>
>> 		/* close slave before restoring its mac address */
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index 4f69439..580cff1 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -3092,63 +3092,31 @@ void netdev_rx_handler_unregister(struct net_device *dev)
>> }
>> EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
>>
>>-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>>-					      struct net_device *master)
>>+static void vlan_on_bond_hook(struct sk_buff *skb)
>> {
>>-	if (skb->pkt_type == PACKET_HOST) {
>>-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
>>-
>>-		memcpy(dest, master->dev_addr, ETH_ALEN);
>>-	}
>>-}
>>-
>>-/* On bonding slaves other than the currently active slave, suppress
>>- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>>- * ARP on active-backup slaves with arp_validate enabled.
>>- */
>>-static int __skb_bond_should_drop(struct sk_buff *skb,
>>-				  struct net_device *master)
>>-{
>>-	struct net_device *dev = skb->dev;
>>-
>>-	if (master->priv_flags & IFF_MASTER_ARPMON)
>>-		dev->last_rx = jiffies;
>>-
>>-	if ((master->priv_flags & IFF_MASTER_ALB) &&
>>-	    (master->priv_flags & IFF_BRIDGE_PORT)) {
>>-		/* Do address unmangle. The local destination address
>>-		 * will be always the one master has. Provides the right
>>-		 * functionality in a bridge.
>>-		 */
>>-		skb_bond_set_mac_by_master(skb, master);
>>-	}
>>-
>>-	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>>-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>>-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>>-			return 0;
>>-
>>-		if (master->priv_flags & IFF_MASTER_ALB) {
>>-			if (skb->pkt_type != PACKET_BROADCAST &&
>>-			    skb->pkt_type != PACKET_MULTICAST)
>>-				return 0;
>>-		}
>>-		if (master->priv_flags & IFF_MASTER_8023AD &&
>>-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>>-			return 0;
>>+	/*
>>+	 * Make sure ARP frames received on VLAN interfaces stacked on
>>+	 * bonding interfaces still make their way to any base bonding
>>+	 * device that may have registered for a specific ptype.
>>+	 */
>>+	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
>>+	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
>>+	    skb->protocol == htons(ETH_P_ARP)) {
>>+		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
>>
>>-		return 1;
>>+		if (!skb2)
>>+			return;
>>+		skb2->dev = vlan_dev_real_dev(skb->dev);
>>+		netif_rx(skb2);
>> 	}
>>-	return 0;
>> }
>>
>> static int __netif_receive_skb(struct sk_buff *skb)
>> {
>> 	struct packet_type *ptype, *pt_prev;
>> 	rx_handler_func_t *rx_handler;
>>+	struct net_device *null_or_dev;
>> 	struct net_device *orig_dev;
>>-	struct net_device *null_or_orig;
>>-	struct net_device *orig_or_bond;
>> 	int ret = NET_RX_DROP;
>> 	__be16 type;
>>
>>@@ -3164,30 +3132,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>> 	if (!skb->skb_iif)
>> 		skb->skb_iif = skb->dev->ifindex;
>>
>>-	/*
>>-	 * bonding note: skbs received on inactive slaves should only
>>-	 * be delivered to pkt handlers that are exact matches.  Also
>>-	 * the deliver_no_wcard flag will be set.  If packet handlers
>>-	 * are sensitive to duplicate packets these skbs will need to
>>-	 * be dropped at the handler.
>>-	 */
>>-	null_or_orig = NULL;
>>-	orig_dev = skb->dev;
>>-	if (skb->deliver_no_wcard)
>>-		null_or_orig = orig_dev;
>>-	else if (netif_is_bond_slave(orig_dev)) {
>>-		struct net_device *bond_master = ACCESS_ONCE(orig_dev->master);
>>-
>>-		if (likely(bond_master)) {
>>-			if (__skb_bond_should_drop(skb, bond_master)) {
>>-				skb->deliver_no_wcard = 1;
>>-				/* deliver only exact match */
>>-				null_or_orig = orig_dev;
>>-			} else
>>-				skb->dev = bond_master;
>>-		}
>>-	}
>>-
>> 	__this_cpu_inc(softnet_data.processed);
>> 	skb_reset_network_header(skb);
>> 	skb_reset_transport_header(skb);
>>@@ -3196,6 +3140,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>> 	pt_prev = NULL;
>>
>> 	rcu_read_lock();
>>+	orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>
>	Aren't most packets going to have orig_dev == skb->dev at this
>point?  Can this be combined with the skb_iif test a few lines above
>this in __netif_receive_skb, looking something like:
>
>	if (!skb->skb_iif) {
>		skb->skb_iif = skb->dev->ifindex;
>		orig_dev = skb->dev;
>	else {
>		orig_dev = dev_get_by_index_rcu(...);
>	}
>
>	Presumably moving the whole thing down inside the rcu_read_lock.

Yep, that's reasonable. Thanks.

>
>	VLAN packets should come through here twice, but the first time
>through is before the call to vlan_hwaccel_do_receive, so skb->dev
>hasn't been set to the VLAN's dev yet.
>
>	Unless, of course, you find a place to store the orig_dev.
>
>	-J
>
>> #ifdef CONFIG_NET_CLS_ACT
>> 	if (skb->tc_verd & TC_NCLS) {
>>@@ -3205,8 +3150,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>> #endif
>>
>> 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>>-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
>>-		    ptype->dev == orig_dev) {
>>+		if (!ptype->dev || ptype->dev == skb->dev) {
>> 			if (pt_prev)
>> 				ret = deliver_skb(skb, pt_prev, orig_dev);
>> 			pt_prev = ptype;
>>@@ -3220,7 +3164,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>> ncls:
>> #endif
>>
>>-	/* Handle special case of bridge or macvlan */
>> 	rx_handler = rcu_dereference(skb->dev->rx_handler);
>> 	if (rx_handler) {
>> 		if (pt_prev) {
>>@@ -3244,24 +3187,16 @@ ncls:
>> 			goto out;
>> 	}
>>
>>-	/*
>>-	 * Make sure frames received on VLAN interfaces stacked on
>>-	 * bonding interfaces still make their way to any base bonding
>>-	 * device that may have registered for a specific ptype.  The
>>-	 * handler may have to adjust skb->dev and orig_dev.
>>-	 */
>>-	orig_or_bond = orig_dev;
>>-	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
>>-	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
>>-		orig_or_bond = vlan_dev_real_dev(skb->dev);
>>-	}
>>+	vlan_on_bond_hook(skb);
>>+
>>+	/* deliver only exact match when indicated */
>>+	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
>>
>> 	type = skb->protocol;
>> 	list_for_each_entry_rcu(ptype,
>> 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
>>-		if (ptype->type == type && (ptype->dev == null_or_orig ||
>>-		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
>>-		     ptype->dev == orig_or_bond)) {
>>+		if (ptype->type == type &&
>>+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> 			if (pt_prev)
>> 				ret = deliver_skb(skb, pt_prev, orig_dev);
>> 			pt_prev = ptype;
>>-- 
>>1.7.3.4
>>
>
>---
>	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 52+ messages in thread

* [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-18 23:06               ` Jay Vosburgh
  2011-02-19  7:44                 ` Jiri Pirko
@ 2011-02-19  8:05                 ` Jiri Pirko
  2011-02-19  8:37                   ` Eric Dumazet
  2011-02-19 10:56                   ` Nicolas de Pesloüan
  1 sibling, 2 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-19  8:05 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger,
	nicolas.2p.debian, andy

This patch converts bonding to use rx_handler. Results in cleaner
__netif_receive_skb() with much less exceptions needed. Also
bond-specific work is moved into bond code.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

v1->v2:
        using skb_iif instead of new input_dev to remember original
	device
v2->v3:
	set orig_dev = skb->dev if skb_iif is set

---
 drivers/net/bonding/bond_main.c |   75 ++++++++++++++++++++++++-
 net/core/dev.c                  |  120 +++++++++-----------------------------
 2 files changed, 103 insertions(+), 92 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 77e3c6a..a856a11 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1423,6 +1423,68 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 	bond->setup_by_slave = 1;
 }
 
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+static bool bond_should_deliver_exact_match(struct sk_buff *skb,
+					    struct net_device *slave_dev,
+					    struct net_device *bond_dev)
+{
+	if (slave_dev->priv_flags & IFF_SLAVE_INACTIVE) {
+		if (slave_dev->priv_flags & IFF_SLAVE_NEEDARP &&
+		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+			return false;
+
+		if (bond_dev->priv_flags & IFF_MASTER_ALB &&
+		    skb->pkt_type != PACKET_BROADCAST &&
+		    skb->pkt_type != PACKET_MULTICAST)
+				return false;
+
+		if (bond_dev->priv_flags & IFF_MASTER_8023AD &&
+		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+			return false;
+
+		return true;
+	}
+	return false;
+}
+
+static struct sk_buff *bond_handle_frame(struct sk_buff *skb)
+{
+	struct net_device *slave_dev;
+	struct net_device *bond_dev;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return NULL;
+	slave_dev = skb->dev;
+	bond_dev = ACCESS_ONCE(slave_dev->master);
+	if (unlikely(!bond_dev))
+		return skb;
+
+	if (bond_dev->priv_flags & IFF_MASTER_ARPMON)
+		slave_dev->last_rx = jiffies;
+
+	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
+		skb->deliver_no_wcard = 1;
+		return skb;
+	}
+
+	skb->dev = bond_dev;
+
+	if (bond_dev->priv_flags & IFF_MASTER_ALB &&
+	    bond_dev->priv_flags & IFF_BRIDGE_PORT &&
+	    skb->pkt_type == PACKET_HOST) {
+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+		memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+	}
+
+	netif_rx(skb);
+	return NULL;
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 {
@@ -1599,11 +1661,17 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		pr_debug("Error %d calling netdev_set_bond_master\n", res);
 		goto err_restore_mac;
 	}
+	res = netdev_rx_handler_register(slave_dev, bond_handle_frame, NULL);
+	if (res) {
+		pr_debug("Error %d calling netdev_rx_handler_register\n", res);
+		goto err_unset_master;
+	}
+
 	/* open the slave since the application closed it */
 	res = dev_open(slave_dev);
 	if (res) {
 		pr_debug("Opening slave %s failed\n", slave_dev->name);
-		goto err_unset_master;
+		goto err_unreg_rxhandler;
 	}
 
 	new_slave->dev = slave_dev;
@@ -1811,6 +1879,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 err_close:
 	dev_close(slave_dev);
 
+err_unreg_rxhandler:
+	netdev_rx_handler_unregister(slave_dev);
+
 err_unset_master:
 	netdev_set_bond_master(slave_dev, NULL);
 
@@ -1992,6 +2063,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 		netif_addr_unlock_bh(bond_dev);
 	}
 
+	netdev_rx_handler_unregister(slave_dev);
 	netdev_set_bond_master(slave_dev, NULL);
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -2114,6 +2186,7 @@ static int bond_release_all(struct net_device *bond_dev)
 			netif_addr_unlock_bh(bond_dev);
 		}
 
+		netdev_rx_handler_unregister(slave_dev);
 		netdev_set_bond_master(slave_dev, NULL);
 
 		/* close slave before restoring its mac address */
diff --git a/net/core/dev.c b/net/core/dev.c
index 4f69439..4ebf7fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3092,54 +3092,23 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
-					      struct net_device *master)
+static void vlan_on_bond_hook(struct sk_buff *skb)
 {
-	if (skb->pkt_type == PACKET_HOST) {
-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
-
-		memcpy(dest, master->dev_addr, ETH_ALEN);
-	}
-}
-
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static int __skb_bond_should_drop(struct sk_buff *skb,
-				  struct net_device *master)
-{
-	struct net_device *dev = skb->dev;
-
-	if (master->priv_flags & IFF_MASTER_ARPMON)
-		dev->last_rx = jiffies;
-
-	if ((master->priv_flags & IFF_MASTER_ALB) &&
-	    (master->priv_flags & IFF_BRIDGE_PORT)) {
-		/* Do address unmangle. The local destination address
-		 * will be always the one master has. Provides the right
-		 * functionality in a bridge.
-		 */
-		skb_bond_set_mac_by_master(skb, master);
-	}
-
-	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-			return 0;
-
-		if (master->priv_flags & IFF_MASTER_ALB) {
-			if (skb->pkt_type != PACKET_BROADCAST &&
-			    skb->pkt_type != PACKET_MULTICAST)
-				return 0;
-		}
-		if (master->priv_flags & IFF_MASTER_8023AD &&
-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-			return 0;
+	/*
+	 * Make sure ARP frames received on VLAN interfaces stacked on
+	 * bonding interfaces still make their way to any base bonding
+	 * device that may have registered for a specific ptype.
+	 */
+	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
+	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
+	    skb->protocol == htons(ETH_P_ARP)) {
+		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 
-		return 1;
+		if (!skb2)
+			return;
+		skb2->dev = vlan_dev_real_dev(skb->dev);
+		netif_rx(skb2);
 	}
-	return 0;
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)
@@ -3147,8 +3116,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
-	struct net_device *null_or_orig;
-	struct net_device *orig_or_bond;
+	struct net_device *null_or_dev;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -3161,33 +3129,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (netpoll_receive_skb(skb))
 		return NET_RX_DROP;
 
-	if (!skb->skb_iif)
-		skb->skb_iif = skb->dev->ifindex;
-
-	/*
-	 * bonding note: skbs received on inactive slaves should only
-	 * be delivered to pkt handlers that are exact matches.  Also
-	 * the deliver_no_wcard flag will be set.  If packet handlers
-	 * are sensitive to duplicate packets these skbs will need to
-	 * be dropped at the handler.
-	 */
-	null_or_orig = NULL;
-	orig_dev = skb->dev;
-	if (skb->deliver_no_wcard)
-		null_or_orig = orig_dev;
-	else if (netif_is_bond_slave(orig_dev)) {
-		struct net_device *bond_master = ACCESS_ONCE(orig_dev->master);
-
-		if (likely(bond_master)) {
-			if (__skb_bond_should_drop(skb, bond_master)) {
-				skb->deliver_no_wcard = 1;
-				/* deliver only exact match */
-				null_or_orig = orig_dev;
-			} else
-				skb->dev = bond_master;
-		}
-	}
-
 	__this_cpu_inc(softnet_data.processed);
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
@@ -3197,6 +3138,13 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 	rcu_read_lock();
 
+	if (!skb->skb_iif) {
+		skb->skb_iif = skb->dev->ifindex;
+		orig_dev = skb->dev;
+	} else {
+		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
+	}
+
 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -3205,8 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 #endif
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
-		    ptype->dev == orig_dev) {
+		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
@@ -3220,7 +3167,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-	/* Handle special case of bridge or macvlan */
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
 		if (pt_prev) {
@@ -3244,24 +3190,16 @@ ncls:
 			goto out;
 	}
 
-	/*
-	 * Make sure frames received on VLAN interfaces stacked on
-	 * bonding interfaces still make their way to any base bonding
-	 * device that may have registered for a specific ptype.  The
-	 * handler may have to adjust skb->dev and orig_dev.
-	 */
-	orig_or_bond = orig_dev;
-	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
-	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
-		orig_or_bond = vlan_dev_real_dev(skb->dev);
-	}
+	vlan_on_bond_hook(skb);
+
+	/* deliver only exact match when indicated */
+	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
 
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type && (ptype->dev == null_or_orig ||
-		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
-		     ptype->dev == orig_or_bond)) {
+		if (ptype->type == type &&
+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19  8:05                 ` [patch net-next-2.6 V3] " Jiri Pirko
@ 2011-02-19  8:37                   ` Eric Dumazet
  2011-02-19  8:58                     ` Jiri Pirko
  2011-02-19 10:56                   ` Nicolas de Pesloüan
  1 sibling, 1 reply; 52+ messages in thread
From: Eric Dumazet @ 2011-02-19  8:37 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jay Vosburgh, David Miller, kaber, netdev, shemminger,
	nicolas.2p.debian, andy

Le samedi 19 février 2011 à 09:05 +0100, Jiri Pirko a écrit :
> This patch converts bonding to use rx_handler. Results in cleaner
> __netif_receive_skb() with much less exceptions needed. Also
> bond-specific work is moved into bond code.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> v1->v2:
>         using skb_iif instead of new input_dev to remember original
> 	device
> v2->v3:
> 	set orig_dev = skb->dev if skb_iif is set
> 

Seems much better ;)

Do you have some performance numbers ?




^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19  8:37                   ` Eric Dumazet
@ 2011-02-19  8:58                     ` Jiri Pirko
  2011-02-19  9:22                       ` Eric Dumazet
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-19  8:58 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jay Vosburgh, David Miller, kaber, netdev, shemminger,
	nicolas.2p.debian, andy

Sat, Feb 19, 2011 at 09:37:55AM CET, eric.dumazet@gmail.com wrote:
>Le samedi 19 février 2011 à 09:05 +0100, Jiri Pirko a écrit :
>> This patch converts bonding to use rx_handler. Results in cleaner
>> __netif_receive_skb() with much less exceptions needed. Also
>> bond-specific work is moved into bond code.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>> v1->v2:
>>         using skb_iif instead of new input_dev to remember original
>> 	device
>> v2->v3:
>> 	set orig_dev = skb->dev if skb_iif is set
>> 
>
>Seems much better ;)
>
>Do you have some performance numbers ?

I don't. I can surely obtain some. What's the best way to measure this?

>
>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19  8:58                     ` Jiri Pirko
@ 2011-02-19  9:22                       ` Eric Dumazet
  0 siblings, 0 replies; 52+ messages in thread
From: Eric Dumazet @ 2011-02-19  9:22 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jay Vosburgh, David Miller, kaber, netdev, shemminger,
	nicolas.2p.debian, andy

Le samedi 19 février 2011 à 09:58 +0100, Jiri Pirko a écrit :
> Sat, Feb 19, 2011 at 09:37:55AM CET, eric.dumazet@gmail.com wrote:
> >Le samedi 19 février 2011 à 09:05 +0100, Jiri Pirko a écrit :
> >> This patch converts bonding to use rx_handler. Results in cleaner
> >> __netif_receive_skb() with much less exceptions needed. Also
> >> bond-specific work is moved into bond code.
> >> 
> >> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> >> 
> >> v1->v2:
> >>         using skb_iif instead of new input_dev to remember original
> >> 	device
> >> v2->v3:
> >> 	set orig_dev = skb->dev if skb_iif is set
> >> 
> >
> >Seems much better ;)
> >
> >Do you have some performance numbers ?
> 
> I don't. I can surely obtain some. What's the best way to measure this?
> 

Hmm, since its receive path :

Two machines, one sending (pktgen) a flood, one receiving it and
check/count how many frames hit destination, before/after patch.




^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19  8:05                 ` [patch net-next-2.6 V3] " Jiri Pirko
  2011-02-19  8:37                   ` Eric Dumazet
@ 2011-02-19 10:56                   ` Nicolas de Pesloüan
  2011-02-19 11:08                     ` Jiri Pirko
  1 sibling, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-19 10:56 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy

Le 19/02/2011 09:05, Jiri Pirko a écrit :
> This patch converts bonding to use rx_handler. Results in cleaner
> __netif_receive_skb() with much less exceptions needed. Also
> bond-specific work is moved into bond code.
>
> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>
> v1->v2:
>          using skb_iif instead of new input_dev to remember original
> 	device
> v2->v3:
> 	set orig_dev = skb->dev if skb_iif is set
>

Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?

Bonding used to be handled with very few overhead, simply replacing skb->dev with skb->dev->master. 
Time has passed and we eventually added many special processing for bonding into 
__netif_receive_skb(), but the overhead remained very light.

Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.

Can't we, instead, loop inside __netif_receive_skb(), and deliver whatever need to be delivered, to 
whoever need, inside the loop ?

rx_handler = rcu_dereference(skb->dev->rx_handler);
while (rx_handler) {
	/* ...  */
	orig_dev = skb->dev;
	skb = rx_handler(skb);
	/* ... */
	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
}

This would reduce the overhead, while still allowing nesting: vlan on top on bonding, bridge on top 
on bonding, ...

That way, we can probably keep the list of crossed devices inside a local array, and call 
deliver_skb() with the current "orig_dev" when appropriate. No need to overload sk_buff nor to use a 
global variable.

Of course, this might be a very simplistic view.

Any comments?

	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19 10:56                   ` Nicolas de Pesloüan
@ 2011-02-19 11:08                     ` Jiri Pirko
  2011-02-19 11:28                       ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-19 11:08 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy

Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>This patch converts bonding to use rx_handler. Results in cleaner
>>__netif_receive_skb() with much less exceptions needed. Also
>>bond-specific work is moved into bond code.
>>
>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>
>>v1->v2:
>>         using skb_iif instead of new input_dev to remember original
>>	device
>>v2->v3:
>>	set orig_dev = skb->dev if skb_iif is set
>>
>
>Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>
>Bonding used to be handled with very few overhead, simply replacing
>skb->dev with skb->dev->master. Time has passed and we eventually
>added many special processing for bonding into __netif_receive_skb(),
>but the overhead remained very light.
>
>Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>
>Can't we, instead, loop inside __netif_receive_skb(), and deliver
>whatever need to be delivered, to whoever need, inside the loop ?
>
>rx_handler = rcu_dereference(skb->dev->rx_handler);
>while (rx_handler) {
>	/* ...  */
>	orig_dev = skb->dev;
>	skb = rx_handler(skb);
>	/* ... */
>	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>}
>
>This would reduce the overhead, while still allowing nesting: vlan on
>top on bonding, bridge on top on bonding, ...

I see your point. Makes sense to me. But the loop would have to include
at least processing of ptype_all too. I'm going to cook a follow-up
patch.

>
>That way, we can probably keep the list of crossed devices inside a
>local array, and call deliver_skb() with the current "orig_dev" when
>appropriate. No need to overload sk_buff nor to use a global
>variable.
>
>Of course, this might be a very simplistic view.
>
>Any comments?
>
>	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19 11:08                     ` Jiri Pirko
@ 2011-02-19 11:28                       ` Jiri Pirko
  2011-02-19 13:18                         ` Nicolas de Pesloüan
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-19 11:28 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy

Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>This patch converts bonding to use rx_handler. Results in cleaner
>>>__netif_receive_skb() with much less exceptions needed. Also
>>>bond-specific work is moved into bond code.
>>>
>>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>
>>>v1->v2:
>>>         using skb_iif instead of new input_dev to remember original
>>>	device
>>>v2->v3:
>>>	set orig_dev = skb->dev if skb_iif is set
>>>
>>
>>Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>
>>Bonding used to be handled with very few overhead, simply replacing
>>skb->dev with skb->dev->master. Time has passed and we eventually
>>added many special processing for bonding into __netif_receive_skb(),
>>but the overhead remained very light.
>>
>>Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>
>>Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>whatever need to be delivered, to whoever need, inside the loop ?
>>
>>rx_handler = rcu_dereference(skb->dev->rx_handler);
>>while (rx_handler) {
>>	/* ...  */
>>	orig_dev = skb->dev;
>>	skb = rx_handler(skb);
>>	/* ... */
>>	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>}
>>
>>This would reduce the overhead, while still allowing nesting: vlan on
>>top on bonding, bridge on top on bonding, ...
>
>I see your point. Makes sense to me. But the loop would have to include
>at least processing of ptype_all too. I'm going to cook a follow-up
>patch.
>

DRAFT (doesn't modify rx_handlers):

diff --git a/net/core/dev.c b/net/core/dev.c
index 4ebf7fe..e5dba47 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
+	struct net_device *dev;
 	struct net_device *orig_dev;
 	struct net_device *null_or_dev;
 	int ret = NET_RX_DROP;
@@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (netpoll_receive_skb(skb))
 		return NET_RX_DROP;
 
-	__this_cpu_inc(softnet_data.processed);
+	skb->skb_iif = skb->dev->ifindex;
+	orig_dev = skb->dev;
+
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
 	skb->mac_len = skb->network_header - skb->mac_header;
@@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 	rcu_read_lock();
 
-	if (!skb->skb_iif) {
-		skb->skb_iif = skb->dev->ifindex;
-		orig_dev = skb->dev;
-	} else {
-		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
-	}
+another_round:
+	__this_cpu_inc(softnet_data.processed);
+	dev = skb->dev;
 
 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
@@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 #endif
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (!ptype->dev || ptype->dev == skb->dev) {
+		if (!ptype->dev || ptype->dev == dev) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
@@ -3167,7 +3167,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-	rx_handler = rcu_dereference(skb->dev->rx_handler);
+	rx_handler = rcu_dereference(dev->rx_handler);
 	if (rx_handler) {
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
@@ -3176,6 +3176,8 @@ ncls:
 		skb = rx_handler(skb);
 		if (!skb)
 			goto out;
+		if (dev != skb->dev)
+			goto another_round;
 	}
 
 	if (vlan_tx_tag_present(skb)) {

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19 11:28                       ` Jiri Pirko
@ 2011-02-19 13:18                         ` Nicolas de Pesloüan
  2011-02-19 13:46                           ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-19 13:18 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy

Le 19/02/2011 12:28, Jiri Pirko a écrit :
> Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>> Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>> Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>> This patch converts bonding to use rx_handler. Results in cleaner
>>>> __netif_receive_skb() with much less exceptions needed. Also
>>>> bond-specific work is moved into bond code.
>>>>
>>>> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>
>>>> v1->v2:
>>>>          using skb_iif instead of new input_dev to remember original
>>>> 	device
>>>> v2->v3:
>>>> 	set orig_dev = skb->dev if skb_iif is set
>>>>
>>>
>>> Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>
>>> Bonding used to be handled with very few overhead, simply replacing
>>> skb->dev with skb->dev->master. Time has passed and we eventually
>>> added many special processing for bonding into __netif_receive_skb(),
>>> but the overhead remained very light.
>>>
>>> Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>
>>> Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>> whatever need to be delivered, to whoever need, inside the loop ?
>>>
>>> rx_handler = rcu_dereference(skb->dev->rx_handler);
>>> while (rx_handler) {
>>> 	/* ...  */
>>> 	orig_dev = skb->dev;
>>> 	skb = rx_handler(skb);
>>> 	/* ... */
>>> 	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>> }
>>>
>>> This would reduce the overhead, while still allowing nesting: vlan on
>>> top on bonding, bridge on top on bonding, ...
>>
>> I see your point. Makes sense to me. But the loop would have to include
>> at least processing of ptype_all too. I'm going to cook a follow-up
>> patch.
>>
>
> DRAFT (doesn't modify rx_handlers):
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 4ebf7fe..e5dba47 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>   {
>   	struct packet_type *ptype, *pt_prev;
>   	rx_handler_func_t *rx_handler;
> +	struct net_device *dev;
>   	struct net_device *orig_dev;
>   	struct net_device *null_or_dev;
>   	int ret = NET_RX_DROP;
> @@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>   	if (netpoll_receive_skb(skb))
>   		return NET_RX_DROP;
>
> -	__this_cpu_inc(softnet_data.processed);
> +	skb->skb_iif = skb->dev->ifindex;
> +	orig_dev = skb->dev;

orig_dev should be set inside the loop, to reflect "previously crossed device", while following the 
path:

eth0 -> bond0 -> br0.

First step inside loop:

orig_dev = eth0
skb->dev = bond0 (at the end of the loop).

Second step inside loop:

orig_dev = bond0
skb->dev = br0 (et the end of the loop).

This would allow for exact match delivery to bond0 if someone bind there.

> +
>   	skb_reset_network_header(skb);
>   	skb_reset_transport_header(skb);
>   	skb->mac_len = skb->network_header - skb->mac_header;
> @@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>
>   	rcu_read_lock();
>
> -	if (!skb->skb_iif) {
> -		skb->skb_iif = skb->dev->ifindex;
> -		orig_dev = skb->dev;
> -	} else {
> -		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
> -	}

I like the fact that it removes the above part.

> +another_round:
> +	__this_cpu_inc(softnet_data.processed);
> +	dev = skb->dev;
>
>   #ifdef CONFIG_NET_CLS_ACT
>   	if (skb->tc_verd&  TC_NCLS) {
> @@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>   #endif
>
>   	list_for_each_entry_rcu(ptype,&ptype_all, list) {
> -		if (!ptype->dev || ptype->dev == skb->dev) {
> +		if (!ptype->dev || ptype->dev == dev) {
>   			if (pt_prev)
>   				ret = deliver_skb(skb, pt_prev, orig_dev);
>   			pt_prev = ptype;

Inside the loop, we should only do exact match delivery, for &ptype_all and for 
&ptype_base[ntohs(type) & PTYPE_HASH_MASK]:

         list_for_each_entry_rcu(ptype, &ptype_all, list) {
-               if (!ptype->dev || ptype->dev == dev) {
+               if (ptype->dev == dev) {
                         if (pt_prev)
                                 ret = deliver_skb(skb, pt_prev, orig_dev);
                         pt_prev = ptype;
                 }
         }


         list_for_each_entry_rcu(ptype,
                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                 if (ptype->type == type &&
-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
+                   (ptype->dev == skb->dev)) {
                         if (pt_prev)
                                 ret = deliver_skb(skb, pt_prev, orig_dev);
                         pt_prev = ptype;
                 }
         }

After leaving the loop, we can do wilcard delivery, if skb is not NULL.

         list_for_each_entry_rcu(ptype, &ptype_all, list) {
-               if (!ptype->dev || ptype->dev == dev) {
+               if (!ptype->dev) {
                         if (pt_prev)
                                 ret = deliver_skb(skb, pt_prev, orig_dev);
                         pt_prev = ptype;
                }
         }


         list_for_each_entry_rcu(ptype,
                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-               if (ptype->type == type &&
-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
+		if (ptype->type == type && !ptype->dev) {
                         if (pt_prev)
                                 ret = deliver_skb(skb, pt_prev, orig_dev);
                         pt_prev = ptype;
                 }
         }

This would reduce the number of tests inside the list_for_each_entry_rcu() loops. And because we 
match only ptype->dev == dev inside the loop and !ptype->dev outside the loop, this should avoid 
duplicate delivery.

Also, for performance reason, exact match protocol handler lists might be moved from ptype_base or 
ptype_all to a per net_device list. That way, the list_for_each_entry_rcu() inside the loop could be 
empty if no protocol handler bind on the current dev.

inside loop:

         list_for_each_entry_rcu(ptype, dev->ptype_all, list) {
                 if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
                 pt_prev = ptype;
         }

         list_for_each_entry_rcu(ptype,
                         dev->ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                 if (ptype->type == type) {
                         if (pt_prev)
                                 ret = deliver_skb(skb, pt_prev, orig_dev);
                         pt_prev = ptype;
                 }
         }

Outside loop :

         list_for_each_entry_rcu(ptype, &ptype_all, list) {
                 if (pt_prev)
                         ret = deliver_skb(skb, pt_prev, orig_dev);
                 pt_prev = ptype;
         }


         list_for_each_entry_rcu(ptype,
                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type) {
                         if (pt_prev)
                                 ret = deliver_skb(skb, pt_prev, orig_dev);
                         pt_prev = ptype;
                 }
         }

This would require several changes into ptype_all and ptype_base handling, but should be faster.

> @@ -3167,7 +3167,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>   ncls:
>   #endif
>
> -	rx_handler = rcu_dereference(skb->dev->rx_handler);
> +	rx_handler = rcu_dereference(dev->rx_handler);
>   	if (rx_handler) {
>   		if (pt_prev) {
>   			ret = deliver_skb(skb, pt_prev, orig_dev);
> @@ -3176,6 +3176,8 @@ ncls:
>   		skb = rx_handler(skb);
>   		if (!skb)
>   			goto out;
> +		if (dev != skb->dev)

I would use "if (skb->dev != dev)" for clarity, because skb->dev is expected to have changed, not dev.

> +			goto another_round;
>   	}
>
>   	if (vlan_tx_tag_present(skb)) {
>

	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19 13:18                         ` Nicolas de Pesloüan
@ 2011-02-19 13:46                           ` Jiri Pirko
  2011-02-19 14:32                             ` Nicolas de Pesloüan
  2011-02-19 20:27                             ` Nicolas de Pesloüan
  0 siblings, 2 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-19 13:46 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy

Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>This patch converts bonding to use rx_handler. Results in cleaner
>>>>>__netif_receive_skb() with much less exceptions needed. Also
>>>>>bond-specific work is moved into bond code.
>>>>>
>>>>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>
>>>>>v1->v2:
>>>>>         using skb_iif instead of new input_dev to remember original
>>>>>	device
>>>>>v2->v3:
>>>>>	set orig_dev = skb->dev if skb_iif is set
>>>>>
>>>>
>>>>Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>
>>>>Bonding used to be handled with very few overhead, simply replacing
>>>>skb->dev with skb->dev->master. Time has passed and we eventually
>>>>added many special processing for bonding into __netif_receive_skb(),
>>>>but the overhead remained very light.
>>>>
>>>>Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>
>>>>Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>whatever need to be delivered, to whoever need, inside the loop ?
>>>>
>>>>rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>while (rx_handler) {
>>>>	/* ...  */
>>>>	orig_dev = skb->dev;
>>>>	skb = rx_handler(skb);
>>>>	/* ... */
>>>>	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>}
>>>>
>>>>This would reduce the overhead, while still allowing nesting: vlan on
>>>>top on bonding, bridge on top on bonding, ...
>>>
>>>I see your point. Makes sense to me. But the loop would have to include
>>>at least processing of ptype_all too. I'm going to cook a follow-up
>>>patch.
>>>
>>
>>DRAFT (doesn't modify rx_handlers):
>>
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index 4ebf7fe..e5dba47 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>  {
>>  	struct packet_type *ptype, *pt_prev;
>>  	rx_handler_func_t *rx_handler;
>>+	struct net_device *dev;
>>  	struct net_device *orig_dev;
>>  	struct net_device *null_or_dev;
>>  	int ret = NET_RX_DROP;
>>@@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>  	if (netpoll_receive_skb(skb))
>>  		return NET_RX_DROP;
>>
>>-	__this_cpu_inc(softnet_data.processed);
>>+	skb->skb_iif = skb->dev->ifindex;
>>+	orig_dev = skb->dev;
>
>orig_dev should be set inside the loop, to reflect "previously
>crossed device", while following the path:
>
>eth0 -> bond0 -> br0.
>
>First step inside loop:
>
>orig_dev = eth0
>skb->dev = bond0 (at the end of the loop).
>
>Second step inside loop:
>
>orig_dev = bond0
>skb->dev = br0 (et the end of the loop).
>
>This would allow for exact match delivery to bond0 if someone bind there.
>
>>+
>>  	skb_reset_network_header(skb);
>>  	skb_reset_transport_header(skb);
>>  	skb->mac_len = skb->network_header - skb->mac_header;
>>@@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>
>>  	rcu_read_lock();
>>
>>-	if (!skb->skb_iif) {
>>-		skb->skb_iif = skb->dev->ifindex;
>>-		orig_dev = skb->dev;
>>-	} else {
>>-		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>-	}
>
>I like the fact that it removes the above part.
>
>>+another_round:
>>+	__this_cpu_inc(softnet_data.processed);
>>+	dev = skb->dev;
>>
>>  #ifdef CONFIG_NET_CLS_ACT
>>  	if (skb->tc_verd&  TC_NCLS) {
>>@@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>  #endif
>>
>>  	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>-		if (!ptype->dev || ptype->dev == skb->dev) {
>>+		if (!ptype->dev || ptype->dev == dev) {
>>  			if (pt_prev)
>>  				ret = deliver_skb(skb, pt_prev, orig_dev);
>>  			pt_prev = ptype;
>
>Inside the loop, we should only do exact match delivery, for
>&ptype_all and for &ptype_base[ntohs(type) & PTYPE_HASH_MASK]:
>
>        list_for_each_entry_rcu(ptype, &ptype_all, list) {
>-               if (!ptype->dev || ptype->dev == dev) {
>+               if (ptype->dev == dev) {
>                        if (pt_prev)
>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>                        pt_prev = ptype;
>                }
>        }
>
>
>        list_for_each_entry_rcu(ptype,
>                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
>                if (ptype->type == type &&
>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>+                   (ptype->dev == skb->dev)) {
>                        if (pt_prev)
>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>                        pt_prev = ptype;
>                }
>        }
>
>After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>
>        list_for_each_entry_rcu(ptype, &ptype_all, list) {
>-               if (!ptype->dev || ptype->dev == dev) {
>+               if (!ptype->dev) {
>                        if (pt_prev)
>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>                        pt_prev = ptype;
>               }
>        }
>
>
>        list_for_each_entry_rcu(ptype,
>                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
>-               if (ptype->type == type &&
>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>+		if (ptype->type == type && !ptype->dev) {
>                        if (pt_prev)
>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>                        pt_prev = ptype;
>                }
>        }
>
>This would reduce the number of tests inside the
>list_for_each_entry_rcu() loops. And because we match only ptype->dev
>== dev inside the loop and !ptype->dev outside the loop, this should
>avoid duplicate delivery.

Would you care to put this into patch so I can see the whole picture?
Thanks.


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19 13:46                           ` Jiri Pirko
@ 2011-02-19 14:32                             ` Nicolas de Pesloüan
  2011-02-19 20:27                             ` Nicolas de Pesloüan
  1 sibling, 0 replies; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-19 14:32 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy

Le 19/02/2011 14:46, Jiri Pirko a écrit :
> Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:

[snip]
>> Inside the loop, we should only do exact match delivery, for
>> &ptype_all and for&ptype_base[ntohs(type)&  PTYPE_HASH_MASK]:
>>
>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> -               if (!ptype->dev || ptype->dev == dev) {
>> +               if (ptype->dev == dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>>
>>         list_for_each_entry_rcu(ptype,
>>                         &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>                 if (ptype->type == type&&
>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> +                   (ptype->dev == skb->dev)) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>> After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>
>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> -               if (!ptype->dev || ptype->dev == dev) {
>> +               if (!ptype->dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                }
>>         }
>>
>>
>>         list_for_each_entry_rcu(ptype,
>>                         &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>> -               if (ptype->type == type&&
>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> +		if (ptype->type == type&&  !ptype->dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>> This would reduce the number of tests inside the
>> list_for_each_entry_rcu() loops. And because we match only ptype->dev
>> == dev inside the loop and !ptype->dev outside the loop, this should
>> avoid duplicate delivery.
>
> Would you care to put this into patch so I can see the whole picture?
> Thanks.

I will try.

	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19 13:46                           ` Jiri Pirko
  2011-02-19 14:32                             ` Nicolas de Pesloüan
@ 2011-02-19 20:27                             ` Nicolas de Pesloüan
  2011-02-20 10:36                               ` Jiri Pirko
  1 sibling, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-19 20:27 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy

Le 19/02/2011 14:46, Jiri Pirko a écrit :
> Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>> Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>> Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>> Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>> Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>> This patch converts bonding to use rx_handler. Results in cleaner
>>>>>> __netif_receive_skb() with much less exceptions needed. Also
>>>>>> bond-specific work is moved into bond code.
>>>>>>
>>>>>> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>>
>>>>>> v1->v2:
>>>>>>          using skb_iif instead of new input_dev to remember original
>>>>>> 	device
>>>>>> v2->v3:
>>>>>> 	set orig_dev = skb->dev if skb_iif is set
>>>>>>
>>>>>
>>>>> Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>>
>>>>> Bonding used to be handled with very few overhead, simply replacing
>>>>> skb->dev with skb->dev->master. Time has passed and we eventually
>>>>> added many special processing for bonding into __netif_receive_skb(),
>>>>> but the overhead remained very light.
>>>>>
>>>>> Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>>
>>>>> Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>> whatever need to be delivered, to whoever need, inside the loop ?
>>>>>
>>>>> rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>> while (rx_handler) {
>>>>> 	/* ...  */
>>>>> 	orig_dev = skb->dev;
>>>>> 	skb = rx_handler(skb);
>>>>> 	/* ... */
>>>>> 	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>> }
>>>>>
>>>>> This would reduce the overhead, while still allowing nesting: vlan on
>>>>> top on bonding, bridge on top on bonding, ...
>>>>
>>>> I see your point. Makes sense to me. But the loop would have to include
>>>> at least processing of ptype_all too. I'm going to cook a follow-up
>>>> patch.
>>>>
>>>
>>> DRAFT (doesn't modify rx_handlers):
>>>
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index 4ebf7fe..e5dba47 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>   {
>>>   	struct packet_type *ptype, *pt_prev;
>>>   	rx_handler_func_t *rx_handler;
>>> +	struct net_device *dev;
>>>   	struct net_device *orig_dev;
>>>   	struct net_device *null_or_dev;
>>>   	int ret = NET_RX_DROP;
>>> @@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>   	if (netpoll_receive_skb(skb))
>>>   		return NET_RX_DROP;
>>>
>>> -	__this_cpu_inc(softnet_data.processed);
>>> +	skb->skb_iif = skb->dev->ifindex;
>>> +	orig_dev = skb->dev;
>>
>> orig_dev should be set inside the loop, to reflect "previously
>> crossed device", while following the path:
>>
>> eth0 ->  bond0 ->  br0.
>>
>> First step inside loop:
>>
>> orig_dev = eth0
>> skb->dev = bond0 (at the end of the loop).
>>
>> Second step inside loop:
>>
>> orig_dev = bond0
>> skb->dev = br0 (et the end of the loop).
>>
>> This would allow for exact match delivery to bond0 if someone bind there.
>>
>>> +
>>>   	skb_reset_network_header(skb);
>>>   	skb_reset_transport_header(skb);
>>>   	skb->mac_len = skb->network_header - skb->mac_header;
>>> @@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>
>>>   	rcu_read_lock();
>>>
>>> -	if (!skb->skb_iif) {
>>> -		skb->skb_iif = skb->dev->ifindex;
>>> -		orig_dev = skb->dev;
>>> -	} else {
>>> -		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>> -	}
>>
>> I like the fact that it removes the above part.
>>
>>> +another_round:
>>> +	__this_cpu_inc(softnet_data.processed);
>>> +	dev = skb->dev;
>>>
>>>   #ifdef CONFIG_NET_CLS_ACT
>>>   	if (skb->tc_verd&   TC_NCLS) {
>>> @@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>   #endif
>>>
>>>   	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>> -		if (!ptype->dev || ptype->dev == skb->dev) {
>>> +		if (!ptype->dev || ptype->dev == dev) {
>>>   			if (pt_prev)
>>>   				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>   			pt_prev = ptype;
>>
>> Inside the loop, we should only do exact match delivery, for
>> &ptype_all and for&ptype_base[ntohs(type)&  PTYPE_HASH_MASK]:
>>
>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> -               if (!ptype->dev || ptype->dev == dev) {
>> +               if (ptype->dev == dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>>
>>         list_for_each_entry_rcu(ptype,
>>                         &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>                 if (ptype->type == type&&
>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> +                   (ptype->dev == skb->dev)) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>> After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>
>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> -               if (!ptype->dev || ptype->dev == dev) {
>> +               if (!ptype->dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                }
>>         }
>>
>>
>>         list_for_each_entry_rcu(ptype,
>>                         &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>> -               if (ptype->type == type&&
>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> +		if (ptype->type == type&&  !ptype->dev) {
>>                         if (pt_prev)
>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>                         pt_prev = ptype;
>>                 }
>>         }
>>
>> This would reduce the number of tests inside the
>> list_for_each_entry_rcu() loops. And because we match only ptype->dev
>> == dev inside the loop and !ptype->dev outside the loop, this should
>> avoid duplicate delivery.
>
> Would you care to put this into patch so I can see the whole picture?
> Thanks.

Here is what I have in mind. It is based on your previous DRAFT patch, and don't modify rx_handlers yet.

Only compile tested !!

I don't know if every pieces are at the right place. I wonder what to do with CONFIG_NET_CLS_ACT 
part, that currently is between ptype_all and ptype_base processing.

Anyway, the general idea is there.

	Nicolas.

  net/core/dev.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
  1 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index e5dba47..7e007a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3117,7 +3117,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
  	rx_handler_func_t *rx_handler;
  	struct net_device *dev;
  	struct net_device *orig_dev;
-	struct net_device *null_or_dev;
  	int ret = NET_RX_DROP;
  	__be16 type;

@@ -3130,9 +3129,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
  	if (netpoll_receive_skb(skb))
  		return NET_RX_DROP;

-	skb->skb_iif = skb->dev->ifindex;
-	orig_dev = skb->dev;
-
  	skb_reset_network_header(skb);
  	skb_reset_transport_header(skb);
  	skb->mac_len = skb->network_header - skb->mac_header;
@@ -3143,6 +3139,8 @@ static int __netif_receive_skb(struct sk_buff *skb)

  another_round:
  	__this_cpu_inc(softnet_data.processed);
+	skb->skb_iif = skb->dev->ifindex;
+	orig_dev = skb->dev;
  	dev = skb->dev;

  #ifdef CONFIG_NET_CLS_ACT
@@ -3152,8 +3150,13 @@ another_round:
  	}
  #endif

+	/*
+	 * Deliver to ptype_all protocol handlers that match current dev.
+	 * This happens before rx_handler is given a chance to change skb->dev.
+	 */
+
  	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (!ptype->dev || ptype->dev == dev) {
+		if (ptype->dev == dev) {
  			if (pt_prev)
  				ret = deliver_skb(skb, pt_prev, orig_dev);
  			pt_prev = ptype;
@@ -3167,6 +3170,31 @@ another_round:
  ncls:
  #endif

+	/*
+	 * Deliver to ptype_base protocol handlers that match current dev.
+	 * This happens before rx_handler is given a chance to change skb->dev.
+	 */
+
+	type = skb->protocol;
+	list_for_each_entry_rcu(ptype,
+			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
+		if (ptype->type == type && ptype->dev == skb->dev) {
+			if (pt_prev)
+				ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = ptype;
+		}
+	}
+
+	/*
+	 * Call rx_handler for current device.
+	 * If rx_handler return NULL, skip wilcard protocol handler delivery.
+	 * Else, if skb->dev changed, restart the whole delivery process, to
+	 * allow for device nesting.
+	 *
+	 * Warning:
+	 * rx_handlers must kfree_skb(skb) if they return NULL.
+	 */
+
  	rx_handler = rcu_dereference(dev->rx_handler);
  	if (rx_handler) {
  		if (pt_prev) {
@@ -3176,10 +3204,15 @@ ncls:
  		skb = rx_handler(skb);
  		if (!skb)
  			goto out;
-		if (dev != skb->dev)
+		if (skb->dev != dev)
  			goto another_round;
  	}

+	/*
+	 * FIXME: The part below should use rx_handler instead of being hard
+	 * coded here.
+	 */
+
  	if (vlan_tx_tag_present(skb)) {
  		if (pt_prev) {
  			ret = deliver_skb(skb, pt_prev, orig_dev);
@@ -3192,16 +3225,33 @@ ncls:
  			goto out;
  	}

+	/*
+	 * FIXME: Can't this be moved into the rx_handler for bonding,
+	 * or into a futur rx_handler for vlan?
+	 */
+
  	vlan_on_bond_hook(skb);

-	/* deliver only exact match when indicated */
-	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
+	/*
+	 * Deliver to wildcard ptype_all protocol handlers.
+	 */
+
+	list_for_each_entry_rcu(ptype, &ptype_all, list) {
+		if (!ptype->dev) {
+			if (pt_prev)
+				ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = ptype;
+		}
+	}
+
+	/*
+	 * Deliver to wildcard ptype_all protocol handlers.
+	 */

  	type = skb->protocol;
  	list_for_each_entry_rcu(ptype,
  			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type &&
-		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
+		if (ptype->type == type && !ptype->dev) {
  			if (pt_prev)
  				ret = deliver_skb(skb, pt_prev, orig_dev);
  			pt_prev = ptype;
-- 
1.7.2.3




^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-19 20:27                             ` Nicolas de Pesloüan
@ 2011-02-20 10:36                               ` Jiri Pirko
  2011-02-20 12:12                                 ` Nicolas de Pesloüan
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-20 10:36 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy

Sat, Feb 19, 2011 at 09:27:37PM CET, nicolas.2p.debian@gmail.com wrote:
>Le 19/02/2011 14:46, Jiri Pirko a écrit :
>>Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>>>Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>>>Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>>>Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>>>Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>>>This patch converts bonding to use rx_handler. Results in cleaner
>>>>>>>__netif_receive_skb() with much less exceptions needed. Also
>>>>>>>bond-specific work is moved into bond code.
>>>>>>>
>>>>>>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>>>
>>>>>>>v1->v2:
>>>>>>>         using skb_iif instead of new input_dev to remember original
>>>>>>>	device
>>>>>>>v2->v3:
>>>>>>>	set orig_dev = skb->dev if skb_iif is set
>>>>>>>
>>>>>>
>>>>>>Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>>>
>>>>>>Bonding used to be handled with very few overhead, simply replacing
>>>>>>skb->dev with skb->dev->master. Time has passed and we eventually
>>>>>>added many special processing for bonding into __netif_receive_skb(),
>>>>>>but the overhead remained very light.
>>>>>>
>>>>>>Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>>>
>>>>>>Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>>>whatever need to be delivered, to whoever need, inside the loop ?
>>>>>>
>>>>>>rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>>>while (rx_handler) {
>>>>>>	/* ...  */
>>>>>>	orig_dev = skb->dev;
>>>>>>	skb = rx_handler(skb);
>>>>>>	/* ... */
>>>>>>	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>>>}
>>>>>>
>>>>>>This would reduce the overhead, while still allowing nesting: vlan on
>>>>>>top on bonding, bridge on top on bonding, ...
>>>>>
>>>>>I see your point. Makes sense to me. But the loop would have to include
>>>>>at least processing of ptype_all too. I'm going to cook a follow-up
>>>>>patch.
>>>>>
>>>>
>>>>DRAFT (doesn't modify rx_handlers):
>>>>
>>>>diff --git a/net/core/dev.c b/net/core/dev.c
>>>>index 4ebf7fe..e5dba47 100644
>>>>--- a/net/core/dev.c
>>>>+++ b/net/core/dev.c
>>>>@@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>  {
>>>>  	struct packet_type *ptype, *pt_prev;
>>>>  	rx_handler_func_t *rx_handler;
>>>>+	struct net_device *dev;
>>>>  	struct net_device *orig_dev;
>>>>  	struct net_device *null_or_dev;
>>>>  	int ret = NET_RX_DROP;
>>>>@@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>  	if (netpoll_receive_skb(skb))
>>>>  		return NET_RX_DROP;
>>>>
>>>>-	__this_cpu_inc(softnet_data.processed);
>>>>+	skb->skb_iif = skb->dev->ifindex;
>>>>+	orig_dev = skb->dev;
>>>
>>>orig_dev should be set inside the loop, to reflect "previously
>>>crossed device", while following the path:
>>>
>>>eth0 ->  bond0 ->  br0.
>>>
>>>First step inside loop:
>>>
>>>orig_dev = eth0
>>>skb->dev = bond0 (at the end of the loop).
>>>
>>>Second step inside loop:
>>>
>>>orig_dev = bond0
>>>skb->dev = br0 (et the end of the loop).
>>>
>>>This would allow for exact match delivery to bond0 if someone bind there.
>>>
>>>>+
>>>>  	skb_reset_network_header(skb);
>>>>  	skb_reset_transport_header(skb);
>>>>  	skb->mac_len = skb->network_header - skb->mac_header;
>>>>@@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>
>>>>  	rcu_read_lock();
>>>>
>>>>-	if (!skb->skb_iif) {
>>>>-		skb->skb_iif = skb->dev->ifindex;
>>>>-		orig_dev = skb->dev;
>>>>-	} else {
>>>>-		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>>>-	}
>>>
>>>I like the fact that it removes the above part.
>>>
>>>>+another_round:
>>>>+	__this_cpu_inc(softnet_data.processed);
>>>>+	dev = skb->dev;
>>>>
>>>>  #ifdef CONFIG_NET_CLS_ACT
>>>>  	if (skb->tc_verd&   TC_NCLS) {
>>>>@@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>  #endif
>>>>
>>>>  	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>-		if (!ptype->dev || ptype->dev == skb->dev) {
>>>>+		if (!ptype->dev || ptype->dev == dev) {
>>>>  			if (pt_prev)
>>>>  				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>  			pt_prev = ptype;
>>>
>>>Inside the loop, we should only do exact match delivery, for
>>>&ptype_all and for&ptype_base[ntohs(type)&  PTYPE_HASH_MASK]:
>>>
>>>        list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>-               if (!ptype->dev || ptype->dev == dev) {
>>>+               if (ptype->dev == dev) {
>>>                        if (pt_prev)
>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>                        pt_prev = ptype;
>>>                }
>>>        }
>>>
>>>
>>>        list_for_each_entry_rcu(ptype,
>>>                        &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>>                if (ptype->type == type&&
>>>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>+                   (ptype->dev == skb->dev)) {
>>>                        if (pt_prev)
>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>                        pt_prev = ptype;
>>>                }
>>>        }
>>>
>>>After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>>
>>>        list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>-               if (!ptype->dev || ptype->dev == dev) {
>>>+               if (!ptype->dev) {
>>>                        if (pt_prev)
>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>                        pt_prev = ptype;
>>>               }
>>>        }
>>>
>>>
>>>        list_for_each_entry_rcu(ptype,
>>>                        &ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>>-               if (ptype->type == type&&
>>>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>+		if (ptype->type == type&&  !ptype->dev) {
>>>                        if (pt_prev)
>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>                        pt_prev = ptype;
>>>                }
>>>        }
>>>
>>>This would reduce the number of tests inside the
>>>list_for_each_entry_rcu() loops. And because we match only ptype->dev
>>>== dev inside the loop and !ptype->dev outside the loop, this should
>>>avoid duplicate delivery.
>>
>>Would you care to put this into patch so I can see the whole picture?
>>Thanks.
>
>Here is what I have in mind. It is based on your previous DRAFT patch, and don't modify rx_handlers yet.
>
>Only compile tested !!
>
>I don't know if every pieces are at the right place. I wonder what to
>do with CONFIG_NET_CLS_ACT part, that currently is between ptype_all
>and ptype_base processing.
>
>Anyway, the general idea is there.
>
>	Nicolas.
>
> net/core/dev.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
> 1 files changed, 60 insertions(+), 10 deletions(-)
>
>diff --git a/net/core/dev.c b/net/core/dev.c
>index e5dba47..7e007a9 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -3117,7 +3117,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
> 	rx_handler_func_t *rx_handler;
> 	struct net_device *dev;
> 	struct net_device *orig_dev;
>-	struct net_device *null_or_dev;
> 	int ret = NET_RX_DROP;
> 	__be16 type;
>
>@@ -3130,9 +3129,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
> 	if (netpoll_receive_skb(skb))
> 		return NET_RX_DROP;
>
>-	skb->skb_iif = skb->dev->ifindex;
>-	orig_dev = skb->dev;
>-
> 	skb_reset_network_header(skb);
> 	skb_reset_transport_header(skb);
> 	skb->mac_len = skb->network_header - skb->mac_header;
>@@ -3143,6 +3139,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>
> another_round:
> 	__this_cpu_inc(softnet_data.processed);
>+	skb->skb_iif = skb->dev->ifindex;
>+	orig_dev = skb->dev;
orig_dev should be set at the end of the loop. Now you are going to have
it always the same as dev and skb->dev.

> 	dev = skb->dev;
>
> #ifdef CONFIG_NET_CLS_ACT
>@@ -3152,8 +3150,13 @@ another_round:
> 	}
> #endif
>
>+	/*
>+	 * Deliver to ptype_all protocol handlers that match current dev.
>+	 * This happens before rx_handler is given a chance to change skb->dev.
>+	 */
>+
> 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>-		if (!ptype->dev || ptype->dev == dev) {
>+		if (ptype->dev == dev) {
> 			if (pt_prev)
> 				ret = deliver_skb(skb, pt_prev, orig_dev);
> 			pt_prev = ptype;
>@@ -3167,6 +3170,31 @@ another_round:
> ncls:
> #endif
>
>+	/*
>+	 * Deliver to ptype_base protocol handlers that match current dev.
>+	 * This happens before rx_handler is given a chance to change skb->dev.
>+	 */
>+
>+	type = skb->protocol;
>+	list_for_each_entry_rcu(ptype,
>+			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
>+		if (ptype->type == type && ptype->dev == skb->dev) {
>+			if (pt_prev)
>+				ret = deliver_skb(skb, pt_prev, orig_dev);
>+			pt_prev = ptype;
>+		}
>+	}

I'm not sure it is ok to deliver ptype_base here. See comment above
ptype_head() (I'm not sure I understand that correctly)

>+
>+	/*
>+	 * Call rx_handler for current device.
>+	 * If rx_handler return NULL, skip wilcard protocol handler delivery.
>+	 * Else, if skb->dev changed, restart the whole delivery process, to
>+	 * allow for device nesting.
>+	 *
>+	 * Warning:
>+	 * rx_handlers must kfree_skb(skb) if they return NULL.
Well this is not true. They can return NULL and call netif_rx as they
have before. No changes necessary I believe.

>+	 */
>+
> 	rx_handler = rcu_dereference(dev->rx_handler);
> 	if (rx_handler) {
> 		if (pt_prev) {
>@@ -3176,10 +3204,15 @@ ncls:
> 		skb = rx_handler(skb);
> 		if (!skb)
> 			goto out;
>-		if (dev != skb->dev)
>+		if (skb->dev != dev)
> 			goto another_round;
> 	}
>
>+	/*
>+	 * FIXME: The part below should use rx_handler instead of being hard
>+	 * coded here.
I'm not sure it is doable atm. For bridge and bond it should not be a
problem, but for macvlan, there is possible to have macvlans and vlans
on the same dev. This possibility should persist.
/me scratches head on the idea to have multiple rx_handlers although it
was his original idea....


>+	 */
>+
> 	if (vlan_tx_tag_present(skb)) {
> 		if (pt_prev) {
> 			ret = deliver_skb(skb, pt_prev, orig_dev);
>@@ -3192,16 +3225,33 @@ ncls:
> 			goto out;
> 	}
>
>+	/*
>+	 * FIXME: Can't this be moved into the rx_handler for bonding,
>+	 * or into a futur rx_handler for vlan?
This hook is something I do not like at all :/ But anyway if should be in vlan
part I think.

>+	 */
>+
> 	vlan_on_bond_hook(skb);
>
>-	/* deliver only exact match when indicated */
>-	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
>+	/*
>+	 * Deliver to wildcard ptype_all protocol handlers.
>+	 */
>+
>+	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>+		if (!ptype->dev) {
>+			if (pt_prev)
>+				ret = deliver_skb(skb, pt_prev, orig_dev);
>+			pt_prev = ptype;
>+		}
>+	}
>+
>+	/*
>+	 * Deliver to wildcard ptype_all protocol handlers.
>+	 */
>
> 	type = skb->protocol;
> 	list_for_each_entry_rcu(ptype,
> 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
>-		if (ptype->type == type &&
>-		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>+		if (ptype->type == type && !ptype->dev) {
> 			if (pt_prev)
> 				ret = deliver_skb(skb, pt_prev, orig_dev);
> 			pt_prev = ptype;
>-- 
>1.7.2.3
>
>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-20 10:36                               ` Jiri Pirko
@ 2011-02-20 12:12                                 ` Nicolas de Pesloüan
  2011-02-20 15:07                                   ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-20 12:12 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy, Fischer, Anna

Le 20/02/2011 11:36, Jiri Pirko a écrit :
> Sat, Feb 19, 2011 at 09:27:37PM CET, nicolas.2p.debian@gmail.com wrote:
>> Le 19/02/2011 14:46, Jiri Pirko a écrit :
>>> Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>>>> Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>>>> Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>>>> Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>>>> Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>>>> This patch converts bonding to use rx_handler. Results in cleaner
>>>>>>>> __netif_receive_skb() with much less exceptions needed. Also
>>>>>>>> bond-specific work is moved into bond code.
>>>>>>>>
>>>>>>>> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>>>>
>>>>>>>> v1->v2:
>>>>>>>>          using skb_iif instead of new input_dev to remember original
>>>>>>>> 	device
>>>>>>>> v2->v3:
>>>>>>>> 	set orig_dev = skb->dev if skb_iif is set
>>>>>>>>
>>>>>>>
>>>>>>> Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>>>>
>>>>>>> Bonding used to be handled with very few overhead, simply replacing
>>>>>>> skb->dev with skb->dev->master. Time has passed and we eventually
>>>>>>> added many special processing for bonding into __netif_receive_skb(),
>>>>>>> but the overhead remained very light.
>>>>>>>
>>>>>>> Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>>>>
>>>>>>> Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>>>> whatever need to be delivered, to whoever need, inside the loop ?
>>>>>>>
>>>>>>> rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>>>> while (rx_handler) {
>>>>>>> 	/* ...  */
>>>>>>> 	orig_dev = skb->dev;
>>>>>>> 	skb = rx_handler(skb);
>>>>>>> 	/* ... */
>>>>>>> 	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>>>> }
>>>>>>>
>>>>>>> This would reduce the overhead, while still allowing nesting: vlan on
>>>>>>> top on bonding, bridge on top on bonding, ...
>>>>>>
>>>>>> I see your point. Makes sense to me. But the loop would have to include
>>>>>> at least processing of ptype_all too. I'm going to cook a follow-up
>>>>>> patch.
>>>>>>
>>>>>
>>>>> DRAFT (doesn't modify rx_handlers):
>>>>>
>>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>>> index 4ebf7fe..e5dba47 100644
>>>>> --- a/net/core/dev.c
>>>>> +++ b/net/core/dev.c
>>>>> @@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>   {
>>>>>   	struct packet_type *ptype, *pt_prev;
>>>>>   	rx_handler_func_t *rx_handler;
>>>>> +	struct net_device *dev;
>>>>>   	struct net_device *orig_dev;
>>>>>   	struct net_device *null_or_dev;
>>>>>   	int ret = NET_RX_DROP;
>>>>> @@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>   	if (netpoll_receive_skb(skb))
>>>>>   		return NET_RX_DROP;
>>>>>
>>>>> -	__this_cpu_inc(softnet_data.processed);
>>>>> +	skb->skb_iif = skb->dev->ifindex;
>>>>> +	orig_dev = skb->dev;
>>>>
>>>> orig_dev should be set inside the loop, to reflect "previously
>>>> crossed device", while following the path:
>>>>
>>>> eth0 ->   bond0 ->   br0.
>>>>
>>>> First step inside loop:
>>>>
>>>> orig_dev = eth0
>>>> skb->dev = bond0 (at the end of the loop).
>>>>
>>>> Second step inside loop:
>>>>
>>>> orig_dev = bond0
>>>> skb->dev = br0 (et the end of the loop).
>>>>
>>>> This would allow for exact match delivery to bond0 if someone bind there.
>>>>
>>>>> +
>>>>>   	skb_reset_network_header(skb);
>>>>>   	skb_reset_transport_header(skb);
>>>>>   	skb->mac_len = skb->network_header - skb->mac_header;
>>>>> @@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>
>>>>>   	rcu_read_lock();
>>>>>
>>>>> -	if (!skb->skb_iif) {
>>>>> -		skb->skb_iif = skb->dev->ifindex;
>>>>> -		orig_dev = skb->dev;
>>>>> -	} else {
>>>>> -		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>>>> -	}
>>>>
>>>> I like the fact that it removes the above part.
>>>>
>>>>> +another_round:
>>>>> +	__this_cpu_inc(softnet_data.processed);
>>>>> +	dev = skb->dev;
>>>>>
>>>>>   #ifdef CONFIG_NET_CLS_ACT
>>>>>   	if (skb->tc_verd&    TC_NCLS) {
>>>>> @@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>   #endif
>>>>>
>>>>>   	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>> -		if (!ptype->dev || ptype->dev == skb->dev) {
>>>>> +		if (!ptype->dev || ptype->dev == dev) {
>>>>>   			if (pt_prev)
>>>>>   				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>   			pt_prev = ptype;
>>>>
>>>> Inside the loop, we should only do exact match delivery, for
>>>> &ptype_all and for&ptype_base[ntohs(type)&   PTYPE_HASH_MASK]:
>>>>
>>>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>> -               if (!ptype->dev || ptype->dev == dev) {
>>>> +               if (ptype->dev == dev) {
>>>>                         if (pt_prev)
>>>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>                         pt_prev = ptype;
>>>>                 }
>>>>         }
>>>>
>>>>
>>>>         list_for_each_entry_rcu(ptype,
>>>>                         &ptype_base[ntohs(type)&   PTYPE_HASH_MASK], list) {
>>>>                 if (ptype->type == type&&
>>>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>> +                   (ptype->dev == skb->dev)) {
>>>>                         if (pt_prev)
>>>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>                         pt_prev = ptype;
>>>>                 }
>>>>         }
>>>>
>>>> After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>>>
>>>>         list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>> -               if (!ptype->dev || ptype->dev == dev) {
>>>> +               if (!ptype->dev) {
>>>>                         if (pt_prev)
>>>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>                         pt_prev = ptype;
>>>>                }
>>>>         }
>>>>
>>>>
>>>>         list_for_each_entry_rcu(ptype,
>>>>                         &ptype_base[ntohs(type)&   PTYPE_HASH_MASK], list) {
>>>> -               if (ptype->type == type&&
>>>> -                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>> +		if (ptype->type == type&&   !ptype->dev) {
>>>>                         if (pt_prev)
>>>>                                 ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>                         pt_prev = ptype;
>>>>                 }
>>>>         }
>>>>
>>>> This would reduce the number of tests inside the
>>>> list_for_each_entry_rcu() loops. And because we match only ptype->dev
>>>> == dev inside the loop and !ptype->dev outside the loop, this should
>>>> avoid duplicate delivery.
>>>
>>> Would you care to put this into patch so I can see the whole picture?
>>> Thanks.
>>
>> Here is what I have in mind. It is based on your previous DRAFT patch, and don't modify rx_handlers yet.
>>
>> Only compile tested !!
>>
>> I don't know if every pieces are at the right place. I wonder what to
>> do with CONFIG_NET_CLS_ACT part, that currently is between ptype_all
>> and ptype_base processing.
>>
>> Anyway, the general idea is there.
>>
>> 	Nicolas.
>>
>> net/core/dev.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
>> 1 files changed, 60 insertions(+), 10 deletions(-)
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index e5dba47..7e007a9 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3117,7 +3117,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>> 	rx_handler_func_t *rx_handler;
>> 	struct net_device *dev;
>> 	struct net_device *orig_dev;
>> -	struct net_device *null_or_dev;
>> 	int ret = NET_RX_DROP;
>> 	__be16 type;
>>
>> @@ -3130,9 +3129,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>> 	if (netpoll_receive_skb(skb))
>> 		return NET_RX_DROP;
>>
>> -	skb->skb_iif = skb->dev->ifindex;
>> -	orig_dev = skb->dev;
>> -
>> 	skb_reset_network_header(skb);
>> 	skb_reset_transport_header(skb);
>> 	skb->mac_len = skb->network_header - skb->mac_header;
>> @@ -3143,6 +3139,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>
>> another_round:
>> 	__this_cpu_inc(softnet_data.processed);
>> +	skb->skb_iif = skb->dev->ifindex;
>> +	orig_dev = skb->dev;
> orig_dev should be set at the end of the loop. Now you are going to have
> it always the same as dev and skb->dev.
>

Yes, you are right.

I thinking about all this, I wonder what the protocol handlers expect as the orig_dev value ?

Lest imagine the following configuration: eth0 -> bond0 -> br0.

What does a protocol handler listening on br0 expect for orig_dev ? bond0 or eth0 ? Current 
implementation give eth0, but I think bond0 should be the right value, for proper nesting.

>> 	dev = skb->dev;
>>
>> #ifdef CONFIG_NET_CLS_ACT
>> @@ -3152,8 +3150,13 @@ another_round:
>> 	}
>> #endif
>>
>> +	/*
>> +	 * Deliver to ptype_all protocol handlers that match current dev.
>> +	 * This happens before rx_handler is given a chance to change skb->dev.
>> +	 */
>> +
>> 	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> -		if (!ptype->dev || ptype->dev == dev) {
>> +		if (ptype->dev == dev) {
>> 			if (pt_prev)
>> 				ret = deliver_skb(skb, pt_prev, orig_dev);
>> 			pt_prev = ptype;
>> @@ -3167,6 +3170,31 @@ another_round:
>> ncls:
>> #endif
>>
>> +	/*
>> +	 * Deliver to ptype_base protocol handlers that match current dev.
>> +	 * This happens before rx_handler is given a chance to change skb->dev.
>> +	 */
>> +
>> +	type = skb->protocol;
>> +	list_for_each_entry_rcu(ptype,
>> +			&ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>> +		if (ptype->type == type&&  ptype->dev == skb->dev) {
>> +			if (pt_prev)
>> +				ret = deliver_skb(skb, pt_prev, orig_dev);
>> +			pt_prev = ptype;
>> +		}
>> +	}
>
> I'm not sure it is ok to deliver ptype_base here. See comment above
> ptype_head() (I'm not sure I understand that correctly)

Anyway, all this is probably plain wrong: Delivering the skb to protocol handlers while still 
changing the skb is guaranteed to cause strange behaviors.

If we want to be able to deliver the skb to different protocol handlers and give all of them the 
right values for dev->skb and orig_dev (or previous_dev), we might end up with copying the skb. I 
hate the idea, but currently can't find a cleaner way to do so.

We first need to clarify what orig_dev should be, as stated above.

>> +
>> +	/*
>> +	 * Call rx_handler for current device.
>> +	 * If rx_handler return NULL, skip wilcard protocol handler delivery.
>> +	 * Else, if skb->dev changed, restart the whole delivery process, to
>> +	 * allow for device nesting.
>> +	 *
>> +	 * Warning:
>> +	 * rx_handlers must kfree_skb(skb) if they return NULL.
> Well this is not true. They can return NULL and call netif_rx as they
> have before. No changes necessary I believe.

I don't really know. This needs to be double checked, anyway.

>> +	 */
>> +
>> 	rx_handler = rcu_dereference(dev->rx_handler);
>> 	if (rx_handler) {
>> 		if (pt_prev) {
>> @@ -3176,10 +3204,15 @@ ncls:
>> 		skb = rx_handler(skb);
>> 		if (!skb)
>> 			goto out;
>> -		if (dev != skb->dev)
>> +		if (skb->dev != dev)
>> 			goto another_round;
>> 	}
>>
>> +	/*
>> +	 * FIXME: The part below should use rx_handler instead of being hard
>> +	 * coded here.
> I'm not sure it is doable atm. For bridge and bond it should not be a
> problem, but for macvlan, there is possible to have macvlans and vlans
> on the same dev. This possibility should persist.
> /me scratches head on the idea to have multiple rx_handlers although it
> was his original idea....

I think your original proposal of having several rx_handlers per device was right.

At the time you introduced the rx_handler system, only bridge and macvlan used it. Even if using 
bridge and macvlan on the same base device might be useless, this is not true for every possible 
rx_handler configuration.

Now that we want to move bonding and vlan to the rx_handler system, it becomes obvious that we need 
several rx_handlers per device. At least, vlan should properly mix with bridge. And who know what 
would be the fifth rx_handler...

>> +	 */
>> +
>> 	if (vlan_tx_tag_present(skb)) {
>> 		if (pt_prev) {
>> 			ret = deliver_skb(skb, pt_prev, orig_dev);
>> @@ -3192,16 +3225,33 @@ ncls:
>> 			goto out;
>> 	}
>>
>> +	/*
>> +	 * FIXME: Can't this be moved into the rx_handler for bonding,
>> +	 * or into a futur rx_handler for vlan?
> This hook is something I do not like at all :/ But anyway if should be in vlan
> part I think.

Yes, and in order for the future rx_handler for vlan to properly handle it, it needs to know the 
device just below it, not the pure original device. Hence, my question about the exact meaning of 
orig_dev...

	Nicolas.

>> +	 */
>> +
>> 	vlan_on_bond_hook(skb);
>>
>> -	/* deliver only exact match when indicated */
>> -	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
>> +	/*
>> +	 * Deliver to wildcard ptype_all protocol handlers.
>> +	 */
>> +
>> +	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>> +		if (!ptype->dev) {
>> +			if (pt_prev)
>> +				ret = deliver_skb(skb, pt_prev, orig_dev);
>> +			pt_prev = ptype;
>> +		}
>> +	}
>> +
>> +	/*
>> +	 * Deliver to wildcard ptype_all protocol handlers.
>> +	 */
>>
>> 	type = skb->protocol;
>> 	list_for_each_entry_rcu(ptype,
>> 			&ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>> -		if (ptype->type == type&&
>> -		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>> +		if (ptype->type == type&&  !ptype->dev) {
>> 			if (pt_prev)
>> 				ret = deliver_skb(skb, pt_prev, orig_dev);
>> 			pt_prev = ptype;
>> --
>> 1.7.2.3
>>
>>
>>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-20 12:12                                 ` Nicolas de Pesloüan
@ 2011-02-20 15:07                                   ` Jiri Pirko
  2011-02-21 23:20                                     ` Nicolas de Pesloüan
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-20 15:07 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy, Fischer, Anna

Sun, Feb 20, 2011 at 01:12:01PM CET, nicolas.2p.debian@gmail.com wrote:
>Le 20/02/2011 11:36, Jiri Pirko a écrit :
>>Sat, Feb 19, 2011 at 09:27:37PM CET, nicolas.2p.debian@gmail.com wrote:
>>>Le 19/02/2011 14:46, Jiri Pirko a écrit :
>>>>Sat, Feb 19, 2011 at 02:18:00PM CET, nicolas.2p.debian@gmail.com wrote:
>>>>>Le 19/02/2011 12:28, Jiri Pirko a écrit :
>>>>>>Sat, Feb 19, 2011 at 12:08:31PM CET, jpirko@redhat.com wrote:
>>>>>>>Sat, Feb 19, 2011 at 11:56:23AM CET, nicolas.2p.debian@gmail.com wrote:
>>>>>>>>Le 19/02/2011 09:05, Jiri Pirko a écrit :
>>>>>>>>>This patch converts bonding to use rx_handler. Results in cleaner
>>>>>>>>>__netif_receive_skb() with much less exceptions needed. Also
>>>>>>>>>bond-specific work is moved into bond code.
>>>>>>>>>
>>>>>>>>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>>>>>>
>>>>>>>>>v1->v2:
>>>>>>>>>         using skb_iif instead of new input_dev to remember original
>>>>>>>>>	device
>>>>>>>>>v2->v3:
>>>>>>>>>	set orig_dev = skb->dev if skb_iif is set
>>>>>>>>>
>>>>>>>>
>>>>>>>>Why do we need to let the rx_handlers call netif_rx() or __netif_receive_skb()?
>>>>>>>>
>>>>>>>>Bonding used to be handled with very few overhead, simply replacing
>>>>>>>>skb->dev with skb->dev->master. Time has passed and we eventually
>>>>>>>>added many special processing for bonding into __netif_receive_skb(),
>>>>>>>>but the overhead remained very light.
>>>>>>>>
>>>>>>>>Calling netif_rx() (or __netif_receive_skb()) to allow nesting would probably lead to some overhead.
>>>>>>>>
>>>>>>>>Can't we, instead, loop inside __netif_receive_skb(), and deliver
>>>>>>>>whatever need to be delivered, to whoever need, inside the loop ?
>>>>>>>>
>>>>>>>>rx_handler = rcu_dereference(skb->dev->rx_handler);
>>>>>>>>while (rx_handler) {
>>>>>>>>	/* ...  */
>>>>>>>>	orig_dev = skb->dev;
>>>>>>>>	skb = rx_handler(skb);
>>>>>>>>	/* ... */
>>>>>>>>	rx_handler = (skb->dev != orig_dev) ? rcu_dereference(skb->dev->rx_handler) : NULL;
>>>>>>>>}
>>>>>>>>
>>>>>>>>This would reduce the overhead, while still allowing nesting: vlan on
>>>>>>>>top on bonding, bridge on top on bonding, ...
>>>>>>>
>>>>>>>I see your point. Makes sense to me. But the loop would have to include
>>>>>>>at least processing of ptype_all too. I'm going to cook a follow-up
>>>>>>>patch.
>>>>>>>
>>>>>>
>>>>>>DRAFT (doesn't modify rx_handlers):
>>>>>>
>>>>>>diff --git a/net/core/dev.c b/net/core/dev.c
>>>>>>index 4ebf7fe..e5dba47 100644
>>>>>>--- a/net/core/dev.c
>>>>>>+++ b/net/core/dev.c
>>>>>>@@ -3115,6 +3115,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>>  {
>>>>>>  	struct packet_type *ptype, *pt_prev;
>>>>>>  	rx_handler_func_t *rx_handler;
>>>>>>+	struct net_device *dev;
>>>>>>  	struct net_device *orig_dev;
>>>>>>  	struct net_device *null_or_dev;
>>>>>>  	int ret = NET_RX_DROP;
>>>>>>@@ -3129,7 +3130,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>>  	if (netpoll_receive_skb(skb))
>>>>>>  		return NET_RX_DROP;
>>>>>>
>>>>>>-	__this_cpu_inc(softnet_data.processed);
>>>>>>+	skb->skb_iif = skb->dev->ifindex;
>>>>>>+	orig_dev = skb->dev;
>>>>>
>>>>>orig_dev should be set inside the loop, to reflect "previously
>>>>>crossed device", while following the path:
>>>>>
>>>>>eth0 ->   bond0 ->   br0.
>>>>>
>>>>>First step inside loop:
>>>>>
>>>>>orig_dev = eth0
>>>>>skb->dev = bond0 (at the end of the loop).
>>>>>
>>>>>Second step inside loop:
>>>>>
>>>>>orig_dev = bond0
>>>>>skb->dev = br0 (et the end of the loop).
>>>>>
>>>>>This would allow for exact match delivery to bond0 if someone bind there.
>>>>>
>>>>>>+
>>>>>>  	skb_reset_network_header(skb);
>>>>>>  	skb_reset_transport_header(skb);
>>>>>>  	skb->mac_len = skb->network_header - skb->mac_header;
>>>>>>@@ -3138,12 +3141,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>>
>>>>>>  	rcu_read_lock();
>>>>>>
>>>>>>-	if (!skb->skb_iif) {
>>>>>>-		skb->skb_iif = skb->dev->ifindex;
>>>>>>-		orig_dev = skb->dev;
>>>>>>-	} else {
>>>>>>-		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev), skb->skb_iif);
>>>>>>-	}
>>>>>
>>>>>I like the fact that it removes the above part.
>>>>>
>>>>>>+another_round:
>>>>>>+	__this_cpu_inc(softnet_data.processed);
>>>>>>+	dev = skb->dev;
>>>>>>
>>>>>>  #ifdef CONFIG_NET_CLS_ACT
>>>>>>  	if (skb->tc_verd&    TC_NCLS) {
>>>>>>@@ -3153,7 +3153,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>>>  #endif
>>>>>>
>>>>>>  	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>>>-		if (!ptype->dev || ptype->dev == skb->dev) {
>>>>>>+		if (!ptype->dev || ptype->dev == dev) {
>>>>>>  			if (pt_prev)
>>>>>>  				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>>  			pt_prev = ptype;
>>>>>
>>>>>Inside the loop, we should only do exact match delivery, for
>>>>>&ptype_all and for&ptype_base[ntohs(type)&   PTYPE_HASH_MASK]:
>>>>>
>>>>>        list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>>-               if (!ptype->dev || ptype->dev == dev) {
>>>>>+               if (ptype->dev == dev) {
>>>>>                        if (pt_prev)
>>>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>                        pt_prev = ptype;
>>>>>                }
>>>>>        }
>>>>>
>>>>>
>>>>>        list_for_each_entry_rcu(ptype,
>>>>>                        &ptype_base[ntohs(type)&   PTYPE_HASH_MASK], list) {
>>>>>                if (ptype->type == type&&
>>>>>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>>>+                   (ptype->dev == skb->dev)) {
>>>>>                        if (pt_prev)
>>>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>                        pt_prev = ptype;
>>>>>                }
>>>>>        }
>>>>>
>>>>>After leaving the loop, we can do wilcard delivery, if skb is not NULL.
>>>>>
>>>>>        list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>>-               if (!ptype->dev || ptype->dev == dev) {
>>>>>+               if (!ptype->dev) {
>>>>>                        if (pt_prev)
>>>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>                        pt_prev = ptype;
>>>>>               }
>>>>>        }
>>>>>
>>>>>
>>>>>        list_for_each_entry_rcu(ptype,
>>>>>                        &ptype_base[ntohs(type)&   PTYPE_HASH_MASK], list) {
>>>>>-               if (ptype->type == type&&
>>>>>-                   (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>>>+		if (ptype->type == type&&   !ptype->dev) {
>>>>>                        if (pt_prev)
>>>>>                                ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>>                        pt_prev = ptype;
>>>>>                }
>>>>>        }
>>>>>
>>>>>This would reduce the number of tests inside the
>>>>>list_for_each_entry_rcu() loops. And because we match only ptype->dev
>>>>>== dev inside the loop and !ptype->dev outside the loop, this should
>>>>>avoid duplicate delivery.
>>>>
>>>>Would you care to put this into patch so I can see the whole picture?
>>>>Thanks.
>>>
>>>Here is what I have in mind. It is based on your previous DRAFT patch, and don't modify rx_handlers yet.
>>>
>>>Only compile tested !!
>>>
>>>I don't know if every pieces are at the right place. I wonder what to
>>>do with CONFIG_NET_CLS_ACT part, that currently is between ptype_all
>>>and ptype_base processing.
>>>
>>>Anyway, the general idea is there.
>>>
>>>	Nicolas.
>>>
>>>net/core/dev.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++--------
>>>1 files changed, 60 insertions(+), 10 deletions(-)
>>>
>>>diff --git a/net/core/dev.c b/net/core/dev.c
>>>index e5dba47..7e007a9 100644
>>>--- a/net/core/dev.c
>>>+++ b/net/core/dev.c
>>>@@ -3117,7 +3117,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>	rx_handler_func_t *rx_handler;
>>>	struct net_device *dev;
>>>	struct net_device *orig_dev;
>>>-	struct net_device *null_or_dev;
>>>	int ret = NET_RX_DROP;
>>>	__be16 type;
>>>
>>>@@ -3130,9 +3129,6 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>	if (netpoll_receive_skb(skb))
>>>		return NET_RX_DROP;
>>>
>>>-	skb->skb_iif = skb->dev->ifindex;
>>>-	orig_dev = skb->dev;
>>>-
>>>	skb_reset_network_header(skb);
>>>	skb_reset_transport_header(skb);
>>>	skb->mac_len = skb->network_header - skb->mac_header;
>>>@@ -3143,6 +3139,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>
>>>another_round:
>>>	__this_cpu_inc(softnet_data.processed);
>>>+	skb->skb_iif = skb->dev->ifindex;
>>>+	orig_dev = skb->dev;
>>orig_dev should be set at the end of the loop. Now you are going to have
>>it always the same as dev and skb->dev.
>>
>
>Yes, you are right.
>
>I thinking about all this, I wonder what the protocol handlers expect as the orig_dev value ?
>
>Lest imagine the following configuration: eth0 -> bond0 -> br0.
>
>What does a protocol handler listening on br0 expect for orig_dev ?
>bond0 or eth0 ? Current implementation give eth0, but I think bond0
>should be the right value, for proper nesting.

I agree with you.

>
>>>	dev = skb->dev;
>>>
>>>#ifdef CONFIG_NET_CLS_ACT
>>>@@ -3152,8 +3150,13 @@ another_round:
>>>	}
>>>#endif
>>>
>>>+	/*
>>>+	 * Deliver to ptype_all protocol handlers that match current dev.
>>>+	 * This happens before rx_handler is given a chance to change skb->dev.
>>>+	 */
>>>+
>>>	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>-		if (!ptype->dev || ptype->dev == dev) {
>>>+		if (ptype->dev == dev) {
>>>			if (pt_prev)
>>>				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>			pt_prev = ptype;
>>>@@ -3167,6 +3170,31 @@ another_round:
>>>ncls:
>>>#endif
>>>
>>>+	/*
>>>+	 * Deliver to ptype_base protocol handlers that match current dev.
>>>+	 * This happens before rx_handler is given a chance to change skb->dev.
>>>+	 */
>>>+
>>>+	type = skb->protocol;
>>>+	list_for_each_entry_rcu(ptype,
>>>+			&ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>>+		if (ptype->type == type&&  ptype->dev == skb->dev) {
>>>+			if (pt_prev)
>>>+				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>+			pt_prev = ptype;
>>>+		}
>>>+	}
>>
>>I'm not sure it is ok to deliver ptype_base here. See comment above
>>ptype_head() (I'm not sure I understand that correctly)
>
>Anyway, all this is probably plain wrong: Delivering the skb to
>protocol handlers while still changing the skb is guaranteed to cause
>strange behaviors.
>
>If we want to be able to deliver the skb to different protocol
>handlers and give all of them the right values for dev->skb and
>orig_dev (or previous_dev), we might end up with copying the skb. I
>hate the idea, but currently can't find a cleaner way to do so.

That would be unfortunate :/
>
>We first need to clarify what orig_dev should be, as stated above.
>
>>>+
>>>+	/*
>>>+	 * Call rx_handler for current device.
>>>+	 * If rx_handler return NULL, skip wilcard protocol handler delivery.
>>>+	 * Else, if skb->dev changed, restart the whole delivery process, to
>>>+	 * allow for device nesting.
>>>+	 *
>>>+	 * Warning:
>>>+	 * rx_handlers must kfree_skb(skb) if they return NULL.
>>Well this is not true. They can return NULL and call netif_rx as they
>>have before. No changes necessary I believe.
>
>I don't really know. This needs to be double checked, anyway.
>
>>>+	 */
>>>+
>>>	rx_handler = rcu_dereference(dev->rx_handler);
>>>	if (rx_handler) {
>>>		if (pt_prev) {
>>>@@ -3176,10 +3204,15 @@ ncls:
>>>		skb = rx_handler(skb);
>>>		if (!skb)
>>>			goto out;
>>>-		if (dev != skb->dev)
>>>+		if (skb->dev != dev)
>>>			goto another_round;
>>>	}
>>>
>>>+	/*
>>>+	 * FIXME: The part below should use rx_handler instead of being hard
>>>+	 * coded here.
>>I'm not sure it is doable atm. For bridge and bond it should not be a
>>problem, but for macvlan, there is possible to have macvlans and vlans
>>on the same dev. This possibility should persist.
>>/me scratches head on the idea to have multiple rx_handlers although it
>>was his original idea....
>
>I think your original proposal of having several rx_handlers per device was right.
>
>At the time you introduced the rx_handler system, only bridge and
>macvlan used it. Even if using bridge and macvlan on the same base
>device might be useless, this is not true for every possible
>rx_handler configuration.
>
>Now that we want to move bonding and vlan to the rx_handler system,
>it becomes obvious that we need several rx_handlers per device. At
>least, vlan should properly mix with bridge. And who know what would
>be the fifth rx_handler...
>
>>>+	 */
>>>+
>>>	if (vlan_tx_tag_present(skb)) {
>>>		if (pt_prev) {
>>>			ret = deliver_skb(skb, pt_prev, orig_dev);
>>>@@ -3192,16 +3225,33 @@ ncls:
>>>			goto out;
>>>	}
>>>
>>>+	/*
>>>+	 * FIXME: Can't this be moved into the rx_handler for bonding,
>>>+	 * or into a futur rx_handler for vlan?
>>This hook is something I do not like at all :/ But anyway if should be in vlan
>>part I think.
>
>Yes, and in order for the future rx_handler for vlan to properly
>handle it, it needs to know the device just below it, not the pure
>original device. Hence, my question about the exact meaning of
>orig_dev...
>
>	Nicolas.
>
>>>+	 */
>>>+
>>>	vlan_on_bond_hook(skb);
>>>
>>>-	/* deliver only exact match when indicated */
>>>-	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
>>>+	/*
>>>+	 * Deliver to wildcard ptype_all protocol handlers.
>>>+	 */
>>>+
>>>+	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>+		if (!ptype->dev) {
>>>+			if (pt_prev)
>>>+				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>+			pt_prev = ptype;
>>>+		}
>>>+	}
>>>+
>>>+	/*
>>>+	 * Deliver to wildcard ptype_all protocol handlers.
>>>+	 */
>>>
>>>	type = skb->protocol;
>>>	list_for_each_entry_rcu(ptype,
>>>			&ptype_base[ntohs(type)&  PTYPE_HASH_MASK], list) {
>>>-		if (ptype->type == type&&
>>>-		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
>>>+		if (ptype->type == type&&  !ptype->dev) {
>>>			if (pt_prev)
>>>				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>			pt_prev = ptype;
>>>--
>>>1.7.2.3
>>>
>>>
>>>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-20 15:07                                   ` Jiri Pirko
@ 2011-02-21 23:20                                     ` Nicolas de Pesloüan
  2011-02-26 14:24                                       ` Nicolas de Pesloüan
  0 siblings, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-21 23:20 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jay Vosburgh, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy, Fischer, Anna

Le 20/02/2011 16:07, Jiri Pirko a écrit :
> Sun, Feb 20, 2011 at 01:12:01PM CET, nicolas.2p.debian@gmail.com wrote:

[snip]

>> And thinking about all this, I wonder what the protocol handlers expect as the orig_dev value ?
>>
>> Lets imagine the following configuration: eth0 ->  bond0 ->  br0.
>>
>> What does a protocol handler listening on br0 expect for orig_dev ?
>> bond0 or eth0 ? Current implementation give eth0, but I think bond0
>> should be the right value, for proper nesting.
>
> I agree with you.

[snip}

>>> This hook is something I do not like at all :/ But anyway if should be in vlan
>>> part I think.
>>
>> Yes, and in order for the future rx_handler for vlan to properly
>> handle it, it needs to know the device just below it, not the pure
>> original device. Hence, my question about the exact meaning of
>> orig_dev...

After checking every protocol handlers installed by dev_add_pack(), it appears that only 4 of them 
really use the orig_dev parameter given by __netif_receive_skb():

- bond_3ad_lacpdu_recv() @ drivers/net/bonding/bond_3ad.c
- bond_arp_recv() @ drivers/net/bonding/bond_main.c
- packet_rcv() @ net/packet/af_packet.c
- tpacket_rcv() @ net/packet/af_packet.c

 From the bonding point of view, the meaning of orig_dev is obviously "the device one layer below 
the bonding device, through which the packet reached the bonding device". It is used by 
bond_3ad_lacpdu_recv() and bond_arp_recv(), to find the underlying slave device through which the 
LACPDU or ARP was received. (The protocol handler is registered at the bonding device level).

 From the af_packet point of view, the meaning is documented (in commit "[AF_PACKET]: Add option to 
return orig_dev to userspace") as the "physical device [that] actually received the traffic, instead 
of having the encapsulating device hide that information."

When the bonding device is just one level above the physical device, the two meanings happen to 
match the same device, by chance.

So, currently, a bonding device cannot stack properly on top of anything but physical devices. It 
might not be a problem today, but may change in the future...

	Nicolas.







^ permalink raw reply	[flat|nested] 52+ messages in thread

* [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-18 20:58             ` [patch net-next-2.6 V2] " Jiri Pirko
  2011-02-18 23:06               ` Jay Vosburgh
@ 2011-02-23 19:05               ` Jiri Pirko
  2011-02-25 23:46                 ` Nicolas de Pesloüan
  2011-02-27 14:17                 ` Nicolas de Pesloüan
  1 sibling, 2 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-23 19:05 UTC (permalink / raw)
  To: David Miller
  Cc: kaber, eric.dumazet, netdev, shemminger, fubar, nicolas.2p.debian, andy

This patch converts bonding to use rx_handler. Results in cleaner
__netif_receive_skb() with much less exceptions needed. Also
bond-specific work is moved into bond code.

Did performance test using pktgen and counting incoming packets by
iptables. No regression noted.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

v1->v2:
        using skb_iif instead of new input_dev to remember original
	device

v2->v3:
	do another loop in case skb->dev is changed. That way orig_dev
	core can be left untouched.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_main.c |   74 ++++++++++++++++++++++++-
 net/core/dev.c                  |  119 ++++++++++-----------------------------
 2 files changed, 104 insertions(+), 89 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 77e3c6a..3772b61 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1423,6 +1423,67 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 	bond->setup_by_slave = 1;
 }
 
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+static bool bond_should_deliver_exact_match(struct sk_buff *skb,
+					    struct net_device *slave_dev,
+					    struct net_device *bond_dev)
+{
+	if (slave_dev->priv_flags & IFF_SLAVE_INACTIVE) {
+		if (slave_dev->priv_flags & IFF_SLAVE_NEEDARP &&
+		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+			return false;
+
+		if (bond_dev->priv_flags & IFF_MASTER_ALB &&
+		    skb->pkt_type != PACKET_BROADCAST &&
+		    skb->pkt_type != PACKET_MULTICAST)
+				return false;
+
+		if (bond_dev->priv_flags & IFF_MASTER_8023AD &&
+		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+			return false;
+
+		return true;
+	}
+	return false;
+}
+
+static struct sk_buff *bond_handle_frame(struct sk_buff *skb)
+{
+	struct net_device *slave_dev;
+	struct net_device *bond_dev;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return NULL;
+	slave_dev = skb->dev;
+	bond_dev = ACCESS_ONCE(slave_dev->master);
+	if (unlikely(!bond_dev))
+		return skb;
+
+	if (bond_dev->priv_flags & IFF_MASTER_ARPMON)
+		slave_dev->last_rx = jiffies;
+
+	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
+		skb->deliver_no_wcard = 1;
+		return skb;
+	}
+
+	skb->dev = bond_dev;
+
+	if (bond_dev->priv_flags & IFF_MASTER_ALB &&
+	    bond_dev->priv_flags & IFF_BRIDGE_PORT &&
+	    skb->pkt_type == PACKET_HOST) {
+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+		memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+	}
+
+	return skb;
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 {
@@ -1599,11 +1660,17 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		pr_debug("Error %d calling netdev_set_bond_master\n", res);
 		goto err_restore_mac;
 	}
+	res = netdev_rx_handler_register(slave_dev, bond_handle_frame, NULL);
+	if (res) {
+		pr_debug("Error %d calling netdev_rx_handler_register\n", res);
+		goto err_unset_master;
+	}
+
 	/* open the slave since the application closed it */
 	res = dev_open(slave_dev);
 	if (res) {
 		pr_debug("Opening slave %s failed\n", slave_dev->name);
-		goto err_unset_master;
+		goto err_unreg_rxhandler;
 	}
 
 	new_slave->dev = slave_dev;
@@ -1811,6 +1878,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 err_close:
 	dev_close(slave_dev);
 
+err_unreg_rxhandler:
+	netdev_rx_handler_unregister(slave_dev);
+
 err_unset_master:
 	netdev_set_bond_master(slave_dev, NULL);
 
@@ -1992,6 +2062,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 		netif_addr_unlock_bh(bond_dev);
 	}
 
+	netdev_rx_handler_unregister(slave_dev);
 	netdev_set_bond_master(slave_dev, NULL);
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -2114,6 +2185,7 @@ static int bond_release_all(struct net_device *bond_dev)
 			netif_addr_unlock_bh(bond_dev);
 		}
 
+		netdev_rx_handler_unregister(slave_dev);
 		netdev_set_bond_master(slave_dev, NULL);
 
 		/* close slave before restoring its mac address */
diff --git a/net/core/dev.c b/net/core/dev.c
index 578415c..e06a834 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3096,63 +3096,31 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
-					      struct net_device *master)
+static void vlan_on_bond_hook(struct sk_buff *skb)
 {
-	if (skb->pkt_type == PACKET_HOST) {
-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+	/*
+	 * Make sure ARP frames received on VLAN interfaces stacked on
+	 * bonding interfaces still make their way to any base bonding
+	 * device that may have registered for a specific ptype.
+	 */
+	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
+	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
+	    skb->protocol == htons(ETH_P_ARP)) {
+		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 
-		memcpy(dest, master->dev_addr, ETH_ALEN);
+		if (!skb2)
+			return;
+		skb2->dev = vlan_dev_real_dev(skb->dev);
+		netif_rx(skb2);
 	}
 }
 
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static int __skb_bond_should_drop(struct sk_buff *skb,
-				  struct net_device *master)
-{
-	struct net_device *dev = skb->dev;
-
-	if (master->priv_flags & IFF_MASTER_ARPMON)
-		dev->last_rx = jiffies;
-
-	if ((master->priv_flags & IFF_MASTER_ALB) &&
-	    (master->priv_flags & IFF_BRIDGE_PORT)) {
-		/* Do address unmangle. The local destination address
-		 * will be always the one master has. Provides the right
-		 * functionality in a bridge.
-		 */
-		skb_bond_set_mac_by_master(skb, master);
-	}
-
-	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-			return 0;
-
-		if (master->priv_flags & IFF_MASTER_ALB) {
-			if (skb->pkt_type != PACKET_BROADCAST &&
-			    skb->pkt_type != PACKET_MULTICAST)
-				return 0;
-		}
-		if (master->priv_flags & IFF_MASTER_8023AD &&
-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-			return 0;
-
-		return 1;
-	}
-	return 0;
-}
-
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
-	struct net_device *null_or_orig;
-	struct net_device *orig_or_bond;
+	struct net_device *null_or_dev;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -3167,32 +3135,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 	if (!skb->skb_iif)
 		skb->skb_iif = skb->dev->ifindex;
-
-	/*
-	 * bonding note: skbs received on inactive slaves should only
-	 * be delivered to pkt handlers that are exact matches.  Also
-	 * the deliver_no_wcard flag will be set.  If packet handlers
-	 * are sensitive to duplicate packets these skbs will need to
-	 * be dropped at the handler.
-	 */
-	null_or_orig = NULL;
 	orig_dev = skb->dev;
-	if (skb->deliver_no_wcard)
-		null_or_orig = orig_dev;
-	else if (netif_is_bond_slave(orig_dev)) {
-		struct net_device *bond_master = ACCESS_ONCE(orig_dev->master);
-
-		if (likely(bond_master)) {
-			if (__skb_bond_should_drop(skb, bond_master)) {
-				skb->deliver_no_wcard = 1;
-				/* deliver only exact match */
-				null_or_orig = orig_dev;
-			} else
-				skb->dev = bond_master;
-		}
-	}
 
-	__this_cpu_inc(softnet_data.processed);
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
 	skb->mac_len = skb->network_header - skb->mac_header;
@@ -3201,6 +3145,10 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 	rcu_read_lock();
 
+another_round:
+
+	__this_cpu_inc(softnet_data.processed);
+
 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -3209,8 +3157,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 #endif
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
-		    ptype->dev == orig_dev) {
+		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
@@ -3224,16 +3171,20 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-	/* Handle special case of bridge or macvlan */
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
+		struct net_device *prev_dev;
+
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
 		}
+		prev_dev = skb->dev;
 		skb = rx_handler(skb);
 		if (!skb)
 			goto out;
+		if (skb->dev != prev_dev)
+			goto another_round;
 	}
 
 	if (vlan_tx_tag_present(skb)) {
@@ -3248,24 +3199,16 @@ ncls:
 			goto out;
 	}
 
-	/*
-	 * Make sure frames received on VLAN interfaces stacked on
-	 * bonding interfaces still make their way to any base bonding
-	 * device that may have registered for a specific ptype.  The
-	 * handler may have to adjust skb->dev and orig_dev.
-	 */
-	orig_or_bond = orig_dev;
-	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
-	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
-		orig_or_bond = vlan_dev_real_dev(skb->dev);
-	}
+	vlan_on_bond_hook(skb);
+
+	/* deliver only exact match when indicated */
+	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
 
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type && (ptype->dev == null_or_orig ||
-		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
-		     ptype->dev == orig_or_bond)) {
+		if (ptype->type == type &&
+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-23 19:05               ` Jiri Pirko
@ 2011-02-25 23:46                 ` Nicolas de Pesloüan
  2011-02-26  7:14                   ` Jiri Pirko
  2011-02-27 14:17                 ` Nicolas de Pesloüan
  1 sibling, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-25 23:46 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger, fubar, andy

Le 23/02/2011 20:05, Jiri Pirko a écrit :
> This patch converts bonding to use rx_handler. Results in cleaner
> __netif_receive_skb() with much less exceptions needed. Also
> bond-specific work is moved into bond code.
>
> Did performance test using pktgen and counting incoming packets by
> iptables. No regression noted.
>
> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>
> v1->v2:
>          using skb_iif instead of new input_dev to remember original
> 	device
>
> v2->v3:
> 	do another loop in case skb->dev is changed. That way orig_dev
> 	core can be left untouched.

Hi Jiri,

Eventually taking enough time for a review.

I think we should split this change :

1/ Change __netif_receive_skb() to call rx_handler for diverted net_device, until rx_handler is NULL.

2/ Convert currently existing rx_handlers (bridge and macvlan) to use this new "loop" feature, 
removing the need to call netif_rx() inside their respective rx_handler and also removing the 
associated overhead.

3/ Convert bonding to use rx_handlers.

Also, on step 1, we definitely need to clarify what orig_dev should be.

I now think that orig_dev should be "the device one level below the current one" or NULL if current 
device was not diverted from another one. It means that we should keep an array of crossed 
(diverted) devices and the associated orig_dev. This array would be used to pass the right orig_dev 
to protocol handlers, depending on the device they register on :

eth0 -> bond0 -> br0

A protocol handler registered on bond0 would receive eth0 as orig_dev.
A protocol handler registered on br0 would receive bond0 as orig_dev.

[snip]

> @@ -3167,32 +3135,8 @@ static int __netif_receive_skb(struct sk_buff *skb)

[snip]

> +another_round:
> +
> +	__this_cpu_inc(softnet_data.processed);
> +
>   #ifdef CONFIG_NET_CLS_ACT
>   	if (skb->tc_verd&  TC_NCLS) {
>   		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
> @@ -3209,8 +3157,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>   #endif
>
>   	list_for_each_entry_rcu(ptype,&ptype_all, list) {
> -		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
> -		    ptype->dev == orig_dev) {
> +		if (!ptype->dev || ptype->dev == skb->dev) {
>   			if (pt_prev)
>   				ret = deliver_skb(skb, pt_prev, orig_dev);
>   			pt_prev = ptype;
> @@ -3224,16 +3171,20 @@ static int __netif_receive_skb(struct sk_buff *skb)
>   ncls:
>   #endif
>

Why do you loop to ptype_all before calling rx_handler ?

I don't understand why ptype_all and ptype_base are not handled at the same place in current 
__netif_receive_skb() but I think we should take the opportunity to change that, unless someone know 
of a good reason not to do so.

> -	/* Handle special case of bridge or macvlan */
>   	rx_handler = rcu_dereference(skb->dev->rx_handler);
>   	if (rx_handler) {

	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-25 23:46                 ` Nicolas de Pesloüan
@ 2011-02-26  7:14                   ` Jiri Pirko
  2011-02-26 11:25                     ` Nicolas de Pesloüan
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-26  7:14 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger, fubar, andy

Sat, Feb 26, 2011 at 12:46:53AM CET, nicolas.2p.debian@gmail.com wrote:
>Le 23/02/2011 20:05, Jiri Pirko a écrit :
>>This patch converts bonding to use rx_handler. Results in cleaner
>>__netif_receive_skb() with much less exceptions needed. Also
>>bond-specific work is moved into bond code.
>>
>>Did performance test using pktgen and counting incoming packets by
>>iptables. No regression noted.
>>
>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>
>>v1->v2:
>>         using skb_iif instead of new input_dev to remember original
>>	device
>>
>>v2->v3:
>>	do another loop in case skb->dev is changed. That way orig_dev
>>	core can be left untouched.
>
>Hi Jiri,
>
>Eventually taking enough time for a review.
>
>I think we should split this change :
>
>1/ Change __netif_receive_skb() to call rx_handler for diverted net_device, until rx_handler is NULL.
>
>2/ Convert currently existing rx_handlers (bridge and macvlan) to use
>this new "loop" feature, removing the need to call netif_rx() inside
>their respective rx_handler and also removing the associated
>overhead.

This might not be possible. Macvlan uses result of called netif_rx for
counting, bridge calls netdev_receive_skb via NF_HOOK. Nevertheless,
this can be eventually handled later, not as a part of this patch.
>
>3/ Convert bonding to use rx_handlers.
>
>Also, on step 1, we definitely need to clarify what orig_dev should be.
>
>I now think that orig_dev should be "the device one level below the
>current one" or NULL if current device was not diverted from another
>one. It means that we should keep an array of crossed (diverted)
>devices and the associated orig_dev. This array would be used to pass
>the right orig_dev to protocol handlers, depending on the device they
>register on :

I constructed the patch in the way origdev is the same in all situations
as before the patch. I think that this decision can be ommitted at the
moment.

>
>eth0 -> bond0 -> br0
>
>A protocol handler registered on bond0 would receive eth0 as orig_dev.
>A protocol handler registered on br0 would receive bond0 as orig_dev.
>
>[snip]
>
>>@@ -3167,32 +3135,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>
>[snip]
>
>>+another_round:
>>+
>>+	__this_cpu_inc(softnet_data.processed);
>>+
>>  #ifdef CONFIG_NET_CLS_ACT
>>  	if (skb->tc_verd&  TC_NCLS) {
>>  		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
>>@@ -3209,8 +3157,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>  #endif
>>
>>  	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
>>-		    ptype->dev == orig_dev) {
>>+		if (!ptype->dev || ptype->dev == skb->dev) {
>>  			if (pt_prev)
>>  				ret = deliver_skb(skb, pt_prev, orig_dev);
>>  			pt_prev = ptype;
>>@@ -3224,16 +3171,20 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>  ncls:
>>  #endif
>>
>
>Why do you loop to ptype_all before calling rx_handler ?
>
>I don't understand why ptype_all and ptype_base are not handled at
>the same place in current __netif_receive_skb() but I think we should
>take the opportunity to change that, unless someone know of a good
>reason not to do so.

Again, the patch tries to do as little changes as it can. So this stays
the same as before. In case you want to change it, feel free to submit
patch doing that as follow-on.

>
>>-	/* Handle special case of bridge or macvlan */
>>  	rx_handler = rcu_dereference(skb->dev->rx_handler);
>>  	if (rx_handler) {
>
>	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-26  7:14                   ` Jiri Pirko
@ 2011-02-26 11:25                     ` Nicolas de Pesloüan
  2011-02-26 14:58                       ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-26 11:25 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger, fubar, andy

Le 26/02/2011 08:14, Jiri Pirko a écrit :
> Sat, Feb 26, 2011 at 12:46:53AM CET, nicolas.2p.debian@gmail.com wrote:
>> Le 23/02/2011 20:05, Jiri Pirko a écrit :
>>> This patch converts bonding to use rx_handler. Results in cleaner
>>> __netif_receive_skb() with much less exceptions needed. Also
>>> bond-specific work is moved into bond code.
>>>
>>> Did performance test using pktgen and counting incoming packets by
>>> iptables. No regression noted.
>>>
>>> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>
>>> v1->v2:
>>>          using skb_iif instead of new input_dev to remember original
>>> 	device
>>>
>>> v2->v3:
>>> 	do another loop in case skb->dev is changed. That way orig_dev
>>> 	core can be left untouched.
>>
>> Hi Jiri,
>>
>> Eventually taking enough time for a review.
>>
>> I think we should split this change :
>>
>> 1/ Change __netif_receive_skb() to call rx_handler for diverted net_device, until rx_handler is NULL.
>>
>> 2/ Convert currently existing rx_handlers (bridge and macvlan) to use
>> this new "loop" feature, removing the need to call netif_rx() inside
>> their respective rx_handler and also removing the associated
>> overhead.
>
> This might not be possible. Macvlan uses result of called netif_rx for
> counting, bridge calls netdev_receive_skb via NF_HOOK. Nevertheless,
> this can be eventually handled later, not as a part of this patch.

Yes, I agree. Step 2 and step 3 can be swapped.

Anyway, we need to describe the options given to a rx_handler:

- Return skb unchanged. This would cause normal delivery (ptype->dev == NULL or ptype->dev == skb->dev).
- Return skb->dev changed. __netif_receive_skb() will loop to the new device. This would cause 
extact match delivery only (ptype->dev != NULL and ptype->dev == one of the orig_dev).
- Manage the skb another way and return NULL. This would stop any protocol handlers to receive the 
skb, except if the rx_handler arrange to re-inject the skb somewhere.

>> 3/ Convert bonding to use rx_handlers.
>>
>> Also, on step 1, we definitely need to clarify what orig_dev should be.
>>
>> I now think that orig_dev should be "the device one level below the
>> current one" or NULL if current device was not diverted from another
>> one. It means that we should keep an array of crossed (diverted)
>> devices and the associated orig_dev. This array would be used to pass
>> the right orig_dev to protocol handlers, depending on the device they
>> register on :
>
> I constructed the patch in the way origdev is the same in all situations
> as before the patch. I think that this decision can be ommitted at the
> moment.

Agreed, event if the current handling of orig_dev is far from bullet proof and needs to be clarified 
at some time.

>> eth0 ->  bond0 ->  br0
>>
>> A protocol handler registered on bond0 would receive eth0 as orig_dev.
>> A protocol handler registered on br0 would receive bond0 as orig_dev.
>>
>> [snip]
>>
>>> @@ -3167,32 +3135,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>
>> [snip]
>>
>>> +another_round:
>>> +
>>> +	__this_cpu_inc(softnet_data.processed);
>>> +
>>>   #ifdef CONFIG_NET_CLS_ACT
>>>   	if (skb->tc_verd&   TC_NCLS) {
>>>   		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
>>> @@ -3209,8 +3157,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>   #endif
>>>
>>>   	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>> -		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
>>> -		    ptype->dev == orig_dev) {
>>> +		if (!ptype->dev || ptype->dev == skb->dev) {
>>>   			if (pt_prev)
>>>   				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>   			pt_prev = ptype;
>>> @@ -3224,16 +3171,20 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>   ncls:
>>>   #endif
>>>
>>
>> Why do you loop to ptype_all before calling rx_handler ?
>>
>> I don't understand why ptype_all and ptype_base are not handled at
>> the same place in current __netif_receive_skb() but I think we should
>> take the opportunity to change that, unless someone know of a good
>> reason not to do so.
>
> Again, the patch tries to do as little changes as it can. So this stays
> the same as before. In case you want to change it, feel free to submit
> patch doing that as follow-on.

The point here is that bridge and macvlan handling used to be after the ptype_all loop (hence the 
place you inserted the call to rx_handler last summer), but the bonding part is currently before the 
ptype_all loop.

Moving bonding handling after the ptype_all loop will cause the ptype_all loop to be run twice:
- first time, with skb->dev == eth0 and orig_dev == eth0.
- second time, with skb->dev == bond0 and orig_dev == eth0.

The first time currently does not exists. And because bonding wasn't given a chance yet to decide 
that the frame should be dropped, the packet will always be delivered to eth0, causing duplicate 
deliveries. Note that this is probably true for bridge and macvlan too, and that those duplicate 
deliveries probably already exists.

Also, delivering skb inside a loop that may change the skb (skb->dev at least) is guaranteed to 
produce strange behaviors.

Can someone, knowing the history of ptype_all/ptype_base/bridge/macvlan/bonding/vlan handling in 
__netif_receive_skb(), comment on this?

Are there any reasons not to process ptype_all and ptype_base at the same location, at the end of 
__netif_receive_skb(), and to manage all divert features (bridge/macvlan/bonding/vlan) before?

	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-21 23:20                                     ` Nicolas de Pesloüan
@ 2011-02-26 14:24                                       ` Nicolas de Pesloüan
  2011-02-26 19:42                                         ` Jay Vosburgh
  0 siblings, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-26 14:24 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Jiri Pirko, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy, Fischer, Anna

Le 22/02/2011 00:20, Nicolas de Pesloüan a écrit :

> After checking every protocol handlers installed by dev_add_pack(), it
> appears that only 4 of them really use the orig_dev parameter given by
> __netif_receive_skb():
>
> - bond_3ad_lacpdu_recv() @ drivers/net/bonding/bond_3ad.c
> - bond_arp_recv() @ drivers/net/bonding/bond_main.c
> - packet_rcv() @ net/packet/af_packet.c
> - tpacket_rcv() @ net/packet/af_packet.c
>
>  From the bonding point of view, the meaning of orig_dev is obviously
> "the device one layer below the bonding device, through which the packet
> reached the bonding device". It is used by bond_3ad_lacpdu_recv() and
> bond_arp_recv(), to find the underlying slave device through which the
> LACPDU or ARP was received. (The protocol handler is registered at the
> bonding device level).
>
>  From the af_packet point of view, the meaning is documented (in commit
> "[AF_PACKET]: Add option to return orig_dev to userspace") as the
> "physical device [that] actually received the traffic, instead of having
> the encapsulating device hide that information."
>
> When the bonding device is just one level above the physical device, the
> two meanings happen to match the same device, by chance.
>
> So, currently, a bonding device cannot stack properly on top of anything
> but physical devices. It might not be a problem today, but may change in
> the future...

Hi Jay,

Still thinking about this orig_dev stuff, I wonder why the protocol handlers used in bonding 
(bond_3ad_lacpdu_recv() and bond_arp_rcv()) are registered at the master level instead of at the 
slave level ?

If they were registered at the slave level, they would simply receive skb->dev as the ingress 
interface and use this value instead of needing the orig_dev value given to them when they are 
registered at the master level.

As orig_dev is only used by bonding and by af_packet, but they disagree on the exact meaning of 
orig_dev, one way to fix this discrepancy would be to remove one of the usage. As the af_packet 
usage is exposed to user space, bonding seems the right place to stop using orig_dev, even if 
orig_dev was introduced for bonding :-)

I understand that this would add one entry per slave device to the ptype_base list, but this seems 
to be the only bad effect of registering at the slave level. Can you confirm that this was the 
reason to register at the master level instead?

If you think registering at the slave level would cause too much impact on ptype_base, then we might 
have another way to stop using orig_dev for bonding:

In __skb_bond_should_drop(), we already test for the two interesting protocols:

if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && skb->protocol == __cpu_to_be16(ETH_P_ARP))
	return 0;

if (master->priv_flags & IFF_MASTER_8023AD && skb->protocol == __cpu_to_be16(ETH_P_SLOW))
	return 0;

Would it be possible to call the right handlers directly from inside __skb_bond_should_drop() then 
let __skb_bond_should_drop() return 1 ("should drop") after processing the frames that are only of 
interest for bonding?

	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-26 11:25                     ` Nicolas de Pesloüan
@ 2011-02-26 14:58                       ` Jiri Pirko
  0 siblings, 0 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-26 14:58 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger, fubar, andy

Sat, Feb 26, 2011 at 12:25:18PM CET, nicolas.2p.debian@gmail.com wrote:
>Le 26/02/2011 08:14, Jiri Pirko a écrit :
>>Sat, Feb 26, 2011 at 12:46:53AM CET, nicolas.2p.debian@gmail.com wrote:
>>>Le 23/02/2011 20:05, Jiri Pirko a écrit :
>>>>This patch converts bonding to use rx_handler. Results in cleaner
>>>>__netif_receive_skb() with much less exceptions needed. Also
>>>>bond-specific work is moved into bond code.
>>>>
>>>>Did performance test using pktgen and counting incoming packets by
>>>>iptables. No regression noted.
>>>>
>>>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>>>
>>>>v1->v2:
>>>>         using skb_iif instead of new input_dev to remember original
>>>>	device
>>>>
>>>>v2->v3:
>>>>	do another loop in case skb->dev is changed. That way orig_dev
>>>>	core can be left untouched.
>>>
>>>Hi Jiri,
>>>
>>>Eventually taking enough time for a review.
>>>
>>>I think we should split this change :
>>>
>>>1/ Change __netif_receive_skb() to call rx_handler for diverted net_device, until rx_handler is NULL.
>>>
>>>2/ Convert currently existing rx_handlers (bridge and macvlan) to use
>>>this new "loop" feature, removing the need to call netif_rx() inside
>>>their respective rx_handler and also removing the associated
>>>overhead.
>>
>>This might not be possible. Macvlan uses result of called netif_rx for
>>counting, bridge calls netdev_receive_skb via NF_HOOK. Nevertheless,
>>this can be eventually handled later, not as a part of this patch.
>
>Yes, I agree. Step 2 and step 3 can be swapped.
>
>Anyway, we need to describe the options given to a rx_handler:
>
>- Return skb unchanged. This would cause normal delivery (ptype->dev == NULL or ptype->dev == skb->dev).
>- Return skb->dev changed. __netif_receive_skb() will loop to the new
>device. This would cause extact match delivery only (ptype->dev !=
>NULL and ptype->dev == one of the orig_dev).
>- Manage the skb another way and return NULL. This would stop any
>protocol handlers to receive the skb, except if the rx_handler
>arrange to re-inject the skb somewhere.
>
>>>3/ Convert bonding to use rx_handlers.
>>>
>>>Also, on step 1, we definitely need to clarify what orig_dev should be.
>>>
>>>I now think that orig_dev should be "the device one level below the
>>>current one" or NULL if current device was not diverted from another
>>>one. It means that we should keep an array of crossed (diverted)
>>>devices and the associated orig_dev. This array would be used to pass
>>>the right orig_dev to protocol handlers, depending on the device they
>>>register on :
>>
>>I constructed the patch in the way origdev is the same in all situations
>>as before the patch. I think that this decision can be ommitted at the
>>moment.
>
>Agreed, event if the current handling of orig_dev is far from bullet
>proof and needs to be clarified at some time.
>
>>>eth0 ->  bond0 ->  br0
>>>
>>>A protocol handler registered on bond0 would receive eth0 as orig_dev.
>>>A protocol handler registered on br0 would receive bond0 as orig_dev.
>>>
>>>[snip]
>>>
>>>>@@ -3167,32 +3135,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>
>>>[snip]
>>>
>>>>+another_round:
>>>>+
>>>>+	__this_cpu_inc(softnet_data.processed);
>>>>+
>>>>  #ifdef CONFIG_NET_CLS_ACT
>>>>  	if (skb->tc_verd&   TC_NCLS) {
>>>>  		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
>>>>@@ -3209,8 +3157,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>  #endif
>>>>
>>>>  	list_for_each_entry_rcu(ptype,&ptype_all, list) {
>>>>-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
>>>>-		    ptype->dev == orig_dev) {
>>>>+		if (!ptype->dev || ptype->dev == skb->dev) {
>>>>  			if (pt_prev)
>>>>  				ret = deliver_skb(skb, pt_prev, orig_dev);
>>>>  			pt_prev = ptype;
>>>>@@ -3224,16 +3171,20 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>>>  ncls:
>>>>  #endif
>>>>
>>>
>>>Why do you loop to ptype_all before calling rx_handler ?
>>>
>>>I don't understand why ptype_all and ptype_base are not handled at
>>>the same place in current __netif_receive_skb() but I think we should
>>>take the opportunity to change that, unless someone know of a good
>>>reason not to do so.
>>
>>Again, the patch tries to do as little changes as it can. So this stays
>>the same as before. In case you want to change it, feel free to submit
>>patch doing that as follow-on.
>
>The point here is that bridge and macvlan handling used to be after
>the ptype_all loop (hence the place you inserted the call to
>rx_handler last summer), but the bonding part is currently before the
>ptype_all loop.
>
>Moving bonding handling after the ptype_all loop will cause the ptype_all loop to be run twice:
>- first time, with skb->dev == eth0 and orig_dev == eth0.
>- second time, with skb->dev == bond0 and orig_dev == eth0.
>
>The first time currently does not exists. And because bonding wasn't
>given a chance yet to decide that the frame should be dropped, the
>packet will always be delivered to eth0, causing duplicate
>deliveries. Note that this is probably true for bridge and macvlan
>too, and that those duplicate deliveries probably already exists.

Yes, and in fact that was what I like about this patch, that then
deliveries are simillar to bridge.

>
>Also, delivering skb inside a loop that may change the skb (skb->dev
>at least) is guaranteed to produce strange behaviors.
>
>Can someone, knowing the history of
>ptype_all/ptype_base/bridge/macvlan/bonding/vlan handling in
>__netif_receive_skb(), comment on this?
>
>Are there any reasons not to process ptype_all and ptype_base at the
>same location, at the end of __netif_receive_skb(), and to manage all
>divert features (bridge/macvlan/bonding/vlan) before?

That is very good set of questions. Would like to hear answers too.

>
>	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-26 14:24                                       ` Nicolas de Pesloüan
@ 2011-02-26 19:42                                         ` Jay Vosburgh
  2011-02-27 12:58                                           ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: Jay Vosburgh @ 2011-02-26 19:42 UTC (permalink / raw)
  To: =?ISO-8859-1?Q?Nicolas_de_Peslo=FCan?=
  Cc: Jiri Pirko, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy, Fischer, Anna

Nicolas de Pesloüan 	<nicolas.2p.debian@gmail.com> wrote:

>Le 22/02/2011 00:20, Nicolas de Pesloüan a écrit :
>
>> After checking every protocol handlers installed by dev_add_pack(), it
>> appears that only 4 of them really use the orig_dev parameter given by
>> __netif_receive_skb():
>>
>> - bond_3ad_lacpdu_recv() @ drivers/net/bonding/bond_3ad.c
>> - bond_arp_recv() @ drivers/net/bonding/bond_main.c
>> - packet_rcv() @ net/packet/af_packet.c
>> - tpacket_rcv() @ net/packet/af_packet.c
>>
>>  From the bonding point of view, the meaning of orig_dev is obviously
>> "the device one layer below the bonding device, through which the packet
>> reached the bonding device". It is used by bond_3ad_lacpdu_recv() and
>> bond_arp_recv(), to find the underlying slave device through which the
>> LACPDU or ARP was received. (The protocol handler is registered at the
>> bonding device level).
>>
>>  From the af_packet point of view, the meaning is documented (in commit
>> "[AF_PACKET]: Add option to return orig_dev to userspace") as the
>> "physical device [that] actually received the traffic, instead of having
>> the encapsulating device hide that information."
>>
>> When the bonding device is just one level above the physical device, the
>> two meanings happen to match the same device, by chance.
>>
>> So, currently, a bonding device cannot stack properly on top of anything
>> but physical devices. It might not be a problem today, but may change in
>> the future...
>
>Hi Jay,
>
>Still thinking about this orig_dev stuff, I wonder why the protocol
>handlers used in bonding (bond_3ad_lacpdu_recv() and bond_arp_rcv()) are
>registered at the master level instead of at the slave level ?
>
>If they were registered at the slave level, they would simply receive
>skb->dev as the ingress interface and use this value instead of needing
>the orig_dev value given to them when they are registered at the master
>level.
>
>As orig_dev is only used by bonding and by af_packet, but they disagree on
>the exact meaning of orig_dev, one way to fix this discrepancy would be to
>remove one of the usage. As the af_packet usage is exposed to user space,
>bonding seems the right place to stop using orig_dev, even if orig_dev was
>introduced for bonding :-)
>
>I understand that this would add one entry per slave device to the
>ptype_base list, but this seems to be the only bad effect of registering
>at the slave level. Can you confirm that this was the reason to register
>at the master level instead?

	My recollection is that it was done the way it is because there
was no "orig_dev" delivery logic at the time.  A handler registered to a
slave dev would receive no packets at all because assignment of skb->dev
to the master happened first, and the "orig_dev" knowledge was lost.

	When 802.3ad was added, a skb->real_dev field was created, but
it wasn't used for delivery.  802.3ad used real_dev to figure out which
slave a LACPDU arrived on.  The skb->real_dev was eventually replaced
with the orig_dev business that's there now.

	Later, I did the arp_validate stuff the same way as 802.3ad
because it worked and was easier than registering a handler per slave.

>If you think registering at the slave level would cause too much impact on
>ptype_base, then we might have another way to stop using orig_dev for
>bonding:
>
>In __skb_bond_should_drop(), we already test for the two interesting protocols:
>
>if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && skb->protocol == __cpu_to_be16(ETH_P_ARP))
>	return 0;
>
>if (master->priv_flags & IFF_MASTER_8023AD && skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>	return 0;
>
>Would it be possible to call the right handlers directly from inside
>__skb_bond_should_drop() then let __skb_bond_should_drop() return 1
>("should drop") after processing the frames that are only of interest for
>bonding?

	Isn't one purpose of switching to rx_handler that there won't
need to be any skb_bond_should_drop logic in __netif_receive_skb at all?

	Still, if you're just trying to simplify __netif_receive_skb
first, I don't see any reason not to register the packet handlers at the
slave level.  Looking at the ptype_base hash, I don't think that the
protocols bonding is registering (ARP and SLOW) will hash collide with
IP or IPv6, so I suspect there won't be much impact.

	Once an rx_handler is used, then I suspect there's no need for
the packet handlers at all, since the rx_handler is within bonding and
can just deal with the ARP or LACPDU directly.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-26 19:42                                         ` Jay Vosburgh
@ 2011-02-27 12:58                                           ` Jiri Pirko
  2011-02-27 20:44                                             ` Nicolas de Pesloüan
  2011-02-27 23:22                                             ` David Miller
  0 siblings, 2 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-27 12:58 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: nicolas.2p.debian, David Miller, kaber, eric.dumazet, netdev,
	shemminger, andy, Fischer, Anna

Sat, Feb 26, 2011 at 08:42:57PM CET, fubar@us.ibm.com wrote:
>Nicolas de Pesloüan 	<nicolas.2p.debian@gmail.com> wrote:
>
>>Le 22/02/2011 00:20, Nicolas de Pesloüan a écrit :
>>
>>> After checking every protocol handlers installed by dev_add_pack(), it
>>> appears that only 4 of them really use the orig_dev parameter given by
>>> __netif_receive_skb():
>>>
>>> - bond_3ad_lacpdu_recv() @ drivers/net/bonding/bond_3ad.c
>>> - bond_arp_recv() @ drivers/net/bonding/bond_main.c
>>> - packet_rcv() @ net/packet/af_packet.c
>>> - tpacket_rcv() @ net/packet/af_packet.c
>>>
>>>  From the bonding point of view, the meaning of orig_dev is obviously
>>> "the device one layer below the bonding device, through which the packet
>>> reached the bonding device". It is used by bond_3ad_lacpdu_recv() and
>>> bond_arp_recv(), to find the underlying slave device through which the
>>> LACPDU or ARP was received. (The protocol handler is registered at the
>>> bonding device level).
>>>
>>>  From the af_packet point of view, the meaning is documented (in commit
>>> "[AF_PACKET]: Add option to return orig_dev to userspace") as the
>>> "physical device [that] actually received the traffic, instead of having
>>> the encapsulating device hide that information."
>>>
>>> When the bonding device is just one level above the physical device, the
>>> two meanings happen to match the same device, by chance.
>>>
>>> So, currently, a bonding device cannot stack properly on top of anything
>>> but physical devices. It might not be a problem today, but may change in
>>> the future...
>>
>>Hi Jay,
>>
>>Still thinking about this orig_dev stuff, I wonder why the protocol
>>handlers used in bonding (bond_3ad_lacpdu_recv() and bond_arp_rcv()) are
>>registered at the master level instead of at the slave level ?
>>
>>If they were registered at the slave level, they would simply receive
>>skb->dev as the ingress interface and use this value instead of needing
>>the orig_dev value given to them when they are registered at the master
>>level.
>>
>>As orig_dev is only used by bonding and by af_packet, but they disagree on
>>the exact meaning of orig_dev, one way to fix this discrepancy would be to
>>remove one of the usage. As the af_packet usage is exposed to user space,
>>bonding seems the right place to stop using orig_dev, even if orig_dev was
>>introduced for bonding :-)
>>
>>I understand that this would add one entry per slave device to the
>>ptype_base list, but this seems to be the only bad effect of registering
>>at the slave level. Can you confirm that this was the reason to register
>>at the master level instead?
>
>	My recollection is that it was done the way it is because there
>was no "orig_dev" delivery logic at the time.  A handler registered to a
>slave dev would receive no packets at all because assignment of skb->dev
>to the master happened first, and the "orig_dev" knowledge was lost.
>
>	When 802.3ad was added, a skb->real_dev field was created, but
>it wasn't used for delivery.  802.3ad used real_dev to figure out which
>slave a LACPDU arrived on.  The skb->real_dev was eventually replaced
>with the orig_dev business that's there now.
>
>	Later, I did the arp_validate stuff the same way as 802.3ad
>because it worked and was easier than registering a handler per slave.
>
>>If you think registering at the slave level would cause too much impact on
>>ptype_base, then we might have another way to stop using orig_dev for
>>bonding:
>>
>>In __skb_bond_should_drop(), we already test for the two interesting protocols:
>>
>>if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && skb->protocol == __cpu_to_be16(ETH_P_ARP))
>>	return 0;
>>
>>if (master->priv_flags & IFF_MASTER_8023AD && skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>>	return 0;
>>
>>Would it be possible to call the right handlers directly from inside
>>__skb_bond_should_drop() then let __skb_bond_should_drop() return 1
>>("should drop") after processing the frames that are only of interest for
>>bonding?
>
>	Isn't one purpose of switching to rx_handler that there won't
>need to be any skb_bond_should_drop logic in __netif_receive_skb at all?

Yes, that (hopefully most)  would be eventually removed.

>
>	Still, if you're just trying to simplify __netif_receive_skb
>first, I don't see any reason not to register the packet handlers at the
>slave level.  Looking at the ptype_base hash, I don't think that the
>protocols bonding is registering (ARP and SLOW) will hash collide with
>IP or IPv6, so I suspect there won't be much impact.
>
>	Once an rx_handler is used, then I suspect there's no need for
>the packet handlers at all, since the rx_handler is within bonding and
>can just deal with the ARP or LACPDU directly.

That is very true. And given that af_packet uses orig_dev to obtain
ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
orig_dev parameter for good.

So I suggest to take V3 of my patch now and do multiple follow-on
patches to get us where we want to get.

Thanks

>
>	-J
>
>---
>	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-23 19:05               ` Jiri Pirko
  2011-02-25 23:46                 ` Nicolas de Pesloüan
@ 2011-02-27 14:17                 ` Nicolas de Pesloüan
  2011-02-27 20:06                   ` Jiri Pirko
  1 sibling, 1 reply; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-27 14:17 UTC (permalink / raw)
  To: Jiri Pirko, David Miller
  Cc: kaber, eric.dumazet, netdev, shemminger, fubar, andy

Le 23/02/2011 20:05, Jiri Pirko a écrit :
> This patch converts bonding to use rx_handler. Results in cleaner
> __netif_receive_skb() with much less exceptions needed. Also
> bond-specific work is moved into bond code.
>
> Did performance test using pktgen and counting incoming packets by
> iptables. No regression noted.
>
> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>
> v1->v2:
>          using skb_iif instead of new input_dev to remember original
> 	device
>
> v2->v3:
> 	do another loop in case skb->dev is changed. That way orig_dev
> 	core can be left untouched.
>
> Signed-off-by: Jiri Pirko<jpirko@redhat.com>
> ---

[snip]

> +static struct sk_buff *bond_handle_frame(struct sk_buff *skb)
> +{
> +	struct net_device *slave_dev;
> +	struct net_device *bond_dev;
> +
> +	skb = skb_share_check(skb, GFP_ATOMIC);
> +	if (unlikely(!skb))
> +		return NULL;
> +	slave_dev = skb->dev;
> +	bond_dev = ACCESS_ONCE(slave_dev->master);
> +	if (unlikely(!bond_dev))
> +		return skb;
> +
> +	if (bond_dev->priv_flags&  IFF_MASTER_ARPMON)
> +		slave_dev->last_rx = jiffies;
> +
> +	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
> +		skb->deliver_no_wcard = 1;
> +		return skb;

Shouldn't we return NULL here ?

> +	}
> +
> +	skb->dev = bond_dev;
> +
> +	if (bond_dev->priv_flags&  IFF_MASTER_ALB&&
> +	    bond_dev->priv_flags&  IFF_BRIDGE_PORT&&
> +	    skb->pkt_type == PACKET_HOST) {
> +		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
> +
> +		memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
> +	}
> +
> +	return skb;
> +}
> +

[snip]

> +static void vlan_on_bond_hook(struct sk_buff *skb)
>   {
> -	if (skb->pkt_type == PACKET_HOST) {
> -		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
> +	/*
> +	 * Make sure ARP frames received on VLAN interfaces stacked on
> +	 * bonding interfaces still make their way to any base bonding
> +	 * device that may have registered for a specific ptype.
> +	 */
> +	if (skb->dev->priv_flags&  IFF_802_1Q_VLAN&&
> +	    vlan_dev_real_dev(skb->dev)->priv_flags&  IFF_BONDING&&
> +	    skb->protocol == htons(ETH_P_ARP)) {

The vlan_on_bond case used to be cost effective. Now, we clone the skb and call netif_rx...

> +		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
>
> -		memcpy(dest, master->dev_addr, ETH_ALEN);
> +		if (!skb2)
> +			return;
> +		skb2->dev = vlan_dev_real_dev(skb->dev);
> +		netif_rx(skb2);
>   	}
>   }

[snip]

>   	if (rx_handler) {
> +		struct net_device *prev_dev;
> +
>   		if (pt_prev) {
>   			ret = deliver_skb(skb, pt_prev, orig_dev);
>   			pt_prev = NULL;
>   		}
> +		prev_dev = skb->dev;
>   		skb = rx_handler(skb);
>   		if (!skb)
>   			goto out;

I would instead consider NULL as meaning exact-match-delivery-only. (The same effect as 
dev_bond_should_drop() returning true).

> +		if (skb->dev != prev_dev)
> +			goto another_round;
>   	}

Anyway, all my comments can't be postponed to follow-up patchs. Thanks Jiri.

Reviewed-by: Nicolas de Pesloüan <nicolas.2p.debian@free.fr>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-27 14:17                 ` Nicolas de Pesloüan
@ 2011-02-27 20:06                   ` Jiri Pirko
  2011-02-27 20:59                     ` Nicolas de Pesloüan
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-27 20:06 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger, fubar, andy

Sun, Feb 27, 2011 at 03:17:01PM CET, nicolas.2p.debian@gmail.com wrote:
>Le 23/02/2011 20:05, Jiri Pirko a écrit :
>>This patch converts bonding to use rx_handler. Results in cleaner
>>__netif_receive_skb() with much less exceptions needed. Also
>>bond-specific work is moved into bond code.
>>
>>Did performance test using pktgen and counting incoming packets by
>>iptables. No regression noted.
>>
>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>
>>v1->v2:
>>         using skb_iif instead of new input_dev to remember original
>>	device
>>
>>v2->v3:
>>	do another loop in case skb->dev is changed. That way orig_dev
>>	core can be left untouched.
>>
>>Signed-off-by: Jiri Pirko<jpirko@redhat.com>
>>---
>
>[snip]
>
>>+static struct sk_buff *bond_handle_frame(struct sk_buff *skb)
>>+{
>>+	struct net_device *slave_dev;
>>+	struct net_device *bond_dev;
>>+
>>+	skb = skb_share_check(skb, GFP_ATOMIC);
>>+	if (unlikely(!skb))
>>+		return NULL;
>>+	slave_dev = skb->dev;
>>+	bond_dev = ACCESS_ONCE(slave_dev->master);
>>+	if (unlikely(!bond_dev))
>>+		return skb;
>>+
>>+	if (bond_dev->priv_flags&  IFF_MASTER_ARPMON)
>>+		slave_dev->last_rx = jiffies;
>>+
>>+	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
>>+		skb->deliver_no_wcard = 1;
>>+		return skb;
>
>Shouldn't we return NULL here ?

No we shouldn't. We need sbk to be delivered to exact match.

>
>>+	}
>>+
>>+	skb->dev = bond_dev;
>>+
>>+	if (bond_dev->priv_flags&  IFF_MASTER_ALB&&
>>+	    bond_dev->priv_flags&  IFF_BRIDGE_PORT&&
>>+	    skb->pkt_type == PACKET_HOST) {
>>+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
>>+
>>+		memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>>+	}
>>+
>>+	return skb;
>>+}
>>+
>
>[snip]
>
>>+static void vlan_on_bond_hook(struct sk_buff *skb)
>>  {
>>-	if (skb->pkt_type == PACKET_HOST) {
>>-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
>>+	/*
>>+	 * Make sure ARP frames received on VLAN interfaces stacked on
>>+	 * bonding interfaces still make their way to any base bonding
>>+	 * device that may have registered for a specific ptype.
>>+	 */
>>+	if (skb->dev->priv_flags&  IFF_802_1Q_VLAN&&
>>+	    vlan_dev_real_dev(skb->dev)->priv_flags&  IFF_BONDING&&
>>+	    skb->protocol == htons(ETH_P_ARP)) {
>
>The vlan_on_bond case used to be cost effective. Now, we clone the skb and call netif_rx...

This should not cost too much overhead considering only few packets are
going thru this. This hook shouldn't have exited in the fisrt place. I
think introducing this functionality was a big mistake.
>
>>+		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
>>
>>-		memcpy(dest, master->dev_addr, ETH_ALEN);
>>+		if (!skb2)
>>+			return;
>>+		skb2->dev = vlan_dev_real_dev(skb->dev);
>>+		netif_rx(skb2);
>>  	}
>>  }
>
>[snip]
>
>>  	if (rx_handler) {
>>+		struct net_device *prev_dev;
>>+
>>  		if (pt_prev) {
>>  			ret = deliver_skb(skb, pt_prev, orig_dev);
>>  			pt_prev = NULL;
>>  		}
>>+		prev_dev = skb->dev;
>>  		skb = rx_handler(skb);
>>  		if (!skb)
>>  			goto out;
>
>I would instead consider NULL as meaning exact-match-delivery-only.
>(The same effect as dev_bond_should_drop() returning true).

we can change the behaviour later on.

>
>>+		if (skb->dev != prev_dev)
>>+			goto another_round;
>>  	}
>
>Anyway, all my comments can't be postponed to follow-up patchs. Thanks Jiri.
>
>Reviewed-by: Nicolas de Pesloüan <nicolas.2p.debian@free.fr>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-27 12:58                                           ` Jiri Pirko
@ 2011-02-27 20:44                                             ` Nicolas de Pesloüan
  2011-02-27 23:22                                             ` David Miller
  1 sibling, 0 replies; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-27 20:44 UTC (permalink / raw)
  To: Jiri Pirko, Jay Vosburgh
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger, andy,
	Fischer, Anna

Le 27/02/2011 13:58, Jiri Pirko a écrit :
> Sat, Feb 26, 2011 at 08:42:57PM CET, fubar@us.ibm.com wrote:
>> Nicolas de Pesloüan 	<nicolas.2p.debian@gmail.com>  wrote:
>>> Hi Jay,
>>>
>>> Still thinking about this orig_dev stuff, I wonder why the protocol
>>> handlers used in bonding (bond_3ad_lacpdu_recv() and bond_arp_rcv()) are
>>> registered at the master level instead of at the slave level ?
>>>
>>> If they were registered at the slave level, they would simply receive
>>> skb->dev as the ingress interface and use this value instead of needing
>>> the orig_dev value given to them when they are registered at the master
>>> level.
>>>
>>> As orig_dev is only used by bonding and by af_packet, but they disagree on
>>> the exact meaning of orig_dev, one way to fix this discrepancy would be to
>>> remove one of the usage. As the af_packet usage is exposed to user space,
>>> bonding seems the right place to stop using orig_dev, even if orig_dev was
>>> introduced for bonding :-)
>>>
>>> I understand that this would add one entry per slave device to the
>>> ptype_base list, but this seems to be the only bad effect of registering
>>> at the slave level. Can you confirm that this was the reason to register
>>> at the master level instead?
>>
>> 	My recollection is that it was done the way it is because there
>> was no "orig_dev" delivery logic at the time.  A handler registered to a
>> slave dev would receive no packets at all because assignment of skb->dev
>> to the master happened first, and the "orig_dev" knowledge was lost.
>>
>> 	When 802.3ad was added, a skb->real_dev field was created, but
>> it wasn't used for delivery.  802.3ad used real_dev to figure out which
>> slave a LACPDU arrived on.  The skb->real_dev was eventually replaced
>> with the orig_dev business that's there now.
>>
>> 	Later, I did the arp_validate stuff the same way as 802.3ad
>> because it worked and was easier than registering a handler per slave.
>>
>>> If you think registering at the slave level would cause too much impact on
>>> ptype_base, then we might have another way to stop using orig_dev for
>>> bonding:
>>>
>>> In __skb_bond_should_drop(), we already test for the two interesting protocols:
>>>
>>> if ((dev->priv_flags&  IFF_SLAVE_NEEDARP)&&  skb->protocol == __cpu_to_be16(ETH_P_ARP))
>>> 	return 0;
>>>
>>> if (master->priv_flags&  IFF_MASTER_8023AD&&  skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>>> 	return 0;
>>>
>>> Would it be possible to call the right handlers directly from inside
>>> __skb_bond_should_drop() then let __skb_bond_should_drop() return 1
>>> ("should drop") after processing the frames that are only of interest for
>>> bonding?
>>
>> 	Isn't one purpose of switching to rx_handler that there won't
>> need to be any skb_bond_should_drop logic in __netif_receive_skb at all?
>
> Yes, that (hopefully most)  would be eventually removed.

The skb_bond_should_drop logic was simply moved from dev.c to 
bond_should_deliver_exact_match@bond_main.c by Jiri's patch.

But the logic remain and is necessary to decide whether we do normal delivery or only exact match 
delivery.

>> 	Still, if you're just trying to simplify __netif_receive_skb
>> first, I don't see any reason not to register the packet handlers at the
>> slave level.  Looking at the ptype_base hash, I don't think that the
>> protocols bonding is registering (ARP and SLOW) will hash collide with
>> IP or IPv6, so I suspect there won't be much impact.
>>
>> 	Once an rx_handler is used, then I suspect there's no need for
>> the packet handlers at all, since the rx_handler is within bonding and
>> can just deal with the ARP or LACPDU directly.
>
> That is very true. And given that af_packet uses orig_dev to obtain
> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
> orig_dev parameter for good.

Unfortunately, after doing some more research, I'm afraid we won't be able to suppress at least the 
ARP packet handler:

In commit 1f3c8804acba841b5573b953f5560d2683d2db0d (bonding: allow arp_ip_targets on separate vlans 
to use arp validation), Andy solved the problem of vlan on top of bonding, when the arp_ip_target is 
on one of the vlans:

eth0/eth1 -> bond0 -> bond0.100

At the time the frame is inspected by bonding, the frame is still tagged. This is true for the new 
rx_handler proposed by Jiri, and is also true for the former __skb_bond_should_drop() handling).

To receive the untagged frame, we would have to wait until the vlan code remove the tag. The current 
protocol handler seems to be the best way to catch the frame that late.

This is probably specific to ARP. I don't think SLOW frames can be tagged.

Anyway, Jay, thanks for you clarification.

> So I suggest to take V3 of my patch now and do multiple follow-on
> patches to get us where we want to get.

Agreed.

	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-27 20:06                   ` Jiri Pirko
@ 2011-02-27 20:59                     ` Nicolas de Pesloüan
  0 siblings, 0 replies; 52+ messages in thread
From: Nicolas de Pesloüan @ 2011-02-27 20:59 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Miller, kaber, eric.dumazet, netdev, shemminger, fubar, andy

Le 27/02/2011 21:06, Jiri Pirko a écrit :
> Sun, Feb 27, 2011 at 03:17:01PM CET, nicolas.2p.debian@gmail.com wrote:

>>> +	if (bond_should_deliver_exact_match(skb, slave_dev, bond_dev)) {
>>> +		skb->deliver_no_wcard = 1;
>>> +		return skb;
>>
>> Shouldn't we return NULL here ?
>
> No we shouldn't. We need sbk to be delivered to exact match.

So, if I understand properly:

- If skb->dev changed, loop,
- else, if skb->deliver_no_wcard, do exact match delivery only,
- Else, if !skb, drop the frame, without ever exact match delivery,
- Else, do normal delivery.

Right?

>> The vlan_on_bond case used to be cost effective. Now, we clone the skb and call netif_rx...
>
> This should not cost too much overhead considering only few packets are
> going thru this. This hook shouldn't have exited in the fisrt place. I
> think introducing this functionality was a big mistake.

What would you have proposed instead?

Anyway, I think the feature is broken, because it wouldn't provide the expected effect on the 
following configuration:

eth0/eth1 -> bond0 -> br0 -> br0.100.

We probably need a more general way to fix this, after your patch have been accepted.

[snip]

>> I would instead consider NULL as meaning exact-match-delivery-only.
>> (The same effect as dev_bond_should_drop() returning true).
>
> we can change the behaviour later on.

Agreed.

	Nicolas.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-27 12:58                                           ` Jiri Pirko
  2011-02-27 20:44                                             ` Nicolas de Pesloüan
@ 2011-02-27 23:22                                             ` David Miller
  2011-02-28  7:07                                               ` Jiri Pirko
  1 sibling, 1 reply; 52+ messages in thread
From: David Miller @ 2011-02-27 23:22 UTC (permalink / raw)
  To: jpirko
  Cc: fubar, nicolas.2p.debian, kaber, eric.dumazet, netdev,
	shemminger, andy, anna.fischer

From: Jiri Pirko <jpirko@redhat.com>
Date: Sun, 27 Feb 2011 13:58:17 +0100

> That is very true. And given that af_packet uses orig_dev to obtain
> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
> orig_dev parameter for good.

I would rather see a complete patch set submitting at a unit, thanks.

I've already marked your V3 last night as "changes requested" in
patchwork for this reason.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-27 23:22                                             ` David Miller
@ 2011-02-28  7:07                                               ` Jiri Pirko
  2011-02-28  7:30                                                 ` David Miller
  0 siblings, 1 reply; 52+ messages in thread
From: Jiri Pirko @ 2011-02-28  7:07 UTC (permalink / raw)
  To: David Miller
  Cc: fubar, nicolas.2p.debian, kaber, eric.dumazet, netdev,
	shemminger, andy, anna.fischer

Mon, Feb 28, 2011 at 12:22:08AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Sun, 27 Feb 2011 13:58:17 +0100
>
>> That is very true. And given that af_packet uses orig_dev to obtain
>> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
>> orig_dev parameter for good.
>
>I would rather see a complete patch set submitting at a unit, thanks.
>
>I've already marked your V3 last night as "changes requested" in
>patchwork for this reason.

That's a pity. V3 is complete patch. The changes we talk about in
discussion are just taking advantage of changes in patch V3. Would be in
my opinion better to apply V3 now and followup with the rest after that.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-28  7:07                                               ` Jiri Pirko
@ 2011-02-28  7:30                                                 ` David Miller
  2011-02-28  9:22                                                   ` Jiri Pirko
  0 siblings, 1 reply; 52+ messages in thread
From: David Miller @ 2011-02-28  7:30 UTC (permalink / raw)
  To: jpirko
  Cc: fubar, nicolas.2p.debian, kaber, eric.dumazet, netdev,
	shemminger, andy, anna.fischer

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 28 Feb 2011 08:07:33 +0100

> Mon, Feb 28, 2011 at 12:22:08AM CET, davem@davemloft.net wrote:
>>From: Jiri Pirko <jpirko@redhat.com>
>>Date: Sun, 27 Feb 2011 13:58:17 +0100
>>
>>> That is very true. And given that af_packet uses orig_dev to obtain
>>> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
>>> orig_dev parameter for good.
>>
>>I would rather see a complete patch set submitting at a unit, thanks.
>>
>>I've already marked your V3 last night as "changes requested" in
>>patchwork for this reason.
> 
> That's a pity. V3 is complete patch. The changes we talk about in
> discussion are just taking advantage of changes in patch V3. Would be in
> my opinion better to apply V3 now and followup with the rest after that.

Fair enough, I'll aply it now, thanks.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-28  7:30                                                 ` David Miller
@ 2011-02-28  9:22                                                   ` Jiri Pirko
  2011-02-28  9:35                                                     ` Eric Dumazet
  2011-02-28 18:49                                                     ` [patch net-next-2.6 V3] net: convert bonding to use rx_handler David Miller
  0 siblings, 2 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-28  9:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Mon, Feb 28, 2011 at 08:30:13AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Mon, 28 Feb 2011 08:07:33 +0100
>
>> Mon, Feb 28, 2011 at 12:22:08AM CET, davem@davemloft.net wrote:
>>>From: Jiri Pirko <jpirko@redhat.com>
>>>Date: Sun, 27 Feb 2011 13:58:17 +0100
>>>
>>>> That is very true. And given that af_packet uses orig_dev to obtain
>>>> ifindex, it can be replaced by skb->skb_iif. That way we can get rid of
>>>> orig_dev parameter for good.
>>>
>>>I would rather see a complete patch set submitting at a unit, thanks.
>>>
>>>I've already marked your V3 last night as "changes requested" in
>>>patchwork for this reason.
>> 
>> That's a pity. V3 is complete patch. The changes we talk about in
>> discussion are just taking advantage of changes in patch V3. Would be in
>> my opinion better to apply V3 now and followup with the rest after that.
>
>Fair enough, I'll aply it now, thanks.

Applied incorrectly. net/core/dev.c part is missing

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-28  9:22                                                   ` Jiri Pirko
@ 2011-02-28  9:35                                                     ` Eric Dumazet
  2011-02-28  9:55                                                       ` [patch net-next-2.6] net: convert bonding to use rx_handler - second part Jiri Pirko
  2011-02-28 18:49                                                     ` [patch net-next-2.6 V3] net: convert bonding to use rx_handler David Miller
  1 sibling, 1 reply; 52+ messages in thread
From: Eric Dumazet @ 2011-02-28  9:35 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: David Miller, netdev

Le lundi 28 février 2011 à 10:22 +0100, Jiri Pirko a écrit :

> Applied incorrectly. net/core/dev.c part is missing

Hmm, could you please provide a 2nd patch with this part ?

Thanks
 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [patch net-next-2.6] net: convert bonding to use rx_handler - second part
  2011-02-28  9:35                                                     ` Eric Dumazet
@ 2011-02-28  9:55                                                       ` Jiri Pirko
  0 siblings, 0 replies; 52+ messages in thread
From: Jiri Pirko @ 2011-02-28  9:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

This patch converts bonding to use rx_handler. Results in cleaner
__netif_receive_skb() with much less exceptions needed.

Did performance test using pktgen and counting incoming packets by
iptables. No regression noted.

Reviewed-by: Nicolas de Pesloüan <nicolas.2p.debian@free.fr>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 net/core/dev.c |  119 ++++++++++++++-----------------------------------------
 1 files changed, 31 insertions(+), 88 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 69a3c08..30440e7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3096,63 +3096,31 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
-					      struct net_device *master)
+static void vlan_on_bond_hook(struct sk_buff *skb)
 {
-	if (skb->pkt_type == PACKET_HOST) {
-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+	/*
+	 * Make sure ARP frames received on VLAN interfaces stacked on
+	 * bonding interfaces still make their way to any base bonding
+	 * device that may have registered for a specific ptype.
+	 */
+	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
+	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
+	    skb->protocol == htons(ETH_P_ARP)) {
+		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 
-		memcpy(dest, master->dev_addr, ETH_ALEN);
+		if (!skb2)
+			return;
+		skb2->dev = vlan_dev_real_dev(skb->dev);
+		netif_rx(skb2);
 	}
 }
 
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static int __skb_bond_should_drop(struct sk_buff *skb,
-				  struct net_device *master)
-{
-	struct net_device *dev = skb->dev;
-
-	if (master->priv_flags & IFF_MASTER_ARPMON)
-		dev->last_rx = jiffies;
-
-	if ((master->priv_flags & IFF_MASTER_ALB) &&
-	    (master->priv_flags & IFF_BRIDGE_PORT)) {
-		/* Do address unmangle. The local destination address
-		 * will be always the one master has. Provides the right
-		 * functionality in a bridge.
-		 */
-		skb_bond_set_mac_by_master(skb, master);
-	}
-
-	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-			return 0;
-
-		if (master->priv_flags & IFF_MASTER_ALB) {
-			if (skb->pkt_type != PACKET_BROADCAST &&
-			    skb->pkt_type != PACKET_MULTICAST)
-				return 0;
-		}
-		if (master->priv_flags & IFF_MASTER_8023AD &&
-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-			return 0;
-
-		return 1;
-	}
-	return 0;
-}
-
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
-	struct net_device *null_or_orig;
-	struct net_device *orig_or_bond;
+	struct net_device *null_or_dev;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -3167,32 +3135,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 	if (!skb->skb_iif)
 		skb->skb_iif = skb->dev->ifindex;
-
-	/*
-	 * bonding note: skbs received on inactive slaves should only
-	 * be delivered to pkt handlers that are exact matches.  Also
-	 * the deliver_no_wcard flag will be set.  If packet handlers
-	 * are sensitive to duplicate packets these skbs will need to
-	 * be dropped at the handler.
-	 */
-	null_or_orig = NULL;
 	orig_dev = skb->dev;
-	if (skb->deliver_no_wcard)
-		null_or_orig = orig_dev;
-	else if (netif_is_bond_slave(orig_dev)) {
-		struct net_device *bond_master = ACCESS_ONCE(orig_dev->master);
-
-		if (likely(bond_master)) {
-			if (__skb_bond_should_drop(skb, bond_master)) {
-				skb->deliver_no_wcard = 1;
-				/* deliver only exact match */
-				null_or_orig = orig_dev;
-			} else
-				skb->dev = bond_master;
-		}
-	}
 
-	__this_cpu_inc(softnet_data.processed);
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
 	skb->mac_len = skb->network_header - skb->mac_header;
@@ -3201,6 +3145,10 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 	rcu_read_lock();
 
+another_round:
+
+	__this_cpu_inc(softnet_data.processed);
+
 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -3209,8 +3157,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 #endif
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
-		    ptype->dev == orig_dev) {
+		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
@@ -3224,16 +3171,20 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-	/* Handle special case of bridge or macvlan */
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
+		struct net_device *prev_dev;
+
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
 		}
+		prev_dev = skb->dev;
 		skb = rx_handler(skb);
 		if (!skb)
 			goto out;
+		if (skb->dev != prev_dev)
+			goto another_round;
 	}
 
 	if (vlan_tx_tag_present(skb)) {
@@ -3248,24 +3199,16 @@ ncls:
 			goto out;
 	}
 
-	/*
-	 * Make sure frames received on VLAN interfaces stacked on
-	 * bonding interfaces still make their way to any base bonding
-	 * device that may have registered for a specific ptype.  The
-	 * handler may have to adjust skb->dev and orig_dev.
-	 */
-	orig_or_bond = orig_dev;
-	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
-	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
-		orig_or_bond = vlan_dev_real_dev(skb->dev);
-	}
+	vlan_on_bond_hook(skb);
+
+	/* deliver only exact match when indicated */
+	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
 
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype,
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type && (ptype->dev == null_or_orig ||
-		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
-		     ptype->dev == orig_or_bond)) {
+		if (ptype->type == type &&
+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
-- 
1.7.3.4


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [patch net-next-2.6 V3] net: convert bonding to use rx_handler
  2011-02-28  9:22                                                   ` Jiri Pirko
  2011-02-28  9:35                                                     ` Eric Dumazet
@ 2011-02-28 18:49                                                     ` David Miller
  1 sibling, 0 replies; 52+ messages in thread
From: David Miller @ 2011-02-28 18:49 UTC (permalink / raw)
  To: jpirko; +Cc: netdev

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 28 Feb 2011 10:22:24 +0100

> Applied incorrectly. net/core/dev.c part is missing

Fixed, sorry.

^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread, other threads:[~2011-02-28 18:49 UTC | newest]

Thread overview: 52+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-18 13:25 [patch net-next-2.6] net: convert bonding to use rx_handler Jiri Pirko
2011-02-18 13:29 ` Eric Dumazet
2011-02-18 14:14   ` Jiri Pirko
2011-02-18 14:27     ` Eric Dumazet
2011-02-18 14:46       ` Patrick McHardy
2011-02-18 14:58         ` Jiri Pirko
2011-02-18 15:50           ` Patrick McHardy
2011-02-18 16:14             ` Eric Dumazet
2011-02-18 18:47               ` Jiri Pirko
2011-02-18 19:17                 ` Eric Dumazet
2011-02-18 19:28                   ` Jiri Pirko
2011-02-18 19:58                     ` Eric Dumazet
2011-02-18 20:03                       ` Jiri Pirko
2011-02-18 20:06           ` David Miller
2011-02-18 20:13             ` Jiri Pirko
2011-02-18 20:58             ` [patch net-next-2.6 V2] " Jiri Pirko
2011-02-18 23:06               ` Jay Vosburgh
2011-02-19  7:44                 ` Jiri Pirko
2011-02-19  8:05                 ` [patch net-next-2.6 V3] " Jiri Pirko
2011-02-19  8:37                   ` Eric Dumazet
2011-02-19  8:58                     ` Jiri Pirko
2011-02-19  9:22                       ` Eric Dumazet
2011-02-19 10:56                   ` Nicolas de Pesloüan
2011-02-19 11:08                     ` Jiri Pirko
2011-02-19 11:28                       ` Jiri Pirko
2011-02-19 13:18                         ` Nicolas de Pesloüan
2011-02-19 13:46                           ` Jiri Pirko
2011-02-19 14:32                             ` Nicolas de Pesloüan
2011-02-19 20:27                             ` Nicolas de Pesloüan
2011-02-20 10:36                               ` Jiri Pirko
2011-02-20 12:12                                 ` Nicolas de Pesloüan
2011-02-20 15:07                                   ` Jiri Pirko
2011-02-21 23:20                                     ` Nicolas de Pesloüan
2011-02-26 14:24                                       ` Nicolas de Pesloüan
2011-02-26 19:42                                         ` Jay Vosburgh
2011-02-27 12:58                                           ` Jiri Pirko
2011-02-27 20:44                                             ` Nicolas de Pesloüan
2011-02-27 23:22                                             ` David Miller
2011-02-28  7:07                                               ` Jiri Pirko
2011-02-28  7:30                                                 ` David Miller
2011-02-28  9:22                                                   ` Jiri Pirko
2011-02-28  9:35                                                     ` Eric Dumazet
2011-02-28  9:55                                                       ` [patch net-next-2.6] net: convert bonding to use rx_handler - second part Jiri Pirko
2011-02-28 18:49                                                     ` [patch net-next-2.6 V3] net: convert bonding to use rx_handler David Miller
2011-02-23 19:05               ` Jiri Pirko
2011-02-25 23:46                 ` Nicolas de Pesloüan
2011-02-26  7:14                   ` Jiri Pirko
2011-02-26 11:25                     ` Nicolas de Pesloüan
2011-02-26 14:58                       ` Jiri Pirko
2011-02-27 14:17                 ` Nicolas de Pesloüan
2011-02-27 20:06                   ` Jiri Pirko
2011-02-27 20:59                     ` Nicolas de Pesloüan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.