All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-13 18:33 ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-13 18:33 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel

Hi all.

This is only a draft of patch to consult. I'm aware that it should be divided
into multiple patches. I want to know opinion from you folks.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices. Except for mode balance-alb. When you put
this kind of bond device into a bridge it will only add one of mac addresses into
a hash list of mac addresses, say X. This mac address is marked as local. But
this bonding interface also has mac address Y. Now when a packet arrives with
destination address Y, this address is not marked as local and the packet looks
like it needs to be forwarded. This packet is then lost which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.
Therefore I introduce another function pointer in struct net_device_ops -
ndo_check_mac_address. This function, when it's implemented, should check passed
mac address against the one set in device. I'm using this in bonding driver when
the bond is in mode balance-alb to walk thru all slaves and check if any of
them equals passed address.

Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
to recognize the destination mac address as local.

Please look at this and tell me what you think about it.

Thanks

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

 drivers/net/bonding/bond_alb.c  |   17 +++++++++++++++++
 drivers/net/bonding/bond_alb.h  |    1 +
 drivers/net/bonding/bond_main.c |   11 +++++++++++
 include/linux/netdevice.h       |    7 +++++++
 net/bridge/br_input.c           |    5 ++++-
 5 files changed, 40 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..b7bcee0 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,23 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	return 0;
 }
 
+int bond_alb_check_mac_address(struct net_device *bond_dev, void *addr)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave = NULL;
+	int ret = !0;
+	int i;
+
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		ret = compare_ether_addr(slave->perm_hwaddr, addr);
+		if (!ret)
+			break;
+	}
+	read_unlock(&bond->lock);
+	return ret;
+}
+
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 {
 	if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..5e39bda 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_monitor(struct work_struct *);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+int bond_alb_check_mac_address(struct net_device *bond_dev, void *addr);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
 #endif /* __BOND_ALB_H__ */
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e0578fe..fbff338 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4279,6 +4279,16 @@ unwind:
 	return res;
 }
 
+static int bond_check_mac_address(struct net_device *bond_dev, void *addr)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (bond->params.mode == BOND_MODE_ALB)
+		return bond_alb_check_mac_address(bond_dev, addr);
+
+	return compare_ether_addr(bond_dev->dev_addr, addr);
+}
+
 static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -4576,6 +4586,7 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_set_multicast_list	= bond_set_multicast_list,
 	.ndo_change_mtu		= bond_change_mtu,
 	.ndo_set_mac_address 	= bond_set_mac_address,
+	.ndo_check_mac_address 	= bond_check_mac_address,
 	.ndo_neigh_setup	= bond_neigh_setup,
 	.ndo_vlan_rx_register	= bond_vlan_rx_register,
 	.ndo_vlan_rx_add_vid 	= bond_vlan_rx_add_vid,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6593667..e75f691 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -491,6 +491,10 @@ struct netdev_queue {
  *	needs to be changed. If not this interface is not defined, the
  *	mac address can not be changed.
  *
+ * int (*ndo_check_mac_address)(struct net_device *dev, void *addr);
+ *	This function is called when the given Media Access Control address
+ *	needs to be compared to the one set to the device.
+ *
  * int (*ndo_validate_addr)(struct net_device *dev);
  *	Test if Media Access Control address is valid for the device.
  *
@@ -554,6 +558,9 @@ struct net_device_ops {
 #define HAVE_SET_MAC_ADDR
 	int			(*ndo_set_mac_address)(struct net_device *dev,
 						       void *addr);
+#define HAVE_CHECK_MAC_ADDR
+	int			(*ndo_check_mac_address)(struct net_device *dev,
+						       void *addr);
 #define HAVE_VALIDATE_ADDR
 	int			(*ndo_validate_addr)(struct net_device *dev);
 #define HAVE_PRIVATE_IOCTL
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 30b8877..b071169 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -39,6 +39,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
 {
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
 	struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+	struct net_device *dev = p->dev;
 	struct net_bridge *br;
 	struct net_bridge_fdb_entry *dst;
 	struct sk_buff *skb2;
@@ -64,7 +65,9 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	if (is_multicast_ether_addr(dest)) {
 		br->dev->stats.multicast++;
 		skb2 = skb;
-	} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
+	} else if (((dst = __br_fdb_get(br, dest)) && dst->is_local) ||
+		   (dev->netdev_ops->ndo_check_mac_address &&
+		    !dev->netdev_ops->ndo_check_mac_address(dev, (unsigned char *) dest))) {
 		skb2 = skb;
 		/* Do not forward the packet since it's local. */
 		skb = NULL;

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-13 18:33 ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-13 18:33 UTC (permalink / raw)
  To: linux-kernel; +Cc: fubar, netdev, bridge, bonding-devel, jgarzik, davem

Hi all.

This is only a draft of patch to consult. I'm aware that it should be divided
into multiple patches. I want to know opinion from you folks.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices. Except for mode balance-alb. When you put
this kind of bond device into a bridge it will only add one of mac addresses into
a hash list of mac addresses, say X. This mac address is marked as local. But
this bonding interface also has mac address Y. Now when a packet arrives with
destination address Y, this address is not marked as local and the packet looks
like it needs to be forwarded. This packet is then lost which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.
Therefore I introduce another function pointer in struct net_device_ops -
ndo_check_mac_address. This function, when it's implemented, should check passed
mac address against the one set in device. I'm using this in bonding driver when
the bond is in mode balance-alb to walk thru all slaves and check if any of
them equals passed address.

Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
to recognize the destination mac address as local.

Please look at this and tell me what you think about it.

Thanks

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

 drivers/net/bonding/bond_alb.c  |   17 +++++++++++++++++
 drivers/net/bonding/bond_alb.h  |    1 +
 drivers/net/bonding/bond_main.c |   11 +++++++++++
 include/linux/netdevice.h       |    7 +++++++
 net/bridge/br_input.c           |    5 ++++-
 5 files changed, 40 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..b7bcee0 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,23 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	return 0;
 }
 
+int bond_alb_check_mac_address(struct net_device *bond_dev, void *addr)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave = NULL;
+	int ret = !0;
+	int i;
+
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		ret = compare_ether_addr(slave->perm_hwaddr, addr);
+		if (!ret)
+			break;
+	}
+	read_unlock(&bond->lock);
+	return ret;
+}
+
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 {
 	if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..5e39bda 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_monitor(struct work_struct *);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+int bond_alb_check_mac_address(struct net_device *bond_dev, void *addr);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
 #endif /* __BOND_ALB_H__ */
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e0578fe..fbff338 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4279,6 +4279,16 @@ unwind:
 	return res;
 }
 
+static int bond_check_mac_address(struct net_device *bond_dev, void *addr)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (bond->params.mode == BOND_MODE_ALB)
+		return bond_alb_check_mac_address(bond_dev, addr);
+
+	return compare_ether_addr(bond_dev->dev_addr, addr);
+}
+
 static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -4576,6 +4586,7 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_set_multicast_list	= bond_set_multicast_list,
 	.ndo_change_mtu		= bond_change_mtu,
 	.ndo_set_mac_address 	= bond_set_mac_address,
+	.ndo_check_mac_address 	= bond_check_mac_address,
 	.ndo_neigh_setup	= bond_neigh_setup,
 	.ndo_vlan_rx_register	= bond_vlan_rx_register,
 	.ndo_vlan_rx_add_vid 	= bond_vlan_rx_add_vid,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6593667..e75f691 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -491,6 +491,10 @@ struct netdev_queue {
  *	needs to be changed. If not this interface is not defined, the
  *	mac address can not be changed.
  *
+ * int (*ndo_check_mac_address)(struct net_device *dev, void *addr);
+ *	This function is called when the given Media Access Control address
+ *	needs to be compared to the one set to the device.
+ *
  * int (*ndo_validate_addr)(struct net_device *dev);
  *	Test if Media Access Control address is valid for the device.
  *
@@ -554,6 +558,9 @@ struct net_device_ops {
 #define HAVE_SET_MAC_ADDR
 	int			(*ndo_set_mac_address)(struct net_device *dev,
 						       void *addr);
+#define HAVE_CHECK_MAC_ADDR
+	int			(*ndo_check_mac_address)(struct net_device *dev,
+						       void *addr);
 #define HAVE_VALIDATE_ADDR
 	int			(*ndo_validate_addr)(struct net_device *dev);
 #define HAVE_PRIVATE_IOCTL
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 30b8877..b071169 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -39,6 +39,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
 {
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
 	struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+	struct net_device *dev = p->dev;
 	struct net_bridge *br;
 	struct net_bridge_fdb_entry *dst;
 	struct sk_buff *skb2;
@@ -64,7 +65,9 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	if (is_multicast_ether_addr(dest)) {
 		br->dev->stats.multicast++;
 		skb2 = skb;
-	} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
+	} else if (((dst = __br_fdb_get(br, dest)) && dst->is_local) ||
+		   (dev->netdev_ops->ndo_check_mac_address &&
+		    !dev->netdev_ops->ndo_check_mac_address(dev, (unsigned char *) dest))) {
 		skb2 = skb;
 		/* Do not forward the packet since it's local. */
 		skb = NULL;

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-13 18:33 ` [Bridge] " Jiri Pirko
@ 2009-03-14  5:39   ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-03-14  5:39 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, netdev

On Fri, 13 Mar 2009 19:33:04 +0100
Jiri Pirko <jpirko@redhat.com> wrote:

> Hi all.
> 
> This is only a draft of patch to consult. I'm aware that it should be divided
> into multiple patches. I want to know opinion from you folks.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices. Except for mode balance-alb. When you put
> this kind of bond device into a bridge it will only add one of mac adresses into
> a hash list of mac addresses, say X. This mac address is marked as local. But
> this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> Therefore I introduce another function pointer in struct net_device_ops -
> ndo_check_mac_address. This function when it's implemented should check passed
> mac address against the one set in device. I'm using this in bonding driver when
> the bond is in mode balance-alb to walk thru all slaves and checking if any of
> them equals passed address.
> 
> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
> to recognize the destination mac address as local.
> 
> Please look at this and tell me what you think about it.
> 
> Thanks
> 
> Jirka
>

A better and more general way to do this is to have the dev_set_mac_address
function check the return of the notifier and unwind. Then any protocol
can easily prevent address from changing.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-14  5:39   ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-03-14  5:39 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, linux-kernel, bonding-devel, jgarzik, davem

On Fri, 13 Mar 2009 19:33:04 +0100
Jiri Pirko <jpirko@redhat.com> wrote:

> Hi all.
> 
> This is only a draft of patch to consult. I'm aware that it should be divided
> into multiple patches. I want to know opinion from you folks.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices. Except for mode balance-alb. When you put
> this kind of bond device into a bridge it will only add one of mac adresses into
> a hash list of mac addresses, say X. This mac address is marked as local. But
> this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> Therefore I introduce another function pointer in struct net_device_ops -
> ndo_check_mac_address. This function when it's implemented should check passed
> mac address against the one set in device. I'm using this in bonding driver when
> the bond is in mode balance-alb to walk thru all slaves and checking if any of
> them equals passed address.
> 
> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
> to recognize the destination mac address as local.
> 
> Please look at this and tell me what you think about it.
> 
> Thanks
> 
> Jirka
>

A better and more general way to do this is to have the dev_set_mac_address
function check the return of the notifier and unwind. Then any protocol
can easily prevent address from changing.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-14  5:39   ` [Bridge] " Stephen Hemminger
@ 2009-03-14  9:49     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-14  9:49 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar, bonding-devel

Sat, Mar 14, 2009 at 06:39:32AM CET, shemminger@linux-foundation.org wrote:
>On Fri, 13 Mar 2009 19:33:04 +0100
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>> Hi all.
>> 
>> This is only a draft of patch to consult. I'm aware that it should be divided
>> into multiple patches. I want to know opinion from you folks.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> this kind of bond device into a bridge it will only add one of mac adresses into
>> a hash list of mac addresses, say X. This mac address is marked as local. But
>> this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> Therefore I introduce another function pointer in struct net_device_ops -
>> ndo_check_mac_address. This function when it's implemented should check passed
>> mac address against the one set in device. I'm using this in bonding driver when
>> the bond is in mode balance-alb to walk thru all slaves and checking if any of
>> them equals passed address.
>> 
>> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
>> to recognize the destination mac address as local.
>> 
>> Please look at this and tell me what you think about it.
>> 
>> Thanks
>> 
>> Jirka
>>
>
>A better and more general way to do this have the dev_set_mac_address
>function check the return of the notifier and unwind. Then any protocol
>can easily prevent address from changing.

Can you please describe this thought a bit more? I can't understand it now...

Thanks

Jirka

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-14  9:49     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-14  9:49 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: fubar, netdev, bridge, linux-kernel, bonding-devel, jgarzik, davem

Sat, Mar 14, 2009 at 06:39:32AM CET, shemminger@linux-foundation.org wrote:
>On Fri, 13 Mar 2009 19:33:04 +0100
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>> Hi all.
>> 
>> This is only a draft of patch to consult. I'm aware that it should be divided
>> into multiple patches. I want to know opinion from you folks.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> this kind of bond device into a bridge it will only add one of mac adresses into
>> a hash list of mac addresses, say X. This mac address is marked as local. But
>> this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> Therefore I introduce another function pointer in struct net_device_ops -
>> ndo_check_mac_address. This function when it's implemented should check passed
>> mac address against the one set in device. I'm using this in bonding driver when
>> the bond is in mode balance-alb to walk thru all slaves and checking if any of
>> them equals passed address.
>> 
>> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
>> to recognize the destination mac address as local.
>> 
>> Please look at this and tell me what you think about it.
>> 
>> Thanks
>> 
>> Jirka
>>
>
>A better and more general way to do this have the dev_set_mac_address
>function check the return of the notifier and unwind. Then any protocol
>can easily prevent address from changing.

Can you please describe this thought a bit more? I can't understand it now...

Thanks

Jirka

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-14  9:49     ` [Bridge] " Jiri Pirko
@ 2009-03-15 23:12       ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-03-15 23:12 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar, bonding-devel

On Sat, 14 Mar 2009 10:49:11 +0100
Jiri Pirko <jpirko@redhat.com> wrote:

> Sat, Mar 14, 2009 at 06:39:32AM CET, shemminger@linux-foundation.org wrote:
> >On Fri, 13 Mar 2009 19:33:04 +0100
> >Jiri Pirko <jpirko@redhat.com> wrote:
> >
> >> Hi all.
> >> 
> >> This is only a draft of patch to consult. I'm aware that it should be divided
> >> into multiple patches. I want to know opinion from you folks.
> >> 
> >> The problem is described in following bugzilla:
> >> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> >> 
> >> Basically here's what's going on. In every mode, bonding interface uses the same
> >> mac address for all enslaved devices. Except for mode balance-alb. When you put
> >> this kind of bond device into a bridge it will only add one of mac adresses into
> >> a hash list of mac addresses, say X. This mac address is marked as local. But
> >> this bonding interface also has mac address Y. Now then packet arrives with
> >> destination address Y, this address is not marked as local and the packed looks
> >> like it needs to be forwarded. This packet is then lost which is wrong.
> >> 
> >> Notice that interfaces can be added and removed from bond while it is in bridge.
> >> Therefore I introduce another function pointer in struct net_device_ops -
> >> ndo_check_mac_address. This function when it's implemented should check passed
> >> mac address against the one set in device. I'm using this in bonding driver when
> >> the bond is in mode balance-alb to walk thru all slaves and checking if any of
> >> them equals passed address.
> >> 
> >> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
> >> to recognize the destination mac address as local.
> >> 
> >> Please look at this and tell me what you think about it.
> >> 
> >> Thanks
> >> 
> >> Jirka
> >>
> >
> >A better and more general way to do this have the dev_set_mac_address
> >function check the return of the notifier and unwind. Then any protocol
> >can easily prevent address from changing.
> 
> Can you please describe this thougth a bit more? I can't understand it now...
> 
> Thanks
> 
> Jirka

Something like this:

--- a/net/core/dev.c	2009-03-15 15:55:02.098126056 -0700
+++ b/net/core/dev.c	2009-03-15 16:02:43.999251305 -0700
@@ -3830,6 +3830,7 @@ int dev_set_mac_address(struct net_devic
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
+	char save_addr[MAX_ADDR_LEN];
 
 	if (!ops->ndo_set_mac_address)
 		return -EOPNOTSUPP;
@@ -3837,9 +3838,17 @@ int dev_set_mac_address(struct net_devic
 		return -EINVAL;
 	if (!netif_device_present(dev))
 		return -ENODEV;
+
+	memcpy(save_addr, dev->dev_addr, dev->addr_len);
 	err = ops->ndo_set_mac_address(dev, sa);
-	if (!err)
-		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	if (err)
+		return err;
+
+	err = call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	if (err) {
+		memcpy(sa->sa_data, save_addr, dev->addr_len);
+		ops->ndo_set_mac_address(dev, sa);
+	}
 	return err;
 }
 

And something like this:

--- a/drivers/net/bonding/bond_main.c	2009-03-15 16:03:53.909000973 -0700
+++ b/drivers/net/bonding/bond_main.c	2009-03-15 16:11:43.227127031 -0700
@@ -3534,6 +3534,7 @@ static int bond_slave_netdev_event(unsig
 {
 	struct net_device *bond_dev = slave_dev->master;
 	struct bonding *bond = netdev_priv(bond_dev);
+	int err;
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
@@ -3570,6 +3571,15 @@ static int bond_slave_netdev_event(unsig
 		 * servitude.
 		 */
 		break;
+	case NETDEV_CHANGEADDR:
+		if (bond->params.mode == BOND_MODE_ALB)
+			err = bond_alb_check_mac_address(bond);
+		else if (compare_ether_addr(bond_dev->dev_addr, addr) != 0)
+			err = -EINVAL;
+
+		if (err)
+			return notifier_from_errno(err);
+		break;
 	case NETDEV_CHANGENAME:
 		/*
 		 * TODO: handle changing the primary's name




^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-15 23:12       ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-03-15 23:12 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, linux-kernel, bonding-devel, jgarzik, davem

On Sat, 14 Mar 2009 10:49:11 +0100
Jiri Pirko <jpirko@redhat.com> wrote:

> Sat, Mar 14, 2009 at 06:39:32AM CET, shemminger@linux-foundation.org wrote:
> >On Fri, 13 Mar 2009 19:33:04 +0100
> >Jiri Pirko <jpirko@redhat.com> wrote:
> >
> >> Hi all.
> >> 
> >> This is only a draft of patch to consult. I'm aware that it should be divided
> >> into multiple patches. I want to know opinion from you folks.
> >> 
> >> The problem is described in following bugzilla:
> >> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> >> 
> >> Basically here's what's going on. In every mode, bonding interface uses the same
> >> mac address for all enslaved devices. Except for mode balance-alb. When you put
> >> this kind of bond device into a bridge it will only add one of mac adresses into
> >> a hash list of mac addresses, say X. This mac address is marked as local. But
> >> this bonding interface also has mac address Y. Now then packet arrives with
> >> destination address Y, this address is not marked as local and the packed looks
> >> like it needs to be forwarded. This packet is then lost which is wrong.
> >> 
> >> Notice that interfaces can be added and removed from bond while it is in bridge.
> >> Therefore I introduce another function pointer in struct net_device_ops -
> >> ndo_check_mac_address. This function when it's implemented should check passed
> >> mac address against the one set in device. I'm using this in bonding driver when
> >> the bond is in mode balance-alb to walk thru all slaves and checking if any of
> >> them equals passed address.
> >> 
> >> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
> >> to recognize the destination mac address as local.
> >> 
> >> Please look at this and tell me what you think about it.
> >> 
> >> Thanks
> >> 
> >> Jirka
> >>
> >
> >A better and more general way to do this have the dev_set_mac_address
> >function check the return of the notifier and unwind. Then any protocol
> >can easily prevent address from changing.
> 
> Can you please describe this thougth a bit more? I can't understand it now...
> 
> Thanks
> 
> Jirka

Something like this:

--- a/net/core/dev.c	2009-03-15 15:55:02.098126056 -0700
+++ b/net/core/dev.c	2009-03-15 16:02:43.999251305 -0700
@@ -3830,6 +3830,7 @@ int dev_set_mac_address(struct net_devic
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
+	char save_addr[MAX_ADDR_LEN];
 
 	if (!ops->ndo_set_mac_address)
 		return -EOPNOTSUPP;
@@ -3837,9 +3838,17 @@ int dev_set_mac_address(struct net_devic
 		return -EINVAL;
 	if (!netif_device_present(dev))
 		return -ENODEV;
+
+	memcpy(save_addr, dev->dev_addr, dev->addr_len);
 	err = ops->ndo_set_mac_address(dev, sa);
-	if (!err)
-		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	if (err)
+		return err;
+
+	err = call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	if (err) {
+		memcpy(sa->sa_data, save_addr, dev->addr_len);
+		ops->ndo_set_mac_address(dev, sa);
+	}
 	return err;
 }
 

And something like this:

--- a/drivers/net/bonding/bond_main.c	2009-03-15 16:03:53.909000973 -0700
+++ b/drivers/net/bonding/bond_main.c	2009-03-15 16:11:43.227127031 -0700
@@ -3534,6 +3534,7 @@ static int bond_slave_netdev_event(unsig
 {
 	struct net_device *bond_dev = slave_dev->master;
 	struct bonding *bond = netdev_priv(bond_dev);
+	int err;
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
@@ -3570,6 +3571,15 @@ static int bond_slave_netdev_event(unsig
 		 * servitude.
 		 */
 		break;
+	case NETDEV_CHANGEADDR:
+		if (bond->params.mode == BOND_MODE_ALB)
+			err = bond_alb_check_mac_address(bond);
+		else if (compare_ether_addr(bond_dev->dev_addr, addr) != 0)
+			err = -EINVAL;
+
+		if (err)
+			return notifier_from_errno(err);
+		break;
 	case NETDEV_CHANGENAME:
 		/*
 		 * TODO: handle changing the primary's name




^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-15 23:12       ` [Bridge] " Stephen Hemminger
@ 2009-03-16 11:11         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-16 11:11 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar, bonding-devel

Mon, Mar 16, 2009 at 12:12:17AM CET, shemminger@linux-foundation.org wrote:
>On Sat, 14 Mar 2009 10:49:11 +0100
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>> Sat, Mar 14, 2009 at 06:39:32AM CET, shemminger@linux-foundation.org wrote:
>> >On Fri, 13 Mar 2009 19:33:04 +0100
>> >Jiri Pirko <jpirko@redhat.com> wrote:
>> >
>> >> Hi all.
>> >> 
>> >> This is only a draft of patch to consult. I'm aware that it should be divided
>> >> into multiple patches. I want to know opinion from you folks.
>> >> 
>> >> The problem is described in following bugzilla:
>> >> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> >> 
>> >> Basically here's what's going on. In every mode, bonding interface uses the same
>> >> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> >> this kind of bond device into a bridge it will only add one of mac adresses into
>> >> a hash list of mac addresses, say X. This mac address is marked as local. But
>> >> this bonding interface also has mac address Y. Now then packet arrives with
>> >> destination address Y, this address is not marked as local and the packed looks
>> >> like it needs to be forwarded. This packet is then lost which is wrong.
>> >> 
>> >> Notice that interfaces can be added and removed from bond while it is in bridge.
>> >> Therefore I introduce another function pointer in struct net_device_ops -
>> >> ndo_check_mac_address. This function when it's implemented should check passed
>> >> mac address against the one set in device. I'm using this in bonding driver when
>> >> the bond is in mode balance-alb to walk thru all slaves and checking if any of
>> >> them equals passed address.
>> >> 
>> >> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
>> >> to recognize the destination mac address as local.
>> >> 
>> >> Please look at this and tell me what you think about it.
>> >> 
>> >> Thanks
>> >> 
>> >> Jirka
>> >>
>> >
>> >A better and more general way to do this have the dev_set_mac_address
>> >function check the return of the notifier and unwind. Then any protocol
>> >can easily prevent address from changing.
>> 
>> Can you please describe this thougth a bit more? I can't understand it now...
>> 
>> Thanks
>> 
>> Jirka
>
>Something like this:
>
>--- a/net/core/dev.c	2009-03-15 15:55:02.098126056 -0700
>+++ b/net/core/dev.c	2009-03-15 16:02:43.999251305 -0700
>@@ -3830,6 +3830,7 @@ int dev_set_mac_address(struct net_devic
> {
> 	const struct net_device_ops *ops = dev->netdev_ops;
> 	int err;
>+	char save_addr[MAX_ADDR_LEN];
> 
> 	if (!ops->ndo_set_mac_address)
> 		return -EOPNOTSUPP;
>@@ -3837,9 +3838,17 @@ int dev_set_mac_address(struct net_devic
> 		return -EINVAL;
> 	if (!netif_device_present(dev))
> 		return -ENODEV;
>+
>+	memcpy(save_addr, dev->dev_addr, dev->addr_len);
> 	err = ops->ndo_set_mac_address(dev, sa);
>-	if (!err)
>-		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+	if (err)
>+		return err;
>+
>+	err = call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+	if (err) {
>+		memcpy(sa->sa_data, save_addr, dev->addr_len);
>+		ops->ndo_set_mac_address(dev, sa);
>+	}
> 	return err;
> }
> 
>
>And something like this:
>
>--- a/drivers/net/bonding/bond_main.c	2009-03-15 16:03:53.909000973 -0700
>+++ b/drivers/net/bonding/bond_main.c	2009-03-15 16:11:43.227127031 -0700
>@@ -3534,6 +3534,7 @@ static int bond_slave_netdev_event(unsig
> {
> 	struct net_device *bond_dev = slave_dev->master;
> 	struct bonding *bond = netdev_priv(bond_dev);
>+	int err;
> 
> 	switch (event) {
> 	case NETDEV_UNREGISTER:
>@@ -3570,6 +3571,15 @@ static int bond_slave_netdev_event(unsig
> 		 * servitude.
> 		 */
> 		break;
>+	case NETDEV_CHANGEADDR:
>+		if (bond->params.mode == BOND_MODE_ALB)
>+			err = bond_alb_check_mac_address(bond);
>+		else if (compare_ether_addr(bond_dev->dev_addr, addr) != 0)
>+			err = -EINVAL;
>+
>+		if (err)
>+			return notifier_from_errno(err);
>+		break;
> 	case NETDEV_CHANGENAME:
> 		/*
> 		 * TODO: handle changing the primary's name
>
Yes, I think changing the MAC address of slaves should also be handled by the
bonding driver. But my patch fixes a different issue. See, unlike in any other
bonding mode, in balance-alb mode incoming packets carry multiple MAC addresses
(that of any of the enslaved devices). This causes a problem because the bridge
only recognizes one of them (the MAC of the master, which is the MAC of one of
the slaves) as local - the other MACs are not recognized as being part of the
port and are therefore handled as general MAC addresses. This is the problem.

I can see two solutions. Either like my patch, or somehow allow the bridge to
know more MAC addresses per port (maybe netdev can be changed to know more than
one MAC address).

Any thoughts?

Thanks

Jirka
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-16 11:11         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-16 11:11 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: fubar, netdev, bridge, linux-kernel, bonding-devel, jgarzik, davem

Mon, Mar 16, 2009 at 12:12:17AM CET, shemminger@linux-foundation.org wrote:
>On Sat, 14 Mar 2009 10:49:11 +0100
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>> Sat, Mar 14, 2009 at 06:39:32AM CET, shemminger@linux-foundation.org wrote:
>> >On Fri, 13 Mar 2009 19:33:04 +0100
>> >Jiri Pirko <jpirko@redhat.com> wrote:
>> >
>> >> Hi all.
>> >> 
>> >> This is only a draft of patch to consult. I'm aware that it should be divided
>> >> into multiple patches. I want to know opinion from you folks.
>> >> 
>> >> The problem is described in following bugzilla:
>> >> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> >> 
>> >> Basically here's what's going on. In every mode, bonding interface uses the same
>> >> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> >> this kind of bond device into a bridge it will only add one of mac adresses into
>> >> a hash list of mac addresses, say X. This mac address is marked as local. But
>> >> this bonding interface also has mac address Y. Now then packet arrives with
>> >> destination address Y, this address is not marked as local and the packed looks
>> >> like it needs to be forwarded. This packet is then lost which is wrong.
>> >> 
>> >> Notice that interfaces can be added and removed from bond while it is in bridge.
>> >> Therefore I introduce another function pointer in struct net_device_ops -
>> >> ndo_check_mac_address. This function when it's implemented should check passed
>> >> mac address against the one set in device. I'm using this in bonding driver when
>> >> the bond is in mode balance-alb to walk thru all slaves and checking if any of
>> >> them equals passed address.
>> >> 
>> >> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
>> >> to recognize the destination mac address as local.
>> >> 
>> >> Please look at this and tell me what you think about it.
>> >> 
>> >> Thanks
>> >> 
>> >> Jirka
>> >>
>> >
>> >A better and more general way to do this have the dev_set_mac_address
>> >function check the return of the notifier and unwind. Then any protocol
>> >can easily prevent address from changing.
>> 
>> Can you please describe this thougth a bit more? I can't understand it now...
>> 
>> Thanks
>> 
>> Jirka
>
>Something like this:
>
>--- a/net/core/dev.c	2009-03-15 15:55:02.098126056 -0700
>+++ b/net/core/dev.c	2009-03-15 16:02:43.999251305 -0700
>@@ -3830,6 +3830,7 @@ int dev_set_mac_address(struct net_devic
> {
> 	const struct net_device_ops *ops = dev->netdev_ops;
> 	int err;
>+	char save_addr[MAX_ADDR_LEN];
> 
> 	if (!ops->ndo_set_mac_address)
> 		return -EOPNOTSUPP;
>@@ -3837,9 +3838,17 @@ int dev_set_mac_address(struct net_devic
> 		return -EINVAL;
> 	if (!netif_device_present(dev))
> 		return -ENODEV;
>+
>+	memcpy(save_addr, dev->dev_addr, dev->addr_len);
> 	err = ops->ndo_set_mac_address(dev, sa);
>-	if (!err)
>-		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+	if (err)
>+		return err;
>+
>+	err = call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+	if (err) {
>+		memcpy(sa->sa_data, save_addr, dev->addr_len);
>+		ops->ndo_set_mac_address(dev, sa);
>+	}
> 	return err;
> }
> 
>
>And something like this:
>
>--- a/drivers/net/bonding/bond_main.c	2009-03-15 16:03:53.909000973 -0700
>+++ b/drivers/net/bonding/bond_main.c	2009-03-15 16:11:43.227127031 -0700
>@@ -3534,6 +3534,7 @@ static int bond_slave_netdev_event(unsig
> {
> 	struct net_device *bond_dev = slave_dev->master;
> 	struct bonding *bond = netdev_priv(bond_dev);
>+	int err;
> 
> 	switch (event) {
> 	case NETDEV_UNREGISTER:
>@@ -3570,6 +3571,15 @@ static int bond_slave_netdev_event(unsig
> 		 * servitude.
> 		 */
> 		break;
>+	case NETDEV_CHANGEADDR:
>+		if (bond->params.mode == BOND_MODE_ALB)
>+			err = bond_alb_check_mac_address(bond);
>+		else if (compare_ether_addr(bond_dev->dev_addr, addr) != 0)
>+			err = -EINVAL;
>+
>+		if (err)
>+			return notifier_from_errno(err);
>+		break;
> 	case NETDEV_CHANGENAME:
> 		/*
> 		 * TODO: handle changing the primary's name
>
Yes, I think changing the MAC address of slaves should also be handled by the
bonding driver. But my patch fixes a different issue. See, unlike in any other
bonding mode, in balance-alb mode incoming packets carry multiple MAC addresses
(that of any of the enslaved devices). This causes a problem because the bridge
only recognizes one of them (the MAC of the master, which is the MAC of one of
the slaves) as local - the other MACs are not recognized as being part of the
port and are therefore handled as general MAC addresses. This is the problem.

I can see two solutions. Either like my patch, or somehow allow the bridge to
know more MAC addresses per port (maybe netdev can be changed to know more than
one MAC address).

Any thoughts?

Thanks

Jirka
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-16 11:11         ` [Bridge] " Jiri Pirko
@ 2009-03-19  6:20           ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-19  6:20 UTC (permalink / raw)
  To: jpirko
  Cc: shemminger, linux-kernel, netdev, jgarzik, bridge, fubar, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 16 Mar 2009 12:11:28 +0100

> I can see two solutions. Either like my patch or somehow allow bridge to know
> more MAC addressses per port (maybe netdev can be changed to know more then
> one MAC address).
> 
> Any thoughts?

The netdev struct already supports having a list of multiple unicast
MAC addresses, it can probably be used and inspected for this.

I'll hold off on your patch until we make some more progress on
this discussion.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-19  6:20           ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-19  6:20 UTC (permalink / raw)
  To: jpirko; +Cc: fubar, netdev, bridge, linux-kernel, jgarzik, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 16 Mar 2009 12:11:28 +0100

> I can see two solutions. Either like my patch or somehow allow bridge to know
> more MAC addressses per port (maybe netdev can be changed to know more then
> one MAC address).
> 
> Any thoughts?

The netdev struct already supports having a list of multiple unicast
MAC addresses, it can probably be used and inspected for this.

I'll hold off on your patch until we make some more progress on
this discussion.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-19  6:20           ` [Bridge] " David Miller
@ 2009-03-19  8:44             ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-19  8:44 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, linux-kernel, netdev, jgarzik, bridge, fubar, bonding-devel

Thu, Mar 19, 2009 at 07:20:03AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Mon, 16 Mar 2009 12:11:28 +0100
>
>> I can see two solutions. Either like my patch or somehow allow bridge to know
>> more MAC addressses per port (maybe netdev can be changed to know more then
>> one MAC address).
>> 
>> Any thoughts?
>
>The netdev struct already supports having a list of multiple unicast
>MAC addresses, it can probably be used and inspected for this.
Yes, I was looking at this thing yesterday (uc_list). But this list serves
a different purpose. Do you think it would be correct to use it for this? I
would maybe like to make a new list similar to this one for our purpose
(say addr_list). I think it would be more correct.

Eventually, in the future, we could use this list as the primary place to store
device addresses instead of the dev_addr value and make it more general (as a
device may generally have more addresses). Just a thought...

>
>I'll hold off on your patch until we make some more progress on
>this discussion.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-19  8:44             ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-19  8:44 UTC (permalink / raw)
  To: David Miller; +Cc: fubar, netdev, bridge, linux-kernel, jgarzik, bonding-devel

Thu, Mar 19, 2009 at 07:20:03AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Mon, 16 Mar 2009 12:11:28 +0100
>
>> I can see two solutions. Either like my patch or somehow allow bridge to know
>> more MAC addressses per port (maybe netdev can be changed to know more then
>> one MAC address).
>> 
>> Any thoughts?
>
>The netdev struct already supports having a list of multiple unicast
>MAC addresses, it can probably be used and inspected for this.
Yes, I was looking at this thing yesterday (uc_list). But this list serves
a different purpose. Do you think it would be correct to use it for this? I
would maybe like to make a new list similar to this one for our purpose
(say addr_list). I think it would be more correct.

Eventually, in the future, we could use this list as the primary place to store
device addresses instead of the dev_addr value and make it more general (as a
device may generally have more addresses). Just a thought...

>
>I'll hold off on your patch until we make some more progress on
>this discussion.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-19  6:20           ` [Bridge] " David Miller
@ 2009-03-19  8:50             ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-19  8:50 UTC (permalink / raw)
  To: David Miller
  Cc: jpirko, shemminger, linux-kernel, netdev, jgarzik, bridge, fubar,
	bonding-devel

David Miller wrote:
> From: Jiri Pirko <jpirko@redhat.com>
> Date: Mon, 16 Mar 2009 12:11:28 +0100
> 
>> I can see two solutions. Either like my patch or somehow allow bridge to know
>> more MAC addressses per port (maybe netdev can be changed to know more then
>> one MAC address).
>>
>> Any thoughts?
> 
> The netdev struct already supports having a list of multiple unicast
> MAC addresses, it can probably be used and inspected for this.
> 
> I'll hold off on your patch until we make some more progress on
> this discussion.

 From reading the balance-alb description, I get the impression that this
mode is simply not meant to be used with bridging:

		Adaptive load balancing: includes balance-tlb plus
		receive load balancing (rlb) for IPV4 traffic, and
		does not require any special switch support.  The
		receive load balancing is achieved by ARP negotiation.
		The bonding driver intercepts the ARP Replies sent by
		the local system on their way out and overwrites the
		source hardware address with the unique hardware
		address of one of the slaves in the bond such that
		different peers use different hardware addresses for
		the server.

In any case I'd tend to say that if bond-alb mode mangles outgoing MAC
addresses, it should restore the original one for received packets
and keep the hacks local to bonding.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-19  8:50             ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-19  8:50 UTC (permalink / raw)
  To: David Miller
  Cc: fubar, jpirko, netdev, bridge, linux-kernel, jgarzik, bonding-devel

David Miller wrote:
> From: Jiri Pirko <jpirko@redhat.com>
> Date: Mon, 16 Mar 2009 12:11:28 +0100
> 
>> I can see two solutions. Either like my patch or somehow allow bridge to know
>> more MAC addressses per port (maybe netdev can be changed to know more then
>> one MAC address).
>>
>> Any thoughts?
> 
> The netdev struct already supports having a list of multiple unicast
> MAC addresses, it can probably be used and inspected for this.
> 
> I'll hold off on your patch until we make some more progress on
> this discussion.

 From reading the balance-alb description, I get the impression that this
mode is simply not meant to be used with bridging:

		Adaptive load balancing: includes balance-tlb plus
		receive load balancing (rlb) for IPV4 traffic, and
		does not require any special switch support.  The
		receive load balancing is achieved by ARP negotiation.
		The bonding driver intercepts the ARP Replies sent by
		the local system on their way out and overwrites the
		source hardware address with the unique hardware
		address of one of the slaves in the bond such that
		different peers use different hardware addresses for
		the server.

In any case I'd tend to say that if bond-alb mode mangles outgoing MAC
addresses, it should restore the original one for received packets
and keep the hacks local to bonding.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-19  8:44             ` [Bridge] " Jiri Pirko
@ 2009-03-19 10:21               ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-19 10:21 UTC (permalink / raw)
  To: jpirko
  Cc: shemminger, linux-kernel, netdev, jgarzik, bridge, fubar, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Thu, 19 Mar 2009 09:44:45 +0100

> Yes I was looking at this thing yesterday (uc_list). But this list serves
> to different purpose. Do you think that it will be correct to use it for this? I
> would maybe like to make a new list similar to this for our purpose
> (say addr_list). I think it would be more correct.

Whatever you do with that list privately inside of the bonding
driver should be fine.

It might upset something in the generic code if you don't clean
it up before deregistration of the bonding device, so just be
tidy.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-19 10:21               ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-19 10:21 UTC (permalink / raw)
  To: jpirko; +Cc: fubar, netdev, bridge, linux-kernel, jgarzik, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Thu, 19 Mar 2009 09:44:45 +0100

> Yes I was looking at this thing yesterday (uc_list). But this list serves
> to different purpose. Do you think that it will be correct to use it for this? I
> would maybe like to make a new list similar to this for our purpose
> (say addr_list). I think it would be more correct.

Whatever you do with that list privately inside of the bonding
driver should be fine.

It might upset something in the generic code if you don't clean
it up before deregistration of the bonding device, so just be
tidy.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-19 10:21               ` [Bridge] " David Miller
@ 2009-03-19 11:19                 ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-19 11:19 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, linux-kernel, netdev, jgarzik, bridge, fubar, bonding-devel

Thu, Mar 19, 2009 at 11:21:43AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Thu, 19 Mar 2009 09:44:45 +0100
>
>> Yes I was looking at this thing yesterday (uc_list). But this list serves
>> to different purpose. Do you think that it will be correct to use it for this? I
>> would maybe like to make a new list similar to this for our purpose
>> (say addr_list). I think it would be more correct.
>
>Whatever you do with that list privately inside of the bonding
>driver should be fine.
Well, I do not need it only inside the bonding driver. I want the bridge to use
this list when a device is added to it, and to get the MAC addresses from there
into its hash list (to recognize these addresses as local).
>
>It might upset something in the generic code if you don't clean
>it up before deregistration of the bonding device, so just be
>tidy.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-19 11:19                 ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-19 11:19 UTC (permalink / raw)
  To: David Miller; +Cc: fubar, netdev, bridge, linux-kernel, jgarzik, bonding-devel

Thu, Mar 19, 2009 at 11:21:43AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Thu, 19 Mar 2009 09:44:45 +0100
>
>> Yes I was looking at this thing yesterday (uc_list). But this list serves
>> to different purpose. Do you think that it will be correct to use it for this? I
>> would maybe like to make a new list similar to this for our purpose
>> (say addr_list). I think it would be more correct.
>
>Whatever you do with that list privately inside of the bonding
>driver should be fine.
Well, I do not need it only inside the bonding driver. I want the bridge to use
this list when a device is added to it, and to get the MAC addresses from there
into its hash list (to recognize these addresses as local).
>
>It might upset something in the generic code if you don't clean
>it up before deregistration of the bonding device, so just be
>tidy.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
  2009-03-19  8:50             ` [Bridge] " Patrick McHardy
@ 2009-03-19 16:31               ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-19 16:31 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, shemminger, linux-kernel, netdev, jgarzik, bridge,
	fubar, bonding-devel

Thu, Mar 19, 2009 at 09:50:03AM CET, kaber@trash.net wrote:
> David Miller wrote:
>> From: Jiri Pirko <jpirko@redhat.com>
>> Date: Mon, 16 Mar 2009 12:11:28 +0100
>>
>>> I can see two solutions. Either like my patch or somehow allow bridge to know
>>> more MAC addressses per port (maybe netdev can be changed to know more then
>>> one MAC address).
>>>
>>> Any thoughts?
>>
>> The netdev struct already supports having a list of multiple unicast
>> MAC addresses, it can probably be used and inspected for this.
>>
>> I'll hold off on your patch until we make some more progress on
>> this discussion.
>
> From reading the balance-alb description, I get the impression that this
> mode is simply not meant to be used with bridging:
>
> 		Adaptive load balancing: includes balance-tlb plus
> 		receive load balancing (rlb) for IPV4 traffic, and
> 		does not require any special switch support.  The
> 		receive load balancing is achieved by ARP negotiation.
> 		The bonding driver intercepts the ARP Replies sent by
> 		the local system on their way out and overwrites the
> 		source hardware address with the unique hardware
> 		address of one of the slaves in the bond such that
> 		different peers use different hardware addresses for
> 		the server.
>
> In any case I'd tend to say that if bond-alb mode mangles outgoing MAC
> addresses, it should restore the original one for received packets
> and keep the hacks local to bonding.

To let the bonding driver resolve this, I think some kind of hook in
netif_receive_skb() will be needed, as the bridge has for example. I would
rather do this in a more general and transparent way.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-19 16:31               ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-19 16:31 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: fubar, netdev, bridge, linux-kernel, bonding-devel, jgarzik,
	David Miller

Thu, Mar 19, 2009 at 09:50:03AM CET, kaber@trash.net wrote:
> David Miller wrote:
>> From: Jiri Pirko <jpirko@redhat.com>
>> Date: Mon, 16 Mar 2009 12:11:28 +0100
>>
>>> I can see two solutions. Either like my patch or somehow allow bridge to know
>>> more MAC addressses per port (maybe netdev can be changed to know more then
>>> one MAC address).
>>>
>>> Any thoughts?
>>
>> The netdev struct already supports having a list of multiple unicast
>> MAC addresses, it can probably be used and inspected for this.
>>
>> I'll hold off on your patch until we make some more progress on
>> this discussion.
>
> From reading the balance-alb description, I get the impression that this
> mode is simply not meant to be used with bridging:
>
> 		Adaptive load balancing: includes balance-tlb plus
> 		receive load balancing (rlb) for IPV4 traffic, and
> 		does not require any special switch support.  The
> 		receive load balancing is achieved by ARP negotiation.
> 		The bonding driver intercepts the ARP Replies sent by
> 		the local system on their way out and overwrites the
> 		source hardware address with the unique hardware
> 		address of one of the slaves in the bond such that
> 		different peers use different hardware addresses for
> 		the server.
>
> In any case I'd tend to say that if bond-alb mode mangles outgoing MAC
> addresses, it should restore the original one for received packets
> and keep the hacks local to bonding.

To let the bonding driver resolve this, I think some kind of hook in
netif_receive_skb() will be needed, as the bridge has for example. I would
rather do this in a more general and transparent way.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2
  2009-03-13 18:33 ` [Bridge] " Jiri Pirko
@ 2009-03-25 13:04   ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-25 13:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt

(resend)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same MAC address for all enslaved devices, except for mode balance-alb. When you
put this kind of bond device into a bridge, the bridge will only add one of the
MAC addresses, say X, into its hash list of MAC addresses. This MAC address is
marked as local. But this bonding interface also has MAC address Y. Now when a
packet arrives with destination address Y, this address is not marked as local
and the packet looks like it needs to be forwarded. This packet is then lost,
which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding driver without touching bridge
code, as Patrick suggested. For every frame incoming to the bonding device it
searches for the destination address in the slave list, and if any of the slave
addresses matches, it rewrites the address in the frame with the address of the
bonding master. This ensures that all frames coming through the bonding device
in alb mode have the same address.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..2838be0 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	return 0;
 }
 
+void bond_alb_change_dest(struct sk_buff *skb)
+{
+	struct net_device *bond_dev = skb->dev;
+	struct bonding *bond = netdev_priv(bond_dev);
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	struct slave *slave;
+	int i;
+
+	if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
+		return;
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {
+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+			break;
+		}
+	}
+	read_unlock(&bond->lock);
+}
+
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 {
 	if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..77f36fb 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_monitor(struct work_struct *);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
 #endif /* __BOND_ALB_H__ */
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..b62fdc4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,19 @@ unwind:
 	return res;
 }
 
+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb)
+{
+	struct net_device *bond_dev = skb->dev;
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (bond->params.mode == BOND_MODE_ALB)
+		bond_alb_change_dest(skb);
+}
+
 static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
 	register_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_register_ipv6_notifier();
 
+	bond_change_dest_hook = bond_change_dest;
+
 	goto out;
 err:
 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_unregister_ipv6_notifier();
 
+	bond_change_dest_hook = NULL;
+
 	bond_destroy_sysfs();
 
 	rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..df92b70 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
 }
 #endif
 
+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
+
 #endif /* _LINUX_BONDING_H */
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..abe68d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#else
+#define bond_change_dest_hook(skb) do {} while (0)
+#endif
+
 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
 /* These hooks defined here for ATM */
 struct net_bridge;
@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
 	if (orig_dev->master) {
-		if (skb_bond_should_drop(skb))
+		if (skb_bond_should_drop(skb)) {
 			null_or_orig = orig_dev; /* deliver only exact match */
-		else
+		} else {
 			skb->dev = orig_dev->master;
+			bond_change_dest_hook(skb);
+		}
 	}
 
 	__get_cpu_var(netdev_rx_stat).total++;

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2
@ 2009-03-25 13:04   ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-25 13:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik, davem

(resend)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode except balance-alb, the bonding
interface uses the same MAC address for all enslaved devices. When you put
this kind of bond device into a bridge, it will only add one of the MAC addresses
into a hash list of MAC addresses, say X. This MAC address is marked as local. But
this bonding interface also has MAC address Y. Now when a packet arrives with
destination address Y, this address is not marked as local and the packet looks
like it needs to be forwarded. This packet is then lost, which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding driver without touching bridge
code, as Patrick suggested. For every frame incoming to the bond, it searches for
the destination address in the slave list, and if any of the slave addresses
matches, it rewrites the address in the frame with the address of the bonding
master. This ensures that all frames coming through the bond in alb mode have the
same address.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..2838be0 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	return 0;
 }
 
+void bond_alb_change_dest(struct sk_buff *skb)
+{
+	struct net_device *bond_dev = skb->dev;
+	struct bonding *bond = netdev_priv(bond_dev);
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	struct slave *slave;
+	int i;
+
+	if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
+		return;
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {
+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+			break;
+		}
+	}
+	read_unlock(&bond->lock);
+}
+
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 {
 	if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..77f36fb 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_monitor(struct work_struct *);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
 #endif /* __BOND_ALB_H__ */
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..b62fdc4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,19 @@ unwind:
 	return res;
 }
 
+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb)
+{
+	struct net_device *bond_dev = skb->dev;
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (bond->params.mode == BOND_MODE_ALB)
+		bond_alb_change_dest(skb);
+}
+
 static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
 	register_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_register_ipv6_notifier();
 
+	bond_change_dest_hook = bond_change_dest;
+
 	goto out;
 err:
 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_unregister_ipv6_notifier();
 
+	bond_change_dest_hook = NULL;
+
 	bond_destroy_sysfs();
 
 	rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..df92b70 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
 }
 #endif
 
+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
+
 #endif /* _LINUX_BONDING_H */
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..abe68d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#else
+#define bond_change_dest_hook(skb) do {} while (0)
+#endif
+
 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
 /* These hooks defined here for ATM */
 struct net_bridge;
@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
 	if (orig_dev->master) {
-		if (skb_bond_should_drop(skb))
+		if (skb_bond_should_drop(skb)) {
 			null_or_orig = orig_dev; /* deliver only exact match */
-		else
+		} else {
 			skb->dev = orig_dev->master;
+			bond_change_dest_hook(skb);
+		}
 	}
 
 	__get_cpu_var(netdev_rx_stat).total++;

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2
  2009-03-25 13:04   ` [Bridge] " Jiri Pirko
@ 2009-03-25 13:40     ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-03-25 13:40 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt

Jiri Pirko a écrit :
> (resend)
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices. Except for mode balance-alb. When you put
> this kind of bond device into a bridge it will only add one of mac adresses into
> a hash list of mac addresses, say X. This mac address is marked as local. But
> this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> This patch solves the situation in the bonding without touching bridge code,
> as Patrick suggested. For every incoming frame to bonding it searches the
> destination address in slaves list and if any of slave addresses matches, it
> rewrites the address in frame by the adress of bonding master. This ensures that
> all frames comming thru the bonding in alb mode have the same address.
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
> index 27fb7f5..2838be0 100644
> --- a/drivers/net/bonding/bond_alb.c
> +++ b/drivers/net/bonding/bond_alb.c
> @@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
>  	return 0;
>  }
>  
> +void bond_alb_change_dest(struct sk_buff *skb)
> +{
> +	struct net_device *bond_dev = skb->dev;
> +	struct bonding *bond = netdev_priv(bond_dev);
> +	unsigned char *dest = eth_hdr(skb)->h_dest;
> +	struct slave *slave;
> +	int i;
> +
> +	if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
> +		return;
> +	read_lock(&bond->lock);


It's a pity bonding doesn't use RCU and needs this read_lock(&bond->lock)


> +	bond_for_each_slave(bond, slave, i) {
> +		if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {

compare_ether_addr() (or even better compare_ether_addr_64bits()) instead of memcmp() ?

> +			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
> +			break;
> +		}
> +	}
> +	read_unlock(&bond->lock);
> +}
> +


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2
@ 2009-03-25 13:40     ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-03-25 13:40 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, davem

Jiri Pirko a écrit :
> (resend)
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices. Except for mode balance-alb. When you put
> this kind of bond device into a bridge it will only add one of mac adresses into
> a hash list of mac addresses, say X. This mac address is marked as local. But
> this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> This patch solves the situation in the bonding without touching bridge code,
> as Patrick suggested. For every incoming frame to bonding it searches the
> destination address in slaves list and if any of slave addresses matches, it
> rewrites the address in frame by the adress of bonding master. This ensures that
> all frames comming thru the bonding in alb mode have the same address.
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
> index 27fb7f5..2838be0 100644
> --- a/drivers/net/bonding/bond_alb.c
> +++ b/drivers/net/bonding/bond_alb.c
> @@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
>  	return 0;
>  }
>  
> +void bond_alb_change_dest(struct sk_buff *skb)
> +{
> +	struct net_device *bond_dev = skb->dev;
> +	struct bonding *bond = netdev_priv(bond_dev);
> +	unsigned char *dest = eth_hdr(skb)->h_dest;
> +	struct slave *slave;
> +	int i;
> +
> +	if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
> +		return;
> +	read_lock(&bond->lock);


It's a pity bonding doesn't use RCU and needs this read_lock(&bond->lock)


> +	bond_for_each_slave(bond, slave, i) {
> +		if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {

compare_ether_addr() (or even better compare_ether_addr_64bits()) instead of memcmp() ?

> +			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
> +			break;
> +		}
> +	}
> +	read_unlock(&bond->lock);
> +}
> +


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2
  2009-03-25 13:40     ` [Bridge] " Eric Dumazet
@ 2009-03-25 14:39       ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-25 14:39 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt

Wed, Mar 25, 2009 at 02:40:43PM CET, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> (resend)
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> this kind of bond device into a bridge it will only add one of mac adresses into
>> a hash list of mac addresses, say X. This mac address is marked as local. But
>> this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> 
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>> 
>> Jirka
>> 
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>> diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>> index 27fb7f5..2838be0 100644
>> --- a/drivers/net/bonding/bond_alb.c
>> +++ b/drivers/net/bonding/bond_alb.c
>> @@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
>>  	return 0;
>>  }
>>  
>> +void bond_alb_change_dest(struct sk_buff *skb)
>> +{
>> +	struct net_device *bond_dev = skb->dev;
>> +	struct bonding *bond = netdev_priv(bond_dev);
>> +	unsigned char *dest = eth_hdr(skb)->h_dest;
>> +	struct slave *slave;
>> +	int i;
>> +
>> +	if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
>> +		return;
>> +	read_lock(&bond->lock);
>
>
>Its a pity bonding doesnt use RCU and needs this read_lock(&bond->lock)

Sure it is...
>
>
>> +	bond_for_each_slave(bond, slave, i) {
>> +		if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {
>
>compare_ether_addr() (or even better compare_ether_addr_64bits()) instead of memcmp() ?

Okay, I'll use compare_ether_addr_64bits and do the repost later on...
>
>> +			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>> +			break;
>> +		}
>> +	}
>> +	read_unlock(&bond->lock);
>> +}
>> +
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2
@ 2009-03-25 14:39       ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-25 14:39 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, davem

Wed, Mar 25, 2009 at 02:40:43PM CET, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> (resend)
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> this kind of bond device into a bridge it will only add one of mac adresses into
>> a hash list of mac addresses, say X. This mac address is marked as local. But
>> this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> 
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>> 
>> Jirka
>> 
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>> diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>> index 27fb7f5..2838be0 100644
>> --- a/drivers/net/bonding/bond_alb.c
>> +++ b/drivers/net/bonding/bond_alb.c
>> @@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
>>  	return 0;
>>  }
>>  
>> +void bond_alb_change_dest(struct sk_buff *skb)
>> +{
>> +	struct net_device *bond_dev = skb->dev;
>> +	struct bonding *bond = netdev_priv(bond_dev);
>> +	unsigned char *dest = eth_hdr(skb)->h_dest;
>> +	struct slave *slave;
>> +	int i;
>> +
>> +	if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
>> +		return;
>> +	read_lock(&bond->lock);
>
>
>Its a pity bonding doesnt use RCU and needs this read_lock(&bond->lock)

Sure it is...
>
>
>> +	bond_for_each_slave(bond, slave, i) {
>> +		if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {
>
>compare_ether_addr() (or even better compare_ether_addr_64bits()) instead of memcmp() ?

Okay, I'll use compare_ether_addr_64bits and do the repost later on...
>
>> +			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>> +			break;
>> +		}
>> +	}
>> +	read_unlock(&bond->lock);
>> +}
>> +
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
  2009-03-13 18:33 ` [Bridge] " Jiri Pirko
@ 2009-03-25 15:19   ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-25 15:19 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1

(resend, using compare_ether_addr_64bits instead of memcmp)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode except balance-alb, the bonding
interface uses the same MAC address for all enslaved devices. When you put
this kind of bond device into a bridge, it will only add one of the MAC addresses
into a hash list of MAC addresses, say X. This MAC address is marked as local. But
this bonding interface also has MAC address Y. Now when a packet arrives with
destination address Y, this address is not marked as local and the packet looks
like it needs to be forwarded. This packet is then lost, which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding driver without touching bridge
code, as Patrick suggested. For every frame incoming to the bond, it searches for
the destination address in the slave list, and if any of the slave addresses
matches, it rewrites the address in the frame with the address of the bonding
master. This ensures that all frames coming through the bond in alb mode have the
same address.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..83998f4 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	return 0;
 }
 
+void bond_alb_change_dest(struct sk_buff *skb)
+{
+	struct net_device *bond_dev = skb->dev;
+	struct bonding *bond = netdev_priv(bond_dev);
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	struct slave *slave;
+	int i;
+
+	if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
+		return;
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+			break;
+		}
+	}
+	read_unlock(&bond->lock);
+}
+
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 {
 	if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..77f36fb 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_monitor(struct work_struct *);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
 #endif /* __BOND_ALB_H__ */
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..b62fdc4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,19 @@ unwind:
 	return res;
 }
 
+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb)
+{
+	struct net_device *bond_dev = skb->dev;
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (bond->params.mode == BOND_MODE_ALB)
+		bond_alb_change_dest(skb);
+}
+
 static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
 	register_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_register_ipv6_notifier();
 
+	bond_change_dest_hook = bond_change_dest;
+
 	goto out;
 err:
 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_unregister_ipv6_notifier();
 
+	bond_change_dest_hook = NULL;
+
 	bond_destroy_sysfs();
 
 	rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..df92b70 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
 }
 #endif
 
+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
+
 #endif /* _LINUX_BONDING_H */
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..abe68d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#else
+#define bond_change_dest_hook(skb) do {} while (0)
+#endif
+
 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
 /* These hooks defined here for ATM */
 struct net_bridge;
@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
 	if (orig_dev->master) {
-		if (skb_bond_should_drop(skb))
+		if (skb_bond_should_drop(skb)) {
 			null_or_orig = orig_dev; /* deliver only exact match */
-		else
+		} else {
 			skb->dev = orig_dev->master;
+			bond_change_dest_hook(skb);
+		}
 	}
 
 	__get_cpu_var(netdev_rx_stat).total++;

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
@ 2009-03-25 15:19   ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-25 15:19 UTC (permalink / raw)
  To: linux-kernel
  Cc: fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik, dada1, davem

(resend, using compare_ether_addr_64bits instead of memcmp)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode except balance-alb, the bonding
interface uses the same MAC address for all enslaved devices. When you put
this kind of bond device into a bridge, it will only add one of the MAC addresses
into a hash list of MAC addresses, say X. This MAC address is marked as local. But
this bonding interface also has MAC address Y. Now when a packet arrives with
destination address Y, this address is not marked as local and the packet looks
like it needs to be forwarded. This packet is then lost, which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding driver without touching bridge
code, as Patrick suggested. For every frame incoming to the bond, it searches for
the destination address in the slave list, and if any of the slave addresses
matches, it rewrites the address in the frame with the address of the bonding
master. This ensures that all frames coming through the bond in alb mode have the
same address.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..83998f4 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	return 0;
 }
 
+void bond_alb_change_dest(struct sk_buff *skb)
+{
+	struct net_device *bond_dev = skb->dev;
+	struct bonding *bond = netdev_priv(bond_dev);
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	struct slave *slave;
+	int i;
+
+	if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
+		return;
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+			break;
+		}
+	}
+	read_unlock(&bond->lock);
+}
+
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 {
 	if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..77f36fb 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_monitor(struct work_struct *);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
 #endif /* __BOND_ALB_H__ */
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..b62fdc4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,19 @@ unwind:
 	return res;
 }
 
+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb)
+{
+	struct net_device *bond_dev = skb->dev;
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (bond->params.mode == BOND_MODE_ALB)
+		bond_alb_change_dest(skb);
+}
+
 static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
 	register_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_register_ipv6_notifier();
 
+	bond_change_dest_hook = bond_change_dest;
+
 	goto out;
 err:
 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_unregister_ipv6_notifier();
 
+	bond_change_dest_hook = NULL;
+
 	bond_destroy_sysfs();
 
 	rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..df92b70 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
 }
 #endif
 
+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
+
 #endif /* _LINUX_BONDING_H */
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..abe68d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#else
+#define bond_change_dest_hook(skb) do {} while (0)
+#endif
+
 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
 /* These hooks defined here for ATM */
 struct net_bridge;
@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
 	if (orig_dev->master) {
-		if (skb_bond_should_drop(skb))
+		if (skb_bond_should_drop(skb)) {
 			null_or_orig = orig_dev; /* deliver only exact match */
-		else
+		} else {
 			skb->dev = orig_dev->master;
+			bond_change_dest_hook(skb);
+		}
 	}
 
 	__get_cpu_var(netdev_rx_stat).total++;

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
  2009-03-25 15:19   ` [Bridge] " Jiri Pirko
@ 2009-03-25 16:31     ` Jay Vosburgh
  -1 siblings, 0 replies; 214+ messages in thread
From: Jay Vosburgh @ 2009-03-25 16:31 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge,
	bonding-devel, kaber, mschmidt, dada1

Jiri Pirko <jpirko@redhat.com> wrote:

>Basically here's what's going on. In every mode, bonding interface uses the same
>mac address for all enslaved devices. Except for mode balance-alb. 

	I think you mean "only balance-alb will simultaneously use
multiple MAC addresses across different slaves."  Yes?

	I ask because the active-backup mode with fail_over_mac=active
will change the bond's MAC to always be the MAC of whatever the
currently active slave is, but I don't think that will trigger the
problem you're talking about (because it'll only use one MAC at a time).

>[...] When you put
>this kind of bond device into a bridge it will only add one of mac adresses into
>a hash list of mac addresses, say X. This mac address is marked as local. But
>this bonding interface also has mac address Y. Now then packet arrives with
>destination address Y, this address is not marked as local and the packed looks
>like it needs to be forwarded. This packet is then lost which is wrong.
>
>Notice that interfaces can be added and removed from bond while it is in bridge.
>
>This patch solves the situation in the bonding without touching bridge code,
>as Patrick suggested. For every incoming frame to bonding it searches the
>destination address in slaves list and if any of slave addresses matches, it
>rewrites the address in frame by the adress of bonding master. This ensures that
>all frames comming thru the bonding in alb mode have the same address.
>
>Jirka
>
>
>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>index 27fb7f5..83998f4 100644
>--- a/drivers/net/bonding/bond_alb.c
>+++ b/drivers/net/bonding/bond_alb.c
>@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
> 	return 0;
> }
>
>+void bond_alb_change_dest(struct sk_buff *skb)
>+{
>+	struct net_device *bond_dev = skb->dev;
>+	struct bonding *bond = netdev_priv(bond_dev);
>+	unsigned char *dest = eth_hdr(skb)->h_dest;
>+	struct slave *slave;
>+	int i;
>+
>+	if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
>+		return;
>+	read_lock(&bond->lock);
>+	bond_for_each_slave(bond, slave, i) {
>+		if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
>+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>+			break;
>+		}
>+	}
>+	read_unlock(&bond->lock);
>+}
>+
> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
> {
> 	if (bond->alb_info.current_alb_vlan &&
>diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
>index 50968f8..77f36fb 100644
>--- a/drivers/net/bonding/bond_alb.h
>+++ b/drivers/net/bonding/bond_alb.h
>@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
> int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
> void bond_alb_monitor(struct work_struct *);
> int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
>+void bond_alb_change_dest(struct sk_buff *skb);
> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
> #endif /* __BOND_ALB_H__ */
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 3d76686..b62fdc4 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -4294,6 +4294,19 @@ unwind:
> 	return res;
> }
>
>+/*
>+ * Called via bond_change_dest_hook.
>+ * note: already called with rcu_read_lock (preempt_disabled)
>+ */
>+void bond_change_dest(struct sk_buff *skb)
>+{
>+	struct net_device *bond_dev = skb->dev;
>+	struct bonding *bond = netdev_priv(bond_dev);
>+
>+	if (bond->params.mode == BOND_MODE_ALB)
>+		bond_alb_change_dest(skb);
>+}
>+
> static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
> {
> 	struct bonding *bond = netdev_priv(bond_dev);
>@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
> 	register_inetaddr_notifier(&bond_inetaddr_notifier);
> 	bond_register_ipv6_notifier();
>
>+	bond_change_dest_hook = bond_change_dest;
>+
> 	goto out;
> err:
> 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
>@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
> 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
> 	bond_unregister_ipv6_notifier();
>
>+	bond_change_dest_hook = NULL;
>+
> 	bond_destroy_sysfs();
>
> 	rtnl_lock();
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index ca849d2..df92b70 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
> }
> #endif
>
>+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
>+
> #endif /* _LINUX_BONDING_H */
>
>diff --git a/net/core/dev.c b/net/core/dev.c
>index e3fe5c7..abe68d9 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
> 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> }
>
>+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
>+EXPORT_SYMBOL(bond_change_dest_hook);
>+#else
>+#define bond_change_dest_hook(skb) do {} while (0)
>+#endif
>+
> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
> /* These hooks defined here for ATM */
> struct net_bridge;
>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
> 	null_or_orig = NULL;
> 	orig_dev = skb->dev;
> 	if (orig_dev->master) {
>-		if (skb_bond_should_drop(skb))
>+		if (skb_bond_should_drop(skb)) {
> 			null_or_orig = orig_dev; /* deliver only exact match */
>-		else
>+		} else {
> 			skb->dev = orig_dev->master;
>+			bond_change_dest_hook(skb);

	Since you put the hook outside of the skb_bond_should_drop
function, does the VLAN accelerated receive path do the right thing if,
e.g., there's a VLAN on top of bonding and that VLAN is part of the
bridge?

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
@ 2009-03-25 16:31     ` Jay Vosburgh
  0 siblings, 0 replies; 214+ messages in thread
From: Jay Vosburgh @ 2009-03-25 16:31 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mschmidt, netdev, bridge, linux-kernel, bonding-devel, jgarzik,
	dada1, davem

Jiri Pirko <jpirko@redhat.com> wrote:

>Basically here's what's going on. In every mode, bonding interface uses the same
>mac address for all enslaved devices. Except for mode balance-alb. 

	I think you mean "only balance-alb will simultaneously use
multiple MAC addresses across different slaves."  Yes?

	I ask because the active-backup mode with fail_over_mac=active
will change the bond's MAC to always be the MAC of whatever the
currently active slave is, but I don't think that will trigger the
problem you're talking about (because it'll only use one MAC at a time).

>[...] When you put
>this kind of bond device into a bridge it will only add one of mac adresses into
>a hash list of mac addresses, say X. This mac address is marked as local. But
>this bonding interface also has mac address Y. Now then packet arrives with
>destination address Y, this address is not marked as local and the packed looks
>like it needs to be forwarded. This packet is then lost which is wrong.
>
>Notice that interfaces can be added and removed from bond while it is in bridge.
>
>This patch solves the situation in the bonding without touching bridge code,
>as Patrick suggested. For every incoming frame to bonding it searches the
>destination address in slaves list and if any of slave addresses matches, it
>rewrites the address in frame by the adress of bonding master. This ensures that
>all frames comming thru the bonding in alb mode have the same address.
>
>Jirka
>
>
>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>index 27fb7f5..83998f4 100644
>--- a/drivers/net/bonding/bond_alb.c
>+++ b/drivers/net/bonding/bond_alb.c
>@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
> 	return 0;
> }
>
>+void bond_alb_change_dest(struct sk_buff *skb)
>+{
>+	struct net_device *bond_dev = skb->dev;
>+	struct bonding *bond = netdev_priv(bond_dev);
>+	unsigned char *dest = eth_hdr(skb)->h_dest;
>+	struct slave *slave;
>+	int i;
>+
>+	if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
>+		return;
>+	read_lock(&bond->lock);
>+	bond_for_each_slave(bond, slave, i) {
>+		if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
>+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>+			break;
>+		}
>+	}
>+	read_unlock(&bond->lock);
>+}
>+
> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
> {
> 	if (bond->alb_info.current_alb_vlan &&
>diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
>index 50968f8..77f36fb 100644
>--- a/drivers/net/bonding/bond_alb.h
>+++ b/drivers/net/bonding/bond_alb.h
>@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
> int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
> void bond_alb_monitor(struct work_struct *);
> int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
>+void bond_alb_change_dest(struct sk_buff *skb);
> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
> #endif /* __BOND_ALB_H__ */
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 3d76686..b62fdc4 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -4294,6 +4294,19 @@ unwind:
> 	return res;
> }
>
>+/*
>+ * Called via bond_change_dest_hook.
>+ * note: already called with rcu_read_lock (preempt_disabled)
>+ */
>+void bond_change_dest(struct sk_buff *skb)
>+{
>+	struct net_device *bond_dev = skb->dev;
>+	struct bonding *bond = netdev_priv(bond_dev);
>+
>+	if (bond->params.mode == BOND_MODE_ALB)
>+		bond_alb_change_dest(skb);
>+}
>+
> static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
> {
> 	struct bonding *bond = netdev_priv(bond_dev);
>@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
> 	register_inetaddr_notifier(&bond_inetaddr_notifier);
> 	bond_register_ipv6_notifier();
>
>+	bond_change_dest_hook = bond_change_dest;
>+
> 	goto out;
> err:
> 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
>@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
> 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
> 	bond_unregister_ipv6_notifier();
>
>+	bond_change_dest_hook = NULL;
>+
> 	bond_destroy_sysfs();
>
> 	rtnl_lock();
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index ca849d2..df92b70 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
> }
> #endif
>
>+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
>+
> #endif /* _LINUX_BONDING_H */
>
>diff --git a/net/core/dev.c b/net/core/dev.c
>index e3fe5c7..abe68d9 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
> 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> }
>
>+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
>+EXPORT_SYMBOL(bond_change_dest_hook);
>+#else
>+#define bond_change_dest_hook(skb) do {} while (0)
>+#endif
>+
> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
> /* These hooks defined here for ATM */
> struct net_bridge;
>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
> 	null_or_orig = NULL;
> 	orig_dev = skb->dev;
> 	if (orig_dev->master) {
>-		if (skb_bond_should_drop(skb))
>+		if (skb_bond_should_drop(skb)) {
> 			null_or_orig = orig_dev; /* deliver only exact match */
>-		else
>+		} else {
> 			skb->dev = orig_dev->master;
>+			bond_change_dest_hook(skb);

	Since you put the hook outside of the skb_bond_should_drop
function, does the VLAN accelerated receive path do the right thing if,
e.g., there's a VLAN on top of bonding and that VLAN is part of the
bridge?

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
  2009-03-25 16:31     ` [Bridge] " Jay Vosburgh
@ 2009-03-25 17:44       ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-25 17:44 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge,
	bonding-devel, kaber, mschmidt, dada1

Wed, Mar 25, 2009 at 05:31:53PM CET, fubar@us.ibm.com wrote:
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>>Basically here's what's going on. In every mode, bonding interface uses the same
>>mac address for all enslaved devices. Except for mode balance-alb. 
>
>	I think you mean "only balance-alb will simultaneously use
>multiple MAC addresses across different slaves."  Yes?
Yes I do. I will reformulate the phrase and repost the patch if you want...
>
>	I ask because the active-backup mode with fail_over_mac=active
>will change the bond's MAC to always be the MAC of whatever the
>currently active slave is, but I don't think that will trigger the
>problem you're talking about (because it'll only use one MAC at a time).
>
Yes, this fail_over_mac is an exception. In fact I was playing with a fail_over_mac
bonding interface in a bridge and had no luck forcing a problem with two NICs.
However, with 3 NICs I managed to get it into a state of 100% packet loss. I'm going
to look at this issue later. This patch is not addressing it...

>>[...] When you put
>>this kind of bond device into a bridge it will only add one of mac adresses into
>>a hash list of mac addresses, say X. This mac address is marked as local. But
>>this bonding interface also has mac address Y. Now then packet arrives with
>>destination address Y, this address is not marked as local and the packed looks
>>like it needs to be forwarded. This packet is then lost which is wrong.
>>
>>Notice that interfaces can be added and removed from bond while it is in bridge.
>>
>>This patch solves the situation in the bonding without touching bridge code,
>>as Patrick suggested. For every incoming frame to bonding it searches the
>>destination address in slaves list and if any of slave addresses matches, it
>>rewrites the address in frame by the adress of bonding master. This ensures that
>>all frames comming thru the bonding in alb mode have the same address.
>>
>>Jirka
>>
>>
>>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>
>>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>>index 27fb7f5..83998f4 100644
>>--- a/drivers/net/bonding/bond_alb.c
>>+++ b/drivers/net/bonding/bond_alb.c
>>@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
>> 	return 0;
>> }
>>
>>+void bond_alb_change_dest(struct sk_buff *skb)
>>+{
>>+	struct net_device *bond_dev = skb->dev;
>>+	struct bonding *bond = netdev_priv(bond_dev);
>>+	unsigned char *dest = eth_hdr(skb)->h_dest;
>>+	struct slave *slave;
>>+	int i;
>>+
>>+	if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
>>+		return;
>>+	read_lock(&bond->lock);
>>+	bond_for_each_slave(bond, slave, i) {
>>+		if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
>>+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>>+			break;
>>+		}
>>+	}
>>+	read_unlock(&bond->lock);
>>+}
>>+
>> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
>> {
>> 	if (bond->alb_info.current_alb_vlan &&
>>diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
>>index 50968f8..77f36fb 100644
>>--- a/drivers/net/bonding/bond_alb.h
>>+++ b/drivers/net/bonding/bond_alb.h
>>@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
>> int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
>> void bond_alb_monitor(struct work_struct *);
>> int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
>>+void bond_alb_change_dest(struct sk_buff *skb);
>> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
>> #endif /* __BOND_ALB_H__ */
>>
>>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>>index 3d76686..b62fdc4 100644
>>--- a/drivers/net/bonding/bond_main.c
>>+++ b/drivers/net/bonding/bond_main.c
>>@@ -4294,6 +4294,19 @@ unwind:
>> 	return res;
>> }
>>
>>+/*
>>+ * Called via bond_change_dest_hook.
>>+ * note: already called with rcu_read_lock (preempt_disabled)
>>+ */
>>+void bond_change_dest(struct sk_buff *skb)
>>+{
>>+	struct net_device *bond_dev = skb->dev;
>>+	struct bonding *bond = netdev_priv(bond_dev);
>>+
>>+	if (bond->params.mode == BOND_MODE_ALB)
>>+		bond_alb_change_dest(skb);
>>+}
>>+
>> static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
>> {
>> 	struct bonding *bond = netdev_priv(bond_dev);
>>@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
>> 	register_inetaddr_notifier(&bond_inetaddr_notifier);
>> 	bond_register_ipv6_notifier();
>>
>>+	bond_change_dest_hook = bond_change_dest;
>>+
>> 	goto out;
>> err:
>> 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
>>@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
>> 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
>> 	bond_unregister_ipv6_notifier();
>>
>>+	bond_change_dest_hook = NULL;
>>+
>> 	bond_destroy_sysfs();
>>
>> 	rtnl_lock();
>>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>>index ca849d2..df92b70 100644
>>--- a/drivers/net/bonding/bonding.h
>>+++ b/drivers/net/bonding/bonding.h
>>@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
>> }
>> #endif
>>
>>+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
>>+
>> #endif /* _LINUX_BONDING_H */
>>
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index e3fe5c7..abe68d9 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
>> 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>> }
>>
>>+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>>+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
>>+EXPORT_SYMBOL(bond_change_dest_hook);
>>+#else
>>+#define bond_change_dest_hook(skb) do {} while (0)
>>+#endif
>>+
>> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
>> /* These hooks defined here for ATM */
>> struct net_bridge;
>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>> 	null_or_orig = NULL;
>> 	orig_dev = skb->dev;
>> 	if (orig_dev->master) {
>>-		if (skb_bond_should_drop(skb))
>>+		if (skb_bond_should_drop(skb)) {
>> 			null_or_orig = orig_dev; /* deliver only exact match */
>>-		else
>>+		} else {
>> 			skb->dev = orig_dev->master;
>>+			bond_change_dest_hook(skb);
>
>	Since you put the hook outside of the skb_bond_should_drop
>function, does the VLAN accelerated receive path do the right thing if,
>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>bridge?
>
>	-J
>
>---
>	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
@ 2009-03-25 17:44       ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-25 17:44 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: mschmidt, netdev, bridge, linux-kernel, bonding-devel, jgarzik,
	dada1, davem

Wed, Mar 25, 2009 at 05:31:53PM CET, fubar@us.ibm.com wrote:
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>>Basically here's what's going on. In every mode, bonding interface uses the same
>>mac address for all enslaved devices. Except for mode balance-alb. 
>
>	I think you mean "only balance-alb will simultaneously use
>multiple MAC addresses across different slaves."  Yes?
Yes I do. I will reformulate the phrase and repost the patch if you want...
>
>	I ask because the active-backup mode with fail_over_mac=active
>will change the bond's MAC to always be the MAC of whatever the
>currently active slave is, but I don't think that will trigger the
>problem you're talking about (because it'll only use one MAC at a time).
>
Yes, this fail_over_mac is an exception. In fact I was playing with a fail_over_mac
bonding interface in a bridge and had no luck forcing a problem with two NICs.
However, with 3 NICs I managed to get it into a state of 100% packet loss. I'm going
to look at this issue later. This patch is not addressing it...

>>[...] When you put
>>this kind of bond device into a bridge it will only add one of mac adresses into
>>a hash list of mac addresses, say X. This mac address is marked as local. But
>>this bonding interface also has mac address Y. Now then packet arrives with
>>destination address Y, this address is not marked as local and the packed looks
>>like it needs to be forwarded. This packet is then lost which is wrong.
>>
>>Notice that interfaces can be added and removed from bond while it is in bridge.
>>
>>This patch solves the situation in the bonding without touching bridge code,
>>as Patrick suggested. For every incoming frame to bonding it searches the
>>destination address in slaves list and if any of slave addresses matches, it
>>rewrites the address in frame by the adress of bonding master. This ensures that
>>all frames comming thru the bonding in alb mode have the same address.
>>
>>Jirka
>>
>>
>>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>
>>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>>index 27fb7f5..83998f4 100644
>>--- a/drivers/net/bonding/bond_alb.c
>>+++ b/drivers/net/bonding/bond_alb.c
>>@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
>> 	return 0;
>> }
>>
>>+void bond_alb_change_dest(struct sk_buff *skb)
>>+{
>>+	struct net_device *bond_dev = skb->dev;
>>+	struct bonding *bond = netdev_priv(bond_dev);
>>+	unsigned char *dest = eth_hdr(skb)->h_dest;
>>+	struct slave *slave;
>>+	int i;
>>+
>>+	if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
>>+		return;
>>+	read_lock(&bond->lock);
>>+	bond_for_each_slave(bond, slave, i) {
>>+		if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
>>+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>>+			break;
>>+		}
>>+	}
>>+	read_unlock(&bond->lock);
>>+}
>>+
>> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
>> {
>> 	if (bond->alb_info.current_alb_vlan &&
>>diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
>>index 50968f8..77f36fb 100644
>>--- a/drivers/net/bonding/bond_alb.h
>>+++ b/drivers/net/bonding/bond_alb.h
>>@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
>> int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
>> void bond_alb_monitor(struct work_struct *);
>> int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
>>+void bond_alb_change_dest(struct sk_buff *skb);
>> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
>> #endif /* __BOND_ALB_H__ */
>>
>>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>>index 3d76686..b62fdc4 100644
>>--- a/drivers/net/bonding/bond_main.c
>>+++ b/drivers/net/bonding/bond_main.c
>>@@ -4294,6 +4294,19 @@ unwind:
>> 	return res;
>> }
>>
>>+/*
>>+ * Called via bond_change_dest_hook.
>>+ * note: already called with rcu_read_lock (preempt_disabled)
>>+ */
>>+void bond_change_dest(struct sk_buff *skb)
>>+{
>>+	struct net_device *bond_dev = skb->dev;
>>+	struct bonding *bond = netdev_priv(bond_dev);
>>+
>>+	if (bond->params.mode == BOND_MODE_ALB)
>>+		bond_alb_change_dest(skb);
>>+}
>>+
>> static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
>> {
>> 	struct bonding *bond = netdev_priv(bond_dev);
>>@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
>> 	register_inetaddr_notifier(&bond_inetaddr_notifier);
>> 	bond_register_ipv6_notifier();
>>
>>+	bond_change_dest_hook = bond_change_dest;
>>+
>> 	goto out;
>> err:
>> 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
>>@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
>> 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
>> 	bond_unregister_ipv6_notifier();
>>
>>+	bond_change_dest_hook = NULL;
>>+
>> 	bond_destroy_sysfs();
>>
>> 	rtnl_lock();
>>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>>index ca849d2..df92b70 100644
>>--- a/drivers/net/bonding/bonding.h
>>+++ b/drivers/net/bonding/bonding.h
>>@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
>> }
>> #endif
>>
>>+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
>>+
>> #endif /* _LINUX_BONDING_H */
>>
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index e3fe5c7..abe68d9 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
>> 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>> }
>>
>>+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>>+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
>>+EXPORT_SYMBOL(bond_change_dest_hook);
>>+#else
>>+#define bond_change_dest_hook(skb) do {} while (0)
>>+#endif
>>+
>> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
>> /* These hooks defined here for ATM */
>> struct net_bridge;
>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>> 	null_or_orig = NULL;
>> 	orig_dev = skb->dev;
>> 	if (orig_dev->master) {
>>-		if (skb_bond_should_drop(skb))
>>+		if (skb_bond_should_drop(skb)) {
>> 			null_or_orig = orig_dev; /* deliver only exact match */
>>-		else
>>+		} else {
>> 			skb->dev = orig_dev->master;
>>+			bond_change_dest_hook(skb);
>
>	Since you put the hook outside of the skb_bond_should_drop
>function, does the VLAN accelerated receive path do the right thing if,
>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>bridge?
>
>	-J
>
>---
>	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
  2009-03-25 17:44       ` [Bridge] " Jiri Pirko
@ 2009-03-26  0:24         ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-26  0:24 UTC (permalink / raw)
  To: jpirko
  Cc: fubar, linux-kernel, netdev, jgarzik, shemminger, bridge,
	bonding-devel, kaber, mschmidt, dada1

From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 25 Mar 2009 18:44:05 +0100

> Wed, Mar 25, 2009 at 05:31:53PM CET, fubar@us.ibm.com wrote:
> >Jiri Pirko <jpirko@redhat.com> wrote:
> >
> >>Basically here's what's going on. In every mode, bonding interface uses the same
> >>mac address for all enslaved devices. Except for mode balance-alb. 
> >
> >	I think you mean "only balance-alb will simultaneously use
> >multiple MAC addresses across different slaves."  Yes?
> Yes I do. I will refolmulate the phrase and repost the patch if you want...

I'll let you guys discuss this some more.

It looks like we could have some more tweaks before this patch is
finalized.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
@ 2009-03-26  0:24         ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-26  0:24 UTC (permalink / raw)
  To: jpirko
  Cc: bridge, netdev, fubar, linux-kernel, mschmidt, jgarzik, dada1,
	bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 25 Mar 2009 18:44:05 +0100

> Wed, Mar 25, 2009 at 05:31:53PM CET, fubar@us.ibm.com wrote:
> >Jiri Pirko <jpirko@redhat.com> wrote:
> >
> >>Basically here's what's going on. In every mode, bonding interface uses the same
> >>mac address for all enslaved devices. Except for mode balance-alb. 
> >
> >	I think you mean "only balance-alb will simultaneously use
> >multiple MAC addresses across different slaves."  Yes?
> Yes I do. I will refolmulate the phrase and repost the patch if you want...

I'll let you guys discuss this some more.

It looks like we could have some more tweaks before this patch is
finalized.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
  2009-03-25 17:44       ` [Bridge] " Jiri Pirko
@ 2009-03-26  0:34         ` Jay Vosburgh
  -1 siblings, 0 replies; 214+ messages in thread
From: Jay Vosburgh @ 2009-03-26  0:34 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge,
	bonding-devel, kaber, mschmidt, dada1

Jiri Pirko <jpirko@redhat.com> wrote:

>Wed, Mar 25, 2009 at 05:31:53PM CET, fubar@us.ibm.com wrote:
[...]
>>> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
>>> /* These hooks defined here for ATM */
>>> struct net_bridge;
>>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>>> 	null_or_orig = NULL;
>>> 	orig_dev = skb->dev;
>>> 	if (orig_dev->master) {
>>>-		if (skb_bond_should_drop(skb))
>>>+		if (skb_bond_should_drop(skb)) {
>>> 			null_or_orig = orig_dev; /* deliver only exact match */
>>>-		else
>>>+		} else {
>>> 			skb->dev = orig_dev->master;
>>>+			bond_change_dest_hook(skb);
>>
>>	Since you put the hook outside of the skb_bond_should_drop
>>function, does the VLAN accelerated receive path do the right thing if,
>>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>>bridge?

	Jiri: not trying to be pushy, but you didn't address the above
question about the VLAN path, and I just want to make sure that you saw
it (it was at the bottom of a long email, so I fear you may not have
seen it).

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
@ 2009-03-26  0:34         ` Jay Vosburgh
  0 siblings, 0 replies; 214+ messages in thread
From: Jay Vosburgh @ 2009-03-26  0:34 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mschmidt, netdev, bridge, linux-kernel, bonding-devel, jgarzik,
	dada1, davem

Jiri Pirko <jpirko@redhat.com> wrote:

>Wed, Mar 25, 2009 at 05:31:53PM CET, fubar@us.ibm.com wrote:
[...]
>>> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
>>> /* These hooks defined here for ATM */
>>> struct net_bridge;
>>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>>> 	null_or_orig = NULL;
>>> 	orig_dev = skb->dev;
>>> 	if (orig_dev->master) {
>>>-		if (skb_bond_should_drop(skb))
>>>+		if (skb_bond_should_drop(skb)) {
>>> 			null_or_orig = orig_dev; /* deliver only exact match */
>>>-		else
>>>+		} else {
>>> 			skb->dev = orig_dev->master;
>>>+			bond_change_dest_hook(skb);
>>
>>	Since you put the hook outside of the skb_bond_should_drop
>>function, does the VLAN accelerated receive path do the right thing if,
>>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>>bridge?

	Jiri: not trying to be pushy, but you didn't address the above
question about the VLAN path, and I just want to make sure that you saw
it (it was at the bottom of a long email, so I fear you may not have
seen it).

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
  2009-03-25 16:31     ` [Bridge] " Jay Vosburgh
@ 2009-03-26 11:12       ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-26 11:12 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge,
	bonding-devel, kaber, mschmidt, dada1

Wed, Mar 25, 2009 at 05:31:53PM CET, fubar@us.ibm.com wrote:
>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>> 	null_or_orig = NULL;
>> 	orig_dev = skb->dev;
>> 	if (orig_dev->master) {
>>-		if (skb_bond_should_drop(skb))
>>+		if (skb_bond_should_drop(skb)) {
>> 			null_or_orig = orig_dev; /* deliver only exact match */
>>-		else
>>+		} else {
>> 			skb->dev = orig_dev->master;
>>+			bond_change_dest_hook(skb);
>
>	Since you put the hook outside of the skb_bond_should_drop
>function, does the VLAN accelerated receive path do the right thing if,
>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>bridge?

Don't worry :) I did not forget about this - just needed a bit time to
investigate...

Yeah, this looks like a problem. In __vlan_hwaccel_rx there is the following line:
	skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
This rewrites the dev, so later on, when netif_receive_skb is called, the hook
will not be called (because dev->master will not be set).

Ok I will put the hook inside the skb_bond_should_drop() - it seems like a
correct solution...

Thanks for pointing this out.
>
>	-J
>
>---
>	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3
@ 2009-03-26 11:12       ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-26 11:12 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: mschmidt, netdev, bridge, linux-kernel, bonding-devel, jgarzik,
	dada1, davem

Wed, Mar 25, 2009 at 05:31:53PM CET, fubar@us.ibm.com wrote:
>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>> 	null_or_orig = NULL;
>> 	orig_dev = skb->dev;
>> 	if (orig_dev->master) {
>>-		if (skb_bond_should_drop(skb))
>>+		if (skb_bond_should_drop(skb)) {
>> 			null_or_orig = orig_dev; /* deliver only exact match */
>>-		else
>>+		} else {
>> 			skb->dev = orig_dev->master;
>>+			bond_change_dest_hook(skb);
>
>	Since you put the hook outside of the skb_bond_should_drop
>function, does the VLAN accelerated receive path do the right thing if,
>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>bridge?

Don't worry :) I did not forget about this - just needed a bit time to
investigate...

Yeah, this looks like a problem. In __vlan_hwaccel_rx there is the following line:
	skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
This rewrites the dev, so later on, when netif_receive_skb is called, the hook
will not be called (because dev->master will not be set).

Ok I will put the hook inside the skb_bond_should_drop() - it seems like a
correct solution...

Thanks for pointing this out.
>
>	-J
>
>---
>	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-13 18:33 ` [Bridge] " Jiri Pirko
@ 2009-03-26 15:52   ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-26 15:52 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1

(resend, updated changelog, hook moved into skb_bond_should_drop,
skb_bond_should_drop ifdefed)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same MAC address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge, it will only add one of the MAC
addresses, say X, into a hash list of MAC addresses. This MAC address is marked
as local. But this bonding interface also has MAC address Y. Now when a packet
arrives with destination address Y, this address is not marked as local and the
packet looks like it needs to be forwarded. This packet is then lost, which is
wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding driver without touching bridge
code, as Patrick suggested. For every frame incoming to the bond, it searches
the slave list for the destination address and, if any slave address matches,
it rewrites the destination address in the frame to the address of the bonding
master. This ensures that all frames coming through the bond in alb mode have
the same address.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..b973ede 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,25 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	return 0;
 }
 
+void bond_alb_change_dest(struct sk_buff *skb, struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	struct slave *slave;
+	int i;
+
+	if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
+		return;
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+			break;
+		}
+	}
+	read_unlock(&bond->lock);
+}
+
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 {
 	if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..4924dd7 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_monitor(struct work_struct *);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
 #endif /* __BOND_ALB_H__ */
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..7c7cb81 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,18 @@ unwind:
 	return res;
 }
 
+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb, struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (bond->params.mode == BOND_MODE_ALB)
+		bond_alb_change_dest(skb, bond_dev);
+}
+
 static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5255,8 @@ static int __init bonding_init(void)
 	register_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_register_ipv6_notifier();
 
+	bond_change_dest_hook = bond_change_dest;
+
 	goto out;
 err:
 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5280,8 @@ static void __exit bonding_exit(void)
 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_unregister_ipv6_notifier();
 
+	bond_change_dest_hook = NULL;
+
 	bond_destroy_sysfs();
 
 	rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..7159483 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,8 @@ static inline void bond_unregister_ipv6_notifier(void)
 }
 #endif
 
+extern void (*bond_change_dest_hook)(struct sk_buff *skb,
+				     struct net_device *master);
+
 #endif /* _LINUX_BONDING_H */
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6593667..7af6857 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1860,6 +1860,10 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+extern void (*bond_change_dest_hook)(struct sk_buff *skb,
+				     struct net_device *master);
+
 /* On bonding slaves other than the currently active slave, suppress
  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
  * ARP on active-backup slaves with arp_validate enabled.
@@ -1876,22 +1880,31 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
 			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
 			    skb->protocol == __constant_htons(ETH_P_ARP))
-				return 0;
+				goto dont_drop;
 
 			if (master->priv_flags & IFF_MASTER_ALB) {
 				if (skb->pkt_type != PACKET_BROADCAST &&
 				    skb->pkt_type != PACKET_MULTICAST)
-					return 0;
+					goto dont_drop;
 			}
 			if (master->priv_flags & IFF_MASTER_8023AD &&
 			    skb->protocol == __constant_htons(ETH_P_SLOW))
-				return 0;
+				goto dont_drop;
 
 			return 1;
 		}
+dont_drop:
+		bond_change_dest_hook(skb, master);
 	}
+
+	return 0;
+}
+#else
+static inline int skb_bond_should_drop(struct sk_buff *skb)
+{
 	return 0;
 }
+#endif
 
 extern struct pernet_operations __net_initdata loopback_net_ops;
 #endif /* __KERNEL__ */
diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..d9b758b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,12 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb,
+			      struct net_device *master) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#endif
+
 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
 /* These hooks defined here for ATM */
 struct net_bridge;

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-26 15:52   ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-26 15:52 UTC (permalink / raw)
  To: linux-kernel
  Cc: fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik, dada1, davem

(resend, updated changelog, hook moved into skb_bond_should_drop,
skb_bond_should_drop ifdefed)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same MAC address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge, it will only add one of the MAC
addresses, say X, into a hash list of MAC addresses. This MAC address is marked
as local. But this bonding interface also has MAC address Y. Now when a packet
arrives with destination address Y, this address is not marked as local and the
packet looks like it needs to be forwarded. This packet is then lost, which is
wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding driver without touching bridge
code, as Patrick suggested. For every frame incoming to the bond, it searches
the slave list for the destination address and, if any slave address matches,
it rewrites the destination address in the frame to the address of the bonding
master. This ensures that all frames coming through the bond in alb mode have
the same address.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..b973ede 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,25 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	return 0;
 }
 
+void bond_alb_change_dest(struct sk_buff *skb, struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	struct slave *slave;
+	int i;
+
+	if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
+		return;
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
+			memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+			break;
+		}
+	}
+	read_unlock(&bond->lock);
+}
+
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 {
 	if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..4924dd7 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_monitor(struct work_struct *);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb, struct net_device *bond_dev);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
 #endif /* __BOND_ALB_H__ */
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..7c7cb81 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,18 @@ unwind:
 	return res;
 }
 
+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb, struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+
+	if (bond->params.mode == BOND_MODE_ALB)
+		bond_alb_change_dest(skb, bond_dev);
+}
+
 static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5255,8 @@ static int __init bonding_init(void)
 	register_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_register_ipv6_notifier();
 
+	bond_change_dest_hook = bond_change_dest;
+
 	goto out;
 err:
 	list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5280,8 @@ static void __exit bonding_exit(void)
 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
 	bond_unregister_ipv6_notifier();
 
+	bond_change_dest_hook = NULL;
+
 	bond_destroy_sysfs();
 
 	rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..7159483 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,8 @@ static inline void bond_unregister_ipv6_notifier(void)
 }
 #endif
 
+extern void (*bond_change_dest_hook)(struct sk_buff *skb,
+				     struct net_device *master);
+
 #endif /* _LINUX_BONDING_H */
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6593667..7af6857 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1860,6 +1860,10 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+extern void (*bond_change_dest_hook)(struct sk_buff *skb,
+				     struct net_device *master);
+
 /* On bonding slaves other than the currently active slave, suppress
  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
  * ARP on active-backup slaves with arp_validate enabled.
@@ -1876,22 +1880,31 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
 			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
 			    skb->protocol == __constant_htons(ETH_P_ARP))
-				return 0;
+				goto dont_drop;
 
 			if (master->priv_flags & IFF_MASTER_ALB) {
 				if (skb->pkt_type != PACKET_BROADCAST &&
 				    skb->pkt_type != PACKET_MULTICAST)
-					return 0;
+					goto dont_drop;
 			}
 			if (master->priv_flags & IFF_MASTER_8023AD &&
 			    skb->protocol == __constant_htons(ETH_P_SLOW))
-				return 0;
+				goto dont_drop;
 
 			return 1;
 		}
+dont_drop:
+		bond_change_dest_hook(skb, master);
 	}
+
+	return 0;
+}
+#else
+static inline int skb_bond_should_drop(struct sk_buff *skb)
+{
 	return 0;
 }
+#endif
 
 extern struct pernet_operations __net_initdata loopback_net_ops;
 #endif /* __KERNEL__ */
diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..d9b758b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,12 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb,
+			      struct net_device *master) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#endif
+
 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
 /* These hooks defined here for ATM */
 struct net_bridge;

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-26 15:52   ` [Bridge] " Jiri Pirko
@ 2009-03-27  7:38     ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-27  7:38 UTC (permalink / raw)
  To: jpirko
  Cc: linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1

From: Jiri Pirko <jpirko@redhat.com>
Date: Thu, 26 Mar 2009 16:52:06 +0100

> (resend, updated changelog, hook moved into skb_bond_should_drop,
> skb_bond_should_drop ifdefed)
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
 ...
> This patch solves the situation in the bonding without touching bridge code,
> as Patrick suggested. For every incoming frame to bonding it searches the
> destination address in slaves list and if any of slave addresses matches, it
> rewrites the address in frame by the adress of bonding master. This ensures that
> all frames comming thru the bonding in alb mode have the same address.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>


I don't like the hook, but if that's how it's best done....

Patrick, please review this.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-27  7:38     ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-27  7:38 UTC (permalink / raw)
  To: jpirko
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, jgarzik, dada1,
	bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Thu, 26 Mar 2009 16:52:06 +0100

> (resend, updated changelog, hook moved into skb_bond_should_drop,
> skb_bond_should_drop ifdefed)
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
 ...
> This patch solves the situation in the bonding without touching bridge code,
> as Patrick suggested. For every incoming frame to bonding it searches the
> destination address in slaves list and if any of slave addresses matches, it
> rewrites the address in frame by the adress of bonding master. This ensures that
> all frames comming thru the bonding in alb mode have the same address.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>


I don't like the hook, but if that's how it's best done....

Patrick, please review this.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-27  7:38     ` [Bridge] " David Miller
@ 2009-03-27  7:46       ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-27  7:46 UTC (permalink / raw)
  To: David Miller
  Cc: linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1

Fri, Mar 27, 2009 at 08:38:19AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Thu, 26 Mar 2009 16:52:06 +0100
>
>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>> skb_bond_should_drop ifdefed)
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> ...
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>
>I don't like the hook, but if that's how it's best done....

Yes, I agree with you, but I think that for now it's the best way to do this. I
picked this solution out of the 3 that I had in mind, and this is the lesser evil :)
If anyone has any other solution, please speak up.

>
>Patrick, please review this.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-27  7:46       ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-27  7:46 UTC (permalink / raw)
  To: David Miller
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, jgarzik, dada1,
	bonding-devel

Fri, Mar 27, 2009 at 08:38:19AM CET, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Thu, 26 Mar 2009 16:52:06 +0100
>
>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>> skb_bond_should_drop ifdefed)
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> ...
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>
>I don't like the hook, but if that's how it's best done....

Yes, I agree with you, but I think that for now it's the best way to do this. I
picked this solution out of the 3 that I had in mind, and this is the lesser evil :)
If anyone has any other solution, please speak up.

>
>Patrick, please review this.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-27  7:38     ` [Bridge] " David Miller
@ 2009-03-27  7:53       ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-27  7:53 UTC (permalink / raw)
  To: David Miller
  Cc: jpirko, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, mschmidt, dada1

David Miller wrote:
> From: Jiri Pirko <jpirko@redhat.com>
> Date: Thu, 26 Mar 2009 16:52:06 +0100
> 
>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>> skb_bond_should_drop ifdefed)
>>
>> Hi all.
>>
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>  ...
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>>
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> 
> I don't like the hook, but if that's how it's best done....
> 
> Patrick, please review this.

Me neither, but I don't think this approach can be done without the
hook. While I still find it questionable whether this mode really
needs to be supported for a bridge at all, an alternative approach
would be to have bonding add FDB entries for all secondary MACs to
make bridging treat them as local.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-27  7:53       ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-27  7:53 UTC (permalink / raw)
  To: David Miller
  Cc: fubar, jpirko, netdev, bridge, linux-kernel, mschmidt, dada1,
	jgarzik, bonding-devel

David Miller wrote:
> From: Jiri Pirko <jpirko@redhat.com>
> Date: Thu, 26 Mar 2009 16:52:06 +0100
> 
>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>> skb_bond_should_drop ifdefed)
>>
>> Hi all.
>>
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>  ...
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>>
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> 
> I don't like the hook, but if that's how it's best done....
> 
> Patrick, please review this.

Me neither, but I don't think this approach can be done without the
hook. While I still find it questionable whether this mode really
needs to be supported for a bridge at all, an alternative approach
would be to have bonding add FDB entries for all secondary MACs to
make bridging treat them as local.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-27  7:53       ` [Bridge] " Patrick McHardy
@ 2009-03-27  8:41         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-27  8:41 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, linux-kernel, netdev, jgarzik, shemminger, bridge,
	fubar, bonding-devel, mschmidt, dada1

Fri, Mar 27, 2009 at 08:53:13AM CET, kaber@trash.net wrote:
> David Miller wrote:
>> From: Jiri Pirko <jpirko@redhat.com>
>> Date: Thu, 26 Mar 2009 16:52:06 +0100
>>
>>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>>> skb_bond_should_drop ifdefed)
>>>
>>> Hi all.
>>>
>>> The problem is described in following bugzilla:
>>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>>  ...
>>> This patch solves the situation in the bonding without touching bridge code,
>>> as Patrick suggested. For every incoming frame to bonding it searches the
>>> destination address in slaves list and if any of slave addresses matches, it
>>> rewrites the address in frame by the adress of bonding master. This ensures that
>>> all frames comming thru the bonding in alb mode have the same address.
>>>
>>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>
>>
>> I don't like the hook, but if that's how it's best done....
>>
>> Patrick, please review this.
>
> Me neither, but I don't think this approach can be done without the
> hook. While I still find it questionable whether this mode really
> needs to be supported for a bridge at all

Well, I think there is nothing unusual in this network setup. And with, for
example, the increasing number of kvm/bridging setups, it will be needed more
and more.

> , an alternative approach
> would be to have bonding add FDB entries for all secondary MACs to
> make bridging treat them as local.

Yes - that is the clear way. But there's not really a straightforward way to do
this. The clear approach would be to extend struct net_device with a list of these
MAC addresses and let the drivers (bonding) fill it and the bridge process it.
But I don't know.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-27  8:41         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-27  8:41 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, dada1, David Miller

Fri, Mar 27, 2009 at 08:53:13AM CET, kaber@trash.net wrote:
> David Miller wrote:
>> From: Jiri Pirko <jpirko@redhat.com>
>> Date: Thu, 26 Mar 2009 16:52:06 +0100
>>
>>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>>> skb_bond_should_drop ifdefed)
>>>
>>> Hi all.
>>>
>>> The problem is described in following bugzilla:
>>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>>  ...
>>> This patch solves the situation in the bonding without touching bridge code,
>>> as Patrick suggested. For every incoming frame to bonding it searches the
>>> destination address in slaves list and if any of slave addresses matches, it
>>> rewrites the address in frame by the adress of bonding master. This ensures that
>>> all frames comming thru the bonding in alb mode have the same address.
>>>
>>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>
>>
>> I don't like the hook, but if that's how it's best done....
>>
>> Patrick, please review this.
>
> Me neither, but I don't think this approach can be done without the
> hook. While I still find it questionable whether this mode really
> needs to be supported for a bridge at all

Well, I think there is nothing unusual in this network setup. And with, for
example, the increasing number of kvm/bridging setups, it will be needed more
and more.

> , an alternative approach
> would be to have bonding add FDB entries for all secondary MACs to
> make bridging treat them as local.

Yes - that is the clear way. But there's not really a straightforward way to do
this. The clear approach would be to extend struct net_device with a list of these
MAC addresses and let the drivers (bonding) fill it and the bridge process it.
But I don't know.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-27  8:41         ` [Bridge] " Jiri Pirko
@ 2009-03-27  8:55           ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-27  8:55 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Miller, linux-kernel, netdev, jgarzik, shemminger, bridge,
	fubar, bonding-devel, mschmidt, dada1

Jiri Pirko wrote:
> Fri, Mar 27, 2009 at 08:53:13AM CET, kaber@trash.net wrote:
>> >
>> > Me neither, but I don't think this approach can be done without the
>> > hook. While I still find it questionable whether this mode really
>> > needs to be supported for a bridge at all
> 
> Well there is I think nothing unusual in this net scheme. And by for example
> the increasing setups with kvm/bridging it will be needed more and more.

Mangling ARP packets for load-balancing purposes seems quite unusual.

>> , an alternative approach
>> would be to have bonding add FDB entries for all secondary MACs to
>> make bridging treat them as local.
> 
> Yes - that is the clear way. But there's not really straihtforward way to do
> this. The clear approach would be to extend struct net_device for list of these
> mac addresses and let the drivers (binding) fill it and bridge to process it.
> But I don't know.

We have a list of secondary unicast addresses, but that might not
be suitable in this case since the addresses are (mostly) intended
not to be visible to the stack if I understood correctly.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-27  8:55           ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-27  8:55 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, dada1, David Miller

Jiri Pirko wrote:
> Fri, Mar 27, 2009 at 08:53:13AM CET, kaber@trash.net wrote:
>> >
>> > Me neither, but I don't think this approach can be done without the
>> > hook. While I still find it questionable whether this mode really
>> > needs to be supported for a bridge at all
> 
> Well there is I think nothing unusual in this net scheme. And by for example
> the increasing setups with kvm/bridging it will be needed more and more.

Mangling ARP packets for load-balancing purposes seems quite unusual.

>> , an alternative approach
>> would be to have bonding add FDB entries for all secondary MACs to
>> make bridging treat them as local.
> 
> Yes - that is the clear way. But there's not really straihtforward way to do
> this. The clear approach would be to extend struct net_device for list of these
> mac addresses and let the drivers (binding) fill it and bridge to process it.
> But I don't know.

We have a list of secondary unicast addresses, but that might not
be suitable in this case since the addresses are (mostly) intended
not to be visible to the stack if I understood correctly.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-27  8:55           ` [Bridge] " Patrick McHardy
@ 2009-03-27  9:47             ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-27  9:47 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, linux-kernel, netdev, jgarzik, shemminger, bridge,
	fubar, bonding-devel, mschmidt, dada1

Fri, Mar 27, 2009 at 09:55:39AM CET, kaber@trash.net wrote:
> Jiri Pirko wrote:
>> Fri, Mar 27, 2009 at 08:53:13AM CET, kaber@trash.net wrote:
>>> >
>>> > Me neither, but I don't think this approach can be done without the
>>> > hook. While I still find it questionable whether this mode really
>>> > needs to be supported for a bridge at all
>>
>> Well there is I think nothing unusual in this net scheme. And by for example
>> the increasing setups with kvm/bridging it will be needed more and more.
>
> Mangling ARP packets for load-balancing purposes seems quite unusual.

Well, there are many unusual things; that does not imply that they should not be
supported...

>>> , an alternative approach
>>> would be to have bonding add FDB entries for all secondary MACs to
>>> make bridging treat them as local.
>>
>> Yes - that is the clear way. But there's not really straihtforward way to do
>> this. The clear approach would be to extend struct net_device for list of these
>> mac addresses and let the drivers (binding) fill it and bridge to process it.
>> But I don't know.
>
> We have a list of secondary unicast addresses, but that might not
> be suitable in this case since the addresses are (mostly) intended
> not to be visible to the stack if I understood correctly.

I agree this list is not suitable for this - it's used for a different purpose and
I think it would not be wise to mix it with what we want...

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-27  9:47             ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-27  9:47 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, dada1, David Miller

Fri, Mar 27, 2009 at 09:55:39AM CET, kaber@trash.net wrote:
> Jiri Pirko wrote:
>> Fri, Mar 27, 2009 at 08:53:13AM CET, kaber@trash.net wrote:
>>> >
>>> > Me neither, but I don't think this approach can be done without the
>>> > hook. While I still find it questionable whether this mode really
>>> > needs to be supported for a bridge at all
>>
>> Well there is I think nothing unusual in this net scheme. And by for example
>> the increasing setups with kvm/bridging it will be needed more and more.
>
> Mangling ARP packets for load-balancing purposes seems quite unusual.

Well, there are many unusual things; that does not imply that they should not be
supported...

>>> , an alternative approach
>>> would be to have bonding add FDB entries for all secondary MACs to
>>> make bridging treat them as local.
>>
>> Yes - that is the clear way. But there's not really straihtforward way to do
>> this. The clear approach would be to extend struct net_device for list of these
>> mac addresses and let the drivers (binding) fill it and bridge to process it.
>> But I don't know.
>
> We have a list of secondary unicast addresses, but that might not
> be suitable in this case since the addresses are (mostly) intended
> not to be visible to the stack if I understood correctly.

I agree this list is not suitable for this - it's used for a different purpose and
I think it would not be wise to mix it with what we want...

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-27  7:53       ` [Bridge] " Patrick McHardy
@ 2009-03-29 20:53         ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-29 20:53 UTC (permalink / raw)
  To: kaber
  Cc: jpirko, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, mschmidt, dada1

From: Patrick McHardy <kaber@trash.net>
Date: Fri, 27 Mar 2009 08:53:13 +0100

> David Miller wrote:
> > I don't like the hook, but if that's how it's best done....
> > Patrick, please review this.
> 
> Me neither, but I don't think this approach can be done without the
> hook. While I still find it questionable whether this mode really
> needs to be supported for a bridge at all, an alternative approach
> would be to have bonding add FDB entries for all secondary MACs to
> make bridging treat them as local.

Do you guys foresee any possibility of an alternative implementation
any time soon?

Otherwise we're just stalling by not putting something into the tree,
and as far as I can tell this patch here might as well be it.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-29 20:53         ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-03-29 20:53 UTC (permalink / raw)
  To: kaber
  Cc: fubar, jpirko, netdev, bridge, linux-kernel, mschmidt, dada1,
	jgarzik, bonding-devel

From: Patrick McHardy <kaber@trash.net>
Date: Fri, 27 Mar 2009 08:53:13 +0100

> David Miller wrote:
> > I don't like the hook, but if that's how it's best done....
> > Patrick, please review this.
> 
> Me neither, but I don't think this approach can be done without the
> hook. While I still find it questionable whether this mode really
> needs to be supported for a bridge at all, an alternative approach
> would be to have bonding add FDB entries for all secondary MACs to
> make bridging treat them as local.

Do you guys foresee any possibility of an alternative implementation
any time soon?

Otherwise we're just stalling by not putting something into the tree,
and as far as I can tell this patch here might as well be it.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-29 20:53         ` [Bridge] " David Miller
@ 2009-03-30 12:04           ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-30 12:04 UTC (permalink / raw)
  To: David Miller
  Cc: jpirko, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, mschmidt, dada1

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Fri, 27 Mar 2009 08:53:13 +0100
> 
>> ... an alternative approach
>> would be to have bonding add FDB entries for all secondary MACs to
>> make bridging treat them as local.
> 
> Do you guys foresee any possibility of an alternative implementation
> any time soon?
> 
> Otherwise we're just stalling by not putting something into the tree,
> and as far as I can tell this patch here might as well be it.

Adding bridge FDB entries seems like the best fix. It might
need some minor ugliness to avoid new dependencies between
bonding and bridging, but it definitely beats having new hooks
in the core in my opinion.

But I have no idea whether Jiri is actually implementing this.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-30 12:04           ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-30 12:04 UTC (permalink / raw)
  To: David Miller
  Cc: fubar, jpirko, netdev, bridge, linux-kernel, mschmidt, dada1,
	jgarzik, bonding-devel

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Fri, 27 Mar 2009 08:53:13 +0100
> 
>> ... an alternative approach
>> would be to have bonding add FDB entries for all secondary MACs to
>> make bridging treat them as local.
> 
> Do you guys foresee any possibility of an alternative implementation
> any time soon?
> 
> Otherwise we're just stalling by not putting something into the tree,
> and as far as I can tell this patch here might as well be it.

Adding bridge FDB entries seems like the best fix. It might
need some minor ugliness to avoid new dependencies between
bonding and bridging, but it definitely beats having new hooks
in the core in my opinion.

But I have no idea whether Jiri is actually implementing this.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-30 12:04           ` [Bridge] " Patrick McHardy
@ 2009-03-30 12:40             ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-30 12:40 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, linux-kernel, netdev, jgarzik, shemminger, bridge,
	fubar, bonding-devel, mschmidt, dada1

Mon, Mar 30, 2009 at 02:04:25PM CEST, kaber@trash.net wrote:
> David Miller wrote:
>> From: Patrick McHardy <kaber@trash.net>
>> Date: Fri, 27 Mar 2009 08:53:13 +0100
>>
>>> ... an alternative approach
>>> would be to have bonding add FDB entries for all secondary MACs to
>>> make bridging treat them as local.
>>
>> Do you guys foresee any possibility of an alternative implementation
>> any time soon?
>>
>> Otherwise we're just stalling by not putting something into the tree,
>> and as far as I can tell this patch here might as well be it.
>
> Adding bridge FDB entries seems like the best fix. It might
> need some minor ugliness to avoid new dependencies between
> bonding and bridging, but it definitely beats having new hooks
> in the core in my opinion.

Agree with this.
>
> But I have no idea whether Jiri is actually implementing this.

I'm currently thinking about how to do it. What I have in mind:
I would like to add a list to struct net_device to contain all mac addresses
of the device. I would also like to handle them through an interface similar to
the current one for uc_list and mc_list. However I do not like that these lists
do not use the standard list_head but are proprietary lists used only for this
purpose. I'm thinking about converting them to use list_head first. Or maybe
ignore them and do the new list for macs in parallel?

Then we can fill this list with macs in bonding driver and let bridge check it
and make fdb entries.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-30 12:40             ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-30 12:40 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, dada1, David Miller

Mon, Mar 30, 2009 at 02:04:25PM CEST, kaber@trash.net wrote:
> David Miller wrote:
>> From: Patrick McHardy <kaber@trash.net>
>> Date: Fri, 27 Mar 2009 08:53:13 +0100
>>
>>> ... an alternative approach
>>> would be to have bonding add FDB entries for all secondary MACs to
>>> make bridging treat them as local.
>>
>> Do you guys foresee any possibility of an alternative implementation
>> any time soon?
>>
>> Otherwise we're just stalling by not putting something into the tree,
>> and as far as I can tell this patch here might as well be it.
>
> Adding bridge FDB entries seems like the best fix. It might
> need some minor ugliness to avoid new dependencies between
> bonding and bridging, but it definitely beats having new hooks
> in the core in my opinion.

Agree with this.
>
> But I have no idea whether Jiri is actually implementing this.

I'm currently thinking about how to do it. What I have in mind:
I would like to add a list to struct net_device to contain all mac addresses
of the device. I would also like to handle them through an interface similar to
the current one for uc_list and mc_list. However I do not like that these lists
do not use the standard list_head but are proprietary lists used only for this
purpose. I'm thinking about converting them to use list_head first. Or maybe
ignore them and do the new list for macs in parallel?

Then we can fill this list with macs in bonding driver and let bridge check it
and make fdb entries.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-30 12:40             ` [Bridge] " Jiri Pirko
@ 2009-03-30 12:47               ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-30 12:47 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Miller, linux-kernel, netdev, jgarzik, shemminger, bridge,
	fubar, bonding-devel, mschmidt, dada1

Jiri Pirko wrote:
> Currently I'm thinking the way. What I have on mind:
> I would like to add a list into struct net_device to contain all mac addresses
> of the device. I would also like to use similar interface to handle them as
> currently is for uc_list and mc_list. However I do not like that these lists are
> not using standard list_head but they are propriate lists only for this purpose.
> I'm thinking about converting them to use list_head first. Or maybe ignore them
> and do the new list for macs in parallel?

Using list_heads in the address lists would require some pretty large
amount of work since you'd need to convert all the drivers. I'm all
in favour of doing this, but I wouldn't make the fix depend on that
work.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-30 12:47               ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-30 12:47 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, dada1, David Miller

Jiri Pirko wrote:
> Currently I'm thinking the way. What I have on mind:
> I would like to add a list into struct net_device to contain all mac addresses
> of the device. I would also like to use similar interface to handle them as
> currently is for uc_list and mc_list. However I do not like that these lists are
> not using standard list_head but they are propriate lists only for this purpose.
> I'm thinking about converting them to use list_head first. Or maybe ignore them
> and do the new list for macs in parallel?

Using list_heads in the address lists would require some pretty large
amount of work since you'd need to convert all the drivers. I'm all
in favour of doing this, but I wouldn't make the fix depend on that
work.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-30 12:47               ` [Bridge] " Patrick McHardy
@ 2009-03-30 12:52                 ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-30 12:52 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, linux-kernel, netdev, jgarzik, shemminger, bridge,
	fubar, bonding-devel, mschmidt, dada1

Mon, Mar 30, 2009 at 02:47:59PM CEST, kaber@trash.net wrote:
> Jiri Pirko wrote:
>> Currently I'm thinking the way. What I have on mind:
>> I would like to add a list into struct net_device to contain all mac addresses
>> of the device. I would also like to use similar interface to handle them as
>> currently is for uc_list and mc_list. However I do not like that these lists are
>> not using standard list_head but they are propriate lists only for this purpose.
>> I'm thinking about converting them to use list_head first. Or maybe ignore them
>> and do the new list for macs in parallel?
>
> Using list_heads in the address lists would require some pretty large
> amount of work since you'd need to convert all the drivers. 

Yes, I'm aware of it...
> I'm all
> in favour of doing this, but I wouldn't make the fix depend on that
> work.

ok so you are suggesting to use the current list struct?
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-30 12:52                 ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-03-30 12:52 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, dada1, David Miller

Mon, Mar 30, 2009 at 02:47:59PM CEST, kaber@trash.net wrote:
> Jiri Pirko wrote:
>> Currently I'm thinking the way. What I have on mind:
>> I would like to add a list into struct net_device to contain all mac addresses
>> of the device. I would also like to use similar interface to handle them as
>> currently is for uc_list and mc_list. However I do not like that these lists are
>> not using standard list_head but they are propriate lists only for this purpose.
>> I'm thinking about converting them to use list_head first. Or maybe ignore them
>> and do the new list for macs in parallel?
>
> Using list_heads in the address lists would require some pretty large
> amount of work since you'd need to convert all the drivers. 

Yes, I'm aware of it...
> I'm all
> in favour of doing this, but I wouldn't make the fix depend on that
> work.

ok so you are suggesting to use the current list struct?
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
  2009-03-30 12:52                 ` [Bridge] " Jiri Pirko
@ 2009-03-30 12:58                   ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-30 12:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: David Miller, linux-kernel, netdev, jgarzik, shemminger, bridge,
	fubar, bonding-devel, mschmidt, dada1

Jiri Pirko wrote:
> Mon, Mar 30, 2009 at 02:47:59PM CEST, kaber@trash.net wrote:
>> Jiri Pirko wrote:
>>> Currently I'm thinking the way. What I have on mind:
>>> I would like to add a list into struct net_device to contain all mac addresses
>>> of the device. I would also like to use similar interface to handle them as
>>> currently is for uc_list and mc_list. However I do not like that these lists are
>>> not using standard list_head but they are propriate lists only for this purpose.
>>> I'm thinking about converting them to use list_head first. Or maybe ignore them
>>> and do the new list for macs in parallel?
>> Using list_heads in the address lists would require some pretty large
>> amount of work since you'd need to convert all the drivers. 
> 
> Yes, I'm aware of it...
>> I'm all
>> in favour of doing this, but I wouldn't make the fix depend on that
>> work.
> 
> ok so you are suggesting to use the current list struct?

Whatever will make this easier :) You could of course already add the
new structure and use it for your new list and do the conversion of
the existing structures on top of that.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4
@ 2009-03-30 12:58                   ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-03-30 12:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, linux-kernel, mschmidt, bonding-devel,
	jgarzik, dada1, David Miller

Jiri Pirko wrote:
> Mon, Mar 30, 2009 at 02:47:59PM CEST, kaber@trash.net wrote:
>> Jiri Pirko wrote:
>>> Currently I'm thinking the way. What I have on mind:
>>> I would like to add a list into struct net_device to contain all mac addresses
>>> of the device. I would also like to use similar interface to handle them as
>>> currently is for uc_list and mc_list. However I do not like that these lists are
>>> not using standard list_head but they are propriate lists only for this purpose.
>>> I'm thinking about converting them to use list_head first. Or maybe ignore them
>>> and do the new list for macs in parallel?
>> Using list_heads in the address lists would require some pretty large
>> amount of work since you'd need to convert all the drivers. 
> 
> Yes, I'm aware of it...
>> I'm all
>> in favour of doing this, but I wouldn't make the fix depend on that
>> work.
> 
> ok so you are suggesting to use the current list struct?

Whatever will make this easier :) You could of course already add the
new structure and use it for your new list and do the conversion of
the existing structures on top of that.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH 0/4] bonding: allow bond in mode balance-alb to work properly in bridge -try5
  2009-03-13 18:33 ` [Bridge] " Jiri Pirko
@ 2009-04-13  8:37   ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:37 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

(resend, updated changelog, completely reworked)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge it will only add one of the mac
addresses into a hash list of mac addresses, say X. This mac address is marked as
local. But this bonding interface also has mac address Y. Now when a packet arrives
with destination address Y, this address is not marked as local and the packet
looks like it needs to be forwarded. This packet is then lost, which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patchset solves this issue in the best way it can possibly be solved: by
adding all mac addresses of all slave devices to the bridge hash list. To carry
these addresses a new list has to be introduced in struct net_device.

Jirka


^ permalink raw reply	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 0/4] bonding: allow bond in mode balance-alb to work properly in bridge -try5
@ 2009-04-13  8:37   ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:37 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

(resend, updated changelog, completely reworked)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge it will only add one of the mac
addresses into a hash list of mac addresses, say X. This mac address is marked as
local. But this bonding interface also has mac address Y. Now when a packet arrives
with destination address Y, this address is not marked as local and the packet
looks like it needs to be forwarded. This packet is then lost, which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patchset solves this issue in the best way it can possibly be solved: by
adding all mac addresses of all slave devices to the bridge hash list. To carry
these addresses a new list has to be introduced in struct net_device.

Jirka


^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH 1/4] net: introduce dev_mac_address_changed
  2009-04-13  8:37   ` [Bridge] " Jiri Pirko
@ 2009-04-13  8:38     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

Introduce the function dev_mac_address_changed, which can be called by a driver
that has changed its mac address to force the notifiers to be called.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/netdevice.h |    1 +
 net/core/dev.c            |   12 ++++++++++++
 2 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..ff8db51 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1461,6 +1461,7 @@ extern int		dev_change_net_namespace(struct net_device *,
 extern int		dev_set_mtu(struct net_device *, int);
 extern int		dev_set_mac_address(struct net_device *,
 					    struct sockaddr *);
+extern void		dev_mac_address_changed(struct net_device *);
 extern int		dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev,
 					    struct netdev_queue *txq);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..1adc89b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3833,6 +3833,18 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 	return err;
 }
 
+/**
+ *	dev_mac_address_changed - Notify Media Access Control Address changed
+ *	@dev: device
+ *
+ *	Notifies the change of the hardware (MAC) address of the device
+ */
+void dev_mac_address_changed(struct net_device *dev)
+{
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+}
+EXPORT_SYMBOL(dev_mac_address_changed);
+
 /*
  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
  */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 1/4] net: introduce dev_mac_address_changed
@ 2009-04-13  8:38     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

Introduce the function dev_mac_address_changed, which can be called by a driver
that has changed its mac address to force the notifiers to be called.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/netdevice.h |    1 +
 net/core/dev.c            |   12 ++++++++++++
 2 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..ff8db51 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1461,6 +1461,7 @@ extern int		dev_change_net_namespace(struct net_device *,
 extern int		dev_set_mtu(struct net_device *, int);
 extern int		dev_set_mac_address(struct net_device *,
 					    struct sockaddr *);
+extern void		dev_mac_address_changed(struct net_device *);
 extern int		dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev,
 					    struct netdev_queue *txq);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..1adc89b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3833,6 +3833,18 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 	return err;
 }
 
+/**
+ *	dev_mac_address_changed - Notify Media Access Control Address changed
+ *	@dev: device
+ *
+ *	Notifies the change of the hardware (MAC) address of the device
+ */
+void dev_mac_address_changed(struct net_device *dev)
+{
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+}
+EXPORT_SYMBOL(dev_mac_address_changed);
+
 /*
  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
  */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [PATCH 2/4] net: introduce a list of device addresses dev_addr_list
  2009-04-13  8:37   ` [Bridge] " Jiri Pirko
@ 2009-04-13  8:42     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:42 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

This patch introduces a new list in struct net_device and brings a set of
functions to work with the device address list. The list replaces the original
dev_addr field, because in some situations there is a need to carry several
device addresses with the net device. To be backward compatible, dev_addr is
made to point to the first member of the list so original drivers see no
difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/netdevice.h |   51 +++++++++-
 net/core/dev.c            |  264 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 313 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ff8db51..8cf62f1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +782,12 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
+	spinlock_t              dev_addr_list_lock;
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1779,6 +1789,32 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/* Locking helpers for spinlock guarding dev_addr_list */
+
+static inline void netif_dev_addr_lock(struct net_device *dev)
+{
+	spin_lock(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_lock_bh(struct net_device *dev)
+{
+	spin_lock_bh(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_unlock(struct net_device *dev)
+{
+	spin_unlock(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_unlock_bh(struct net_device *dev)
+{
+	spin_unlock_bh(&dev->dev_addr_list_lock);
+}
+
+/* dev_addr_list walker */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1791,6 +1827,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1adc89b..0b154b3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,263 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+
+	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail(&ha->list, list);
+	return 0;
+}
+
+static inline int __hw_addr_add(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del(&ha->list);
+			kfree(ha);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static inline int __hw_addr_del(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+/*
+ * Add every address on from_list to to_list, skipping the to_list entry at
+ * position ignore_index when looking for an existing match (-1 skips none).
+ * If any addition fails, the additions already made by this call are undone
+ * and the error is returned; returns 0 on success.
+ */
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err;
+	struct hw_addr *ha, *ha2;
+
+	list_for_each_entry(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len,
+				       ignore_index);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+unroll:
+	/* ha is the entry that failed; undo everything added before it */
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, ignore_index);
+	}
+	return err;
+}
+
+static inline int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+/*
+ * Drop one reference in to_list for every address on from_list, never
+ * touching the to_list entry at position ignore_index (-1 protects none).
+ * Addresses missing from to_list are silently skipped.
+ */
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct hw_addr *ha;
+
+	list_for_each_entry(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, ignore_index);
+	}
+}
+
+static inline void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+/* Unlink and free every entry on the list, leaving the list empty. */
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del(&ha->list);
+		/* entries are kzalloc()ed in __hw_addr_add_ii() */
+		kfree(ha);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	netif_dev_addr_lock_bh(dev);
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+	netif_dev_addr_unlock_bh(dev);
+}
+
+/*
+ * Set up the device address list and its lock, seed the list with a single
+ * all-zero address of MAX_ADDR_LEN bytes and point dev->dev_addr at it.
+ * Returns 0 on success or the error from __hw_addr_add().
+ */
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct hw_addr *ha;
+	int err;
+
+	spin_lock_init(&dev->dev_addr_list_lock);
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	/* sizeof(addr), not sizeof(*addr): cover the whole buffer */
+	memset(addr, 0, sizeof(addr));
+	netif_dev_addr_lock_bh(dev);
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	netif_dev_addr_unlock_bh(dev);
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	netif_dev_addr_lock_bh(dev);
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	netif_dev_addr_unlock_bh(dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	netif_dev_addr_lock_bh(dev);
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	netif_dev_addr_unlock_bh(dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+
+	netif_dev_addr_lock_bh(from_dev);
+	netif_dev_addr_lock_bh(to_dev);
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	netif_dev_addr_unlock_bh(to_dev);
+	netif_dev_addr_unlock_bh(from_dev);
+
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device whose address list selects what to delete
+ *
+ *	Releases, in @to_dev, one reference for each address listed in
+ *	@from_dev. The first address of @to_dev is never released.
+ *	Returns -EINVAL if the address lengths of the two devices differ,
+ *	0 otherwise.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+
+	netif_dev_addr_lock_bh(from_dev);
+	netif_dev_addr_lock_bh(to_dev);
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	netif_dev_addr_unlock_bh(to_dev);
+	netif_dev_addr_unlock_bh(from_dev);
+
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4269,6 +4526,9 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 
@@ -4791,6 +5051,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4977,6 +5238,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	netdev_unregister_kobject(dev);
 
 	/* Actually switch the network namespace */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 2/4] net: introduce a list of device addresses dev_addr_list
@ 2009-04-13  8:42     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:42 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with device address list. The list is a replacement
for the original dev_addr field, because in some situations there is a need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/netdevice.h |   51 +++++++++-
 net/core/dev.c            |  264 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 313 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ff8db51..8cf62f1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +782,12 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
+	spinlock_t              dev_addr_list_lock;
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1779,6 +1789,32 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/* Locking helpers for spinlock guarding dev_addr_list */
+
+static inline void netif_dev_addr_lock(struct net_device *dev)
+{
+	spin_lock(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_lock_bh(struct net_device *dev)
+{
+	spin_lock_bh(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_unlock(struct net_device *dev)
+{
+	spin_unlock(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_unlock_bh(struct net_device *dev)
+{
+	spin_unlock_bh(&dev->dev_addr_list_lock);
+}
+
+/* Walk every struct hw_addr (cursor ha) on dev's dev_addr_list */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry(ha, &(dev)->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1791,6 +1827,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1adc89b..0b154b3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,263 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+
+	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail(&ha->list, list);
+	return 0;
+}
+
+static inline int __hw_addr_add(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del(&ha->list);
+			kfree(ha);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static inline int __hw_addr_del(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+/*
+ * Add every address on from_list to to_list, skipping the to_list entry at
+ * position ignore_index when looking for an existing match (-1 skips none).
+ * If any addition fails, the additions already made by this call are undone
+ * and the error is returned; returns 0 on success.
+ */
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err;
+	struct hw_addr *ha, *ha2;
+
+	list_for_each_entry(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len,
+				       ignore_index);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+unroll:
+	/* ha is the entry that failed; undo everything added before it */
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, ignore_index);
+	}
+	return err;
+}
+
+static inline int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+/*
+ * Drop one reference in to_list for every address on from_list, never
+ * touching the to_list entry at position ignore_index (-1 protects none).
+ * Addresses missing from to_list are silently skipped.
+ */
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct hw_addr *ha;
+
+	list_for_each_entry(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, ignore_index);
+	}
+}
+
+static inline void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+/* Unlink and free every entry on the list, leaving the list empty. */
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del(&ha->list);
+		/* entries are kzalloc()ed in __hw_addr_add_ii() */
+		kfree(ha);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	netif_dev_addr_lock_bh(dev);
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+	netif_dev_addr_unlock_bh(dev);
+}
+
+/*
+ * Set up the device address list and its lock, seed the list with a single
+ * all-zero address of MAX_ADDR_LEN bytes and point dev->dev_addr at it.
+ * Returns 0 on success or the error from __hw_addr_add().
+ */
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct hw_addr *ha;
+	int err;
+
+	spin_lock_init(&dev->dev_addr_list_lock);
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	/* sizeof(addr), not sizeof(*addr): cover the whole buffer */
+	memset(addr, 0, sizeof(addr));
+	netif_dev_addr_lock_bh(dev);
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	netif_dev_addr_unlock_bh(dev);
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	netif_dev_addr_lock_bh(dev);
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	netif_dev_addr_unlock_bh(dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	netif_dev_addr_lock_bh(dev);
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	netif_dev_addr_unlock_bh(dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+
+	netif_dev_addr_lock_bh(from_dev);
+	netif_dev_addr_lock_bh(to_dev);
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	netif_dev_addr_unlock_bh(to_dev);
+	netif_dev_addr_unlock_bh(from_dev);
+
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device whose address list selects what to delete
+ *
+ *	Releases, in @to_dev, one reference for each address listed in
+ *	@from_dev. The first address of @to_dev is never released.
+ *	Returns -EINVAL if the address lengths of the two devices differ,
+ *	0 otherwise.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+
+	netif_dev_addr_lock_bh(from_dev);
+	netif_dev_addr_lock_bh(to_dev);
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	netif_dev_addr_unlock_bh(to_dev);
+	netif_dev_addr_unlock_bh(from_dev);
+
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4269,6 +4526,9 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 
@@ -4791,6 +5051,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4977,6 +5238,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	netdev_unregister_kobject(dev);
 
 	/* Actually switch the network namespace */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [PATCH 3/4] net: bridge: use device address list instead of dev_addr
  2009-04-13  8:37   ` [Bridge] " Jiri Pirko
@ 2009-04-13  8:44     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:44 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

This patch changes the handling of mac addresses of bridge port devices. Now
it uses the previously introduced list of device addresses. It allows the bridge
to know more than one local mac address per port, which is required for correct
operation in some cases.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 net/bridge/br_fdb.c     |  120 +++++++++++++++++++++++++++++++++--------------
 net/bridge/br_if.c      |    2 +-
 net/bridge/br_notify.c  |    2 +-
 net/bridge/br_private.h |    4 +-
 4 files changed, 89 insertions(+), 39 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..6efc556 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -77,10 +77,45 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
 	br_fdb_put(f);
 }
 
-void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+/*
+ * Finds out if passed address is one of the addresses assigned to the device.
+ * Returns 1 on positive result
+ */
+static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)
+{
+	struct hw_addr *ha;
+	int ret = 1;
+
+	netif_dev_addr_lock_bh(dev);
+	for_each_dev_addr(dev, ha) {
+		ret = compare_ether_addr(addr, ha->addr);
+		if (!ret)
+			break;
+	}
+	netif_dev_addr_unlock_bh(dev);
+	return !ret ? 1 : 0;
+}
+
+static int another_port_has_addr(const struct net_bridge_port *p,
+				 struct net_bridge_fdb_entry *f)
+{
+	struct net_bridge *br = p->br;
+	struct net_bridge_port *op;
+
+	list_for_each_entry(op, &br->port_list, list) {
+		if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
+			f->dst = op;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
 {
 	struct net_bridge *br = p->br;
 	int i;
+	struct hw_addr *ha;
 
 	spin_lock_bh(&br->hash_lock);
 
@@ -92,26 +127,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 
 			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
 			if (f->dst == p && f->is_local) {
-				/* maybe another port has same hw addr? */
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto insert;
-					}
-				}
-
-				/* delete old one */
-				fdb_delete(f);
-				goto insert;
+				/*
+				 * maybe another port has same hw addr?,
+				 * if not then delete it
+				 */
+				if (!another_port_has_addr(p, f))
+					fdb_delete(f);
 			}
 		}
 	}
- insert:
-	/* insert new address,  may fail if invalid address or dup. */
-	fdb_insert(br, p, newaddr);
+
+	/* insert device addresses, may fail if invalid address. */
+
+	netif_dev_addr_lock_bh(dev);
+	for_each_dev_addr(dev, ha) {
+		fdb_insert(br, p, ha->addr);
+	}
+	netif_dev_addr_unlock_bh(dev);
 
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -189,20 +221,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 			 * then when one port is deleted, assign
 			 * the local entry to other port
 			 */
-			if (f->is_local) {
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto skip_delete;
-					}
-				}
-			}
-
-			fdb_delete(f);
-		skip_delete: ;
+			if (!f->is_local ||
+			    !another_port_has_addr(p, f))
+				fdb_delete(f);
 		}
 	}
 	spin_unlock_bh(&br->hash_lock);
@@ -338,7 +359,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 }
 
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		      const unsigned char *addr)
 {
 	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
 	struct net_bridge_fdb_entry *fdb;
@@ -366,13 +387,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	return 0;
 }
 
+/*
+ * Call fdb_insert() for port source with every address on dev's address
+ * list. If one insertion fails, the entries for the addresses handled before
+ * it are removed again and the error is returned; returns 0 on success.
+ */
+static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
+			  struct net_device *dev)
+{
+	struct hw_addr *ha, *ha2;
+	struct net_bridge_fdb_entry *fdb;
+	struct hlist_head *head;
+	int ret = 0;
+
+	netif_dev_addr_lock_bh(dev);
+	for_each_dev_addr(dev, ha) {
+		ret = fdb_insert(br, source, ha->addr);
+		if (ret)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	for_each_dev_addr(dev, ha2) {
+		if (ha2 == ha)
+			break;
+		head = &br->hash[br_mac_hash(ha2->addr)];
+		fdb = fdb_find(head, ha2->addr);
+		/*
+		 * Only undo entries owned by this port; a local entry with
+		 * the same address may belong to another port.
+		 */
+		if (fdb && fdb->is_local && fdb->dst == source)
+			fdb_delete(fdb);
+	}
+unlock:
+	netif_dev_addr_unlock_bh(dev);
+	return ret;
+}
+
 int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		  struct net_device *dev)
 {
 	int ret;
 
 	spin_lock_bh(&br->hash_lock);
-	ret = fdb_insert(br, source, addr);
+	ret = fdb_insert_dev(br, source, dev);
 	spin_unlock_bh(&br->hash_lock);
 	return ret;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..789cb30 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err0;
 
-	err = br_fdb_insert(br, p, dev->dev_addr);
+	err = br_fdb_insert(br, p, dev);
 	if (err)
 		goto err1;
 
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 763a3ec..1423541 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	case NETDEV_CHANGEADDR:
 		spin_lock_bh(&br->lock);
-		br_fdb_changeaddr(p, dev->dev_addr);
+		br_fdb_changeaddr(p, dev);
 		br_stp_recalculate_bridge_id(br);
 		spin_unlock_bh(&br->lock);
 		break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..65ffe3d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -148,7 +148,7 @@ extern int br_fdb_init(void);
 extern void br_fdb_fini(void);
 extern void br_fdb_flush(struct net_bridge *br);
 extern void br_fdb_changeaddr(struct net_bridge_port *p,
-			      const unsigned char *newaddr);
+			      struct net_device *dev);
 extern void br_fdb_cleanup(unsigned long arg);
 extern void br_fdb_delete_by_port(struct net_bridge *br,
 				  const struct net_bridge_port *p, int do_all);
@@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 			  unsigned long count, unsigned long off);
 extern int br_fdb_insert(struct net_bridge *br,
 			 struct net_bridge_port *source,
-			 const unsigned char *addr);
+			 struct net_device *dev);
 extern void br_fdb_update(struct net_bridge *br,
 			  struct net_bridge_port *source,
 			  const unsigned char *addr);
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 3/4] net: bridge: use device address list instead of dev_addr
@ 2009-04-13  8:44     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:44 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

This patch changes the handling of mac addresses of bridge port devices. Now
it uses the previously introduced list of device addresses. It allows the bridge
to know more than one local mac address per port, which is required for correct
operation in some cases.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 net/bridge/br_fdb.c     |  120 +++++++++++++++++++++++++++++++++--------------
 net/bridge/br_if.c      |    2 +-
 net/bridge/br_notify.c  |    2 +-
 net/bridge/br_private.h |    4 +-
 4 files changed, 89 insertions(+), 39 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..6efc556 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -77,10 +77,45 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
 	br_fdb_put(f);
 }
 
-void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+/*
+ * Finds out if passed address is one of the addresses assigned to the device.
+ * Returns 1 on positive result
+ */
+static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)
+{
+	struct hw_addr *ha;
+	int ret = 1;
+
+	netif_dev_addr_lock_bh(dev);
+	for_each_dev_addr(dev, ha) {
+		ret = compare_ether_addr(addr, ha->addr);
+		if (!ret)
+			break;
+	}
+	netif_dev_addr_unlock_bh(dev);
+	return !ret ? 1 : 0;
+}
+
+static int another_port_has_addr(const struct net_bridge_port *p,
+				 struct net_bridge_fdb_entry *f)
+{
+	struct net_bridge *br = p->br;
+	struct net_bridge_port *op;
+
+	list_for_each_entry(op, &br->port_list, list) {
+		if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
+			f->dst = op;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
 {
 	struct net_bridge *br = p->br;
 	int i;
+	struct hw_addr *ha;
 
 	spin_lock_bh(&br->hash_lock);
 
@@ -92,26 +127,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 
 			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
 			if (f->dst == p && f->is_local) {
-				/* maybe another port has same hw addr? */
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto insert;
-					}
-				}
-
-				/* delete old one */
-				fdb_delete(f);
-				goto insert;
+				/*
+				 * maybe another port has same hw addr?,
+				 * if not then delete it
+				 */
+				if (!another_port_has_addr(p, f))
+					fdb_delete(f);
 			}
 		}
 	}
- insert:
-	/* insert new address,  may fail if invalid address or dup. */
-	fdb_insert(br, p, newaddr);
+
+	/* insert device addresses, may fail if invalid address. */
+
+	netif_dev_addr_lock_bh(dev);
+	for_each_dev_addr(dev, ha) {
+		fdb_insert(br, p, ha->addr);
+	}
+	netif_dev_addr_unlock_bh(dev);
 
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -189,20 +221,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 			 * then when one port is deleted, assign
 			 * the local entry to other port
 			 */
-			if (f->is_local) {
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto skip_delete;
-					}
-				}
-			}
-
-			fdb_delete(f);
-		skip_delete: ;
+			if (!f->is_local ||
+			    !another_port_has_addr(p, f))
+				fdb_delete(f);
 		}
 	}
 	spin_unlock_bh(&br->hash_lock);
@@ -338,7 +359,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 }
 
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		      const unsigned char *addr)
 {
 	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
 	struct net_bridge_fdb_entry *fdb;
@@ -366,13 +387,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	return 0;
 }
 
+/*
+ * Call fdb_insert() for port source with every address on dev's address
+ * list. If one insertion fails, the entries for the addresses handled before
+ * it are removed again and the error is returned; returns 0 on success.
+ */
+static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
+			  struct net_device *dev)
+{
+	struct hw_addr *ha, *ha2;
+	struct net_bridge_fdb_entry *fdb;
+	struct hlist_head *head;
+	int ret = 0;
+
+	netif_dev_addr_lock_bh(dev);
+	for_each_dev_addr(dev, ha) {
+		ret = fdb_insert(br, source, ha->addr);
+		if (ret)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	for_each_dev_addr(dev, ha2) {
+		if (ha2 == ha)
+			break;
+		head = &br->hash[br_mac_hash(ha2->addr)];
+		fdb = fdb_find(head, ha2->addr);
+		/*
+		 * Only undo entries owned by this port; a local entry with
+		 * the same address may belong to another port.
+		 */
+		if (fdb && fdb->is_local && fdb->dst == source)
+			fdb_delete(fdb);
+	}
+unlock:
+	netif_dev_addr_unlock_bh(dev);
+	return ret;
+}
+
 int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		  struct net_device *dev)
 {
 	int ret;
 
 	spin_lock_bh(&br->hash_lock);
-	ret = fdb_insert(br, source, addr);
+	ret = fdb_insert_dev(br, source, dev);
 	spin_unlock_bh(&br->hash_lock);
 	return ret;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..789cb30 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err0;
 
-	err = br_fdb_insert(br, p, dev->dev_addr);
+	err = br_fdb_insert(br, p, dev);
 	if (err)
 		goto err1;
 
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 763a3ec..1423541 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	case NETDEV_CHANGEADDR:
 		spin_lock_bh(&br->lock);
-		br_fdb_changeaddr(p, dev->dev_addr);
+		br_fdb_changeaddr(p, dev);
 		br_stp_recalculate_bridge_id(br);
 		spin_unlock_bh(&br->lock);
 		break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..65ffe3d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -148,7 +148,7 @@ extern int br_fdb_init(void);
 extern void br_fdb_fini(void);
 extern void br_fdb_flush(struct net_bridge *br);
 extern void br_fdb_changeaddr(struct net_bridge_port *p,
-			      const unsigned char *newaddr);
+			      struct net_device *dev);
 extern void br_fdb_cleanup(unsigned long arg);
 extern void br_fdb_delete_by_port(struct net_bridge *br,
 				  const struct net_bridge_port *p, int do_all);
@@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 			  unsigned long count, unsigned long off);
 extern int br_fdb_insert(struct net_bridge *br,
 			 struct net_bridge_port *source,
-			 const unsigned char *addr);
+			 struct net_device *dev);
 extern void br_fdb_update(struct net_bridge *br,
 			  struct net_bridge_port *source,
 			  const unsigned char *addr);
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [PATCH 4/4] net: bonding: add slave device addresses in mode alb
  2009-04-13  8:37   ` [Bridge] " Jiri Pirko
@ 2009-04-13  8:46     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

When in mode alb, add all device addresses which belong to an enslaved slave
device to the bond device. This ensures that all mac addresses will be
treated as local and bonding in this mode will work fine in bridge.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_main.c |   30 +++++++++++++++++++++++++++++-
 1 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 99610f3..47795c7 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 	bond->setup_by_slave = 1;
 }
 
+static inline int should_copy_dev_addrs(struct bonding *bond)
+{
+	return bond->params.mode == BOND_MODE_ALB ? 1 : 0;
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 {
@@ -1510,6 +1515,13 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	 */
 	new_slave->original_flags = slave_dev->flags;
 
+	if (should_copy_dev_addrs(bond)) {
+		res = dev_addr_add_multiple(bond_dev, slave_dev);
+		if (res)
+			goto err_free;
+		dev_mac_address_changed(bond_dev);
+	}
+
 	/*
 	 * Save slave's original ("permanent") mac address for modes
 	 * that need it, and for restoring it upon release, and then
@@ -1527,7 +1539,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		res = dev_set_mac_address(slave_dev, &addr);
 		if (res) {
 			pr_debug("Error %d calling set_mac_address\n", res);
-			goto err_free;
+			goto err_remove_dev_addrs;
 		}
 	}
 
@@ -1769,6 +1781,12 @@ err_restore_mac:
 		dev_set_mac_address(slave_dev, &addr);
 	}
 
+err_remove_dev_addrs:
+	if (should_copy_dev_addrs(bond)) {
+		dev_addr_del_multiple(bond_dev, slave_dev);
+		dev_mac_address_changed(bond_dev);
+	}
+
 err_free:
 	kfree(new_slave);
 
@@ -1954,6 +1972,11 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 	/* close slave before restoring its mac address */
 	dev_close(slave_dev);
 
+	if (should_copy_dev_addrs(bond)) {
+		dev_addr_del_multiple(bond_dev, slave_dev);
+		dev_mac_address_changed(bond_dev);
+	}
+
 	if (bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
 		/* restore original ("permanent") mac address */
 		memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2090,6 +2113,9 @@ static int bond_release_all(struct net_device *bond_dev)
 		/* close slave before restoring its mac address */
 		dev_close(slave_dev);
 
+		if (should_copy_dev_addrs(bond))
+			dev_addr_del_multiple(bond_dev, slave_dev);
+
 		if (!bond->params.fail_over_mac) {
 			/* restore original ("permanent") mac address*/
 			memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2106,6 +2132,8 @@ static int bond_release_all(struct net_device *bond_dev)
 		write_lock_bh(&bond->lock);
 	}
 
+	dev_mac_address_changed(bond_dev);
+
 	/* zero the mac address of the master so it will be
 	 * set by the application to the mac address of the
 	 * first slave
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 4/4] net: bonding: add slave device addresses in mode alb
@ 2009-04-13  8:46     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-13  8:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

When in mode alb, add all device addresses which belong to an enslaved slave
device to the bond device. This ensures that all mac addresses will be
treated as local and bonding in this mode will work fine in bridge.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_main.c |   30 +++++++++++++++++++++++++++++-
 1 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 99610f3..47795c7 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 	bond->setup_by_slave = 1;
 }
 
+static inline int should_copy_dev_addrs(struct bonding *bond)
+{
+	return bond->params.mode == BOND_MODE_ALB ? 1 : 0;
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 {
@@ -1510,6 +1515,13 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	 */
 	new_slave->original_flags = slave_dev->flags;
 
+	if (should_copy_dev_addrs(bond)) {
+		res = dev_addr_add_multiple(bond_dev, slave_dev);
+		if (res)
+			goto err_free;
+		dev_mac_address_changed(bond_dev);
+	}
+
 	/*
 	 * Save slave's original ("permanent") mac address for modes
 	 * that need it, and for restoring it upon release, and then
@@ -1527,7 +1539,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		res = dev_set_mac_address(slave_dev, &addr);
 		if (res) {
 			pr_debug("Error %d calling set_mac_address\n", res);
-			goto err_free;
+			goto err_remove_dev_addrs;
 		}
 	}
 
@@ -1769,6 +1781,12 @@ err_restore_mac:
 		dev_set_mac_address(slave_dev, &addr);
 	}
 
+err_remove_dev_addrs:
+	if (should_copy_dev_addrs(bond)) {
+		dev_addr_del_multiple(bond_dev, slave_dev);
+		dev_mac_address_changed(bond_dev);
+	}
+
 err_free:
 	kfree(new_slave);
 
@@ -1954,6 +1972,11 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 	/* close slave before restoring its mac address */
 	dev_close(slave_dev);
 
+	if (should_copy_dev_addrs(bond)) {
+		dev_addr_del_multiple(bond_dev, slave_dev);
+		dev_mac_address_changed(bond_dev);
+	}
+
 	if (bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
 		/* restore original ("permanent") mac address */
 		memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2090,6 +2113,9 @@ static int bond_release_all(struct net_device *bond_dev)
 		/* close slave before restoring its mac address */
 		dev_close(slave_dev);
 
+		if (should_copy_dev_addrs(bond))
+			dev_addr_del_multiple(bond_dev, slave_dev);
+
 		if (!bond->params.fail_over_mac) {
 			/* restore original ("permanent") mac address*/
 			memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2106,6 +2132,8 @@ static int bond_release_all(struct net_device *bond_dev)
 		write_lock_bh(&bond->lock);
 	}
 
+	dev_mac_address_changed(bond_dev);
+
 	/* zero the mac address of the master so it will be
 	 * set by the application to the mac address of the
 	 * first slave
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH 2/4] net: introduce a list of device addresses dev_addr_list
  2009-04-13  8:42     ` [Bridge] " Jiri Pirko
@ 2009-04-13 14:49       ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-13 14:49 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Mon, 13 Apr 2009 10:42:02 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/netdevice.h |   51 +++++++++-
>  net/core/dev.c            |  264 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 313 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ff8db51..8cf62f1 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,12 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +};
> +
>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +782,12 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
> +	spinlock_t              dev_addr_list_lock;
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1779,6 +1789,32 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/* Locking helpers for spinlock guarding dev_addr_list */
> +
> +static inline void netif_dev_addr_lock(struct net_device *dev)
> +{
> +	spin_lock(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_lock_bh(struct net_device *dev)
> +{
> +	spin_lock_bh(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_unlock(struct net_device *dev)
> +{
> +	spin_unlock(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_unlock_bh(struct net_device *dev)
> +{
> +	spin_unlock_bh(&dev->dev_addr_list_lock);
> +}
> +

This lock is unnecessary, use RCU list for read.
Since all changes are under RTNL mutex, there is no chance
for conflict on update.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 2/4] net: introduce a list of device addresses dev_addr_list
@ 2009-04-13 14:49       ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-13 14:49 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, dada1, jgarzik, davem

On Mon, 13 Apr 2009 10:42:02 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/netdevice.h |   51 +++++++++-
>  net/core/dev.c            |  264 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 313 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ff8db51..8cf62f1 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,12 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +};
> +
>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +782,12 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
> +	spinlock_t              dev_addr_list_lock;
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1779,6 +1789,32 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/* Locking helpers for spinlock guarding dev_addr_list */
> +
> +static inline void netif_dev_addr_lock(struct net_device *dev)
> +{
> +	spin_lock(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_lock_bh(struct net_device *dev)
> +{
> +	spin_lock_bh(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_unlock(struct net_device *dev)
> +{
> +	spin_unlock(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_unlock_bh(struct net_device *dev)
> +{
> +	spin_unlock_bh(&dev->dev_addr_list_lock);
> +}
> +

This lock is unnecessary, use RCU list for read.
Since all changes are under RTNL mutex, there is no chance
for conflict on update.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 3/4] net: bridge: use device address list instead of dev_addr
  2009-04-13  8:44     ` [Bridge] " Jiri Pirko
@ 2009-04-13 14:54       ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-13 14:54 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Mon, 13 Apr 2009 10:44:08 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> This patch changes the handling of mac addresses of bridge port devices. Now
> it uses previously introduced list of device addresses. It allows the bridge to
> know more then one local mac address per port which is mandatory for the right
> work in some cases.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  net/bridge/br_fdb.c     |  120 +++++++++++++++++++++++++++++++++--------------
>  net/bridge/br_if.c      |    2 +-
>  net/bridge/br_notify.c  |    2 +-
>  net/bridge/br_private.h |    4 +-
>  4 files changed, 89 insertions(+), 39 deletions(-)
> 
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..6efc556 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -77,10 +77,45 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
>  	br_fdb_put(f);
>  }
>  
> -void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
> +/*
> + * Finds out if passed address is one of the addresses assigned to the device.
> + * Returns 1 on positive result
> + */
> +static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)

Why not a general version in netdevice.h or etherdevice.h?

static inline bool is_etherdev_addr(const struct net_device *dev, const unsigned char addr[ETH_ALEN])

> +{
> +	struct hw_addr *ha;
> +	int ret = 1;
> +
> +	netif_dev_addr_lock_bh(dev);
> +	for_each_dev_addr(dev, ha) {
Use RCU

> +		ret = compare_ether_addr(addr, ha->addr);
> +		if (!ret)
> +			break;
> +	}
> +	netif_dev_addr_unlock_bh(dev);
> +	return !ret ? 1 : 0;
> +}
> +
> +static int another_port_has_addr(const struct net_bridge_port *p,
> +				 struct net_bridge_fdb_entry *f)
> +{
> +	struct net_bridge *br = p->br;
> +	struct net_bridge_port *op;
> +
> +	list_for_each_entry(op, &br->port_list, list) {
> +		if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
> +			f->dst = op;
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}

Forwarding database is hot path, people sometimes run lots of devices
on single bridge, doesn't this scale worse?

> +void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
>  {
>  	struct net_bridge *br = p->br;
>  	int i;
> +	struct hw_addr *ha;
>  
>  	spin_lock_bh(&br->hash_lock);
>  
> @@ -92,26 +127,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
>  
>  			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
>  			if (f->dst == p && f->is_local) {
> -				/* maybe another port has same hw addr? */
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto insert;
> -					}
> -				}
> -
> -				/* delete old one */
> -				fdb_delete(f);
> -				goto insert;
> +				/*
> +				 * maybe another port has same hw addr?,
> +				 * if not then delete it
> +				 */
> +				if (!another_port_has_addr(p, f))
> +					fdb_delete(f);
>  			}
>  		}
>  	}
> - insert:
> -	/* insert new address,  may fail if invalid address or dup. */
> -	fdb_insert(br, p, newaddr);
> +
> +	/* insert device addresses, may fail if invalid address. */
> +
> +	netif_dev_addr_lock_bh(dev);
> +	for_each_dev_addr(dev, ha) {
> +		fdb_insert(br, p, ha->addr);
> +	}
> +	netif_dev_addr_unlock_bh(dev);
>

You added another layer of locking on the already hot bridge
fast path.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 3/4] net: bridge: use device address list instead of dev_addr
@ 2009-04-13 14:54       ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-13 14:54 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, dada1, jgarzik, davem

On Mon, 13 Apr 2009 10:44:08 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> This patch changes the handling of mac addresses of bridge port devices. Now
> it uses previously introduced list of device addresses. It allows the bridge to
> know more then one local mac address per port which is mandatory for the right
> work in some cases.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  net/bridge/br_fdb.c     |  120 +++++++++++++++++++++++++++++++++--------------
>  net/bridge/br_if.c      |    2 +-
>  net/bridge/br_notify.c  |    2 +-
>  net/bridge/br_private.h |    4 +-
>  4 files changed, 89 insertions(+), 39 deletions(-)
> 
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..6efc556 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -77,10 +77,45 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
>  	br_fdb_put(f);
>  }
>  
> -void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
> +/*
> + * Finds out if passed address is one of the addresses assigned to the device.
> + * Returns 1 on positive result
> + */
> +static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)

Why not a general version in netdevice.h or etherdevice.h?

static inline bool is_etherdev_addr(const struct net_device *dev, const unsigned char addr[ETH_ALEN])

> +{
> +	struct hw_addr *ha;
> +	int ret = 1;
> +
> +	netif_dev_addr_lock_bh(dev);
> +	for_each_dev_addr(dev, ha) {
Use RCU

> +		ret = compare_ether_addr(addr, ha->addr);
> +		if (!ret)
> +			break;
> +	}
> +	netif_dev_addr_unlock_bh(dev);
> +	return !ret ? 1 : 0;
> +}
> +
> +static int another_port_has_addr(const struct net_bridge_port *p,
> +				 struct net_bridge_fdb_entry *f)
> +{
> +	struct net_bridge *br = p->br;
> +	struct net_bridge_port *op;
> +
> +	list_for_each_entry(op, &br->port_list, list) {
> +		if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
> +			f->dst = op;
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}

Forwarding database is hot path, people sometimes run lots of devices
on single bridge, doesn't this scale worse?

> +void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
>  {
>  	struct net_bridge *br = p->br;
>  	int i;
> +	struct hw_addr *ha;
>  
>  	spin_lock_bh(&br->hash_lock);
>  
> @@ -92,26 +127,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
>  
>  			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
>  			if (f->dst == p && f->is_local) {
> -				/* maybe another port has same hw addr? */
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto insert;
> -					}
> -				}
> -
> -				/* delete old one */
> -				fdb_delete(f);
> -				goto insert;
> +				/*
> +				 * maybe another port has same hw addr?,
> +				 * if not then delete it
> +				 */
> +				if (!another_port_has_addr(p, f))
> +					fdb_delete(f);
>  			}
>  		}
>  	}
> - insert:
> -	/* insert new address,  may fail if invalid address or dup. */
> -	fdb_insert(br, p, newaddr);
> +
> +	/* insert device addresses, may fail if invalid address. */
> +
> +	netif_dev_addr_lock_bh(dev);
> +	for_each_dev_addr(dev, ha) {
> +		fdb_insert(br, p, ha->addr);
> +	}
> +	netif_dev_addr_unlock_bh(dev);
>

You added another layer of locking on the already hot bridge
fast path.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 4/4] net: bonding: add slave device addresses in mode alb
  2009-04-13  8:46     ` [Bridge] " Jiri Pirko
@ 2009-04-13 14:56       ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-13 14:56 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Mon, 13 Apr 2009 10:46:15 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> When in mode alb, add all device addresses which belong to an enslaved slave
> device to the bond device. This ensures that all mac addresses will be
> treated as local and bonding in this mode will work fine in bridge.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  drivers/net/bonding/bond_main.c |   30 +++++++++++++++++++++++++++++-
>  1 files changed, 29 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 99610f3..47795c7 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
>  	bond->setup_by_slave = 1;
>  }
>  
> +static inline int should_copy_dev_addrs(struct bonding *bond)
> +{
> +	return bond->params.mode == BOND_MODE_ALB ? 1 : 0;
> +}

static inline bool should_copy_dev_addrs(const struct bonding *bond)
{
	return (bond->params.mode == BOND_MODE_ALB);
}

Three things are wrong with your style here:
   1. Needless use of the ternary operator, just return the result
   2. Use const for test_foo() type functions
   3. Use bool to make it clearer what the result is.

>  /* enslave device <slave> to bond device <master> */
>  int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>  {
> @@ -1510,6 +1515,13 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>  	 */
>  	new_slave->original_flags = slave_dev->flags;
>  
> +	if (should_copy_dev_addrs(bond)) {
> +		res = dev_addr_add_multiple(bond_dev, slave_dev);
> +		if (res)
> +			goto err_free;
> +		dev_mac_address_changed(bond_dev);

The notifier (dev_mac_address_changed) should be part of dev_addr_add


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 4/4] net: bonding: add slave device addresses in mode alb
@ 2009-04-13 14:56       ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-13 14:56 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, dada1, jgarzik, davem

On Mon, 13 Apr 2009 10:46:15 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> When in mode alb, add all device addresses which belong to an enslaved slave
> device to the bond device. This ensures that all mac addresses will be
> treated as local and bonding in this mode will work fine in bridge.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  drivers/net/bonding/bond_main.c |   30 +++++++++++++++++++++++++++++-
>  1 files changed, 29 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 99610f3..47795c7 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
>  	bond->setup_by_slave = 1;
>  }
>  
> +static inline int should_copy_dev_addrs(struct bonding *bond)
> +{
> +	return bond->params.mode == BOND_MODE_ALB ? 1 : 0;
> +}

static inline bool should_copy_dev_addrs(const struct bonding *bond)
{
	return (bond->params.mode == BOND_MODE_ALB);
}

Three things are wrong with your style here:
   1. Needless use of the ternary operator, just return the result
   2. Use const for test_foo() type functions
   3. Use bool to make it clearer what the result is.

>  /* enslave device <slave> to bond device <master> */
>  int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>  {
> @@ -1510,6 +1515,13 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>  	 */
>  	new_slave->original_flags = slave_dev->flags;
>  
> +	if (should_copy_dev_addrs(bond)) {
> +		res = dev_addr_add_multiple(bond_dev, slave_dev);
> +		if (res)
> +			goto err_free;
> +		dev_mac_address_changed(bond_dev);

The notifier (dev_mac_address_changed) should be part of dev_addr_add


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/4] net: introduce dev_mac_address_changed
  2009-04-13  8:38     ` [Bridge] " Jiri Pirko
@ 2009-04-13 14:58       ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-13 14:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Mon, 13 Apr 2009 10:38:48 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> Introducing function dev_mac_address_changed which can be called from driver
> which changed his mac address to force notifiers to be called.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/netdevice.h |    1 +
>  net/core/dev.c            |   12 ++++++++++++
>  2 files changed, 13 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..ff8db51 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1461,6 +1461,7 @@ extern int		dev_change_net_namespace(struct net_device *,
>  extern int		dev_set_mtu(struct net_device *, int);
>  extern int		dev_set_mac_address(struct net_device *,
>  					    struct sockaddr *);
> +extern void		dev_mac_address_changed(struct net_device *);
>  extern int		dev_hard_start_xmit(struct sk_buff *skb,
>  					    struct net_device *dev,
>  					    struct netdev_queue *txq);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..1adc89b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3833,6 +3833,18 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
>  	return err;
>  }
>  
> +/**
> + *	dev_mac_address_changed - Notify Media Access Control Address changed
> + *	@dev: device
> + *
> + *	Notifies the change of the hardware (MAC) address of the device
> + */
> +void dev_mac_address_changed(struct net_device *dev)
> +{
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +}
> +EXPORT_SYMBOL(dev_mac_address_changed);
> +
>  /*
>   *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
>   */

The original version of this that I sent allowed notifiers to return
an error to block changing address (error would go back to application).
This is how other notifier hooks work (mtu, etc).

Why is dev_mac_address_changed called out separately? It should
be inside dev_set_mac_address.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/4] net: introduce dev_mac_address_changed
@ 2009-04-13 14:58       ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-13 14:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, dada1, jgarzik, davem

On Mon, 13 Apr 2009 10:38:48 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> Introducing function dev_mac_address_changed which can be called from driver
> which changed his mac address to force notifiers to be called.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/netdevice.h |    1 +
>  net/core/dev.c            |   12 ++++++++++++
>  2 files changed, 13 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..ff8db51 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1461,6 +1461,7 @@ extern int		dev_change_net_namespace(struct net_device *,
>  extern int		dev_set_mtu(struct net_device *, int);
>  extern int		dev_set_mac_address(struct net_device *,
>  					    struct sockaddr *);
> +extern void		dev_mac_address_changed(struct net_device *);
>  extern int		dev_hard_start_xmit(struct sk_buff *skb,
>  					    struct net_device *dev,
>  					    struct netdev_queue *txq);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..1adc89b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3833,6 +3833,18 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
>  	return err;
>  }
>  
> +/**
> + *	dev_mac_address_changed - Notify Media Access Control Address changed
> + *	@dev: device
> + *
> + *	Notifies the change of the hardware (MAC) address of the device
> + */
> +void dev_mac_address_changed(struct net_device *dev)
> +{
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +}
> +EXPORT_SYMBOL(dev_mac_address_changed);
> +
>  /*
>   *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
>   */

The original version of this that I sent allowed notifiers to return
an error to block changing address (error would go back to application).
This is how other notifier hooks work (mtu, etc).

Why is dev_mac_address_changed called out separately? It should
be inside dev_set_mac_address.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 2/4] net: introduce a list of device addresses dev_addr_list
  2009-04-13  8:42     ` [Bridge] " Jiri Pirko
@ 2009-04-13 22:53       ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-13 22:53 UTC (permalink / raw)
  To: jpirko
  Cc: linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 13 Apr 2009 10:42:02 +0200

> @@ -210,6 +210,12 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +};
> +

Please don't pollute the global namespace with a structure name
like this.  Use "netdev_hw_addr" or "net_hw_addr".

> +static inline int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +				int addr_len)

Please let the compiler inline things as it sees fit.  These
aren't routines in some header file or anything like that.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 2/4] net: introduce a list of device addresses dev_addr_list
@ 2009-04-13 22:53       ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-13 22:53 UTC (permalink / raw)
  To: jpirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt, jgarzik,
	dada1, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 13 Apr 2009 10:42:02 +0200

> @@ -210,6 +210,12 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +};
> +

Please don't pollute the global namespace with a structure name
like this.  Use "netdev_hw_addr" or "net_hw_addr".

> +static inline int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +				int addr_len)

Please let the compiler inline things as it sees fit.  These
aren't routines in some header file or anything like that.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 3/4] net: bridge: use device address list instead of dev_addr
  2009-04-13  8:44     ` [Bridge] " Jiri Pirko
@ 2009-04-13 22:54       ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-13 22:54 UTC (permalink / raw)
  To: jpirko
  Cc: linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 13 Apr 2009 10:44:08 +0200

> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..6efc556 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
 ...
> +static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)
> +{

Please drop the inline, let the compiler work it out.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 3/4] net: bridge: use device address list instead of dev_addr
@ 2009-04-13 22:54       ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-13 22:54 UTC (permalink / raw)
  To: jpirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt, jgarzik,
	dada1, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 13 Apr 2009 10:44:08 +0200

> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..6efc556 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
 ...
> +static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)
> +{

Please drop the inline, let the compiler work it out.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 2/4] net: introduce a list of device addresses dev_addr_list
  2009-04-13 14:49       ` [Bridge] " Stephen Hemminger
@ 2009-04-13 22:54         ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-13 22:54 UTC (permalink / raw)
  To: shemminger
  Cc: jpirko, linux-kernel, netdev, jgarzik, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 13 Apr 2009 07:49:17 -0700

> This lock is unnecessary, use RCU list for read.
> Since all changes are under RTNL mutex, there is no chance
> for conflict on update.

Agreed.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 2/4] net: introduce a list of device addresses dev_addr_list
@ 2009-04-13 22:54         ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-13 22:54 UTC (permalink / raw)
  To: shemminger
  Cc: ivecera, fubar, jpirko, netdev, bridge, linux-kernel, mschmidt,
	dada1, jgarzik, bonding-devel

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 13 Apr 2009 07:49:17 -0700

> This lock is unnecessary, use RCU list for read.
> Since all changes are under RTNL mutex, there is no chance
> for conflict on update.

Agreed.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 3/4] net: bridge: use device address list instead of dev_addr
  2009-04-13 14:54       ` [Bridge] " Stephen Hemminger
@ 2009-04-14 10:15         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-14 10:15 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

Mon, Apr 13, 2009 at 04:54:00PM CEST, shemminger@vyatta.com wrote:
>> +static int another_port_has_addr(const struct net_bridge_port *p,
>> +				 struct net_bridge_fdb_entry *f)
>> +{
>> +	struct net_bridge *br = p->br;
>> +	struct net_bridge_port *op;
>> +
>> +	list_for_each_entry(op, &br->port_list, list) {
>> +		if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
>> +			f->dst = op;
>> +			return 1;
>> +		}
>> +	}
>> +	return 0;
>> +}
>
>Forwarding database is hot path, people sometimes run lots of devices
>on single bridge, doesn't this scale worse?
>
This only moves the original loop code into a function, so if the compiler
decides to inline it there might be no difference.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 3/4] net: bridge: use device address list instead of dev_addr
@ 2009-04-14 10:15         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-14 10:15 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, dada1, jgarzik, davem

Mon, Apr 13, 2009 at 04:54:00PM CEST, shemminger@vyatta.com wrote:
>> +static int another_port_has_addr(const struct net_bridge_port *p,
>> +				 struct net_bridge_fdb_entry *f)
>> +{
>> +	struct net_bridge *br = p->br;
>> +	struct net_bridge_port *op;
>> +
>> +	list_for_each_entry(op, &br->port_list, list) {
>> +		if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
>> +			f->dst = op;
>> +			return 1;
>> +		}
>> +	}
>> +	return 0;
>> +}
>
>Forwarding database is hot path, people sometimes run lots of devices
>on single bridge, doesn't this scale worse?
>
This only moves the original loop code into a function, so if the compiler
decides to inline it there might be no difference.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH 0/3] bonding: allow bond in mode balance-alb to work properly in bridge -try6
  2009-03-13 18:33 ` [Bridge] " Jiri Pirko
@ 2009-04-15  8:17   ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

(resend, rcu list locking, cosmetics)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge, it will only add one of the mac
addresses, say X, into the hash list of mac addresses. This mac address is marked
as local. But this bonding interface also has mac address Y. Now when a packet
arrives with destination address Y, this address is not marked as local and the
packet looks like it needs to be forwarded. This packet is then lost, which is
wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patchset solves this issue in the best way it can possibly be solved: by
adding all mac addresses of all slave devices to the bridge hash list. To carry
these addresses, a new list has to be introduced in struct net_device.

Jirka


^ permalink raw reply	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 0/3] bonding: allow bond in mode balance-alb to work properly in bridge -try6
@ 2009-04-15  8:17   ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:17 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

(resend, rcu list locking, cosmetics)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge, it will only add one of the mac
addresses, say X, into the hash list of mac addresses. This mac address is marked
as local. But this bonding interface also has mac address Y. Now when a packet
arrives with destination address Y, this address is not marked as local and the
packet looks like it needs to be forwarded. This packet is then lost, which is
wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patchset solves this issue in the best way it can possibly be solved: by
adding all mac addresses of all slave devices to the bridge hash list. To carry
these addresses, a new list has to be introduced in struct net_device.

Jirka


^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  8:17   ` [Bridge] " Jiri Pirko
@ 2009-04-15  8:18     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:18 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there is
a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list so original
drivers see no difference.

Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   24 ++++
 include/linux/netdevice.h   |   31 +++++-
 net/core/dev.c              |  262 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 315 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..348a75e 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
 	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
 }
 
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 *addr)
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
+
 #endif	/* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..77abfdf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +782,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..f77b5e6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,261 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+	rcu_read_unlock();
+
+	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			synchronize_rcu();
+			kfree(ha);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err = 0;
+	struct netdev_hw_addr *ha, *ha2;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+		if (err)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	list_for_each_entry_rcu(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+	}
+unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+	}
+	rcu_read_unlock();
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		synchronize_rcu();
+		kfree(ha);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		rcu_read_lock();
+		ha = list_first_entry_rcu(&dev->dev_addr_list,
+					  struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+		rcu_read_unlock();
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4257,6 +4512,9 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 
@@ -4779,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5224,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	netdev_unregister_kobject(dev);
 
 	/* Actually switch the network namespace */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15  8:18     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:18 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there is
a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list so original
drivers see no difference.

Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   24 ++++
 include/linux/netdevice.h   |   31 +++++-
 net/core/dev.c              |  262 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 315 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..348a75e 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
 	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
 }
 
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 *addr)
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
+
 #endif	/* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..77abfdf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +782,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..f77b5e6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,261 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+	rcu_read_unlock();
+
+	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			synchronize_rcu();
+			kfree(ha);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err = 0;
+	struct netdev_hw_addr *ha, *ha2;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+		if (err)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	list_for_each_entry_rcu(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+	}
+unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+	}
+	rcu_read_unlock();
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		synchronize_rcu();
+		kfree(ha);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		rcu_read_lock();
+		ha = list_first_entry_rcu(&dev->dev_addr_list,
+					  struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+		rcu_read_unlock();
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4257,6 +4512,9 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 
@@ -4779,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5224,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	netdev_unregister_kobject(dev);
 
 	/* Actually switch the network namespace */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [PATCH 2/3] net: bridge: use device address list instead of dev_addr
  2009-04-15  8:17   ` [Bridge] " Jiri Pirko
@ 2009-04-15  8:21     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:21 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

This patch changes the handling of mac addresses of bridge port devices. Now
it uses the previously introduced list of device addresses. This allows the
bridge to know more than one local mac address per port, which is required for
correct operation in some cases.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 net/bridge/br_fdb.c     |  101 ++++++++++++++++++++++++++++++----------------
 net/bridge/br_if.c      |    2 +-
 net/bridge/br_notify.c  |    2 +-
 net/bridge/br_private.h |    4 +-
 4 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..1e63f76 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
 	br_fdb_put(f);
 }
 
-void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+static bool another_port_has_addr(const struct net_bridge_port *p,
+				  struct net_bridge_fdb_entry *f)
+{
+	struct net_bridge *br = p->br;
+	struct net_bridge_port *op;
+
+	list_for_each_entry(op, &br->port_list, list) {
+		if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
+			f->dst = op;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
 {
 	struct net_bridge *br = p->br;
 	int i;
+	struct netdev_hw_addr *ha;
 
 	spin_lock_bh(&br->hash_lock);
 
@@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 
 			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
 			if (f->dst == p && f->is_local) {
-				/* maybe another port has same hw addr? */
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto insert;
-					}
-				}
-
-				/* delete old one */
-				fdb_delete(f);
-				goto insert;
+				/*
+				 * maybe another port has same hw addr?,
+				 * if not then delete it
+				 */
+				if (!another_port_has_addr(p, f))
+					fdb_delete(f);
 			}
 		}
 	}
- insert:
-	/* insert new address,  may fail if invalid address or dup. */
-	fdb_insert(br, p, newaddr);
+
+	/* insert device addresses, may fail if invalid address. */
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		fdb_insert(br, p, ha->addr);
+	}
+	rcu_read_unlock();
 
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 			 * then when one port is deleted, assign
 			 * the local entry to other port
 			 */
-			if (f->is_local) {
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto skip_delete;
-					}
-				}
-			}
-
-			fdb_delete(f);
-		skip_delete: ;
+			if (!f->is_local ||
+			    !another_port_has_addr(p, f))
+				fdb_delete(f);
 		}
 	}
 	spin_unlock_bh(&br->hash_lock);
@@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 }
 
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		      const unsigned char *addr)
 {
 	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
 	struct net_bridge_fdb_entry *fdb;
@@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	return 0;
 }
 
+static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
+			  struct net_device *dev)
+{
+	struct netdev_hw_addr *ha, *ha2;
+	struct net_bridge_fdb_entry *fdb;
+	struct hlist_head *head;
+	int ret = 0;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		ret = fdb_insert(br, source, ha->addr);
+		if (ret)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	for_each_dev_addr(dev, ha2) {
+		if (ha2 == ha)
+			break;
+		head = &br->hash[br_mac_hash(ha2->addr)];
+		fdb = fdb_find(head, ha2->addr);
+		if (fdb && fdb->is_local)
+			fdb_delete(fdb);
+	}
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
 int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		  struct net_device *dev)
 {
 	int ret;
 
 	spin_lock_bh(&br->hash_lock);
-	ret = fdb_insert(br, source, addr);
+	ret = fdb_insert_dev(br, source, dev);
 	spin_unlock_bh(&br->hash_lock);
 	return ret;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..789cb30 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err0;
 
-	err = br_fdb_insert(br, p, dev->dev_addr);
+	err = br_fdb_insert(br, p, dev);
 	if (err)
 		goto err1;
 
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 763a3ec..1423541 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	case NETDEV_CHANGEADDR:
 		spin_lock_bh(&br->lock);
-		br_fdb_changeaddr(p, dev->dev_addr);
+		br_fdb_changeaddr(p, dev);
 		br_stp_recalculate_bridge_id(br);
 		spin_unlock_bh(&br->lock);
 		break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..65ffe3d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -148,7 +148,7 @@ extern int br_fdb_init(void);
 extern void br_fdb_fini(void);
 extern void br_fdb_flush(struct net_bridge *br);
 extern void br_fdb_changeaddr(struct net_bridge_port *p,
-			      const unsigned char *newaddr);
+			      struct net_device *dev);
 extern void br_fdb_cleanup(unsigned long arg);
 extern void br_fdb_delete_by_port(struct net_bridge *br,
 				  const struct net_bridge_port *p, int do_all);
@@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 			  unsigned long count, unsigned long off);
 extern int br_fdb_insert(struct net_bridge *br,
 			 struct net_bridge_port *source,
-			 const unsigned char *addr);
+			 struct net_device *dev);
 extern void br_fdb_update(struct net_bridge *br,
 			  struct net_bridge_port *source,
 			  const unsigned char *addr);
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 2/3] net: bridge: use device address list instead of dev_addr
@ 2009-04-15  8:21     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:21 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

This patch changes the handling of mac addresses of bridge port devices. Now
it uses the previously introduced list of device addresses. This allows the
bridge to know more than one local mac address per port, which is required for
correct operation in some cases.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 net/bridge/br_fdb.c     |  101 ++++++++++++++++++++++++++++++----------------
 net/bridge/br_if.c      |    2 +-
 net/bridge/br_notify.c  |    2 +-
 net/bridge/br_private.h |    4 +-
 4 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..1e63f76 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
 	br_fdb_put(f);
 }
 
-void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+static bool another_port_has_addr(const struct net_bridge_port *p,
+				  struct net_bridge_fdb_entry *f)
+{
+	struct net_bridge *br = p->br;
+	struct net_bridge_port *op;
+
+	list_for_each_entry(op, &br->port_list, list) {
+		if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
+			f->dst = op;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
 {
 	struct net_bridge *br = p->br;
 	int i;
+	struct netdev_hw_addr *ha;
 
 	spin_lock_bh(&br->hash_lock);
 
@@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 
 			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
 			if (f->dst == p && f->is_local) {
-				/* maybe another port has same hw addr? */
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto insert;
-					}
-				}
-
-				/* delete old one */
-				fdb_delete(f);
-				goto insert;
+				/*
+				 * maybe another port has same hw addr?,
+				 * if not then delete it
+				 */
+				if (!another_port_has_addr(p, f))
+					fdb_delete(f);
 			}
 		}
 	}
- insert:
-	/* insert new address,  may fail if invalid address or dup. */
-	fdb_insert(br, p, newaddr);
+
+	/* insert device addresses, may fail if invalid address. */
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		fdb_insert(br, p, ha->addr);
+	}
+	rcu_read_unlock();
 
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 			 * then when one port is deleted, assign
 			 * the local entry to other port
 			 */
-			if (f->is_local) {
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto skip_delete;
-					}
-				}
-			}
-
-			fdb_delete(f);
-		skip_delete: ;
+			if (!f->is_local ||
+			    !another_port_has_addr(p, f))
+				fdb_delete(f);
 		}
 	}
 	spin_unlock_bh(&br->hash_lock);
@@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 }
 
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		      const unsigned char *addr)
 {
 	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
 	struct net_bridge_fdb_entry *fdb;
@@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	return 0;
 }
 
+static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
+			  struct net_device *dev)
+{
+	struct netdev_hw_addr *ha, *ha2;
+	struct net_bridge_fdb_entry *fdb;
+	struct hlist_head *head;
+	int ret = 0;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		ret = fdb_insert(br, source, ha->addr);
+		if (ret)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	for_each_dev_addr(dev, ha2) {
+		if (ha2 == ha)
+			break;
+		head = &br->hash[br_mac_hash(ha2->addr)];
+		fdb = fdb_find(head, ha2->addr);
+		if (fdb && fdb->is_local)
+			fdb_delete(fdb);
+	}
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
 int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		  struct net_device *dev)
 {
 	int ret;
 
 	spin_lock_bh(&br->hash_lock);
-	ret = fdb_insert(br, source, addr);
+	ret = fdb_insert_dev(br, source, dev);
 	spin_unlock_bh(&br->hash_lock);
 	return ret;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..789cb30 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err0;
 
-	err = br_fdb_insert(br, p, dev->dev_addr);
+	err = br_fdb_insert(br, p, dev);
 	if (err)
 		goto err1;
 
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 763a3ec..1423541 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	case NETDEV_CHANGEADDR:
 		spin_lock_bh(&br->lock);
-		br_fdb_changeaddr(p, dev->dev_addr);
+		br_fdb_changeaddr(p, dev);
 		br_stp_recalculate_bridge_id(br);
 		spin_unlock_bh(&br->lock);
 		break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..65ffe3d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -148,7 +148,7 @@ extern int br_fdb_init(void);
 extern void br_fdb_fini(void);
 extern void br_fdb_flush(struct net_bridge *br);
 extern void br_fdb_changeaddr(struct net_bridge_port *p,
-			      const unsigned char *newaddr);
+			      struct net_device *dev);
 extern void br_fdb_cleanup(unsigned long arg);
 extern void br_fdb_delete_by_port(struct net_bridge *br,
 				  const struct net_bridge_port *p, int do_all);
@@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 			  unsigned long count, unsigned long off);
 extern int br_fdb_insert(struct net_bridge *br,
 			 struct net_bridge_port *source,
-			 const unsigned char *addr);
+			 struct net_device *dev);
 extern void br_fdb_update(struct net_bridge *br,
 			  struct net_bridge_port *source,
 			  const unsigned char *addr);
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [PATCH 3/3] net: bonding: add slave device addresses in mode alb
  2009-04-15  8:17   ` [Bridge] " Jiri Pirko
@ 2009-04-15  8:22     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:22 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

When in mode alb, add all device addresses which belong to an enslaved slave
device to the bond device. This ensures that all mac addresses will be
treated as local and bonding in this mode will work fine in bridge.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_main.c |   23 ++++++++++++++++++++++-
 1 files changed, 22 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 99610f3..4025dd0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 	bond->setup_by_slave = 1;
 }
 
+static bool should_copy_dev_addrs(const struct bonding *bond)
+{
+	return (bond->params.mode == BOND_MODE_ALB);
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 {
@@ -1510,6 +1515,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	 */
 	new_slave->original_flags = slave_dev->flags;
 
+	if (should_copy_dev_addrs(bond)) {
+		res = dev_addr_add_multiple(bond_dev, slave_dev);
+		if (res)
+			goto err_free;
+	}
+
 	/*
 	 * Save slave's original ("permanent") mac address for modes
 	 * that need it, and for restoring it upon release, and then
@@ -1527,7 +1538,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		res = dev_set_mac_address(slave_dev, &addr);
 		if (res) {
 			pr_debug("Error %d calling set_mac_address\n", res);
-			goto err_free;
+			goto err_remove_dev_addrs;
 		}
 	}
 
@@ -1769,6 +1780,10 @@ err_restore_mac:
 		dev_set_mac_address(slave_dev, &addr);
 	}
 
+err_remove_dev_addrs:
+	if (should_copy_dev_addrs(bond))
+		dev_addr_del_multiple(bond_dev, slave_dev);
+
 err_free:
 	kfree(new_slave);
 
@@ -1954,6 +1969,9 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 	/* close slave before restoring its mac address */
 	dev_close(slave_dev);
 
+	if (should_copy_dev_addrs(bond))
+		dev_addr_del_multiple(bond_dev, slave_dev);
+
 	if (bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
 		/* restore original ("permanent") mac address */
 		memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2090,6 +2108,9 @@ static int bond_release_all(struct net_device *bond_dev)
 		/* close slave before restoring its mac address */
 		dev_close(slave_dev);
 
+		if (should_copy_dev_addrs(bond))
+			dev_addr_del_multiple(bond_dev, slave_dev);
+
 		if (!bond->params.fail_over_mac) {
 			/* restore original ("permanent") mac address*/
 			memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 3/3] net: bonding: add slave device addresses in mode alb
@ 2009-04-15  8:22     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:22 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

When in mode alb, add all device addresses which belong to an enslaved slave
device to the bond device. This ensures that all mac addresses will be
treated as local and bonding in this mode will work fine in bridge.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_main.c |   23 ++++++++++++++++++++++-
 1 files changed, 22 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 99610f3..4025dd0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 	bond->setup_by_slave = 1;
 }
 
+static bool should_copy_dev_addrs(const struct bonding *bond)
+{
+	return (bond->params.mode == BOND_MODE_ALB);
+}
+
 /* enslave device <slave> to bond device <master> */
 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 {
@@ -1510,6 +1515,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	 */
 	new_slave->original_flags = slave_dev->flags;
 
+	if (should_copy_dev_addrs(bond)) {
+		res = dev_addr_add_multiple(bond_dev, slave_dev);
+		if (res)
+			goto err_free;
+	}
+
 	/*
 	 * Save slave's original ("permanent") mac address for modes
 	 * that need it, and for restoring it upon release, and then
@@ -1527,7 +1538,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		res = dev_set_mac_address(slave_dev, &addr);
 		if (res) {
 			pr_debug("Error %d calling set_mac_address\n", res);
-			goto err_free;
+			goto err_remove_dev_addrs;
 		}
 	}
 
@@ -1769,6 +1780,10 @@ err_restore_mac:
 		dev_set_mac_address(slave_dev, &addr);
 	}
 
+err_remove_dev_addrs:
+	if (should_copy_dev_addrs(bond))
+		dev_addr_del_multiple(bond_dev, slave_dev);
+
 err_free:
 	kfree(new_slave);
 
@@ -1954,6 +1969,9 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 	/* close slave before restoring its mac address */
 	dev_close(slave_dev);
 
+	if (should_copy_dev_addrs(bond))
+		dev_addr_del_multiple(bond_dev, slave_dev);
+
 	if (bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
 		/* restore original ("permanent") mac address */
 		memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2090,6 +2108,9 @@ static int bond_release_all(struct net_device *bond_dev)
 		/* close slave before restoring its mac address */
 		dev_close(slave_dev);
 
+		if (should_copy_dev_addrs(bond))
+			dev_addr_del_multiple(bond_dev, slave_dev);
+
 		if (!bond->params.fail_over_mac) {
 			/* restore original ("permanent") mac address*/
 			memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  8:18     ` [Bridge] " Jiri Pirko
@ 2009-04-15  8:26       ` Li Zefan
  -1 siblings, 0 replies; 214+ messages in thread
From: Li Zefan @ 2009-04-15  8:26 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			return 0;

missing rcu_read_unlock() ?

> +		}
> +	}
> +	rcu_read_unlock();

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15  8:26       ` Li Zefan
  0 siblings, 0 replies; 214+ messages in thread
From: Li Zefan @ 2009-04-15  8:26 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, jgarzik, dada1, davem

> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			return 0;

missing rcu_read_unlock() ?

> +		}
> +	}
> +	rcu_read_unlock();

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  8:26       ` [Bridge] " Li Zefan
@ 2009-04-15  8:29         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:29 UTC (permalink / raw)
  To: Li Zefan
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

Wed, Apr 15, 2009 at 10:26:04AM CEST, lizf@cn.fujitsu.com wrote:
>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> +			    int addr_len, int ignore_index)
>> +{
>> +	struct netdev_hw_addr *ha;
>> +	int i = 0;
>> +
>> +	if (addr_len > MAX_ADDR_LEN)
>> +		return -EINVAL;
>> +
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(ha, list, list) {
>> +		if (i++ != ignore_index &&
>> +		    !memcmp(ha->addr, addr, addr_len)) {
>> +			ha->refcount++;
>> +			return 0;
>
>missing rcu_read_unlock() ?
>
Sure! Thanks...
>> +		}
>> +	}
>> +	rcu_read_unlock();

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15  8:29         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:29 UTC (permalink / raw)
  To: Li Zefan
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, jgarzik, dada1, davem

Wed, Apr 15, 2009 at 10:26:04AM CEST, lizf@cn.fujitsu.com wrote:
>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> +			    int addr_len, int ignore_index)
>> +{
>> +	struct netdev_hw_addr *ha;
>> +	int i = 0;
>> +
>> +	if (addr_len > MAX_ADDR_LEN)
>> +		return -EINVAL;
>> +
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(ha, list, list) {
>> +		if (i++ != ignore_index &&
>> +		    !memcmp(ha->addr, addr, addr_len)) {
>> +			ha->refcount++;
>> +			return 0;
>
>missing rcu_read_unlock() ?
>
Sure! Thanks...
>> +		}
>> +	}
>> +	rcu_read_unlock();

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  8:26       ` [Bridge] " Li Zefan
@ 2009-04-15  8:32         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:32 UTC (permalink / raw)
  To: Li Zefan
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there
is a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list so
original drivers see no difference.

Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   24 ++++
 include/linux/netdevice.h   |   31 +++++-
 net/core/dev.c              |  263 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..348a75e 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
 	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
 }
 
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 *addr)
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
+
 #endif	/* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..77abfdf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +782,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..04cddbb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,262 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			rcu_read_unlock();
+			return 0;
+		}
+	}
+	rcu_read_unlock();
+
+	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			synchronize_rcu();
+			kfree(ha);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+/*
+ * Add every address on @from_list to @to_list. @ignore_index is passed
+ * through to __hw_addr_add_ii()/__hw_addr_del_ii() instead of being
+ * hard-coded to 0. If one addition fails, the additions made so far are
+ * undone and the error is returned.
+ *
+ * No rcu_read_lock() is taken here: the unroll path calls
+ * __hw_addr_del_ii(), which may block in synchronize_rcu(), and that must
+ * not happen inside an RCU read-side critical section.
+ * NOTE(review): assumes updates of both lists are serialized by the caller
+ * (dev_addr_add_multiple() asserts RTNL) - confirm for any new caller.
+ */
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err = 0;
+	struct netdev_hw_addr *ha, *ha2;
+
+	list_for_each_entry(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len,
+				       ignore_index);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+
+unroll:
+	/* Undo only the entries processed before the failing one. */
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, ignore_index);
+	}
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+/*
+ * Call __hw_addr_del_ii() on @to_list for every address on @from_list.
+ * @ignore_index is passed through instead of being hard-coded to 0.
+ *
+ * No rcu_read_lock() is taken here: __hw_addr_del_ii() may block in
+ * synchronize_rcu(), which must not happen inside an RCU read-side
+ * critical section.
+ * NOTE(review): assumes updates of both lists are serialized by the caller
+ * (dev_addr_del_multiple() asserts RTNL) - confirm for any new caller.
+ */
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	list_for_each_entry(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, ignore_index);
+	}
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		synchronize_rcu();
+		kfree(ha);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+/*
+ * Initialise the device address list with a single all-zero address and
+ * point dev->dev_addr at the address buffer of that first entry.
+ * Returns 0 on success or the error returned by __hw_addr_add().
+ */
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	/* sizeof(addr) is the whole buffer; sizeof(*addr) is a single byte. */
+	memset(addr, 0, sizeof(addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		rcu_read_lock();
+		ha = list_first_entry_rcu(&dev->dev_addr_list,
+					  struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+		rcu_read_unlock();
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device whose address list selects the addresses to delete
+ *
+ *	Deletes addresses in @to_dev by the list of addresses in @from_dev and
+ *	then sends a NETDEV_CHANGEADDR notification for @to_dev.
+ *	Returns 0, or -EINVAL if the address lengths of the devices differ.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	/* Must be the _del_ helper; the _add_ one would take extra refs. */
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4257,6 +4513,9 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 
@@ -4779,6 +5038,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5225,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	netdev_unregister_kobject(dev);
 
 	/* Actually switch the network namespace */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15  8:32         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15  8:32 UTC (permalink / raw)
  To: Li Zefan
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, jgarzik, dada1, davem

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there is
a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list so
original drivers see no difference.

Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   24 ++++
 include/linux/netdevice.h   |   31 +++++-
 net/core/dev.c              |  263 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..348a75e 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
 	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
 }
 
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 *addr)
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
+
 #endif	/* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..77abfdf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +782,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..04cddbb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,262 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			rcu_read_unlock();
+			return 0;
+		}
+	}
+	rcu_read_unlock();
+
+	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			synchronize_rcu();
+			kfree(ha);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+				int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err = 0;
+	struct netdev_hw_addr *ha, *ha2;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+		if (err)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	list_for_each_entry_rcu(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+	}
+unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+	}
+	rcu_read_unlock();
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		synchronize_rcu();
+		kfree(ha);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		rcu_read_lock();
+		ha = list_first_entry_rcu(&dev->dev_addr_list,
+					  struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+		rcu_read_unlock();
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4257,6 +4513,9 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 
@@ -4779,6 +5038,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5225,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	netdev_unregister_kobject(dev);
 
 	/* Actually switch the network namespace */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  8:32         ` [Bridge] " Jiri Pirko
@ 2009-04-15  9:21           ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-15  9:21 UTC (permalink / raw)
  To: jpirko
  Cc: lizf, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 15 Apr 2009 10:32:24 +0200

> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

Jiri, please add some distinguishing text to your subject lines when
you post fixed-up versions of patches, like "v2" or something like
that, and make a note under the commit message of the changes you've
made from the previous version.

Otherwise I think it's a dup (because I get a thousand copies anyways)
and will just delete it both in my inbox and on patchwork.

Thanks.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15  9:21           ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-15  9:21 UTC (permalink / raw)
  To: jpirko
  Cc: ivecera, fubar, netdev, bridge, lizf, linux-kernel, mschmidt,
	jgarzik, dada1, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Wed, 15 Apr 2009 10:32:24 +0200

> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

Jiri, please add some distinguishing text to your subject lines when
you post fixed-up versions of patches, like "v2" or something like
that, and make a note under the commit message of the changes you've
made from the previous version.

Otherwise I think it's a dup (because I get a thousand copies anyways)
and will just delete it both in my inbox and on patchwork.

Thanks.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  8:32         ` [Bridge] " Jiri Pirko
@ 2009-04-15  9:27           ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-15  9:27 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Li Zefan, linux-kernel, netdev, jgarzik, davem, shemminger,
	bridge, fubar, bonding-devel, kaber, mschmidt, ivecera

Jiri Pirko a écrit :
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 

You see no difference ? Please look more closely...

I see one additional dereference in the hot path, to small objects, possibly
with false sharing effects.

So I would advise not changing dev_addr[] to a pointer,
and instead copying the first netdev_hw_addr into it.

Also, doing a kzalloc(sizeof(struct netdev_hw_addr)) to allocate these structs
might give a block of memory < L1_CACHE_SIZE, so the kernel is free to give the
other part of this cache line to some other layer that could be a hot spot, and
false sharing could happen.

kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus highly recommended here.

> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/etherdevice.h |   24 ++++
>  include/linux/netdevice.h   |   31 +++++-
>  net/core/dev.c              |  263 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 316 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..348a75e 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
>  	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
>  }
>  
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> +				    const u8 *addr)
> +{
> +	struct netdev_hw_addr *ha;
> +	int res = 1;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		res = compare_ether_addr(addr, ha->addr);

compare_ether_addr_64bits() please ?

> +		if (!res)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return !res;
> +}
> +
>  #endif	/* _LINUX_ETHERDEVICE_H */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..77abfdf 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,12 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct netdev_hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +};
> +
>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +782,11 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> +		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
>  /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>  
>  extern void		ether_setup(struct net_device *dev);
> @@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>  extern int		register_netdev(struct net_device *dev);
>  extern void		unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int		dev_addr_add(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_del(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_add_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +extern int		dev_addr_del_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +
>  /* Functions used for secondary unicast and multicast support */
>  extern void		dev_set_rx_mode(struct net_device *dev);
>  extern void		__dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..04cddbb 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3437,6 +3437,262 @@ void dev_set_rx_mode(struct net_device *dev)
>  	netif_addr_unlock_bh(dev);
>  }
>  
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +
> +	rcu_read_lock();

This locking is highly suspect.

> +	list_for_each_entry_rcu(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			rcu_read_unlock();
> +			return 0;
> +		}
> +	}
> +	rcu_read_unlock();

You obviously need a write lock here to be sure the following
can be done by one CPU only.

You have same problem all over this patch.

> +
> +	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);

kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus highly recommended here.

Also, why is GFP_ATOMIC needed here?

> +	if (!ha)
> +		return -ENOMEM;
> +	memcpy(ha->addr, addr, addr_len);
> +	ha->refcount = 1;
> +	list_add_tail_rcu(&ha->list, list);
> +	return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +				int addr_len)
> +{
> +	return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			if (--ha->refcount)
> +				return 0;
> +			list_del_rcu(&ha->list);
> +			synchronize_rcu();

Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
don't you think? Check kfree_rcu() or an equivalent, as it does not seem
to be included in current kernels yet...

> +			kfree(ha);
> +			return 0;
> +		}
> +	}
> +	return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> +				int addr_len)
> +{
> +	return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> +				     struct list_head *from_list,
> +				     int addr_len, int ignore_index)
> +{
> +	int err = 0;
> +	struct netdev_hw_addr *ha, *ha2;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, from_list, list) {
> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> +		if (err)
> +			goto unroll;
> +	}
> +	goto unlock;
> +unroll:
> +	list_for_each_entry_rcu(ha2, from_list, list) {
> +		if (ha2 == ha)
> +			break;
> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> +	}
> +unlock:
> +	rcu_read_unlock();
> +	return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> +				      struct list_head *from_list,
> +				      int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, from_list, list) {
> +		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> +	}
> +	rcu_read_unlock();
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> +	struct netdev_hw_addr *ha, *tmp;
> +
> +	list_for_each_entry_safe(ha, tmp, list, list) {
> +		list_del_rcu(&ha->list);
> +		synchronize_rcu();

	Oh no... :(

> +		kfree(ha);
> +	}
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> +	__hw_addr_flush(&dev->dev_addr_list);
> +	dev->dev_addr = NULL;
> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> +	unsigned char addr[MAX_ADDR_LEN];
> +	struct netdev_hw_addr *ha;
> +	int err;
> +
> +	INIT_LIST_HEAD(&dev->dev_addr_list);
> +	memset(addr, 0, sizeof(*addr));
> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> +	if (!err) {
> +		/*
> +		 * Get the first (previously created) address from the list
> +		 * and set dev_addr pointer to this location.
> +		 */
> +		rcu_read_lock();

locking is not correct or unnecessary

> +		ha = list_first_entry_rcu(&dev->dev_addr_list,
> +					  struct netdev_hw_addr, list);
> +		dev->dev_addr = ha->addr;
> +		rcu_read_unlock();
> +	}
> +	return err;
> +}
> +
> +/**
> + *	dev_addr_add	- Add a device address
> + *	@dev: device
> + *	@addr: address to add
> + *
> + *	Add a device address to the device or increase the reference count if
> + *	it already exists.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + *	dev_addr_del	- Release a device address.
> + *	@dev: device
> + *	@addr: address to delete
> + *
> + *	Release reference to a device address and remove it from the device
> + *	if the reference count drops to zero.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + *	dev_addr_add_multiple	- Add device addresses from another device
> + *	@to_dev: device to which addresses will be added
> + *	@from_dev: device from which addresses will be added
> + *
> + *	Add device addresses of the one device to another.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +					&from_dev->dev_addr_list,
> +					to_dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + *	dev_addr_del_multiple	- Delete device addresses by another device
> + *	@to_dev: device where the addresses will be deleted
> + *	@from_dev: device by which addresses the addresses will be deleted
> + *
> + *	Deletes addresses in to device by the list of addresses in from device.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +				  &from_dev->dev_addr_list,
> +				  to_dev->addr_len, 0);
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
>  int __dev_addr_delete(struct dev_addr_list **list, int *count,
>  		      void *addr, int alen, int glbl)
>  {
> @@ -4257,6 +4513,9 @@ static void rollback_registered(struct net_device *dev)
>  	 */
>  	dev_addr_discard(dev);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	if (dev->netdev_ops->ndo_uninit)
>  		dev->netdev_ops->ndo_uninit(dev);
>  
> @@ -4779,6 +5038,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  
>  	dev->gso_max_size = GSO_MAX_SIZE;
>  
> +	dev_addr_init(dev);
>  	netdev_init_queues(dev);
>  
>  	INIT_LIST_HEAD(&dev->napi_list);
> @@ -4965,6 +5225,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	 */
>  	dev_addr_discard(dev);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	netdev_unregister_kobject(dev);
>  
>  	/* Actually switch the network namespace */



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15  9:27           ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-15  9:27 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, Li Zefan, linux-kernel, mschmidt,
	bonding-devel, jgarzik, davem

Jiri Pirko a écrit :
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 

You see no difference ? Please look more closely...

I see one additional dereference in hot path, to small objects possibly
with false sharing effects.

So I would advise not changing dev_addr[] to a pointer.
And instead copy first netdev_hw_addr into it.

Also, doing a kzalloc(sizeof(struct netdev_hw_addr)) for allocating these structs
might give a block of memory < L1_CACHE_SIZE so kernel is free to give other
part of this cache line to some other layer that could be a hot spot, so
false sharing could happen.

kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus highly recommended here.

> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/etherdevice.h |   24 ++++
>  include/linux/netdevice.h   |   31 +++++-
>  net/core/dev.c              |  263 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 316 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..348a75e 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
>  	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
>  }
>  
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> +				    const u8 *addr)
> +{
> +	struct netdev_hw_addr *ha;
> +	int res = 1;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		res = compare_ether_addr(addr, ha->addr);

compare_ether_addr_64bits() please ?

> +		if (!res)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return !res;
> +}
> +
>  #endif	/* _LINUX_ETHERDEVICE_H */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..77abfdf 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,12 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct netdev_hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +};
> +
>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +782,11 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> +		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
>  /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>  
>  extern void		ether_setup(struct net_device *dev);
> @@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>  extern int		register_netdev(struct net_device *dev);
>  extern void		unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int		dev_addr_add(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_del(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_add_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +extern int		dev_addr_del_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +
>  /* Functions used for secondary unicast and multicast support */
>  extern void		dev_set_rx_mode(struct net_device *dev);
>  extern void		__dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..04cddbb 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3437,6 +3437,262 @@ void dev_set_rx_mode(struct net_device *dev)
>  	netif_addr_unlock_bh(dev);
>  }
>  
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +
> +	rcu_read_lock();

This locking is highly suspect.

> +	list_for_each_entry_rcu(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			rcu_read_unlock();
> +			return 0;
> +		}
> +	}
> +	rcu_read_unlock();

Since you obviously need a write lock here to be sure following
can be done by one cpu only.

You have same problem all over this patch.

> +
> +	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);

kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus highly recommended here.

Also, why GFP_ATOMIC is needed here ?

> +	if (!ha)
> +		return -ENOMEM;
> +	memcpy(ha->addr, addr, addr_len);
> +	ha->refcount = 1;
> +	list_add_tail_rcu(&ha->list, list);
> +	return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +				int addr_len)
> +{
> +	return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			if (--ha->refcount)
> +				return 0;
> +			list_del_rcu(&ha->list);
> +			synchronize_rcu();

Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
don't you think? Check kfree_rcu() or equivalent, as it seems not yet
included in current kernels...

> +			kfree(ha);
> +			return 0;
> +		}
> +	}
> +	return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> +				int addr_len)
> +{
> +	return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> +				     struct list_head *from_list,
> +				     int addr_len, int ignore_index)
> +{
> +	int err = 0;
> +	struct netdev_hw_addr *ha, *ha2;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, from_list, list) {
> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> +		if (err)
> +			goto unroll;
> +	}
> +	goto unlock;
> +unroll:
> +	list_for_each_entry_rcu(ha2, from_list, list) {
> +		if (ha2 == ha)
> +			break;
> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> +	}
> +unlock:
> +	rcu_read_unlock();
> +	return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> +				      struct list_head *from_list,
> +				      int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, from_list, list) {
> +		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> +	}
> +	rcu_read_unlock();
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> +	struct netdev_hw_addr *ha, *tmp;
> +
> +	list_for_each_entry_safe(ha, tmp, list, list) {
> +		list_del_rcu(&ha->list);
> +		synchronize_rcu();

	Oh no... :(

> +		kfree(ha);
> +	}
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> +	__hw_addr_flush(&dev->dev_addr_list);
> +	dev->dev_addr = NULL;
> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> +	unsigned char addr[MAX_ADDR_LEN];
> +	struct netdev_hw_addr *ha;
> +	int err;
> +
> +	INIT_LIST_HEAD(&dev->dev_addr_list);
> +	memset(addr, 0, sizeof(*addr));
> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> +	if (!err) {
> +		/*
> +		 * Get the first (previously created) address from the list
> +		 * and set dev_addr pointer to this location.
> +		 */
> +		rcu_read_lock();

locking is not correct or unnecessary

> +		ha = list_first_entry_rcu(&dev->dev_addr_list,
> +					  struct netdev_hw_addr, list);
> +		dev->dev_addr = ha->addr;
> +		rcu_read_unlock();
> +	}
> +	return err;
> +}
> +
> +/**
> + *	dev_addr_add	- Add a device address
> + *	@dev: device
> + *	@addr: address to add
> + *
> + *	Add a device address to the device or increase the reference count if
> + *	it already exists.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + *	dev_addr_del	- Release a device address.
> + *	@dev: device
> + *	@addr: address to delete
> + *
> + *	Release reference to a device address and remove it from the device
> + *	if the reference count drops to zero.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + *	dev_addr_add_multiple	- Add device addresses from another device
> + *	@to_dev: device to which addresses will be added
> + *	@from_dev: device from which addresses will be added
> + *
> + *	Add device addresses of the one device to another.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +					&from_dev->dev_addr_list,
> +					to_dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + *	dev_addr_del_multiple	- Delete device addresses by another device
> + *	@to_dev: device where the addresses will be deleted
> + *	@from_dev: device by which addresses the addresses will be deleted
> + *
> + *	Deletes addresses in to device by the list of addresses in from device.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +				  &from_dev->dev_addr_list,
> +				  to_dev->addr_len, 0);
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
>  int __dev_addr_delete(struct dev_addr_list **list, int *count,
>  		      void *addr, int alen, int glbl)
>  {
> @@ -4257,6 +4513,9 @@ static void rollback_registered(struct net_device *dev)
>  	 */
>  	dev_addr_discard(dev);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	if (dev->netdev_ops->ndo_uninit)
>  		dev->netdev_ops->ndo_uninit(dev);
>  
> @@ -4779,6 +5038,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  
>  	dev->gso_max_size = GSO_MAX_SIZE;
>  
> +	dev_addr_init(dev);
>  	netdev_init_queues(dev);
>  
>  	INIT_LIST_HEAD(&dev->napi_list);
> @@ -4965,6 +5225,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	 */
>  	dev_addr_discard(dev);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	netdev_unregister_kobject(dev);
>  
>  	/* Actually switch the network namespace */



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  9:27           ` [Bridge] " Eric Dumazet
@ 2009-04-15  9:31             ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-15  9:31 UTC (permalink / raw)
  To: dada1
  Cc: jpirko, lizf, linux-kernel, netdev, jgarzik, shemminger, bridge,
	fubar, bonding-devel, kaber, mschmidt, ivecera

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 15 Apr 2009 11:27:50 +0200

> Since you obviously need a write lock here to be sure following
> can be done by one cpu only.
> 
> You have same problem all over this patch.

RTNL semaphore is held across all modification operations.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15  9:31             ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-15  9:31 UTC (permalink / raw)
  To: dada1
  Cc: ivecera, fubar, jpirko, netdev, bridge, lizf, linux-kernel,
	mschmidt, jgarzik, bonding-devel

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 15 Apr 2009 11:27:50 +0200

> Since you obviously need a write lock here to be sure following
> can be done by one cpu only.
> 
> You have same problem all over this patch.

RTNL semaphore is held across all modification operations.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  9:31             ` [Bridge] " David Miller
@ 2009-04-15 10:13               ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-04-15 10:13 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, jpirko, lizf, linux-kernel, netdev, jgarzik, shemminger,
	bridge, fubar, bonding-devel, mschmidt, ivecera

David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Wed, 15 Apr 2009 11:27:50 +0200
> 
>> Since you obviously need a write lock here to be sure following
>> can be done by one cpu only.
>>
>> You have same problem all over this patch.
> 
> RTNL semaphore is held across all modification operations.

If this will also be used for multicast lists, changes can happen
(IPv6) without the RTNL.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 10:13               ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-04-15 10:13 UTC (permalink / raw)
  To: David Miller
  Cc: ivecera, fubar, jpirko, netdev, bridge, lizf, linux-kernel,
	mschmidt, dada1, jgarzik, bonding-devel

David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Wed, 15 Apr 2009 11:27:50 +0200
> 
>> Since you obviously need a write lock here to be sure following
>> can be done by one cpu only.
>>
>> You have same problem all over this patch.
> 
> RTNL semaphore is held across all modification operations.

If this will also be used for multicast lists, changes can happen
(IPv6) without the RTNL.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15 10:13               ` [Bridge] " Patrick McHardy
@ 2009-04-15 10:15                 ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-15 10:15 UTC (permalink / raw)
  To: kaber
  Cc: dada1, jpirko, lizf, linux-kernel, netdev, jgarzik, shemminger,
	bridge, fubar, bonding-devel, mschmidt, ivecera

From: Patrick McHardy <kaber@trash.net>
Date: Wed, 15 Apr 2009 12:13:57 +0200

> David Miller wrote:
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>> 
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>> RTNL semaphore is held across all modification operations.
> 
> If this will also be used for multicast lists, changes can happen
> (IPv6) without the RTNL.

Indeed, that is true :-/

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 10:15                 ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-15 10:15 UTC (permalink / raw)
  To: kaber
  Cc: ivecera, fubar, jpirko, netdev, bridge, lizf, linux-kernel,
	mschmidt, dada1, jgarzik, bonding-devel

From: Patrick McHardy <kaber@trash.net>
Date: Wed, 15 Apr 2009 12:13:57 +0200

> David Miller wrote:
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>> 
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>> RTNL semaphore is held across all modification operations.
> 
> If this will also be used for multicast lists, changes can happen
> (IPv6) without the RTNL.

Indeed, that is true :-/

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15 10:15                 ` [Bridge] " David Miller
@ 2009-04-15 10:41                   ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-04-15 10:41 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, jpirko, lizf, linux-kernel, netdev, jgarzik, shemminger,
	bridge, fubar, bonding-devel, mschmidt, ivecera

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Wed, 15 Apr 2009 12:13:57 +0200
> 
>> David Miller wrote:
>>> From: Eric Dumazet <dada1@cosmosbay.com>
>>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>>>
>>>> Since you obviously need a write lock here to be sure following
>>>> can be done by one cpu only.
>>>>
>>>> You have same problem all over this patch.
>>> RTNL semaphore is held across all modification operations.
>> If this will also be used for multicast lists, changes can happen
>> (IPv6) without the RTNL.
> 
> Indeed, that is true :-/

Herbert (I think) suggested to make address list updates in softirq
context a two-step process, where addresses would first be added to
a temporary list and the final change would be done in process context
while holding the RTNL.

Given the complicated mess we currently have, this would be a very
worthwhile change IMO.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 10:41                   ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-04-15 10:41 UTC (permalink / raw)
  To: David Miller
  Cc: ivecera, fubar, jpirko, netdev, bridge, lizf, linux-kernel,
	mschmidt, dada1, jgarzik, bonding-devel

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Wed, 15 Apr 2009 12:13:57 +0200
> 
>> David Miller wrote:
>>> From: Eric Dumazet <dada1@cosmosbay.com>
>>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>>>
>>>> Since you obviously need a write lock here to be sure following
>>>> can be done by one cpu only.
>>>>
>>>> You have same problem all over this patch.
>>> RTNL semaphore is held across all modification operations.
>> If this will also be used for multicast lists, changes can happen
>> (IPv6) without the RTNL.
> 
> Indeed, that is true :-/

Herbert (I think) suggested to make address list updates in softirq
context a two-step process, where addresses would first be added to
a temporary list and the final change would be done in process context
while holding the RTNL.

Given the complicated mess we currently have, this would be a very
worthwhile change IMO.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15 10:41                   ` [Bridge] " Patrick McHardy
@ 2009-04-15 10:45                     ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-15 10:45 UTC (permalink / raw)
  To: kaber
  Cc: dada1, jpirko, lizf, linux-kernel, netdev, jgarzik, shemminger,
	bridge, fubar, bonding-devel, mschmidt, ivecera

From: Patrick McHardy <kaber@trash.net>
Date: Wed, 15 Apr 2009 12:41:01 +0200

> Herbert (I think) suggested to make address list updates in softirq
> context a two-step process, where addresses would first be added to
> a temporary list and the final change would be done in process context
> while holding the RTNL.
> 
> Given the complicated mess we currently have, this would be a very
> worthwhile change IMO.

This would break the IPV6 TAHI tests if you think we could use
such an idea for that.

When IPV6 packets arrive that influence multicast and unicast
address lists, the effect must be essentially immediate.  Such
that a subsequent packet will cause the kernel the behave
with the necessary side effects, no matter how quickly that
next packet arrives.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 10:45                     ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-04-15 10:45 UTC (permalink / raw)
  To: kaber
  Cc: ivecera, fubar, jpirko, netdev, bridge, lizf, linux-kernel,
	mschmidt, dada1, jgarzik, bonding-devel

From: Patrick McHardy <kaber@trash.net>
Date: Wed, 15 Apr 2009 12:41:01 +0200

> Herbert (I think) suggested to make address list updates in softirq
> context a two-step process, where addresses would first be added to
> a temporary list and the final change would be done in process context
> while holding the RTNL.
> 
> Given the complicated mess we currently have, this would be a very
> worthwhile change IMO.

This would break the IPV6 TAHI tests if you think we could use
such an idea for that.

When IPV6 packets arrive that influence multicast and unicast
address lists, the effect must be essentially immediate.  Such
that a subsequent packet will cause the kernel the behave
with the necessary side effects, no matter how quickly that
next packet arrives.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15 10:45                     ` [Bridge] " David Miller
@ 2009-04-15 10:47                       ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-04-15 10:47 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, jpirko, lizf, linux-kernel, netdev, jgarzik, shemminger,
	bridge, fubar, bonding-devel, mschmidt, ivecera

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Wed, 15 Apr 2009 12:41:01 +0200
> 
>> Herbert (I think) suggested to make address list updates in softirq
>> context a two-step process, where addresses would first be added to
>> a temporary list and the final change would be done in process context
>> while holding the RTNL.
>>
>> Given the complicated mess we currently have, this would be a very
>> worthwhile change IMO.
> 
> This would break the IPV6 TAHI tests if you think we could use
> such an idea for that.
> 
> When IPV6 packets arrive that influence multicast and unicast
> address lists, the effect must be essentially immediate.  Such
> that a subsequent packet will cause the kernel the behave
> with the necessary side effects, no matter how quickly that
> next packet arrives.

I see, thanks for the explanation.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 10:47                       ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-04-15 10:47 UTC (permalink / raw)
  To: David Miller
  Cc: ivecera, fubar, jpirko, netdev, bridge, lizf, linux-kernel,
	mschmidt, dada1, jgarzik, bonding-devel

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Wed, 15 Apr 2009 12:41:01 +0200
> 
>> Herbert (I think) suggested to make address list updates in softirq
>> context a two-step process, where addresses would first be added to
>> a temporary list and the final change would be done in process context
>> while holding the RTNL.
>>
>> Given the complicated mess we currently have, this would be a very
>> worthwhile change IMO.
> 
> This would break the IPV6 TAHI tests if you think we could use
> such an idea for that.
> 
> When IPV6 packets arrive that influence multicast and unicast
> address lists, the effect must be essentially immediate.  Such
> that a subsequent packet will cause the kernel the behave
> with the necessary side effects, no matter how quickly that
> next packet arrives.

I see, thanks for the explanation.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15  9:27           ` [Bridge] " Eric Dumazet
@ 2009-04-15 11:17             ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15 11:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Li Zefan, linux-kernel, netdev, jgarzik, davem, shemminger,
	bridge, fubar, bonding-devel, kaber, mschmidt, ivecera

Wed, Apr 15, 2009 at 11:27:50AM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> This patch introduces a new list in struct net_device and brings a set of
>> functions to handle the work with device address list. The list is a replacement
>> for the original dev_addr field and because in some situations there is need to
>> carry several device addresses with the net device. To be backward compatible,
>> dev_addr is made to point to the first member of the list so original drivers
>> sees no difference.
>> 
>
>You see no difference ? Please look more closely...
>
>I see one additional dereference in hot path, to small objects possibly
>with false sharing effects.
>
>So I would advise not changing dev_addr[] to a pointer.
>And instead copy first netdev_hw_addr into it.

Hmm :( That is what I was trying to avoid. If the first netdev_hw_addr in the
list is a copy of dev_addr, then there must be synchronizing of those two. This
would be a pain.. Plus I thought that eventually dev_addr would not be
accessible directly but only by a set of macros/inlines to access the list, and
then dev_addr would be removed from struct net_device.
>
>Also, doing a kzalloc(sizeof(struct netdev_hw_addr)) for allocating these structs
>might give a block of memory < L1_CACHE_SIZE so kernel is free to give other
>part of this cache line to some other layer that could be a hot spot, so
>false sharing could happen.
>
>kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus higly recommended here.
You mean PAGE_CACHE_SIZE? I think that would be a little wasteful... But I see your
point...
>
>> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> ---
>>  include/linux/etherdevice.h |   24 ++++
>>  include/linux/netdevice.h   |   31 +++++-
>>  net/core/dev.c              |  263 +++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 316 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>> index a1f17ab..348a75e 100644
>> --- a/include/linux/etherdevice.h
>> +++ b/include/linux/etherdevice.h
>> @@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
>>  	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
>>  }
>>  
>> +/**
>> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>> + * @dev: Pointer to a device structure
>> + * @addr: Pointer to a six-byte array containing the Ethernet address
>> + *
>> + * Compare passed address with all addresses of the device. Return true if the
>> + * address if one of the device addresses.
>> + */
>> +static inline bool is_etherdev_addr(const struct net_device *dev,
>> +				    const u8 *addr)
>> +{
>> +	struct netdev_hw_addr *ha;
>> +	int res = 1;
>> +
>> +	rcu_read_lock();
>> +	for_each_dev_addr(dev, ha) {
>> +		res = compare_ether_addr(addr, ha->addr);
>
>compare_ether_addr_64bits() please ?
>
I used the original as the bridge code used it. Ok, noted.

<snip>

>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> +			    int addr_len, int ignore_index)
>> +{
>> +	struct netdev_hw_addr *ha;
>> +	int i = 0;
>> +
>> +	if (addr_len > MAX_ADDR_LEN)
>> +		return -EINVAL;
>> +
>> +	rcu_read_lock();
>
>This locking is highly suspect.
>
>> +	list_for_each_entry_rcu(ha, list, list) {
>> +		if (i++ != ignore_index &&
>> +		    !memcmp(ha->addr, addr, addr_len)) {
>> +			ha->refcount++;
>> +			rcu_read_unlock();
>> +			return 0;
>> +		}
>> +	}
>> +	rcu_read_unlock();
>
>Since you obviously need a write lock here to be sure following
>can be done by one cpu only.
>
>You have same problem all over this patch.

Yes, as Dave wrote, this is guarded by RTNL mutex.
>
>> +
>> +	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>
>kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>
>Also, why GFP_ATOMIC is needed here ?

Yes, it is not needed here. I've copied it here from the original unicast and
multicast add function to stay close but as I can see, there is no need for it
there either.
Noted.
>

<snip>

>> +	list_for_each_entry(ha, list, list) {
>> +		if (i++ != ignore_index &&
>> +		    !memcmp(ha->addr, addr, addr_len)) {
>> +			if (--ha->refcount)
>> +				return 0;
>> +			list_del_rcu(&ha->list);
>> +			synchronize_rcu();
>
>Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
>dont you think ? Check kfree_rcu() or equivalent, as it seems not yet
>included in current kernels...
>
Well once kfree_rcu() will be in the tree I will be happy to replace this.

>> +			kfree(ha);
>> +			return 0;
>> +		}
>> +	}
>> +	return -ENOENT;

<snip>

>> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>> +	if (!err) {
>> +		/*
>> +		 * Get the first (previously created) address from the list
>> +		 * and set dev_addr pointer to this location.
>> +		 */
>> +		rcu_read_lock();
>
>locking is not correct or unnecessary

Agree that here locking is not necessary, but I wanted to stay consistent with the
rest of the code. Do you think I should remove locking here entirely?

>
>> +		ha = list_first_entry_rcu(&dev->dev_addr_list,
>> +					  struct netdev_hw_addr, list);
>> +		dev->dev_addr = ha->addr;
>> +		rcu_read_unlock();
>> +	}
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 11:17             ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15 11:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: ivecera, fubar, netdev, bridge, Li Zefan, linux-kernel, mschmidt,
	bonding-devel, jgarzik, davem

Wed, Apr 15, 2009 at 11:27:50AM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> This patch introduces a new list in struct net_device and brings a set of
>> functions to handle the work with device address list. The list is a replacement
>> for the original dev_addr field and because in some situations there is need to
>> carry several device addresses with the net device. To be backward compatible,
>> dev_addr is made to point to the first member of the list so original drivers
>> sees no difference.
>> 
>
>You see no difference ? Please look more closely...
>
>I see one additional dereference in hot path, to small objects possibly
>with false sharing effects.
>
>So I would advise not changing dev_addr[] to a pointer.
>And instead copy first netdev_hw_addr into it.

Hmm :( That is what I was trying to avoid. If the first netdev_hw_addr in the
list is a copy of dev_addr, then the two must be kept synchronized. This
would be a pain... Plus I thought that eventually dev_addr would not be
accessible directly, but only through a set of macros/inlines that access the
list, and then dev_addr would be removed from struct net_device.
>
>Also, doing a kzalloc(sizeof(struct netdev_hw_addr)) for allocating these structs
>might give a block of memory < L1_CACHE_SIZE so kernel is free to give other
>part of this cache line to some other layer that could be a hot spot, so
>false sharing could happen.
>
>kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus higly recommended here.
You mean PAGE_CACHE_SIZE? I think that would be a little wasteful... But I see
your point...
>
>> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> ---
>>  include/linux/etherdevice.h |   24 ++++
>>  include/linux/netdevice.h   |   31 +++++-
>>  net/core/dev.c              |  263 +++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 316 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>> index a1f17ab..348a75e 100644
>> --- a/include/linux/etherdevice.h
>> +++ b/include/linux/etherdevice.h
>> @@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
>>  	       (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
>>  }
>>  
>> +/**
>> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>> + * @dev: Pointer to a device structure
>> + * @addr: Pointer to a six-byte array containing the Ethernet address
>> + *
>> + * Compare passed address with all addresses of the device. Return true if the
>> + * address if one of the device addresses.
>> + */
>> +static inline bool is_etherdev_addr(const struct net_device *dev,
>> +				    const u8 *addr)
>> +{
>> +	struct netdev_hw_addr *ha;
>> +	int res = 1;
>> +
>> +	rcu_read_lock();
>> +	for_each_dev_addr(dev, ha) {
>> +		res = compare_ether_addr(addr, ha->addr);
>
>compare_ether_addr_64bits() please ?
>
I used the original as the bridge code used it. Ok, noted.

<snip>

>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> +			    int addr_len, int ignore_index)
>> +{
>> +	struct netdev_hw_addr *ha;
>> +	int i = 0;
>> +
>> +	if (addr_len > MAX_ADDR_LEN)
>> +		return -EINVAL;
>> +
>> +	rcu_read_lock();
>
>This locking is highly suspect.
>
>> +	list_for_each_entry_rcu(ha, list, list) {
>> +		if (i++ != ignore_index &&
>> +		    !memcmp(ha->addr, addr, addr_len)) {
>> +			ha->refcount++;
>> +			rcu_read_unlock();
>> +			return 0;
>> +		}
>> +	}
>> +	rcu_read_unlock();
>
>Since you obviously need a write lock here to be sure following
>can be done by one cpu only.
>
>You have same problem all over this patch.

Yes, as Dave wrote, this is guarded by RTNL mutex.
>
>> +
>> +	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>
>kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>
>Also, why GFP_ATOMIC is needed here ?

Yes, it is not needed here. I copied it from the original unicast and
multicast add functions to stay close to them, but as far as I can see there is
no need for it there either.
Noted.
>

<snip>

>> +	list_for_each_entry(ha, list, list) {
>> +		if (i++ != ignore_index &&
>> +		    !memcmp(ha->addr, addr, addr_len)) {
>> +			if (--ha->refcount)
>> +				return 0;
>> +			list_del_rcu(&ha->list);
>> +			synchronize_rcu();
>
>Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
>dont you think ? Check kfree_rcu() or equivalent, as it seems not yet
>included in current kernels...
>
Well, once kfree_rcu() is in the tree I will be happy to replace this.

>> +			kfree(ha);
>> +			return 0;
>> +		}
>> +	}
>> +	return -ENOENT;

<snip>

>> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>> +	if (!err) {
>> +		/*
>> +		 * Get the first (previously created) address from the list
>> +		 * and set dev_addr pointer to this location.
>> +		 */
>> +		rcu_read_lock();
>
>locking is not correct or unnecessary

I agree that locking is not necessary here, but I wanted to stay consistent with
the rest of the code. Do you think I should remove the locking here entirely?

>
>> +		ha = list_first_entry_rcu(&dev->dev_addr_list,
>> +					  struct netdev_hw_addr, list);
>> +		dev->dev_addr = ha->addr;
>> +		rcu_read_unlock();
>> +	}
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15 11:17             ` [Bridge] " Jiri Pirko
@ 2009-04-15 11:22               ` Patrick McHardy
  -1 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-04-15 11:22 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Eric Dumazet, Li Zefan, linux-kernel, netdev, jgarzik, davem,
	shemminger, bridge, fubar, bonding-devel, mschmidt, ivecera

Jiri Pirko wrote:

>> Since you obviously need a write lock here to be sure following
>> can be done by one cpu only.
>>
>> You have same problem all over this patch.
> 
> Yes, as Dave wrote, this is guarded by RTNL mutex.

This was incorrect. IPv6 adds multicast addresses in softirq context.

>>> +
>>> +	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>>
>> Also, why GFP_ATOMIC is needed here ?
> 
> Yes, it is not needed here. I've copied it here from the original unicast and
> multicast add funtion to stay close but as I can see, there is no need for it
> there either.
> Noted.

Also needed for IPv6 in softirq context.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 11:22               ` Patrick McHardy
  0 siblings, 0 replies; 214+ messages in thread
From: Patrick McHardy @ 2009-04-15 11:22 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, bonding-devel, netdev, bridge, Li Zefan,
	linux-kernel, mschmidt, Eric Dumazet, jgarzik, davem

Jiri Pirko wrote:

>> Since you obviously need a write lock here to be sure following
>> can be done by one cpu only.
>>
>> You have same problem all over this patch.
> 
> Yes, as Dave wrote, this is guarded by RTNL mutex.

This was incorrect. IPv6 adds multicast addresses in softirq context.

>>> +
>>> +	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>>
>> Also, why GFP_ATOMIC is needed here ?
> 
> Yes, it is not needed here. I've copied it here from the original unicast and
> multicast add funtion to stay close but as I can see, there is no need for it
> there either.
> Noted.

Also needed for IPv6 in softirq context.


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15 11:22               ` [Bridge] " Patrick McHardy
@ 2009-04-15 11:28                 ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15 11:28 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric Dumazet, Li Zefan, linux-kernel, netdev, jgarzik, davem,
	shemminger, bridge, fubar, bonding-devel, mschmidt, ivecera

Wed, Apr 15, 2009 at 01:22:32PM CEST, kaber@trash.net wrote:
> Jiri Pirko wrote:
>
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>>
>> Yes, as Dave wrote, this is guarded by RTNL mutex.
>
> This was incorrect. IPv6 adds multicast addresses in softirq context.

Yes, I see that.
>
>>>> +
>>>> +	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>>>
>>> Also, why GFP_ATOMIC is needed here ?
>>
>> Yes, it is not needed here. I've copied it here from the original unicast and
>> multicast add funtion to stay close but as I can see, there is no need for it
>> there either.
>> Noted.
>
> Also needed for IPv6 in softirq context.
>

Noted...

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 11:28                 ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15 11:28 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: ivecera, fubar, bonding-devel, netdev, bridge, Li Zefan,
	linux-kernel, mschmidt, Eric Dumazet, jgarzik, davem

Wed, Apr 15, 2009 at 01:22:32PM CEST, kaber@trash.net wrote:
> Jiri Pirko wrote:
>
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>>
>> Yes, as Dave wrote, this is guarded by RTNL mutex.
>
> This was incorrect. IPv6 adds multicast addresses in softirq context.

Yes, I see that.
>
>>>> +
>>>> +	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>>>
>>> Also, why GFP_ATOMIC is needed here ?
>>
>> Yes, it is not needed here. I've copied it here from the original unicast and
>> multicast add funtion to stay close but as I can see, there is no need for it
>> there either.
>> Noted.
>
> Also needed for IPv6 in softirq context.
>

Noted...

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15 11:17             ` [Bridge] " Jiri Pirko
@ 2009-04-15 12:28               ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-15 12:28 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Li Zefan, linux-kernel, netdev, jgarzik, davem, shemminger,
	bridge, fubar, bonding-devel, kaber, mschmidt, ivecera

Jiri Pirko a écrit :
> Wed, Apr 15, 2009 at 11:27:50AM CEST, dada1@cosmosbay.com wrote:

>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus higly recommended here.
> You mean PAGE_CACHE_SIZE? I think that would be little wasting... But I see your
> point...

No, I meant L1_CACHE_BYTES    (usually 64 bytes on x86), I always confuse BYTES and SIZE on this one...


>>> +	list_for_each_entry(ha, list, list) {
>>> +		if (i++ != ignore_index &&
>>> +		    !memcmp(ha->addr, addr, addr_len)) {
>>> +			if (--ha->refcount)
>>> +				return 0;
>>> +			list_del_rcu(&ha->list);
>>> +			synchronize_rcu();
>> Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
>> dont you think ? Check kfree_rcu() or equivalent, as it seems not yet
>> included in current kernels...
>>
> Well once kfree_rcu() will be in the tree I will be happy to replace this.

If kfree_rcu() is not yet available, please use a regular call_rcu() construct
(thus adding a struct rcu_head rcu; in struct netdev_hw_addr)

If you delete, say, 10 addresses on a device while RTNL (or another lock) is held,
that means a lot of calls to synchronize_rcu() and a long lock hold time.

> 
>>> +			kfree(ha);
>>> +			return 0;
>>> +		}
>>> +	}
>>> +	return -ENOENT;
> 
> <snip>
> 
>>> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>>> +	if (!err) {
>>> +		/*
>>> +		 * Get the first (previously created) address from the list
>>> +		 * and set dev_addr pointer to this location.
>>> +		 */
>>> +		rcu_read_lock();
>> locking is not correct or unnecessary
> 
> Agree that here locking is not necessary, but I wanted to stay consistent to the
> rest of the code. Do you think I should remove locking here entirely?

Yes, it is very confusing for reviewers because we feel the patch submitter
is not comfortable with locking rules.

Check for example dev_add_pack() in net/core/dev.c : It uses list_add_rcu()
but as it also uses a regular spinlock, there is no point using rcu_read_lock().

void dev_add_pack(struct packet_type *pt)
{
        int hash;

        spin_lock_bh(&ptype_lock);
        if (pt->type == htons(ETH_P_ALL))
                list_add_rcu(&pt->list, &ptype_all);
        else {
                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
                list_add_rcu(&pt->list, &ptype_base[hash]);
        }
        spin_unlock_bh(&ptype_lock);
}



Please note list_add_rcu() (and/or rcu_assign_pointer()) are still needed to protect
readers that don't use the spinlock at all.

If you use fact that RTNL is locked when calling your code, you could add
ASSERT_RTNL();
at strategic points so that this assertion can be checked at runtime.

(but Patrick & David wrote that you should not assume RTNL, so you probably need another lock...)

Thank you


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 12:28               ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-15 12:28 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, Li Zefan, linux-kernel, mschmidt,
	bonding-devel, jgarzik, davem

Jiri Pirko a écrit :
> Wed, Apr 15, 2009 at 11:27:50AM CEST, dada1@cosmosbay.com wrote:

>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus higly recommended here.
> You mean PAGE_CACHE_SIZE? I think that would be little wasting... But I see your
> point...

No, I meant L1_CACHE_BYTES    (usually 64 bytes on x86), I always confuse BYTES and SIZE on this one...


>>> +	list_for_each_entry(ha, list, list) {
>>> +		if (i++ != ignore_index &&
>>> +		    !memcmp(ha->addr, addr, addr_len)) {
>>> +			if (--ha->refcount)
>>> +				return 0;
>>> +			list_del_rcu(&ha->list);
>>> +			synchronize_rcu();
>> Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
>> dont you think ? Check kfree_rcu() or equivalent, as it seems not yet
>> included in current kernels...
>>
> Well once kfree_rcu() will be in the tree I will be happy to replace this.

If kfree_rcu() is not yet available, please use a regular call_rcu() construct
(thus adding a struct rcu_head rcu; in struct netdev_hw_addr)

If you delete, say, 10 addresses on a device while RTNL (or another lock) is held,
that means a lot of calls to synchronize_rcu() and a long lock hold time.

> 
>>> +			kfree(ha);
>>> +			return 0;
>>> +		}
>>> +	}
>>> +	return -ENOENT;
> 
> <snip>
> 
>>> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>>> +	if (!err) {
>>> +		/*
>>> +		 * Get the first (previously created) address from the list
>>> +		 * and set dev_addr pointer to this location.
>>> +		 */
>>> +		rcu_read_lock();
>> locking is not correct or unnecessary
> 
> Agree that here locking is not necessary, but I wanted to stay consistent to the
> rest of the code. Do you think I should remove locking here entirely?

Yes, it is very confusing for reviewers because we feel the patch submitter
is not comfortable with locking rules.

Check for example dev_add_pack() in net/core/dev.c : It uses list_add_rcu()
but as it also uses a regular spinlock, there is no point using rcu_read_lock().

void dev_add_pack(struct packet_type *pt)
{
        int hash;

        spin_lock_bh(&ptype_lock);
        if (pt->type == htons(ETH_P_ALL))
                list_add_rcu(&pt->list, &ptype_all);
        else {
                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
                list_add_rcu(&pt->list, &ptype_base[hash]);
        }
        spin_unlock_bh(&ptype_lock);
}



Please note list_add_rcu() (and/or rcu_assign_pointer()) are still needed to protect
readers that don't use the spinlock at all.

If you use fact that RTNL is locked when calling your code, you could add
ASSERT_RTNL();
at strategic points so that this assertion can be checked at runtime.

(but Patrick & David wrote that you should not assume RTNL, so you probably need another lock...)

Thank you


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
  2009-04-15 10:13               ` [Bridge] " Patrick McHardy
@ 2009-04-15 14:42                 ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15 14:42 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, dada1, lizf, linux-kernel, netdev, jgarzik,
	shemminger, bridge, fubar, bonding-devel, mschmidt, ivecera

Wed, Apr 15, 2009 at 12:13:57PM CEST, kaber@trash.net wrote:
> David Miller wrote:
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>>
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>>
>> RTNL semaphore is held across all modification operations.
>
> If this will also be used for multicast lists, changes can happen
> (IPv6) without the RTNL.

Ok, but for the dev_addr_X() functions the RTNL mutex is sufficient, so I see no
point in adding another lock here. When the multicast handling functions are
converted to use netdev_hw_addr and its layer, we will need to take an update
lock in dev_multicast_X.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list
@ 2009-04-15 14:42                 ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15 14:42 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: ivecera, fubar, bonding-devel, netdev, bridge, lizf,
	linux-kernel, mschmidt, dada1, jgarzik, David Miller

Wed, Apr 15, 2009 at 12:13:57PM CEST, kaber@trash.net wrote:
> David Miller wrote:
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>>
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>>
>> RTNL semaphore is held across all modification operations.
>
> If this will also be used for multicast lists, changes can happen
> (IPv6) without the RTNL.

Ok, but for the dev_addr_X() functions the RTNL mutex is sufficient, so I see no
point in adding another lock here. When the multicast handling functions are
converted to use netdev_hw_addr and its layer, we will need to take an update
lock in dev_multicast_X.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)
  2009-04-15  8:18     ` [Bridge] " Jiri Pirko
@ 2009-04-15 18:02       ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15 18:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

changes against last patch version:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of synchronize_rcu
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to work with the device address list. The list is a replacement
for the original dev_addr field, because in some situations there is a need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   32 +++++-
 net/core/dev.c              |  271 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 328 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +783,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..961be4f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,270 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			rcu_read_unlock();
+			return 0;
+		}
+	}
+	rcu_read_unlock();
+
+	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err = 0;
+	struct netdev_hw_addr *ha, *ha2;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+		if (err)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	list_for_each_entry_rcu(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+	}
+unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+	}
+	rcu_read_unlock();
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	ASSERT_RTNL();
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	ASSERT_RTNL();
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 
@@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	netdev_unregister_kobject(dev);
 
 	/* Actually switch the network namespace */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)
@ 2009-04-15 18:02       ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-15 18:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

changes against last patch version:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of synchronize_rcu
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to work with the device address list. The list is a replacement for
the original dev_addr field, because in some situations there is a need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   32 +++++-
 net/core/dev.c              |  271 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 328 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +783,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..961be4f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,270 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			rcu_read_unlock();
+			return 0;
+		}
+	}
+	rcu_read_unlock();
+
+	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err = 0;
+	struct netdev_hw_addr *ha, *ha2;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+		if (err)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	list_for_each_entry_rcu(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+	}
+unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+	}
+	rcu_read_unlock();
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	ASSERT_RTNL();
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	ASSERT_RTNL();
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add the device addresses of one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device whose address list selects the addresses to delete
+ *
+ *	Deletes from @to_dev every address present in the list of @from_dev.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 
@@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	dev_addr_discard(dev);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	netdev_unregister_kobject(dev);
 
 	/* Actually switch the network namespace */
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)
  2009-04-15 18:02       ` [Bridge] " Jiri Pirko
@ 2009-04-15 18:54         ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-15 18:54 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, ivecera

Jiri Pirko a écrit :
> changes against last patch version:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
> 
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/etherdevice.h |   27 +++++
>  include/linux/netdevice.h   |   32 +++++-
>  net/core/dev.c              |  271 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 328 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..3d7a668 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>  	return compare_ether_addr(addr1, addr2);
>  #endif
>  }
> +
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + *
> + * Note that this function calls compare_ether_addr_64bits() so take care of
> + * the right padding.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> +				    const u8 addr[6 + 2])
> +{
> +	struct netdev_hw_addr *ha;
> +	int res = 1;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		res = compare_ether_addr_64bits(addr, ha->addr);
> +		if (!res)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return !res;
> +}
>  #endif	/* __KERNEL__ */
>  
>  /**
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..89ad6d2 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,13 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct netdev_hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +	struct rcu_head		rcu_head;
> +};
> +
>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +783,11 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> +		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
>  /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>  
>  extern void		ether_setup(struct net_device *dev);
> @@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>  extern int		register_netdev(struct net_device *dev);
>  extern void		unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int		dev_addr_add(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_del(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_add_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +extern int		dev_addr_del_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +
>  /* Functions used for secondary unicast and multicast support */
>  extern void		dev_set_rx_mode(struct net_device *dev);
>  extern void		__dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..961be4f 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3437,6 +3437,270 @@ void dev_set_rx_mode(struct net_device *dev)
>  	netif_addr_unlock_bh(dev);
>  }
>  
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +

Please put the ASSERT_RTNL() here, not in the various callers, since
this is the place where we really assume the rtnl lock is held by us.

You still use rcu_read_lock()/unlock() and rcu variant here...

But the caller of this function holds RTNL (or another lock), so don't use rcu here, as it seems
inconsistent with the kzalloc() code that comes next.

> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			rcu_read_unlock();
> +			return 0;
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
> +	if (!ha)
> +		return -ENOMEM;
> +	memcpy(ha->addr, addr, addr_len);
> +	ha->refcount = 1;
> +	list_add_tail_rcu(&ha->list, list);
> +	return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static void ha_rcu_free(struct rcu_head *head)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	ha = container_of(head, struct netdev_hw_addr, rcu_head);
> +	kfree(ha);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +

ASSERT_RTNL() here, not in callers.

> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			if (--ha->refcount)
> +				return 0;
> +			list_del_rcu(&ha->list);
> +			call_rcu(&ha->rcu_head, ha_rcu_free);
> +			return 0;
> +		}
> +	}
> +	return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> +				     struct list_head *from_list,
> +				     int addr_len, int ignore_index)
> +{
> +	int err = 0;
> +	struct netdev_hw_addr *ha, *ha2;
> +

Same here: no need for rcu_read_lock(); since you are going to change the list, you
hold the RTNL lock or an equivalent.

> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, from_list, list) {
> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> +		if (err)
> +			goto unroll;
> +	}
> +	goto unlock;
> +unroll:
> +	list_for_each_entry_rcu(ha2, from_list, list) {
> +		if (ha2 == ha)
> +			break;
> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> +	}
> +unlock:
> +	rcu_read_unlock();
> +	return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> +				      struct list_head *from_list,
> +				      int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +

same here, no rcu_read_lock() needed...

> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, from_list, list) {
> +		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> +	}
> +	rcu_read_unlock();
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> +	struct netdev_hw_addr *ha, *tmp;
> +

ASSERT_RTNL();

> +	list_for_each_entry_safe(ha, tmp, list, list) {
> +		list_del_rcu(&ha->list);
> +		call_rcu(&ha->rcu_head, ha_rcu_free);
> +	}
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> +	ASSERT_RTNL();
> +
> +	__hw_addr_flush(&dev->dev_addr_list);
> +	dev->dev_addr = NULL;

It seems risky to set this to NULL here... You could point it at a static variable to avoid
a later NULL dereference.

static char nulladdr[MAX_ADDR_LEN];
dev->dev_addr = nulladdr;

> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> +	unsigned char addr[MAX_ADDR_LEN];
> +	struct netdev_hw_addr *ha;
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	INIT_LIST_HEAD(&dev->dev_addr_list);
> +	memset(addr, 0, sizeof(*addr));
> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> +	if (!err) {
> +		/*
> +		 * Get the first (previously created) address from the list
> +		 * and set dev_addr pointer to this location.
> +		 */
> +		ha = list_first_entry(&dev->dev_addr_list,
> +				      struct netdev_hw_addr, list);
> +		dev->dev_addr = ha->addr;
> +	}
> +	return err;
> +}
> +
> +/**
> + *	dev_addr_add	- Add a device address
> + *	@dev: device
> + *	@addr: address to add
> + *
> + *	Add a device address to the device or increase the reference count if
> + *	it already exists.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + *	dev_addr_del	- Release a device address.
> + *	@dev: device
> + *	@addr: address to delete
> + *
> + *	Release reference to a device address and remove it from the device
> + *	if the reference count drops to zero.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + *	dev_addr_add_multiple	- Add device addresses from another device
> + *	@to_dev: device to which addresses will be added
> + *	@from_dev: device from which addresses will be added
> + *
> + *	Add device addresses of the one device to another.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +					&from_dev->dev_addr_list,
> +					to_dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + *	dev_addr_del_multiple	- Delete device addresses by another device
> + *	@to_dev: device where the addresses will be deleted
> + *	@from_dev: device by which addresses the addresses will be deleted
> + *
> + *	Deletes addresses in to device by the list of addresses in from device.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +				  &from_dev->dev_addr_list,
> +				  to_dev->addr_len, 0);
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
>  int __dev_addr_delete(struct dev_addr_list **list, int *count,
>  		      void *addr, int alen, int glbl)
>  {
> @@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
>  	 */
>  	dev_addr_discard(dev);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +

Are you sure that no driver in tree will dereference dev->dev_addr after this point ?

>  	if (dev->netdev_ops->ndo_uninit)
>  		dev->netdev_ops->ndo_uninit(dev);
>  
> @@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  
>  	dev->gso_max_size = GSO_MAX_SIZE;
>  
> +	dev_addr_init(dev);
>  	netdev_init_queues(dev);
>  
>  	INIT_LIST_HEAD(&dev->napi_list);
> @@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	 */
>  	dev_addr_discard(dev);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	netdev_unregister_kobject(dev);
>  
>  	/* Actually switch the network namespace */



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)
@ 2009-04-15 18:54         ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-15 18:54 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, jgarzik, davem

Jiri Pirko a écrit :
> changes against last patch version:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
> 
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/etherdevice.h |   27 +++++
>  include/linux/netdevice.h   |   32 +++++-
>  net/core/dev.c              |  271 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 328 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..3d7a668 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>  	return compare_ether_addr(addr1, addr2);
>  #endif
>  }
> +
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + *
> + * Note that this function calls compare_ether_addr_64bits() so take care of
> + * the right padding.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> +				    const u8 addr[6 + 2])
> +{
> +	struct netdev_hw_addr *ha;
> +	int res = 1;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		res = compare_ether_addr_64bits(addr, ha->addr);
> +		if (!res)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return !res;
> +}
>  #endif	/* __KERNEL__ */
>  
>  /**
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..89ad6d2 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,13 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct netdev_hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +	struct rcu_head		rcu_head;
> +};
> +
>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +783,11 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> +		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
>  /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>  
>  extern void		ether_setup(struct net_device *dev);
> @@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>  extern int		register_netdev(struct net_device *dev);
>  extern void		unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int		dev_addr_add(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_del(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_add_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +extern int		dev_addr_del_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +
>  /* Functions used for secondary unicast and multicast support */
>  extern void		dev_set_rx_mode(struct net_device *dev);
>  extern void		__dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..961be4f 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3437,6 +3437,270 @@ void dev_set_rx_mode(struct net_device *dev)
>  	netif_addr_unlock_bh(dev);
>  }
>  
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +

Please put here the ASSERT_RTNL(), not in various callers, since
this is the place where we really assume rtnl lock is locked by us.

You still use rcu_read_lock()/unlock() and rcu variant here...

But the caller of this function holds RTNL (or another lock), so don't use rcu here, as it seems
inconsistent with the kzalloc() code that comes next.

> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			rcu_read_unlock();
> +			return 0;
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
> +	if (!ha)
> +		return -ENOMEM;
> +	memcpy(ha->addr, addr, addr_len);
> +	ha->refcount = 1;
> +	list_add_tail_rcu(&ha->list, list);
> +	return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static void ha_rcu_free(struct rcu_head *head)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	ha = container_of(head, struct netdev_hw_addr, rcu_head);
> +	kfree(ha);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +

ASSERT_RTNL() here, not in callers.

> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			if (--ha->refcount)
> +				return 0;
> +			list_del_rcu(&ha->list);
> +			call_rcu(&ha->rcu_head, ha_rcu_free);
> +			return 0;
> +		}
> +	}
> +	return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> +				     struct list_head *from_list,
> +				     int addr_len, int ignore_index)
> +{
> +	int err = 0;
> +	struct netdev_hw_addr *ha, *ha2;
> +

same here, no need for rcu_read_lock(), since you are going to change list, you
have RTNL lock or equivalent.

> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, from_list, list) {
> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> +		if (err)
> +			goto unroll;
> +	}
> +	goto unlock;
> +unroll:
> +	list_for_each_entry_rcu(ha2, from_list, list) {
> +		if (ha2 == ha)
> +			break;
> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> +	}
> +unlock:
> +	rcu_read_unlock();
> +	return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> +				      struct list_head *from_list,
> +				      int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +

same here, no rcu_read_lock() needed...

> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ha, from_list, list) {
> +		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> +	}
> +	rcu_read_unlock();
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> +	struct netdev_hw_addr *ha, *tmp;
> +

ASSERT_RTNL();

> +	list_for_each_entry_safe(ha, tmp, list, list) {
> +		list_del_rcu(&ha->list);
> +		call_rcu(&ha->rcu_head, ha_rcu_free);
> +	}
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> +	ASSERT_RTNL();
> +
> +	__hw_addr_flush(&dev->dev_addr_list);
> +	dev->dev_addr = NULL;

seems risky here to set this to NULL... You could use a static var to avoid
further NULL dereference.

static char nulladdr[MAX_ADDR_LEN];
dev->dev_addr = nulladdr;

> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> +	unsigned char addr[MAX_ADDR_LEN];
> +	struct netdev_hw_addr *ha;
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	INIT_LIST_HEAD(&dev->dev_addr_list);
> +	memset(addr, 0, sizeof(*addr));
> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> +	if (!err) {
> +		/*
> +		 * Get the first (previously created) address from the list
> +		 * and set dev_addr pointer to this location.
> +		 */
> +		ha = list_first_entry(&dev->dev_addr_list,
> +				      struct netdev_hw_addr, list);
> +		dev->dev_addr = ha->addr;
> +	}
> +	return err;
> +}
> +
> +/**
> + *	dev_addr_add	- Add a device address
> + *	@dev: device
> + *	@addr: address to add
> + *
> + *	Add a device address to the device or increase the reference count if
> + *	it already exists.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + *	dev_addr_del	- Release a device address.
> + *	@dev: device
> + *	@addr: address to delete
> + *
> + *	Release reference to a device address and remove it from the device
> + *	if the reference count drops to zero.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + *	dev_addr_add_multiple	- Add device addresses from another device
> + *	@to_dev: device to which addresses will be added
> + *	@from_dev: device from which addresses will be added
> + *
> + *	Add device addresses of the one device to another.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +					&from_dev->dev_addr_list,
> +					to_dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + *	dev_addr_del_multiple	- Delete device addresses by another device
> + *	@to_dev: device where the addresses will be deleted
> + *	@from_dev: device by which addresses the addresses will be deleted
> + *
> + *	Deletes addresses in to device by the list of addresses in from device.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +				  &from_dev->dev_addr_list,
> +				  to_dev->addr_len, 0);
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
>  int __dev_addr_delete(struct dev_addr_list **list, int *count,
>  		      void *addr, int alen, int glbl)
>  {
> @@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
>  	 */
>  	dev_addr_discard(dev);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +

Are you sure that no driver in tree will dereference dev->dev_addr after this point ?

>  	if (dev->netdev_ops->ndo_uninit)
>  		dev->netdev_ops->ndo_uninit(dev);
>  
> @@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  
>  	dev->gso_max_size = GSO_MAX_SIZE;
>  
> +	dev_addr_init(dev);
>  	netdev_init_queues(dev);
>  
>  	INIT_LIST_HEAD(&dev->napi_list);
> @@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	 */
>  	dev_addr_discard(dev);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	netdev_unregister_kobject(dev);
>  
>  	/* Actually switch the network namespace */



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)
  2009-04-15 18:54         ` [Bridge] " Eric Dumazet
@ 2009-04-16  8:46           ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-16  8:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, ivecera

Wed, Apr 15, 2009 at 08:54:05PM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :

<snip>

>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> +			    int addr_len, int ignore_index)
>> +{
>> +	struct netdev_hw_addr *ha;
>> +	int i = 0;
>> +
>> +	if (addr_len > MAX_ADDR_LEN)
>> +		return -EINVAL;
>> +
>
>Please put here the ASSERT_RTNL(), not in various callers, since
>this is the place where we really assume rtnl lock is locked by us.

Well, I'd like to keep ASSERT_RTNL in the callers. The reason is that for this
purpose (dev_addr) the guarding lock is rtnl, but for multicast addresses, for
example, it won't be; it will most probably be a spin lock. Those (multicast)
callers will use these __hw_addr_xxx functions too. Therefore I'd like to
leave the locking at the current level.
>
>You still use rcu_read_lock()/unlock() and rcu variant here...

Yes, this is unnecessary and confusing, I agree. I will remove these read locks
in the places that are guarded by the rtnl mutex.
>
>But caller of this function has RTNL (or other lock) so dont use rcu here, as it seems
>inconsistent with kzalloc() code that comes next.
>
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(ha, list, list) {
>> +		if (i++ != ignore_index &&
>> +		    !memcmp(ha->addr, addr, addr_len)) {
>> +			ha->refcount++;
>> +			rcu_read_unlock();
>> +			return 0;
>> +		}
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>> +	if (!ha)
>> +		return -ENOMEM;
>> +	memcpy(ha->addr, addr, addr_len);
>> +	ha->refcount = 1;
>> +	list_add_tail_rcu(&ha->list, list);
>> +	return 0;

<snip>

>> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>> +				     struct list_head *from_list,
>> +				     int addr_len, int ignore_index)
>> +{
>> +	int err = 0;
>> +	struct netdev_hw_addr *ha, *ha2;
>> +
>
>same here, no need for rcu_read_lock(), since you are going to change list, you
>have RTNL lock or equivalent.
>
Yes, I wanted to show that for "from_list" this is a reader...
....unnecessary,foolish -> removing...
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(ha, from_list, list) {
>> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>> +		if (err)
>> +			goto unroll;
>> +	}
>> +	goto unlock;
>> +unroll:
>> +	list_for_each_entry_rcu(ha2, from_list, list) {
>> +		if (ha2 == ha)
>> +			break;
>> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>> +	}
>> +unlock:
>> +	rcu_read_unlock();
>> +	return err;
>> +}
>> +

<snip>

>> +static void dev_addr_flush(struct net_device *dev)
>> +{
>> +	ASSERT_RTNL();
>> +
>> +	__hw_addr_flush(&dev->dev_addr_list);
>> +	dev->dev_addr = NULL;
>
>seems risky here to set this to NULL... You could use a static var to avoid
>further NULL dereference.
>
>static char nulladdr[MAX_ADDR_LEN];
>dev->dev_addr = nulladdr;
>
>> +}
>> +

<snip>

>> @@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
>>  	 */
>>  	dev_addr_discard(dev);
>>  
>> +	/* Flush device addresses */
>> +	dev_addr_flush(dev);
>> +
>
>Are you sure that no driver in tree will dereference dev->dev_addr after this point ?

I assume that driver might not use dev_addr after it calls
unregister_netdevice(). But ok - I would rather move calling dev_addr_flush()
somewhere later where there is a guarantee that dev_addr should not be
referenced. Perhaps in free_netdev() ? It would also correspond with calling
dev_addr_init() in alloc_netdev_mq()...
>
>>  	if (dev->netdev_ops->ndo_uninit)
>>  		dev->netdev_ops->ndo_uninit(dev);
>>  
>> @@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>>  
>>  	dev->gso_max_size = GSO_MAX_SIZE;
>>  
>> +	dev_addr_init(dev);
>>  	netdev_init_queues(dev);
>>  
>>  	INIT_LIST_HEAD(&dev->napi_list);
>> @@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>>  	 */
>>  	dev_addr_discard(dev);
>>  
>> +	/* Flush device addresses */
>> +	dev_addr_flush(dev);
>> +
>>  	netdev_unregister_kobject(dev);
>>  
>>  	/* Actually switch the network namespace */
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)
@ 2009-04-16  8:46           ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-16  8:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, jgarzik, davem

Wed, Apr 15, 2009 at 08:54:05PM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :

<snip>

>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> +			    int addr_len, int ignore_index)
>> +{
>> +	struct netdev_hw_addr *ha;
>> +	int i = 0;
>> +
>> +	if (addr_len > MAX_ADDR_LEN)
>> +		return -EINVAL;
>> +
>
>Please put here the ASSERT_RTNL(), not in various callers, since
>this is the place where we really assume rtnl lock is locked by us.

Well, I'd like to keep ASSERT_RTNL in the callers. The reason is that for this
purpose (dev_addr) the guarding lock is rtnl, but for multicast addresses, for
example, it won't be; it will most probably be a spin lock. Those (multicast)
callers will use these __hw_addr_xxx functions too. Therefore I'd like to
leave the locking at the current level.
>
>You still use rcu_read_lock()/unlock() and rcu variant here...

Yes, this is unnecessary and confusing, I agree. I will remove these read locks
in the places that are guarded by the rtnl mutex.
>
>But caller of this function has RTNL (or other lock) so dont use rcu here, as it seems
>inconsistent with kzalloc() code that comes next.
>
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(ha, list, list) {
>> +		if (i++ != ignore_index &&
>> +		    !memcmp(ha->addr, addr, addr_len)) {
>> +			ha->refcount++;
>> +			rcu_read_unlock();
>> +			return 0;
>> +		}
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>> +	if (!ha)
>> +		return -ENOMEM;
>> +	memcpy(ha->addr, addr, addr_len);
>> +	ha->refcount = 1;
>> +	list_add_tail_rcu(&ha->list, list);
>> +	return 0;

<snip>

>> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>> +				     struct list_head *from_list,
>> +				     int addr_len, int ignore_index)
>> +{
>> +	int err = 0;
>> +	struct netdev_hw_addr *ha, *ha2;
>> +
>
>same here, no need for rcu_read_lock(), since you are going to change list, you
>have RTNL lock or equivalent.
>
Yes, I wanted to show that for "from_list" this is a reader...
....unnecessary,foolish -> removing...
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(ha, from_list, list) {
>> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>> +		if (err)
>> +			goto unroll;
>> +	}
>> +	goto unlock;
>> +unroll:
>> +	list_for_each_entry_rcu(ha2, from_list, list) {
>> +		if (ha2 == ha)
>> +			break;
>> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>> +	}
>> +unlock:
>> +	rcu_read_unlock();
>> +	return err;
>> +}
>> +

<snip>

>> +static void dev_addr_flush(struct net_device *dev)
>> +{
>> +	ASSERT_RTNL();
>> +
>> +	__hw_addr_flush(&dev->dev_addr_list);
>> +	dev->dev_addr = NULL;
>
>seems risky here to set this to NULL... You could use a static var to avoid
>further NULL dereference.
>
>static char nulladdr[MAX_ADDR_LEN];
>dev->dev_addr = nulladdr;
>
>> +}
>> +

<snip>

>> @@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
>>  	 */
>>  	dev_addr_discard(dev);
>>  
>> +	/* Flush device addresses */
>> +	dev_addr_flush(dev);
>> +
>
>Are you sure that no driver in tree will dereference dev->dev_addr after this point ?

I assume that driver might not use dev_addr after it calls
unregister_netdevice(). But ok - I would rather move calling dev_addr_flush()
somewhere later where there is a guarantee that dev_addr should not be
referenced. Perhaps in free_netdev() ? It would also correspond with calling
dev_addr_init() in alloc_netdev_mq()...
>
>>  	if (dev->netdev_ops->ndo_uninit)
>>  		dev->netdev_ops->ndo_uninit(dev);
>>  
>> @@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>>  
>>  	dev->gso_max_size = GSO_MAX_SIZE;
>>  
>> +	dev_addr_init(dev);
>>  	netdev_init_queues(dev);
>>  
>>  	INIT_LIST_HEAD(&dev->napi_list);
>> @@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>>  	 */
>>  	dev_addr_discard(dev);
>>  
>> +	/* Flush device addresses */
>> +	dev_addr_flush(dev);
>> +
>>  	netdev_unregister_kobject(dev);
>>  
>>  	/* Actually switch the network namespace */
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
  2009-04-15 18:02       ` [Bridge] " Jiri Pirko
@ 2009-04-17 11:57         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-17 11:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

v2 -> v3 (current):
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of synchronize_rcu
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there is
a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list, so
original drivers see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   32 +++++-
 net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to the Ethernet address (declared as a 6 + 2 byte array)
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits(), so take care of
+ * the right padding. The list walk is done under rcu_read_lock().
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +783,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f..b4503ac 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+/* Take a reference on addr in list, appending a new entry if not found. */
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index && !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+	/* No match: allocate an entry; max_t() gives both sizes one type. */
+	ha = kzalloc(max_t(size_t, sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+/* Add every from_list address to to_list; undo partial work on failure. */
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err;
+	struct netdev_hw_addr *ha, *ha2;
+
+	list_for_each_entry(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len,
+				       ignore_index);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+unroll:
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, ignore_index);
+	}
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+				  struct list_head *from_list, int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+/* Release each from_list address in to_list, except slot ignore_index. */
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	list_for_each_entry(ha, from_list, list)
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, ignore_index);
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	ASSERT_RTNL();
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+/* Create the address list with one zeroed entry and alias dev_addr to it. */
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	ASSERT_RTNL();
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	/* sizeof(addr) spans the whole buffer; sizeof(*addr) would be 1. */
+	memset(addr, 0, sizeof(addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr));
+	if (!err) {
+		/* The list was empty before, so the new entry is the first
+		 * one; dev_addr points directly at its address bytes. */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete one device's addresses from another
+ *	@to_dev: device from which the addresses will be deleted
+ *	@from_dev: device whose address list selects what to delete
+ *
+ *	Releases, in @to_dev, every address listed in @from_dev; entry 0 of
+ *	@to_dev is never matched. Returns -EINVAL on address length mismatch.
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
@ 2009-04-17 11:57         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-17 11:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

v2 -> v3 (current):
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of synchronize_rcu
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to work with the device address list. The list is a replacement for
the original dev_addr field, because in some situations there is a need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list, so existing drivers
see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   32 +++++-
 net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address if one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +783,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f..b4503ac 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+
+	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err;
+	struct netdev_hw_addr *ha, *ha2;
+
+	list_for_each_entry(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+
+unroll:
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+	}
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	list_for_each_entry(ha, from_list, list) {
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+	}
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	ASSERT_RTNL();
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	ASSERT_RTNL();
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
  2009-04-17 11:57         ` [Bridge] " Jiri Pirko
@ 2009-04-17 15:33           ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-17 15:33 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Fri, 17 Apr 2009 13:57:24 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> v2 -> v3 (current):
> -removed unnecessary rcu read locking
> -moved dev_addr_flush() calling to ensure no null dereference of dev_addr
> 
> v1 -> v2:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
> 
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/etherdevice.h |   27 +++++
>  include/linux/netdevice.h   |   32 +++++-
>  net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 318 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..3d7a668 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>  	return compare_ether_addr(addr1, addr2);
>  #endif
>  }
> +
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + *
> + * Note that this function calls compare_ether_addr_64bits() so take care of
> + * the right padding.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> +				    const u8 addr[6 + 2])
> +{
> +	struct netdev_hw_addr *ha;
> +	int res = 1;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		res = compare_ether_addr_64bits(addr, ha->addr);
> +		if (!res)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return !res;
> +}
>  #endif	/* __KERNEL__ */
>  
>  /**
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..89ad6d2 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,13 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct netdev_hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +	struct rcu_head		rcu_head;
> +};

Minor nit: the ordering of the elements causes holes that might not be
needed.

Space saving? Is rcu_head needed, or would using synchronize_net()
make the code cleaner and save space?

>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +783,11 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> +		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
>  /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>  
>  extern void		ether_setup(struct net_device *dev);
> @@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>  extern int		register_netdev(struct net_device *dev);
>  extern void		unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int		dev_addr_add(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_del(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_add_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +extern int		dev_addr_del_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +
>  /* Functions used for secondary unicast and multicast support */
>  extern void		dev_set_rx_mode(struct net_device *dev);
>  extern void		__dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 343883f..b4503ac 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
>  	netif_addr_unlock_bh(dev);
>  }
>  
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +
> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			return 0;
> +		}
> +	}
> +
> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
> +	if (!ha)
> +		return -ENOMEM;
Since you are initializing all fields, kzalloc isn't really needed

> +	memcpy(ha->addr, addr, addr_len);
> +	ha->refcount = 1;
> +	list_add_tail_rcu(&ha->list, list);
> +	return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static void ha_rcu_free(struct rcu_head *head)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	ha = container_of(head, struct netdev_hw_addr, rcu_head);
> +	kfree(ha);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			if (--ha->refcount)
> +				return 0;
> +			list_del_rcu(&ha->list);
> +			call_rcu(&ha->rcu_head, ha_rcu_free);
> +			return 0;
> +		}
> +	}
> +	return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> +				     struct list_head *from_list,
> +				     int addr_len, int ignore_index)
> +{
> +	int err;
> +	struct netdev_hw_addr *ha, *ha2;
> +
> +	list_for_each_entry(ha, from_list, list) {
> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> +		if (err)
> +			goto unroll;
> +	}
> +	return 0;
> +
> +unroll:
> +	list_for_each_entry(ha2, from_list, list) {
> +		if (ha2 == ha)
> +			break;
> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> +	}
> +	return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> +				      struct list_head *from_list,
> +				      int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	list_for_each_entry(ha, from_list, list) {
> +		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> +	}
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> +	struct netdev_hw_addr *ha, *tmp;
> +
> +	list_for_each_entry_safe(ha, tmp, list, list) {
> +		list_del_rcu(&ha->list);
> +		call_rcu(&ha->rcu_head, ha_rcu_free);
> +	}
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> +	ASSERT_RTNL();
> +
Since this is local you should be able to audit all
the callers and remove this ASSERT.

> +	__hw_addr_flush(&dev->dev_addr_list);
> +	dev->dev_addr = NULL;
> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> +	unsigned char addr[MAX_ADDR_LEN];
> +	struct netdev_hw_addr *ha;
> +	int err;
> +
> +	ASSERT_RTNL();
Ditto. ASSERT_RTNL makes sense for an exposed kernel API and during
initial testing.

> +	INIT_LIST_HEAD(&dev->dev_addr_list);
> +	memset(addr, 0, sizeof(*addr));
> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> +	if (!err) {
> +		/*
> +		 * Get the first (previously created) address from the list
> +		 * and set dev_addr pointer to this location.
> +		 */
> +		ha = list_first_entry(&dev->dev_addr_list,
> +				      struct netdev_hw_addr, list);
> +		dev->dev_addr = ha->addr;
> +	}
> +	return err;
> +}
> +
> +/**
> + *	dev_addr_add	- Add a device address
> + *	@dev: device
> + *	@addr: address to add
> + *
> + *	Add a device address to the device or increase the reference count if
> + *	it already exists.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + *	dev_addr_del	- Release a device address.
> + *	@dev: device
> + *	@addr: address to delete
> + *
> + *	Release reference to a device address and remove it from the device
> + *	if the reference count drops to zero.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + *	dev_addr_add_multiple	- Add device addresses from another device
> + *	@to_dev: device to which addresses will be added
> + *	@from_dev: device from which addresses will be added
> + *
> + *	Add device addresses of the one device to another.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +					&from_dev->dev_addr_list,
> +					to_dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + *	dev_addr_del_multiple	- Delete device addresses by another device
> + *	@to_dev: device where the addresses will be deleted
> + *	@from_dev: device by which addresses the addresses will be deleted
> + *
> + *	Deletes addresses in to device by the list of addresses in from device.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +				  &from_dev->dev_addr_list,
> +				  to_dev->addr_len, 0);
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
>  int __dev_addr_delete(struct dev_addr_list **list, int *count,
>  		      void *addr, int alen, int glbl)
>  {
> @@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  
>  	dev->gso_max_size = GSO_MAX_SIZE;
>  
> +	dev_addr_init(dev);
>  	netdev_init_queues(dev);
>  
>  	INIT_LIST_HEAD(&dev->napi_list);
> @@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>  
>  	kfree(dev->_tx);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>  		netif_napi_del(p);
>  

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
@ 2009-04-17 15:33           ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-17 15:33 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, dada1, jgarzik, davem

On Fri, 17 Apr 2009 13:57:24 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> v2 -> v3 (current):
> -removed unnecessary rcu read locking
> -moved dev_addr_flush() calling to ensure no null dereference of dev_addr
> 
> v1 -> v2:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
> 
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/etherdevice.h |   27 +++++
>  include/linux/netdevice.h   |   32 +++++-
>  net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 318 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..3d7a668 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>  	return compare_ether_addr(addr1, addr2);
>  #endif
>  }
> +
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + *
> + * Note that this function calls compare_ether_addr_64bits() so take care of
> + * the right padding.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> +				    const u8 addr[6 + 2])
> +{
> +	struct netdev_hw_addr *ha;
> +	int res = 1;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		res = compare_ether_addr_64bits(addr, ha->addr);
> +		if (!res)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return !res;
> +}
>  #endif	/* __KERNEL__ */
>  
>  /**
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..89ad6d2 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,13 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct netdev_hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +	struct rcu_head		rcu_head;
> +};

Minor nit, the ordering of elements causes holes that might not be
needed.

Space saving? Is rcu_head needed, or would using synchronize_net()
make the code cleaner and save space?

>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +783,11 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> +		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
>  /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>  
>  extern void		ether_setup(struct net_device *dev);
> @@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>  extern int		register_netdev(struct net_device *dev);
>  extern void		unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int		dev_addr_add(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_del(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_add_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +extern int		dev_addr_del_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +
>  /* Functions used for secondary unicast and multicast support */
>  extern void		dev_set_rx_mode(struct net_device *dev);
>  extern void		__dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 343883f..b4503ac 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
>  	netif_addr_unlock_bh(dev);
>  }
>  
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +
> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			return 0;
> +		}
> +	}
> +
> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
> +	if (!ha)
> +		return -ENOMEM;
Since you are initializing all fields, kzalloc isn't really needed

> +	memcpy(ha->addr, addr, addr_len);
> +	ha->refcount = 1;
> +	list_add_tail_rcu(&ha->list, list);
> +	return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static void ha_rcu_free(struct rcu_head *head)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	ha = container_of(head, struct netdev_hw_addr, rcu_head);
> +	kfree(ha);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			if (--ha->refcount)
> +				return 0;
> +			list_del_rcu(&ha->list);
> +			call_rcu(&ha->rcu_head, ha_rcu_free);
> +			return 0;
> +		}
> +	}
> +	return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> +				     struct list_head *from_list,
> +				     int addr_len, int ignore_index)
> +{
> +	int err;
> +	struct netdev_hw_addr *ha, *ha2;
> +
> +	list_for_each_entry(ha, from_list, list) {
> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> +		if (err)
> +			goto unroll;
> +	}
> +	return 0;
> +
> +unroll:
> +	list_for_each_entry(ha2, from_list, list) {
> +		if (ha2 == ha)
> +			break;
> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> +	}
> +	return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> +				      struct list_head *from_list,
> +				      int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	list_for_each_entry(ha, from_list, list) {
> +		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> +	}
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> +	struct netdev_hw_addr *ha, *tmp;
> +
> +	list_for_each_entry_safe(ha, tmp, list, list) {
> +		list_del_rcu(&ha->list);
> +		call_rcu(&ha->rcu_head, ha_rcu_free);
> +	}
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> +	ASSERT_RTNL();
> +
Since this is local you should be able to audit all
the callers and remove this ASSERT.

> +	__hw_addr_flush(&dev->dev_addr_list);
> +	dev->dev_addr = NULL;
> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> +	unsigned char addr[MAX_ADDR_LEN];
> +	struct netdev_hw_addr *ha;
> +	int err;
> +
> +	ASSERT_RTNL();
Ditto, ASSERT_RTNL makes sense for exposed kernel API and
initial testing.

> +	INIT_LIST_HEAD(&dev->dev_addr_list);
> +	memset(addr, 0, sizeof(*addr));
> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> +	if (!err) {
> +		/*
> +		 * Get the first (previously created) address from the list
> +		 * and set dev_addr pointer to this location.
> +		 */
> +		ha = list_first_entry(&dev->dev_addr_list,
> +				      struct netdev_hw_addr, list);
> +		dev->dev_addr = ha->addr;
> +	}
> +	return err;
> +}
> +
> +/**
> + *	dev_addr_add	- Add a device address
> + *	@dev: device
> + *	@addr: address to add
> + *
> + *	Add a device address to the device or increase the reference count if
> + *	it already exists.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + *	dev_addr_del	- Release a device address.
> + *	@dev: device
> + *	@addr: address to delete
> + *
> + *	Release reference to a device address and remove it from the device
> + *	if the reference count drops to zero.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + *	dev_addr_add_multiple	- Add device addresses from another device
> + *	@to_dev: device to which addresses will be added
> + *	@from_dev: device from which addresses will be added
> + *
> + *	Add device addresses of the one device to another.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +					&from_dev->dev_addr_list,
> +					to_dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + *	dev_addr_del_multiple	- Delete device addresses by another device
> + *	@to_dev: device where the addresses will be deleted
> + *	@from_dev: device by which addresses the addresses will be deleted
> + *
> + *	Deletes addresses in to device by the list of addresses in from device.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +				  &from_dev->dev_addr_list,
> +				  to_dev->addr_len, 0);
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
>  int __dev_addr_delete(struct dev_addr_list **list, int *count,
>  		      void *addr, int alen, int glbl)
>  {
> @@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  
>  	dev->gso_max_size = GSO_MAX_SIZE;
>  
> +	dev_addr_init(dev);
>  	netdev_init_queues(dev);
>  
>  	INIT_LIST_HEAD(&dev->napi_list);
> @@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>  
>  	kfree(dev->_tx);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>  		netif_napi_del(p);
>  

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
  2009-04-17 15:33           ` [Bridge] " Stephen Hemminger
@ 2009-04-18  7:01             ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-18  7:01 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:

<snip>

>> +struct netdev_hw_addr {
>> +	struct list_head	list;
>> +	unsigned char		addr[MAX_ADDR_LEN];
>> +	int			refcount;
>> +	struct rcu_head		rcu_head;
>> +};
>
>Minor nit, the ordering of elements cause holes that might not be
>needed.

Agree that ordering might be done better. Will do.
>
>Space saving? is rcu_head needed or would using synchronize_net
>make code cleaner and save space. 
>

Well, I originally had this done by synchronize_rcu(). Eric argued that it might
cause __hw_addr_del_multiple_ii() in particular to run long and suggested using
call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
called) once it hits the tree.

<snip>

>> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>> +	if (!ha)
>> +		return -ENOMEM;
>Since you are initializing all fields, kzalloc isn't really needed

Noted.
>
>> +	memcpy(ha->addr, addr, addr_len);
>> +	ha->refcount = 1;
>> +	list_add_tail_rcu(&ha->list, list);
>> +	return 0;
>> +}

<snip>

>> +static void dev_addr_flush(struct net_device *dev)
>> +{
>> +	ASSERT_RTNL();
>> +
>Since this is local you should be able to audit all
>the callers and remove this ASSERT.

Okay. I will at least put a comment instead of this.
>
>> +	__hw_addr_flush(&dev->dev_addr_list);
>> +	dev->dev_addr = NULL;
>> +}
>> +
>> +static int dev_addr_init(struct net_device *dev)
>> +{
>> +	unsigned char addr[MAX_ADDR_LEN];
>> +	struct netdev_hw_addr *ha;
>> +	int err;
>> +
>> +	ASSERT_RTNL();
>Ditto, ASSERT_RTNL makes sense for exposed kernel API and
>initial testing.
>
>> +	INIT_LIST_HEAD(&dev->dev_addr_list);
>> +	memset(addr, 0, sizeof(*addr));
>> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>> +	if (!err) {
>> +		/*
>> +		 * Get the first (previously created) address from the list
>> +		 * and set dev_addr pointer to this location.
>> +		 */
>> +		ha = list_first_entry(&dev->dev_addr_list,
>> +				      struct netdev_hw_addr, list);
>> +		dev->dev_addr = ha->addr;
>> +	}
>> +	return err;
>> +}

<snip>


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
@ 2009-04-18  7:01             ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-18  7:01 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, dada1, jgarzik, davem

Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:

<snip>

>> +struct netdev_hw_addr {
>> +	struct list_head	list;
>> +	unsigned char		addr[MAX_ADDR_LEN];
>> +	int			refcount;
>> +	struct rcu_head		rcu_head;
>> +};
>
>Minor nit, the ordering of elements cause holes that might not be
>needed.

Agree that ordering might be done better. Will do.
>
>Space saving? is rcu_head needed or would using synchronize_net
>make code cleaner and save space. 
>

Well, I originally had this done by synchronize_rcu(). Eric argued that it might
cause __hw_addr_del_multiple_ii() in particular to run long and suggested using
call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
called) once it hits the tree.

<snip>

>> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>> +	if (!ha)
>> +		return -ENOMEM;
>Since you are initializing all fields, kzalloc isn't really needed

Noted.
>
>> +	memcpy(ha->addr, addr, addr_len);
>> +	ha->refcount = 1;
>> +	list_add_tail_rcu(&ha->list, list);
>> +	return 0;
>> +}

<snip>

>> +static void dev_addr_flush(struct net_device *dev)
>> +{
>> +	ASSERT_RTNL();
>> +
>Since this is local you should be able to audit all
>the callers and remove this ASSERT.

Okay. I will at least put a comment instead of this.
>
>> +	__hw_addr_flush(&dev->dev_addr_list);
>> +	dev->dev_addr = NULL;
>> +}
>> +
>> +static int dev_addr_init(struct net_device *dev)
>> +{
>> +	unsigned char addr[MAX_ADDR_LEN];
>> +	struct netdev_hw_addr *ha;
>> +	int err;
>> +
>> +	ASSERT_RTNL();
>Ditto, ASSERT_RTNL makes sense for exposed kernel API and
>initial testing.
>
>> +	INIT_LIST_HEAD(&dev->dev_addr_list);
>> +	memset(addr, 0, sizeof(*addr));
>> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>> +	if (!err) {
>> +		/*
>> +		 * Get the first (previously created) address from the list
>> +		 * and set dev_addr pointer to this location.
>> +		 */
>> +		ha = list_first_entry(&dev->dev_addr_list,
>> +				      struct netdev_hw_addr, list);
>> +		dev->dev_addr = ha->addr;
>> +	}
>> +	return err;
>> +}

<snip>


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
  2009-04-18  7:01             ` [Bridge] " Jiri Pirko
@ 2009-04-18  7:35               ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-18  7:35 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Stephen Hemminger, linux-kernel, netdev, jgarzik, davem, bridge,
	fubar, bonding-devel, kaber, mschmidt, ivecera

Jiri Pirko a écrit :
> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
> 
> <snip>
> 
>>> +struct netdev_hw_addr {
>>> +	struct list_head	list;
>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>> +	int			refcount;
>>> +	struct rcu_head		rcu_head;
>>> +};
>> Minor nit, the ordering of elements cause holes that might not be
>> needed.
> 
> Agree that ordering might be done better. Will do.
>> Space saving? is rcu_head needed or would using synchronize_net
>> make code cleaner and save space. 
>>
> 
> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
> called) once it hits the tree.
> 

Yes, and don't forget we won't save space, as we allocate a full
cache line to hold a 'struct netdev_hw_addr', since we don't want this
critical and read_mostly object polluted by a hot spot elsewhere in the kernel...

Considering this, leaving 'rcu_head' at the end of the structure, even if we
have a possible hole on 64-bit arches, is not really a problem, and IMHO
the best thing to do, as rcu_head is only used at dismantle time.

And yes, maybe kfree_rcu() will make its way into the kernel, eventually :)

Thank you



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
@ 2009-04-18  7:35               ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-18  7:35 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, Stephen Hemminger, jgarzik, davem

Jiri Pirko a écrit :
> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
> 
> <snip>
> 
>>> +struct netdev_hw_addr {
>>> +	struct list_head	list;
>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>> +	int			refcount;
>>> +	struct rcu_head		rcu_head;
>>> +};
>> Minor nit, the ordering of elements cause holes that might not be
>> needed.
> 
> Agree that ordering might be done better. Will do.
>> Space saving? is rcu_head needed or would using synchronize_net
>> make code cleaner and save space. 
>>
> 
> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
> called) once it hits the tree.
> 

Yes, and don't forget we won't save space, as we allocate a full
cache line to hold a 'struct netdev_hw_addr', since we don't want this
critical and read_mostly object polluted by a hot spot elsewhere in the kernel...

Considering this, leaving 'rcu_head' at the end of the structure, even if we
have a possible hole on 64-bit arches, is not really a problem, and IMHO
the best thing to do, as rcu_head is only used at dismantle time.

And yes, maybe kfree_rcu() will make its way into the kernel, eventually :)

Thank you



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
  2009-04-18  7:35               ` [Bridge] " Eric Dumazet
@ 2009-04-18  7:44                 ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-18  7:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, linux-kernel, netdev, jgarzik, davem, bridge,
	fubar, bonding-devel, kaber, mschmidt, ivecera

Sat, Apr 18, 2009 at 09:35:32AM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
>> 
>> <snip>
>> 
>>>> +struct netdev_hw_addr {
>>>> +	struct list_head	list;
>>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>>> +	int			refcount;
>>>> +	struct rcu_head		rcu_head;
>>>> +};
>>> Minor nit, the ordering of elements cause holes that might not be
>>> needed.
>> 
>> Agree that ordering might be done better. Will do.
>>> Space saving? is rcu_head needed or would using synchronize_net
>>> make code cleaner and save space. 
>>>
>> 
>> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
>> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
>> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
>> called) once it hits the tree.
>> 
>
>Yes, and dont forget we wont save space, as we allocate a full
>cache line to hold a 'struct netdev_hw_addr', since we dont want this
>critical and read_mostly object polluted by a hot spot elsewhere in kernel...
>
>Considering this, letting 'rcu_head' at the end of structure, even if we
>have an eventual hole on 64 bit arches is not really a problem, and IMHO
>the best thing to do, as rcu_head is only used at dismantle time.

I will order the struct better, there are archs with small cache line size where
it makes sense.

>
>And yes, maybe kfree_rcu() will makes its way in kernel, eventually :)
>
>Thank you
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
@ 2009-04-18  7:44                 ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-18  7:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, Stephen Hemminger, jgarzik, davem

Sat, Apr 18, 2009 at 09:35:32AM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
>> 
>> <snip>
>> 
>>>> +struct netdev_hw_addr {
>>>> +	struct list_head	list;
>>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>>> +	int			refcount;
>>>> +	struct rcu_head		rcu_head;
>>>> +};
>>> Minor nit, the ordering of elements cause holes that might not be
>>> needed.
>> 
>> Agree that ordering might be done better. Will do.
>>> Space saving? is rcu_head needed or would using synchronize_net
>>> make code cleaner and save space. 
>>>
>> 
>> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
>> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
>> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
>> called) once it hits the tree.
>> 
>
>Yes, and dont forget we wont save space, as we allocate a full
>cache line to hold a 'struct netdev_hw_addr', since we dont want this
>critical and read_mostly object polluted by a hot spot elsewhere in kernel...
>
>Considering this, letting 'rcu_head' at the end of structure, even if we
>have an eventual hole on 64 bit arches is not really a problem, and IMHO
>the best thing to do, as rcu_head is only used at dismantle time.

I will order the struct better, there are archs with small cache line size where
it makes sense.

>
>And yes, maybe kfree_rcu() will makes its way in kernel, eventually :)
>
>Thank you
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
  2009-04-18  7:44                 ` [Bridge] " Jiri Pirko
@ 2009-04-18  8:06                   ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-18  8:06 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Stephen Hemminger, linux-kernel, netdev, jgarzik, davem, bridge,
	fubar, bonding-devel, kaber, mschmidt, ivecera

Jiri Pirko a écrit :
> Sat, Apr 18, 2009 at 09:35:32AM CEST, dada1@cosmosbay.com wrote:
>> Jiri Pirko a écrit :
>>> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
>>>
>>> <snip>
>>>
>>>>> +struct netdev_hw_addr {
>>>>> +	struct list_head	list;
>>>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>>>> +	int			refcount;
>>>>> +	struct rcu_head		rcu_head;
>>>>> +};
>>>> Minor nit, the ordering of elements cause holes that might not be
>>>> needed.
>>> Agree that ordering might be done better. Will do.
>>>> Space saving? is rcu_head needed or would using synchronize_net
>>>> make code cleaner and save space. 
>>>>
>>> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
>>> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
>>> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
>>> called) once it hits the tree.
>>>
>> Yes, and dont forget we wont save space, as we allocate a full
>> cache line to hold a 'struct netdev_hw_addr', since we dont want this
>> critical and read_mostly object polluted by a hot spot elsewhere in kernel...
>>
>> Considering this, letting 'rcu_head' at the end of structure, even if we
>> have an eventual hole on 64 bit arches is not really a problem, and IMHO
>> the best thing to do, as rcu_head is only used at dismantle time.
> 
> I will order the struct better, there are archs with small cache line size where
> it makes sense.

How exactly ?

If you consider a 32bit arch with 16 or 32 bytes cache line,
sizeof(struct list_head) is 8
sizeof(addr) = 32     (but we really use 6 bytes for ethernet)

struct netdev_hw_addr {
	unsigned char		addr[MAX_ADDR_LEN];
	struct list_head	list;
	int			refcount;
	struct rcu_head		rcu_head;
};

would cost more at lookup time, since we would use two cache lines

struct netdev_hw_addr {
	struct list_head	list;
	unsigned char		addr[MAX_ADDR_LEN];
	int			refcount;
	struct rcu_head		rcu_head;
};

is nicer, because at least 8 bytes of addr share the same cache line
as list. So a direct dev->dev_addr access would be fast (for devices with
one address), and is_etherdev_addr() would still use one cache line per
item.



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)
@ 2009-04-18  8:06                   ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-04-18  8:06 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, Stephen Hemminger, jgarzik, davem

Jiri Pirko a écrit :
> Sat, Apr 18, 2009 at 09:35:32AM CEST, dada1@cosmosbay.com wrote:
>> Jiri Pirko a écrit :
>>> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
>>>
>>> <snip>
>>>
>>>>> +struct netdev_hw_addr {
>>>>> +	struct list_head	list;
>>>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>>>> +	int			refcount;
>>>>> +	struct rcu_head		rcu_head;
>>>>> +};
>>>> Minor nit, the ordering of elements cause holes that might not be
>>>> needed.
>>> Agree that ordering might be done better. Will do.
>>>> Space saving? is rcu_head needed or would using synchronize_net
>>>> make code cleaner and save space. 
>>>>
>>> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
>>> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
>>> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
>>> called) once it hits the tree.
>>>
>> Yes, and dont forget we wont save space, as we allocate a full
>> cache line to hold a 'struct netdev_hw_addr', since we dont want this
>> critical and read_mostly object polluted by a hot spot elsewhere in kernel...
>>
>> Considering this, letting 'rcu_head' at the end of structure, even if we
>> have an eventual hole on 64 bit arches is not really a problem, and IMHO
>> the best thing to do, as rcu_head is only used at dismantle time.
> 
> I will order the struct better, there are archs with small cache line size where
> it makes sense.

How exactly ?

If you consider a 32bit arch with 16 or 32 bytes cache line,
sizeof(struct list_head) is 8
sizeof(addr) = 32     (but we really use 6 bytes for ethernet)

struct netdev_hw_addr {
	unsigned char		addr[MAX_ADDR_LEN];
	struct list_head	list;
	int			refcount;
	struct rcu_head		rcu_head;
};

would cost more at lookup time, since we would use two cache lines

struct netdev_hw_addr {
	struct list_head	list;
	unsigned char		addr[MAX_ADDR_LEN];
	int			refcount;
	struct rcu_head		rcu_head;
};

is nicer, because at least 8 bytes of addr share the same cache line
as list. So a direct dev->dev_addr access would be fast (for devices with
one address), and is_etherdev_addr() would still use one cache line per
item.



^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
  2009-04-17 11:57         ` [Bridge] " Jiri Pirko
@ 2009-04-18  8:58           ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-18  8:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

v3 -> v4 (current):
-changed kzalloc to kmalloc in __hw_addr_add_ii()
-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()

v2 -> v3:
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there
is a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list so
original drivers see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   32 +++++-
 net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to the Ethernet address to look up (declared as u8[6 + 2])
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;	/* nonzero means no match (also covers an empty list) */
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +783,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f..2274294 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+
+	ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err;
+	struct netdev_hw_addr *ha, *ha2;
+
+	list_for_each_entry(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len,
+				       ignore_index);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+unroll:	/* release only the references taken before the failing entry */
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, ignore_index);
+	}
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	/* Best effort: addresses missing from to_list are skipped silently. */
+	list_for_each_entry(ha, from_list, list)
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, ignore_index);
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	/* rtnl_mutex must be held here */
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	/* rtnl_mutex must be held here */
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(addr));	/* whole array, not just addr[0] */
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr));
+	if (!err) {
+		/*
+		 * The list was just initialised, so the zeroed entry added
+		 * above is its first element; dev_addr points at its storage.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device whose address list names the addresses to delete
+ *
+ *	Releases from @to_dev each address found in the list of @from_dev.
+ *	Returns 0, or -EINVAL when the devices' addr_len values differ.
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
@ 2009-04-18  8:58           ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-18  8:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

v3 -> v4 (current):
-changed kzalloc to kmalloc in __hw_addr_add_ii()
-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()

v2 -> v3:
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to work with the device address list. The list is a replacement
for the original dev_addr field, because in some situations there is a need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list, so existing drivers
see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   32 +++++-
 net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to the Ethernet address to look up (declared as u8[6 + 2])
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;	/* nonzero means no match (also covers an empty list) */
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	int			refcount;
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +783,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f..2274294 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+
+	ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len)) {
+			if (--ha->refcount)
+				return 0;
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	int err;
+	struct netdev_hw_addr *ha, *ha2;
+
+	list_for_each_entry(ha, from_list, list) {
+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len,
+				       ignore_index);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+unroll:	/* release only the references taken before the failing entry */
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, ignore_index);
+	}
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	/* Best effort: addresses missing from to_list are skipped silently. */
+	list_for_each_entry(ha, from_list, list)
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, ignore_index);
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	/* rtnl_mutex must be held here */
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	/* rtnl_mutex must be held here */
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(addr));	/* whole array, not just addr[0] */
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr));
+	if (!err) {
+		/*
+		 * The list was just initialised, so the zeroed entry added
+		 * above is its first element; dev_addr points at its storage.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Add device addresses of the one device to another.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device whose address list names the addresses to delete
+ *
+ *	Releases from @to_dev each address found in the list of @from_dev.
+ *	Returns 0, or -EINVAL when the devices' addr_len values differ.
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
  2009-04-18  8:58           ` [Bridge] " Jiri Pirko
@ 2009-04-20 16:11             ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-20 16:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

How about this (and the other 2 patches in the patchset)? What's your opinion, guys?

Thanks,

Jirka

Sat, Apr 18, 2009 at 10:58:49AM CEST, jpirko@redhat.com wrote:
>v3 -> v4 (current):
>-changed kzalloc to kmalloc in __hw_addr_add_ii()
>-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>
>v2 -> v3:
>-removed unnecessary rcu read locking
>-moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>
>v1 -> v2:
>-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>-removed unnecessary rcu_read locking in dev_addr_init
>-use compare_ether_addr_64bits instead of compare_ether_addr
>-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>-use call_rcu instead of rcu_synchronize
>-moved is_etherdev_addr into __KERNEL__ ifdef
>
>This patch introduces a new list in struct net_device and brings a set of
>functions to work with the device address list. The list is a replacement
>for the original dev_addr field, because in some situations there is a need to
>carry several device addresses with the net device. To be backward compatible,
>dev_addr is made to point to the first member of the list, so existing drivers
>see no difference.
>
>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>---
> include/linux/etherdevice.h |   27 +++++
> include/linux/netdevice.h   |   32 +++++-
> net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 318 insertions(+), 2 deletions(-)
>
>diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>index a1f17ab..3d7a668 100644
>--- a/include/linux/etherdevice.h
>+++ b/include/linux/etherdevice.h
>@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
> 	return compare_ether_addr(addr1, addr2);
> #endif
> }
>+
>+/**
>+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>+ * @dev: Pointer to a device structure
>+ * @addr: Pointer to the Ethernet address to look up (declared as u8[6 + 2])
>+ *
>+ * Compare passed address with all addresses of the device. Return true if the
>+ * address is one of the device addresses.
>+ *
>+ * Note that this function calls compare_ether_addr_64bits() so take care of
>+ * the right padding.
>+ */
>+static inline bool is_etherdev_addr(const struct net_device *dev,
>+				    const u8 addr[6 + 2])
>+{
>+	struct netdev_hw_addr *ha;
>+	int res = 1;	/* nonzero means no match (also covers an empty list) */
>+
>+	rcu_read_lock();
>+	for_each_dev_addr(dev, ha) {
>+		res = compare_ether_addr_64bits(addr, ha->addr);
>+		if (!res)
>+			break;
>+	}
>+	rcu_read_unlock();
>+	return !res;
>+}
> #endif	/* __KERNEL__ */
> 
> /**
>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>index 2e7783f..89ad6d2 100644
>--- a/include/linux/netdevice.h
>+++ b/include/linux/netdevice.h
>@@ -210,6 +210,13 @@ struct dev_addr_list
> #define dmi_users	da_users
> #define dmi_gusers	da_gusers
> 
>+struct netdev_hw_addr {
>+	struct list_head	list;
>+	unsigned char		addr[MAX_ADDR_LEN];
>+	int			refcount;
>+	struct rcu_head		rcu_head;
>+};
>+
> struct hh_cache
> {
> 	struct hh_cache *hh_next;	/* Next entry			     */
>@@ -776,8 +783,11 @@ struct net_device
>  */
> 	unsigned long		last_rx;	/* Time of last Rx	*/
> 	/* Interface address info used in eth_type_trans() */
>-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
>-							   because most packets are unicast) */
>+	unsigned char		*dev_addr;	/* hw address, (before bcast
>+						   because most packets are
>+						   unicast) */
>+
>+	struct list_head	dev_addr_list; /* list of device hw addresses */
> 
> 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
> 
>@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
> 	spin_unlock_bh(&dev->addr_list_lock);
> }
> 
>+/*
>+ * dev_addr_list walker. Should be used only for read access. Call with
>+ * rcu_read_lock held.
>+ */
>+#define for_each_dev_addr(dev, ha) \
>+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
>+
> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
> 
> extern void		ether_setup(struct net_device *dev);
>@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
> extern int		register_netdev(struct net_device *dev);
> extern void		unregister_netdev(struct net_device *dev);
>+
>+/* Functions used for device addresses handling */
>+extern int		dev_addr_add(struct net_device *dev,
>+				     unsigned char *addr);
>+extern int		dev_addr_del(struct net_device *dev,
>+				     unsigned char *addr);
>+extern int		dev_addr_add_multiple(struct net_device *to_dev,
>+					      struct net_device *from_dev);
>+extern int		dev_addr_del_multiple(struct net_device *to_dev,
>+					      struct net_device *from_dev);
>+
> /* Functions used for secondary unicast and multicast support */
> extern void		dev_set_rx_mode(struct net_device *dev);
> extern void		__dev_set_rx_mode(struct net_device *dev);
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 343883f..2274294 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
> 	netif_addr_unlock_bh(dev);
> }
> 
>+/* hw addresses list handling functions */
>+
>+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>+			    int addr_len, int ignore_index)
>+{
>+	struct netdev_hw_addr *ha;
>+	int i = 0;
>+
>+	if (addr_len > MAX_ADDR_LEN)
>+		return -EINVAL;
>+
>+	list_for_each_entry(ha, list, list) {
>+		if (i++ != ignore_index &&
>+		    !memcmp(ha->addr, addr, addr_len)) {
>+			ha->refcount++;
>+			return 0;
>+		}
>+	}
>+
>+	ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>+	if (!ha)
>+		return -ENOMEM;
>+	memcpy(ha->addr, addr, addr_len);
>+	ha->refcount = 1;
>+	list_add_tail_rcu(&ha->list, list);
>+	return 0;
>+}
>+
>+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
>+			 int addr_len)
>+{
>+	return __hw_addr_add_ii(list, addr, addr_len, -1);
>+}
>+
>+static void ha_rcu_free(struct rcu_head *head)
>+{
>+	struct netdev_hw_addr *ha;
>+
>+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
>+	kfree(ha);
>+}
>+
>+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
>+			    int addr_len, int ignore_index)
>+{
>+	struct netdev_hw_addr *ha;
>+	int i = 0;
>+
>+	list_for_each_entry(ha, list, list) {
>+		if (i++ != ignore_index &&
>+		    !memcmp(ha->addr, addr, addr_len)) {
>+			if (--ha->refcount)
>+				return 0;
>+			list_del_rcu(&ha->list);
>+			call_rcu(&ha->rcu_head, ha_rcu_free);
>+			return 0;
>+		}
>+	}
>+	return -ENOENT;
>+}
>+
>+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
>+			 int addr_len)
>+{
>+	return __hw_addr_del_ii(list, addr, addr_len, -1);
>+}
>+
>+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>+				     struct list_head *from_list,
>+				     int addr_len, int ignore_index)
>+{
>+	int err;
>+	struct netdev_hw_addr *ha, *ha2;
>+
>+	list_for_each_entry(ha, from_list, list) {
>+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len,
>+				       ignore_index);
>+		if (err)
>+			goto unroll;
>+	}
>+	return 0;
>+unroll:	/* release only the references taken before the failing entry */
>+	list_for_each_entry(ha2, from_list, list) {
>+		if (ha2 == ha)
>+			break;
>+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, ignore_index);
>+	}
>+	return err;
>+}
>+
>+static int __hw_addr_add_multiple(struct list_head *to_list,
>+					 struct list_head *from_list,
>+					 int addr_len)
>+{
>+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
>+}
>+
>+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
>+				      struct list_head *from_list,
>+				      int addr_len, int ignore_index)
>+{
>+	struct netdev_hw_addr *ha;
>+
>+	/* Best effort: addresses missing from to_list are skipped silently. */
>+	list_for_each_entry(ha, from_list, list)
>+		__hw_addr_del_ii(to_list, ha->addr, addr_len, ignore_index);
>+}
>+
>+static void __hw_addr_del_multiple(struct list_head *to_list,
>+					 struct list_head *from_list,
>+					 int addr_len)
>+{
>+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
>+}
>+
>+static void __hw_addr_flush(struct list_head *list)
>+{
>+	struct netdev_hw_addr *ha, *tmp;
>+
>+	list_for_each_entry_safe(ha, tmp, list, list) {
>+		list_del_rcu(&ha->list);
>+		call_rcu(&ha->rcu_head, ha_rcu_free);
>+	}
>+}
>+
>+/* Device addresses handling functions */
>+
>+static void dev_addr_flush(struct net_device *dev)
>+{
>+	/* rtnl_mutex must be held here */
>+
>+	__hw_addr_flush(&dev->dev_addr_list);
>+	dev->dev_addr = NULL;
>+}
>+
>+static int dev_addr_init(struct net_device *dev)
>+{
>+	unsigned char addr[MAX_ADDR_LEN];
>+	struct netdev_hw_addr *ha;
>+	int err;
>+
>+	/* rtnl_mutex must be held here */
>+
>+	INIT_LIST_HEAD(&dev->dev_addr_list);
>+	memset(addr, 0, sizeof(addr));	/* whole array, not just addr[0] */
>+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr));
>+	if (!err) {
>+		/*
>+		 * The list was just initialised, so the zeroed entry added
>+		 * above is its first element; dev_addr points at its storage.
>+		 */
>+		ha = list_first_entry(&dev->dev_addr_list,
>+				      struct netdev_hw_addr, list);
>+		dev->dev_addr = ha->addr;
>+	}
>+	return err;
>+}
>+
>+/**
>+ *	dev_addr_add	- Add a device address
>+ *	@dev: device
>+ *	@addr: address to add
>+ *
>+ *	Add a device address to the device or increase the reference count if
>+ *	it already exists.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_add(struct net_device *dev, unsigned char *addr)
>+{
>+	int err;
>+
>+	ASSERT_RTNL();
>+
>+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>+	if (!err)
>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+	return err;
>+}
>+EXPORT_SYMBOL(dev_addr_add);
>+
>+/**
>+ *	dev_addr_del	- Release a device address.
>+ *	@dev: device
>+ *	@addr: address to delete
>+ *
>+ *	Release reference to a device address and remove it from the device
>+ *	if the reference count drops to zero.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_del(struct net_device *dev, unsigned char *addr)
>+{
>+	int err;
>+
>+	ASSERT_RTNL();
>+
>+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>+	if (!err)
>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+	return err;
>+}
>+EXPORT_SYMBOL(dev_addr_del);
>+
>+/**
>+ *	dev_addr_add_multiple	- Add device addresses from another device
>+ *	@to_dev: device to which addresses will be added
>+ *	@from_dev: device from which addresses will be added
>+ *
>+ *	Add device addresses of the one device to another.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_add_multiple(struct net_device *to_dev,
>+			  struct net_device *from_dev)
>+{
>+	int err;
>+
>+	ASSERT_RTNL();
>+
>+	if (from_dev->addr_len != to_dev->addr_len)
>+		return -EINVAL;
>+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>+					&from_dev->dev_addr_list,
>+					to_dev->addr_len, 0);
>+	if (!err)
>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>+	return err;
>+}
>+EXPORT_SYMBOL(dev_addr_add_multiple);
>+
>+/**
>+ *	dev_addr_del_multiple	- Delete device addresses by another device
>+ *	@to_dev: device where the addresses will be deleted
>+ *	@from_dev: device whose address list names the addresses to delete
>+ *
>+ *	Releases from @to_dev each address found in the list of @from_dev.
>+ *	Returns 0, or -EINVAL when the devices' addr_len values differ.
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_del_multiple(struct net_device *to_dev,
>+			  struct net_device *from_dev)
>+{
>+	ASSERT_RTNL();
>+
>+	if (from_dev->addr_len != to_dev->addr_len)
>+		return -EINVAL;
>+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
>+				  &from_dev->dev_addr_list,
>+				  to_dev->addr_len, 0);
>+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>+	return 0;
>+}
>+EXPORT_SYMBOL(dev_addr_del_multiple);
>+
>+/* unicast and multicast addresses handling functions */
>+
> int __dev_addr_delete(struct dev_addr_list **list, int *count,
> 		      void *addr, int alen, int glbl)
> {
>@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> 
> 	dev->gso_max_size = GSO_MAX_SIZE;
> 
>+	dev_addr_init(dev);
> 	netdev_init_queues(dev);
> 
> 	INIT_LIST_HEAD(&dev->napi_list);
>@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
> 
> 	kfree(dev->_tx);
> 
>+	/* Flush device addresses */
>+	dev_addr_flush(dev);
>+
> 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
> 		netif_napi_del(p);
> 
>-- 
>1.6.0.6
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
@ 2009-04-20 16:11             ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-20 16:11 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

How about this (and another 2 patches in patchset)? What's your opinion guys?

Thanks,

Jirka

Sat, Apr 18, 2009 at 10:58:49AM CEST, jpirko@redhat.com wrote:
>v3 -> v4 (current):
>-changed kzalloc to kmalloc in __hw_addr_add_ii()
>-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>
>v2 -> v3:
>-removed unnecessary rcu read locking
>-moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>
>v1 -> v2:
>-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>-removed unnecessary rcu_read locking in dev_addr_init
>-use compare_ether_addr_64bits instead of compare_ether_addr
>-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>-use call_rcu instead of rcu_synchronize
>-moved is_etherdev_addr into __KERNEL__ ifdef
>
>This patch introduces a new list in struct net_device and brings a set of
>functions to handle the work with the device address list. The list is a
>replacement for the original dev_addr field, because in some situations there
>is a need to carry several device addresses with the net device. To be backward
>compatible, dev_addr is made to point to the first member of the list so
>original drivers see no difference.
>
>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>---
> include/linux/etherdevice.h |   27 +++++
> include/linux/netdevice.h   |   32 +++++-
> net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 318 insertions(+), 2 deletions(-)
>
>diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>index a1f17ab..3d7a668 100644
>--- a/include/linux/etherdevice.h
>+++ b/include/linux/etherdevice.h
>@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
> 	return compare_ether_addr(addr1, addr2);
> #endif
> }
>+
>+/**
>+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>+ * @dev: Pointer to a device structure
>+ * @addr: Pointer to a six-byte array containing the Ethernet address
>+ *
>+ * Compare passed address with all addresses of the device. Return true if the
>+ * address is one of the device addresses.
>+ *
>+ * Note that this function calls compare_ether_addr_64bits() so take care of
>+ * the right padding.
>+ */
>+static inline bool is_etherdev_addr(const struct net_device *dev,
>+				    const u8 addr[6 + 2])
>+{
>+	struct netdev_hw_addr *ha;
>+	int res = 1;
>+
>+	rcu_read_lock();
>+	for_each_dev_addr(dev, ha) {
>+		res = compare_ether_addr_64bits(addr, ha->addr);
>+		if (!res)
>+			break;
>+	}
>+	rcu_read_unlock();
>+	return !res;
>+}
> #endif	/* __KERNEL__ */
> 
> /**
>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>index 2e7783f..89ad6d2 100644
>--- a/include/linux/netdevice.h
>+++ b/include/linux/netdevice.h
>@@ -210,6 +210,13 @@ struct dev_addr_list
> #define dmi_users	da_users
> #define dmi_gusers	da_gusers
> 
>+struct netdev_hw_addr {
>+	struct list_head	list;
>+	unsigned char		addr[MAX_ADDR_LEN];
>+	int			refcount;
>+	struct rcu_head		rcu_head;
>+};
>+
> struct hh_cache
> {
> 	struct hh_cache *hh_next;	/* Next entry			     */
>@@ -776,8 +783,11 @@ struct net_device
>  */
> 	unsigned long		last_rx;	/* Time of last Rx	*/
> 	/* Interface address info used in eth_type_trans() */
>-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
>-							   because most packets are unicast) */
>+	unsigned char		*dev_addr;	/* hw address, (before bcast
>+						   because most packets are
>+						   unicast) */
>+
>+	struct list_head	dev_addr_list; /* list of device hw addresses */
> 
> 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
> 
>@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
> 	spin_unlock_bh(&dev->addr_list_lock);
> }
> 
>+/*
>+ * dev_addr_list walker. Should be used only for read access. Call with
>+ * rcu_read_lock held.
>+ */
>+#define for_each_dev_addr(dev, ha) \
>+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
>+
> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
> 
> extern void		ether_setup(struct net_device *dev);
>@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
> extern int		register_netdev(struct net_device *dev);
> extern void		unregister_netdev(struct net_device *dev);
>+
>+/* Functions used for device addresses handling */
>+extern int		dev_addr_add(struct net_device *dev,
>+				     unsigned char *addr);
>+extern int		dev_addr_del(struct net_device *dev,
>+				     unsigned char *addr);
>+extern int		dev_addr_add_multiple(struct net_device *to_dev,
>+					      struct net_device *from_dev);
>+extern int		dev_addr_del_multiple(struct net_device *to_dev,
>+					      struct net_device *from_dev);
>+
> /* Functions used for secondary unicast and multicast support */
> extern void		dev_set_rx_mode(struct net_device *dev);
> extern void		__dev_set_rx_mode(struct net_device *dev);
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 343883f..2274294 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
> 	netif_addr_unlock_bh(dev);
> }
> 
>+/* hw addresses list handling functions */
>+
>+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>+			    int addr_len, int ignore_index)
>+{
>+	struct netdev_hw_addr *ha;
>+	int i = 0;
>+
>+	if (addr_len > MAX_ADDR_LEN)
>+		return -EINVAL;
>+
>+	list_for_each_entry(ha, list, list) {
>+		if (i++ != ignore_index &&
>+		    !memcmp(ha->addr, addr, addr_len)) {
>+			ha->refcount++;
>+			return 0;
>+		}
>+	}
>+
>+	ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>+	if (!ha)
>+		return -ENOMEM;
>+	memcpy(ha->addr, addr, addr_len);
>+	ha->refcount = 1;
>+	list_add_tail_rcu(&ha->list, list);
>+	return 0;
>+}
>+
>+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
>+			 int addr_len)
>+{
>+	return __hw_addr_add_ii(list, addr, addr_len, -1);
>+}
>+
>+static void ha_rcu_free(struct rcu_head *head)
>+{
>+	struct netdev_hw_addr *ha;
>+
>+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
>+	kfree(ha);
>+}
>+
>+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
>+			    int addr_len, int ignore_index)
>+{
>+	struct netdev_hw_addr *ha;
>+	int i = 0;
>+
>+	list_for_each_entry(ha, list, list) {
>+		if (i++ != ignore_index &&
>+		    !memcmp(ha->addr, addr, addr_len)) {
>+			if (--ha->refcount)
>+				return 0;
>+			list_del_rcu(&ha->list);
>+			call_rcu(&ha->rcu_head, ha_rcu_free);
>+			return 0;
>+		}
>+	}
>+	return -ENOENT;
>+}
>+
>+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
>+			 int addr_len)
>+{
>+	return __hw_addr_del_ii(list, addr, addr_len, -1);
>+}
>+
>+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>+				     struct list_head *from_list,
>+				     int addr_len, int ignore_index)
>+{
>+	int err;
>+	struct netdev_hw_addr *ha, *ha2;
>+
>+	list_for_each_entry(ha, from_list, list) {
>+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>+		if (err)
>+			goto unroll;
>+	}
>+	return 0;
>+
>+unroll:
>+	list_for_each_entry(ha2, from_list, list) {
>+		if (ha2 == ha)
>+			break;
>+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>+	}
>+	return err;
>+}
>+
>+static int __hw_addr_add_multiple(struct list_head *to_list,
>+					 struct list_head *from_list,
>+					 int addr_len)
>+{
>+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
>+}
>+
>+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
>+				      struct list_head *from_list,
>+				      int addr_len, int ignore_index)
>+{
>+	struct netdev_hw_addr *ha;
>+
>+	list_for_each_entry(ha, from_list, list) {
>+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
>+	}
>+}
>+
>+static void __hw_addr_del_multiple(struct list_head *to_list,
>+					 struct list_head *from_list,
>+					 int addr_len)
>+{
>+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
>+}
>+
>+static void __hw_addr_flush(struct list_head *list)
>+{
>+	struct netdev_hw_addr *ha, *tmp;
>+
>+	list_for_each_entry_safe(ha, tmp, list, list) {
>+		list_del_rcu(&ha->list);
>+		call_rcu(&ha->rcu_head, ha_rcu_free);
>+	}
>+}
>+
>+/* Device addresses handling functions */
>+
>+static void dev_addr_flush(struct net_device *dev)
>+{
>+	/* rtnl_mutex must be held here */
>+
>+	__hw_addr_flush(&dev->dev_addr_list);
>+	dev->dev_addr = NULL;
>+}
>+
>+static int dev_addr_init(struct net_device *dev)
>+{
>+	unsigned char addr[MAX_ADDR_LEN];
>+	struct netdev_hw_addr *ha;
>+	int err;
>+
>+	/* rtnl_mutex must be held here */
>+
>+	INIT_LIST_HEAD(&dev->dev_addr_list);
>+	memset(addr, 0, sizeof(*addr));
>+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>+	if (!err) {
>+		/*
>+		 * Get the first (previously created) address from the list
>+		 * and set dev_addr pointer to this location.
>+		 */
>+		ha = list_first_entry(&dev->dev_addr_list,
>+				      struct netdev_hw_addr, list);
>+		dev->dev_addr = ha->addr;
>+	}
>+	return err;
>+}
>+
>+/**
>+ *	dev_addr_add	- Add a device address
>+ *	@dev: device
>+ *	@addr: address to add
>+ *
>+ *	Add a device address to the device or increase the reference count if
>+ *	it already exists.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_add(struct net_device *dev, unsigned char *addr)
>+{
>+	int err;
>+
>+	ASSERT_RTNL();
>+
>+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>+	if (!err)
>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+	return err;
>+}
>+EXPORT_SYMBOL(dev_addr_add);
>+
>+/**
>+ *	dev_addr_del	- Release a device address.
>+ *	@dev: device
>+ *	@addr: address to delete
>+ *
>+ *	Release reference to a device address and remove it from the device
>+ *	if the reference count drops to zero.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_del(struct net_device *dev, unsigned char *addr)
>+{
>+	int err;
>+
>+	ASSERT_RTNL();
>+
>+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>+	if (!err)
>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+	return err;
>+}
>+EXPORT_SYMBOL(dev_addr_del);
>+
>+/**
>+ *	dev_addr_add_multiple	- Add device addresses from another device
>+ *	@to_dev: device to which addresses will be added
>+ *	@from_dev: device from which addresses will be added
>+ *
>+ *	Add device addresses of the one device to another.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_add_multiple(struct net_device *to_dev,
>+			  struct net_device *from_dev)
>+{
>+	int err;
>+
>+	ASSERT_RTNL();
>+
>+	if (from_dev->addr_len != to_dev->addr_len)
>+		return -EINVAL;
>+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>+					&from_dev->dev_addr_list,
>+					to_dev->addr_len, 0);
>+	if (!err)
>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>+	return err;
>+}
>+EXPORT_SYMBOL(dev_addr_add_multiple);
>+
>+/**
>+ *	dev_addr_del_multiple	- Delete device addresses by another device
>+ *	@to_dev: device where the addresses will be deleted
>+ *	@from_dev: device whose address list selects the addresses to delete
>+ *
>+ *	Deletes from @to_dev the addresses found in the address list of @from_dev.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_del_multiple(struct net_device *to_dev,
>+			  struct net_device *from_dev)
>+{
>+	ASSERT_RTNL();
>+
>+	if (from_dev->addr_len != to_dev->addr_len)
>+		return -EINVAL;
>+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>+				  &from_dev->dev_addr_list,
>+				  to_dev->addr_len, 0);
>+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>+	return 0;
>+}
>+EXPORT_SYMBOL(dev_addr_del_multiple);
>+
>+/* unicast and multicast addresses handling functions */
>+
> int __dev_addr_delete(struct dev_addr_list **list, int *count,
> 		      void *addr, int alen, int glbl)
> {
>@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> 
> 	dev->gso_max_size = GSO_MAX_SIZE;
> 
>+	dev_addr_init(dev);
> 	netdev_init_queues(dev);
> 
> 	INIT_LIST_HEAD(&dev->napi_list);
>@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
> 
> 	kfree(dev->_tx);
> 
>+	/* Flush device addresses */
>+	dev_addr_flush(dev);
>+
> 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
> 		netif_napi_del(p);
> 
>-- 
>1.6.0.6
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
  2009-04-20 16:11             ` [Bridge] " Jiri Pirko
  (?)
@ 2009-04-23  8:09               ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-23  8:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

Mon, Apr 20, 2009 at 06:11:56PM CEST, jpirko@redhat.com wrote:
>How about this (and another 2 patches in patchset)? What's your opinion guys?

Eric, Stephen, Patrick? Can you please look at this? Any objections?
>
>Thanks,
>
>Jirka
>
>Sat, Apr 18, 2009 at 10:58:49AM CEST, jpirko@redhat.com wrote:
>>v3 -> v4 (current):
>>-changed kzalloc to kmalloc in __hw_addr_add_ii()
>>-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>>
>>v2 -> v3:
>>-removed unnecessary rcu read locking
>>-moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>>
>>v1 -> v2:
>>-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>>-removed unnecessary rcu_read locking in dev_addr_init
>>-use compare_ether_addr_64bits instead of compare_ether_addr
>>-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>>-use call_rcu instead of rcu_synchronize
>>-moved is_etherdev_addr into __KERNEL__ ifdef
>>
>>This patch introduces a new list in struct net_device and brings a set of
>>functions to handle the work with the device address list. The list is a
>>replacement for the original dev_addr field, because in some situations there
>>is a need to carry several device addresses with the net device. To be backward
>>compatible, dev_addr is made to point to the first member of the list so
>>original drivers see no difference.
>>
>>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>---
>> include/linux/etherdevice.h |   27 +++++
>> include/linux/netdevice.h   |   32 +++++-
>> net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 318 insertions(+), 2 deletions(-)
>>
>>diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>>index a1f17ab..3d7a668 100644
>>--- a/include/linux/etherdevice.h
>>+++ b/include/linux/etherdevice.h
>>@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>> 	return compare_ether_addr(addr1, addr2);
>> #endif
>> }
>>+
>>+/**
>>+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>>+ * @dev: Pointer to a device structure
>>+ * @addr: Pointer to a six-byte array containing the Ethernet address
>>+ *
>>+ * Compare passed address with all addresses of the device. Return true if the
>>+ * address is one of the device addresses.
>>+ *
>>+ * Note that this function calls compare_ether_addr_64bits() so take care of
>>+ * the right padding.
>>+ */
>>+static inline bool is_etherdev_addr(const struct net_device *dev,
>>+				    const u8 addr[6 + 2])
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int res = 1;
>>+
>>+	rcu_read_lock();
>>+	for_each_dev_addr(dev, ha) {
>>+		res = compare_ether_addr_64bits(addr, ha->addr);
>>+		if (!res)
>>+			break;
>>+	}
>>+	rcu_read_unlock();
>>+	return !res;
>>+}
>> #endif	/* __KERNEL__ */
>> 
>> /**
>>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>index 2e7783f..89ad6d2 100644
>>--- a/include/linux/netdevice.h
>>+++ b/include/linux/netdevice.h
>>@@ -210,6 +210,13 @@ struct dev_addr_list
>> #define dmi_users	da_users
>> #define dmi_gusers	da_gusers
>> 
>>+struct netdev_hw_addr {
>>+	struct list_head	list;
>>+	unsigned char		addr[MAX_ADDR_LEN];
>>+	int			refcount;
>>+	struct rcu_head		rcu_head;
>>+};
>>+
>> struct hh_cache
>> {
>> 	struct hh_cache *hh_next;	/* Next entry			     */
>>@@ -776,8 +783,11 @@ struct net_device
>>  */
>> 	unsigned long		last_rx;	/* Time of last Rx	*/
>> 	/* Interface address info used in eth_type_trans() */
>>-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
>>-							   because most packets are unicast) */
>>+	unsigned char		*dev_addr;	/* hw address, (before bcast
>>+						   because most packets are
>>+						   unicast) */
>>+
>>+	struct list_head	dev_addr_list; /* list of device hw addresses */
>> 
>> 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>> 
>>@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>> 	spin_unlock_bh(&dev->addr_list_lock);
>> }
>> 
>>+/*
>>+ * dev_addr_list walker. Should be used only for read access. Call with
>>+ * rcu_read_lock held.
>>+ */
>>+#define for_each_dev_addr(dev, ha) \
>>+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
>>+
>> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>> 
>> extern void		ether_setup(struct net_device *dev);
>>@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>> 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>> extern int		register_netdev(struct net_device *dev);
>> extern void		unregister_netdev(struct net_device *dev);
>>+
>>+/* Functions used for device addresses handling */
>>+extern int		dev_addr_add(struct net_device *dev,
>>+				     unsigned char *addr);
>>+extern int		dev_addr_del(struct net_device *dev,
>>+				     unsigned char *addr);
>>+extern int		dev_addr_add_multiple(struct net_device *to_dev,
>>+					      struct net_device *from_dev);
>>+extern int		dev_addr_del_multiple(struct net_device *to_dev,
>>+					      struct net_device *from_dev);
>>+
>> /* Functions used for secondary unicast and multicast support */
>> extern void		dev_set_rx_mode(struct net_device *dev);
>> extern void		__dev_set_rx_mode(struct net_device *dev);
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index 343883f..2274294 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
>> 	netif_addr_unlock_bh(dev);
>> }
>> 
>>+/* hw addresses list handling functions */
>>+
>>+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>>+			    int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int i = 0;
>>+
>>+	if (addr_len > MAX_ADDR_LEN)
>>+		return -EINVAL;
>>+
>>+	list_for_each_entry(ha, list, list) {
>>+		if (i++ != ignore_index &&
>>+		    !memcmp(ha->addr, addr, addr_len)) {
>>+			ha->refcount++;
>>+			return 0;
>>+		}
>>+	}
>>+
>>+	ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>>+	if (!ha)
>>+		return -ENOMEM;
>>+	memcpy(ha->addr, addr, addr_len);
>>+	ha->refcount = 1;
>>+	list_add_tail_rcu(&ha->list, list);
>>+	return 0;
>>+}
>>+
>>+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
>>+			 int addr_len)
>>+{
>>+	return __hw_addr_add_ii(list, addr, addr_len, -1);
>>+}
>>+
>>+static void ha_rcu_free(struct rcu_head *head)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+
>>+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
>>+	kfree(ha);
>>+}
>>+
>>+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
>>+			    int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int i = 0;
>>+
>>+	list_for_each_entry(ha, list, list) {
>>+		if (i++ != ignore_index &&
>>+		    !memcmp(ha->addr, addr, addr_len)) {
>>+			if (--ha->refcount)
>>+				return 0;
>>+			list_del_rcu(&ha->list);
>>+			call_rcu(&ha->rcu_head, ha_rcu_free);
>>+			return 0;
>>+		}
>>+	}
>>+	return -ENOENT;
>>+}
>>+
>>+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
>>+			 int addr_len)
>>+{
>>+	return __hw_addr_del_ii(list, addr, addr_len, -1);
>>+}
>>+
>>+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>>+				     struct list_head *from_list,
>>+				     int addr_len, int ignore_index)
>>+{
>>+	int err;
>>+	struct netdev_hw_addr *ha, *ha2;
>>+
>>+	list_for_each_entry(ha, from_list, list) {
>>+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>>+		if (err)
>>+			goto unroll;
>>+	}
>>+	return 0;
>>+
>>+unroll:
>>+	list_for_each_entry(ha2, from_list, list) {
>>+		if (ha2 == ha)
>>+			break;
>>+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>>+	}
>>+	return err;
>>+}
>>+
>>+static int __hw_addr_add_multiple(struct list_head *to_list,
>>+					 struct list_head *from_list,
>>+					 int addr_len)
>>+{
>>+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
>>+}
>>+
>>+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
>>+				      struct list_head *from_list,
>>+				      int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+
>>+	list_for_each_entry(ha, from_list, list) {
>>+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
>>+	}
>>+}
>>+
>>+static void __hw_addr_del_multiple(struct list_head *to_list,
>>+					 struct list_head *from_list,
>>+					 int addr_len)
>>+{
>>+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
>>+}
>>+
>>+static void __hw_addr_flush(struct list_head *list)
>>+{
>>+	struct netdev_hw_addr *ha, *tmp;
>>+
>>+	list_for_each_entry_safe(ha, tmp, list, list) {
>>+		list_del_rcu(&ha->list);
>>+		call_rcu(&ha->rcu_head, ha_rcu_free);
>>+	}
>>+}
>>+
>>+/* Device addresses handling functions */
>>+
>>+static void dev_addr_flush(struct net_device *dev)
>>+{
>>+	/* rtnl_mutex must be held here */
>>+
>>+	__hw_addr_flush(&dev->dev_addr_list);
>>+	dev->dev_addr = NULL;
>>+}
>>+
>>+static int dev_addr_init(struct net_device *dev)
>>+{
>>+	unsigned char addr[MAX_ADDR_LEN];
>>+	struct netdev_hw_addr *ha;
>>+	int err;
>>+
>>+	/* rtnl_mutex must be held here */
>>+
>>+	INIT_LIST_HEAD(&dev->dev_addr_list);
>>+	memset(addr, 0, sizeof(*addr));
>>+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>>+	if (!err) {
>>+		/*
>>+		 * Get the first (previously created) address from the list
>>+		 * and set dev_addr pointer to this location.
>>+		 */
>>+		ha = list_first_entry(&dev->dev_addr_list,
>>+				      struct netdev_hw_addr, list);
>>+		dev->dev_addr = ha->addr;
>>+	}
>>+	return err;
>>+}
>>+
>>+/**
>>+ *	dev_addr_add	- Add a device address
>>+ *	@dev: device
>>+ *	@addr: address to add
>>+ *
>>+ *	Add a device address to the device or increase the reference count if
>>+ *	it already exists.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_add(struct net_device *dev, unsigned char *addr)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_add);
>>+
>>+/**
>>+ *	dev_addr_del	- Release a device address.
>>+ *	@dev: device
>>+ *	@addr: address to delete
>>+ *
>>+ *	Release reference to a device address and remove it from the device
>>+ *	if the reference count drops to zero.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_del(struct net_device *dev, unsigned char *addr)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_del);
>>+
>>+/**
>>+ *	dev_addr_add_multiple	- Add device addresses from another device
>>+ *	@to_dev: device to which addresses will be added
>>+ *	@from_dev: device from which addresses will be added
>>+ *
>>+ *	Add device addresses of the one device to another.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_add_multiple(struct net_device *to_dev,
>>+			  struct net_device *from_dev)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	if (from_dev->addr_len != to_dev->addr_len)
>>+		return -EINVAL;
>>+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>>+					&from_dev->dev_addr_list,
>>+					to_dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_add_multiple);
>>+
>>+/**
>>+ *	dev_addr_del_multiple	- Delete device addresses by another device
>>+ *	@to_dev: device where the addresses will be deleted
>>+ *	@from_dev: device whose address list selects the addresses to delete
>>+ *
>>+ *	Deletes from @to_dev the addresses found in the address list of @from_dev.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_del_multiple(struct net_device *to_dev,
>>+			  struct net_device *from_dev)
>>+{
>>+	ASSERT_RTNL();
>>+
>>+	if (from_dev->addr_len != to_dev->addr_len)
>>+		return -EINVAL;
>>+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>>+				  &from_dev->dev_addr_list,
>>+				  to_dev->addr_len, 0);
>>+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>>+	return 0;
>>+}
>>+EXPORT_SYMBOL(dev_addr_del_multiple);
>>+
>>+/* unicast and multicast addresses handling functions */
>>+
>> int __dev_addr_delete(struct dev_addr_list **list, int *count,
>> 		      void *addr, int alen, int glbl)
>> {
>>@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>> 
>> 	dev->gso_max_size = GSO_MAX_SIZE;
>> 
>>+	dev_addr_init(dev);
>> 	netdev_init_queues(dev);
>> 
>> 	INIT_LIST_HEAD(&dev->napi_list);
>>@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>> 
>> 	kfree(dev->_tx);
>> 
>>+	/* Flush device addresses */
>>+	dev_addr_flush(dev);
>>+
>> 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>> 		netif_napi_del(p);
>> 
>>-- 
>>1.6.0.6
>>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
@ 2009-04-23  8:09               ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-23  8:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

Mon, Apr 20, 2009 at 06:11:56PM CEST, jpirko@redhat.com wrote:
>How about this (and another 2 patches in patchset)? What's your opinion guys?

Eric, Stephen, Patrick? Can you please look at this? Any objections?
>
>Thanks,
>
>Jirka
>
>Sat, Apr 18, 2009 at 10:58:49AM CEST, jpirko@redhat.com wrote:
>>v3 -> v4 (current):
>>-changed kzalloc to kmalloc in __hw_addr_add_ii()
>>-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>>
>>v2 -> v3:
>>-removed unnecessary rcu read locking
>>-moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>>
>>v1 -> v2:
>>-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>>-removed unnecessary rcu_read locking in dev_addr_init
>>-use compare_ether_addr_64bits instead of compare_ether_addr
>>-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>>-use call_rcu instead of rcu_synchronize
>>-moved is_etherdev_addr into __KERNEL__ ifdef
>>
>>This patch introduces a new list in struct net_device and brings a set of
>>functions to handle the work with device address list. The list is a replacement
>>for the original dev_addr field and because in some situations there is need to
>>carry several device addresses with the net device. To be backward compatible,
>>dev_addr is made to point to the first member of the list so original drivers
>>sees no difference.
>>
>>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>---
>> include/linux/etherdevice.h |   27 +++++
>> include/linux/netdevice.h   |   32 +++++-
>> net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 318 insertions(+), 2 deletions(-)
>>
>>diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>>index a1f17ab..3d7a668 100644
>>--- a/include/linux/etherdevice.h
>>+++ b/include/linux/etherdevice.h
>>@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>> 	return compare_ether_addr(addr1, addr2);
>> #endif
>> }
>>+
>>+/**
>>+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>>+ * @dev: Pointer to a device structure
>>+ * @addr: Pointer to a six-byte array containing the Ethernet address
>>+ *
>>+ * Compare passed address with all addresses of the device. Return true if the
>>+ * address if one of the device addresses.
>>+ *
>>+ * Note that this function calls compare_ether_addr_64bits() so take care of
>>+ * the right padding.
>>+ */
>>+static inline bool is_etherdev_addr(const struct net_device *dev,
>>+				    const u8 addr[6 + 2])
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int res = 1;
>>+
>>+	rcu_read_lock();
>>+	for_each_dev_addr(dev, ha) {
>>+		res = compare_ether_addr_64bits(addr, ha->addr);
>>+		if (!res)
>>+			break;
>>+	}
>>+	rcu_read_unlock();
>>+	return !res;
>>+}
>> #endif	/* __KERNEL__ */
>> 
>> /**
>>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>index 2e7783f..89ad6d2 100644
>>--- a/include/linux/netdevice.h
>>+++ b/include/linux/netdevice.h
>>@@ -210,6 +210,13 @@ struct dev_addr_list
>> #define dmi_users	da_users
>> #define dmi_gusers	da_gusers
>> 
>>+struct netdev_hw_addr {
>>+	struct list_head	list;
>>+	unsigned char		addr[MAX_ADDR_LEN];
>>+	int			refcount;
>>+	struct rcu_head		rcu_head;
>>+};
>>+
>> struct hh_cache
>> {
>> 	struct hh_cache *hh_next;	/* Next entry			     */
>>@@ -776,8 +783,11 @@ struct net_device
>>  */
>> 	unsigned long		last_rx;	/* Time of last Rx	*/
>> 	/* Interface address info used in eth_type_trans() */
>>-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
>>-							   because most packets are unicast) */
>>+	unsigned char		*dev_addr;	/* hw address, (before bcast
>>+						   because most packets are
>>+						   unicast) */
>>+
>>+	struct list_head	dev_addr_list; /* list of device hw addresses */
>> 
>> 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>> 
>>@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>> 	spin_unlock_bh(&dev->addr_list_lock);
>> }
>> 
>>+/*
>>+ * dev_addr_list walker. Should be used only for read access. Call with
>>+ * rcu_read_lock held.
>>+ */
>>+#define for_each_dev_addr(dev, ha) \
>>+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
>>+
>> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>> 
>> extern void		ether_setup(struct net_device *dev);
>>@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>> 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>> extern int		register_netdev(struct net_device *dev);
>> extern void		unregister_netdev(struct net_device *dev);
>>+
>>+/* Functions used for device addresses handling */
>>+extern int		dev_addr_add(struct net_device *dev,
>>+				     unsigned char *addr);
>>+extern int		dev_addr_del(struct net_device *dev,
>>+				     unsigned char *addr);
>>+extern int		dev_addr_add_multiple(struct net_device *to_dev,
>>+					      struct net_device *from_dev);
>>+extern int		dev_addr_del_multiple(struct net_device *to_dev,
>>+					      struct net_device *from_dev);
>>+
>> /* Functions used for secondary unicast and multicast support */
>> extern void		dev_set_rx_mode(struct net_device *dev);
>> extern void		__dev_set_rx_mode(struct net_device *dev);
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index 343883f..2274294 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
>> 	netif_addr_unlock_bh(dev);
>> }
>> 
>>+/* hw addresses list handling functions */
>>+
>>+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>>+			    int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int i = 0;
>>+
>>+	if (addr_len > MAX_ADDR_LEN)
>>+		return -EINVAL;
>>+
>>+	list_for_each_entry(ha, list, list) {
>>+		if (i++ != ignore_index &&
>>+		    !memcmp(ha->addr, addr, addr_len)) {
>>+			ha->refcount++;
>>+			return 0;
>>+		}
>>+	}
>>+
>>+	ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>>+	if (!ha)
>>+		return -ENOMEM;
>>+	memcpy(ha->addr, addr, addr_len);
>>+	ha->refcount = 1;
>>+	list_add_tail_rcu(&ha->list, list);
>>+	return 0;
>>+}
>>+
>>+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
>>+			 int addr_len)
>>+{
>>+	return __hw_addr_add_ii(list, addr, addr_len, -1);
>>+}
>>+
>>+static void ha_rcu_free(struct rcu_head *head)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+
>>+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
>>+	kfree(ha);
>>+}
>>+
>>+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
>>+			    int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int i = 0;
>>+
>>+	list_for_each_entry(ha, list, list) {
>>+		if (i++ != ignore_index &&
>>+		    !memcmp(ha->addr, addr, addr_len)) {
>>+			if (--ha->refcount)
>>+				return 0;
>>+			list_del_rcu(&ha->list);
>>+			call_rcu(&ha->rcu_head, ha_rcu_free);
>>+			return 0;
>>+		}
>>+	}
>>+	return -ENOENT;
>>+}
>>+
>>+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
>>+			 int addr_len)
>>+{
>>+	return __hw_addr_del_ii(list, addr, addr_len, -1);
>>+}
>>+
>>+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>>+				     struct list_head *from_list,
>>+				     int addr_len, int ignore_index)
>>+{
>>+	int err;
>>+	struct netdev_hw_addr *ha, *ha2;
>>+
>>+	list_for_each_entry(ha, from_list, list) {
>>+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>>+		if (err)
>>+			goto unroll;
>>+	}
>>+	return 0;
>>+
>>+unroll:
>>+	list_for_each_entry(ha2, from_list, list) {
>>+		if (ha2 == ha)
>>+			break;
>>+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>>+	}
>>+	return err;
>>+}
>>+
>>+static int __hw_addr_add_multiple(struct list_head *to_list,
>>+					 struct list_head *from_list,
>>+					 int addr_len)
>>+{
>>+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
>>+}
>>+
>>+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
>>+				      struct list_head *from_list,
>>+				      int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+
>>+	list_for_each_entry(ha, from_list, list) {
>>+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
>>+	}
>>+}
>>+
>>+static void __hw_addr_del_multiple(struct list_head *to_list,
>>+					 struct list_head *from_list,
>>+					 int addr_len)
>>+{
>>+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
>>+}
>>+
>>+static void __hw_addr_flush(struct list_head *list)
>>+{
>>+	struct netdev_hw_addr *ha, *tmp;
>>+
>>+	list_for_each_entry_safe(ha, tmp, list, list) {
>>+		list_del_rcu(&ha->list);
>>+		call_rcu(&ha->rcu_head, ha_rcu_free);
>>+	}
>>+}
>>+
>>+/* Device addresses handling functions */
>>+
>>+static void dev_addr_flush(struct net_device *dev)
>>+{
>>+	/* rtnl_mutex must be held here */
>>+
>>+	__hw_addr_flush(&dev->dev_addr_list);
>>+	dev->dev_addr = NULL;
>>+}
>>+
>>+static int dev_addr_init(struct net_device *dev)
>>+{
>>+	unsigned char addr[MAX_ADDR_LEN];
>>+	struct netdev_hw_addr *ha;
>>+	int err;
>>+
>>+	/* rtnl_mutex must be held here */
>>+
>>+	INIT_LIST_HEAD(&dev->dev_addr_list);
>>+	memset(addr, 0, sizeof(*addr));
>>+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>>+	if (!err) {
>>+		/*
>>+		 * Get the first (previously created) address from the list
>>+		 * and set dev_addr pointer to this location.
>>+		 */
>>+		ha = list_first_entry(&dev->dev_addr_list,
>>+				      struct netdev_hw_addr, list);
>>+		dev->dev_addr = ha->addr;
>>+	}
>>+	return err;
>>+}
>>+
>>+/**
>>+ *	dev_addr_add	- Add a device address
>>+ *	@dev: device
>>+ *	@addr: address to add
>>+ *
>>+ *	Add a device address to the device or increase the reference count if
>>+ *	it already exists.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_add(struct net_device *dev, unsigned char *addr)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_add);
>>+
>>+/**
>>+ *	dev_addr_del	- Release a device address.
>>+ *	@dev: device
>>+ *	@addr: address to delete
>>+ *
>>+ *	Release reference to a device address and remove it from the device
>>+ *	if the reference count drops to zero.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_del(struct net_device *dev, unsigned char *addr)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_del);
>>+
>>+/**
>>+ *	dev_addr_add_multiple	- Add device addresses from another device
>>+ *	@to_dev: device to which addresses will be added
>>+ *	@from_dev: device from which addresses will be added
>>+ *
>>+ *	Add device addresses of the one device to another.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_add_multiple(struct net_device *to_dev,
>>+			  struct net_device *from_dev)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	if (from_dev->addr_len != to_dev->addr_len)
>>+		return -EINVAL;
>>+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>>+					&from_dev->dev_addr_list,
>>+					to_dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_add_multiple);
>>+
>>+/**
>>+ *	dev_addr_del_multiple	- Delete device addresses by another device
>>+ *	@to_dev: device where the addresses will be deleted
>>+ *	@from_dev: device by which addresses the addresses will be deleted
>>+ *
>>+ *	Deletes addresses in to device by the list of addresses in from device.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_del_multiple(struct net_device *to_dev,
>>+			  struct net_device *from_dev)
>>+{
>>+	ASSERT_RTNL();
>>+
>>+	if (from_dev->addr_len != to_dev->addr_len)
>>+		return -EINVAL;
>>+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>>+				  &from_dev->dev_addr_list,
>>+				  to_dev->addr_len, 0);
>>+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>>+	return 0;
>>+}
>>+EXPORT_SYMBOL(dev_addr_del_multiple);
>>+
>>+/* unicast and multicast addresses handling functions */
>>+
>> int __dev_addr_delete(struct dev_addr_list **list, int *count,
>> 		      void *addr, int alen, int glbl)
>> {
>>@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>> 
>> 	dev->gso_max_size = GSO_MAX_SIZE;
>> 
>>+	dev_addr_init(dev);
>> 	netdev_init_queues(dev);
>> 
>> 	INIT_LIST_HEAD(&dev->napi_list);
>>@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>> 
>> 	kfree(dev->_tx);
>> 
>>+	/* Flush device addresses */
>>+	dev_addr_flush(dev);
>>+
>> 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>> 		netif_napi_del(p);
>> 
>>-- 
>>1.6.0.6
>>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
@ 2009-04-23  8:09               ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-23  8:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

Mon, Apr 20, 2009 at 06:11:56PM CEST, jpirko@redhat.com wrote:
>How about this (and another 2 patches in patchset)? What's your opinion guys?

Eric, Stephen, Patrick? Can you please look at this? Any objections?
>
>Thanks,
>
>Jirka
>
>Sat, Apr 18, 2009 at 10:58:49AM CEST, jpirko@redhat.com wrote:
>>v3 -> v4 (current):
>>-changed kzalloc to kmalloc in __hw_addr_add_ii()
>>-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>>
>>v2 -> v3:
>>-removed unnecessary rcu read locking
>>-moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>>
>>v1 -> v2:
>>-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>>-removed unnecessary rcu_read locking in dev_addr_init
>>-use compare_ether_addr_64bits instead of compare_ether_addr
>>-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>>-use call_rcu instead of rcu_synchronize
>>-moved is_etherdev_addr into __KERNEL__ ifdef
>>
>>This patch introduces a new list in struct net_device and brings a set of
>>functions to handle the work with device address list. The list is a replacement
>>for the original dev_addr field and because in some situations there is need to
>>carry several device addresses with the net device. To be backward compatible,
>>dev_addr is made to point to the first member of the list so original drivers
>>sees no difference.
>>
>>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>---
>> include/linux/etherdevice.h |   27 +++++
>> include/linux/netdevice.h   |   32 +++++-
>> net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 318 insertions(+), 2 deletions(-)
>>
>>diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>>index a1f17ab..3d7a668 100644
>>--- a/include/linux/etherdevice.h
>>+++ b/include/linux/etherdevice.h
>>@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>> 	return compare_ether_addr(addr1, addr2);
>> #endif
>> }
>>+
>>+/**
>>+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>>+ * @dev: Pointer to a device structure
>>+ * @addr: Pointer to a six-byte array containing the Ethernet address
>>+ *
>>+ * Compare passed address with all addresses of the device. Return true if the
>>+ * address if one of the device addresses.
>>+ *
>>+ * Note that this function calls compare_ether_addr_64bits() so take care of
>>+ * the right padding.
>>+ */
>>+static inline bool is_etherdev_addr(const struct net_device *dev,
>>+				    const u8 addr[6 + 2])
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int res = 1;
>>+
>>+	rcu_read_lock();
>>+	for_each_dev_addr(dev, ha) {
>>+		res = compare_ether_addr_64bits(addr, ha->addr);
>>+		if (!res)
>>+			break;
>>+	}
>>+	rcu_read_unlock();
>>+	return !res;
>>+}
>> #endif	/* __KERNEL__ */
>> 
>> /**
>>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>index 2e7783f..89ad6d2 100644
>>--- a/include/linux/netdevice.h
>>+++ b/include/linux/netdevice.h
>>@@ -210,6 +210,13 @@ struct dev_addr_list
>> #define dmi_users	da_users
>> #define dmi_gusers	da_gusers
>> 
>>+struct netdev_hw_addr {
>>+	struct list_head	list;
>>+	unsigned char		addr[MAX_ADDR_LEN];
>>+	int			refcount;
>>+	struct rcu_head		rcu_head;
>>+};
>>+
>> struct hh_cache
>> {
>> 	struct hh_cache *hh_next;	/* Next entry			     */
>>@@ -776,8 +783,11 @@ struct net_device
>>  */
>> 	unsigned long		last_rx;	/* Time of last Rx	*/
>> 	/* Interface address info used in eth_type_trans() */
>>-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
>>-							   because most packets are unicast) */
>>+	unsigned char		*dev_addr;	/* hw address, (before bcast
>>+						   because most packets are
>>+						   unicast) */
>>+
>>+	struct list_head	dev_addr_list; /* list of device hw addresses */
>> 
>> 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>> 
>>@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>> 	spin_unlock_bh(&dev->addr_list_lock);
>> }
>> 
>>+/*
>>+ * dev_addr_list walker. Should be used only for read access. Call with
>>+ * rcu_read_lock held.
>>+ */
>>+#define for_each_dev_addr(dev, ha) \
>>+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
>>+
>> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>> 
>> extern void		ether_setup(struct net_device *dev);
>>@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>> 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>> extern int		register_netdev(struct net_device *dev);
>> extern void		unregister_netdev(struct net_device *dev);
>>+
>>+/* Functions used for device addresses handling */
>>+extern int		dev_addr_add(struct net_device *dev,
>>+				     unsigned char *addr);
>>+extern int		dev_addr_del(struct net_device *dev,
>>+				     unsigned char *addr);
>>+extern int		dev_addr_add_multiple(struct net_device *to_dev,
>>+					      struct net_device *from_dev);
>>+extern int		dev_addr_del_multiple(struct net_device *to_dev,
>>+					      struct net_device *from_dev);
>>+
>> /* Functions used for secondary unicast and multicast support */
>> extern void		dev_set_rx_mode(struct net_device *dev);
>> extern void		__dev_set_rx_mode(struct net_device *dev);
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index 343883f..2274294 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
>> 	netif_addr_unlock_bh(dev);
>> }
>> 
>>+/* hw addresses list handling functions */
>>+
>>+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>>+			    int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int i = 0;
>>+
>>+	if (addr_len > MAX_ADDR_LEN)
>>+		return -EINVAL;
>>+
>>+	list_for_each_entry(ha, list, list) {
>>+		if (i++ != ignore_index &&
>>+		    !memcmp(ha->addr, addr, addr_len)) {
>>+			ha->refcount++;
>>+			return 0;
>>+		}
>>+	}
>>+
>>+	ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>>+	if (!ha)
>>+		return -ENOMEM;
>>+	memcpy(ha->addr, addr, addr_len);
>>+	ha->refcount = 1;
>>+	list_add_tail_rcu(&ha->list, list);
>>+	return 0;
>>+}
>>+
>>+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
>>+			 int addr_len)
>>+{
>>+	return __hw_addr_add_ii(list, addr, addr_len, -1);
>>+}
>>+
>>+static void ha_rcu_free(struct rcu_head *head)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+
>>+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
>>+	kfree(ha);
>>+}
>>+
>>+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
>>+			    int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+	int i = 0;
>>+
>>+	list_for_each_entry(ha, list, list) {
>>+		if (i++ != ignore_index &&
>>+		    !memcmp(ha->addr, addr, addr_len)) {
>>+			if (--ha->refcount)
>>+				return 0;
>>+			list_del_rcu(&ha->list);
>>+			call_rcu(&ha->rcu_head, ha_rcu_free);
>>+			return 0;
>>+		}
>>+	}
>>+	return -ENOENT;
>>+}
>>+
>>+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
>>+			 int addr_len)
>>+{
>>+	return __hw_addr_del_ii(list, addr, addr_len, -1);
>>+}
>>+
>>+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>>+				     struct list_head *from_list,
>>+				     int addr_len, int ignore_index)
>>+{
>>+	int err;
>>+	struct netdev_hw_addr *ha, *ha2;
>>+
>>+	list_for_each_entry(ha, from_list, list) {
>>+		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>>+		if (err)
>>+			goto unroll;
>>+	}
>>+	return 0;
>>+
>>+unroll:
>>+	list_for_each_entry(ha2, from_list, list) {
>>+		if (ha2 == ha)
>>+			break;
>>+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>>+	}
>>+	return err;
>>+}
>>+
>>+static int __hw_addr_add_multiple(struct list_head *to_list,
>>+					 struct list_head *from_list,
>>+					 int addr_len)
>>+{
>>+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
>>+}
>>+
>>+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
>>+				      struct list_head *from_list,
>>+				      int addr_len, int ignore_index)
>>+{
>>+	struct netdev_hw_addr *ha;
>>+
>>+	list_for_each_entry(ha, from_list, list) {
>>+		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
>>+	}
>>+}
>>+
>>+static void __hw_addr_del_multiple(struct list_head *to_list,
>>+					 struct list_head *from_list,
>>+					 int addr_len)
>>+{
>>+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
>>+}
>>+
>>+static void __hw_addr_flush(struct list_head *list)
>>+{
>>+	struct netdev_hw_addr *ha, *tmp;
>>+
>>+	list_for_each_entry_safe(ha, tmp, list, list) {
>>+		list_del_rcu(&ha->list);
>>+		call_rcu(&ha->rcu_head, ha_rcu_free);
>>+	}
>>+}
>>+
>>+/* Device addresses handling functions */
>>+
>>+static void dev_addr_flush(struct net_device *dev)
>>+{
>>+	/* rtnl_mutex must be held here */
>>+
>>+	__hw_addr_flush(&dev->dev_addr_list);
>>+	dev->dev_addr = NULL;
>>+}
>>+
>>+static int dev_addr_init(struct net_device *dev)
>>+{
>>+	unsigned char addr[MAX_ADDR_LEN];
>>+	struct netdev_hw_addr *ha;
>>+	int err;
>>+
>>+	/* rtnl_mutex must be held here */
>>+
>>+	INIT_LIST_HEAD(&dev->dev_addr_list);
>>+	memset(addr, 0, sizeof(*addr));
>>+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>>+	if (!err) {
>>+		/*
>>+		 * Get the first (previously created) address from the list
>>+		 * and set dev_addr pointer to this location.
>>+		 */
>>+		ha = list_first_entry(&dev->dev_addr_list,
>>+				      struct netdev_hw_addr, list);
>>+		dev->dev_addr = ha->addr;
>>+	}
>>+	return err;
>>+}
>>+
>>+/**
>>+ *	dev_addr_add	- Add a device address
>>+ *	@dev: device
>>+ *	@addr: address to add
>>+ *
>>+ *	Add a device address to the device or increase the reference count if
>>+ *	it already exists.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_add(struct net_device *dev, unsigned char *addr)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_add);
>>+
>>+/**
>>+ *	dev_addr_del	- Release a device address.
>>+ *	@dev: device
>>+ *	@addr: address to delete
>>+ *
>>+ *	Release reference to a device address and remove it from the device
>>+ *	if the reference count drops to zero.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_del(struct net_device *dev, unsigned char *addr)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_del);
>>+
>>+/**
>>+ *	dev_addr_add_multiple	- Add device addresses from another device
>>+ *	@to_dev: device to which addresses will be added
>>+ *	@from_dev: device from which addresses will be added
>>+ *
>>+ *	Add device addresses of the one device to another.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_add_multiple(struct net_device *to_dev,
>>+			  struct net_device *from_dev)
>>+{
>>+	int err;
>>+
>>+	ASSERT_RTNL();
>>+
>>+	if (from_dev->addr_len != to_dev->addr_len)
>>+		return -EINVAL;
>>+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>>+					&from_dev->dev_addr_list,
>>+					to_dev->addr_len, 0);
>>+	if (!err)
>>+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>>+	return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_add_multiple);
>>+
>>+/**
>>+ *	dev_addr_del_multiple	- Delete device addresses by another device
>>+ *	@to_dev: device where the addresses will be deleted
>>+ *	@from_dev: device by which addresses the addresses will be deleted
>>+ *
>>+ *	Deletes addresses in to device by the list of addresses in from device.
>>+ *
>>+ *	The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_del_multiple(struct net_device *to_dev,
>>+			  struct net_device *from_dev)
>>+{
>>+	ASSERT_RTNL();
>>+
>>+	if (from_dev->addr_len != to_dev->addr_len)
>>+		return -EINVAL;
>>+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>>+				  &from_dev->dev_addr_list,
>>+				  to_dev->addr_len, 0);
>>+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>>+	return 0;
>>+}
>>+EXPORT_SYMBOL(dev_addr_del_multiple);
>>+
>>+/* unicast and multicast addresses handling functions */
>>+
>> int __dev_addr_delete(struct dev_addr_list **list, int *count,
>> 		      void *addr, int alen, int glbl)
>> {
>>@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>> 
>> 	dev->gso_max_size = GSO_MAX_SIZE;
>> 
>>+	dev_addr_init(dev);
>> 	netdev_init_queues(dev);
>> 
>> 	INIT_LIST_HEAD(&dev->napi_list);
>>@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>> 
>> 	kfree(dev->_tx);
>> 
>>+	/* Flush device addresses */
>>+	dev_addr_flush(dev);
>>+
>> 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>> 		netif_napi_del(p);
>> 
>>-- 
>>1.6.0.6
>>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bonding-devel] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
  2009-04-18  8:58           ` [Bridge] " Jiri Pirko
@ 2009-04-23 15:58             ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-23 15:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, kaber, netdev, bridge, mschmidt, bonding-devel,
	jgarzik, dada1, davem

On Sat, 18 Apr 2009 10:58:49 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> v3 -> v4 (current):
> -changed kzalloc to kmalloc in __hw_addr_add_ii()
> -ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
> 
> v2 -> v3:
> -removed unnecessary rcu read locking
> -moved dev_addr_flush() calling to ensure no null dereference of dev_addr
> 
> v1 -> v2:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
> 
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

I am still unsure why this added complexity to the network device model is needed.

How does this interact with the neighbor table (ARP)?
Isn't this what macvlan already does?




^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [Bonding-devel] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
@ 2009-04-23 15:58             ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-04-23 15:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, bridge, mschmidt, davem, netdev, fubar, bonding-devel,
	dada1, jgarzik

On Sat, 18 Apr 2009 10:58:49 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> v3 -> v4 (current):
> -changed kzalloc to kmalloc in __hw_addr_add_ii()
> -ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
> 
> v2 -> v3:
> -removed unnecessary rcu read locking
> -moved dev_addr_flush() calling to ensure no null dereference of dev_addr
> 
> v1 -> v2:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
> 
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

I am still unsure why this added complexity to the network device model is needed.

How does this interact with the neighbor table (ARP)?
Isn't this what macvlan already does?




^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bonding-devel] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
  2009-04-23 15:58             ` [Bridge] " Stephen Hemminger
@ 2009-04-24 21:26               ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-24 21:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: ivecera, fubar, kaber, netdev, bridge, mschmidt, bonding-devel,
	jgarzik, dada1, davem

Thu, Apr 23, 2009 at 05:58:55PM CEST, shemminger@vyatta.com wrote:
>On Sat, 18 Apr 2009 10:58:49 +0200
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>> v3 -> v4 (current):
>> -changed kzalloc to kmalloc in __hw_addr_add_ii()
>> -ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>> 
>> v2 -> v3:
>> -removed unnecessary rcu read locking
>> -moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>> 
>> v1 -> v2:
>> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>> -removed unnecessary rcu_read locking in dev_addr_init
>> -use compare_ether_addr_64bits instead of compare_ether_addr
>> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>> -use call_rcu instead of rcu_synchronize
>> -moved is_etherdev_addr into __KERNEL__ ifdef
>> 
>> This patch introduces a new list in struct net_device and brings a set of
>> functions to handle the work with device address list. The list is a replacement
>> for the original dev_addr field and because in some situations there is need to
>> carry several device addresses with the net device. To be backward compatible,
>> dev_addr is made to point to the first member of the list so original drivers
>> sees no difference.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>I am still unsure why this added complexity to the network device model is needed.

I've found no other way to support devices with multiple MACs, e.g. as a port
in the bridge (without horrible hooks).
>
>How does this interact with neighbor table (ARP)?

It doesn't interact with ARP. ARP uses only the first device address (dev_addr
ptr). I'm not sure this is the right behaviour, but I think that this is not
crucial now.

>Isn't this what macvlan already does.

Hmm, I'm looking at this and I'm not sure it would help (I've found no
documentation). Can you please show an example of how this solves the "bonding
interface in alb mode as a port in the bridge" problem?

Thanks
>
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [Bonding-devel] [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)
@ 2009-04-24 21:26               ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-04-24 21:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: ivecera, bridge, mschmidt, davem, netdev, fubar, bonding-devel,
	dada1, jgarzik

Thu, Apr 23, 2009 at 05:58:55PM CEST, shemminger@vyatta.com wrote:
>On Sat, 18 Apr 2009 10:58:49 +0200
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>> v3 -> v4 (current):
>> -changed kzalloc to kmalloc in __hw_addr_add_ii()
>> -ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>> 
>> v2 -> v3:
>> -removed unnecessary rcu read locking
>> -moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>> 
>> v1 -> v2:
>> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>> -removed unnecessary rcu_read locking in dev_addr_init
>> -use compare_ether_addr_64bits instead of compare_ether_addr
>> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>> -use call_rcu instead of rcu_synchronize
>> -moved is_etherdev_addr into __KERNEL__ ifdef
>> 
>> This patch introduces a new list in struct net_device and brings a set of
>> functions to handle the work with device address list. The list is a replacement
>> for the original dev_addr field and because in some situations there is need to
>> carry several device addresses with the net device. To be backward compatible,
>> dev_addr is made to point to the first member of the list so original drivers
>> sees no difference.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>I am still unsure why this added complexity to the network device model is needed.

I've found no other way how to support devices with multiple macs i.e. as a port
in the bridge (without horrible hooks).
>
>How does this interact with neighbor table (ARP)?

It doesn't interact with ARP. ARP uses only the first device address (dev_addr
ptr). I'm not sure this is the right behaviour, but I think that this is not
crucial now.

>Isn't this what macvlan already does.

Hmm, I'm looking at this and I'm not sure it would help (I've found no
documentation). Can you please show an example of how this solves the "bonding
interface in alb mode as a port in the bridge" problem?

Thanks
>
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH] net: introduce a list of device addresses dev_addr_list (v5)
  2009-04-18  8:58           ` [Bridge] " Jiri Pirko
@ 2009-05-04 11:14             ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-04 11:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

v4 -> v5 (current):
-added device address type (suggested by davem)
-removed refcounting (better to have simpler code than to save potentially a
 few bytes)

v3 -> v4:
-changed kzalloc to kmalloc in __hw_addr_add_ii()
-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()

v2 -> v3:
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to work with the device address list. The list is a replacement
for the original dev_addr field, because in some situations there is a need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   37 ++++++-
 net/core/dev.c              |  271 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 333 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5a96a1a..a95befc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,16 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	unsigned char		type;
+#define NETDEV_HW_ADDR_T_LAN	1
+#define NETDEV_HW_ADDR_T_SAN	2
+#define NETDEV_HW_ADDR_T_SLAVE	3
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +786,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1791,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1810,19 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev, unsigned char *addr,
+			unsigned char addr_type);
+extern int dev_addr_del(struct net_device *dev, unsigned char *addr,
+			unsigned char addr_type);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+				 struct net_device *from_dev,
+				 unsigned char addr_type);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+				 struct net_device *from_dev,
+				 unsigned char addr_type);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 308a7d0..d5770f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3443,6 +3443,273 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len, unsigned char addr_type)
+{
+	struct netdev_hw_addr *ha;
+	int alloc_size;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	alloc_size = sizeof(*ha);
+	if (alloc_size < L1_CACHE_BYTES)
+		alloc_size = L1_CACHE_BYTES;
+	ha = kmalloc(alloc_size, GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->type = addr_type;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, unsigned char addr_type,
+			    int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len) &&
+		    (ha->type == addr_type || !addr_type)) {
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len, unsigned char addr_type)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, addr_type, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, unsigned char addr_type,
+				     int ignore_index)
+{
+	int err;
+	struct netdev_hw_addr *ha, *ha2;
+	unsigned char type;
+
+	list_for_each_entry(ha, from_list, list) {
+		type = addr_type ? addr_type : ha->type;
+		err = __hw_addr_add(to_list, ha->addr, addr_len, type);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+
+unroll:
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		type = addr_type ? addr_type : ha2->type;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, type,
+				 ignore_index);
+	}
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len, unsigned char addr_type)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len,
+					 addr_type, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, unsigned char addr_type,
+				      int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	unsigned char type;
+
+	list_for_each_entry(ha, from_list, list) {
+		type = addr_type ? addr_type : ha->type;
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type,
+				 ignore_index);
+	}
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+				   struct list_head *from_list,
+				   int addr_len, unsigned char addr_type)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, addr_type, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	/* rtnl_mutex must be held here */
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	/* rtnl_mutex must be held here */
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr),
+			    NETDEV_HW_ADDR_T_LAN);
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *	@addr_type: address type
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len,
+			    addr_type);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *	@addr_type: address type
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len,
+			       addr_type, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *	@addr_type: address type - 0 means type will be used from from_dev
+ *
+ *	Add device addresses of the one device to another.
+ **
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, addr_type, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *	@addr_type: address type - 0 means type will be used from from_dev
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, addr_type, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4785,6 +5052,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4810,6 +5078,9 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v5)
@ 2009-05-04 11:14             ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-04 11:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

v4 -> v5 (current):
-added device address type (suggested by davem)
-removed refcounting (better to have simpler code than to save potentially a
 few bytes)

v3 -> v4:
-changed kzalloc to kmalloc in __hw_addr_add_ii()
-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()

v2 -> v3:
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to work with the device address list. The list is a replacement
for the original dev_addr field, because in some situations there is a need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   37 ++++++-
 net/core/dev.c              |  271 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 333 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5a96a1a..a95befc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,16 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	unsigned char		type;
+#define NETDEV_HW_ADDR_T_LAN	1
+#define NETDEV_HW_ADDR_T_SAN	2
+#define NETDEV_HW_ADDR_T_SLAVE	3
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +786,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1791,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1810,19 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev, unsigned char *addr,
+			unsigned char addr_type);
+extern int dev_addr_del(struct net_device *dev, unsigned char *addr,
+			unsigned char addr_type);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+				 struct net_device *from_dev,
+				 unsigned char addr_type);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+				 struct net_device *from_dev,
+				 unsigned char addr_type);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 308a7d0..d5770f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3443,6 +3443,273 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len, unsigned char addr_type)
+{
+	struct netdev_hw_addr *ha;
+	int alloc_size;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	alloc_size = sizeof(*ha);
+	if (alloc_size < L1_CACHE_BYTES)
+		alloc_size = L1_CACHE_BYTES;
+	ha = kmalloc(alloc_size, GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->type = addr_type;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, unsigned char addr_type,
+			    int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len) &&
+		    (ha->type == addr_type || !addr_type)) {
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len, unsigned char addr_type)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, addr_type, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, unsigned char addr_type,
+				     int ignore_index)
+{
+	int err;
+	struct netdev_hw_addr *ha, *ha2;
+	unsigned char type;
+
+	list_for_each_entry(ha, from_list, list) {
+		type = addr_type ? addr_type : ha->type;
+		err = __hw_addr_add(to_list, ha->addr, addr_len, type);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+
+unroll:
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		type = addr_type ? addr_type : ha2->type;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, type,
+				 ignore_index);
+	}
+	return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+					 struct list_head *from_list,
+					 int addr_len, unsigned char addr_type)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len,
+					 addr_type, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, unsigned char addr_type,
+				      int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	unsigned char type;
+
+	list_for_each_entry(ha, from_list, list) {
+		type = addr_type ? addr_type : ha->type;
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type,
+				 ignore_index);
+	}
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+				   struct list_head *from_list,
+				   int addr_len, unsigned char addr_type)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, addr_type, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	/* rtnl_mutex must be held here */
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	/* rtnl_mutex must be held here */
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr),
+			    NETDEV_HW_ADDR_T_LAN);
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *	@addr_type: address type
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len,
+			    addr_type);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *	@addr_type: address type
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len,
+			       addr_type, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *	@addr_type: address type - 0 means type will be used from from_dev
+ *
+ *	Add device addresses of the one device to another.
+ **
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, addr_type, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *	@addr_type: address type - 0 means type will be used from from_dev
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, addr_type, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4785,6 +5052,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4810,6 +5078,9 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH] net: introduce a list of device addresses dev_addr_list (v5)
  2009-05-04 11:14             ` [Bridge] " Jiri Pirko
@ 2009-05-05  4:37               ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-05  4:37 UTC (permalink / raw)
  To: jpirko
  Cc: linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 4 May 2009 13:14:18 +0200

> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +				   struct list_head *from_list,
> +				   int addr_len, unsigned char addr_type)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, addr_type, -1);
> +}

Unused static function, this will create build warnings.

Or, it should :-)

If you plan to use such a function in subsequent patches, add
it in those changes not here.

Otherwise I have no fundamental objection to this patch, nice
work!

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v5)
@ 2009-05-05  4:37               ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-05  4:37 UTC (permalink / raw)
  To: jpirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt, jgarzik,
	dada1, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 4 May 2009 13:14:18 +0200

> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +				   struct list_head *from_list,
> +				   int addr_len, unsigned char addr_type)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, addr_type, -1);
> +}

Unused static function, this will create build warnings.

Or, it should :-)

If you plan to use such a function in subsequent patches, add
it in those changes not here.

Otherwise I have no fundamental objection to this patch, nice
work!

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] net: introduce a list of device addresses dev_addr_list (v5)
  2009-05-05  4:37               ` [Bridge] " David Miller
@ 2009-05-05  6:37                 ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-05  6:37 UTC (permalink / raw)
  To: David Miller
  Cc: linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

Tue, May 05, 2009 at 06:37:30AM CEST, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Mon, 4 May 2009 13:14:18 +0200
>
>> +static void __hw_addr_del_multiple(struct list_head *to_list,
>> +				   struct list_head *from_list,
>> +				   int addr_len, unsigned char addr_type)
>> +{
>> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, addr_type, -1);
>> +}
>
>Unused static function, this will create build warnings.
>
>Or, it should :-)

I'm aware.
>
>If you plan to use such a function in subsequent patches, add
>it in those changes not here.

Yes, OK, I was not quite sure. Thanks for the explanation, I'll resubmit later
today.
>
>Otherwise I have no fundamental objection to this patch, nice
>work!

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v5)
@ 2009-05-05  6:37                 ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-05  6:37 UTC (permalink / raw)
  To: David Miller
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt, jgarzik,
	dada1, bonding-devel

Tue, May 05, 2009 at 06:37:30AM CEST, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Mon, 4 May 2009 13:14:18 +0200
>
>> +static void __hw_addr_del_multiple(struct list_head *to_list,
>> +				   struct list_head *from_list,
>> +				   int addr_len, unsigned char addr_type)
>> +{
>> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, addr_type, -1);
>> +}
>
>Unused static function, this will create build warnings.
>
>Or, it should :-)

I'm aware.
>
>If you plan to use such a function in subsequent patches, add
>it in those changes not here.

Yes, OK, I was not quite sure. Thanks for the explanation, I'll resubmit later
today.
>
>Otherwise I have no fundamental objection to this patch, nice
>work!

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
  2009-05-04 11:14             ` [Bridge] " Jiri Pirko
@ 2009-05-05 12:48               ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-05 12:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

v5 -> v6 (current):
-removed so far unused static functions
-corrected dev_addr_del_multiple to call del instead of add

v4 -> v5:
-added device address type (suggested by davem)
-removed refcounting (better to have simpler code than to save potentially a
 few bytes)

v3 -> v4:
-changed kzalloc to kmalloc in __hw_addr_add_ii()
-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()

v2 -> v3:
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there
is a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list so
original drivers see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   37 ++++++-
 net/core/dev.c              |  250 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 312 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5a96a1a..a95befc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,16 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	unsigned char		type;
+#define NETDEV_HW_ADDR_T_LAN	1
+#define NETDEV_HW_ADDR_T_SAN	2
+#define NETDEV_HW_ADDR_T_SLAVE	3
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +786,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1791,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1810,19 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev, unsigned char *addr,
+			unsigned char addr_type);
+extern int dev_addr_del(struct net_device *dev, unsigned char *addr,
+			unsigned char addr_type);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+				 struct net_device *from_dev,
+				 unsigned char addr_type);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+				 struct net_device *from_dev,
+				 unsigned char addr_type);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 308a7d0..b2f752b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3443,6 +3443,252 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len, unsigned char addr_type)
+{
+	struct netdev_hw_addr *ha;
+	int alloc_size;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	alloc_size = sizeof(*ha);
+	if (alloc_size < L1_CACHE_BYTES)
+		alloc_size = L1_CACHE_BYTES;
+	ha = kmalloc(alloc_size, GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->type = addr_type;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, unsigned char addr_type,
+			    int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len) &&
+		    (ha->type == addr_type || !addr_type)) {
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, unsigned char addr_type,
+				     int ignore_index)
+{
+	int err;
+	struct netdev_hw_addr *ha, *ha2;
+	unsigned char type;
+
+	list_for_each_entry(ha, from_list, list) {
+		type = addr_type ? addr_type : ha->type;
+		err = __hw_addr_add(to_list, ha->addr, addr_len, type);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+
+unroll:
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		type = addr_type ? addr_type : ha2->type;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, type,
+				 ignore_index);
+	}
+	return err;
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, unsigned char addr_type,
+				      int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	unsigned char type;
+
+	list_for_each_entry(ha, from_list, list) {
+		type = addr_type ? addr_type : ha->type;
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type,
+				 ignore_index);
+	}
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	/* rtnl_mutex must be held here */
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	/* rtnl_mutex must be held here */
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr),
+			    NETDEV_HW_ADDR_T_LAN);
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *	@addr_type: address type
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len,
+			    addr_type);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *	@addr_type: address type
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len,
+			       addr_type, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *	@addr_type: address type - 0 means type will be used from from_dev
+ *
+ *	Add device addresses of the one device to another.
+ **
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, addr_type, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *	@addr_type: address type - 0 means type will used from from_dev
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, addr_type, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4785,6 +5031,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4810,6 +5057,9 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
@ 2009-05-05 12:48               ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-05 12:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

v5 -> v6 (current):
-removed so far unused static functions
-corrected dev_addr_del_multiple to call del instead of add

v4 -> v5:
-added device address type (suggested by davem)
-removed refcounting (better to have simpler code than to save potentially a
 few bytes)

v3 -> v4:
-changed kzalloc to kmalloc in __hw_addr_add_ii()
-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()

v2 -> v3:
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there
is a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list so
original drivers see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   37 ++++++-
 net/core/dev.c              |  250 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 312 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address is one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	int res = 1;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		res = compare_ether_addr_64bits(addr, ha->addr);
+		if (!res)
+			break;
+	}
+	rcu_read_unlock();
+	return !res;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5a96a1a..a95befc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,16 @@ struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;
+	unsigned char		addr[MAX_ADDR_LEN];
+	unsigned char		type;
+#define NETDEV_HW_ADDR_T_LAN	1
+#define NETDEV_HW_ADDR_T_SAN	2
+#define NETDEV_HW_ADDR_T_SLAVE	3
+	struct rcu_head		rcu_head;
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +786,11 @@ struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1791,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1810,19 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev, unsigned char *addr,
+			unsigned char addr_type);
+extern int dev_addr_del(struct net_device *dev, unsigned char *addr,
+			unsigned char addr_type);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+				 struct net_device *from_dev,
+				 unsigned char addr_type);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+				 struct net_device *from_dev,
+				 unsigned char addr_type);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 308a7d0..b2f752b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3443,6 +3443,252 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len, unsigned char addr_type)
+{
+	struct netdev_hw_addr *ha;
+	int alloc_size;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	alloc_size = sizeof(*ha);
+	if (alloc_size < L1_CACHE_BYTES)
+		alloc_size = L1_CACHE_BYTES;
+	ha = kmalloc(alloc_size, GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->type = addr_type;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+	struct netdev_hw_addr *ha;
+
+	ha = container_of(head, struct netdev_hw_addr, rcu_head);
+	kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, unsigned char addr_type,
+			    int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index &&
+		    !memcmp(ha->addr, addr, addr_len) &&
+		    (ha->type == addr_type || !addr_type)) {
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, unsigned char addr_type,
+				     int ignore_index)
+{
+	int err;
+	struct netdev_hw_addr *ha, *ha2;
+	unsigned char type;
+
+	list_for_each_entry(ha, from_list, list) {
+		type = addr_type ? addr_type : ha->type;
+		err = __hw_addr_add(to_list, ha->addr, addr_len, type);
+		if (err)
+			goto unroll;
+	}
+	return 0;
+
+unroll:
+	list_for_each_entry(ha2, from_list, list) {
+		if (ha2 == ha)
+			break;
+		type = addr_type ? addr_type : ha2->type;
+		__hw_addr_del_ii(to_list, ha2->addr, addr_len, type,
+				 ignore_index);
+	}
+	return err;
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, unsigned char addr_type,
+				      int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	unsigned char type;
+
+	list_for_each_entry(ha, from_list, list) {
+		type = addr_type ? addr_type : ha->type;
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type,
+				 ignore_index);
+	}
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);
+		call_rcu(&ha->rcu_head, ha_rcu_free);
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	/* rtnl_mutex must be held here */
+
+	__hw_addr_flush(&dev->dev_addr_list);
+	dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	/* rtnl_mutex must be held here */
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	memset(addr, 0, sizeof(*addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr),
+			    NETDEV_HW_ADDR_T_LAN);
+	if (!err) {
+		/*
+		 * Get the first (previously created) address from the list
+		 * and set dev_addr pointer to this location.
+		 */
+		ha = list_first_entry(&dev->dev_addr_list,
+				      struct netdev_hw_addr, list);
+		dev->dev_addr = ha->addr;
+	}
+	return err;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *	@addr_type: address type
+ *
+ *	Add a device address to the device or increase the reference count if
+ *	it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len,
+			    addr_type);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *	@addr_type: address type
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len,
+			       addr_type, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *	@addr_type: address type - 0 means type will be used from from_dev
+ *
+ *	Add device addresses of the one device to another.
+ **
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+					&from_dev->dev_addr_list,
+					to_dev->addr_len, addr_type, 0);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device by which addresses the addresses will be deleted
+ *	@addr_type: address type - 0 means type will used from from_dev
+ *
+ *	Deletes addresses in to device by the list of addresses in from device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, addr_type, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4785,6 +5031,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4810,6 +5057,9 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
  2009-05-05 12:48               ` [Bridge] " Jiri Pirko
@ 2009-05-05 19:27                 ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-05 19:27 UTC (permalink / raw)
  To: jpirko
  Cc: linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 5 May 2009 14:48:28 +0200

> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

Applied to net-next-2.6, thanks!

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
@ 2009-05-05 19:27                 ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-05 19:27 UTC (permalink / raw)
  To: jpirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt, jgarzik,
	dada1, bonding-devel

From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 5 May 2009 14:48:28 +0200

> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

Applied to net-next-2.6, thanks!

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost)
  2009-04-15  8:21     ` [Bridge] " Jiri Pirko
@ 2009-05-06 14:46       ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-06 14:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, ivecera

(repost, no modifications)

This patch changes the handling of mac addresses of bridge port devices. Now
it uses the previously introduced list of device addresses. It allows the
bridge to know more than one local mac address per port, which is required for
correct operation in some cases.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 net/bridge/br_fdb.c     |  101 ++++++++++++++++++++++++++++++----------------
 net/bridge/br_if.c      |    2 +-
 net/bridge/br_notify.c  |    2 +-
 net/bridge/br_private.h |    4 +-
 4 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..1e63f76 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
 	br_fdb_put(f);
 }
 
-void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+static bool another_port_has_addr(const struct net_bridge_port *p,
+				  struct net_bridge_fdb_entry *f)
+{
+	struct net_bridge *br = p->br;
+	struct net_bridge_port *op;
+
+	list_for_each_entry(op, &br->port_list, list) {
+		if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
+			f->dst = op;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
 {
 	struct net_bridge *br = p->br;
 	int i;
+	struct netdev_hw_addr *ha;
 
 	spin_lock_bh(&br->hash_lock);
 
@@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 
 			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
 			if (f->dst == p && f->is_local) {
-				/* maybe another port has same hw addr? */
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto insert;
-					}
-				}
-
-				/* delete old one */
-				fdb_delete(f);
-				goto insert;
+				/*
+				 * maybe another port has same hw addr?,
+				 * if not then delete it
+				 */
+				if (!another_port_has_addr(p, f))
+					fdb_delete(f);
 			}
 		}
 	}
- insert:
-	/* insert new address,  may fail if invalid address or dup. */
-	fdb_insert(br, p, newaddr);
+
+	/* insert device addresses, may fail if invalid address. */
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		fdb_insert(br, p, ha->addr);
+	}
+	rcu_read_unlock();
 
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 			 * then when one port is deleted, assign
 			 * the local entry to other port
 			 */
-			if (f->is_local) {
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto skip_delete;
-					}
-				}
-			}
-
-			fdb_delete(f);
-		skip_delete: ;
+			if (!f->is_local ||
+			    !another_port_has_addr(p, f))
+				fdb_delete(f);
 		}
 	}
 	spin_unlock_bh(&br->hash_lock);
@@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 }
 
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		      const unsigned char *addr)
 {
 	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
 	struct net_bridge_fdb_entry *fdb;
@@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	return 0;
 }
 
+static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
+			  struct net_device *dev)
+{
+	struct netdev_hw_addr *ha, *ha2;
+	struct net_bridge_fdb_entry *fdb;
+	struct hlist_head *head;
+	int ret = 0;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		ret = fdb_insert(br, source, ha->addr);
+		if (ret)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	for_each_dev_addr(dev, ha2) {
+		if (ha2 == ha)
+			break;
+		head = &br->hash[br_mac_hash(ha2->addr)];
+		fdb = fdb_find(head, ha2->addr);
+		if (fdb && fdb->is_local)
+			fdb_delete(fdb);
+	}
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
 int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		  struct net_device *dev)
 {
 	int ret;
 
 	spin_lock_bh(&br->hash_lock);
-	ret = fdb_insert(br, source, addr);
+	ret = fdb_insert_dev(br, source, dev);
 	spin_unlock_bh(&br->hash_lock);
 	return ret;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..789cb30 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err0;
 
-	err = br_fdb_insert(br, p, dev->dev_addr);
+	err = br_fdb_insert(br, p, dev);
 	if (err)
 		goto err1;
 
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 763a3ec..1423541 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	case NETDEV_CHANGEADDR:
 		spin_lock_bh(&br->lock);
-		br_fdb_changeaddr(p, dev->dev_addr);
+		br_fdb_changeaddr(p, dev);
 		br_stp_recalculate_bridge_id(br);
 		spin_unlock_bh(&br->lock);
 		break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..65ffe3d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -148,7 +148,7 @@ extern int br_fdb_init(void);
 extern void br_fdb_fini(void);
 extern void br_fdb_flush(struct net_bridge *br);
 extern void br_fdb_changeaddr(struct net_bridge_port *p,
-			      const unsigned char *newaddr);
+			      struct net_device *dev);
 extern void br_fdb_cleanup(unsigned long arg);
 extern void br_fdb_delete_by_port(struct net_bridge *br,
 				  const struct net_bridge_port *p, int do_all);
@@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 			  unsigned long count, unsigned long off);
 extern int br_fdb_insert(struct net_bridge *br,
 			 struct net_bridge_port *source,
-			 const unsigned char *addr);
+			 struct net_device *dev);
 extern void br_fdb_update(struct net_bridge *br,
 			  struct net_bridge_port *source,
 			  const unsigned char *addr);

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost)
@ 2009-05-06 14:46       ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-06 14:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: ivecera, fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik,
	dada1, davem

(repost, no modifications)

This patch changes the handling of mac addresses of bridge port devices. Now
it uses the previously introduced list of device addresses. It allows the bridge
to know more than one local mac address per port, which is required for correct
operation in some cases.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 net/bridge/br_fdb.c     |  101 ++++++++++++++++++++++++++++++----------------
 net/bridge/br_if.c      |    2 +-
 net/bridge/br_notify.c  |    2 +-
 net/bridge/br_private.h |    4 +-
 4 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..1e63f76 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
 	br_fdb_put(f);
 }
 
-void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+static bool another_port_has_addr(const struct net_bridge_port *p,
+				  struct net_bridge_fdb_entry *f)
+{
+	struct net_bridge *br = p->br;
+	struct net_bridge_port *op;
+
+	list_for_each_entry(op, &br->port_list, list) {
+		if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
+			f->dst = op;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
 {
 	struct net_bridge *br = p->br;
 	int i;
+	struct netdev_hw_addr *ha;
 
 	spin_lock_bh(&br->hash_lock);
 
@@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 
 			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
 			if (f->dst == p && f->is_local) {
-				/* maybe another port has same hw addr? */
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto insert;
-					}
-				}
-
-				/* delete old one */
-				fdb_delete(f);
-				goto insert;
+				/*
+				 * maybe another port has same hw addr?,
+				 * if not then delete it
+				 */
+				if (!another_port_has_addr(p, f))
+					fdb_delete(f);
 			}
 		}
 	}
- insert:
-	/* insert new address,  may fail if invalid address or dup. */
-	fdb_insert(br, p, newaddr);
+
+	/* insert device addresses, may fail if invalid address. */
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		fdb_insert(br, p, ha->addr);
+	}
+	rcu_read_unlock();
 
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 			 * then when one port is deleted, assign
 			 * the local entry to other port
 			 */
-			if (f->is_local) {
-				struct net_bridge_port *op;
-				list_for_each_entry(op, &br->port_list, list) {
-					if (op != p &&
-					    !compare_ether_addr(op->dev->dev_addr,
-								f->addr.addr)) {
-						f->dst = op;
-						goto skip_delete;
-					}
-				}
-			}
-
-			fdb_delete(f);
-		skip_delete: ;
+			if (!f->is_local ||
+			    !another_port_has_addr(p, f))
+				fdb_delete(f);
 		}
 	}
 	spin_unlock_bh(&br->hash_lock);
@@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 }
 
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		      const unsigned char *addr)
 {
 	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
 	struct net_bridge_fdb_entry *fdb;
@@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	return 0;
 }
 
+static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
+			  struct net_device *dev)
+{
+	struct netdev_hw_addr *ha, *ha2;
+	struct net_bridge_fdb_entry *fdb;
+	struct hlist_head *head;
+	int ret = 0;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		ret = fdb_insert(br, source, ha->addr);
+		if (ret)
+			goto unroll;
+	}
+	goto unlock;
+unroll:
+	for_each_dev_addr(dev, ha2) {
+		if (ha2 == ha)
+			break;
+		head = &br->hash[br_mac_hash(ha2->addr)];
+		fdb = fdb_find(head, ha2->addr);
+		if (fdb && fdb->is_local)
+			fdb_delete(fdb);
+	}
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
 int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr)
+		  struct net_device *dev)
 {
 	int ret;
 
 	spin_lock_bh(&br->hash_lock);
-	ret = fdb_insert(br, source, addr);
+	ret = fdb_insert_dev(br, source, dev);
 	spin_unlock_bh(&br->hash_lock);
 	return ret;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..789cb30 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err0;
 
-	err = br_fdb_insert(br, p, dev->dev_addr);
+	err = br_fdb_insert(br, p, dev);
 	if (err)
 		goto err1;
 
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 763a3ec..1423541 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	case NETDEV_CHANGEADDR:
 		spin_lock_bh(&br->lock);
-		br_fdb_changeaddr(p, dev->dev_addr);
+		br_fdb_changeaddr(p, dev);
 		br_stp_recalculate_bridge_id(br);
 		spin_unlock_bh(&br->lock);
 		break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..65ffe3d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -148,7 +148,7 @@ extern int br_fdb_init(void);
 extern void br_fdb_fini(void);
 extern void br_fdb_flush(struct net_bridge *br);
 extern void br_fdb_changeaddr(struct net_bridge_port *p,
-			      const unsigned char *newaddr);
+			      struct net_device *dev);
 extern void br_fdb_cleanup(unsigned long arg);
 extern void br_fdb_delete_by_port(struct net_bridge *br,
 				  const struct net_bridge_port *p, int do_all);
@@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 			  unsigned long count, unsigned long off);
 extern int br_fdb_insert(struct net_bridge *br,
 			 struct net_bridge_port *source,
-			 const unsigned char *addr);
+			 struct net_device *dev);
 extern void br_fdb_update(struct net_bridge *br,
 			  struct net_bridge_port *source,
 			  const unsigned char *addr);

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost)
  2009-05-06 14:46       ` [Bridge] " Jiri Pirko
@ 2009-05-06 15:08         ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-05-06 15:08 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, ivecera

Jiri Pirko a écrit :
> (repost, no modifications)

Well, some changes are welcome :)

> 
> This patch changes the handling of mac addresses of bridge port devices. Now
> it uses previously introduced list of device addresses. It allows the bridge to
> know more then one local mac address per port which is mandatory for the right
> work in some cases.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  net/bridge/br_fdb.c     |  101 ++++++++++++++++++++++++++++++----------------
>  net/bridge/br_if.c      |    2 +-
>  net/bridge/br_notify.c  |    2 +-
>  net/bridge/br_private.h |    4 +-
>  4 files changed, 70 insertions(+), 39 deletions(-)
> 
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..1e63f76 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
>  	br_fdb_put(f);
>  }
>  
> -void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
> +static bool another_port_has_addr(const struct net_bridge_port *p,
> +				  struct net_bridge_fdb_entry *f)
> +{
> +	struct net_bridge *br = p->br;
> +	struct net_bridge_port *op;
> +
> +	list_for_each_entry(op, &br->port_list, list) {
> +		if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
> +			f->dst = op;
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}
> +
> +void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
>  {
>  	struct net_bridge *br = p->br;
>  	int i;
> +	struct netdev_hw_addr *ha;
>  
>  	spin_lock_bh(&br->hash_lock);
>  
> @@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
>  
>  			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
>  			if (f->dst == p && f->is_local) {
> -				/* maybe another port has same hw addr? */
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto insert;
> -					}
> -				}
> -
> -				/* delete old one */
> -				fdb_delete(f);
> -				goto insert;
> +				/*
> +				 * maybe another port has same hw addr?,
> +				 * if not then delete it
> +				 */
> +				if (!another_port_has_addr(p, f))
> +					fdb_delete(f);
>  			}
>  		}
>  	}
> - insert:
> -	/* insert new address,  may fail if invalid address or dup. */
> -	fdb_insert(br, p, newaddr);
> +
> +	/* insert device addresses, may fail if invalid address. */
> +
> +	rcu_read_lock();

Same problem as in a previous patch, Jiri.

You should not use rcu_read_lock()/rcu_read_unlock() at all in this context,
since you already own a lock.


> +	for_each_dev_addr(dev, ha) {
> +		fdb_insert(br, p, ha->addr);
> +	}
> +	rcu_read_unlock();
>  
>  	spin_unlock_bh(&br->hash_lock);
>  }
> @@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
>  			 * then when one port is deleted, assign
>  			 * the local entry to other port
>  			 */
> -			if (f->is_local) {
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto skip_delete;
> -					}
> -				}
> -			}
> -
> -			fdb_delete(f);
> -		skip_delete: ;
> +			if (!f->is_local ||
> +			    !another_port_has_addr(p, f))
> +				fdb_delete(f);
>  		}
>  	}
>  	spin_unlock_bh(&br->hash_lock);
> @@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
>  }
>  
>  static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
> -		  const unsigned char *addr)
> +		      const unsigned char *addr)
>  {
>  	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
>  	struct net_bridge_fdb_entry *fdb;
> @@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
>  	return 0;
>  }
>  
> +static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
> +			  struct net_device *dev)
> +{
> +	struct netdev_hw_addr *ha, *ha2;
> +	struct net_bridge_fdb_entry *fdb;
> +	struct hlist_head *head;
> +	int ret = 0;
> +
> +	rcu_read_lock();


You should not use rcu_read_lock()/rcu_read_unlock() at all in this context


> +	for_each_dev_addr(dev, ha) {
> +		ret = fdb_insert(br, source, ha->addr);
> +		if (ret)
> +			goto unroll;
> +	}
> +	goto unlock;
> +unroll:
> +	for_each_dev_addr(dev, ha2) {
> +		if (ha2 == ha)
> +			break;
> +		head = &br->hash[br_mac_hash(ha2->addr)];
> +		fdb = fdb_find(head, ha2->addr);
> +		if (fdb && fdb->is_local)
> +			fdb_delete(fdb);
> +	}
> +unlock:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
>  int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
> -		  const unsigned char *addr)
> +		  struct net_device *dev)
>  {
>  	int ret;
>  
>  	spin_lock_bh(&br->hash_lock);
> -	ret = fdb_insert(br, source, addr);
> +	ret = fdb_insert_dev(br, source, dev);
>  	spin_unlock_bh(&br->hash_lock);
>  	return ret;
>  }
> diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
> index 8a96672..789cb30 100644
> --- a/net/bridge/br_if.c
> +++ b/net/bridge/br_if.c
> @@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
>  	if (err)
>  		goto err0;
>  
> -	err = br_fdb_insert(br, p, dev->dev_addr);
> +	err = br_fdb_insert(br, p, dev);
>  	if (err)
>  		goto err1;
>  
> diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
> index 763a3ec..1423541 100644
> --- a/net/bridge/br_notify.c
> +++ b/net/bridge/br_notify.c
> @@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
>  
>  	case NETDEV_CHANGEADDR:
>  		spin_lock_bh(&br->lock);
> -		br_fdb_changeaddr(p, dev->dev_addr);
> +		br_fdb_changeaddr(p, dev);
>  		br_stp_recalculate_bridge_id(br);
>  		spin_unlock_bh(&br->lock);
>  		break;
> diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
> index b6c3b71..65ffe3d 100644
> --- a/net/bridge/br_private.h
> +++ b/net/bridge/br_private.h
> @@ -148,7 +148,7 @@ extern int br_fdb_init(void);
>  extern void br_fdb_fini(void);
>  extern void br_fdb_flush(struct net_bridge *br);
>  extern void br_fdb_changeaddr(struct net_bridge_port *p,
> -			      const unsigned char *newaddr);
> +			      struct net_device *dev);
>  extern void br_fdb_cleanup(unsigned long arg);
>  extern void br_fdb_delete_by_port(struct net_bridge *br,
>  				  const struct net_bridge_port *p, int do_all);
> @@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
>  			  unsigned long count, unsigned long off);
>  extern int br_fdb_insert(struct net_bridge *br,
>  			 struct net_bridge_port *source,
> -			 const unsigned char *addr);
> +			 struct net_device *dev);
>  extern void br_fdb_update(struct net_bridge *br,
>  			  struct net_bridge_port *source,
>  			  const unsigned char *addr);
> 
> 



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost)
@ 2009-05-06 15:08         ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-05-06 15:08 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, jgarzik, davem

Jiri Pirko a écrit :
> (repost, no modifications)

Well, some changes are welcome :)

> 
> This patch changes the handling of mac addresses of bridge port devices. Now
> it uses previously introduced list of device addresses. It allows the bridge to
> know more then one local mac address per port which is mandatory for the right
> work in some cases.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  net/bridge/br_fdb.c     |  101 ++++++++++++++++++++++++++++++----------------
>  net/bridge/br_if.c      |    2 +-
>  net/bridge/br_notify.c  |    2 +-
>  net/bridge/br_private.h |    4 +-
>  4 files changed, 70 insertions(+), 39 deletions(-)
> 
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..1e63f76 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
>  	br_fdb_put(f);
>  }
>  
> -void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
> +static bool another_port_has_addr(const struct net_bridge_port *p,
> +				  struct net_bridge_fdb_entry *f)
> +{
> +	struct net_bridge *br = p->br;
> +	struct net_bridge_port *op;
> +
> +	list_for_each_entry(op, &br->port_list, list) {
> +		if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
> +			f->dst = op;
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}
> +
> +void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
>  {
>  	struct net_bridge *br = p->br;
>  	int i;
> +	struct netdev_hw_addr *ha;
>  
>  	spin_lock_bh(&br->hash_lock);
>  
> @@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
>  
>  			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
>  			if (f->dst == p && f->is_local) {
> -				/* maybe another port has same hw addr? */
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto insert;
> -					}
> -				}
> -
> -				/* delete old one */
> -				fdb_delete(f);
> -				goto insert;
> +				/*
> +				 * maybe another port has same hw addr?,
> +				 * if not then delete it
> +				 */
> +				if (!another_port_has_addr(p, f))
> +					fdb_delete(f);
>  			}
>  		}
>  	}
> - insert:
> -	/* insert new address,  may fail if invalid address or dup. */
> -	fdb_insert(br, p, newaddr);
> +
> +	/* insert device addresses, may fail if invalid address. */
> +
> +	rcu_read_lock();

Same problem as in a previous patch, Jiri.

You should not use rcu_read_lock()/rcu_read_unlock() at all in this context,
since you already own a lock.


> +	for_each_dev_addr(dev, ha) {
> +		fdb_insert(br, p, ha->addr);
> +	}
> +	rcu_read_unlock();
>  
>  	spin_unlock_bh(&br->hash_lock);
>  }
> @@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
>  			 * then when one port is deleted, assign
>  			 * the local entry to other port
>  			 */
> -			if (f->is_local) {
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto skip_delete;
> -					}
> -				}
> -			}
> -
> -			fdb_delete(f);
> -		skip_delete: ;
> +			if (!f->is_local ||
> +			    !another_port_has_addr(p, f))
> +				fdb_delete(f);
>  		}
>  	}
>  	spin_unlock_bh(&br->hash_lock);
> @@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
>  }
>  
>  static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
> -		  const unsigned char *addr)
> +		      const unsigned char *addr)
>  {
>  	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
>  	struct net_bridge_fdb_entry *fdb;
> @@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
>  	return 0;
>  }
>  
> +static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
> +			  struct net_device *dev)
> +{
> +	struct netdev_hw_addr *ha, *ha2;
> +	struct net_bridge_fdb_entry *fdb;
> +	struct hlist_head *head;
> +	int ret = 0;
> +
> +	rcu_read_lock();


You should not use rcu_read_lock()/rcu_read_unlock() at all in this context


> +	for_each_dev_addr(dev, ha) {
> +		ret = fdb_insert(br, source, ha->addr);
> +		if (ret)
> +			goto unroll;
> +	}
> +	goto unlock;
> +unroll:
> +	for_each_dev_addr(dev, ha2) {
> +		if (ha2 == ha)
> +			break;
> +		head = &br->hash[br_mac_hash(ha2->addr)];
> +		fdb = fdb_find(head, ha2->addr);
> +		if (fdb && fdb->is_local)
> +			fdb_delete(fdb);
> +	}
> +unlock:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
>  int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
> -		  const unsigned char *addr)
> +		  struct net_device *dev)
>  {
>  	int ret;
>  
>  	spin_lock_bh(&br->hash_lock);
> -	ret = fdb_insert(br, source, addr);
> +	ret = fdb_insert_dev(br, source, dev);
>  	spin_unlock_bh(&br->hash_lock);
>  	return ret;
>  }
> diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
> index 8a96672..789cb30 100644
> --- a/net/bridge/br_if.c
> +++ b/net/bridge/br_if.c
> @@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
>  	if (err)
>  		goto err0;
>  
> -	err = br_fdb_insert(br, p, dev->dev_addr);
> +	err = br_fdb_insert(br, p, dev);
>  	if (err)
>  		goto err1;
>  
> diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
> index 763a3ec..1423541 100644
> --- a/net/bridge/br_notify.c
> +++ b/net/bridge/br_notify.c
> @@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
>  
>  	case NETDEV_CHANGEADDR:
>  		spin_lock_bh(&br->lock);
> -		br_fdb_changeaddr(p, dev->dev_addr);
> +		br_fdb_changeaddr(p, dev);
>  		br_stp_recalculate_bridge_id(br);
>  		spin_unlock_bh(&br->lock);
>  		break;
> diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
> index b6c3b71..65ffe3d 100644
> --- a/net/bridge/br_private.h
> +++ b/net/bridge/br_private.h
> @@ -148,7 +148,7 @@ extern int br_fdb_init(void);
>  extern void br_fdb_fini(void);
>  extern void br_fdb_flush(struct net_bridge *br);
>  extern void br_fdb_changeaddr(struct net_bridge_port *p,
> -			      const unsigned char *newaddr);
> +			      struct net_device *dev);
>  extern void br_fdb_cleanup(unsigned long arg);
>  extern void br_fdb_delete_by_port(struct net_bridge *br,
>  				  const struct net_bridge_port *p, int do_all);
> @@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
>  			  unsigned long count, unsigned long off);
>  extern int br_fdb_insert(struct net_bridge *br,
>  			 struct net_bridge_port *source,
> -			 const unsigned char *addr);
> +			 struct net_device *dev);
>  extern void br_fdb_update(struct net_bridge *br,
>  			  struct net_bridge_port *source,
>  			  const unsigned char *addr);
> 
> 



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost)
  2009-05-06 14:46       ` [Bridge] " Jiri Pirko
@ 2009-05-06 19:26         ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-05-06 19:26 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, netdev, jgarzik, davem, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Wed, 6 May 2009 16:46:25 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> (repost, no modifications)
> 
> This patch changes the handling of mac addresses of bridge port devices. Now
> it uses previously introduced list of device addresses. It allows the bridge to
> know more then one local mac address per port which is mandatory for the right
> work in some cases.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  net/bridge/br_fdb.c     |  101 ++++++++++++++++++++++++++++++----------------
>  net/bridge/br_if.c      |    2 +-
>  net/bridge/br_notify.c  |    2 +-
>  net/bridge/br_private.h |    4 +-
>  4 files changed, 70 insertions(+), 39 deletions(-)
> 
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..1e63f76 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
>  	br_fdb_put(f);
>  }
>  
> -void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
> +static bool another_port_has_addr(const struct net_bridge_port *p,
> +				  struct net_bridge_fdb_entry *f)
> +{
> +	struct net_bridge *br = p->br;
> +	struct net_bridge_port *op;
> +
> +	list_for_each_entry(op, &br->port_list, list) {
> +		if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
> +			f->dst = op;
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}
> +
> +void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
>  {
>  	struct net_bridge *br = p->br;
>  	int i;
> +	struct netdev_hw_addr *ha;
>  
>  	spin_lock_bh(&br->hash_lock);
>  
> @@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
>  
>  			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
>  			if (f->dst == p && f->is_local) {
> -				/* maybe another port has same hw addr? */
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto insert;
> -					}
> -				}
> -
> -				/* delete old one */
> -				fdb_delete(f);
> -				goto insert;
> +				/*
> +				 * maybe another port has same hw addr?,
> +				 * if not then delete it
> +				 */
> +				if (!another_port_has_addr(p, f))
> +					fdb_delete(f);
>  			}
>  		}
>  	}
> - insert:
> -	/* insert new address,  may fail if invalid address or dup. */
> -	fdb_insert(br, p, newaddr);
> +
> +	/* insert device addresses, may fail if invalid address. */
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		fdb_insert(br, p, ha->addr);
> +	}
> +	rcu_read_unlock();
>  
>  	spin_unlock_bh(&br->hash_lock);
>  }
> @@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
>  			 * then when one port is deleted, assign
>  			 * the local entry to other port
>  			 */
> -			if (f->is_local) {
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto skip_delete;
> -					}
> -				}
> -			}
> -
> -			fdb_delete(f);
> -		skip_delete: ;
> +			if (!f->is_local ||
> +			    !another_port_has_addr(p, f))
> +				fdb_delete(f);
>  		}
>  	}
>  	spin_unlock_bh(&br->hash_lock);
> @@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
>  }
>  
>  static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
> -		  const unsigned char *addr)
> +		      const unsigned char *addr)
>  {
>  	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
>  	struct net_bridge_fdb_entry *fdb;
> @@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
>  	return 0;
>  }
>  
> +static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
> +			  struct net_device *dev)
> +{
> +	struct netdev_hw_addr *ha, *ha2;
> +	struct net_bridge_fdb_entry *fdb;
> +	struct hlist_head *head;
> +	int ret = 0;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		ret = fdb_insert(br, source, ha->addr);
> +		if (ret)
> +			goto unroll;
> +	}
> +	goto unlock;
> +unroll:
> +	for_each_dev_addr(dev, ha2) {
> +		if (ha2 == ha)
> +			break;
> +		head = &br->hash[br_mac_hash(ha2->addr)];
> +		fdb = fdb_find(head, ha2->addr);
> +		if (fdb && fdb->is_local)
> +			fdb_delete(fdb);
> +	}
> +unlock:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
>  int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
> -		  const unsigned char *addr)
> +		  struct net_device *dev)
>  {
>  	int ret;
>  
>  	spin_lock_bh(&br->hash_lock);
> -	ret = fdb_insert(br, source, addr);
> +	ret = fdb_insert_dev(br, source, dev);
>  	spin_unlock_bh(&br->hash_lock);
>  	return ret;
>  }
> diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
> index 8a96672..789cb30 100644
> --- a/net/bridge/br_if.c
> +++ b/net/bridge/br_if.c
> @@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
>  	if (err)
>  		goto err0;
>  
> -	err = br_fdb_insert(br, p, dev->dev_addr);
> +	err = br_fdb_insert(br, p, dev);
>  	if (err)
>  		goto err1;
>  
> diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
> index 763a3ec..1423541 100644
> --- a/net/bridge/br_notify.c
> +++ b/net/bridge/br_notify.c
> @@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
>  
>  	case NETDEV_CHANGEADDR:
>  		spin_lock_bh(&br->lock);
> -		br_fdb_changeaddr(p, dev->dev_addr);
> +		br_fdb_changeaddr(p, dev);
>  		br_stp_recalculate_bridge_id(br);
>  		spin_unlock_bh(&br->lock);
>  		break;
> diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
> index b6c3b71..65ffe3d 100644
> --- a/net/bridge/br_private.h
> +++ b/net/bridge/br_private.h
> @@ -148,7 +148,7 @@ extern int br_fdb_init(void);
>  extern void br_fdb_fini(void);
>  extern void br_fdb_flush(struct net_bridge *br);
>  extern void br_fdb_changeaddr(struct net_bridge_port *p,
> -			      const unsigned char *newaddr);
> +			      struct net_device *dev);
>  extern void br_fdb_cleanup(unsigned long arg);
>  extern void br_fdb_delete_by_port(struct net_bridge *br,
>  				  const struct net_bridge_port *p, int do_all);
> @@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
>  			  unsigned long count, unsigned long off);
>  extern int br_fdb_insert(struct net_bridge *br,
>  			 struct net_bridge_port *source,
> -			 const unsigned char *addr);
> +			 struct net_device *dev);
>  extern void br_fdb_update(struct net_bridge *br,
>  			  struct net_bridge_port *source,
>  			  const unsigned char *addr);

Won't this all break spanning tree, which expects a 1:1 relationship between
address and port?

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost)
@ 2009-05-06 19:26         ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-05-06 19:26 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: ivecera, fubar, netdev, bridge, linux-kernel, mschmidt,
	bonding-devel, dada1, jgarzik, davem

On Wed, 6 May 2009 16:46:25 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> (repost, no modifications)
> 
> This patch changes the handling of mac addresses of bridge port devices. Now
> it uses previously introduced list of device addresses. It allows the bridge to
> know more than one local mac address per port which is mandatory for the right
> work in some cases.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  net/bridge/br_fdb.c     |  101 ++++++++++++++++++++++++++++++----------------
>  net/bridge/br_if.c      |    2 +-
>  net/bridge/br_notify.c  |    2 +-
>  net/bridge/br_private.h |    4 +-
>  4 files changed, 70 insertions(+), 39 deletions(-)
> 
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..1e63f76 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
>  	br_fdb_put(f);
>  }
>  
> -void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
> +static bool another_port_has_addr(const struct net_bridge_port *p,
> +				  struct net_bridge_fdb_entry *f)
> +{
> +	struct net_bridge *br = p->br;
> +	struct net_bridge_port *op;
> +
> +	list_for_each_entry(op, &br->port_list, list) {
> +		if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
> +			f->dst = op;
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}
> +
> +void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
>  {
>  	struct net_bridge *br = p->br;
>  	int i;
> +	struct netdev_hw_addr *ha;
>  
>  	spin_lock_bh(&br->hash_lock);
>  
> @@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
>  
>  			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
>  			if (f->dst == p && f->is_local) {
> -				/* maybe another port has same hw addr? */
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto insert;
> -					}
> -				}
> -
> -				/* delete old one */
> -				fdb_delete(f);
> -				goto insert;
> +				/*
> +				 * maybe another port has same hw addr?,
> +				 * if not then delete it
> +				 */
> +				if (!another_port_has_addr(p, f))
> +					fdb_delete(f);
>  			}
>  		}
>  	}
> - insert:
> -	/* insert new address,  may fail if invalid address or dup. */
> -	fdb_insert(br, p, newaddr);
> +
> +	/* insert device addresses, may fail if invalid address. */
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		fdb_insert(br, p, ha->addr);
> +	}
> +	rcu_read_unlock();
>  
>  	spin_unlock_bh(&br->hash_lock);
>  }
> @@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
>  			 * then when one port is deleted, assign
>  			 * the local entry to other port
>  			 */
> -			if (f->is_local) {
> -				struct net_bridge_port *op;
> -				list_for_each_entry(op, &br->port_list, list) {
> -					if (op != p &&
> -					    !compare_ether_addr(op->dev->dev_addr,
> -								f->addr.addr)) {
> -						f->dst = op;
> -						goto skip_delete;
> -					}
> -				}
> -			}
> -
> -			fdb_delete(f);
> -		skip_delete: ;
> +			if (!f->is_local ||
> +			    !another_port_has_addr(p, f))
> +				fdb_delete(f);
>  		}
>  	}
>  	spin_unlock_bh(&br->hash_lock);
> @@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
>  }
>  
>  static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
> -		  const unsigned char *addr)
> +		      const unsigned char *addr)
>  {
>  	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
>  	struct net_bridge_fdb_entry *fdb;
> @@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
>  	return 0;
>  }
>  
> +static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
> +			  struct net_device *dev)
> +{
> +	struct netdev_hw_addr *ha, *ha2;
> +	struct net_bridge_fdb_entry *fdb;
> +	struct hlist_head *head;
> +	int ret = 0;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		ret = fdb_insert(br, source, ha->addr);
> +		if (ret)
> +			goto unroll;
> +	}
> +	goto unlock;
> +unroll:
> +	for_each_dev_addr(dev, ha2) {
> +		if (ha2 == ha)
> +			break;
> +		head = &br->hash[br_mac_hash(ha2->addr)];
> +		fdb = fdb_find(head, ha2->addr);
> +		if (fdb && fdb->is_local)
> +			fdb_delete(fdb);
> +	}
> +unlock:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
>  int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
> -		  const unsigned char *addr)
> +		  struct net_device *dev)
>  {
>  	int ret;
>  
>  	spin_lock_bh(&br->hash_lock);
> -	ret = fdb_insert(br, source, addr);
> +	ret = fdb_insert_dev(br, source, dev);
>  	spin_unlock_bh(&br->hash_lock);
>  	return ret;
>  }
> diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
> index 8a96672..789cb30 100644
> --- a/net/bridge/br_if.c
> +++ b/net/bridge/br_if.c
> @@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
>  	if (err)
>  		goto err0;
>  
> -	err = br_fdb_insert(br, p, dev->dev_addr);
> +	err = br_fdb_insert(br, p, dev);
>  	if (err)
>  		goto err1;
>  
> diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
> index 763a3ec..1423541 100644
> --- a/net/bridge/br_notify.c
> +++ b/net/bridge/br_notify.c
> @@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
>  
>  	case NETDEV_CHANGEADDR:
>  		spin_lock_bh(&br->lock);
> -		br_fdb_changeaddr(p, dev->dev_addr);
> +		br_fdb_changeaddr(p, dev);
>  		br_stp_recalculate_bridge_id(br);
>  		spin_unlock_bh(&br->lock);
>  		break;
> diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
> index b6c3b71..65ffe3d 100644
> --- a/net/bridge/br_private.h
> +++ b/net/bridge/br_private.h
> @@ -148,7 +148,7 @@ extern int br_fdb_init(void);
>  extern void br_fdb_fini(void);
>  extern void br_fdb_flush(struct net_bridge *br);
>  extern void br_fdb_changeaddr(struct net_bridge_port *p,
> -			      const unsigned char *newaddr);
> +			      struct net_device *dev);
>  extern void br_fdb_cleanup(unsigned long arg);
>  extern void br_fdb_delete_by_port(struct net_bridge *br,
>  				  const struct net_bridge_port *p, int do_all);
> @@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
>  			  unsigned long count, unsigned long off);
>  extern int br_fdb_insert(struct net_bridge *br,
>  			 struct net_bridge_port *source,
> -			 const unsigned char *addr);
> +			 struct net_device *dev);
>  extern void br_fdb_update(struct net_bridge *br,
>  			  struct net_bridge_port *source,
>  			  const unsigned char *addr);

Won't this all break spanning tree which expects 1:1 relationship between
address and port.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost)
  2009-05-06 19:26         ` [Bridge] " Stephen Hemminger
@ 2009-05-07 22:03           ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-07 22:03 UTC (permalink / raw)
  To: shemminger
  Cc: jpirko, linux-kernel, netdev, jgarzik, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 6 May 2009 12:26:45 -0700

> Won't this all break spanning tree which expects 1:1 relationship
> between address and port.

Indeed, this could be a fundamental issue with this change.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost)
@ 2009-05-07 22:03           ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-07 22:03 UTC (permalink / raw)
  To: shemminger
  Cc: ivecera, fubar, jpirko, netdev, bridge, linux-kernel, mschmidt,
	dada1, jgarzik, bonding-devel

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 6 May 2009 12:26:45 -0700

> Won't this all break spanning tree which expects 1:1 relationship
> between address and port.

Indeed, this could be a fundamental issue with this change.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
  2009-05-05 19:27                 ` [Bridge] " David Miller
@ 2009-05-08 22:38                   ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-05-08 22:38 UTC (permalink / raw)
  To: David Miller
  Cc: jpirko, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Tue, 05 May 2009 12:27:18 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Jiri Pirko <jpirko@redhat.com>
> Date: Tue, 5 May 2009 14:48:28 +0200
> 
> > This patch introduces a new list in struct net_device and brings a set of
> > functions to handle the work with device address list. The list is a replacement
> > for the original dev_addr field and because in some situations there is need to
> > carry several device addresses with the net device. To be backward compatible,
> > dev_addr is made to point to the first member of the list so original drivers
> > sees no difference.
> > 
> > Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> Applied to net-next-2.6, thanks!
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Not sure if this is such a good idea since the purpose of this was to fix
a bonding/bridging interaction, but it breaks STP on bridging.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
@ 2009-05-08 22:38                   ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-05-08 22:38 UTC (permalink / raw)
  To: David Miller
  Cc: ivecera, fubar, jpirko, netdev, bridge, linux-kernel, mschmidt,
	jgarzik, dada1, bonding-devel

On Tue, 05 May 2009 12:27:18 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Jiri Pirko <jpirko@redhat.com>
> Date: Tue, 5 May 2009 14:48:28 +0200
> 
> > This patch introduces a new list in struct net_device and brings a set of
> > functions to handle the work with device address list. The list is a replacement
> > for the original dev_addr field and because in some situations there is need to
> > carry several device addresses with the net device. To be backward compatible,
> > dev_addr is made to point to the first member of the list so original drivers
> > sees no difference.
> > 
> > Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> Applied to net-next-2.6, thanks!
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Not sure if this is such a good idea since the purpose of this was to fix
a bonding/bridging interaction, but it breaks STP on bridging.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
  2009-05-08 22:38                   ` [Bridge] " Stephen Hemminger
@ 2009-05-08 23:00                     ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-08 23:00 UTC (permalink / raw)
  To: shemminger
  Cc: jpirko, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 8 May 2009 15:38:42 -0700

> Not sure if this is such a good idea since the purpose of this was to fix
> a bonding/bridging interaction, but it breaks STP on bridging.

Thanks for not paying attention... :-/

The Intel folks want to have an address list functionality so
they can publish MAC addresses meant for FCOE and other purposes.

So even if the bonding bits bomb, we still need this.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
@ 2009-05-08 23:00                     ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-08 23:00 UTC (permalink / raw)
  To: shemminger
  Cc: ivecera, fubar, jpirko, netdev, bridge, linux-kernel, mschmidt,
	jgarzik, dada1, bonding-devel

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 8 May 2009 15:38:42 -0700

> Not sure if this is such a good idea since the purpose of this was to fix
> a bonding/bridging interaction, but it breaks STP on bridging.

Thanks for not paying attention... :-/

The Intel folks want to have an address list functionality so
they can publish MAC addresses meant for FCOE and other purposes.

So even if the bonding bits bomb, we still need this.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
  2009-05-08 23:00                     ` [Bridge] " David Miller
@ 2009-05-08 23:12                       ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-05-08 23:12 UTC (permalink / raw)
  To: David Miller
  Cc: jpirko, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Fri, 08 May 2009 16:00:08 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Fri, 8 May 2009 15:38:42 -0700
> 
> > Not sure if this is such a good idea since the purpose of this was to fix
> > a bonding/bridging interaction, but it breaks STP on bridging.
> 
> Thanks for not paying attention... :-/
> 
> The Intel folks want to have an address list functionality so
> they can publish MAC addresses meant for FCOE and other purposes.
> 
> So even if the bonding bits bomb, we still need this.

But the other infrastructure may have same issues (netfilter, etc).
It just seems like it would be easier to have multiple network devices
so that upper layers could disambiguate more easily.

-- 

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
@ 2009-05-08 23:12                       ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-05-08 23:12 UTC (permalink / raw)
  To: David Miller
  Cc: ivecera, fubar, jpirko, netdev, bridge, linux-kernel, mschmidt,
	jgarzik, dada1, bonding-devel

On Fri, 08 May 2009 16:00:08 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Fri, 8 May 2009 15:38:42 -0700
> 
> > Not sure if this is such a good idea since the purpose of this was to fix
> > a bonding/bridging interaction, but it breaks STP on bridging.
> 
> Thanks for not paying attention... :-/
> 
> The Intel folks want to have an address list functionality so
> they can publish MAC addresses meant for FCOE and other purposes.
> 
> So even if the bonding bits bomb, we still need this.

But the other infrastructure may have same issues (netfilter, etc).
It just seems like it would be easier to have multiple network devices
so that upper layers could disambiguate more easily.

-- 

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
  2009-05-08 23:12                       ` [Bridge] " Stephen Hemminger
@ 2009-05-08 23:25                         ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-08 23:25 UTC (permalink / raw)
  To: shemminger
  Cc: jpirko, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 8 May 2009 16:12:04 -0700

> But the other infrastructure may have same issues (netfilter, etc).
> Just seems like it would be either to have multiple network devices
> so that upper layers could disambiguate easier.

That's quite a heavyweight solution to what is purely
an addressing issue, don't you think?

We can just revert all of that netdev_ops stuff if you
think per-netdev cost doesn't matter :-)

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
@ 2009-05-08 23:25                         ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-08 23:25 UTC (permalink / raw)
  To: shemminger
  Cc: ivecera, fubar, jpirko, netdev, bridge, linux-kernel, mschmidt,
	jgarzik, dada1, bonding-devel

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 8 May 2009 16:12:04 -0700

> But the other infrastructure may have same issues (netfilter, etc).
> Just seems like it would be either to have multiple network devices
> so that upper layers could disambiguate easier.

That's quite a heavyweight solution to what is purely
an addressing issue, don't you think?

We can just revert all of that netdev_ops stuff if you
think per-netdev cost doesn't matter :-)

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
  2009-05-08 23:25                         ` [Bridge] " David Miller
@ 2009-05-08 23:29                           ` Stephen Hemminger
  -1 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-05-08 23:29 UTC (permalink / raw)
  To: David Miller
  Cc: jpirko, linux-kernel, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, dada1, ivecera

On Fri, 08 May 2009 16:25:55 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Fri, 8 May 2009 16:12:04 -0700
> 
> > But the other infrastructure may have same issues (netfilter, etc).
> > Just seems like it would be either to have multiple network devices
> > so that upper layers could disambiguate easier.
> 
> That's quite a heavyweight solution to what is purely
> an addressing issue, don't you think?
> 
> We can just revert all of that netdev_ops stuff if you
> think per-netdev cost doesn't matter :-)

I am just concerned that this is a fundamental change (like MQ was) and
it will take a couple of releases to shake out all the inter-relationships.
When an architectural change is made, I would like to see more analysis done.
Netdev-ops was more of a refactoring than a change to what protocols
and qdisc/filters/... see.

-- 

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH] net: introduce a list of device addresses dev_addr_list (v6)
@ 2009-05-08 23:29                           ` Stephen Hemminger
  0 siblings, 0 replies; 214+ messages in thread
From: Stephen Hemminger @ 2009-05-08 23:29 UTC (permalink / raw)
  To: David Miller
  Cc: ivecera, fubar, jpirko, netdev, bridge, linux-kernel, mschmidt,
	jgarzik, dada1, bonding-devel

On Fri, 08 May 2009 16:25:55 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Fri, 8 May 2009 16:12:04 -0700
> 
> > But the other infrastructure may have same issues (netfilter, etc).
> > Just seems like it would be either to have multiple network devices
> > so that upper layers could disambiguate easier.
> 
> That's quite a heavyweight solution to what is purely
> an addressing issue, don't you think?
> 
> We can just revert all of that netdev_ops stuff if you
> think per-netdev cost doesn't matter :-)

I am just concerned that this is a fundamental change (like MQ was) and
it will take a couple of releases to shake out all the inter-relationships.
When an architectural change is made, I would like to see more analysis done.
Netdev-ops was more of a refactoring than a change to what protocols
and qdisc/filters/... see.

-- 

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
  2009-03-26 15:52   ` [Bridge] " Jiri Pirko
@ 2009-05-26 15:17     ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-26 15:17 UTC (permalink / raw)
  To: netdev
  Cc: jgarzik, davem, shemminger, bridge, fubar, bonding-devel, kaber,
	mschmidt, dada1

[PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge it will only add one of the mac addresses
into a hash list of mac addresses, say X. This mac address is marked as local.
But this bonding interface also has mac address Y. Now when a packet arrives with
destination address Y, this address is not marked as local and the packet looks
like it needs to be forwarded. This packet is then lost, which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

***
When the multiple-addresses-per-bridge-port approach failed to solve this issue
due to STP, I started to think of another way to solve it. I returned to the
previous solution, but with a tweak.

This patch solves the situation in the bonding without touching bridge code.
For every frame incoming to the bond, the destination address is compared to the
current address of the slave device from which the packet came. If these two
match, the destination address is replaced by the mac address of the master. This
address is known by the bridge, so the frame is delivered properly.

I verified experimentally that this works as well as searching through the slave
list (v4 of this patch).

I was forced to create a new header because I need to use
compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
linux/netdevice.h. I've hit some cross include issues. I think that it's good
to have skb_bond_should_drop() in a separate file anyway.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/include/linux/bonding.h b/include/linux/bonding.h
new file mode 100644
index 0000000..3081ddb
--- /dev/null
+++ b/include/linux/bonding.h
@@ -0,0 +1,78 @@
+/*
+ * include/linux/bonding.h
+ *
+ * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Bonding device helpers.
+ */
+
+#ifndef _LINUX_BONDING_H
+#define _LINUX_BONDING_H
+
+#ifdef __KERNEL__
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+					      struct net_device *dev,
+					      struct net_device *master)
+{
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+
+	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
+	    !compare_ether_addr_64bits(dest, dev->dev_addr))
+		memcpy(dest, master->dev_addr, ETH_ALEN);
+}
+
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+static inline int skb_bond_should_drop(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct net_device *master = dev->master;
+
+	if (master) {
+		if (master->priv_flags & IFF_MASTER_ARPMON)
+			dev->last_rx = jiffies;
+
+		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+			/* Do address unmangle. The local destination address
+			 * will be always the one master has. Provides the right
+			 * functionality in a bridge.
+			 */
+			skb_bond_set_mac_by_master(skb, dev, master);
+		}
+
+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+				return 0;
+
+			if (master->priv_flags & IFF_MASTER_ALB) {
+				if (skb->pkt_type != PACKET_BROADCAST &&
+				    skb->pkt_type != PACKET_MULTICAST)
+					return 0;
+			}
+			if (master->priv_flags & IFF_MASTER_8023AD &&
+			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+				return 0;
+
+			return 1;
+		}
+	}
+	return 0;
+}
+
+#endif /* __KERNEL__ */
+
+#endif	/* _LINUX_BONDING_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ae3c209..06e24ae 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1897,39 +1897,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static inline int skb_bond_should_drop(struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-	struct net_device *master = dev->master;
-
-	if (master) {
-		if (master->priv_flags & IFF_MASTER_ARPMON)
-			dev->last_rx = jiffies;
-
-		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-				return 0;
-
-			if (master->priv_flags & IFF_MASTER_ALB) {
-				if (skb->pkt_type != PACKET_BROADCAST &&
-				    skb->pkt_type != PACKET_MULTICAST)
-					return 0;
-			}
-			if (master->priv_flags & IFF_MASTER_8023AD &&
-			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-				return 0;
-
-			return 1;
-		}
-	}
-	return 0;
-}
-
 extern struct pernet_operations __net_initdata loopback_net_ops;
 
 static inline int dev_ethtool_get_settings(struct net_device *dev,
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 7f7de1a..c6eae40 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -2,6 +2,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_vlan.h>
 #include <linux/netpoll.h>
+#include <linux/bonding.h>
 #include "vlan.h"
 
 /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
diff --git a/net/core/dev.c b/net/core/dev.c
index 241613f..221b43f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -127,6 +127,7 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/napi.h>
+#include <linux/bonding.h>
 
 #include "net-sysfs.h"
 

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
@ 2009-05-26 15:17     ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-26 15:17 UTC (permalink / raw)
  To: netdev; +Cc: fubar, jgarzik, bridge, mschmidt, bonding-devel, dada1, davem

[PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge it will only add one of the mac addresses
into a hash list of mac addresses, say X. This mac address is marked as local.
But this bonding interface also has mac address Y. Now when a packet arrives with
destination address Y, this address is not marked as local and the packet looks
like it needs to be forwarded. This packet is then lost, which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

***
When the multiple-addresses-per-bridge-port approach failed to solve this issue
due to STP, I started to think of another way to solve it. I returned to the
previous solution, but with a tweak.

This patch solves the situation in the bonding without touching bridge code.
For every frame incoming to the bond, the destination address is compared to the
current address of the slave device from which the packet came. If these two
match, the destination address is replaced by the mac address of the master. This
address is known by the bridge, so the frame is delivered properly.

I verified experimentally that this works as well as searching through the slave
list (v4 of this patch).

I was forced to create a new header because I need to use
compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
linux/netdevice.h. I've hit some cross include issues. I think that it's good
to have skb_bond_should_drop() in a separate file anyway.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/include/linux/bonding.h b/include/linux/bonding.h
new file mode 100644
index 0000000..3081ddb
--- /dev/null
+++ b/include/linux/bonding.h
@@ -0,0 +1,78 @@
+/*
+ * include/linux/bonding.h
+ *
+ * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Bonding device helpers.
+ */
+
+#ifndef _LINUX_BONDING_H
+#define _LINUX_BONDING_H
+
+#ifdef __KERNEL__
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+					      struct net_device *dev,
+					      struct net_device *master)
+{
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+
+	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
+	    !compare_ether_addr_64bits(dest, dev->dev_addr))
+		memcpy(dest, master->dev_addr, ETH_ALEN);
+}
+
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+static inline int skb_bond_should_drop(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct net_device *master = dev->master;
+
+	if (master) {
+		if (master->priv_flags & IFF_MASTER_ARPMON)
+			dev->last_rx = jiffies;
+
+		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+			/* Do address unmangle. The local destination address
+			 * will be always the one master has. Provides the right
+			 * functionality in a bridge.
+			 */
+			skb_bond_set_mac_by_master(skb, dev, master);
+		}
+
+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+				return 0;
+
+			if (master->priv_flags & IFF_MASTER_ALB) {
+				if (skb->pkt_type != PACKET_BROADCAST &&
+				    skb->pkt_type != PACKET_MULTICAST)
+					return 0;
+			}
+			if (master->priv_flags & IFF_MASTER_8023AD &&
+			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+				return 0;
+
+			return 1;
+		}
+	}
+	return 0;
+}
+
+#endif /* __KERNEL__ */
+
+#endif	/* _LINUX_BONDING_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ae3c209..06e24ae 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1897,39 +1897,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static inline int skb_bond_should_drop(struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-	struct net_device *master = dev->master;
-
-	if (master) {
-		if (master->priv_flags & IFF_MASTER_ARPMON)
-			dev->last_rx = jiffies;
-
-		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-				return 0;
-
-			if (master->priv_flags & IFF_MASTER_ALB) {
-				if (skb->pkt_type != PACKET_BROADCAST &&
-				    skb->pkt_type != PACKET_MULTICAST)
-					return 0;
-			}
-			if (master->priv_flags & IFF_MASTER_8023AD &&
-			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-				return 0;
-
-			return 1;
-		}
-	}
-	return 0;
-}
-
 extern struct pernet_operations __net_initdata loopback_net_ops;
 
 static inline int dev_ethtool_get_settings(struct net_device *dev,
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 7f7de1a..c6eae40 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -2,6 +2,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_vlan.h>
 #include <linux/netpoll.h>
+#include <linux/bonding.h>
 #include "vlan.h"
 
 /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
diff --git a/net/core/dev.c b/net/core/dev.c
index 241613f..221b43f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -127,6 +127,7 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/napi.h>
+#include <linux/bonding.h>
 
 #include "net-sysfs.h"
 

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
  2009-05-26 15:17     ` [Bridge] " Jiri Pirko
@ 2009-05-26 16:32       ` Andy Gospodarek
  -1 siblings, 0 replies; 214+ messages in thread
From: Andy Gospodarek @ 2009-05-26 16:32 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1

On Tue, May 26, 2009 at 05:17:17PM +0200, Jiri Pirko wrote:
> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
> will simultaneously use multiple MAC addresses across different slaves. When you
> put this kind of bond device into a bridge it will only add one of the mac addresses
> into a hash list of mac addresses, say X. This mac address is marked as local.
> But this bonding interface also has mac address Y. Now when a packet arrives with
> destination address Y, this address is not marked as local and the packet looks
> like it needs to be forwarded. This packet is then lost, which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> ***
> When the multiple addresses for bridge port approach failed to solve this issue
> due to STP I started to think other way to solve this. I returned to previous
> solution but tweaked one.
> 
> This patch solves the situation in the bonding without touching bridge code.
> For every incoming frame to bonding the destination address is compared to the
> current address of the slave device from which the packet came. If these two
> match, the destination address is replaced by the mac address of the master. This
> address is known by the bridge so the packet is delivered properly.

Did you test this with a bond with more than 2 ports?  I ask because I
might also expect a check against all the members of the bond (rather
than simply the receiving device).

That check would be quite expensive for every frame and I think the
scenario is quite unlikely based on the frequency of 'learning frames'
sent by the alb code (so the switch connected to the host should have
its forwarding database correct), but it might be something to think
about in the future.

> I experimentally tried that this works as good as searching through the slave
> list (v4 of this patch).
> 
> I was forced to create a new header because I need to use
> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
> linux/netdevice.h. I've hit some cross include issues. I think that it's good
> to have skb_bond_should_drop() in a separate file anyway.
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 

This certainly won't cure all of the problems that arise with bonding
and bridging interactions, but it's a step in the right direction.

Acked-by: Andy Gospodarek <andy@greyhouse.net>


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
@ 2009-05-26 16:32       ` Andy Gospodarek
  0 siblings, 0 replies; 214+ messages in thread
From: Andy Gospodarek @ 2009-05-26 16:32 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik, dada1, davem

On Tue, May 26, 2009 at 05:17:17PM +0200, Jiri Pirko wrote:
> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
> will simultaneously use multiple MAC addresses across different slaves. When you
> put this kind of bond device into a bridge it will only add one of the mac addresses
> into a hash list of mac addresses, say X. This mac address is marked as local.
> But this bonding interface also has mac address Y. Now when a packet arrives with
> destination address Y, this address is not marked as local and the packet looks
> like it needs to be forwarded. This packet is then lost, which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> ***
> When the multiple addresses for bridge port approach failed to solve this issue
> due to STP I started to think other way to solve this. I returned to previous
> solution but tweaked one.
> 
> This patch solves the situation in the bonding without touching bridge code.
> For every incoming frame to bonding the destination address is compared to the
> current address of the slave device from which the packet came. If these two
> match, the destination address is replaced by the mac address of the master. This
> address is known by the bridge so the packet is delivered properly.

Did you test this with a bond with more than 2 ports?  I ask because I
might also expect a check against all the members of the bond (rather
than simply the receiving device).

That check would be quite expensive for every frame and I think the
scenario is quite unlikely based on the frequency of 'learning frames'
sent by the alb code (so the switch connected to the host should have
its forwarding database correct), but it might be something to think
about in the future.

> I experimentally tried that this works as good as searching through the slave
> list (v4 of this patch).
> 
> I was forced to create a new header because I need to use
> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
> linux/netdevice.h. I've hit some cross include issues. I think that it's good
> to have skb_bond_should_drop() in a separate file anyway.
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 

This certainly won't cure all of the problems that arise with bonding
and bridging interactions, but it's a step in the right direction.

Acked-by: Andy Gospodarek <andy@greyhouse.net>


^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
  2009-05-26 15:17     ` [Bridge] " Jiri Pirko
@ 2009-05-26 16:59       ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-05-26 16:59 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt

Jiri Pirko a écrit :
> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
> will simultaneously use multiple MAC addresses across different slaves. When you
> put this kind of bond device into a bridge it will only add one of the mac addresses
> into a hash list of mac addresses, say X. This mac address is marked as local.
> But this bonding interface also has mac address Y. Now when a packet arrives with
> destination address Y, this address is not marked as local and the packet looks
> like it needs to be forwarded. This packet is then lost, which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> ***
> When the multiple addresses for bridge port approach failed to solve this issue
> due to STP I started to think other way to solve this. I returned to previous
> solution but tweaked one.
> 
> This patch solves the situation in the bonding without touching bridge code.
> For every incoming frame to bonding the destination address is compared to the
> current address of the slave device from which the packet came. If these two
> match, the destination address is replaced by the mac address of the master. This
> address is known by the bridge so the packet is delivered properly.
> 
> I experimentally tried that this works as good as searching through the slave
> list (v4 of this patch).
> 
> I was forced to create a new header because I need to use
> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
> linux/netdevice.h. I've hit some cross include issues. I think that it's good
> to have skb_bond_should_drop() in a separate file anyway.
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/include/linux/bonding.h b/include/linux/bonding.h
> new file mode 100644
> index 0000000..3081ddb
> --- /dev/null
> +++ b/include/linux/bonding.h
> @@ -0,0 +1,78 @@
> +/*
> + * include/linux/bonding.h
> + *
> + * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2
> + * as published by the Free Software Foundation.
> + *
> + * Bonding device helpers.
> + */
> +
> +#ifndef _LINUX_BONDING_H
> +#define _LINUX_BONDING_H
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <linux/if.h>
> +#include <linux/etherdevice.h>
> +#include <linux/if_ether.h>
> +
> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> +					      struct net_device *dev,
> +					      struct net_device *master)
> +{
> +	unsigned char *dest = eth_hdr(skb)->h_dest;
> +
> +	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
> +	    !compare_ether_addr_64bits(dest, dev->dev_addr))
> +		memcpy(dest, master->dev_addr, ETH_ALEN);

But couldn't we test skb->pkt_type == PACKET_HOST instead,
or is eth_type_trans() not yet called at this point?

I would suggest :

if (skb->pkt_type == PACKET_HOST)
	memcpy(dest, master->dev_addr, ETH_ALEN);

> +}
> +
> +/* On bonding slaves other than the currently active slave, suppress
> + * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
> + * ARP on active-backup slaves with arp_validate enabled.
> + */
> +static inline int skb_bond_should_drop(struct sk_buff *skb)
> +{
> +	struct net_device *dev = skb->dev;
> +	struct net_device *master = dev->master;
> +
> +	if (master) {
> +		if (master->priv_flags & IFF_MASTER_ARPMON)
> +			dev->last_rx = jiffies;
> +
> +		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
> +			/* Do address unmangle. The local destination address
> +			 * will be always the one master has. Provides the right
> +			 * functionality in a bridge.
> +			 */
> +			skb_bond_set_mac_by_master(skb, dev, master);
> +		}
> +
> +		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
> +			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
> +			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
> +				return 0;
> +
> +			if (master->priv_flags & IFF_MASTER_ALB) {
> +				if (skb->pkt_type != PACKET_BROADCAST &&
> +				    skb->pkt_type != PACKET_MULTICAST)
> +					return 0;
> +			}
> +			if (master->priv_flags & IFF_MASTER_8023AD &&
> +			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
> +				return 0;
> +
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}
> +
> +#endif /* __KERNEL__ */
> +
> +#endif	/* _LINUX_BONDING_H */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ae3c209..06e24ae 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1897,39 +1897,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
>  	dev->gso_max_size = size;
>  }
>  
> -/* On bonding slaves other than the currently active slave, suppress
> - * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
> - * ARP on active-backup slaves with arp_validate enabled.
> - */
> -static inline int skb_bond_should_drop(struct sk_buff *skb)
> -{
> -	struct net_device *dev = skb->dev;
> -	struct net_device *master = dev->master;
> -
> -	if (master) {
> -		if (master->priv_flags & IFF_MASTER_ARPMON)
> -			dev->last_rx = jiffies;
> -
> -		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
> -			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
> -			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
> -				return 0;
> -
> -			if (master->priv_flags & IFF_MASTER_ALB) {
> -				if (skb->pkt_type != PACKET_BROADCAST &&
> -				    skb->pkt_type != PACKET_MULTICAST)
> -					return 0;
> -			}
> -			if (master->priv_flags & IFF_MASTER_8023AD &&
> -			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
> -				return 0;
> -
> -			return 1;
> -		}
> -	}
> -	return 0;
> -}
> -
>  extern struct pernet_operations __net_initdata loopback_net_ops;
>  
>  static inline int dev_ethtool_get_settings(struct net_device *dev,
> diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
> index 7f7de1a..c6eae40 100644
> --- a/net/8021q/vlan_core.c
> +++ b/net/8021q/vlan_core.c
> @@ -2,6 +2,7 @@
>  #include <linux/netdevice.h>
>  #include <linux/if_vlan.h>
>  #include <linux/netpoll.h>
> +#include <linux/bonding.h>
>  #include "vlan.h"
>  
>  /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 241613f..221b43f 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -127,6 +127,7 @@
>  #include <linux/jhash.h>
>  #include <linux/random.h>
>  #include <trace/napi.h>
> +#include <linux/bonding.h>
>  
>  #include "net-sysfs.h"
>  
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
@ 2009-05-26 16:59       ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-05-26 16:59 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik, davem

Jiri Pirko a écrit :
> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
> will simultaneously use multiple MAC addresses across different slaves. When you
> put this kind of bond device into a bridge it will only add one of the mac addresses
> into a hash list of mac addresses, say X. This mac address is marked as local.
> But this bonding interface also has mac address Y. Now when a packet arrives with
> destination address Y, this address is not marked as local and the packet looks
> like it needs to be forwarded. This packet is then lost, which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> ***
> When the multiple addresses for bridge port approach failed to solve this issue
> due to STP I started to think other way to solve this. I returned to previous
> solution but tweaked one.
> 
> This patch solves the situation in the bonding without touching bridge code.
> For every incoming frame to bonding the destination address is compared to the
> current address of the slave device from which the packet came. If these two
> match, the destination address is replaced by the mac address of the master. This
> address is known by the bridge so the packet is delivered properly.
> 
> I experimentally tried that this works as good as searching through the slave
> list (v4 of this patch).
> 
> I was forced to create a new header because I need to use
> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
> linux/netdevice.h. I've hit some cross include issues. I think that it's good
> to have skb_bond_should_drop() in a separate file anyway.
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/include/linux/bonding.h b/include/linux/bonding.h
> new file mode 100644
> index 0000000..3081ddb
> --- /dev/null
> +++ b/include/linux/bonding.h
> @@ -0,0 +1,78 @@
> +/*
> + * include/linux/bonding.h
> + *
> + * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2
> + * as published by the Free Software Foundation.
> + *
> + * Bonding device helpers.
> + */
> +
> +#ifndef _LINUX_BONDING_H
> +#define _LINUX_BONDING_H
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <linux/if.h>
> +#include <linux/etherdevice.h>
> +#include <linux/if_ether.h>
> +
> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> +					      struct net_device *dev,
> +					      struct net_device *master)
> +{
> +	unsigned char *dest = eth_hdr(skb)->h_dest;
> +
> +	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
> +	    !compare_ether_addr_64bits(dest, dev->dev_addr))
> +		memcpy(dest, master->dev_addr, ETH_ALEN);

But couldn't we test skb->pkt_type == PACKET_HOST instead,
or is eth_type_trans() not yet called at this point?

I would suggest :

if (skb->pkt_type == PACKET_HOST)
	memcpy(dest, master->dev_addr, ETH_ALEN);

> +}
> +
> +/* On bonding slaves other than the currently active slave, suppress
> + * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
> + * ARP on active-backup slaves with arp_validate enabled.
> + */
> +static inline int skb_bond_should_drop(struct sk_buff *skb)
> +{
> +	struct net_device *dev = skb->dev;
> +	struct net_device *master = dev->master;
> +
> +	if (master) {
> +		if (master->priv_flags & IFF_MASTER_ARPMON)
> +			dev->last_rx = jiffies;
> +
> +		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
> +			/* Do address unmangle. The local destination address
> +			 * will be always the one master has. Provides the right
> +			 * functionality in a bridge.
> +			 */
> +			skb_bond_set_mac_by_master(skb, dev, master);
> +		}
> +
> +		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
> +			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
> +			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
> +				return 0;
> +
> +			if (master->priv_flags & IFF_MASTER_ALB) {
> +				if (skb->pkt_type != PACKET_BROADCAST &&
> +				    skb->pkt_type != PACKET_MULTICAST)
> +					return 0;
> +			}
> +			if (master->priv_flags & IFF_MASTER_8023AD &&
> +			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
> +				return 0;
> +
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}
> +
> +#endif /* __KERNEL__ */
> +
> +#endif	/* _LINUX_BONDING_H */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ae3c209..06e24ae 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1897,39 +1897,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
>  	dev->gso_max_size = size;
>  }
>  
> -/* On bonding slaves other than the currently active slave, suppress
> - * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
> - * ARP on active-backup slaves with arp_validate enabled.
> - */
> -static inline int skb_bond_should_drop(struct sk_buff *skb)
> -{
> -	struct net_device *dev = skb->dev;
> -	struct net_device *master = dev->master;
> -
> -	if (master) {
> -		if (master->priv_flags & IFF_MASTER_ARPMON)
> -			dev->last_rx = jiffies;
> -
> -		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
> -			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
> -			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
> -				return 0;
> -
> -			if (master->priv_flags & IFF_MASTER_ALB) {
> -				if (skb->pkt_type != PACKET_BROADCAST &&
> -				    skb->pkt_type != PACKET_MULTICAST)
> -					return 0;
> -			}
> -			if (master->priv_flags & IFF_MASTER_8023AD &&
> -			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
> -				return 0;
> -
> -			return 1;
> -		}
> -	}
> -	return 0;
> -}
> -
>  extern struct pernet_operations __net_initdata loopback_net_ops;
>  
>  static inline int dev_ethtool_get_settings(struct net_device *dev,
> diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
> index 7f7de1a..c6eae40 100644
> --- a/net/8021q/vlan_core.c
> +++ b/net/8021q/vlan_core.c
> @@ -2,6 +2,7 @@
>  #include <linux/netdevice.h>
>  #include <linux/if_vlan.h>
>  #include <linux/netpoll.h>
> +#include <linux/bonding.h>
>  #include "vlan.h"
>  
>  /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 241613f..221b43f 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -127,6 +127,7 @@
>  #include <linux/jhash.h>
>  #include <linux/random.h>
>  #include <trace/napi.h>
> +#include <linux/bonding.h>
>  
>  #include "net-sysfs.h"
>  
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
  2009-05-26 16:32       ` [Bridge] " Andy Gospodarek
@ 2009-05-27  8:25         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-27  8:25 UTC (permalink / raw)
  To: Andy Gospodarek
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1

Tue, May 26, 2009 at 06:32:42PM CEST, andy@greyhouse.net wrote:
>On Tue, May 26, 2009 at 05:17:17PM +0200, Jiri Pirko wrote:
>> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
>> will simultaneously use multiple MAC addresses across different slaves. When you
>> put this kind of bond device into a bridge it will only add one of the mac addresses
>> into a hash list of mac addresses, say X. This mac address is marked as local.
>> But this bonding interface also has mac address Y. Now when a packet arrives with
>> destination address Y, this address is not marked as local and the packet looks
>> like it needs to be forwarded. This packet is then lost, which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> 
>> ***
>> When the multiple addresses for bridge port approach failed to solve this issue
>> due to STP I started to think other way to solve this. I returned to previous
>> solution but tweaked one.
>> 
>> This patch solves the situation in the bonding without touching bridge code.
>> For every incoming frame to bonding the destination address is compared to the
>> current address of the slave device from which the packet came. If these two
>> match, the destination address is replaced by the mac address of the master. This
>> address is known by the bridge so the packet is delivered properly.
>
>Did you test this with a bond with more than 2 ports?  I ask because I
>might also expect a check against all the members of the bond (rather
>than simply the receiving device).

Yes, my testing machine has 3 interfaces for bond. Works fine.
>
>That check would be quite expensive for every frame and I think the
>scenario is quite unlikely based on the frequency of 'learning frames'
>sent by the alb code (so the switch connected to the host should have
>its forwarding database correct), but it might be something to think
>about in the future.

As you can see, my previous patch did the checking against all slaves. I tried this
experimentally: the address found by searching the list and dev->dev_addr differ only
when I unplug the cable and a mac swap occurs. Then one packet is lost. But there are
many lost packets during the unplug anyway, so....

>
>> I experimentally tried that this works as good as searching through the slave
>> list (v4 of this patch).
>> 
>> I was forced to create a new header because I need to use
>> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
>> linux/netdevice.h. I've hit some cross include issues. I think that it's good
>> to have skb_bond_should_drop() in a separate file anyway.
>> 
>> Jirka
>> 
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>
>This certainly won't cure all of the problems that arise with bonding
>and bridging interactions, but it's a step in the right direction.
>
>Acked-by: Andy Gospodarek <andy@greyhouse.net>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
@ 2009-05-27  8:25         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-27  8:25 UTC (permalink / raw)
  To: Andy Gospodarek
  Cc: fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik, dada1, davem

Tue, May 26, 2009 at 06:32:42PM CEST, andy@greyhouse.net wrote:
>On Tue, May 26, 2009 at 05:17:17PM +0200, Jiri Pirko wrote:
>> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
>> will simultaneously use multiple MAC addresses across different slaves. When you
>> put this kind of bond device into a bridge it will only add one of the mac addresses
>> into a hash list of mac addresses, say X. This mac address is marked as local.
>> But this bonding interface also has mac address Y. Now when a packet arrives with
>> destination address Y, this address is not marked as local and the packet looks
>> like it needs to be forwarded. This packet is then lost, which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> 
>> ***
>> When the multiple addresses for bridge port approach failed to solve this issue
>> due to STP I started to think other way to solve this. I returned to previous
>> solution but tweaked one.
>> 
>> This patch solves the situation in the bonding without touching bridge code.
>> For every incoming frame to bonding the destination address is compared to the
>> current address of the slave device from which the packet came. If these two
>> match, the destination address is replaced by the mac address of the master. This
>> address is known by the bridge so the packet is delivered properly.
>
>Did you test this with a bond with more than 2 ports?  I ask because I
>might also expect a check against all the members of the bond (rather
>than simply the receiving device).

Yes, my testing machine has 3 interfaces for bond. Works fine.
>
>That check would be quite expensive for every frame and I think the
>scenario is quite unlikely based on the frequency of 'learning frames'
>sent by the alb code (so the switch connected to the host should have
>its forwarding database correct), but it might be something to think
>about in the future.

As you can see, my previous patch did the checking against all slaves. I tried this
experimentally: the address found by searching the list and dev->dev_addr differ only
when I unplug the cable and a mac swap occurs. Then one packet is lost. But there are
many lost packets during the unplug anyway, so....

>
>> I experimentally tried that this works as good as searching through the slave
>> list (v4 of this patch).
>> 
>> I was forced to create a new header because I need to use
>> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
>> linux/netdevice.h. I've hit some cross include issues. I think that it's good
>> to have skb_bond_should_drop() in a separate file anyway.
>> 
>> Jirka
>> 
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>
>This certainly won't cure all of the problems that arise with bonding
>and bridging interactions, but it's a step in the right direction.
>
>Acked-by: Andy Gospodarek <andy@greyhouse.net>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
  2009-05-26 16:59       ` [Bridge] " Eric Dumazet
@ 2009-05-27  8:42         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-27  8:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt

Tue, May 26, 2009 at 06:59:53PM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
>> will simultaneously use multiple MAC addresses across different slaves. When you
>> put this kind of bond device into a bridge it will only add one of mac adresses
>> into a hash list of mac addresses, say X. This mac address is marked as local.
>> But this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> 
>> ***
>> When the multiple addresses for bridge port approach failed to solve this issue
>> due to STP I started to think other way to solve this. I returned to previous
>> solution but tweaked one.
>> 
>> This patch solves the situation in the bonding without touching bridge code.
>> For every incoming frame to bonding the destination address is compared to
>> current address of the slave device from which tha packet came. If these two
>> match destination address is replaced by mac address of the master. This address
>> is known by bridge so it is delivered properly.
>> 
>> I experimentally tried that this works as good as searching through the slave
>> list (v4 of this patch).
>> 
>> I was forced to create a new header because I need to use
>> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
>> linux/netdevice.h. I've hit some cross include issues. I think that it's good
>> to have skb_bond_should_drop() in a separate file anyway.
>> 
>> Jirka
>> 
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>> diff --git a/include/linux/bonding.h b/include/linux/bonding.h
>> new file mode 100644
>> index 0000000..3081ddb
>> --- /dev/null
>> +++ b/include/linux/bonding.h
>> @@ -0,0 +1,78 @@
>> +/*
>> + * include/linux/bonding.h
>> + *
>> + * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2
>> + * as published by the Free Software Foundation.
>> + *
>> + * Bonding device helpers.
>> + */
>> +
>> +#ifndef _LINUX_BONDING_H
>> +#define _LINUX_BONDING_H
>> +
>> +#ifdef __KERNEL__
>> +
>> +#include <linux/skbuff.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/if.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/if_ether.h>
>> +
>> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>> +					      struct net_device *dev,
>> +					      struct net_device *master)
>> +{
>> +	unsigned char *dest = eth_hdr(skb)->h_dest;
>> +
>> +	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
>> +	    !compare_ether_addr_64bits(dest, dev->dev_addr))
>> +		memcpy(dest, master->dev_addr, ETH_ALEN);
>
>But couldnt we test skb->pkt_type == PACKET_HOST instead,
>Or eth_type_trans() not yet called at this point ?
>
>I would suggest :
>
>if (skb->pkt_type == PACKET_HOST)
>	memcpy(dest, master->dev_addr, ETH_ALEN);

Yes Eric, you are right, good point.

eth_type_trans() is called in any driver before and
compare_ether_addr_64bits(dest, dev->dev_addr) is done there. So it's safe
to use this here (and save some ticks). I'm going to make a new patch and
test it.

		Jirka
>
>> +}
>> +
>> +/* On bonding slaves other than the currently active slave, suppress
>> + * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>> + * ARP on active-backup slaves with arp_validate enabled.
>> + */
>> +static inline int skb_bond_should_drop(struct sk_buff *skb)
>> +{
>> +	struct net_device *dev = skb->dev;
>> +	struct net_device *master = dev->master;
>> +
>> +	if (master) {
>> +		if (master->priv_flags & IFF_MASTER_ARPMON)
>> +			dev->last_rx = jiffies;
>> +
>> +		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
>> +			/* Do address unmangle. The local destination address
>> +			 * will be always the one master has. Provides the right
>> +			 * functionality in a bridge.
>> +			 */
>> +			skb_bond_set_mac_by_master(skb, dev, master);
>> +		}
>> +
>> +		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>> +			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>> +			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>> +				return 0;
>> +
>> +			if (master->priv_flags & IFF_MASTER_ALB) {
>> +				if (skb->pkt_type != PACKET_BROADCAST &&
>> +				    skb->pkt_type != PACKET_MULTICAST)
>> +					return 0;
>> +			}
>> +			if (master->priv_flags & IFF_MASTER_8023AD &&
>> +			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>> +				return 0;
>> +
>> +			return 1;
>> +		}
>> +	}
>> +	return 0;
>> +}
>> +
>> +#endif /* __KERNEL__ */
>> +
>> +#endif	/* _LINUX_BONDING_H */
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index ae3c209..06e24ae 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -1897,39 +1897,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
>>  	dev->gso_max_size = size;
>>  }
>>  
>> -/* On bonding slaves other than the currently active slave, suppress
>> - * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>> - * ARP on active-backup slaves with arp_validate enabled.
>> - */
>> -static inline int skb_bond_should_drop(struct sk_buff *skb)
>> -{
>> -	struct net_device *dev = skb->dev;
>> -	struct net_device *master = dev->master;
>> -
>> -	if (master) {
>> -		if (master->priv_flags & IFF_MASTER_ARPMON)
>> -			dev->last_rx = jiffies;
>> -
>> -		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>> -			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>> -			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>> -				return 0;
>> -
>> -			if (master->priv_flags & IFF_MASTER_ALB) {
>> -				if (skb->pkt_type != PACKET_BROADCAST &&
>> -				    skb->pkt_type != PACKET_MULTICAST)
>> -					return 0;
>> -			}
>> -			if (master->priv_flags & IFF_MASTER_8023AD &&
>> -			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>> -				return 0;
>> -
>> -			return 1;
>> -		}
>> -	}
>> -	return 0;
>> -}
>> -
>>  extern struct pernet_operations __net_initdata loopback_net_ops;
>>  
>>  static inline int dev_ethtool_get_settings(struct net_device *dev,
>> diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
>> index 7f7de1a..c6eae40 100644
>> --- a/net/8021q/vlan_core.c
>> +++ b/net/8021q/vlan_core.c
>> @@ -2,6 +2,7 @@
>>  #include <linux/netdevice.h>
>>  #include <linux/if_vlan.h>
>>  #include <linux/netpoll.h>
>> +#include <linux/bonding.h>
>>  #include "vlan.h"
>>  
>>  /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 241613f..221b43f 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -127,6 +127,7 @@
>>  #include <linux/jhash.h>
>>  #include <linux/random.h>
>>  #include <trace/napi.h>
>> +#include <linux/bonding.h>
>>  
>>  #include "net-sysfs.h"
>>  
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 
>> 
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
@ 2009-05-27  8:42         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-27  8:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik, davem

Tue, May 26, 2009 at 06:59:53PM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
>> will simultaneously use multiple MAC addresses across different slaves. When you
>> put this kind of bond device into a bridge it will only add one of mac adresses
>> into a hash list of mac addresses, say X. This mac address is marked as local.
>> But this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> 
>> ***
>> When the multiple addresses for bridge port approach failed to solve this issue
>> due to STP I started to think other way to solve this. I returned to previous
>> solution but tweaked one.
>> 
>> This patch solves the situation in the bonding without touching bridge code.
>> For every incoming frame to bonding the destination address is compared to
>> current address of the slave device from which tha packet came. If these two
>> match destination address is replaced by mac address of the master. This address
>> is known by bridge so it is delivered properly.
>> 
>> I experimentally tried that this works as good as searching through the slave
>> list (v4 of this patch).
>> 
>> I was forced to create a new header because I need to use
>> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
>> linux/netdevice.h. I've hit some cross include issues. I think that it's good
>> to have skb_bond_should_drop() in a separate file anyway.
>> 
>> Jirka
>> 
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>> diff --git a/include/linux/bonding.h b/include/linux/bonding.h
>> new file mode 100644
>> index 0000000..3081ddb
>> --- /dev/null
>> +++ b/include/linux/bonding.h
>> @@ -0,0 +1,78 @@
>> +/*
>> + * include/linux/bonding.h
>> + *
>> + * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2
>> + * as published by the Free Software Foundation.
>> + *
>> + * Bonding device helpers.
>> + */
>> +
>> +#ifndef _LINUX_BONDING_H
>> +#define _LINUX_BONDING_H
>> +
>> +#ifdef __KERNEL__
>> +
>> +#include <linux/skbuff.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/if.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/if_ether.h>
>> +
>> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>> +					      struct net_device *dev,
>> +					      struct net_device *master)
>> +{
>> +	unsigned char *dest = eth_hdr(skb)->h_dest;
>> +
>> +	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
>> +	    !compare_ether_addr_64bits(dest, dev->dev_addr))
>> +		memcpy(dest, master->dev_addr, ETH_ALEN);
>
>But couldnt we test skb->pkt_type == PACKET_HOST instead,
>Or eth_type_trans() not yet called at this point ?
>
>I would suggest :
>
>if (skb->pkt_type == PACKET_HOST)
>	memcpy(dest, master->dev_addr, ETH_ALEN);

Yes Eric, you are right, good point.

eth_type_trans() is called in any driver before and
compare_ether_addr_64bits(dest, dev->dev_addr) is done there. So it's safe
to use this here (and save some ticks). I'm going to make a new patch and
test it.

		Jirka
>
>> +}
>> +
>> +/* On bonding slaves other than the currently active slave, suppress
>> + * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>> + * ARP on active-backup slaves with arp_validate enabled.
>> + */
>> +static inline int skb_bond_should_drop(struct sk_buff *skb)
>> +{
>> +	struct net_device *dev = skb->dev;
>> +	struct net_device *master = dev->master;
>> +
>> +	if (master) {
>> +		if (master->priv_flags & IFF_MASTER_ARPMON)
>> +			dev->last_rx = jiffies;
>> +
>> +		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
>> +			/* Do address unmangle. The local destination address
>> +			 * will be always the one master has. Provides the right
>> +			 * functionality in a bridge.
>> +			 */
>> +			skb_bond_set_mac_by_master(skb, dev, master);
>> +		}
>> +
>> +		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>> +			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>> +			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>> +				return 0;
>> +
>> +			if (master->priv_flags & IFF_MASTER_ALB) {
>> +				if (skb->pkt_type != PACKET_BROADCAST &&
>> +				    skb->pkt_type != PACKET_MULTICAST)
>> +					return 0;
>> +			}
>> +			if (master->priv_flags & IFF_MASTER_8023AD &&
>> +			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>> +				return 0;
>> +
>> +			return 1;
>> +		}
>> +	}
>> +	return 0;
>> +}
>> +
>> +#endif /* __KERNEL__ */
>> +
>> +#endif	/* _LINUX_BONDING_H */
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index ae3c209..06e24ae 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -1897,39 +1897,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
>>  	dev->gso_max_size = size;
>>  }
>>  
>> -/* On bonding slaves other than the currently active slave, suppress
>> - * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>> - * ARP on active-backup slaves with arp_validate enabled.
>> - */
>> -static inline int skb_bond_should_drop(struct sk_buff *skb)
>> -{
>> -	struct net_device *dev = skb->dev;
>> -	struct net_device *master = dev->master;
>> -
>> -	if (master) {
>> -		if (master->priv_flags & IFF_MASTER_ARPMON)
>> -			dev->last_rx = jiffies;
>> -
>> -		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>> -			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>> -			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>> -				return 0;
>> -
>> -			if (master->priv_flags & IFF_MASTER_ALB) {
>> -				if (skb->pkt_type != PACKET_BROADCAST &&
>> -				    skb->pkt_type != PACKET_MULTICAST)
>> -					return 0;
>> -			}
>> -			if (master->priv_flags & IFF_MASTER_8023AD &&
>> -			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
>> -				return 0;
>> -
>> -			return 1;
>> -		}
>> -	}
>> -	return 0;
>> -}
>> -
>>  extern struct pernet_operations __net_initdata loopback_net_ops;
>>  
>>  static inline int dev_ethtool_get_settings(struct net_device *dev,
>> diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
>> index 7f7de1a..c6eae40 100644
>> --- a/net/8021q/vlan_core.c
>> +++ b/net/8021q/vlan_core.c
>> @@ -2,6 +2,7 @@
>>  #include <linux/netdevice.h>
>>  #include <linux/if_vlan.h>
>>  #include <linux/netpoll.h>
>> +#include <linux/bonding.h>
>>  #include "vlan.h"
>>  
>>  /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 241613f..221b43f 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -127,6 +127,7 @@
>>  #include <linux/jhash.h>
>>  #include <linux/random.h>
>>  #include <trace/napi.h>
>> +#include <linux/bonding.h>
>>  
>>  #include "net-sysfs.h"
>>  
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 
>> 
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
  2009-05-26 15:17     ` [Bridge] " Jiri Pirko
@ 2009-05-27 13:53       ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-27 13:53 UTC (permalink / raw)
  To: netdev
  Cc: jgarzik, davem, shemminger, bridge, fubar, bonding-devel, kaber,
	mschmidt, dada1, andy

[PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2

(updated)
changes v4.1 -> v4.2
- use a skb->pkt_type == PACKET_HOST check rather than comparing skb dest addr
  against skb->dev->dev_addr

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same MAC address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge, it will only add one of the MAC
addresses into the hash list of MAC addresses, say X. This MAC address is marked
as local. But this bonding interface also has MAC address Y. Now when a packet
arrives with destination address Y, this address is not marked as local and the
packet looks like it needs to be forwarded. This packet is then lost, which is
wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

***
When the multiple-addresses-per-bridge-port approach failed to solve this issue
due to STP, I started to think of another way to solve it. I returned to the
previous solution, but a tweaked one.

This patch solves the situation in the bonding code without touching bridge code.
For every frame incoming to the bond, the destination address is compared to the
current address of the slave device from which the packet came. If these two
match, the destination address is replaced by the MAC address of the master. This
address is known by the bridge, so the frame is delivered properly. Note that the
comparison is not made directly; skb->pkt_type == PACKET_HOST is used instead.
This is "set" previously in eth_type_trans().

I verified experimentally that this works as well as searching through the slave
list (v4 of this patch).

I was forced to create a new header because I need to use
compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
linux/netdevice.h. I've hit some cross include issues. I think that it's good
to have skb_bond_should_drop() in a separate file anyway.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/include/linux/bonding.h b/include/linux/bonding.h
new file mode 100644
index 0000000..e50939d
--- /dev/null
+++ b/include/linux/bonding.h
@@ -0,0 +1,78 @@
+/*
+ * include/linux/bonding.h
+ *
+ * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Bonding device helpers.
+ */
+
+#ifndef _LINUX_BONDING_H
+#define _LINUX_BONDING_H
+
+#ifdef __KERNEL__
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+					      struct net_device *master)
+{
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+
+	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
+	    (skb->pkt_type == PACKET_HOST))
+		memcpy(dest, master->dev_addr, ETH_ALEN);
+}
+
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+static inline int skb_bond_should_drop(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct net_device *master = dev->master;
+
+	if (master) {
+		if (master->priv_flags & IFF_MASTER_ARPMON)
+			dev->last_rx = jiffies;
+
+		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+			/* Do address unmangle. The local destination address
+			 * will be always the one master has. Provides the right
+			 * functionality in a bridge.
+			 */
+			skb_bond_set_mac_by_master(skb, master);
+		}
+
+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+				return 0;
+
+			if (master->priv_flags & IFF_MASTER_ALB) {
+				if (skb->pkt_type != PACKET_BROADCAST &&
+				    skb->pkt_type != PACKET_MULTICAST)
+					return 0;
+			}
+			if (master->priv_flags & IFF_MASTER_8023AD &&
+			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+				return 0;
+
+			return 1;
+		}
+	}
+	return 0;
+}
+
+#endif /* __KERNEL__ */
+
+#endif	/* _LINUX_BONDING_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 586b71f..6543b2d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1903,39 +1903,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static inline int skb_bond_should_drop(struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-	struct net_device *master = dev->master;
-
-	if (master) {
-		if (master->priv_flags & IFF_MASTER_ARPMON)
-			dev->last_rx = jiffies;
-
-		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-				return 0;
-
-			if (master->priv_flags & IFF_MASTER_ALB) {
-				if (skb->pkt_type != PACKET_BROADCAST &&
-				    skb->pkt_type != PACKET_MULTICAST)
-					return 0;
-			}
-			if (master->priv_flags & IFF_MASTER_8023AD &&
-			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-				return 0;
-
-			return 1;
-		}
-	}
-	return 0;
-}
-
 extern struct pernet_operations __net_initdata loopback_net_ops;
 
 static inline int dev_ethtool_get_settings(struct net_device *dev,
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 7f7de1a..c6eae40 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -2,6 +2,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_vlan.h>
 #include <linux/netpoll.h>
+#include <linux/bonding.h>
 #include "vlan.h"
 
 /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
diff --git a/net/core/dev.c b/net/core/dev.c
index 5eb3e48..56572b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -127,6 +127,7 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/napi.h>
+#include <linux/bonding.h>
 
 #include "net-sysfs.h"
 

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
@ 2009-05-27 13:53       ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-27 13:53 UTC (permalink / raw)
  To: netdev; +Cc: fubar, jgarzik, bridge, mschmidt, bonding-devel, dada1, davem

[PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2

(updated)
changes v4.1 -> v4.2
- use a skb->pkt_type == PACKET_HOST check rather than comparing skb dest addr
  against skb->dev->dev_addr

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same MAC address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge, it will only add one of the MAC
addresses into the hash list of MAC addresses, say X. This MAC address is marked
as local. But this bonding interface also has MAC address Y. Now when a packet
arrives with destination address Y, this address is not marked as local and the
packet looks like it needs to be forwarded. This packet is then lost, which is
wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

***
When the multiple-addresses-per-bridge-port approach failed to solve this issue
due to STP, I started to think of another way to solve it. I returned to the
previous solution, but a tweaked one.

This patch solves the situation in the bonding code without touching bridge code.
For every frame incoming to the bond, the destination address is compared to the
current address of the slave device from which the packet came. If these two
match, the destination address is replaced by the MAC address of the master. This
address is known by the bridge, so the frame is delivered properly. Note that the
comparison is not made directly; skb->pkt_type == PACKET_HOST is used instead.
This is "set" previously in eth_type_trans().

I verified experimentally that this works as well as searching through the slave
list (v4 of this patch).

I was forced to create a new header because I need to use
compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
linux/netdevice.h. I've hit some cross include issues. I think that it's good
to have skb_bond_should_drop() in a separate file anyway.

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/include/linux/bonding.h b/include/linux/bonding.h
new file mode 100644
index 0000000..e50939d
--- /dev/null
+++ b/include/linux/bonding.h
@@ -0,0 +1,78 @@
+/*
+ * include/linux/bonding.h
+ *
+ * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Bonding device helpers.
+ */
+
+#ifndef _LINUX_BONDING_H
+#define _LINUX_BONDING_H
+
+#ifdef __KERNEL__
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+					      struct net_device *master)
+{
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+
+	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
+	    (skb->pkt_type == PACKET_HOST))
+		memcpy(dest, master->dev_addr, ETH_ALEN);
+}
+
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+static inline int skb_bond_should_drop(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct net_device *master = dev->master;
+
+	if (master) {
+		if (master->priv_flags & IFF_MASTER_ARPMON)
+			dev->last_rx = jiffies;
+
+		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+			/* Do address unmangle. The local destination address
+			 * will be always the one master has. Provides the right
+			 * functionality in a bridge.
+			 */
+			skb_bond_set_mac_by_master(skb, master);
+		}
+
+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+				return 0;
+
+			if (master->priv_flags & IFF_MASTER_ALB) {
+				if (skb->pkt_type != PACKET_BROADCAST &&
+				    skb->pkt_type != PACKET_MULTICAST)
+					return 0;
+			}
+			if (master->priv_flags & IFF_MASTER_8023AD &&
+			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+				return 0;
+
+			return 1;
+		}
+	}
+	return 0;
+}
+
+#endif /* __KERNEL__ */
+
+#endif	/* _LINUX_BONDING_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 586b71f..6543b2d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1903,39 +1903,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-static inline int skb_bond_should_drop(struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-	struct net_device *master = dev->master;
-
-	if (master) {
-		if (master->priv_flags & IFF_MASTER_ARPMON)
-			dev->last_rx = jiffies;
-
-		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-				return 0;
-
-			if (master->priv_flags & IFF_MASTER_ALB) {
-				if (skb->pkt_type != PACKET_BROADCAST &&
-				    skb->pkt_type != PACKET_MULTICAST)
-					return 0;
-			}
-			if (master->priv_flags & IFF_MASTER_8023AD &&
-			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-				return 0;
-
-			return 1;
-		}
-	}
-	return 0;
-}
-
 extern struct pernet_operations __net_initdata loopback_net_ops;
 
 static inline int dev_ethtool_get_settings(struct net_device *dev,
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 7f7de1a..c6eae40 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -2,6 +2,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_vlan.h>
 #include <linux/netpoll.h>
+#include <linux/bonding.h>
 #include "vlan.h"
 
 /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
diff --git a/net/core/dev.c b/net/core/dev.c
index 5eb3e48..56572b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -127,6 +127,7 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/napi.h>
+#include <linux/bonding.h>
 
 #include "net-sysfs.h"
 

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
  2009-05-27 13:53       ` [Bridge] " Jiri Pirko
@ 2009-05-27 14:39         ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-05-27 14:39 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, andy

Jiri Pirko a écrit :
> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
> 
> (updated)
> changes v4.1 -> v4.2
> - use skb->pkt_type == PACKET_HOST compare rather then comparing skb dest addr
>   against skb->dev->dev_addr
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
> will simultaneously use multiple MAC addresses across different slaves. When you
> put this kind of bond device into a bridge it will only add one of mac adresses
> into a hash list of mac addresses, say X. This mac address is marked as local.
> But this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> ***
> When the multiple addresses for bridge port approach failed to solve this issue
> due to STP I started to think other way to solve this. I returned to previous
> solution but tweaked one.
> 
> This patch solves the situation in the bonding without touching bridge code.
> For every incoming frame to bonding the destination address is compared to
> current address of the slave device from which tha packet came. If these two
> match destination address is replaced by mac address of the master. This address
> is known by bridge so it is delivered properly. Note that the comparsion is not
> made directly, it's used skb->pkt_type == PACKET_HOST instead. This is "set"
> previously in eth_type_trans().
> 
> I experimentally tried that this works as good as searching through the slave
> list (v4 of this patch).
> 
> I was forced to create a new header because I need to use
> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
> linux/netdevice.h. I've hit some cross include issues. I think that it's good
> to have skb_bond_should_drop() in a separate file anyway.
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/include/linux/bonding.h b/include/linux/bonding.h
> new file mode 100644
> index 0000000..e50939d
> --- /dev/null
> +++ b/include/linux/bonding.h
> @@ -0,0 +1,78 @@
> +/*
> + * include/linux/bonding.h
> + *
> + * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2
> + * as published by the Free Software Foundation.
> + *
> + * Bonding device helpers.
> + */
> +
> +#ifndef _LINUX_BONDING_H
> +#define _LINUX_BONDING_H
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <linux/if.h>
> +#include <linux/etherdevice.h>
> +#include <linux/if_ether.h>
> +#include <linux/if_packet.h>
> +
> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> +					      struct net_device *master)
> +{
> +	unsigned char *dest = eth_hdr(skb)->h_dest;
> +
> +	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
> +	    (skb->pkt_type == PACKET_HOST))
> +		memcpy(dest, master->dev_addr, ETH_ALEN);

Just overwriting the dest would be faster, and avoids having
to include <linux/etherdevice.h>; maybe a new include file
could be avoided?

If it is already the master->dev_addr, then memcpy() is a no-op.
If it wasn't the master->dev_addr, then memcpy() does what you wanted.

You can also give a hint to gcc, as h_dest is guaranteed to be 16-bit aligned:

static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
					      struct net_device *master)
{
	if (skb->pkt_type == PACKET_HOST) {
		u16 *dest = (u16 *)eth_hdr(skb)->h_dest;

		memcpy(dest, master->dev_addr, ETH_ALEN);
	}
}

Compiler will emit better code for memcpy() on some arches.
(not on x86, as it already does one 32bit and one 16bit move)



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
@ 2009-05-27 14:39         ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-05-27 14:39 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: fubar, netdev, bridge, mschmidt, bonding-devel, jgarzik, davem

Jiri Pirko a écrit :
> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
> 
> (updated)
> changes v4.1 -> v4.2
> - use skb->pkt_type == PACKET_HOST compare rather then comparing skb dest addr
>   against skb->dev->dev_addr
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
> will simultaneously use multiple MAC addresses across different slaves. When you
> put this kind of bond device into a bridge it will only add one of mac adresses
> into a hash list of mac addresses, say X. This mac address is marked as local.
> But this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> ***
> When the multiple addresses for bridge port approach failed to solve this issue
> due to STP I started to think other way to solve this. I returned to previous
> solution but tweaked one.
> 
> This patch solves the situation in the bonding without touching bridge code.
> For every incoming frame to bonding the destination address is compared to
> current address of the slave device from which tha packet came. If these two
> match destination address is replaced by mac address of the master. This address
> is known by bridge so it is delivered properly. Note that the comparsion is not
> made directly, it's used skb->pkt_type == PACKET_HOST instead. This is "set"
> previously in eth_type_trans().
> 
> I experimentally tried that this works as good as searching through the slave
> list (v4 of this patch).
> 
> I was forced to create a new header because I need to use
> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
> linux/netdevice.h. I've hit some cross include issues. I think that it's good
> to have skb_bond_should_drop() in a separate file anyway.
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/include/linux/bonding.h b/include/linux/bonding.h
> new file mode 100644
> index 0000000..e50939d
> --- /dev/null
> +++ b/include/linux/bonding.h
> @@ -0,0 +1,78 @@
> +/*
> + * include/linux/bonding.h
> + *
> + * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2
> + * as published by the Free Software Foundation.
> + *
> + * Bonding device helpers.
> + */
> +
> +#ifndef _LINUX_BONDING_H
> +#define _LINUX_BONDING_H
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <linux/if.h>
> +#include <linux/etherdevice.h>
> +#include <linux/if_ether.h>
> +#include <linux/if_packet.h>
> +
> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> +					      struct net_device *master)
> +{
> +	unsigned char *dest = eth_hdr(skb)->h_dest;
> +
> +	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
> +	    (skb->pkt_type == PACKET_HOST))
> +		memcpy(dest, master->dev_addr, ETH_ALEN);

Just overwriting the dest would be faster, and avoids having
to include <linux/etherdevice.h>; maybe a new include file
could be avoided?

If it is already the master->dev_addr, then memcpy() is a no-op.
If it wasn't the master->dev_addr, then memcpy() does what you wanted.

You can also give a hint to gcc, as h_dest is guaranteed to be 16-bit aligned:

static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
					      struct net_device *master)
{
	if (skb->pkt_type == PACKET_HOST) {
		u16 *dest = (u16 *)eth_hdr(skb)->h_dest;

		memcpy(dest, master->dev_addr, ETH_ALEN);
	}
}

Compiler will emit better code for memcpy() on some arches.
(not on x86, as it already does one 32bit and one 16bit move)



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
  2009-05-27 14:39         ` [Bridge] " Eric Dumazet
@ 2009-05-28  9:57           ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-28  9:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, andy, oleg

Wed, May 27, 2009 at 04:39:22PM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
>> 
>> (updated)
>> changes v4.1 -> v4.2
>> - use skb->pkt_type == PACKET_HOST compare rather then comparing skb dest addr
>>   against skb->dev->dev_addr
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
>> will simultaneously use multiple MAC addresses across different slaves. When you
>> put this kind of bond device into a bridge it will only add one of mac adresses
>> into a hash list of mac addresses, say X. This mac address is marked as local.
>> But this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> 
>> ***
>> When the multiple addresses for bridge port approach failed to solve this issue
>> due to STP I started to think other way to solve this. I returned to previous
>> solution but tweaked one.
>> 
>> This patch solves the situation in the bonding without touching bridge code.
>> For every incoming frame to bonding the destination address is compared to
>> current address of the slave device from which tha packet came. If these two
>> match destination address is replaced by mac address of the master. This address
>> is known by bridge so it is delivered properly. Note that the comparsion is not
>> made directly, it's used skb->pkt_type == PACKET_HOST instead. This is "set"
>> previously in eth_type_trans().
>> 
>> I experimentally tried that this works as good as searching through the slave
>> list (v4 of this patch).
>> 
>> I was forced to create a new header because I need to use
>> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
>> linux/netdevice.h. I've hit some cross include issues. I think that it's good
>> to have skb_bond_should_drop() in a separate file anyway.
>> 
>> Jirka
>> 
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>> diff --git a/include/linux/bonding.h b/include/linux/bonding.h
>> new file mode 100644
>> index 0000000..e50939d
>> --- /dev/null
>> +++ b/include/linux/bonding.h
>> @@ -0,0 +1,78 @@
>> +/*
>> + * include/linux/bonding.h
>> + *
>> + * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2
>> + * as published by the Free Software Foundation.
>> + *
>> + * Bonding device helpers.
>> + */
>> +
>> +#ifndef _LINUX_BONDING_H
>> +#define _LINUX_BONDING_H
>> +
>> +#ifdef __KERNEL__
>> +
>> +#include <linux/skbuff.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/if.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/if_ether.h>
>> +#include <linux/if_packet.h>
>> +
>> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>> +					      struct net_device *master)
>> +{
>> +	unsigned char *dest = eth_hdr(skb)->h_dest;
>> +
>> +	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
>> +	    (skb->pkt_type == PACKET_HOST))
>> +		memcpy(dest, master->dev_addr, ETH_ALEN);
>
>Just overwriting the dest would be faster, and avoids
>to include <linux/etherdevice.h>, maybe a new include file
>could be avoided ?
>
>If it is already the master->dev_addr, then memcpy() is a no-op
>If it wasnt the master->dev_addr, then memcpy() does what you wanted.
>
>You can also give a hint to gcc as h_dest is guaranteed to be 16 bit aligned
>
>static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>					      struct net_device *master)
>{
>	if (skb->pkt_type == PACKET_HOST) {
>		u16 *dest = (u16 *)eth_hdr(skb)->h_dest;
>
>		memcpy(dest, master->dev_addr, ETH_ALEN);
>	}
>}
>
>Compiler will emit better code for memcpy() on some arches.
>(not on x86, as it already does one 32bit and one 16bit move)

Okay, I discussed the compare-vs-memcpy question with Oleg (cc'ed) and he also
agrees with doing this your way. I'll make a patch, test it and post it soon.

Thanks Eric.
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
@ 2009-05-28  9:57           ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-28  9:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: fubar, netdev, bridge, oleg, mschmidt, bonding-devel, jgarzik, davem

Wed, May 27, 2009 at 04:39:22PM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2
>> 
>> (updated)
>> changes v4.1 -> v4.2
>> - use skb->pkt_type == PACKET_HOST compare rather then comparing skb dest addr
>>   against skb->dev->dev_addr
>> 
>> Hi all.
>> 
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> 
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
>> will simultaneously use multiple MAC addresses across different slaves. When you
>> put this kind of bond device into a bridge it will only add one of mac adresses
>> into a hash list of mac addresses, say X. This mac address is marked as local.
>> But this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>> 
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> 
>> ***
>> When the multiple addresses for bridge port approach failed to solve this issue
>> due to STP I started to think other way to solve this. I returned to previous
>> solution but tweaked one.
>> 
>> This patch solves the situation in the bonding without touching bridge code.
>> For every incoming frame to bonding the destination address is compared to
>> current address of the slave device from which tha packet came. If these two
>> match destination address is replaced by mac address of the master. This address
>> is known by bridge so it is delivered properly. Note that the comparsion is not
>> made directly, it's used skb->pkt_type == PACKET_HOST instead. This is "set"
>> previously in eth_type_trans().
>> 
>> I experimentally tried that this works as good as searching through the slave
>> list (v4 of this patch).
>> 
>> I was forced to create a new header because I need to use
>> compare_ether_addr_64bits() (defined in linux/etherdevice.h) in
>> linux/netdevice.h. I've hit some cross include issues. I think that it's good
>> to have skb_bond_should_drop() in a separate file anyway.
>> 
>> Jirka
>> 
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>> diff --git a/include/linux/bonding.h b/include/linux/bonding.h
>> new file mode 100644
>> index 0000000..e50939d
>> --- /dev/null
>> +++ b/include/linux/bonding.h
>> @@ -0,0 +1,78 @@
>> +/*
>> + * include/linux/bonding.h
>> + *
>> + * Copyright (C) 2009 Jiri Pirko <jpirko@redhat.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2
>> + * as published by the Free Software Foundation.
>> + *
>> + * Bonding device helpers.
>> + */
>> +
>> +#ifndef _LINUX_BONDING_H
>> +#define _LINUX_BONDING_H
>> +
>> +#ifdef __KERNEL__
>> +
>> +#include <linux/skbuff.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/if.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/if_ether.h>
>> +#include <linux/if_packet.h>
>> +
>> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>> +					      struct net_device *master)
>> +{
>> +	unsigned char *dest = eth_hdr(skb)->h_dest;
>> +
>> +	if (compare_ether_addr_64bits(dest, master->dev_addr) &&
>> +	    (skb->pkt_type == PACKET_HOST))
>> +		memcpy(dest, master->dev_addr, ETH_ALEN);
>
>Just overwriting the dest would be faster, and avoids
>to include <linux/etherdevice.h>, maybe a new include file
>could be avoided ?
>
>If it is already the master->dev_addr, then memcpy() is a no-op
>If it wasnt the master->dev_addr, then memcpy() does what you wanted.
>
>You can also give a hint to gcc as h_dest is guaranteed to be 16 bit aligned
>
>static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>					      struct net_device *master)
>{
>	if (skb->pkt_type == PACKET_HOST) {
>		u16 *dest = (u16 *)eth_hdr(skb)->h_dest;
>
>		memcpy(dest, master->dev_addr, ETH_ALEN);
>	}
>}
>
>Compiler will emit better code for memcpy() on some arches.
>(not on x86, as it already does one 32bit and one 16bit move)

Okay, I discussed the compare-vs-memcpy question with Oleg (cc'ed) and he also
agrees with doing this your way. I'll make a patch, test it and post it soon.

Thanks Eric.
>
>

^ permalink raw reply	[flat|nested] 214+ messages in thread

* [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
  2009-05-27 13:53       ` [Bridge] " Jiri Pirko
@ 2009-05-28 11:05         ` Jiri Pirko
  -1 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-28 11:05 UTC (permalink / raw)
  To: netdev
  Cc: jgarzik, davem, shemminger, bridge, fubar, bonding-devel, kaber,
	mschmidt, dada1, andy, oleg

[PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3

(updated)
changes v4.2 -> v4.3
- memcpy the address always, not just in case it differs from master->dev_addr
- compare_ether_addr_64bits() is not used, so there is no direct need to make a
  new header file (I think it would be good to have bond stuff in a separate
  file anyway).

changes v4.1 -> v4.2
- use an skb->pkt_type == PACKET_HOST check rather than comparing the skb dest
  addr against skb->dev->dev_addr

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same MAC address for all enslaved devices (except fail_over_mac). Only
balance-alb will simultaneously use multiple MAC addresses across different
slaves. When you put this kind of bond device into a bridge, the bridge will
only add one of the MAC addresses, say X, into its hash list of MAC addresses.
This MAC address is marked as local. But the bonding interface also has MAC
address Y. Now when a packet arrives with destination address Y, this address is
not marked as local and the packet looks like it needs to be forwarded. This
packet is then lost, which is wrong.

Note that interfaces can be added to and removed from the bond while it is in a
bridge.

***
When the multiple-addresses-per-bridge-port approach failed to solve this issue
due to STP, I started to think of another way to solve it. I returned to the
previous solution, but a tweaked one.

This patch solves the situation in bonding without touching the bridge code.
For every frame incoming to a bond, the destination address is compared to the
current address of the slave device from which the packet came. If these two
match, the destination address is replaced by the MAC address of the master.
This address is known by the bridge, so the frame is delivered properly. Note
that the comparison is not made directly; skb->pkt_type == PACKET_HOST is used
instead. This is "set" previously in eth_type_trans().

I verified experimentally that this works as well as searching through the
slave list (v4 of this patch).

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 371ece5..9b4db94 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1920,6 +1920,16 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+					      struct net_device *master)
+{
+	if (skb->pkt_type == PACKET_HOST) {
+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+		memcpy(dest, master->dev_addr, ETH_ALEN);
+	}
+}
+
 /* On bonding slaves other than the currently active slave, suppress
  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
  * ARP on active-backup slaves with arp_validate enabled.
@@ -1933,6 +1943,14 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 		if (master->priv_flags & IFF_MASTER_ARPMON)
 			dev->last_rx = jiffies;
 
+		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+			/* Do address unmangle. The local destination address
+			 * will be always the one master has. Provides the right
+			 * functionality in a bridge.
+			 */
+			skb_bond_set_mac_by_master(skb, master);
+		}
+
 		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
 			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
 			    skb->protocol == __cpu_to_be16(ETH_P_ARP))

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
@ 2009-05-28 11:05         ` Jiri Pirko
  0 siblings, 0 replies; 214+ messages in thread
From: Jiri Pirko @ 2009-05-28 11:05 UTC (permalink / raw)
  To: netdev
  Cc: fubar, jgarzik, bridge, oleg, mschmidt, bonding-devel, dada1, davem

[PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3

(updated)
changes v4.2 -> v4.3
- memcpy the address always, not just in case it differs from master->dev_addr
- compare_ether_addr_64bits() is not used, so there is no direct need to make a
  new header file (I think it would be good to have bond stuff in a separate
  file anyway).

changes v4.1 -> v4.2
- use an skb->pkt_type == PACKET_HOST check rather than comparing the skb dest
  addr against skb->dev->dev_addr

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, the bonding interface uses the
same MAC address for all enslaved devices (except fail_over_mac). Only
balance-alb will simultaneously use multiple MAC addresses across different
slaves. When you put this kind of bond device into a bridge, the bridge will
only add one of the MAC addresses, say X, into its hash list of MAC addresses.
This MAC address is marked as local. But the bonding interface also has MAC
address Y. Now when a packet arrives with destination address Y, this address is
not marked as local and the packet looks like it needs to be forwarded. This
packet is then lost, which is wrong.

Note that interfaces can be added to and removed from the bond while it is in a
bridge.

***
When the multiple-addresses-per-bridge-port approach failed to solve this issue
due to STP, I started to think of another way to solve it. I returned to the
previous solution, but a tweaked one.

This patch solves the situation in bonding without touching the bridge code.
For every frame incoming to a bond, the destination address is compared to the
current address of the slave device from which the packet came. If these two
match, the destination address is replaced by the MAC address of the master.
This address is known by the bridge, so the frame is delivered properly. Note
that the comparison is not made directly; skb->pkt_type == PACKET_HOST is used
instead. This is "set" previously in eth_type_trans().

I verified experimentally that this works as well as searching through the
slave list (v4 of this patch).

Jirka


Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 371ece5..9b4db94 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1920,6 +1920,16 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+					      struct net_device *master)
+{
+	if (skb->pkt_type == PACKET_HOST) {
+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+		memcpy(dest, master->dev_addr, ETH_ALEN);
+	}
+}
+
 /* On bonding slaves other than the currently active slave, suppress
  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
  * ARP on active-backup slaves with arp_validate enabled.
@@ -1933,6 +1943,14 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 		if (master->priv_flags & IFF_MASTER_ARPMON)
 			dev->last_rx = jiffies;
 
+		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+			/* Do address unmangle. The local destination address
+			 * will be always the one master has. Provides the right
+			 * functionality in a bridge.
+			 */
+			skb_bond_set_mac_by_master(skb, master);
+		}
+
 		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
 			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
 			    skb->protocol == __cpu_to_be16(ETH_P_ARP))

^ permalink raw reply related	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
  2009-05-28 11:05         ` [Bridge] " Jiri Pirko
@ 2009-05-28 11:41           ` Eric Dumazet
  -1 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-05-28 11:41 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, andy, oleg

Jiri Pirko a écrit :
> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
> 
> (updated)
> changes v4.2 -> v4.3
> - memcpy the address always, not just in case it differs from master->dev_addr
> - compare_ether_addr_64bits() is not used so there is no direct need to make new
>   header file (I think it would be good to have bond stuff in separate file
>   anyway).

Yes, this could be done in a future cleanup patch.
I find this (short) version easier to review for a new feature.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

> 
> changes v4.1 -> v4.2
> - use skb->pkt_type == PACKET_HOST compare rather then comparing skb dest addr
>   against skb->dev->dev_addr
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
> will simultaneously use multiple MAC addresses across different slaves. When you
> put this kind of bond device into a bridge it will only add one of mac adresses
> into a hash list of mac addresses, say X. This mac address is marked as local.
> But this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> ***
> When the multiple addresses for bridge port approach failed to solve this issue
> due to STP I started to think other way to solve this. I returned to previous
> solution but tweaked one.
> 
> This patch solves the situation in the bonding without touching bridge code.
> For every incoming frame to bonding the destination address is compared to
> current address of the slave device from which tha packet came. If these two
> match destination address is replaced by mac address of the master. This address
> is known by bridge so it is delivered properly. Note that the comparsion is not
> made directly, it's used skb->pkt_type == PACKET_HOST instead. This is "set"
> previously in eth_type_trans().
> 
> I experimentally tried that this works as good as searching through the slave
> list (v4 of this patch).
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 371ece5..9b4db94 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1920,6 +1920,16 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
>  	dev->gso_max_size = size;
>  }
>  
> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> +					      struct net_device *master)
> +{
> +	if (skb->pkt_type == PACKET_HOST) {
> +		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
> +
> +		memcpy(dest, master->dev_addr, ETH_ALEN);
> +	}
> +}
> +
>  /* On bonding slaves other than the currently active slave, suppress
>   * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>   * ARP on active-backup slaves with arp_validate enabled.
> @@ -1933,6 +1943,14 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
>  		if (master->priv_flags & IFF_MASTER_ARPMON)
>  			dev->last_rx = jiffies;
>  
> +		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
> +			/* Do address unmangle. The local destination address
> +			 * will be always the one master has. Provides the right
> +			 * functionality in a bridge.
> +			 */
> +			skb_bond_set_mac_by_master(skb, master);
> +		}
> +
>  		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>  			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>  			    skb->protocol == __cpu_to_be16(ETH_P_ARP))



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
@ 2009-05-28 11:41           ` Eric Dumazet
  0 siblings, 0 replies; 214+ messages in thread
From: Eric Dumazet @ 2009-05-28 11:41 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, oleg, mschmidt, bonding-devel, jgarzik, davem

Jiri Pirko a écrit :
> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
> 
> (updated)
> changes v4.2 -> v4.3
> - memcpy the address always, not just in case it differs from master->dev_addr
> - compare_ether_addr_64bits() is not used so there is no direct need to make new
>   header file (I think it would be good to have bond stuff in separate file
>   anyway).

Yes, this could be done in a future cleanup patch.
I find this (short) version easier to review for a new feature.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

> 
> changes v4.1 -> v4.2
> - use skb->pkt_type == PACKET_HOST compare rather than comparing skb dest addr
>   against skb->dev->dev_addr
> 
> Hi all.
> 
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> 
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices (except fail_over_mac). Only balance-alb
> will simultaneously use multiple MAC addresses across different slaves. When you
> put this kind of bond device into a bridge it will only add one of the mac addresses
> into a hash list of mac addresses, say X. This mac address is marked as local.
> But this bonding interface also has mac address Y. Now when a packet arrives with
> destination address Y, this address is not marked as local and the packet looks
> like it needs to be forwarded. This packet is then lost, which is wrong.
> 
> Notice that interfaces can be added and removed from bond while it is in bridge.
> 
> ***
> When the multiple-addresses-per-bridge-port approach failed to solve this issue
> due to STP, I started to think of another way to solve this. I returned to the
> previous solution, but a tweaked one.
> 
> This patch solves the situation in bonding without touching bridge code.
> For every incoming frame to bonding the destination address is compared to the
> current address of the slave device from which the packet came. If these two
> match, the destination address is replaced by the mac address of the master. This
> address is known by the bridge so the frame is delivered properly. Note that the
> comparison is not made directly; skb->pkt_type == PACKET_HOST is used instead.
> This is "set" previously in eth_type_trans().
> 
> I experimentally verified that this works as well as searching through the slave
> list (v4 of this patch).
> 
> Jirka
> 
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 371ece5..9b4db94 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1920,6 +1920,16 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
>  	dev->gso_max_size = size;
>  }
>  
> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> +					      struct net_device *master)
> +{
> +	if (skb->pkt_type == PACKET_HOST) {
> +		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
> +
> +		memcpy(dest, master->dev_addr, ETH_ALEN);
> +	}
> +}
> +
>  /* On bonding slaves other than the currently active slave, suppress
>   * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>   * ARP on active-backup slaves with arp_validate enabled.
> @@ -1933,6 +1943,14 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
>  		if (master->priv_flags & IFF_MASTER_ARPMON)
>  			dev->last_rx = jiffies;
>  
> +		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
> +			/* Do address unmangle. The local destination address
> +			 * will be always the one master has. Provides the right
> +			 * functionality in a bridge.
> +			 */
> +			skb_bond_set_mac_by_master(skb, master);
> +		}
> +
>  		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>  			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>  			    skb->protocol == __cpu_to_be16(ETH_P_ARP))



^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
  2009-05-28 11:05         ` [Bridge] " Jiri Pirko
@ 2009-05-28 12:11           ` Andy Gospodarek
  -1 siblings, 0 replies; 214+ messages in thread
From: Andy Gospodarek @ 2009-05-28 12:11 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, jgarzik, davem, shemminger, bridge, fubar, bonding-devel,
	kaber, mschmidt, dada1, andy, oleg

On May 28, 2009, at 7:05 AM, Jiri Pirko <jpirko@redhat.com> wrote:

> [PATCH net-next] bonding: allow bond in mode balance-alb to work  
> properly in bridge -try4.3
>
> (updated)
> changes v4.2 -> v4.3
> - memcpy the address always, not just in case it differs from master- 
> >dev_addr
> - compare_ether_addr_64bits() is not used so there is no direct need  
> to make new
>  header file (I think it would be good to have bond stuff in  
> separate file
>  anyway).
>
> changes v4.1 -> v4.2
> - use skb->pkt_type == PACKET_HOST compare rather than comparing skb  
> dest addr
>  against skb->dev->dev_addr
>
> Hi all.
>
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>
> Basically here's what's going on. In every mode, bonding interface  
> uses the same
> mac address for all enslaved devices (except fail_over_mac). Only  
> balance-alb
> will simultaneously use multiple MAC addresses across different  
> slaves. When you
> put this kind of bond device into a bridge it will only add one of  
> the mac addresses
> into a hash list of mac addresses, say X. This mac address is marked  
> as local.
> But this bonding interface also has mac address Y. Now when a packet  
> arrives with
> destination address Y, this address is not marked as local and the  
> packet looks
> like it needs to be forwarded. This packet is then lost, which is  
> wrong.
>
> Notice that interfaces can be added and removed from bond while it  
> is in bridge.
>
> ***
> When the multiple addresses for bridge port approach failed to solve  
> this issue
> due to STP I started to think other way to solve this. I returned to  
> previous
> solution but tweaked one.
>
> This patch solves the situation in bonding without touching  
> bridge code.
> For every incoming frame to bonding the destination address is  
> compared to the
> current address of the slave device from which the packet came. If  
> these two
> match, the destination address is replaced by the mac address of the master.  
> This address
> is known by the bridge so the frame is delivered properly. Note that the  
> comparison is not
> made directly; skb->pkt_type == PACKET_HOST is used instead. This  
> is "set"
> previously in eth_type_trans().
>
> I experimentally tried that this works as good as searching through  
> the slave
> list (v4 of this patch).
>
> Jirka
>
>
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>

This one is fine too.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 371ece5..9b4db94 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1920,6 +1920,16 @@ static inline void  
> netif_set_gso_max_size(struct net_device *dev,
>    dev->gso_max_size = size;
> }
>
> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> +                          struct net_device *master)
> +{
> +    if (skb->pkt_type == PACKET_HOST) {
> +        u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
> +
> +        memcpy(dest, master->dev_addr, ETH_ALEN);
> +    }
> +}
> +
> /* On bonding slaves other than the currently active slave, suppress
>  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>  * ARP on active-backup slaves with arp_validate enabled.
> @@ -1933,6 +1943,14 @@ static inline int skb_bond_should_drop(struct  
> sk_buff *skb)
>        if (master->priv_flags & IFF_MASTER_ARPMON)
>            dev->last_rx = jiffies;
>
> +        if ((master->priv_flags & IFF_MASTER_ALB) && master- 
> >br_port) {
> +            /* Do address unmangle. The local destination address
> +             * will be always the one master has. Provides the right
> +             * functionality in a bridge.
> +             */
> +            skb_bond_set_mac_by_master(skb, master);
> +        }
> +
>        if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>            if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>                skb->protocol == __cpu_to_be16(ETH_P_ARP))

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
@ 2009-05-28 12:11           ` Andy Gospodarek
  0 siblings, 0 replies; 214+ messages in thread
From: Andy Gospodarek @ 2009-05-28 12:11 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: fubar, netdev, bridge, oleg, mschmidt, bonding-devel, jgarzik,
	dada1, davem

On May 28, 2009, at 7:05 AM, Jiri Pirko <jpirko@redhat.com> wrote:

> [PATCH net-next] bonding: allow bond in mode balance-alb to work  
> properly in bridge -try4.3
>
> (updated)
> changes v4.2 -> v4.3
> - memcpy the address always, not just in case it differs from master- 
> >dev_addr
> - compare_ether_addr_64bits() is not used so there is no direct need  
> to make new
>  header file (I think it would be good to have bond stuff in  
> separate file
>  anyway).
>
> changes v4.1 -> v4.2
> - use skb->pkt_type == PACKET_HOST compare rather than comparing skb  
> dest addr
>  against skb->dev->dev_addr
>
> Hi all.
>
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>
> Basically here's what's going on. In every mode, bonding interface  
> uses the same
> mac address for all enslaved devices (except fail_over_mac). Only  
> balance-alb
> will simultaneously use multiple MAC addresses across different  
> slaves. When you
> put this kind of bond device into a bridge it will only add one of  
> the mac addresses
> into a hash list of mac addresses, say X. This mac address is marked  
> as local.
> But this bonding interface also has mac address Y. Now when a packet  
> arrives with
> destination address Y, this address is not marked as local and the  
> packet looks
> like it needs to be forwarded. This packet is then lost, which is  
> wrong.
>
> Notice that interfaces can be added and removed from bond while it  
> is in bridge.
>
> ***
> When the multiple addresses for bridge port approach failed to solve  
> this issue
> due to STP I started to think other way to solve this. I returned to  
> previous
> solution but tweaked one.
>
> This patch solves the situation in bonding without touching  
> bridge code.
> For every incoming frame to bonding the destination address is  
> compared to the
> current address of the slave device from which the packet came. If  
> these two
> match, the destination address is replaced by the mac address of the master.  
> This address
> is known by the bridge so the frame is delivered properly. Note that the  
> comparison is not
> made directly; skb->pkt_type == PACKET_HOST is used instead. This  
> is "set"
> previously in eth_type_trans().
>
> I experimentally tried that this works as good as searching through  
> the slave
> list (v4 of this patch).
>
> Jirka
>
>
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>

This one is fine too.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 371ece5..9b4db94 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1920,6 +1920,16 @@ static inline void  
> netif_set_gso_max_size(struct net_device *dev,
>    dev->gso_max_size = size;
> }
>
> +static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> +                          struct net_device *master)
> +{
> +    if (skb->pkt_type == PACKET_HOST) {
> +        u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
> +
> +        memcpy(dest, master->dev_addr, ETH_ALEN);
> +    }
> +}
> +
> /* On bonding slaves other than the currently active slave, suppress
>  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>  * ARP on active-backup slaves with arp_validate enabled.
> @@ -1933,6 +1943,14 @@ static inline int skb_bond_should_drop(struct  
> sk_buff *skb)
>        if (master->priv_flags & IFF_MASTER_ARPMON)
>            dev->last_rx = jiffies;
>
> +        if ((master->priv_flags & IFF_MASTER_ALB) && master- 
> >br_port) {
> +            /* Do address unmangle. The local destination address
> +             * will be always the one master has. Provides the right
> +             * functionality in a bridge.
> +             */
> +            skb_bond_set_mac_by_master(skb, master);
> +        }
> +
>        if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>            if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>                skb->protocol == __cpu_to_be16(ETH_P_ARP))

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
  2009-05-28 11:41           ` [Bridge] " Eric Dumazet
@ 2009-05-29  8:52             ` David Miller
  -1 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-29  8:52 UTC (permalink / raw)
  To: eric.dumazet
  Cc: jpirko, netdev, jgarzik, shemminger, bridge, fubar,
	bonding-devel, kaber, mschmidt, andy, oleg

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 28 May 2009 13:41:59 +0200

> Jiri Pirko a écrit :
>> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
>> 
>> (updated)
>> changes v4.2 -> v4.3
>> - memcpy the address always, not just in case it differs from master->dev_addr
>> - compare_ether_addr_64bits() is not used so there is no direct need to make new
>>   header file (I think it would be good to have bond stuff in separate file
>>   anyway).
> 
> Yes, this could be done in a future cleanup patch.
> I find this (short) version easier to review for a new feature.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks everyone.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [Bridge] [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
@ 2009-05-29  8:52             ` David Miller
  0 siblings, 0 replies; 214+ messages in thread
From: David Miller @ 2009-05-29  8:52 UTC (permalink / raw)
  To: eric.dumazet
  Cc: fubar, jpirko, netdev, bridge, oleg, mschmidt, jgarzik, bonding-devel

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 28 May 2009 13:41:59 +0200

> Jiri Pirko a écrit :
>> [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3
>> 
>> (updated)
>> changes v4.2 -> v4.3
>> - memcpy the address always, not just in case it differs from master->dev_addr
>> - compare_ether_addr_64bits() is not used so there is no direct need to make new
>>   header file (I think it would be good to have bond stuff in separate file
>>   anyway).
> 
> Yes, this could be done in a future cleanup patch.
> I find this (short) version easier to review for a new feature.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks everyone.

^ permalink raw reply	[flat|nested] 214+ messages in thread

* Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge
@ 2009-03-24  9:54 Stanichenko Marat
  0 siblings, 0 replies; 214+ messages in thread
From: Stanichenko Marat @ 2009-03-24  9:54 UTC (permalink / raw)
  To: linux-kernel

> Thu, Mar 19, 2009 at 11:21:43AM CET, davem@davemloft.net wrote:
>>From: Jiri Pirko <jpirko@redhat.com>
>>Date: Thu, 19 Mar 2009 09:44:45 +0100
>>
>>> Yes I was looking at this thing yesterday (uc_list). But this list serves
>>> a different purpose. Do you think that it would be correct to use it for this? I
>>> would maybe like to make a new list similar to this for our purpose
>>> (say addr_list). I think it would be more correct.
>>
>>Whatever you do with that list privately inside of the bonding
>>driver should be fine.
> Well I do not need it only inside the bonding driver. I want bridge to use this
> list when adding a device in it and get mac addresses from there into its
> hashlist (to recognize these addresses as local).
Please correct me if I have misunderstood you. You're going to mark all mac 
addresses that belong to slaves as "local" when adding a bond device to the 
bridge, aren't you? The only thing I'd like to point out (this might be an obvious 
one): a packet that is pushed out from one slave might reach the host through 
another slave. Considering all slaves as "local" in bridge code might lead to 
numerous "received packet with own address as source address" messages.

Please CC me personally when answering this message.

Thanks,
Marat.

^ permalink raw reply	[flat|nested] 214+ messages in thread

end of thread, other threads:[~2009-05-29  8:52 UTC | newest]

Thread overview: 214+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-13 18:33 [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge Jiri Pirko
2009-03-13 18:33 ` [Bridge] " Jiri Pirko
2009-03-14  5:39 ` Stephen Hemminger
2009-03-14  5:39   ` [Bridge] " Stephen Hemminger
2009-03-14  9:49   ` Jiri Pirko
2009-03-14  9:49     ` [Bridge] " Jiri Pirko
2009-03-15 23:12     ` Stephen Hemminger
2009-03-15 23:12       ` [Bridge] " Stephen Hemminger
2009-03-16 11:11       ` Jiri Pirko
2009-03-16 11:11         ` [Bridge] " Jiri Pirko
2009-03-19  6:20         ` David Miller
2009-03-19  6:20           ` [Bridge] " David Miller
2009-03-19  8:44           ` Jiri Pirko
2009-03-19  8:44             ` [Bridge] " Jiri Pirko
2009-03-19 10:21             ` David Miller
2009-03-19 10:21               ` [Bridge] " David Miller
2009-03-19 11:19               ` Jiri Pirko
2009-03-19 11:19                 ` [Bridge] " Jiri Pirko
2009-03-19  8:50           ` Patrick McHardy
2009-03-19  8:50             ` [Bridge] " Patrick McHardy
2009-03-19 16:31             ` Jiri Pirko
2009-03-19 16:31               ` [Bridge] " Jiri Pirko
2009-03-25 13:04 ` [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2 Jiri Pirko
2009-03-25 13:04   ` [Bridge] " Jiri Pirko
2009-03-25 13:40   ` Eric Dumazet
2009-03-25 13:40     ` [Bridge] " Eric Dumazet
2009-03-25 14:39     ` Jiri Pirko
2009-03-25 14:39       ` [Bridge] " Jiri Pirko
2009-03-25 15:19 ` [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3 Jiri Pirko
2009-03-25 15:19   ` [Bridge] " Jiri Pirko
2009-03-25 16:31   ` Jay Vosburgh
2009-03-25 16:31     ` [Bridge] " Jay Vosburgh
2009-03-25 17:44     ` Jiri Pirko
2009-03-25 17:44       ` [Bridge] " Jiri Pirko
2009-03-26  0:24       ` David Miller
2009-03-26  0:24         ` [Bridge] " David Miller
2009-03-26  0:34       ` Jay Vosburgh
2009-03-26  0:34         ` [Bridge] " Jay Vosburgh
2009-03-26 11:12     ` Jiri Pirko
2009-03-26 11:12       ` [Bridge] " Jiri Pirko
2009-03-26 15:52 ` [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4 Jiri Pirko
2009-03-26 15:52   ` [Bridge] " Jiri Pirko
2009-03-27  7:38   ` David Miller
2009-03-27  7:38     ` [Bridge] " David Miller
2009-03-27  7:46     ` Jiri Pirko
2009-03-27  7:46       ` [Bridge] " Jiri Pirko
2009-03-27  7:53     ` Patrick McHardy
2009-03-27  7:53       ` [Bridge] " Patrick McHardy
2009-03-27  8:41       ` Jiri Pirko
2009-03-27  8:41         ` [Bridge] " Jiri Pirko
2009-03-27  8:55         ` Patrick McHardy
2009-03-27  8:55           ` [Bridge] " Patrick McHardy
2009-03-27  9:47           ` Jiri Pirko
2009-03-27  9:47             ` [Bridge] " Jiri Pirko
2009-03-29 20:53       ` David Miller
2009-03-29 20:53         ` [Bridge] " David Miller
2009-03-30 12:04         ` Patrick McHardy
2009-03-30 12:04           ` [Bridge] " Patrick McHardy
2009-03-30 12:40           ` Jiri Pirko
2009-03-30 12:40             ` [Bridge] " Jiri Pirko
2009-03-30 12:47             ` Patrick McHardy
2009-03-30 12:47               ` [Bridge] " Patrick McHardy
2009-03-30 12:52               ` Jiri Pirko
2009-03-30 12:52                 ` [Bridge] " Jiri Pirko
2009-03-30 12:58                 ` Patrick McHardy
2009-03-30 12:58                   ` [Bridge] " Patrick McHardy
2009-05-26 15:17   ` [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.1 Jiri Pirko
2009-05-26 15:17     ` [Bridge] " Jiri Pirko
2009-05-26 16:32     ` Andy Gospodarek
2009-05-26 16:32       ` [Bridge] " Andy Gospodarek
2009-05-27  8:25       ` Jiri Pirko
2009-05-27  8:25         ` [Bridge] " Jiri Pirko
2009-05-26 16:59     ` Eric Dumazet
2009-05-26 16:59       ` [Bridge] " Eric Dumazet
2009-05-27  8:42       ` Jiri Pirko
2009-05-27  8:42         ` [Bridge] " Jiri Pirko
2009-05-27 13:53     ` [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.2 Jiri Pirko
2009-05-27 13:53       ` [Bridge] " Jiri Pirko
2009-05-27 14:39       ` Eric Dumazet
2009-05-27 14:39         ` [Bridge] " Eric Dumazet
2009-05-28  9:57         ` Jiri Pirko
2009-05-28  9:57           ` [Bridge] " Jiri Pirko
2009-05-28 11:05       ` [PATCH net-next] bonding: allow bond in mode balance-alb to work properly in bridge -try4.3 Jiri Pirko
2009-05-28 11:05         ` [Bridge] " Jiri Pirko
2009-05-28 11:41         ` Eric Dumazet
2009-05-28 11:41           ` [Bridge] " Eric Dumazet
2009-05-29  8:52           ` David Miller
2009-05-29  8:52             ` [Bridge] " David Miller
2009-05-28 12:11         ` Andy Gospodarek
2009-05-28 12:11           ` [Bridge] " Andy Gospodarek
2009-04-13  8:37 ` [PATCH 0/4] bonding: allow bond in mode balance-alb to work properly in bridge -try5 Jiri Pirko
2009-04-13  8:37   ` [Bridge] " Jiri Pirko
2009-04-13  8:38   ` [PATCH 1/4] net: introduce dev_mac_address_changed Jiri Pirko
2009-04-13  8:38     ` [Bridge] " Jiri Pirko
2009-04-13 14:58     ` Stephen Hemminger
2009-04-13 14:58       ` [Bridge] " Stephen Hemminger
2009-04-13  8:42   ` [PATCH 2/4] net: introduce a list of device addresses dev_addr_list Jiri Pirko
2009-04-13  8:42     ` [Bridge] " Jiri Pirko
2009-04-13 14:49     ` Stephen Hemminger
2009-04-13 14:49       ` [Bridge] " Stephen Hemminger
2009-04-13 22:54       ` David Miller
2009-04-13 22:54         ` [Bridge] " David Miller
2009-04-13 22:53     ` David Miller
2009-04-13 22:53       ` [Bridge] " David Miller
2009-04-13  8:44   ` [PATCH 3/4] net: bridge: use device address list instead of dev_addr Jiri Pirko
2009-04-13  8:44     ` [Bridge] " Jiri Pirko
2009-04-13 14:54     ` Stephen Hemminger
2009-04-13 14:54       ` [Bridge] " Stephen Hemminger
2009-04-14 10:15       ` Jiri Pirko
2009-04-14 10:15         ` [Bridge] " Jiri Pirko
2009-04-13 22:54     ` David Miller
2009-04-13 22:54       ` [Bridge] " David Miller
2009-04-13  8:46   ` [PATCH 4/4] net: bonding: add slave device addresses in mode alb Jiri Pirko
2009-04-13  8:46     ` [Bridge] " Jiri Pirko
2009-04-13 14:56     ` Stephen Hemminger
2009-04-13 14:56       ` [Bridge] " Stephen Hemminger
2009-04-15  8:17 ` [PATCH 0/3] bonding: allow bond in mode balance-alb to work properly in bridge -try6 Jiri Pirko
2009-04-15  8:17   ` [Bridge] " Jiri Pirko
2009-04-15  8:18   ` [PATCH 1/3] net: introduce a list of device addresses dev_addr_list Jiri Pirko
2009-04-15  8:18     ` [Bridge] " Jiri Pirko
2009-04-15  8:26     ` Li Zefan
2009-04-15  8:26       ` [Bridge] " Li Zefan
2009-04-15  8:29       ` Jiri Pirko
2009-04-15  8:29         ` [Bridge] " Jiri Pirko
2009-04-15  8:32       ` Jiri Pirko
2009-04-15  8:32         ` [Bridge] " Jiri Pirko
2009-04-15  9:21         ` David Miller
2009-04-15  9:21           ` [Bridge] " David Miller
2009-04-15  9:27         ` Eric Dumazet
2009-04-15  9:27           ` [Bridge] " Eric Dumazet
2009-04-15  9:31           ` David Miller
2009-04-15  9:31             ` [Bridge] " David Miller
2009-04-15 10:13             ` Patrick McHardy
2009-04-15 10:13               ` [Bridge] " Patrick McHardy
2009-04-15 10:15               ` David Miller
2009-04-15 10:15                 ` [Bridge] " David Miller
2009-04-15 10:41                 ` Patrick McHardy
2009-04-15 10:41                   ` [Bridge] " Patrick McHardy
2009-04-15 10:45                   ` David Miller
2009-04-15 10:45                     ` [Bridge] " David Miller
2009-04-15 10:47                     ` Patrick McHardy
2009-04-15 10:47                       ` [Bridge] " Patrick McHardy
2009-04-15 14:42               ` Jiri Pirko
2009-04-15 14:42                 ` [Bridge] " Jiri Pirko
2009-04-15 11:17           ` Jiri Pirko
2009-04-15 11:17             ` [Bridge] " Jiri Pirko
2009-04-15 11:22             ` Patrick McHardy
2009-04-15 11:22               ` [Bridge] " Patrick McHardy
2009-04-15 11:28               ` Jiri Pirko
2009-04-15 11:28                 ` [Bridge] " Jiri Pirko
2009-04-15 12:28             ` Eric Dumazet
2009-04-15 12:28               ` [Bridge] " Eric Dumazet
2009-04-15 18:02     ` [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2) Jiri Pirko
2009-04-15 18:02       ` [Bridge] " Jiri Pirko
2009-04-15 18:54       ` Eric Dumazet
2009-04-15 18:54         ` [Bridge] " Eric Dumazet
2009-04-16  8:46         ` Jiri Pirko
2009-04-16  8:46           ` [Bridge] " Jiri Pirko
2009-04-17 11:57       ` [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3) Jiri Pirko
2009-04-17 11:57         ` [Bridge] " Jiri Pirko
2009-04-17 15:33         ` Stephen Hemminger
2009-04-17 15:33           ` [Bridge] " Stephen Hemminger
2009-04-18  7:01           ` Jiri Pirko
2009-04-18  7:01             ` [Bridge] " Jiri Pirko
2009-04-18  7:35             ` Eric Dumazet
2009-04-18  7:35               ` [Bridge] " Eric Dumazet
2009-04-18  7:44               ` Jiri Pirko
2009-04-18  7:44                 ` [Bridge] " Jiri Pirko
2009-04-18  8:06                 ` Eric Dumazet
2009-04-18  8:06                   ` [Bridge] " Eric Dumazet
2009-04-18  8:58         ` [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4) Jiri Pirko
2009-04-18  8:58           ` [Bridge] " Jiri Pirko
2009-04-20 16:11           ` Jiri Pirko
2009-04-20 16:11             ` [Bridge] " Jiri Pirko
2009-04-23  8:09             ` Jiri Pirko
2009-04-23  8:09               ` [Bridge] " Jiri Pirko
2009-04-23  8:09               ` Jiri Pirko
2009-04-23 15:58           ` [Bonding-devel] " Stephen Hemminger
2009-04-23 15:58             ` [Bridge] " Stephen Hemminger
2009-04-24 21:26             ` Jiri Pirko
2009-04-24 21:26               ` [Bridge] " Jiri Pirko
2009-05-04 11:14           ` [PATCH] net: introduce a list of device addresses dev_addr_list (v5) Jiri Pirko
2009-05-04 11:14             ` [Bridge] " Jiri Pirko
2009-05-05  4:37             ` David Miller
2009-05-05  4:37               ` [Bridge] " David Miller
2009-05-05  6:37               ` Jiri Pirko
2009-05-05  6:37                 ` [Bridge] " Jiri Pirko
2009-05-05 12:48             ` [PATCH] net: introduce a list of device addresses dev_addr_list (v6) Jiri Pirko
2009-05-05 12:48               ` [Bridge] " Jiri Pirko
2009-05-05 19:27               ` David Miller
2009-05-05 19:27                 ` [Bridge] " David Miller
2009-05-08 22:38                 ` Stephen Hemminger
2009-05-08 22:38                   ` [Bridge] " Stephen Hemminger
2009-05-08 23:00                   ` David Miller
2009-05-08 23:00                     ` [Bridge] " David Miller
2009-05-08 23:12                     ` Stephen Hemminger
2009-05-08 23:12                       ` [Bridge] " Stephen Hemminger
2009-05-08 23:25                       ` David Miller
2009-05-08 23:25                         ` [Bridge] " David Miller
2009-05-08 23:29                         ` Stephen Hemminger
2009-05-08 23:29                           ` [Bridge] " Stephen Hemminger
2009-04-15  8:21   ` [PATCH 2/3] net: bridge: use device address list instead of dev_addr Jiri Pirko
2009-04-15  8:21     ` [Bridge] " Jiri Pirko
2009-05-06 14:46     ` [PATCH net-next] net: bridge: use device address list instead of dev_addr (repost) Jiri Pirko
2009-05-06 14:46       ` [Bridge] " Jiri Pirko
2009-05-06 15:08       ` Eric Dumazet
2009-05-06 15:08         ` [Bridge] " Eric Dumazet
2009-05-06 19:26       ` Stephen Hemminger
2009-05-06 19:26         ` [Bridge] " Stephen Hemminger
2009-05-07 22:03         ` David Miller
2009-05-07 22:03           ` [Bridge] " David Miller
2009-04-15  8:22   ` [PATCH 3/3] net: bonding: add slave device addresses in mode alb Jiri Pirko
2009-04-15  8:22     ` [Bridge] " Jiri Pirko
2009-03-24  9:54 [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge Stanichenko Marat

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.