All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
@ 2020-12-18 19:30 Jarod Wilson
  2020-12-19  0:18 ` Jay Vosburgh
                   ` (2 more replies)
  0 siblings, 3 replies; 30+ messages in thread
From: Jarod Wilson @ 2020-12-18 19:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jarod Wilson, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

This comes from an end-user request, where they're running multiple VMs on
hosts with bonded interfaces connected to some interest switch topologies,
where 802.3ad isn't an option. They're currently running a proprietary
solution that effectively achieves load-balancing of VMs and bandwidth
utilization improvements with a similar form of transmission algorithm.

Basically, each VM has it's own vlan, so it always sends its traffic out
the same interface, unless that interface fails. Traffic gets split
between the interfaces, maintaining a consistent path, with failover still
available if an interface goes down.

This has been rudimetarily tested to provide similar results, suitable for
them to use to move off their current proprietary solution.

Still on the TODO list, if these even looks sane to begin with, is
fleshing out Documentation/networking/bonding.rst.

Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Thomas Davis <tadavis@lbl.gov>
Cc: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
---
 drivers/net/bonding/bond_main.c    | 27 +++++++++++++++++++++++++--
 drivers/net/bonding/bond_options.c |  1 +
 include/linux/netdevice.h          |  1 +
 include/uapi/linux/if_bonding.h    |  1 +
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5fe5232cc3f3..151ce8c7a56f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -164,7 +164,7 @@ module_param(xmit_hash_policy, charp, 0);
 MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
 				   "0 for layer 2 (default), 1 for layer 3+4, "
 				   "2 for layer 2+3, 3 for encap layer 2+3, "
-				   "4 for encap layer 3+4");
+				   "4 for encap layer 3+4, 5 for vlan+srcmac");
 module_param(arp_interval, int, 0);
 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
 module_param_array(arp_ip_target, charp, NULL, 0);
@@ -1434,6 +1434,8 @@ static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
 		return NETDEV_LAG_HASH_E23;
 	case BOND_XMIT_POLICY_ENCAP34:
 		return NETDEV_LAG_HASH_E34;
+	case BOND_XMIT_POLICY_VLAN_SRCMAC:
+		return NETDEV_LAG_HASH_VLAN_SRCMAC;
 	default:
 		return NETDEV_LAG_HASH_UNKNOWN;
 	}
@@ -3494,6 +3496,20 @@ static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
 	return true;
 }
 
+static inline u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
+{
+	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
+	u32 srcmac = mac_hdr->h_source[5];
+	u16 vlan;
+
+	if (!skb_vlan_tag_present(skb))
+		return srcmac;
+
+	vlan = skb_vlan_tag_get(skb);
+
+	return srcmac ^ vlan;
+}
+
 /* Extract the appropriate headers based on bond's xmit policy */
 static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 			      struct flow_keys *fk)
@@ -3501,10 +3517,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
 	int noff, proto = -1;
 
-	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
+	switch (bond->params.xmit_policy) {
+	case BOND_XMIT_POLICY_ENCAP23:
+	case BOND_XMIT_POLICY_ENCAP34:
 		memset(fk, 0, sizeof(*fk));
 		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
 					  fk, NULL, 0, 0, 0, 0);
+	default:
+		break;
 	}
 
 	fk->ports.ports = 0;
@@ -3556,6 +3576,9 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 	    skb->l4_hash)
 		return skb->hash;
 
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
+		return bond_vlan_srcmac_hash(skb);
+
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
 	    !bond_flow_dissect(bond, skb, &flow))
 		return bond_eth_hash(skb);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index a4e4e15f574d..9826fe46fca1 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -101,6 +101,7 @@ static const struct bond_opt_value bond_xmit_hashtype_tbl[] = {
 	{ "layer2+3", BOND_XMIT_POLICY_LAYER23, 0},
 	{ "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0},
 	{ "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0},
+	{ "vlansrc",  BOND_XMIT_POLICY_VLAN_SRCMAC,  0},
 	{ NULL,       -1,                       0},
 };
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7bf167993c05..551eac4ab747 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2633,6 +2633,7 @@ enum netdev_lag_hash {
 	NETDEV_LAG_HASH_L23,
 	NETDEV_LAG_HASH_E23,
 	NETDEV_LAG_HASH_E34,
+	NETDEV_LAG_HASH_VLAN_SRCMAC,
 	NETDEV_LAG_HASH_UNKNOWN,
 };
 
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index 45f3750aa861..e8eb4ad03cf1 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -94,6 +94,7 @@
 #define BOND_XMIT_POLICY_LAYER23	2 /* layer 2+3 (IP ^ MAC) */
 #define BOND_XMIT_POLICY_ENCAP23	3 /* encapsulated layer 2+3 */
 #define BOND_XMIT_POLICY_ENCAP34	4 /* encapsulated layer 3+4 */
+#define BOND_XMIT_POLICY_VLAN_SRCMAC	5 /* vlan + source MAC */
 
 /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */
 #define LACP_STATE_LACP_ACTIVITY   0x1
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2020-12-18 19:30 [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option Jarod Wilson
@ 2020-12-19  0:18 ` Jay Vosburgh
  2021-01-08  0:03   ` Jarod Wilson
  2020-12-28 10:11 ` Jiri Pirko
  2021-01-13 22:35 ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jarod Wilson
  2 siblings, 1 reply; 30+ messages in thread
From: Jay Vosburgh @ 2020-12-19  0:18 UTC (permalink / raw)
  To: Jarod Wilson
  Cc: linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev

Jarod Wilson <jarod@redhat.com> wrote:

>This comes from an end-user request, where they're running multiple VMs on
>hosts with bonded interfaces connected to some interest switch topologies,
>where 802.3ad isn't an option. They're currently running a proprietary
>solution that effectively achieves load-balancing of VMs and bandwidth
>utilization improvements with a similar form of transmission algorithm.
>
>Basically, each VM has it's own vlan, so it always sends its traffic out
>the same interface, unless that interface fails. Traffic gets split
>between the interfaces, maintaining a consistent path, with failover still
>available if an interface goes down.
>
>This has been rudimetarily tested to provide similar results, suitable for
>them to use to move off their current proprietary solution.
>
>Still on the TODO list, if these even looks sane to begin with, is
>fleshing out Documentation/networking/bonding.rst.

	I'm sure you're aware, but any final submission will also need
to include netlink and iproute2 support.

>Cc: Jay Vosburgh <j.vosburgh@gmail.com>
>Cc: Veaceslav Falico <vfalico@gmail.com>
>Cc: Andy Gospodarek <andy@greyhouse.net>
>Cc: "David S. Miller" <davem@davemloft.net>
>Cc: Jakub Kicinski <kuba@kernel.org>
>Cc: Thomas Davis <tadavis@lbl.gov>
>Cc: netdev@vger.kernel.org
>Signed-off-by: Jarod Wilson <jarod@redhat.com>
>---
> drivers/net/bonding/bond_main.c    | 27 +++++++++++++++++++++++++--
> drivers/net/bonding/bond_options.c |  1 +
> include/linux/netdevice.h          |  1 +
> include/uapi/linux/if_bonding.h    |  1 +
> 4 files changed, 28 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 5fe5232cc3f3..151ce8c7a56f 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -164,7 +164,7 @@ module_param(xmit_hash_policy, charp, 0);
> MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
> 				   "0 for layer 2 (default), 1 for layer 3+4, "
> 				   "2 for layer 2+3, 3 for encap layer 2+3, "
>-				   "4 for encap layer 3+4");
>+				   "4 for encap layer 3+4, 5 for vlan+srcmac");
> module_param(arp_interval, int, 0);
> MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
> module_param_array(arp_ip_target, charp, NULL, 0);
>@@ -1434,6 +1434,8 @@ static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
> 		return NETDEV_LAG_HASH_E23;
> 	case BOND_XMIT_POLICY_ENCAP34:
> 		return NETDEV_LAG_HASH_E34;
>+	case BOND_XMIT_POLICY_VLAN_SRCMAC:
>+		return NETDEV_LAG_HASH_VLAN_SRCMAC;
> 	default:
> 		return NETDEV_LAG_HASH_UNKNOWN;
> 	}
>@@ -3494,6 +3496,20 @@ static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
> 	return true;
> }
> 
>+static inline u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
>+{
>+	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
>+	u32 srcmac = mac_hdr->h_source[5];
>+	u16 vlan;
>+
>+	if (!skb_vlan_tag_present(skb))
>+		return srcmac;
>+
>+	vlan = skb_vlan_tag_get(skb);
>+
>+	return srcmac ^ vlan;

	For the common configuration wherein multiple VLANs are
configured atop a single interface (and thus by default end up with the
same MAC address), this seems like a fairly weak hash.  The TCI is 16
bits (12 of which are the VID), but only 8 are used from the MAC, which
will be a constant.

	Is this algorithm copying the proprietary solution you mention?

	-J

>+}
>+
> /* Extract the appropriate headers based on bond's xmit policy */
> static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
> 			      struct flow_keys *fk)
>@@ -3501,10 +3517,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
> 	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
> 	int noff, proto = -1;
> 
>-	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
>+	switch (bond->params.xmit_policy) {
>+	case BOND_XMIT_POLICY_ENCAP23:
>+	case BOND_XMIT_POLICY_ENCAP34:
> 		memset(fk, 0, sizeof(*fk));
> 		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
> 					  fk, NULL, 0, 0, 0, 0);
>+	default:
>+		break;
> 	}
> 
> 	fk->ports.ports = 0;
>@@ -3556,6 +3576,9 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
> 	    skb->l4_hash)
> 		return skb->hash;
> 
>+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
>+		return bond_vlan_srcmac_hash(skb);
>+
> 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
> 	    !bond_flow_dissect(bond, skb, &flow))
> 		return bond_eth_hash(skb);
>diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
>index a4e4e15f574d..9826fe46fca1 100644
>--- a/drivers/net/bonding/bond_options.c
>+++ b/drivers/net/bonding/bond_options.c
>@@ -101,6 +101,7 @@ static const struct bond_opt_value bond_xmit_hashtype_tbl[] = {
> 	{ "layer2+3", BOND_XMIT_POLICY_LAYER23, 0},
> 	{ "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0},
> 	{ "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0},
>+	{ "vlansrc",  BOND_XMIT_POLICY_VLAN_SRCMAC,  0},
> 	{ NULL,       -1,                       0},
> };
> 
>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>index 7bf167993c05..551eac4ab747 100644
>--- a/include/linux/netdevice.h
>+++ b/include/linux/netdevice.h
>@@ -2633,6 +2633,7 @@ enum netdev_lag_hash {
> 	NETDEV_LAG_HASH_L23,
> 	NETDEV_LAG_HASH_E23,
> 	NETDEV_LAG_HASH_E34,
>+	NETDEV_LAG_HASH_VLAN_SRCMAC,
> 	NETDEV_LAG_HASH_UNKNOWN,
> };
> 
>diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
>index 45f3750aa861..e8eb4ad03cf1 100644
>--- a/include/uapi/linux/if_bonding.h
>+++ b/include/uapi/linux/if_bonding.h
>@@ -94,6 +94,7 @@
> #define BOND_XMIT_POLICY_LAYER23	2 /* layer 2+3 (IP ^ MAC) */
> #define BOND_XMIT_POLICY_ENCAP23	3 /* encapsulated layer 2+3 */
> #define BOND_XMIT_POLICY_ENCAP34	4 /* encapsulated layer 3+4 */
>+#define BOND_XMIT_POLICY_VLAN_SRCMAC	5 /* vlan + source MAC */
> 
> /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */
> #define LACP_STATE_LACP_ACTIVITY   0x1
>-- 
>2.29.2
>

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2020-12-18 19:30 [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option Jarod Wilson
  2020-12-19  0:18 ` Jay Vosburgh
@ 2020-12-28 10:11 ` Jiri Pirko
  2021-01-07 23:58   ` Jarod Wilson
  2021-01-13 22:35 ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jarod Wilson
  2 siblings, 1 reply; 30+ messages in thread
From: Jiri Pirko @ 2020-12-28 10:11 UTC (permalink / raw)
  To: Jarod Wilson
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

Fri, Dec 18, 2020 at 08:30:33PM CET, jarod@redhat.com wrote:
>This comes from an end-user request, where they're running multiple VMs on
>hosts with bonded interfaces connected to some interest switch topologies,
>where 802.3ad isn't an option. They're currently running a proprietary
>solution that effectively achieves load-balancing of VMs and bandwidth
>utilization improvements with a similar form of transmission algorithm.
>
>Basically, each VM has it's own vlan, so it always sends its traffic out
>the same interface, unless that interface fails. Traffic gets split
>between the interfaces, maintaining a consistent path, with failover still
>available if an interface goes down.
>
>This has been rudimetarily tested to provide similar results, suitable for
>them to use to move off their current proprietary solution.
>
>Still on the TODO list, if these even looks sane to begin with, is
>fleshing out Documentation/networking/bonding.rst.

Jarod, did you consider using team driver instead ? :)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2020-12-28 10:11 ` Jiri Pirko
@ 2021-01-07 23:58   ` Jarod Wilson
  2021-01-08 13:12     ` Jiri Pirko
  0 siblings, 1 reply; 30+ messages in thread
From: Jarod Wilson @ 2021-01-07 23:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

On Mon, Dec 28, 2020 at 11:11:45AM +0100, Jiri Pirko wrote:
> Fri, Dec 18, 2020 at 08:30:33PM CET, jarod@redhat.com wrote:
> >This comes from an end-user request, where they're running multiple VMs on
> >hosts with bonded interfaces connected to some interest switch topologies,
> >where 802.3ad isn't an option. They're currently running a proprietary
> >solution that effectively achieves load-balancing of VMs and bandwidth
> >utilization improvements with a similar form of transmission algorithm.
> >
> >Basically, each VM has it's own vlan, so it always sends its traffic out
> >the same interface, unless that interface fails. Traffic gets split
> >between the interfaces, maintaining a consistent path, with failover still
> >available if an interface goes down.
> >
> >This has been rudimetarily tested to provide similar results, suitable for
> >them to use to move off their current proprietary solution.
> >
> >Still on the TODO list, if these even looks sane to begin with, is
> >fleshing out Documentation/networking/bonding.rst.
> 
> Jarod, did you consider using team driver instead ? :)

That's actually one of the things that was suggested, since team I believe
already has support for this, but the user really wants to use bonding.
We're finding that a lot of users really still prefer bonding over team.

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2020-12-19  0:18 ` Jay Vosburgh
@ 2021-01-08  0:03   ` Jarod Wilson
  2021-01-12 21:12     ` Jarod Wilson
  0 siblings, 1 reply; 30+ messages in thread
From: Jarod Wilson @ 2021-01-08  0:03 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev

On Fri, Dec 18, 2020 at 04:18:59PM -0800, Jay Vosburgh wrote:
> Jarod Wilson <jarod@redhat.com> wrote:
> 
> >This comes from an end-user request, where they're running multiple VMs on
> >hosts with bonded interfaces connected to some interest switch topologies,
> >where 802.3ad isn't an option. They're currently running a proprietary
> >solution that effectively achieves load-balancing of VMs and bandwidth
> >utilization improvements with a similar form of transmission algorithm.
> >
> >Basically, each VM has it's own vlan, so it always sends its traffic out
> >the same interface, unless that interface fails. Traffic gets split
> >between the interfaces, maintaining a consistent path, with failover still
> >available if an interface goes down.
> >
> >This has been rudimetarily tested to provide similar results, suitable for
> >them to use to move off their current proprietary solution.
> >
> >Still on the TODO list, if these even looks sane to begin with, is
> >fleshing out Documentation/networking/bonding.rst.
> 
> 	I'm sure you're aware, but any final submission will also need
> to include netlink and iproute2 support.

I believe everything for netlink support is already included, but I'll
double-check that before submitting something for inclusion consideration.

> >diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> >index 5fe5232cc3f3..151ce8c7a56f 100644
> >--- a/drivers/net/bonding/bond_main.c
> >+++ b/drivers/net/bonding/bond_main.c
> >@@ -164,7 +164,7 @@ module_param(xmit_hash_policy, charp, 0);
> > MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
> > 				   "0 for layer 2 (default), 1 for layer 3+4, "
> > 				   "2 for layer 2+3, 3 for encap layer 2+3, "
> >-				   "4 for encap layer 3+4");
> >+				   "4 for encap layer 3+4, 5 for vlan+srcmac");
> > module_param(arp_interval, int, 0);
> > MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
> > module_param_array(arp_ip_target, charp, NULL, 0);
> >@@ -1434,6 +1434,8 @@ static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
> > 		return NETDEV_LAG_HASH_E23;
> > 	case BOND_XMIT_POLICY_ENCAP34:
> > 		return NETDEV_LAG_HASH_E34;
> >+	case BOND_XMIT_POLICY_VLAN_SRCMAC:
> >+		return NETDEV_LAG_HASH_VLAN_SRCMAC;
> > 	default:
> > 		return NETDEV_LAG_HASH_UNKNOWN;
> > 	}
> >@@ -3494,6 +3496,20 @@ static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
> > 	return true;
> > }
> > 
> >+static inline u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
> >+{
> >+	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
> >+	u32 srcmac = mac_hdr->h_source[5];
> >+	u16 vlan;
> >+
> >+	if (!skb_vlan_tag_present(skb))
> >+		return srcmac;
> >+
> >+	vlan = skb_vlan_tag_get(skb);
> >+
> >+	return srcmac ^ vlan;
> 
> 	For the common configuration wherein multiple VLANs are
> configured atop a single interface (and thus by default end up with the
> same MAC address), this seems like a fairly weak hash.  The TCI is 16
> bits (12 of which are the VID), but only 8 are used from the MAC, which
> will be a constant.
> 
> 	Is this algorithm copying the proprietary solution you mention?

I've not actually seen the code in question, so I can't be 100% certain,
but this is exactly how it was described to me, and testing seems to bear
out that it behaves at least similarly enough for the user. They like
simplicity, and the very basic hash suits their needs, which are basically
just getting some load-balancing with failover w/o having to have lacp,
running on some older switches that can't do lacp.

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2021-01-07 23:58   ` Jarod Wilson
@ 2021-01-08 13:12     ` Jiri Pirko
  2021-01-08 15:21       ` Jarod Wilson
  2021-01-15  2:02       ` question about bonding mode 4 moyufeng
  0 siblings, 2 replies; 30+ messages in thread
From: Jiri Pirko @ 2021-01-08 13:12 UTC (permalink / raw)
  To: Jarod Wilson
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

Fri, Jan 08, 2021 at 12:58:13AM CET, jarod@redhat.com wrote:
>On Mon, Dec 28, 2020 at 11:11:45AM +0100, Jiri Pirko wrote:
>> Fri, Dec 18, 2020 at 08:30:33PM CET, jarod@redhat.com wrote:
>> >This comes from an end-user request, where they're running multiple VMs on
>> >hosts with bonded interfaces connected to some interest switch topologies,
>> >where 802.3ad isn't an option. They're currently running a proprietary
>> >solution that effectively achieves load-balancing of VMs and bandwidth
>> >utilization improvements with a similar form of transmission algorithm.
>> >
>> >Basically, each VM has it's own vlan, so it always sends its traffic out
>> >the same interface, unless that interface fails. Traffic gets split
>> >between the interfaces, maintaining a consistent path, with failover still
>> >available if an interface goes down.
>> >
>> >This has been rudimetarily tested to provide similar results, suitable for
>> >them to use to move off their current proprietary solution.
>> >
>> >Still on the TODO list, if these even looks sane to begin with, is
>> >fleshing out Documentation/networking/bonding.rst.
>> 
>> Jarod, did you consider using team driver instead ? :)
>
>That's actually one of the things that was suggested, since team I believe
>already has support for this, but the user really wants to use bonding.
>We're finding that a lot of users really still prefer bonding over team.

Do you know the reason, other than "nostalgia"?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2021-01-08 13:12     ` Jiri Pirko
@ 2021-01-08 15:21       ` Jarod Wilson
  2021-01-15  2:02       ` question about bonding mode 4 moyufeng
  1 sibling, 0 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-08 15:21 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

On Fri, Jan 08, 2021 at 02:12:56PM +0100, Jiri Pirko wrote:
> Fri, Jan 08, 2021 at 12:58:13AM CET, jarod@redhat.com wrote:
> >On Mon, Dec 28, 2020 at 11:11:45AM +0100, Jiri Pirko wrote:
> >> Fri, Dec 18, 2020 at 08:30:33PM CET, jarod@redhat.com wrote:
> >> >This comes from an end-user request, where they're running multiple VMs on
> >> >hosts with bonded interfaces connected to some interest switch topologies,
> >> >where 802.3ad isn't an option. They're currently running a proprietary
> >> >solution that effectively achieves load-balancing of VMs and bandwidth
> >> >utilization improvements with a similar form of transmission algorithm.
> >> >
> >> >Basically, each VM has it's own vlan, so it always sends its traffic out
> >> >the same interface, unless that interface fails. Traffic gets split
> >> >between the interfaces, maintaining a consistent path, with failover still
> >> >available if an interface goes down.
> >> >
> >> >This has been rudimetarily tested to provide similar results, suitable for
> >> >them to use to move off their current proprietary solution.
> >> >
> >> >Still on the TODO list, if these even looks sane to begin with, is
> >> >fleshing out Documentation/networking/bonding.rst.
> >> 
> >> Jarod, did you consider using team driver instead ? :)
> >
> >That's actually one of the things that was suggested, since team I believe
> >already has support for this, but the user really wants to use bonding.
> >We're finding that a lot of users really still prefer bonding over team.
> 
> Do you know the reason, other than "nostalgia"?

I've heard a few different reasons that come to mind:

1) nostalgia is definitely one -- "we know bonding here"
2) support -- "the things I'm running say I need bonding to properly
support failover in their environment". How accurate this is, I don't
actually know.
3) monitoring -- "my monitoring solution knows about bonding, but not
about team". This is probably easily fixed, but may or may not be in the
user's direct control.
4) footprint -- "bonding does the job w/o team's userspace footprint".
I think this one is kind of hard for team to do anything about, bonding
really does have a smaller userspace footprint, which is a plus for
embedded type applications and high-security environments looking to keep
things as minimal as possible.

I think I've heard a few "we tried team years ago and it didn't work" as
well, which of course is ridiculous as a reason not to try something again,
since a lot can change in a few years in this world.

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2021-01-08  0:03   ` Jarod Wilson
@ 2021-01-12 21:12     ` Jarod Wilson
  2021-01-12 21:39       ` Jay Vosburgh
  0 siblings, 1 reply; 30+ messages in thread
From: Jarod Wilson @ 2021-01-12 21:12 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev

On Thu, Jan 07, 2021 at 07:03:40PM -0500, Jarod Wilson wrote:
> On Fri, Dec 18, 2020 at 04:18:59PM -0800, Jay Vosburgh wrote:
> > Jarod Wilson <jarod@redhat.com> wrote:
> > 
> > >This comes from an end-user request, where they're running multiple VMs on
> > >hosts with bonded interfaces connected to some interest switch topologies,
> > >where 802.3ad isn't an option. They're currently running a proprietary
> > >solution that effectively achieves load-balancing of VMs and bandwidth
> > >utilization improvements with a similar form of transmission algorithm.
> > >
> > >Basically, each VM has it's own vlan, so it always sends its traffic out
> > >the same interface, unless that interface fails. Traffic gets split
> > >between the interfaces, maintaining a consistent path, with failover still
> > >available if an interface goes down.
> > >
> > >This has been rudimetarily tested to provide similar results, suitable for
> > >them to use to move off their current proprietary solution.
> > >
> > >Still on the TODO list, if these even looks sane to begin with, is
> > >fleshing out Documentation/networking/bonding.rst.
> > 
> > 	I'm sure you're aware, but any final submission will also need
> > to include netlink and iproute2 support.
> 
> I believe everything for netlink support is already included, but I'll
> double-check that before submitting something for inclusion consideration.

I'm not certain if what you actually meant was that I'd have to patch
iproute2 as well, which I've definitely stumbled onto today, but it's a
2-line patch, and everything seems to be working fine with it:

$ sudo ip link set bond0 type bond xmit_hash_policy 5
$ ip -d link show bond0
11: bond0: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether ce:85:5e:24:ce:90 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535
    bond mode balance-xor miimon 0 updelay 0 downdelay 0 peer_notify_delay 0 use_carrier 1 arp_interval 0 arp_validate none arp_all_targets any primary_reselect always fail_over_mac none xmit_hash_policy vlansrc resend_igmp 1 num_grat_arp 1 all_slaves_active 0 min_links 0 lp_interval 1 packets_per_slave 1 lacp_rate slow ad_select stable tlb_dynamic_lb 1 addrgenmode eui64 numtxqueues 16 numrxqueues 16 gso_max_size 65536 gso_max_segs 65535
$ grep Hash /proc/net/bonding/bond0
Transmit Hash Policy: vlansrc (5)

Nothing bad seems to happen on an older kernel if one tries to set the new
hash, you just get told that it's an invalid argument.

I *think* this is all ready for submission then, so I'll get both the kernel
and iproute2 patches out soon.

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2021-01-12 21:12     ` Jarod Wilson
@ 2021-01-12 21:39       ` Jay Vosburgh
  2021-01-12 22:32         ` Jarod Wilson
  0 siblings, 1 reply; 30+ messages in thread
From: Jay Vosburgh @ 2021-01-12 21:39 UTC (permalink / raw)
  To: Jarod Wilson
  Cc: linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev

Jarod Wilson <jarod@redhat.com> wrote:

>On Thu, Jan 07, 2021 at 07:03:40PM -0500, Jarod Wilson wrote:
>> On Fri, Dec 18, 2020 at 04:18:59PM -0800, Jay Vosburgh wrote:
>> > Jarod Wilson <jarod@redhat.com> wrote:
>> > 
>> > >This comes from an end-user request, where they're running multiple VMs on
>> > >hosts with bonded interfaces connected to some interest switch topologies,
>> > >where 802.3ad isn't an option. They're currently running a proprietary
>> > >solution that effectively achieves load-balancing of VMs and bandwidth
>> > >utilization improvements with a similar form of transmission algorithm.
>> > >
>> > >Basically, each VM has it's own vlan, so it always sends its traffic out
>> > >the same interface, unless that interface fails. Traffic gets split
>> > >between the interfaces, maintaining a consistent path, with failover still
>> > >available if an interface goes down.
>> > >
>> > >This has been rudimetarily tested to provide similar results, suitable for
>> > >them to use to move off their current proprietary solution.
>> > >
>> > >Still on the TODO list, if these even looks sane to begin with, is
>> > >fleshing out Documentation/networking/bonding.rst.
>> > 
>> > 	I'm sure you're aware, but any final submission will also need
>> > to include netlink and iproute2 support.
>> 
>> I believe everything for netlink support is already included, but I'll
>> double-check that before submitting something for inclusion consideration.
>
>I'm not certain if what you actually meant was that I'd have to patch
>iproute2 as well, which I've definitely stumbled onto today, but it's a
>2-line patch, and everything seems to be working fine with it:

	Yes, that's what I meant.

>$ sudo ip link set bond0 type bond xmit_hash_policy 5

	Does the above work with the text label (presumably "vlansrc")
as well as the number, and does "ip link add test type bond help" print
the correct text for XMIT_HASH_POLICY?

	-J

>$ ip -d link show bond0
>11: bond0: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
>    link/ether ce:85:5e:24:ce:90 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535
>    bond mode balance-xor miimon 0 updelay 0 downdelay 0 peer_notify_delay 0 use_carrier 1 arp_interval 0 arp_validate none arp_all_targets any primary_reselect always fail_over_mac none xmit_hash_policy vlansrc resend_igmp 1 num_grat_arp 1 all_slaves_active 0 min_links 0 lp_interval 1 packets_per_slave 1 lacp_rate slow ad_select stable tlb_dynamic_lb 1 addrgenmode eui64 numtxqueues 16 numrxqueues 16 gso_max_size 65536 gso_max_segs 65535
>$ grep Hash /proc/net/bonding/bond0
>Transmit Hash Policy: vlansrc (5)
>
>Nothing bad seems to happen on an older kernel if one tries to set the new
>hash, you just get told that it's an invalid argument.
>
>I *think* this is all ready for submission then, so I'll get both the kernel
>and iproute2 patches out soon.
>
>-- 
>Jarod Wilson
>jarod@redhat.com

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option
  2021-01-12 21:39       ` Jay Vosburgh
@ 2021-01-12 22:32         ` Jarod Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-12 22:32 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev

On Tue, Jan 12, 2021 at 01:39:10PM -0800, Jay Vosburgh wrote:
> Jarod Wilson <jarod@redhat.com> wrote:
> 
> >On Thu, Jan 07, 2021 at 07:03:40PM -0500, Jarod Wilson wrote:
> >> On Fri, Dec 18, 2020 at 04:18:59PM -0800, Jay Vosburgh wrote:
> >> > Jarod Wilson <jarod@redhat.com> wrote:
> >> > 
> >> > >This comes from an end-user request, where they're running multiple VMs on
> >> > >hosts with bonded interfaces connected to some interest switch topologies,
> >> > >where 802.3ad isn't an option. They're currently running a proprietary
> >> > >solution that effectively achieves load-balancing of VMs and bandwidth
> >> > >utilization improvements with a similar form of transmission algorithm.
> >> > >
> >> > >Basically, each VM has it's own vlan, so it always sends its traffic out
> >> > >the same interface, unless that interface fails. Traffic gets split
> >> > >between the interfaces, maintaining a consistent path, with failover still
> >> > >available if an interface goes down.
> >> > >
> >> > >This has been rudimetarily tested to provide similar results, suitable for
> >> > >them to use to move off their current proprietary solution.
> >> > >
> >> > >Still on the TODO list, if these even looks sane to begin with, is
> >> > >fleshing out Documentation/networking/bonding.rst.
> >> > 
> >> > 	I'm sure you're aware, but any final submission will also need
> >> > to include netlink and iproute2 support.
> >> 
> >> I believe everything for netlink support is already included, but I'll
> >> double-check that before submitting something for inclusion consideration.
> >
> >I'm not certain if what you actually meant was that I'd have to patch
> >iproute2 as well, which I've definitely stumbled onto today, but it's a
> >2-line patch, and everything seems to be working fine with it:
> 
> 	Yes, that's what I meant.
> 
> >$ sudo ip link set bond0 type bond xmit_hash_policy 5
> 
> 	Does the above work with the text label (presumably "vlansrc")
> as well as the number, and does "ip link add test type bond help" print
> the correct text for XMIT_HASH_POLICY?

All of the above looks correct to me, output below. Before submitting...
Could rename it from vlansrc to vlan+srcmac or some variation thereof if
it's desired. I tried to keep it relatively short, but it's perhaps a bit
less succinct like I have it now, and other modes include a +.

$ sudo modprobe bonding mode=2 max_bonds=1 xmit_hash_policy=0
$ sudo ip link set bond0 type bond xmit_hash_policy vlansrc
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v4.18.0-272.el8.vstx.x86_64

Bonding Mode: load balancing (xor)
Transmit Hash Policy: vlansrc (5)
MII Status: down
MII Polling Interval (ms): 0
Up Delay (ms): 0
Down Delay (ms): 0
Peer Notification Delay (ms): 0

$ sudo ip link add test type bond help
Usage: ... bond [ mode BONDMODE ] [ active_slave SLAVE_DEV ]
                [ clear_active_slave ] [ miimon MIIMON ]
                [ updelay UPDELAY ] [ downdelay DOWNDELAY ]
                [ peer_notify_delay DELAY ]
                [ use_carrier USE_CARRIER ]
                [ arp_interval ARP_INTERVAL ]
                [ arp_validate ARP_VALIDATE ]
                [ arp_all_targets ARP_ALL_TARGETS ]
                [ arp_ip_target [ ARP_IP_TARGET, ... ] ]
                [ primary SLAVE_DEV ]
                [ primary_reselect PRIMARY_RESELECT ]
                [ fail_over_mac FAIL_OVER_MAC ]
                [ xmit_hash_policy XMIT_HASH_POLICY ]
                [ resend_igmp RESEND_IGMP ]
                [ num_grat_arp|num_unsol_na NUM_GRAT_ARP|NUM_UNSOL_NA ]
                [ all_slaves_active ALL_SLAVES_ACTIVE ]
                [ min_links MIN_LINKS ]
                [ lp_interval LP_INTERVAL ]
                [ packets_per_slave PACKETS_PER_SLAVE ]
                [ tlb_dynamic_lb TLB_DYNAMIC_LB ]
                [ lacp_rate LACP_RATE ]
                [ ad_select AD_SELECT ]
                [ ad_user_port_key PORTKEY ]
                [ ad_actor_sys_prio SYSPRIO ]
                [ ad_actor_system LLADDR ]

BONDMODE := balance-rr|active-backup|balance-xor|broadcast|802.3ad|balance-tlb|balance-alb
ARP_VALIDATE := none|active|backup|all
ARP_ALL_TARGETS := any|all
PRIMARY_RESELECT := always|better|failure
FAIL_OVER_MAC := none|active|follow
XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlansrc
LACP_RATE := slow|fast
AD_SELECT := stable|bandwidth|count


-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH net-next v2] bonding: add a vlan+mac tx hashing option
  2020-12-18 19:30 [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option Jarod Wilson
  2020-12-19  0:18 ` Jay Vosburgh
  2020-12-28 10:11 ` Jiri Pirko
@ 2021-01-13 22:35 ` Jarod Wilson
  2021-01-13 23:41   ` [PATCH iproute2] bond: support xmit_hash_policy=vlan+mac Jarod Wilson
                     ` (2 more replies)
  2 siblings, 3 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-13 22:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jarod Wilson, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

This comes from an end-user request, where they're running multiple VMs on
hosts with bonded interfaces connected to some interest switch topologies,
where 802.3ad isn't an option. They're currently running a proprietary
solution that effectively achieves load-balancing of VMs and bandwidth
utilization improvements with a similar form of transmission algorithm.

Basically, each VM has it's own vlan, so it always sends its traffic out
the same interface, unless that interface fails. Traffic gets split
between the interfaces, maintaining a consistent path, with failover still
available if an interface goes down.

This has been rudimetarily tested to provide similar results, suitable for
them to use to move off their current proprietary solution. A patch for
iproute2 is forthcoming as well, to properly support the new mode there as
well.

Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Thomas Davis <tadavis@lbl.gov>
Cc: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
---
v2: verified netlink interfaces working, added Documentation, changed
tx hash mode name to vlan+mac for consistency and clarity.

 Documentation/networking/bonding.rst | 13 +++++++++++++
 drivers/net/bonding/bond_main.c      | 27 +++++++++++++++++++++++++--
 drivers/net/bonding/bond_options.c   |  1 +
 include/linux/netdevice.h            |  1 +
 include/uapi/linux/if_bonding.h      |  1 +
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index adc314639085..c78ceb7630a0 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -951,6 +951,19 @@ xmit_hash_policy
 		packets will be distributed according to the encapsulated
 		flows.
 
+	vlan+mac
+
+		This policy uses a very rudimentary vland ID and source mac
+		ID hash to load-balance traffic per-vlan, with failover
+		should one leg fail. The intended use case is for a bond
+		shared by multiple virtual machines, all configured to
+		use their own vlan, to give lacp-like functionality
+		without requiring lacp-capable switching hardware.
+
+		The formula for the hash is simply
+
+		hash = (vlan ID) XOR (source MAC)
+
 	The default value is layer2.  This option was added in bonding
 	version 2.6.3.  In earlier versions of bonding, this parameter
 	does not exist, and the layer2 policy is the only policy.  The
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5fe5232cc3f3..766c09a553c1 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -164,7 +164,7 @@ module_param(xmit_hash_policy, charp, 0);
 MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
 				   "0 for layer 2 (default), 1 for layer 3+4, "
 				   "2 for layer 2+3, 3 for encap layer 2+3, "
-				   "4 for encap layer 3+4");
+				   "4 for encap layer 3+4, 5 for vlan+mac");
 module_param(arp_interval, int, 0);
 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
 module_param_array(arp_ip_target, charp, NULL, 0);
@@ -1434,6 +1434,8 @@ static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
 		return NETDEV_LAG_HASH_E23;
 	case BOND_XMIT_POLICY_ENCAP34:
 		return NETDEV_LAG_HASH_E34;
+	case BOND_XMIT_POLICY_VLAN_SRCMAC:
+		return NETDEV_LAG_HASH_VLAN_SRCMAC;
 	default:
 		return NETDEV_LAG_HASH_UNKNOWN;
 	}
@@ -3494,6 +3496,20 @@ static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
 	return true;
 }
 
+static inline u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
+{
+	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
+	u32 srcmac = mac_hdr->h_source[5];
+	u16 vlan;
+
+	if (!skb_vlan_tag_present(skb))
+		return srcmac;
+
+	vlan = skb_vlan_tag_get(skb);
+
+	return srcmac ^ vlan;
+}
+
 /* Extract the appropriate headers based on bond's xmit policy */
 static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 			      struct flow_keys *fk)
@@ -3501,10 +3517,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
 	int noff, proto = -1;
 
-	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
+	switch (bond->params.xmit_policy) {
+	case BOND_XMIT_POLICY_ENCAP23:
+	case BOND_XMIT_POLICY_ENCAP34:
 		memset(fk, 0, sizeof(*fk));
 		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
 					  fk, NULL, 0, 0, 0, 0);
+	default:
+		break;
 	}
 
 	fk->ports.ports = 0;
@@ -3556,6 +3576,9 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 	    skb->l4_hash)
 		return skb->hash;
 
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
+		return bond_vlan_srcmac_hash(skb);
+
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
 	    !bond_flow_dissect(bond, skb, &flow))
 		return bond_eth_hash(skb);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index a4e4e15f574d..deafe3587c80 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -101,6 +101,7 @@ static const struct bond_opt_value bond_xmit_hashtype_tbl[] = {
 	{ "layer2+3", BOND_XMIT_POLICY_LAYER23, 0},
 	{ "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0},
 	{ "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0},
+	{ "vlan+mac", BOND_XMIT_POLICY_VLAN_SRCMAC,  0},
 	{ NULL,       -1,                       0},
 };
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5b949076ed23..a94ce80a2fe1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2615,6 +2615,7 @@ enum netdev_lag_hash {
 	NETDEV_LAG_HASH_L23,
 	NETDEV_LAG_HASH_E23,
 	NETDEV_LAG_HASH_E34,
+	NETDEV_LAG_HASH_VLAN_SRCMAC,
 	NETDEV_LAG_HASH_UNKNOWN,
 };
 
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index 45f3750aa861..e8eb4ad03cf1 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -94,6 +94,7 @@
 #define BOND_XMIT_POLICY_LAYER23	2 /* layer 2+3 (IP ^ MAC) */
 #define BOND_XMIT_POLICY_ENCAP23	3 /* encapsulated layer 2+3 */
 #define BOND_XMIT_POLICY_ENCAP34	4 /* encapsulated layer 3+4 */
+#define BOND_XMIT_POLICY_VLAN_SRCMAC	5 /* vlan + source MAC */
 
 /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */
 #define LACP_STATE_LACP_ACTIVITY   0x1
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH iproute2] bond: support xmit_hash_policy=vlan+mac
  2021-01-13 22:35 ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jarod Wilson
@ 2021-01-13 23:41   ` Jarod Wilson
  2021-01-15 15:12     ` Jarod Wilson
  2021-01-15 19:21     ` [PATCH iproute2 v2] bond: support xmit_hash_policy=vlan+srcmac Jarod Wilson
  2021-01-14  1:58   ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jakub Kicinski
  2021-01-15 19:21   ` [PATCH net-next v3] bonding: add a vlan+srcmac " Jarod Wilson
  2 siblings, 2 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-13 23:41 UTC (permalink / raw)
  To: netdev; +Cc: Jarod Wilson, Stephen Hemminger, Jay Vosburgh

There's a new transmit hash policy being added to the bonding driver that
is a simple XOR of vlan ID and source MAC, xmit_hash_policy vlan+mac. This
trivial patch makes it configurable and queryable via iproute2.

$ sudo modprobe bonding mode=2 max_bonds=1 xmit_hash_policy=0

$ sudo ip link set bond0 type bond xmit_hash_policy vlan+mac

$ ip -d link show bond0
11: bond0: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether ce:85:5e:24:ce:90 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535
    bond mode balance-xor miimon 0 updelay 0 downdelay 0 peer_notify_delay 0 use_carrier 1 arp_interval 0 arp_validate none arp_all_targets any
primary_reselect always fail_over_mac none xmit_hash_policy vlan+mac resend_igmp 1 num_grat_arp 1 all_slaves_active 0 min_links 0 lp_interval 1
packets_per_slave 1 lacp_rate slow ad_select stable tlb_dynamic_lb 1 addrgenmode eui64 numtxqueues 16 numrxqueues 16 gso_max_size 65536 gso_max_segs
65535

$ grep Hash /proc/net/bonding/bond0
Transmit Hash Policy: vlan+mac (5)

$ sudo ip link add test type bond help
Usage: ... bond [ mode BONDMODE ] [ active_slave SLAVE_DEV ]
                [ clear_active_slave ] [ miimon MIIMON ]
                [ updelay UPDELAY ] [ downdelay DOWNDELAY ]
                [ peer_notify_delay DELAY ]
                [ use_carrier USE_CARRIER ]
                [ arp_interval ARP_INTERVAL ]
                [ arp_validate ARP_VALIDATE ]
                [ arp_all_targets ARP_ALL_TARGETS ]
                [ arp_ip_target [ ARP_IP_TARGET, ... ] ]
                [ primary SLAVE_DEV ]
                [ primary_reselect PRIMARY_RESELECT ]
                [ fail_over_mac FAIL_OVER_MAC ]
                [ xmit_hash_policy XMIT_HASH_POLICY ]
                [ resend_igmp RESEND_IGMP ]
                [ num_grat_arp|num_unsol_na NUM_GRAT_ARP|NUM_UNSOL_NA ]
                [ all_slaves_active ALL_SLAVES_ACTIVE ]
                [ min_links MIN_LINKS ]
                [ lp_interval LP_INTERVAL ]
                [ packets_per_slave PACKETS_PER_SLAVE ]
                [ tlb_dynamic_lb TLB_DYNAMIC_LB ]
                [ lacp_rate LACP_RATE ]
                [ ad_select AD_SELECT ]
                [ ad_user_port_key PORTKEY ]
                [ ad_actor_sys_prio SYSPRIO ]
                [ ad_actor_system LLADDR ]

BONDMODE := balance-rr|active-backup|balance-xor|broadcast|802.3ad|balance-tlb|balance-alb
ARP_VALIDATE := none|active|backup|all
ARP_ALL_TARGETS := any|all
PRIMARY_RESELECT := always|better|failure
FAIL_OVER_MAC := none|active|follow
XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlan+mac
LACP_RATE := slow|fast
AD_SELECT := stable|bandwidth|count

Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
---
 ip/iplink_bond.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ip/iplink_bond.c b/ip/iplink_bond.c
index 585b6be1..b9470b98 100644
--- a/ip/iplink_bond.c
+++ b/ip/iplink_bond.c
@@ -70,6 +70,7 @@ static const char *xmit_hash_policy_tbl[] = {
 	"layer2+3",
 	"encap2+3",
 	"encap3+4",
+	"vlan+mac",
 	NULL,
 };
 
@@ -148,7 +149,7 @@ static void print_explain(FILE *f)
 		"ARP_ALL_TARGETS := any|all\n"
 		"PRIMARY_RESELECT := always|better|failure\n"
 		"FAIL_OVER_MAC := none|active|follow\n"
-		"XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4\n"
+		"XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlan+mac\n"
 		"LACP_RATE := slow|fast\n"
 		"AD_SELECT := stable|bandwidth|count\n"
 	);
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v2] bonding: add a vlan+mac tx hashing option
  2021-01-13 22:35 ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jarod Wilson
  2021-01-13 23:41   ` [PATCH iproute2] bond: support xmit_hash_policy=vlan+mac Jarod Wilson
@ 2021-01-14  1:58   ` Jakub Kicinski
  2021-01-14 21:11     ` Jarod Wilson
  2021-01-15 19:21   ` [PATCH net-next v3] bonding: add a vlan+srcmac " Jarod Wilson
  2 siblings, 1 reply; 30+ messages in thread
From: Jakub Kicinski @ 2021-01-14  1:58 UTC (permalink / raw)
  To: Jarod Wilson
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Thomas Davis, netdev

On Wed, 13 Jan 2021 17:35:48 -0500 Jarod Wilson wrote:
> This comes from an end-user request, where they're running multiple VMs on
> hosts with bonded interfaces connected to some interest switch topologies,
> where 802.3ad isn't an option. They're currently running a proprietary
> solution that effectively achieves load-balancing of VMs and bandwidth
> utilization improvements with a similar form of transmission algorithm.
> 
> Basically, each VM has it's own vlan, so it always sends its traffic out
> the same interface, unless that interface fails. Traffic gets split
> between the interfaces, maintaining a consistent path, with failover still
> available if an interface goes down.
> 
> This has been rudimetarily tested to provide similar results, suitable for
> them to use to move off their current proprietary solution. A patch for
> iproute2 is forthcoming as well, to properly support the new mode there as
> well.

> Signed-off-by: Jarod Wilson <jarod@redhat.com>
> ---
> v2: verified netlink interfaces working, added Documentation, changed
> tx hash mode name to vlan+mac for consistency and clarity.
> 
>  Documentation/networking/bonding.rst | 13 +++++++++++++
>  drivers/net/bonding/bond_main.c      | 27 +++++++++++++++++++++++++--
>  drivers/net/bonding/bond_options.c   |  1 +
>  include/linux/netdevice.h            |  1 +
>  include/uapi/linux/if_bonding.h      |  1 +
>  5 files changed, 41 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
> index adc314639085..c78ceb7630a0 100644
> --- a/Documentation/networking/bonding.rst
> +++ b/Documentation/networking/bonding.rst
> @@ -951,6 +951,19 @@ xmit_hash_policy
>  		packets will be distributed according to the encapsulated
>  		flows.
>  
> +	vlan+mac
> +
> +		This policy uses a very rudimentary vland ID and source mac
> +		ID hash to load-balance traffic per-vlan, with failover
> +		should one leg fail. The intended use case is for a bond
> +		shared by multiple virtual machines, all configured to
> +		use their own vlan, to give lacp-like functionality
> +		without requiring lacp-capable switching hardware.
> +
> +		The formula for the hash is simply
> +
> +		hash = (vlan ID) XOR (source MAC)

But in the code it's only using one byte of the MAC, currently.

I think that's fine for the particular use case but should we call out
explicitly in the commit message why it's considered sufficient?

Someone can change it later, if needed, but best if we spell out the
current motivation.

>  	The default value is layer2.  This option was added in bonding
>  	version 2.6.3.  In earlier versions of bonding, this parameter
>  	does not exist, and the layer2 policy is the only policy.  The

> +static inline u32 bond_vlan_srcmac_hash(struct sk_buff *skb)

Can we drop the inline? It's a static function called once.

> +{
> +	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);

I don't see anything in the patch making sure the interface actually
has a L2 header. Should we validate somehow the ifc is Ethernet?

> +	u32 srcmac = mac_hdr->h_source[5];
> +	u16 vlan;
> +
> +	if (!skb_vlan_tag_present(skb))
> +		return srcmac;
> +
> +	vlan = skb_vlan_tag_get(skb);
> +
> +	return srcmac ^ vlan;
> +}

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v2] bonding: add a vlan+mac tx hashing option
  2021-01-14  1:58   ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jakub Kicinski
@ 2021-01-14 21:11     ` Jarod Wilson
  2021-01-14 21:23       ` Jakub Kicinski
  2021-01-14 21:54       ` Jay Vosburgh
  0 siblings, 2 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-14 21:11 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Thomas Davis, netdev

On Wed, Jan 13, 2021 at 05:58:18PM -0800, Jakub Kicinski wrote:
> On Wed, 13 Jan 2021 17:35:48 -0500 Jarod Wilson wrote:
> > This comes from an end-user request, where they're running multiple VMs on
> > hosts with bonded interfaces connected to some interest switch topologies,
> > where 802.3ad isn't an option. They're currently running a proprietary
> > solution that effectively achieves load-balancing of VMs and bandwidth
> > utilization improvements with a similar form of transmission algorithm.
> > 
> > Basically, each VM has it's own vlan, so it always sends its traffic out
> > the same interface, unless that interface fails. Traffic gets split
> > between the interfaces, maintaining a consistent path, with failover still
> > available if an interface goes down.
> > 
> > This has been rudimetarily tested to provide similar results, suitable for
> > them to use to move off their current proprietary solution. A patch for
> > iproute2 is forthcoming as well, to properly support the new mode there as
> > well.
> 
> > Signed-off-by: Jarod Wilson <jarod@redhat.com>
> > ---
> > v2: verified netlink interfaces working, added Documentation, changed
> > tx hash mode name to vlan+mac for consistency and clarity.
> > 
> >  Documentation/networking/bonding.rst | 13 +++++++++++++
> >  drivers/net/bonding/bond_main.c      | 27 +++++++++++++++++++++++++--
> >  drivers/net/bonding/bond_options.c   |  1 +
> >  include/linux/netdevice.h            |  1 +
> >  include/uapi/linux/if_bonding.h      |  1 +
> >  5 files changed, 41 insertions(+), 2 deletions(-)
> > 
> > diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
> > index adc314639085..c78ceb7630a0 100644
> > --- a/Documentation/networking/bonding.rst
> > +++ b/Documentation/networking/bonding.rst
> > @@ -951,6 +951,19 @@ xmit_hash_policy
> >  		packets will be distributed according to the encapsulated
> >  		flows.
> >  
> > +	vlan+mac
> > +
> > +		This policy uses a very rudimentary vland ID and source mac
> > +		ID hash to load-balance traffic per-vlan, with failover
> > +		should one leg fail. The intended use case is for a bond
> > +		shared by multiple virtual machines, all configured to
> > +		use their own vlan, to give lacp-like functionality
> > +		without requiring lacp-capable switching hardware.
> > +
> > +		The formula for the hash is simply
> > +
> > +		hash = (vlan ID) XOR (source MAC)
> 
> But in the code it's only using one byte of the MAC, currently.
> 
> I think that's fine for the particular use case but should we call out
> explicitly in the commit message why it's considered sufficient?
> 
> Someone can change it later, if needed, but best if we spell out the
> current motivation.

In truth, this code started out as a copy of bond_eth_hash(), which also
only uses the last byte, though of both source and destination macs. In
the typical use case for the requesting user, the bond is formed from two
onboard NICs, which typically have adjacent mac addresses, i.e.,
AA:BB:CC:DD:EE:01 and AA:BB:CC:DD:EE:02, so only the last byte is really
relevant to hash differently, but in thinking about it, a replacement NIC
because an onboard one died could have the same last byte, and maybe we
ought to just go full source mac right off the go here.

Something like this instead maybe:

static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
{
        struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
        u32 srcmac = 0;
        u16 vlan;
        int i;

        for (i = 0; i < ETH_ALEN; i++)
                srcmac = (srcmac << 8) | mac_hdr->h_source[i];

        if (!skb_vlan_tag_present(skb))
                return srcmac;

        vlan = skb_vlan_tag_get(skb);

        return vlan ^ srcmac;
}

Then the documentation is spot-on, and we're future-proof, though
marginally less performant in calculating the hash, which may have been a
consideration when the original function was written, but is probably
basically irrelevant w/modern systems...

> >  	The default value is layer2.  This option was added in bonding
> >  	version 2.6.3.  In earlier versions of bonding, this parameter
> >  	does not exist, and the layer2 policy is the only policy.  The
> 
> > +static inline u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
> 
> Can we drop the inline? It's a static function called once.

Works for me. That was also inherited by copying bond_eth_hash(). :)

> > +{
> > +	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
> 
> I don't see anything in the patch making sure the interface actually
> has a L2 header. Should we validate somehow the ifc is Ethernet?

I don't think it's necessary. There doesn't appear to be any explicit
check for BOND_XMIT_POLICY_LAYER2 either. I believe we're guaranteed to
not have anything but an ethernet header here, as the only other type I'm
aware of being supported is Infiniband, but we limit that to active-backup
only, and xmit_hash_policy isn't valid for active-backup.

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v2] bonding: add a vlan+mac tx hashing option
  2021-01-14 21:11     ` Jarod Wilson
@ 2021-01-14 21:23       ` Jakub Kicinski
  2021-01-14 21:42         ` Jarod Wilson
  2021-01-14 21:54       ` Jay Vosburgh
  1 sibling, 1 reply; 30+ messages in thread
From: Jakub Kicinski @ 2021-01-14 21:23 UTC (permalink / raw)
  To: Jarod Wilson
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Thomas Davis, netdev

On Thu, 14 Jan 2021 16:11:41 -0500 Jarod Wilson wrote:
> In truth, this code started out as a copy of bond_eth_hash(), which also
> only uses the last byte, though of both source and destination macs. In
> the typical use case for the requesting user, the bond is formed from two
> onboard NICs, which typically have adjacent mac addresses, i.e.,
> AA:BB:CC:DD:EE:01 and AA:BB:CC:DD:EE:02, so only the last byte is really
> relevant to hash differently, but in thinking about it, a replacement NIC
> because an onboard one died could have the same last byte, and maybe we
> ought to just go full source mac right off the go here.
> 
> Something like this instead maybe:
> 
> static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
> {
>         struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
>         u32 srcmac = 0;
>         u16 vlan;
>         int i;
> 
>         for (i = 0; i < ETH_ALEN; i++)
>                 srcmac = (srcmac << 8) | mac_hdr->h_source[i];
> 
>         if (!skb_vlan_tag_present(skb))
>                 return srcmac;
> 
>         vlan = skb_vlan_tag_get(skb);
> 
>         return vlan ^ srcmac;
> }
> 
> Then the documentation is spot-on, and we're future-proof, though
> marginally less performant in calculating the hash, which may have been a
> consideration when the original function was written, but is probably
> basically irrelevant w/modern systems...

No preference, especially if bond_eth_hash() already uses the last byte.
Just make sure the choice is explained in the commit message.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v2] bonding: add a vlan+mac tx hashing option
  2021-01-14 21:23       ` Jakub Kicinski
@ 2021-01-14 21:42         ` Jarod Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-14 21:42 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Thomas Davis, netdev

On Thu, Jan 14, 2021 at 01:23:14PM -0800, Jakub Kicinski wrote:
> On Thu, 14 Jan 2021 16:11:41 -0500 Jarod Wilson wrote:
> > In truth, this code started out as a copy of bond_eth_hash(), which also
> > only uses the last byte, though of both source and destination macs. In
> > the typical use case for the requesting user, the bond is formed from two
> > onboard NICs, which typically have adjacent mac addresses, i.e.,
> > AA:BB:CC:DD:EE:01 and AA:BB:CC:DD:EE:02, so only the last byte is really
> > relevant to hash differently, but in thinking about it, a replacement NIC
> > because an onboard one died could have the same last byte, and maybe we
> > ought to just go full source mac right off the go here.
> > 
> > Something like this instead maybe:
> > 
> > static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
> > {
> >         struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
> >         u32 srcmac = 0;
> >         u16 vlan;
> >         int i;
> > 
> >         for (i = 0; i < ETH_ALEN; i++)
> >                 srcmac = (srcmac << 8) | mac_hdr->h_source[i];
> > 
> >         if (!skb_vlan_tag_present(skb))
> >                 return srcmac;
> > 
> >         vlan = skb_vlan_tag_get(skb);
> > 
> >         return vlan ^ srcmac;
> > }
> > 
> > Then the documentation is spot-on, and we're future-proof, though
> > marginally less performant in calculating the hash, which may have been a
> > consideration when the original function was written, but is probably
> > basically irrelevant w/modern systems...
> 
> No preference, especially if bond_eth_hash() already uses the last byte.
> Just make sure the choice is explained in the commit message.

I've sold myself on using the full MAC, because if there's no vlan tag
present, mac is the only thing used for the hash, increasing the chances
of getting the same hash for two different interfaces, which won't happen
if we've got the full MAC. Of course, I'm not sure why someone would be
using this xmit hash outside of the very particular use-case that includes
VLANs, but people do strange things...

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v2] bonding: add a vlan+mac tx hashing option
  2021-01-14 21:11     ` Jarod Wilson
  2021-01-14 21:23       ` Jakub Kicinski
@ 2021-01-14 21:54       ` Jay Vosburgh
  2021-01-15 15:08         ` Jarod Wilson
  1 sibling, 1 reply; 30+ messages in thread
From: Jay Vosburgh @ 2021-01-14 21:54 UTC (permalink / raw)
  To: Jarod Wilson
  Cc: Jakub Kicinski, linux-kernel, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Thomas Davis, netdev

Jarod Wilson <jarod@redhat.com> wrote:

>On Wed, Jan 13, 2021 at 05:58:18PM -0800, Jakub Kicinski wrote:
>> On Wed, 13 Jan 2021 17:35:48 -0500 Jarod Wilson wrote:
>> > This comes from an end-user request, where they're running multiple VMs on
>> > hosts with bonded interfaces connected to some interest switch topologies,
>> > where 802.3ad isn't an option. They're currently running a proprietary
>> > solution that effectively achieves load-balancing of VMs and bandwidth
>> > utilization improvements with a similar form of transmission algorithm.
>> > 
>> > Basically, each VM has it's own vlan, so it always sends its traffic out
>> > the same interface, unless that interface fails. Traffic gets split
>> > between the interfaces, maintaining a consistent path, with failover still
>> > available if an interface goes down.
>> > 
>> > This has been rudimetarily tested to provide similar results, suitable for
>> > them to use to move off their current proprietary solution. A patch for
>> > iproute2 is forthcoming as well, to properly support the new mode there as
>> > well.
>> 
>> > Signed-off-by: Jarod Wilson <jarod@redhat.com>
>> > ---
>> > v2: verified netlink interfaces working, added Documentation, changed
>> > tx hash mode name to vlan+mac for consistency and clarity.
>> > 
>> >  Documentation/networking/bonding.rst | 13 +++++++++++++
>> >  drivers/net/bonding/bond_main.c      | 27 +++++++++++++++++++++++++--
>> >  drivers/net/bonding/bond_options.c   |  1 +
>> >  include/linux/netdevice.h            |  1 +
>> >  include/uapi/linux/if_bonding.h      |  1 +
>> >  5 files changed, 41 insertions(+), 2 deletions(-)
>> > 
>> > diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
>> > index adc314639085..c78ceb7630a0 100644
>> > --- a/Documentation/networking/bonding.rst
>> > +++ b/Documentation/networking/bonding.rst
>> > @@ -951,6 +951,19 @@ xmit_hash_policy
>> >  		packets will be distributed according to the encapsulated
>> >  		flows.
>> >  
>> > +	vlan+mac

	I notice that the code calls it "VLAN_SRCMAC" but the
user-facing nomenclature is "vlan+mac"; I tend to lean towards having
the user visible name also be "vlan+srcmac".  Both for consistency, and
just in case someone someday wants "vlan+dstmac".  And you did ask for
preference on this in a separate email.

>> > +		This policy uses a very rudimentary vland ID and source mac
>> > +		ID hash to load-balance traffic per-vlan, with failover
>> > +		should one leg fail. The intended use case is for a bond
>> > +		shared by multiple virtual machines, all configured to
>> > +		use their own vlan, to give lacp-like functionality
>> > +		without requiring lacp-capable switching hardware.
>> > +
>> > +		The formula for the hash is simply
>> > +
>> > +		hash = (vlan ID) XOR (source MAC)
>> 
>> But in the code it's only using one byte of the MAC, currently.
>> 
>> I think that's fine for the particular use case but should we call out
>> explicitly in the commit message why it's considered sufficient?
>> 
>> Someone can change it later, if needed, but best if we spell out the
>> current motivation.
>
>In truth, this code started out as a copy of bond_eth_hash(), which also
>only uses the last byte, though of both source and destination macs. In
>the typical use case for the requesting user, the bond is formed from two
>onboard NICs, which typically have adjacent mac addresses, i.e.,
>AA:BB:CC:DD:EE:01 and AA:BB:CC:DD:EE:02, so only the last byte is really
>relevant to hash differently, but in thinking about it, a replacement NIC
>because an onboard one died could have the same last byte, and maybe we
>ought to just go full source mac right off the go here.

	Yah, the existing L2 hash is pretty weak.  It might be possible
to squeeze this into the existing bond_xmit_hash a bit better, if the
hash is two u32s.  The first being the first 32 bits of the MAC, and the
second being the last 16 bits of the MAC combined with the 16 bit VLAN
tag.

	There's already logic at the end of bond_xmit_hash to reduce a
u32 into the final hash that perhaps could be leveraged.  

	Thinking about it, though, all the ways to combine that data
together end up being pretty vile ("*(u32 *)&ethhdr->h_source[0]" sorts
of things).

>Something like this instead maybe:
>
>static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
>{
>        struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
>        u32 srcmac = 0;
>        u16 vlan;
>        int i;
>
>        for (i = 0; i < ETH_ALEN; i++)
>                srcmac = (srcmac << 8) | mac_hdr->h_source[i];

	I think this will shift h_source[0] and [1] into oblivion.

>        if (!skb_vlan_tag_present(skb))
>                return srcmac;
>
>        vlan = skb_vlan_tag_get(skb);
>
>        return vlan ^ srcmac;
>}
>
>Then the documentation is spot-on, and we're future-proof, though
>marginally less performant in calculating the hash, which may have been a
>consideration when the original function was written, but is probably
>basically irrelevant w/modern systems...
>
>> >  	The default value is layer2.  This option was added in bonding
>> >  	version 2.6.3.  In earlier versions of bonding, this parameter
>> >  	does not exist, and the layer2 policy is the only policy.  The
>> 
>> > +static inline u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
>> 
>> Can we drop the inline? It's a static function called once.
>
>Works for me. That was also inherited by copying bond_eth_hash(). :)
>
>> > +{
>> > +	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
>> 
>> I don't see anything in the patch making sure the interface actually
>> has a L2 header. Should we validate somehow the ifc is Ethernet?
>
>I don't think it's necessary. There doesn't appear to be any explicit
>check for BOND_XMIT_POLICY_LAYER2 either. I believe we're guaranteed to
>not have anything but an ethernet header here, as the only other type I'm
>aware of being supported is Infiniband, but we limit that to active-backup
>only, and xmit_hash_policy isn't valid for active-backup.

	This is correct, interfaces in a bond other than active-backup
will all be ARPHRD_ETHER.  I'm unaware of a way to get a packet in there
without at least an Ethernet header.

	-J

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply	[flat|nested] 30+ messages in thread

* question about bonding mode 4
  2021-01-08 13:12     ` Jiri Pirko
  2021-01-08 15:21       ` Jarod Wilson
@ 2021-01-15  2:02       ` moyufeng
  2021-01-23  6:10         ` moyufeng
  1 sibling, 1 reply; 30+ messages in thread
From: moyufeng @ 2021-01-15  2:02 UTC (permalink / raw)
  To: Jiri Pirko, Jay Vosburgh
  Cc: lipeng (Y),
	linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev, linuxarm, Salil Mehta

Hi Team,

I have a question about bonding. During testing bonding mode 4
scenarios, I find that there is a very low probability that
the pointer is null. The following information is displayed:

[99359.795934] bond0: (slave eth13.2001): Port 2 did not find a suitable aggregator
[99359.796960] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
[99359.798127] Mem abort info:
[99359.798526]   ESR = 0x96000004
[99359.798938]   EC = 0x25: DABT (current EL), IL = 32 bits
[99359.799673]   SET = 0, FnV = 0
[99359.800106]   EA = 0, S1PTW = 0
[99359.800554] Data abort info:
[99359.800952]   ISV = 0, ISS = 0x00000004
[99359.801522]   CM = 0, WnR = 0
[99359.801970] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000c64e6000
[99359.802876] [0000000000000020] pgd=0000000000000000
[99359.803555] Internal error: Oops: 96000004 [#1] PREEMPT SMP
[99359.804369] Modules linked in: bonding hns3(-) hclgevf hnae3 [last unloaded: bonding]
[99359.805494] CPU: 1 PID: 951 Comm: kworker/u10:2 Not tainted 5.7.0-rc4+ #1
[99359.806455] Hardware name: linux,dummy-virt (DT)
[99359.807107] Workqueue: bond0 bond_3ad_state_machine_handler [bonding]
[99359.808056] pstate: 60c00005 (nZCv daif +PAN +UAO)
[99359.808722] pc : bond_3ad_state_machine_handler+0x7fc/0xdb8 [bonding]
[99359.809652] lr : bond_3ad_state_machine_handler+0x7f4/0xdb8 [bonding]
[99359.810535] sp : ffff80001882bd20
[99359.811012] x29: ffff80001882bd20 x28: ffff000085939a38
[99359.811791] x27: ffff00008649bb68 x26: 00000000aaaaaaab
[99359.812871] x25: ffff800009401000 x24: ffff800009408de4
[99359.814049] x23: ffff80001882bd98 x22: ffff00008649b880
[99359.815210] x21: 0000000000000000 x20: ffff000085939a00
[99359.816401] x19: ffff00008649b880 x18: ffff800012572988
[99359.817637] x17: 0000000000000000 x16: 0000000000000000
[99359.818870] x15: ffff80009882b987 x14: 726f746167657267
[99359.820090] x13: 676120656c626174 x12: 697573206120646e
[99359.821374] x11: 696620746f6e2064 x10: 696420322074726f
[99359.822659] x9 : 50203a2931303032 x8 : 0000000000081391
[99359.823891] x7 : ffff8000108e3ad0 x6 : ffff8000128858bb
[99359.825109] x5 : 0000000000000000 x4 : 0000000000000000
[99359.826262] x3 : 00000000ffffffff x2 : 906b329bb5362a00
[99359.827394] x1 : 906b329bb5362a00 x0 : 0000000000000000
[99359.828540] Call trace:
[99359.829071]  bond_3ad_state_machine_handler+0x7fc/0xdb8 [bonding]
[99359.830367]  process_one_work+0x15c/0x4a0
[99359.831216]  worker_thread+0x50/0x478
[99359.832022]  kthread+0x130/0x160
[99359.832716]  ret_from_fork+0x10/0x18
[99359.833487] Code: 910c0021 95f704bb f9403f80 b5ffe300 (f9401000)
[99359.834742] ---[ end trace c7a8e02914afc4e0 ]---
[99359.835817] Kernel panic - not syncing: Fatal exception in interrupt
[99359.837334] SMP: stopping secondary CPUs
[99359.838277] Kernel Offset: disabled
[99359.839086] CPU features: 0x080002,22208218
[99359.840053] Memory Limit: none
[99359.840783] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

The test procedure is as follows:
1. Configure bonding and set it to mode 4.
    echo "4" > /sys/class/net/bond0/bonding/mode
    ifconfig bond0 up

2. Configure two VLANs and add them to the bonding in step 1.
    vconfig add eth0 2001
    vconfig add eth1 2001
    ifenslave bond0 eth0.2001 eth1.2001

3. Unload the network device driver and bonding driver.
    rmmod hns3
    rmmod hclge
    rmmod hnae3
    rmmod bonding.ko

4. Repeat the preceding steps for a long time.

By checking the logic in ad_port_selection_logic(), I find that
if enter the branch "Port %d did not find a suitable aggregator",
the value of port->aggregator will be NULL, causing the problem.

So I'd like to ask what circumstances will be involved in this
branch, and what should be done in this case?


The detailed code analysis is as follows:

static void ad_port_selection_logic(struct port *port, bool *update_slave_arr)
{
[...]
	/* if the port is connected to other aggregator, detach it */
	if (port->aggregator) {
		/* detach the port from its former aggregator */
		temp_aggregator = port->aggregator;
		for (curr_port = temp_aggregator->lag_ports; curr_port;
		     last_port = curr_port,
		     curr_port = curr_port->next_port_in_aggregator) {
			if (curr_port == port) {
				temp_aggregator->num_of_ports--;
				/* if it is the first port attached to the
				 * aggregator
				 */
				if (!last_port) {
					temp_aggregator->lag_ports =
						port->next_port_in_aggregator;
				} else {
					/* not the first port attached to the
					 * aggregator
					 */
					last_port->next_port_in_aggregator =
						port->next_port_in_aggregator;
				}

				/* clear the port's relations to this
				 * aggregator
				 */
				port->aggregator = NULL;

----analysis: set port->aggregator NULL at the beginning of ad_port_selection_logic().

				port->next_port_in_aggregator = NULL;
				port->actor_port_aggregator_identifier = 0;

				slave_dbg(bond->dev, port->slave->dev, "Port %d left LAG %d\n",
					  port->actor_port_number,
					  temp_aggregator->aggregator_identifier);
				/* if the aggregator is empty, clear its
				 * parameters, and set it ready to be attached
				 */
				if (!temp_aggregator->lag_ports)
					ad_clear_agg(temp_aggregator);
				break;
			}
		}
		if (!curr_port) {
			/* meaning: the port was related to an aggregator
			 * but was not on the aggregator port list
			 */
			net_warn_ratelimited("%s: (slave %s): Warning: Port %d was related to aggregator %d but was not on its port list\n",
					     port->slave->bond->dev->name,
					     port->slave->dev->name,
					     port->actor_port_number,
					     port->aggregator->aggregator_identifier);
		}
	}
	/* search on all aggregators for a suitable aggregator for this port */
	bond_for_each_slave(bond, slave, iter) {
		aggregator = &(SLAVE_AD_INFO(slave)->aggregator);
		/* keep a free aggregator for later use(if needed) */
		if (!aggregator->lag_ports) {
			if (!free_aggregator)
				free_aggregator = aggregator;

----analysis: Save free_aggregator if found a free aggregator. But in this case, no free aggregator is available.

			continue;
		}
		/* check if current aggregator suits us */
		if (((aggregator->actor_oper_aggregator_key == port->actor_oper_port_key) && /* if all parameters match AND */
		     MAC_ADDRESS_EQUAL(&(aggregator->partner_system), &(port->partner_oper.system)) &&
		     (aggregator->partner_system_priority == port->partner_oper.system_priority) &&
		     (aggregator->partner_oper_aggregator_key == port->partner_oper.key)
		    ) &&
		    ((!MAC_ADDRESS_EQUAL(&(port->partner_oper.system), &(null_mac_addr)) && /* partner answers */
		      !aggregator->is_individual)  /* but is not individual OR */
		    )
		   ) {
			/* attach to the founded aggregator */
			port->aggregator = aggregator;

----analysis: If a proper aggregator is found, port->aggregator is assigned here.

			port->actor_port_aggregator_identifier =
				port->aggregator->aggregator_identifier;
			port->next_port_in_aggregator = aggregator->lag_ports;
			port->aggregator->num_of_ports++;
			aggregator->lag_ports = port;
			slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n",
				  port->actor_port_number,
				  port->aggregator->aggregator_identifier);

			/* mark this port as selected */
			port->sm_vars |= AD_PORT_SELECTED;
			found = 1;

----analysis: Set found to 1 if port->aggregator is assigned.

			break;
		}
	}
	/* the port couldn't find an aggregator - attach it to a new
	 * aggregator
	 */
	if (!found) {
		if (free_aggregator) {
			/* assign port a new aggregator */
			port->aggregator = free_aggregator;

----analysis: No proper free_aggregator is found. Therefore, port->aggregator cannot be assigned here.

			port->actor_port_aggregator_identifier =
				port->aggregator->aggregator_identifier;

			/* update the new aggregator's parameters
			 * if port was responsed from the end-user
			 */
			if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)
				/* if port is full duplex */
				port->aggregator->is_individual = false;
			else
				port->aggregator->is_individual = true;

			port->aggregator->actor_admin_aggregator_key =
				port->actor_admin_port_key;
			port->aggregator->actor_oper_aggregator_key =
				port->actor_oper_port_key;
			port->aggregator->partner_system =
				port->partner_oper.system;
			port->aggregator->partner_system_priority =
				port->partner_oper.system_priority;
			port->aggregator->partner_oper_aggregator_key = port->partner_oper.key;
			port->aggregator->receive_state = 1;
			port->aggregator->transmit_state = 1;
			port->aggregator->lag_ports = port;
			port->aggregator->num_of_ports++;

			/* mark this port as selected */
			port->sm_vars |= AD_PORT_SELECTED;

			slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n",
				  port->actor_port_number,
				  port->aggregator->aggregator_identifier);
		} else {
			slave_err(bond->dev, port->slave->dev,
				  "Port %d did not find a suitable aggregator\n",
				  port->actor_port_number);

----analysis: The fault scenario goes to this branch, and port->aggregator remains NULL.

		}
	}
	/* if all aggregator's ports are READY_N == TRUE, set ready=TRUE
	 * in all aggregator's ports, else set ready=FALSE in all
	 * aggregator's ports
	 */
	__set_agg_ports_ready(port->aggregator,
			      __agg_ports_are_ready(port->aggregator));

----analysis: port->aggregator is still NULL, which causes problem.


	aggregator = __get_first_agg(port);
	ad_agg_selection_logic(aggregator, update_slave_arr);

	if (!port->aggregator->is_active)
		port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION;
}


Thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v2] bonding: add a vlan+mac tx hashing option
  2021-01-14 21:54       ` Jay Vosburgh
@ 2021-01-15 15:08         ` Jarod Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-15 15:08 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Jakub Kicinski, linux-kernel, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Thomas Davis, netdev

On Thu, Jan 14, 2021 at 01:54:31PM -0800, Jay Vosburgh wrote:
> Jarod Wilson <jarod@redhat.com> wrote:
> 
> >On Wed, Jan 13, 2021 at 05:58:18PM -0800, Jakub Kicinski wrote:
> >> On Wed, 13 Jan 2021 17:35:48 -0500 Jarod Wilson wrote:
> >> > This comes from an end-user request, where they're running multiple VMs on
> >> > hosts with bonded interfaces connected to some interest switch topologies,
> >> > where 802.3ad isn't an option. They're currently running a proprietary
> >> > solution that effectively achieves load-balancing of VMs and bandwidth
> >> > utilization improvements with a similar form of transmission algorithm.
> >> > 
> >> > Basically, each VM has it's own vlan, so it always sends its traffic out
> >> > the same interface, unless that interface fails. Traffic gets split
> >> > between the interfaces, maintaining a consistent path, with failover still
> >> > available if an interface goes down.
> >> > 
> >> > This has been rudimetarily tested to provide similar results, suitable for
> >> > them to use to move off their current proprietary solution. A patch for
> >> > iproute2 is forthcoming as well, to properly support the new mode there as
> >> > well.
> >> 
> >> > Signed-off-by: Jarod Wilson <jarod@redhat.com>
> >> > ---
> >> > v2: verified netlink interfaces working, added Documentation, changed
> >> > tx hash mode name to vlan+mac for consistency and clarity.
> >> > 
> >> >  Documentation/networking/bonding.rst | 13 +++++++++++++
> >> >  drivers/net/bonding/bond_main.c      | 27 +++++++++++++++++++++++++--
> >> >  drivers/net/bonding/bond_options.c   |  1 +
> >> >  include/linux/netdevice.h            |  1 +
> >> >  include/uapi/linux/if_bonding.h      |  1 +
> >> >  5 files changed, 41 insertions(+), 2 deletions(-)
> >> > 
> >> > diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
> >> > index adc314639085..c78ceb7630a0 100644
> >> > --- a/Documentation/networking/bonding.rst
> >> > +++ b/Documentation/networking/bonding.rst
> >> > @@ -951,6 +951,19 @@ xmit_hash_policy
> >> >  		packets will be distributed according to the encapsulated
> >> >  		flows.
> >> >  
> >> > +	vlan+mac
> 
> 	I notice that the code calls it "VLAN_SRCMAC" but the
> user-facing nomenclature is "vlan+mac"; I tend to lean towards having
> the user visible name also be "vlan+srcmac".  Both for consistency, and
> just in case someone someday wants "vlan+dstmac".  And you did ask for
> preference on this in a separate email.

That's valid. I was trying to keep it short, but it does muddy the waters
a bit by not including src. I'll adjust accordingly and resend the
userspace bit too.

...
> 	Yah, the existing L2 hash is pretty weak.  It might be possible
> to squeeze this into the existing bond_xmit_hash a bit better, if the
> hash is two u32s.  The first being the first 32 bits of the MAC, and the
> second being the last 16 bits of the MAC combined with the 16 bit VLAN
> tag.
> 
> 	There's already logic at the end of bond_xmit_hash to reduce a
> u32 into the final hash that perhaps could be leveraged.  
> 
> 	Thinking about it, though, all the ways to combine that data
> together end up being pretty vile ("*(u32 *)&ethhdr->h_source[0]" sorts
> of things).

Yeah, I'd worry that bond_xmit_hash() is already getting a bit complicated
to follow and understand, and that would make it even more so.

> >Something like this instead maybe:
> >
> >static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
> >{
> >        struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
> >        u32 srcmac = 0;
> >        u16 vlan;
> >        int i;
> >
> >        for (i = 0; i < ETH_ALEN; i++)
> >                srcmac = (srcmac << 8) | mac_hdr->h_source[i];
> 
> 	I think this will shift h_source[0] and [1] into oblivion.

Argh, yep, 48 bits don't fit into a u32. Okay, so I'll replace that with a
u32 srcmac_vendor and u32 srcmac_dev, but they'll only have 24 bits of data
in them, then return vlan ^ srcmac_vendor ^ srcmac_dev, I think.

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH iproute2] bond: support xmit_hash_policy=vlan+mac
  2021-01-13 23:41   ` [PATCH iproute2] bond: support xmit_hash_policy=vlan+mac Jarod Wilson
@ 2021-01-15 15:12     ` Jarod Wilson
  2021-01-15 19:21     ` [PATCH iproute2 v2] bond: support xmit_hash_policy=vlan+srcmac Jarod Wilson
  1 sibling, 0 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-15 15:12 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Jay Vosburgh

On Wed, Jan 13, 2021 at 06:41:17PM -0500, Jarod Wilson wrote:
> There's a new transmit hash policy being added to the bonding driver that
> is a simple XOR of vlan ID and source MAC, xmit_hash_policy vlan+mac. This
> trivial patch makes it configurable and queryable via iproute2.
> 
> $ sudo modprobe bonding mode=2 max_bonds=1 xmit_hash_policy=0
> 
> $ sudo ip link set bond0 type bond xmit_hash_policy vlan+mac
> 
> $ ip -d link show bond0
> 11: bond0: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
>     link/ether ce:85:5e:24:ce:90 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535
>     bond mode balance-xor miimon 0 updelay 0 downdelay 0 peer_notify_delay 0 use_carrier 1 arp_interval 0 arp_validate none arp_all_targets any
> primary_reselect always fail_over_mac none xmit_hash_policy vlan+mac resend_igmp 1 num_grat_arp 1 all_slaves_active 0 min_links 0 lp_interval 1
> packets_per_slave 1 lacp_rate slow ad_select stable tlb_dynamic_lb 1 addrgenmode eui64 numtxqueues 16 numrxqueues 16 gso_max_size 65536 gso_max_segs
> 65535
> 
> $ grep Hash /proc/net/bonding/bond0
> Transmit Hash Policy: vlan+mac (5)
> 
> $ sudo ip link add test type bond help
> Usage: ... bond [ mode BONDMODE ] [ active_slave SLAVE_DEV ]
>                 [ clear_active_slave ] [ miimon MIIMON ]
>                 [ updelay UPDELAY ] [ downdelay DOWNDELAY ]
>                 [ peer_notify_delay DELAY ]
>                 [ use_carrier USE_CARRIER ]
>                 [ arp_interval ARP_INTERVAL ]
>                 [ arp_validate ARP_VALIDATE ]
>                 [ arp_all_targets ARP_ALL_TARGETS ]
>                 [ arp_ip_target [ ARP_IP_TARGET, ... ] ]
>                 [ primary SLAVE_DEV ]
>                 [ primary_reselect PRIMARY_RESELECT ]
>                 [ fail_over_mac FAIL_OVER_MAC ]
>                 [ xmit_hash_policy XMIT_HASH_POLICY ]
>                 [ resend_igmp RESEND_IGMP ]
>                 [ num_grat_arp|num_unsol_na NUM_GRAT_ARP|NUM_UNSOL_NA ]
>                 [ all_slaves_active ALL_SLAVES_ACTIVE ]
>                 [ min_links MIN_LINKS ]
>                 [ lp_interval LP_INTERVAL ]
>                 [ packets_per_slave PACKETS_PER_SLAVE ]
>                 [ tlb_dynamic_lb TLB_DYNAMIC_LB ]
>                 [ lacp_rate LACP_RATE ]
>                 [ ad_select AD_SELECT ]
>                 [ ad_user_port_key PORTKEY ]
>                 [ ad_actor_sys_prio SYSPRIO ]
>                 [ ad_actor_system LLADDR ]
> 
> BONDMODE := balance-rr|active-backup|balance-xor|broadcast|802.3ad|balance-tlb|balance-alb
> ARP_VALIDATE := none|active|backup|all
> ARP_ALL_TARGETS := any|all
> PRIMARY_RESELECT := always|better|failure
> FAIL_OVER_MAC := none|active|follow
> XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlan+mac
> LACP_RATE := slow|fast
> AD_SELECT := stable|bandwidth|count
> 
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Cc: Jay Vosburgh <j.vosburgh@gmail.com>
> Signed-off-by: Jarod Wilson <jarod@redhat.com>

Self-nack on this version, renaming the mode to vlan+srcmac as discussed.

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH net-next v3] bonding: add a vlan+srcmac tx hashing option
  2021-01-13 22:35 ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jarod Wilson
  2021-01-13 23:41   ` [PATCH iproute2] bond: support xmit_hash_policy=vlan+mac Jarod Wilson
  2021-01-14  1:58   ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jakub Kicinski
@ 2021-01-15 19:21   ` Jarod Wilson
  2021-01-18 23:10     ` David Ahern
  2021-01-19  1:09     ` [PATCH net-next v4] " Jarod Wilson
  2 siblings, 2 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-15 19:21 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jarod Wilson, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

This comes from an end-user request, where they're running multiple VMs on
hosts with bonded interfaces connected to some interest switch topologies,
where 802.3ad isn't an option. They're currently running a proprietary
solution that effectively achieves load-balancing of VMs and bandwidth
utilization improvements with a similar form of transmission algorithm.

Basically, each VM has it's own vlan, so it always sends its traffic out
the same interface, unless that interface fails. Traffic gets split
between the interfaces, maintaining a consistent path, with failover still
available if an interface goes down.

Unlike bond_eth_hash(), this hash function is using the full source MAC
address instead of just the last byte, as there are so few components to
the hash, and in the no-vlan case, we would be returning just the last
byte of the source MAC as the hash value. It's entirely possible to have
two NICs in a bond with the same last byte of their MAC, but not the same
MAC, so this adjustment should guarantee distinct hashes in all cases.

This has been rudimetarily tested to provide similar results to the
proprietary solution it is aiming to replace. A patch for iproute2 is also
posted, to properly support the new mode there as well.

Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Thomas Davis <tadavis@lbl.gov>
Cc: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
---
v2: verified netlink interfaces working, added Documentation, changed
tx hash mode name to vlan+mac for consistency and clarity.
v3: drop inline from hash function, use full source MAC, not just the
last byte, expand explanation in patch description, extend hash name to
vlan+srcmac.

 Documentation/networking/bonding.rst | 13 +++++++++++
 drivers/net/bonding/bond_main.c      | 34 ++++++++++++++++++++++++++--
 drivers/net/bonding/bond_options.c   | 13 ++++++-----
 include/linux/netdevice.h            |  1 +
 include/uapi/linux/if_bonding.h      |  1 +
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index adc314639085..36562dcd3e1e 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -951,6 +951,19 @@ xmit_hash_policy
 		packets will be distributed according to the encapsulated
 		flows.
 
+	vlan+srcmac
+
+		This policy uses a very rudimentary vland ID and source mac
+		ID hash to load-balance traffic per-vlan, with failover
+		should one leg fail. The intended use case is for a bond
+		shared by multiple virtual machines, all configured to
+		use their own vlan, to give lacp-like functionality
+		without requiring lacp-capable switching hardware.
+
+		The formula for the hash is simply
+
+		hash = (vlan ID) XOR (source MAC vendor) XOR (source MAC dev)
+
 	The default value is layer2.  This option was added in bonding
 	version 2.6.3.  In earlier versions of bonding, this parameter
 	does not exist, and the layer2 policy is the only policy.  The
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5fe5232cc3f3..d4bc4d4e953b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -164,7 +164,7 @@ module_param(xmit_hash_policy, charp, 0);
 MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
 				   "0 for layer 2 (default), 1 for layer 3+4, "
 				   "2 for layer 2+3, 3 for encap layer 2+3, "
-				   "4 for encap layer 3+4");
+				   "4 for encap layer 3+4, 5 for vlan+srcmac");
 module_param(arp_interval, int, 0);
 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
 module_param_array(arp_ip_target, charp, NULL, 0);
@@ -1434,6 +1434,8 @@ static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
 		return NETDEV_LAG_HASH_E23;
 	case BOND_XMIT_POLICY_ENCAP34:
 		return NETDEV_LAG_HASH_E34;
+	case BOND_XMIT_POLICY_VLAN_SRCMAC:
+		return NETDEV_LAG_HASH_VLAN_SRCMAC;
 	default:
 		return NETDEV_LAG_HASH_UNKNOWN;
 	}
@@ -3494,6 +3496,27 @@ static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
 	return true;
 }
 
+static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
+{
+	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
+	u32 srcmac_vendor = 0, srcmac_dev = 0;
+	u16 vlan;
+	int i;
+
+	for (i = 0; i < 3; i++)
+		srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i];
+
+	for (i = 3; i < ETH_ALEN; i++)
+		srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i];
+
+	if (!skb_vlan_tag_present(skb))
+		return srcmac_vendor ^ srcmac_dev;
+
+	vlan = skb_vlan_tag_get(skb);
+
+	return vlan ^ srcmac_vendor ^ srcmac_dev;
+}
+
 /* Extract the appropriate headers based on bond's xmit policy */
 static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 			      struct flow_keys *fk)
@@ -3501,10 +3524,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
 	int noff, proto = -1;
 
-	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
+	switch (bond->params.xmit_policy) {
+	case BOND_XMIT_POLICY_ENCAP23:
+	case BOND_XMIT_POLICY_ENCAP34:
 		memset(fk, 0, sizeof(*fk));
 		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
 					  fk, NULL, 0, 0, 0, 0);
+	default:
+		break;
 	}
 
 	fk->ports.ports = 0;
@@ -3556,6 +3583,9 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 	    skb->l4_hash)
 		return skb->hash;
 
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
+		return bond_vlan_srcmac_hash(skb);
+
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
 	    !bond_flow_dissect(bond, skb, &flow))
 		return bond_eth_hash(skb);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index a4e4e15f574d..c69400c5bf07 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -96,12 +96,13 @@ static const struct bond_opt_value bond_pps_tbl[] = {
 };
 
 static const struct bond_opt_value bond_xmit_hashtype_tbl[] = {
-	{ "layer2",   BOND_XMIT_POLICY_LAYER2, BOND_VALFLAG_DEFAULT},
-	{ "layer3+4", BOND_XMIT_POLICY_LAYER34, 0},
-	{ "layer2+3", BOND_XMIT_POLICY_LAYER23, 0},
-	{ "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0},
-	{ "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0},
-	{ NULL,       -1,                       0},
+	{ "layer2",      BOND_XMIT_POLICY_LAYER2,      BOND_VALFLAG_DEFAULT},
+	{ "layer3+4",    BOND_XMIT_POLICY_LAYER34,     0},
+	{ "layer2+3",    BOND_XMIT_POLICY_LAYER23,     0},
+	{ "encap2+3",    BOND_XMIT_POLICY_ENCAP23,     0},
+	{ "encap3+4",    BOND_XMIT_POLICY_ENCAP34,     0},
+	{ "vlan+srcmac", BOND_XMIT_POLICY_VLAN_SRCMAC, 0},
+	{ NULL,          -1,                           0},
 };
 
 static const struct bond_opt_value bond_arp_validate_tbl[] = {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5b949076ed23..a94ce80a2fe1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2615,6 +2615,7 @@ enum netdev_lag_hash {
 	NETDEV_LAG_HASH_L23,
 	NETDEV_LAG_HASH_E23,
 	NETDEV_LAG_HASH_E34,
+	NETDEV_LAG_HASH_VLAN_SRCMAC,
 	NETDEV_LAG_HASH_UNKNOWN,
 };
 
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index 45f3750aa861..e8eb4ad03cf1 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -94,6 +94,7 @@
 #define BOND_XMIT_POLICY_LAYER23	2 /* layer 2+3 (IP ^ MAC) */
 #define BOND_XMIT_POLICY_ENCAP23	3 /* encapsulated layer 2+3 */
 #define BOND_XMIT_POLICY_ENCAP34	4 /* encapsulated layer 3+4 */
+#define BOND_XMIT_POLICY_VLAN_SRCMAC	5 /* vlan + source MAC */
 
 /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */
 #define LACP_STATE_LACP_ACTIVITY   0x1
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH iproute2 v2] bond: support xmit_hash_policy=vlan+srcmac
  2021-01-13 23:41   ` [PATCH iproute2] bond: support xmit_hash_policy=vlan+mac Jarod Wilson
  2021-01-15 15:12     ` Jarod Wilson
@ 2021-01-15 19:21     ` Jarod Wilson
  2021-01-23 18:35       ` David Ahern
  1 sibling, 1 reply; 30+ messages in thread
From: Jarod Wilson @ 2021-01-15 19:21 UTC (permalink / raw)
  To: netdev; +Cc: Jarod Wilson, Stephen Hemminger, Jay Vosburgh

There's a new transmit hash policy being added to the bonding driver that
is a simple XOR of vlan ID and source MAC, xmit_hash_policy vlan+srcmac.
This trivial patch makes it configurable and queryable via iproute2.

$ sudo modprobe bonding mode=2 max_bonds=1 xmit_hash_policy=0

$ sudo ip link set bond0 type bond xmit_hash_policy vlan+srcmac

$ ip -d link show bond0
11: bond0: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether ce:85:5e:24:ce:90 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535
    bond mode balance-xor miimon 0 updelay 0 downdelay 0 peer_notify_delay 0 use_carrier 1 arp_interval 0 arp_validate none arp_all_targets any
primary_reselect always fail_over_mac none xmit_hash_policy vlan+srcmac resend_igmp 1 num_grat_arp 1 all_slaves_active 0 min_links 0 lp_interval 1
packets_per_slave 1 lacp_rate slow ad_select stable tlb_dynamic_lb 1 addrgenmode eui64 numtxqueues 16 numrxqueues 16 gso_max_size 65536 gso_max_segs
65535

$ grep Hash /proc/net/bonding/bond0
Transmit Hash Policy: vlan+srcmac (5)

$ sudo ip link add test type bond help
Usage: ... bond [ mode BONDMODE ] [ active_slave SLAVE_DEV ]
                [ clear_active_slave ] [ miimon MIIMON ]
                [ updelay UPDELAY ] [ downdelay DOWNDELAY ]
                [ peer_notify_delay DELAY ]
                [ use_carrier USE_CARRIER ]
                [ arp_interval ARP_INTERVAL ]
                [ arp_validate ARP_VALIDATE ]
                [ arp_all_targets ARP_ALL_TARGETS ]
                [ arp_ip_target [ ARP_IP_TARGET, ... ] ]
                [ primary SLAVE_DEV ]
                [ primary_reselect PRIMARY_RESELECT ]
                [ fail_over_mac FAIL_OVER_MAC ]
                [ xmit_hash_policy XMIT_HASH_POLICY ]
                [ resend_igmp RESEND_IGMP ]
                [ num_grat_arp|num_unsol_na NUM_GRAT_ARP|NUM_UNSOL_NA ]
                [ all_slaves_active ALL_SLAVES_ACTIVE ]
                [ min_links MIN_LINKS ]
                [ lp_interval LP_INTERVAL ]
                [ packets_per_slave PACKETS_PER_SLAVE ]
                [ tlb_dynamic_lb TLB_DYNAMIC_LB ]
                [ lacp_rate LACP_RATE ]
                [ ad_select AD_SELECT ]
                [ ad_user_port_key PORTKEY ]
                [ ad_actor_sys_prio SYSPRIO ]
                [ ad_actor_system LLADDR ]

BONDMODE := balance-rr|active-backup|balance-xor|broadcast|802.3ad|balance-tlb|balance-alb
ARP_VALIDATE := none|active|backup|all
ARP_ALL_TARGETS := any|all
PRIMARY_RESELECT := always|better|failure
FAIL_OVER_MAC := none|active|follow
XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlan+srcmac
LACP_RATE := slow|fast
AD_SELECT := stable|bandwidth|count

Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
---
 ip/iplink_bond.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ip/iplink_bond.c b/ip/iplink_bond.c
index 585b6be1..b9470b98 100644
--- a/ip/iplink_bond.c
+++ b/ip/iplink_bond.c
@@ -70,6 +70,7 @@ static const char *xmit_hash_policy_tbl[] = {
 	"layer2+3",
 	"encap2+3",
 	"encap3+4",
+	"vlan+srcmac",
 	NULL,
 };
 
@@ -148,7 +149,7 @@ static void print_explain(FILE *f)
 		"ARP_ALL_TARGETS := any|all\n"
 		"PRIMARY_RESELECT := always|better|failure\n"
 		"FAIL_OVER_MAC := none|active|follow\n"
-		"XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4\n"
+		"XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlan+srcmac\n"
 		"LACP_RATE := slow|fast\n"
 		"AD_SELECT := stable|bandwidth|count\n"
 	);
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v3] bonding: add a vlan+srcmac tx hashing option
  2021-01-15 19:21   ` [PATCH net-next v3] bonding: add a vlan+srcmac " Jarod Wilson
@ 2021-01-18 23:10     ` David Ahern
  2021-01-19  1:04       ` Jarod Wilson
  2021-01-19  1:09     ` [PATCH net-next v4] " Jarod Wilson
  1 sibling, 1 reply; 30+ messages in thread
From: David Ahern @ 2021-01-18 23:10 UTC (permalink / raw)
  To: Jarod Wilson, linux-kernel
  Cc: Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev

On 1/15/21 12:21 PM, Jarod Wilson wrote:
> diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
> index adc314639085..36562dcd3e1e 100644
> --- a/Documentation/networking/bonding.rst
> +++ b/Documentation/networking/bonding.rst
> @@ -951,6 +951,19 @@ xmit_hash_policy
>  		packets will be distributed according to the encapsulated
>  		flows.
>  
> +	vlan+srcmac
> +
> +		This policy uses a very rudimentary vland ID and source mac

s/vland/vlan/

> +		ID hash to load-balance traffic per-vlan, with failover

drop ID on this line; just 'source mac'.


> +		should one leg fail. The intended use case is for a bond
> +		shared by multiple virtual machines, all configured to
> +		use their own vlan, to give lacp-like functionality
> +		without requiring lacp-capable switching hardware.
> +
> +		The formula for the hash is simply
> +
> +		hash = (vlan ID) XOR (source MAC vendor) XOR (source MAC dev)
> +
>  	The default value is layer2.  This option was added in bonding
>  	version 2.6.3.  In earlier versions of bonding, this parameter
>  	does not exist, and the layer2 policy is the only policy.  The

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v3] bonding: add a vlan+srcmac tx hashing option
  2021-01-18 23:10     ` David Ahern
@ 2021-01-19  1:04       ` Jarod Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Jarod Wilson @ 2021-01-19  1:04 UTC (permalink / raw)
  To: David Ahern
  Cc: linux-kernel, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

On Mon, Jan 18, 2021 at 04:10:38PM -0700, David Ahern wrote:
> On 1/15/21 12:21 PM, Jarod Wilson wrote:
> > diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
> > index adc314639085..36562dcd3e1e 100644
> > --- a/Documentation/networking/bonding.rst
> > +++ b/Documentation/networking/bonding.rst
> > @@ -951,6 +951,19 @@ xmit_hash_policy
> >  		packets will be distributed according to the encapsulated
> >  		flows.
> >  
> > +	vlan+srcmac
> > +
> > +		This policy uses a very rudimentary vland ID and source mac
> 
> s/vland/vlan/
> 
> > +		ID hash to load-balance traffic per-vlan, with failover
> 
> drop ID on this line; just 'source mac'.

Bah. Crap. Didn't test documentation, clearly. Or proof-read it. Will fix
in v4. Hopefully, nothing else to change though...

-- 
Jarod Wilson
jarod@redhat.com


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH net-next v4] bonding: add a vlan+srcmac tx hashing option
  2021-01-15 19:21   ` [PATCH net-next v3] bonding: add a vlan+srcmac " Jarod Wilson
  2021-01-18 23:10     ` David Ahern
@ 2021-01-19  1:09     ` Jarod Wilson
  2021-01-20  6:10       ` patchwork-bot+netdevbpf
  1 sibling, 1 reply; 30+ messages in thread
From: Jarod Wilson @ 2021-01-19  1:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: Jarod Wilson, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek,
	David S. Miller, Jakub Kicinski, Thomas Davis, netdev

This comes from an end-user request, where they're running multiple VMs on
hosts with bonded interfaces connected to some interest switch topologies,
where 802.3ad isn't an option. They're currently running a proprietary
solution that effectively achieves load-balancing of VMs and bandwidth
utilization improvements with a similar form of transmission algorithm.

Basically, each VM has it's own vlan, so it always sends its traffic out
the same interface, unless that interface fails. Traffic gets split
between the interfaces, maintaining a consistent path, with failover still
available if an interface goes down.

Unlike bond_eth_hash(), this hash function is using the full source MAC
address instead of just the last byte, as there are so few components to
the hash, and in the no-vlan case, we would be returning just the last
byte of the source MAC as the hash value. It's entirely possible to have
two NICs in a bond with the same last byte of their MAC, but not the same
MAC, so this adjustment should guarantee distinct hashes in all cases.

This has been rudimetarily tested to provide similar results to the
proprietary solution it is aiming to replace. A patch for iproute2 is also
posted, to properly support the new mode there as well.

Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Thomas Davis <tadavis@lbl.gov>
Cc: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
---
v2: verified netlink interfaces working, added Documentation, changed
tx hash mode name to vlan+mac for consistency and clarity.
v3: drop inline from hash function, use full source MAC, not just the
last byte, expand explanation in patch description, extend hash name to
vlan+srcmac.
v4: fix documentation issues pointed out by David Ahern

 Documentation/networking/bonding.rst | 13 +++++++++++
 drivers/net/bonding/bond_main.c      | 34 ++++++++++++++++++++++++++--
 drivers/net/bonding/bond_options.c   | 13 ++++++-----
 include/linux/netdevice.h            |  1 +
 include/uapi/linux/if_bonding.h      |  1 +
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index adc314639085..36562dcd3e1e 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -951,6 +951,19 @@ xmit_hash_policy
 		packets will be distributed according to the encapsulated
 		flows.
 
+	vlan+srcmac
+
+		This policy uses a very rudimentary vlan ID and source mac
+		hash to load-balance traffic per-vlan, with failover
+		should one leg fail. The intended use case is for a bond
+		shared by multiple virtual machines, all configured to
+		use their own vlan, to give lacp-like functionality
+		without requiring lacp-capable switching hardware.
+
+		The formula for the hash is simply
+
+		hash = (vlan ID) XOR (source MAC vendor) XOR (source MAC dev)
+
 	The default value is layer2.  This option was added in bonding
 	version 2.6.3.  In earlier versions of bonding, this parameter
 	does not exist, and the layer2 policy is the only policy.  The
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5fe5232cc3f3..d4bc4d4e953b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -164,7 +164,7 @@ module_param(xmit_hash_policy, charp, 0);
 MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
 				   "0 for layer 2 (default), 1 for layer 3+4, "
 				   "2 for layer 2+3, 3 for encap layer 2+3, "
-				   "4 for encap layer 3+4");
+				   "4 for encap layer 3+4, 5 for vlan+srcmac");
 module_param(arp_interval, int, 0);
 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
 module_param_array(arp_ip_target, charp, NULL, 0);
@@ -1434,6 +1434,8 @@ static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
 		return NETDEV_LAG_HASH_E23;
 	case BOND_XMIT_POLICY_ENCAP34:
 		return NETDEV_LAG_HASH_E34;
+	case BOND_XMIT_POLICY_VLAN_SRCMAC:
+		return NETDEV_LAG_HASH_VLAN_SRCMAC;
 	default:
 		return NETDEV_LAG_HASH_UNKNOWN;
 	}
@@ -3494,6 +3496,27 @@ static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
 	return true;
 }
 
+static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
+{
+	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
+	u32 srcmac_vendor = 0, srcmac_dev = 0;
+	u16 vlan;
+	int i;
+
+	for (i = 0; i < 3; i++)
+		srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i];
+
+	for (i = 3; i < ETH_ALEN; i++)
+		srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i];
+
+	if (!skb_vlan_tag_present(skb))
+		return srcmac_vendor ^ srcmac_dev;
+
+	vlan = skb_vlan_tag_get(skb);
+
+	return vlan ^ srcmac_vendor ^ srcmac_dev;
+}
+
 /* Extract the appropriate headers based on bond's xmit policy */
 static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 			      struct flow_keys *fk)
@@ -3501,10 +3524,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
 	int noff, proto = -1;
 
-	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
+	switch (bond->params.xmit_policy) {
+	case BOND_XMIT_POLICY_ENCAP23:
+	case BOND_XMIT_POLICY_ENCAP34:
 		memset(fk, 0, sizeof(*fk));
 		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
 					  fk, NULL, 0, 0, 0, 0);
+	default:
+		break;
 	}
 
 	fk->ports.ports = 0;
@@ -3556,6 +3583,9 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 	    skb->l4_hash)
 		return skb->hash;
 
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
+		return bond_vlan_srcmac_hash(skb);
+
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
 	    !bond_flow_dissect(bond, skb, &flow))
 		return bond_eth_hash(skb);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index a4e4e15f574d..c69400c5bf07 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -96,12 +96,13 @@ static const struct bond_opt_value bond_pps_tbl[] = {
 };
 
 static const struct bond_opt_value bond_xmit_hashtype_tbl[] = {
-	{ "layer2",   BOND_XMIT_POLICY_LAYER2, BOND_VALFLAG_DEFAULT},
-	{ "layer3+4", BOND_XMIT_POLICY_LAYER34, 0},
-	{ "layer2+3", BOND_XMIT_POLICY_LAYER23, 0},
-	{ "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0},
-	{ "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0},
-	{ NULL,       -1,                       0},
+	{ "layer2",      BOND_XMIT_POLICY_LAYER2,      BOND_VALFLAG_DEFAULT},
+	{ "layer3+4",    BOND_XMIT_POLICY_LAYER34,     0},
+	{ "layer2+3",    BOND_XMIT_POLICY_LAYER23,     0},
+	{ "encap2+3",    BOND_XMIT_POLICY_ENCAP23,     0},
+	{ "encap3+4",    BOND_XMIT_POLICY_ENCAP34,     0},
+	{ "vlan+srcmac", BOND_XMIT_POLICY_VLAN_SRCMAC, 0},
+	{ NULL,          -1,                           0},
 };
 
 static const struct bond_opt_value bond_arp_validate_tbl[] = {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5b949076ed23..a94ce80a2fe1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2615,6 +2615,7 @@ enum netdev_lag_hash {
 	NETDEV_LAG_HASH_L23,
 	NETDEV_LAG_HASH_E23,
 	NETDEV_LAG_HASH_E34,
+	NETDEV_LAG_HASH_VLAN_SRCMAC,
 	NETDEV_LAG_HASH_UNKNOWN,
 };
 
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index 45f3750aa861..e8eb4ad03cf1 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -94,6 +94,7 @@
 #define BOND_XMIT_POLICY_LAYER23	2 /* layer 2+3 (IP ^ MAC) */
 #define BOND_XMIT_POLICY_ENCAP23	3 /* encapsulated layer 2+3 */
 #define BOND_XMIT_POLICY_ENCAP34	4 /* encapsulated layer 3+4 */
+#define BOND_XMIT_POLICY_VLAN_SRCMAC	5 /* vlan + source MAC */
 
 /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */
 #define LACP_STATE_LACP_ACTIVITY   0x1
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next v4] bonding: add a vlan+srcmac tx hashing option
  2021-01-19  1:09     ` [PATCH net-next v4] " Jarod Wilson
@ 2021-01-20  6:10       ` patchwork-bot+netdevbpf
  0 siblings, 0 replies; 30+ messages in thread
From: patchwork-bot+netdevbpf @ 2021-01-20  6:10 UTC (permalink / raw)
  To: Jarod Wilson
  Cc: linux-kernel, j.vosburgh, vfalico, andy, davem, kuba, tadavis, netdev

Hello:

This patch was applied to netdev/net-next.git (refs/heads/master):

On Mon, 18 Jan 2021 20:09:27 -0500 you wrote:
> This comes from an end-user request, where they're running multiple VMs on
> hosts with bonded interfaces connected to some interest switch topologies,
> where 802.3ad isn't an option. They're currently running a proprietary
> solution that effectively achieves load-balancing of VMs and bandwidth
> utilization improvements with a similar form of transmission algorithm.
> 
> Basically, each VM has it's own vlan, so it always sends its traffic out
> the same interface, unless that interface fails. Traffic gets split
> between the interfaces, maintaining a consistent path, with failover still
> available if an interface goes down.
> 
> [...]

Here is the summary with links:
  - [net-next,v4] bonding: add a vlan+srcmac tx hashing option
    https://git.kernel.org/netdev/net-next/c/7b8fc0103bb5

You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: question about bonding mode 4
  2021-01-15  2:02       ` question about bonding mode 4 moyufeng
@ 2021-01-23  6:10         ` moyufeng
  2021-01-29 19:11           ` Jay Vosburgh
  0 siblings, 1 reply; 30+ messages in thread
From: moyufeng @ 2021-01-23  6:10 UTC (permalink / raw)
  To: Jiri Pirko, Jay Vosburgh
  Cc: lipeng (Y),
	linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev, linuxarm, Salil Mehta

Ping...
Any comments? Thanks!

On 2021/1/15 10:02, moyufeng wrote:
> Hi Team,
> 
> I have a question about bonding. During testing bonding mode 4
> scenarios, I find that there is a very low probability that
> the pointer is null. The following information is displayed:
> 
> [99359.795934] bond0: (slave eth13.2001): Port 2 did not find a suitable aggregator
> [99359.796960] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
> [99359.798127] Mem abort info:
> [99359.798526]   ESR = 0x96000004
> [99359.798938]   EC = 0x25: DABT (current EL), IL = 32 bits
> [99359.799673]   SET = 0, FnV = 0
> [99359.800106]   EA = 0, S1PTW = 0
> [99359.800554] Data abort info:
> [99359.800952]   ISV = 0, ISS = 0x00000004
> [99359.801522]   CM = 0, WnR = 0
> [99359.801970] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000c64e6000
> [99359.802876] [0000000000000020] pgd=0000000000000000
> [99359.803555] Internal error: Oops: 96000004 [#1] PREEMPT SMP
> [99359.804369] Modules linked in: bonding hns3(-) hclgevf hnae3 [last unloaded: bonding]
> [99359.805494] CPU: 1 PID: 951 Comm: kworker/u10:2 Not tainted 5.7.0-rc4+ #1
> [99359.806455] Hardware name: linux,dummy-virt (DT)
> [99359.807107] Workqueue: bond0 bond_3ad_state_machine_handler [bonding]
> [99359.808056] pstate: 60c00005 (nZCv daif +PAN +UAO)
> [99359.808722] pc : bond_3ad_state_machine_handler+0x7fc/0xdb8 [bonding]
> [99359.809652] lr : bond_3ad_state_machine_handler+0x7f4/0xdb8 [bonding]
> [99359.810535] sp : ffff80001882bd20
> [99359.811012] x29: ffff80001882bd20 x28: ffff000085939a38
> [99359.811791] x27: ffff00008649bb68 x26: 00000000aaaaaaab
> [99359.812871] x25: ffff800009401000 x24: ffff800009408de4
> [99359.814049] x23: ffff80001882bd98 x22: ffff00008649b880
> [99359.815210] x21: 0000000000000000 x20: ffff000085939a00
> [99359.816401] x19: ffff00008649b880 x18: ffff800012572988
> [99359.817637] x17: 0000000000000000 x16: 0000000000000000
> [99359.818870] x15: ffff80009882b987 x14: 726f746167657267
> [99359.820090] x13: 676120656c626174 x12: 697573206120646e
> [99359.821374] x11: 696620746f6e2064 x10: 696420322074726f
> [99359.822659] x9 : 50203a2931303032 x8 : 0000000000081391
> [99359.823891] x7 : ffff8000108e3ad0 x6 : ffff8000128858bb
> [99359.825109] x5 : 0000000000000000 x4 : 0000000000000000
> [99359.826262] x3 : 00000000ffffffff x2 : 906b329bb5362a00
> [99359.827394] x1 : 906b329bb5362a00 x0 : 0000000000000000
> [99359.828540] Call trace:
> [99359.829071]  bond_3ad_state_machine_handler+0x7fc/0xdb8 [bonding]
> [99359.830367]  process_one_work+0x15c/0x4a0
> [99359.831216]  worker_thread+0x50/0x478
> [99359.832022]  kthread+0x130/0x160
> [99359.832716]  ret_from_fork+0x10/0x18
> [99359.833487] Code: 910c0021 95f704bb f9403f80 b5ffe300 (f9401000)
> [99359.834742] ---[ end trace c7a8e02914afc4e0 ]---
> [99359.835817] Kernel panic - not syncing: Fatal exception in interrupt
> [99359.837334] SMP: stopping secondary CPUs
> [99359.838277] Kernel Offset: disabled
> [99359.839086] CPU features: 0x080002,22208218
> [99359.840053] Memory Limit: none
> [99359.840783] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
> 
> The test procedure is as follows:
> 1. Configure bonding and set it to mode 4.
>     echo "4" > /sys/class/net/bond0/bonding/mode
>     ifconfig bond0 up
> 
> 2. Configure two VLANs and add them to the bonding in step 1.
>     vconfig add eth0 2001
>     vconfig add eth1 2001
>     ifenslave bond0 eth0.2001 eth1.2001
> 
> 3. Unload the network device driver and bonding driver.
>     rmmod hns3
>     rmmod hclge
>     rmmod hnae3
>     rmmod bonding.ko
> 
> 4. Repeat the preceding steps for a long time.
> 
> By checking the logic in ad_port_selection_logic(), I find that
> if enter the branch "Port %d did not find a suitable aggregator",
> the value of port->aggregator will be NULL, causing the problem.
> 
> So I'd like to ask what circumstances will be involved in this
> branch, and what should be done in this case?
> 
> 
> The detailed code analysis is as follows:
> 
> static void ad_port_selection_logic(struct port *port, bool *update_slave_arr)
> {
> [...]
> 	/* if the port is connected to other aggregator, detach it */
> 	if (port->aggregator) {
> 		/* detach the port from its former aggregator */
> 		temp_aggregator = port->aggregator;
> 		for (curr_port = temp_aggregator->lag_ports; curr_port;
> 		     last_port = curr_port,
> 		     curr_port = curr_port->next_port_in_aggregator) {
> 			if (curr_port == port) {
> 				temp_aggregator->num_of_ports--;
> 				/* if it is the first port attached to the
> 				 * aggregator
> 				 */
> 				if (!last_port) {
> 					temp_aggregator->lag_ports =
> 						port->next_port_in_aggregator;
> 				} else {
> 					/* not the first port attached to the
> 					 * aggregator
> 					 */
> 					last_port->next_port_in_aggregator =
> 						port->next_port_in_aggregator;
> 				}
> 
> 				/* clear the port's relations to this
> 				 * aggregator
> 				 */
> 				port->aggregator = NULL;
> 
> ----analysis: set port->aggregator NULL at the beginning of ad_port_selection_logic().
> 
> 				port->next_port_in_aggregator = NULL;
> 				port->actor_port_aggregator_identifier = 0;
> 
> 				slave_dbg(bond->dev, port->slave->dev, "Port %d left LAG %d\n",
> 					  port->actor_port_number,
> 					  temp_aggregator->aggregator_identifier);
> 				/* if the aggregator is empty, clear its
> 				 * parameters, and set it ready to be attached
> 				 */
> 				if (!temp_aggregator->lag_ports)
> 					ad_clear_agg(temp_aggregator);
> 				break;
> 			}
> 		}
> 		if (!curr_port) {
> 			/* meaning: the port was related to an aggregator
> 			 * but was not on the aggregator port list
> 			 */
> 			net_warn_ratelimited("%s: (slave %s): Warning: Port %d was related to aggregator %d but was not on its port list\n",
> 					     port->slave->bond->dev->name,
> 					     port->slave->dev->name,
> 					     port->actor_port_number,
> 					     port->aggregator->aggregator_identifier);
> 		}
> 	}
> 	/* search on all aggregators for a suitable aggregator for this port */
> 	bond_for_each_slave(bond, slave, iter) {
> 		aggregator = &(SLAVE_AD_INFO(slave)->aggregator);
> 		/* keep a free aggregator for later use(if needed) */
> 		if (!aggregator->lag_ports) {
> 			if (!free_aggregator)
> 				free_aggregator = aggregator;
> 
> ----analysis: Save free_aggregator if found a free aggregator. But in this case, no free aggregator is available.
> 
> 			continue;
> 		}
> 		/* check if current aggregator suits us */
> 		if (((aggregator->actor_oper_aggregator_key == port->actor_oper_port_key) && /* if all parameters match AND */
> 		     MAC_ADDRESS_EQUAL(&(aggregator->partner_system), &(port->partner_oper.system)) &&
> 		     (aggregator->partner_system_priority == port->partner_oper.system_priority) &&
> 		     (aggregator->partner_oper_aggregator_key == port->partner_oper.key)
> 		    ) &&
> 		    ((!MAC_ADDRESS_EQUAL(&(port->partner_oper.system), &(null_mac_addr)) && /* partner answers */
> 		      !aggregator->is_individual)  /* but is not individual OR */
> 		    )
> 		   ) {
> 			/* attach to the founded aggregator */
> 			port->aggregator = aggregator;
> 
> ----analysis: If a proper aggregator is found, port->aggregator is assigned here.
> 
> 			port->actor_port_aggregator_identifier =
> 				port->aggregator->aggregator_identifier;
> 			port->next_port_in_aggregator = aggregator->lag_ports;
> 			port->aggregator->num_of_ports++;
> 			aggregator->lag_ports = port;
> 			slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n",
> 				  port->actor_port_number,
> 				  port->aggregator->aggregator_identifier);
> 
> 			/* mark this port as selected */
> 			port->sm_vars |= AD_PORT_SELECTED;
> 			found = 1;
> 
> ----analysis: Set found to 1 if port->aggregator is assigned.
> 
> 			break;
> 		}
> 	}
> 	/* the port couldn't find an aggregator - attach it to a new
> 	 * aggregator
> 	 */
> 	if (!found) {
> 		if (free_aggregator) {
> 			/* assign port a new aggregator */
> 			port->aggregator = free_aggregator;
> 
> ----analysis: No proper free_aggregator is found. Therefore, port->aggregator cannot be assigned here.
> 
> 			port->actor_port_aggregator_identifier =
> 				port->aggregator->aggregator_identifier;
> 
> 			/* update the new aggregator's parameters
> 			 * if port was responsed from the end-user
> 			 */
> 			if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)
> 				/* if port is full duplex */
> 				port->aggregator->is_individual = false;
> 			else
> 				port->aggregator->is_individual = true;
> 
> 			port->aggregator->actor_admin_aggregator_key =
> 				port->actor_admin_port_key;
> 			port->aggregator->actor_oper_aggregator_key =
> 				port->actor_oper_port_key;
> 			port->aggregator->partner_system =
> 				port->partner_oper.system;
> 			port->aggregator->partner_system_priority =
> 				port->partner_oper.system_priority;
> 			port->aggregator->partner_oper_aggregator_key = port->partner_oper.key;
> 			port->aggregator->receive_state = 1;
> 			port->aggregator->transmit_state = 1;
> 			port->aggregator->lag_ports = port;
> 			port->aggregator->num_of_ports++;
> 
> 			/* mark this port as selected */
> 			port->sm_vars |= AD_PORT_SELECTED;
> 
> 			slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n",
> 				  port->actor_port_number,
> 				  port->aggregator->aggregator_identifier);
> 		} else {
> 			slave_err(bond->dev, port->slave->dev,
> 				  "Port %d did not find a suitable aggregator\n",
> 				  port->actor_port_number);
> 
> ----analysis: The fault scenario goes to this branch, and port->aggregator remains NULL.
> 
> 		}
> 	}
> 	/* if all aggregator's ports are READY_N == TRUE, set ready=TRUE
> 	 * in all aggregator's ports, else set ready=FALSE in all
> 	 * aggregator's ports
> 	 */
> 	__set_agg_ports_ready(port->aggregator,
> 			      __agg_ports_are_ready(port->aggregator));
> 
> ----analysis: port->aggregator is still NULL, which causes problem.
> 
> 
> 	aggregator = __get_first_agg(port);
> 	ad_agg_selection_logic(aggregator, update_slave_arr);
> 
> 	if (!port->aggregator->is_active)
> 		port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION;
> }
> 
> 
> Thanks.
> .
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH iproute2 v2] bond: support xmit_hash_policy=vlan+srcmac
  2021-01-15 19:21     ` [PATCH iproute2 v2] bond: support xmit_hash_policy=vlan+srcmac Jarod Wilson
@ 2021-01-23 18:35       ` David Ahern
  0 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2021-01-23 18:35 UTC (permalink / raw)
  To: Jarod Wilson, netdev; +Cc: Stephen Hemminger, Jay Vosburgh

On 1/15/21 12:21 PM, Jarod Wilson wrote:
> There's a new transmit hash policy being added to the bonding driver that
> is a simple XOR of vlan ID and source MAC, xmit_hash_policy vlan+srcmac.
> This trivial patch makes it configurable and queryable via iproute2.
> 
> $ sudo modprobe bonding mode=2 max_bonds=1 xmit_hash_policy=0
> 
> $ sudo ip link set bond0 type bond xmit_hash_policy vlan+srcmac
> 
> $ ip -d link show bond0
> 11: bond0: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
>     link/ether ce:85:5e:24:ce:90 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535
>     bond mode balance-xor miimon 0 updelay 0 downdelay 0 peer_notify_delay 0 use_carrier 1 arp_interval 0 arp_validate none arp_all_targets any
> primary_reselect always fail_over_mac none xmit_hash_policy vlan+srcmac resend_igmp 1 num_grat_arp 1 all_slaves_active 0 min_links 0 lp_interval 1
> packets_per_slave 1 lacp_rate slow ad_select stable tlb_dynamic_lb 1 addrgenmode eui64 numtxqueues 16 numrxqueues 16 gso_max_size 65536 gso_max_segs
> 65535
> 
> $ grep Hash /proc/net/bonding/bond0
> Transmit Hash Policy: vlan+srcmac (5)
> 
> $ sudo ip link add test type bond help
> Usage: ... bond [ mode BONDMODE ] [ active_slave SLAVE_DEV ]
>                 [ clear_active_slave ] [ miimon MIIMON ]
>                 [ updelay UPDELAY ] [ downdelay DOWNDELAY ]
>                 [ peer_notify_delay DELAY ]
>                 [ use_carrier USE_CARRIER ]
>                 [ arp_interval ARP_INTERVAL ]
>                 [ arp_validate ARP_VALIDATE ]
>                 [ arp_all_targets ARP_ALL_TARGETS ]
>                 [ arp_ip_target [ ARP_IP_TARGET, ... ] ]
>                 [ primary SLAVE_DEV ]
>                 [ primary_reselect PRIMARY_RESELECT ]
>                 [ fail_over_mac FAIL_OVER_MAC ]
>                 [ xmit_hash_policy XMIT_HASH_POLICY ]
>                 [ resend_igmp RESEND_IGMP ]
>                 [ num_grat_arp|num_unsol_na NUM_GRAT_ARP|NUM_UNSOL_NA ]
>                 [ all_slaves_active ALL_SLAVES_ACTIVE ]
>                 [ min_links MIN_LINKS ]
>                 [ lp_interval LP_INTERVAL ]
>                 [ packets_per_slave PACKETS_PER_SLAVE ]
>                 [ tlb_dynamic_lb TLB_DYNAMIC_LB ]
>                 [ lacp_rate LACP_RATE ]
>                 [ ad_select AD_SELECT ]
>                 [ ad_user_port_key PORTKEY ]
>                 [ ad_actor_sys_prio SYSPRIO ]
>                 [ ad_actor_system LLADDR ]
> 
> BONDMODE := balance-rr|active-backup|balance-xor|broadcast|802.3ad|balance-tlb|balance-alb
> ARP_VALIDATE := none|active|backup|all
> ARP_ALL_TARGETS := any|all
> PRIMARY_RESELECT := always|better|failure
> FAIL_OVER_MAC := none|active|follow
> XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlan+srcmac
> LACP_RATE := slow|fast
> AD_SELECT := stable|bandwidth|count
> 
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Cc: Jay Vosburgh <j.vosburgh@gmail.com>
> Signed-off-by: Jarod Wilson <jarod@redhat.com>
> ---
>  ip/iplink_bond.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 

applied to iproute2-next.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: question about bonding mode 4
  2021-01-23  6:10         ` moyufeng
@ 2021-01-29 19:11           ` Jay Vosburgh
  2021-01-30  9:41             ` moyufeng
  0 siblings, 1 reply; 30+ messages in thread
From: Jay Vosburgh @ 2021-01-29 19:11 UTC (permalink / raw)
  To: moyufeng
  Cc: Jiri Pirko, lipeng (Y),
	linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev, linuxarm, Salil Mehta

moyufeng <moyufeng@huawei.com> wrote:

>Ping...
>Any comments? Thanks!
>
>On 2021/1/15 10:02, moyufeng wrote:
>> Hi Team,
>> 
>> I have a question about bonding. During testing bonding mode 4
>> scenarios, I find that there is a very low probability that
>> the pointer is null. The following information is displayed:
>> 
>> [99359.795934] bond0: (slave eth13.2001): Port 2 did not find a suitable aggregator
>> [99359.796960] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
>> [99359.798127] Mem abort info:
>> [99359.798526]   ESR = 0x96000004
>> [99359.798938]   EC = 0x25: DABT (current EL), IL = 32 bits
>> [99359.799673]   SET = 0, FnV = 0
>> [99359.800106]   EA = 0, S1PTW = 0
>> [99359.800554] Data abort info:
>> [99359.800952]   ISV = 0, ISS = 0x00000004
>> [99359.801522]   CM = 0, WnR = 0
>> [99359.801970] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000c64e6000
>> [99359.802876] [0000000000000020] pgd=0000000000000000
>> [99359.803555] Internal error: Oops: 96000004 [#1] PREEMPT SMP
>> [99359.804369] Modules linked in: bonding hns3(-) hclgevf hnae3 [last unloaded: bonding]
>> [99359.805494] CPU: 1 PID: 951 Comm: kworker/u10:2 Not tainted 5.7.0-rc4+ #1
>> [99359.806455] Hardware name: linux,dummy-virt (DT)
>> [99359.807107] Workqueue: bond0 bond_3ad_state_machine_handler [bonding]
>> [99359.808056] pstate: 60c00005 (nZCv daif +PAN +UAO)
>> [99359.808722] pc : bond_3ad_state_machine_handler+0x7fc/0xdb8 [bonding]
>> [99359.809652] lr : bond_3ad_state_machine_handler+0x7f4/0xdb8 [bonding]
>> [99359.810535] sp : ffff80001882bd20
>> [99359.811012] x29: ffff80001882bd20 x28: ffff000085939a38
>> [99359.811791] x27: ffff00008649bb68 x26: 00000000aaaaaaab
>> [99359.812871] x25: ffff800009401000 x24: ffff800009408de4
>> [99359.814049] x23: ffff80001882bd98 x22: ffff00008649b880
>> [99359.815210] x21: 0000000000000000 x20: ffff000085939a00
>> [99359.816401] x19: ffff00008649b880 x18: ffff800012572988
>> [99359.817637] x17: 0000000000000000 x16: 0000000000000000
>> [99359.818870] x15: ffff80009882b987 x14: 726f746167657267
>> [99359.820090] x13: 676120656c626174 x12: 697573206120646e
>> [99359.821374] x11: 696620746f6e2064 x10: 696420322074726f
>> [99359.822659] x9 : 50203a2931303032 x8 : 0000000000081391
>> [99359.823891] x7 : ffff8000108e3ad0 x6 : ffff8000128858bb
>> [99359.825109] x5 : 0000000000000000 x4 : 0000000000000000
>> [99359.826262] x3 : 00000000ffffffff x2 : 906b329bb5362a00
>> [99359.827394] x1 : 906b329bb5362a00 x0 : 0000000000000000
>> [99359.828540] Call trace:
>> [99359.829071]  bond_3ad_state_machine_handler+0x7fc/0xdb8 [bonding]
>> [99359.830367]  process_one_work+0x15c/0x4a0
>> [99359.831216]  worker_thread+0x50/0x478
>> [99359.832022]  kthread+0x130/0x160
>> [99359.832716]  ret_from_fork+0x10/0x18
>> [99359.833487] Code: 910c0021 95f704bb f9403f80 b5ffe300 (f9401000)
>> [99359.834742] ---[ end trace c7a8e02914afc4e0 ]---
>> [99359.835817] Kernel panic - not syncing: Fatal exception in interrupt
>> [99359.837334] SMP: stopping secondary CPUs
>> [99359.838277] Kernel Offset: disabled
>> [99359.839086] CPU features: 0x080002,22208218
>> [99359.840053] Memory Limit: none
>> [99359.840783] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
>> 
>> The test procedure is as follows:
>> 1. Configure bonding and set it to mode 4.
>>     echo "4" > /sys/class/net/bond0/bonding/mode
>>     ifconfig bond0 up
>> 
>> 2. Configure two VLANs and add them to the bonding in step 1.
>>     vconfig add eth0 2001
>>     vconfig add eth1 2001
>>     ifenslave bond0 eth0.2001 eth1.2001
>> 
>> 3. Unload the network device driver and bonding driver.
>>     rmmod hns3
>>     rmmod hclge
>>     rmmod hnae3
>>     rmmod bonding.ko

	Are you running the above in a script, and can you share the
entire thing?

	Does the issue occur with the current net-next?

>> 4. Repeat the preceding steps for a long time.

	When you run this test, what are the network interfaces eth0 and
eth1 connected to, and are those ports configured for VLAN 2001 and
LACP?

>> By checking the logic in ad_port_selection_logic(), I find that
>> if enter the branch "Port %d did not find a suitable aggregator",
>> the value of port->aggregator will be NULL, causing the problem.
>> 
>> So I'd like to ask what circumstances will be involved in this
>> branch, and what should be done in this case?

	Well, in principle, this shouldn't ever happen.  Every port
structure contains an aggregator structure, so there should always be
one available somewhere.  I'm going to speculate that there's a race
condition somewhere in the teardown processing vs the LACP state machine
that invalidates this presumption.

>> The detailed code analysis is as follows:

[...]

>> 	/* if all aggregator's ports are READY_N == TRUE, set ready=TRUE
>> 	 * in all aggregator's ports, else set ready=FALSE in all
>> 	 * aggregator's ports
>> 	 */
>> 	__set_agg_ports_ready(port->aggregator,
>> 			      __agg_ports_are_ready(port->aggregator));
>> 
>> ----analysis: port->aggregator is still NULL, which causes problem.
>> 
>> 	aggregator = __get_first_agg(port);
>> 	ad_agg_selection_logic(aggregator, update_slave_arr);
>> 
>> 	if (!port->aggregator->is_active)
>> 		port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION;

	Correct, if the "did not find a suitable aggregator" path is
taken, port->aggregator is NULL and bad things happen in the above
block.

	This is something that needs to be fixed, but I'm also concerned
that there are other issues lurking, so I'd like to be able to reproduce
this.

	-J

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: question about bonding mode 4
  2021-01-29 19:11           ` Jay Vosburgh
@ 2021-01-30  9:41             ` moyufeng
  0 siblings, 0 replies; 30+ messages in thread
From: moyufeng @ 2021-01-30  9:41 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Jiri Pirko, lipeng (Y),
	linux-kernel, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	Jakub Kicinski, Thomas Davis, netdev, linuxarm, Salil Mehta



On 2021/1/30 3:11, Jay Vosburgh wrote:
> moyufeng <moyufeng@huawei.com> wrote:
> 
>> Ping...
>> Any comments? Thanks!
>>
>> On 2021/1/15 10:02, moyufeng wrote:
>>> Hi Team,
>>>
>>> I have a question about bonding. During testing bonding mode 4
>>> scenarios, I find that there is a very low probability that
>>> the pointer is null. The following information is displayed:
>>>
>>> [99359.795934] bond0: (slave eth13.2001): Port 2 did not find a suitable aggregator
>>> [99359.796960] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
>>> [99359.798127] Mem abort info:
>>> [99359.798526]   ESR = 0x96000004
>>> [99359.798938]   EC = 0x25: DABT (current EL), IL = 32 bits
>>> [99359.799673]   SET = 0, FnV = 0
>>> [99359.800106]   EA = 0, S1PTW = 0
>>> [99359.800554] Data abort info:
>>> [99359.800952]   ISV = 0, ISS = 0x00000004
>>> [99359.801522]   CM = 0, WnR = 0
>>> [99359.801970] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000c64e6000
>>> [99359.802876] [0000000000000020] pgd=0000000000000000
>>> [99359.803555] Internal error: Oops: 96000004 [#1] PREEMPT SMP
>>> [99359.804369] Modules linked in: bonding hns3(-) hclgevf hnae3 [last unloaded: bonding]
>>> [99359.805494] CPU: 1 PID: 951 Comm: kworker/u10:2 Not tainted 5.7.0-rc4+ #1
>>> [99359.806455] Hardware name: linux,dummy-virt (DT)
>>> [99359.807107] Workqueue: bond0 bond_3ad_state_machine_handler [bonding]
>>> [99359.808056] pstate: 60c00005 (nZCv daif +PAN +UAO)
>>> [99359.808722] pc : bond_3ad_state_machine_handler+0x7fc/0xdb8 [bonding]
>>> [99359.809652] lr : bond_3ad_state_machine_handler+0x7f4/0xdb8 [bonding]
>>> [99359.810535] sp : ffff80001882bd20
>>> [99359.811012] x29: ffff80001882bd20 x28: ffff000085939a38
>>> [99359.811791] x27: ffff00008649bb68 x26: 00000000aaaaaaab
>>> [99359.812871] x25: ffff800009401000 x24: ffff800009408de4
>>> [99359.814049] x23: ffff80001882bd98 x22: ffff00008649b880
>>> [99359.815210] x21: 0000000000000000 x20: ffff000085939a00
>>> [99359.816401] x19: ffff00008649b880 x18: ffff800012572988
>>> [99359.817637] x17: 0000000000000000 x16: 0000000000000000
>>> [99359.818870] x15: ffff80009882b987 x14: 726f746167657267
>>> [99359.820090] x13: 676120656c626174 x12: 697573206120646e
>>> [99359.821374] x11: 696620746f6e2064 x10: 696420322074726f
>>> [99359.822659] x9 : 50203a2931303032 x8 : 0000000000081391
>>> [99359.823891] x7 : ffff8000108e3ad0 x6 : ffff8000128858bb
>>> [99359.825109] x5 : 0000000000000000 x4 : 0000000000000000
>>> [99359.826262] x3 : 00000000ffffffff x2 : 906b329bb5362a00
>>> [99359.827394] x1 : 906b329bb5362a00 x0 : 0000000000000000
>>> [99359.828540] Call trace:
>>> [99359.829071]  bond_3ad_state_machine_handler+0x7fc/0xdb8 [bonding]
>>> [99359.830367]  process_one_work+0x15c/0x4a0
>>> [99359.831216]  worker_thread+0x50/0x478
>>> [99359.832022]  kthread+0x130/0x160
>>> [99359.832716]  ret_from_fork+0x10/0x18
>>> [99359.833487] Code: 910c0021 95f704bb f9403f80 b5ffe300 (f9401000)
>>> [99359.834742] ---[ end trace c7a8e02914afc4e0 ]---
>>> [99359.835817] Kernel panic - not syncing: Fatal exception in interrupt
>>> [99359.837334] SMP: stopping secondary CPUs
>>> [99359.838277] Kernel Offset: disabled
>>> [99359.839086] CPU features: 0x080002,22208218
>>> [99359.840053] Memory Limit: none
>>> [99359.840783] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
>>>
>>> The test procedure is as follows:
>>> 1. Configure bonding and set it to mode 4.
>>>     echo "4" > /sys/class/net/bond0/bonding/mode
>>>     ifconfig bond0 up
>>>
>>> 2. Configure two VLANs and add them to the bonding in step 1.
>>>     vconfig add eth0 2001
>>>     vconfig add eth1 2001
>>>     ifenslave bond0 eth0.2001 eth1.2001
>>>
>>> 3. Unload the network device driver and bonding driver.
>>>     rmmod hns3
>>>     rmmod hclge
>>>     rmmod hnae3
>>>     rmmod bonding.ko
> 
> 	Are you running the above in a script, and can you share the
> entire thing?

Thanks for your reply.

Yes, it's a script, as below. However, the recurrence probability is
very low. So far, this problem occurs only once :(

#start
#!/bin/bash
Version=$(uname -r)
KoPath=/lib/modules/"${Version}"
while [ 1 ];do
	insmod ${KoPath}/hnae3.ko
	insmod ${KoPath}/hclgevf.ko
	insmod ${KoPath}/hns3.ko
	insmod ${KoPath}/bonding.ko

	ifconfig eth0 up
	ifconfig eth1 up

	echo "4" > /sys/class/net/bond0/bonding/mode
	ifconfig bond0 192.168.1.101
	echo 100 > "/sys/class/net/bond0/bonding/miimon"
	sleep 8

	vconfig add eth0 2001
	vconfig add eth1 2001
	ifconfig eth0.2001 192.168.2.101 up
	ifconfig eth1.2001 192.168.3.101 up
	sleep 8

	ifenslave bond0 eth0.2001 eth13.2001
	sleep 8

	rmmod hns3
	rmmod hclge
	rmmod hclgevf
	rmmod hnae3
	rmmod bonding.ko
	sleep 5
done
#end

> 
> 	Does the issue occur with the current net-next?
> 
Yes

>>> 4. Repeat the preceding steps for a long time.
> 
> 	When you run this test, what are the network interfaces eth0 and
> eth1 connected to, and are those ports configured for VLAN 2001 and
> LACP?
> 
Yes, the configurations at both ends are the same.

>>> By checking the logic in ad_port_selection_logic(), I find that
>>> if enter the branch "Port %d did not find a suitable aggregator",
>>> the value of port->aggregator will be NULL, causing the problem.
>>>
>>> So I'd like to ask what circumstances will be involved in this
>>> branch, and what should be done in this case?
> 
> 	Well, in principle, this shouldn't ever happen.  Every port
> structure contains an aggregator structure, so there should always be
> one available somewhere.  I'm going to speculate that there's a race
> condition somewhere in the teardown processing vs the LACP state machine
> that invalidates this presumption.
> 
I agree with your inference. But I don't have a better way to prove it,
since the recurrence probability is too low.

>>> The detailed code analysis is as follows:
> 
> [...]
> 
>>> 	/* if all aggregator's ports are READY_N == TRUE, set ready=TRUE
>>> 	 * in all aggregator's ports, else set ready=FALSE in all
>>> 	 * aggregator's ports
>>> 	 */
>>> 	__set_agg_ports_ready(port->aggregator,
>>> 			      __agg_ports_are_ready(port->aggregator));
>>>
>>> ----analysis: port->aggregator is still NULL, which causes problem.
>>>
>>> 	aggregator = __get_first_agg(port);
>>> 	ad_agg_selection_logic(aggregator, update_slave_arr);
>>>
>>> 	if (!port->aggregator->is_active)
>>> 		port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION;
> 
> 	Correct, if the "did not find a suitable aggregator" path is
> taken, port->aggregator is NULL and bad things happen in the above
> block.
> 
> 	This is something that needs to be fixed, but I'm also concerned
> that there are other issues lurking, so I'd like to be able to reproduce
> this.
> 
> 	-J
> 
I will continue to reproduce the problem and try to find ways to improve
the reproducibility probability.

Since this path is incorrect, could you help to fix it?

Thank you! :)

> ---
> 	-Jay Vosburgh, jay.vosburgh@canonical.com
> .
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2021-01-30  9:43 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-18 19:30 [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option Jarod Wilson
2020-12-19  0:18 ` Jay Vosburgh
2021-01-08  0:03   ` Jarod Wilson
2021-01-12 21:12     ` Jarod Wilson
2021-01-12 21:39       ` Jay Vosburgh
2021-01-12 22:32         ` Jarod Wilson
2020-12-28 10:11 ` Jiri Pirko
2021-01-07 23:58   ` Jarod Wilson
2021-01-08 13:12     ` Jiri Pirko
2021-01-08 15:21       ` Jarod Wilson
2021-01-15  2:02       ` question about bonding mode 4 moyufeng
2021-01-23  6:10         ` moyufeng
2021-01-29 19:11           ` Jay Vosburgh
2021-01-30  9:41             ` moyufeng
2021-01-13 22:35 ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jarod Wilson
2021-01-13 23:41   ` [PATCH iproute2] bond: support xmit_hash_policy=vlan+mac Jarod Wilson
2021-01-15 15:12     ` Jarod Wilson
2021-01-15 19:21     ` [PATCH iproute2 v2] bond: support xmit_hash_policy=vlan+srcmac Jarod Wilson
2021-01-23 18:35       ` David Ahern
2021-01-14  1:58   ` [PATCH net-next v2] bonding: add a vlan+mac tx hashing option Jakub Kicinski
2021-01-14 21:11     ` Jarod Wilson
2021-01-14 21:23       ` Jakub Kicinski
2021-01-14 21:42         ` Jarod Wilson
2021-01-14 21:54       ` Jay Vosburgh
2021-01-15 15:08         ` Jarod Wilson
2021-01-15 19:21   ` [PATCH net-next v3] bonding: add a vlan+srcmac " Jarod Wilson
2021-01-18 23:10     ` David Ahern
2021-01-19  1:04       ` Jarod Wilson
2021-01-19  1:09     ` [PATCH net-next v4] " Jarod Wilson
2021-01-20  6:10       ` patchwork-bot+netdevbpf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.