All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port
@ 2022-02-10 14:24 Felix Fietkau
  2022-02-10 14:24 ` [RFC 2/2] net: bridge: add a software fast-path implementation Felix Fietkau
  2022-02-10 14:55 ` [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port Nikolay Aleksandrov
  0 siblings, 2 replies; 12+ messages in thread
From: Felix Fietkau @ 2022-02-10 14:24 UTC (permalink / raw)
  To: netdev

Some devices (e.g. wireless APs) can't have devices behind them be part of
a bridge topology with redundant links, due to address limitations.
Additionally, broadcast traffic on these devices is somewhat expensive, due to
the low data rate and wakeups of clients in powersave mode.
This knob can be used to ensure that BPDU packets are never sent out of,
or forwarded from, these devices.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 include/linux/if_bridge.h    | 1 +
 include/uapi/linux/if_link.h | 1 +
 net/bridge/br_forward.c      | 5 +++++
 net/bridge/br_input.c        | 2 ++
 net/bridge/br_netlink.c      | 6 +++++-
 net/bridge/br_stp_bpdu.c     | 9 +++++++--
 net/core/rtnetlink.c         | 4 +++-
 7 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 509e18c7e740..18d3b264b754 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -58,6 +58,7 @@ struct br_ip_list {
 #define BR_MRP_LOST_CONT	BIT(18)
 #define BR_MRP_LOST_IN_CONT	BIT(19)
 #define BR_TX_FWD_OFFLOAD	BIT(20)
+#define BR_BPDU_FILTER		BIT(21)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6218f93f5c1a..4c847c2d6afa 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -537,6 +537,7 @@ enum {
 	IFLA_BRPORT_MRP_IN_OPEN,
 	IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT,
 	IFLA_BRPORT_MCAST_EHT_HOSTS_CNT,
+	IFLA_BRPORT_BPDU_FILTER,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index ec646656dbf1..9fe5c888f27d 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -199,6 +199,7 @@ static struct net_bridge_port *maybe_deliver(
 void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	      enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
 {
+	const unsigned char *dest = eth_hdr(skb)->h_dest;
 	struct net_bridge_port *prev = NULL;
 	struct net_bridge_port *p;
 
@@ -214,6 +215,10 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 		case BR_PKT_MULTICAST:
 			if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
 				continue;
+			if ((p->flags & BR_BPDU_FILTER) &&
+			    unlikely(is_link_local_ether_addr(dest) &&
+				     dest[5] == 0))
+				continue;
 			break;
 		case BR_PKT_BROADCAST:
 			if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index b50382f957c1..d8263c4849c1 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -316,6 +316,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 		fwd_mask |= p->group_fwd_mask;
 		switch (dest[5]) {
 		case 0x00:	/* Bridge Group Address */
+			if (p->flags & BR_BPDU_FILTER)
+				goto drop;
 			/* If STP is turned off,
 			   then must forward to keep loop detection */
 			if (p->br->stp_enabled == BR_NO_STP ||
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 2ff83d84230d..11215c55adc2 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -184,6 +184,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_VLAN_TUNNEL */
 		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_SUPPRESS */
 		+ nla_total_size(1)	/* IFLA_BRPORT_ISOLATED */
+		+ nla_total_size(1)	/* IFLA_BRPORT_BPDU_FILTER */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -269,7 +270,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 							  BR_MRP_LOST_CONT)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN,
 		       !!(p->flags & BR_MRP_LOST_IN_CONT)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)))
+	    nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_BPDU_FILTER, !!(p->flags & BR_BPDU_FILTER)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -829,6 +831,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_ISOLATED]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
 	[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
+	[IFLA_BRPORT_BPDU_FILTER] = { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -893,6 +896,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
 	br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
 	br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS);
 	br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
+	br_set_port_flag(p, tb, IFLA_BRPORT_BPDU_FILTER, BR_BPDU_FILTER);
 
 	changed_mask = old_flags ^ p->flags;
 
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 0e4572f31330..9d2a235260eb 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -80,7 +80,8 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
 {
 	unsigned char buf[35];
 
-	if (p->br->stp_enabled != BR_KERNEL_STP)
+	if (p->br->stp_enabled != BR_KERNEL_STP ||
+	    (p->flags & BR_BPDU_FILTER))
 		return;
 
 	buf[0] = 0;
@@ -127,7 +128,8 @@ void br_send_tcn_bpdu(struct net_bridge_port *p)
 {
 	unsigned char buf[4];
 
-	if (p->br->stp_enabled != BR_KERNEL_STP)
+	if (p->br->stp_enabled != BR_KERNEL_STP ||
+	    (p->flags & BR_BPDU_FILTER))
 		return;
 
 	buf[0] = 0;
@@ -172,6 +174,9 @@ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
 	if (!(br->dev->flags & IFF_UP))
 		goto out;
 
+	if (p->flags & BR_BPDU_FILTER)
+		goto out;
+
 	if (p->state == BR_STATE_DISABLED)
 		goto out;
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 710da8a36729..00328f0dd22b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4707,7 +4707,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	    brport_nla_put_flag(skb, flags, mask,
 				IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) ||
 	    brport_nla_put_flag(skb, flags, mask,
-				IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) {
+				IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD) ||
+	    brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_BPDU_FILTER, BR_BPDU_FILTER)) {
 		nla_nest_cancel(skb, protinfo);
 		goto nla_put_failure;
 	}
-- 
2.32.0 (Apple Git-132)


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [RFC 2/2] net: bridge: add a software fast-path implementation
  2022-02-10 14:24 [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port Felix Fietkau
@ 2022-02-10 14:24 ` Felix Fietkau
  2022-02-10 15:02   ` Nikolay Aleksandrov
  2022-02-10 14:55 ` [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port Nikolay Aleksandrov
  1 sibling, 1 reply; 12+ messages in thread
From: Felix Fietkau @ 2022-02-10 14:24 UTC (permalink / raw)
  To: netdev

This opt-in feature creates a per-port cache of dest_mac+src_mac+vlan tuples
with enough information to quickly push frames to the correct destination port.
It can be enabled per port.

Cache entries are automatically created when a skb is forwarded from one port
to another, and only if there is room and both ports have the offload flag set.

Whenever a fdb entry changes, all corresponding cache entries associated with
it are automatically flushed.

In my test on MT7622 when bridging 1.85 Gbit/s from Ethernet to WLAN, this
significantly improves bridging performance, especially with VLAN filtering
enabled:

CPU usage:
- no offload, no VLAN: 79%
- no offload, VLAN: 84%
- offload, no VLAN: 73-74%
- offload, VLAN: 74%

MT7622 has support for hardware offloading of packets from LAN to WLAN, both
routed and bridged. For bridging it needs source/destination MAC address entries
like the ones stored in this offload cache. This code will be extended later
in order to create appropriate flow_offload rules to handle this.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 include/linux/if_bridge.h       |   1 +
 include/uapi/linux/if_link.h    |   3 +
 net/bridge/Kconfig              |  10 +
 net/bridge/Makefile             |   1 +
 net/bridge/br.c                 |   8 +
 net/bridge/br_device.c          |   4 +
 net/bridge/br_fdb.c             |  20 +-
 net/bridge/br_forward.c         |   3 +
 net/bridge/br_if.c              |   4 +
 net/bridge/br_input.c           |   5 +
 net/bridge/br_netlink.c         |  31 ++-
 net/bridge/br_offload.c         | 466 ++++++++++++++++++++++++++++++++
 net/bridge/br_private.h         |  30 +-
 net/bridge/br_private_offload.h |  53 ++++
 net/bridge/br_stp.c             |   3 +
 net/bridge/br_vlan_tunnel.c     |   3 +
 net/core/rtnetlink.c            |   2 +-
 17 files changed, 641 insertions(+), 6 deletions(-)
 create mode 100644 net/bridge/br_offload.c
 create mode 100644 net/bridge/br_private_offload.h

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 18d3b264b754..944630df0ec3 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -59,6 +59,7 @@ struct br_ip_list {
 #define BR_MRP_LOST_IN_CONT	BIT(19)
 #define BR_TX_FWD_OFFLOAD	BIT(20)
 #define BR_BPDU_FILTER		BIT(21)
+#define BR_OFFLOAD		BIT(22)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 4c847c2d6afa..a7349414a27f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -482,6 +482,8 @@ enum {
 	IFLA_BR_VLAN_STATS_PER_PORT,
 	IFLA_BR_MULTI_BOOLOPT,
 	IFLA_BR_MCAST_QUERIER_STATE,
+	IFLA_BR_OFFLOAD_CACHE_SIZE,
+	IFLA_BR_OFFLOAD_CACHE_RESERVED,
 	__IFLA_BR_MAX,
 };
 
@@ -538,6 +540,7 @@ enum {
 	IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT,
 	IFLA_BRPORT_MCAST_EHT_HOSTS_CNT,
 	IFLA_BRPORT_BPDU_FILTER,
+	IFLA_BRPORT_OFFLOAD,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
index 3c8ded7d3e84..3f93da1f66da 100644
--- a/net/bridge/Kconfig
+++ b/net/bridge/Kconfig
@@ -34,6 +34,16 @@ config BRIDGE
 
 	  If unsure, say N.
 
+config BRIDGE_OFFLOAD
+	bool "Offloading support"
+	depends on BRIDGE
+	help
+	  If you say Y here, you can turn on a per-port offload flag, which
+	  will cache src/destination mac address flows between ports and handle
+	  them faster.
+
+	  If unsure, say N.
+
 config BRIDGE_IGMP_SNOOPING
 	bool "IGMP/MLD snooping"
 	depends on BRIDGE
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 7fb9a021873b..166f76b5f258 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -11,6 +11,7 @@ bridge-y	:= br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
 			br_netlink_tunnel.o br_arp_nd_proxy.o
 
 bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
+bridge-$(CONFIG_BRIDGE_OFFLOAD) += br_offload.o
 
 bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o
 
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 1fac72cc617f..bd46e5e20b30 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -18,6 +18,7 @@
 #include <net/switchdev.h>
 
 #include "br_private.h"
+#include "br_private_offload.h"
 
 /*
  * Handle changes in state of network devices enslaved to a bridge.
@@ -381,6 +382,10 @@ static int __init br_init(void)
 	if (err)
 		goto err_out;
 
+	err = br_offload_init();
+	if (err)
+		goto err_out0;
+
 	err = register_pernet_subsys(&br_net_ops);
 	if (err)
 		goto err_out1;
@@ -430,6 +435,8 @@ static int __init br_init(void)
 err_out2:
 	unregister_pernet_subsys(&br_net_ops);
 err_out1:
+	br_offload_fini();
+err_out0:
 	br_fdb_fini();
 err_out:
 	stp_proto_unregister(&br_stp_proto);
@@ -452,6 +459,7 @@ static void __exit br_deinit(void)
 #if IS_ENABLED(CONFIG_ATM_LANE)
 	br_fdb_test_addr_hook = NULL;
 #endif
+	br_offload_fini();
 	br_fdb_fini();
 }
 
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 8d6bab244c4a..10c4e4039c7b 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -524,6 +524,10 @@ void br_dev_setup(struct net_device *dev)
 	br->bridge_hello_time = br->hello_time = 2 * HZ;
 	br->bridge_forward_delay = br->forward_delay = 15 * HZ;
 	br->bridge_ageing_time = br->ageing_time = BR_DEFAULT_AGEING_TIME;
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	br->offload_cache_size = 128;
+	br->offload_cache_reserved = 8;
+#endif
 	dev->max_mtu = ETH_MAX_MTU;
 
 	br_netfilter_rtable_init(br);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 6ccda68bd473..49abfc13a323 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -23,6 +23,7 @@
 #include <net/switchdev.h>
 #include <trace/events/bridge.h>
 #include "br_private.h"
+#include "br_private_offload.h"
 
 static const struct rhashtable_params br_fdb_rht_params = {
 	.head_offset = offsetof(struct net_bridge_fdb_entry, rhnode),
@@ -185,6 +186,8 @@ static void fdb_notify(struct net_bridge *br,
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
+	br_offload_fdb_update(fdb);
+
 	if (swdev_notify)
 		br_switchdev_fdb_notify(br, fdb, type);
 
@@ -393,6 +396,10 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
 	fdb->key.vlan_id = vid;
 	fdb->flags = flags;
 	fdb->updated = fdb->used = jiffies;
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	INIT_HLIST_HEAD(&fdb->offload_in);
+	INIT_HLIST_HEAD(&fdb->offload_out);
+#endif
 	err = rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode,
 					    br_fdb_rht_params);
 	if (err) {
@@ -527,8 +534,10 @@ void br_fdb_cleanup(struct work_struct *work)
 	 */
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
-		unsigned long this_timer = f->updated + delay;
+		unsigned long this_timer;
 
+		br_offload_fdb_refresh_time(br, f);
+		this_timer = f->updated + delay;
 		if (test_bit(BR_FDB_STATIC, &f->flags) ||
 		    test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) {
 			if (test_bit(BR_FDB_NOTIFY, &f->flags)) {
@@ -651,8 +660,11 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 		if (num >= maxnum)
 			break;
 
-		if (has_expired(br, f))
-			continue;
+		if (has_expired(br, f)) {
+			if (!br_offload_fdb_refresh_time(br, f) ||
+			    has_expired(br, f))
+				continue;
+		}
 
 		/* ignore pseudo entry for local MAC address */
 		if (!f->dst)
@@ -797,6 +809,7 @@ int br_fdb_dump(struct sk_buff *skb,
 		if (!filter_dev && f->dst)
 			goto skip;
 
+		br_offload_fdb_refresh_time(br, f);
 		err = fdb_fill_info(skb, br, f,
 				    NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq,
@@ -831,6 +844,7 @@ int br_fdb_get(struct sk_buff *skb,
 		goto errout;
 	}
 
+	br_offload_fdb_refresh_time(br, f);
 	err = fdb_fill_info(skb, br, f, portid, seq,
 			    RTM_NEWNEIGH, 0);
 errout:
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 9fe5c888f27d..6d9025106d9d 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -16,6 +16,7 @@
 #include <linux/if_vlan.h>
 #include <linux/netfilter_bridge.h>
 #include "br_private.h"
+#include "br_private_offload.h"
 
 /* Don't forward packets to originating port or forwarding disabled */
 static inline int should_deliver(const struct net_bridge_port *p,
@@ -32,6 +33,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
 
 int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+	br_offload_output(skb);
+
 	skb_push(skb, ETH_HLEN);
 	if (!is_skb_forwardable(skb->dev, skb))
 		goto drop;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 55f47cadb114..c68c7f6cc429 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -25,6 +25,7 @@
 #include <net/net_namespace.h>
 
 #include "br_private.h"
+#include "br_private_offload.h"
 
 /*
  * Determine initial path cost based on speed.
@@ -772,6 +773,9 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
 
 	if (mask & BR_NEIGH_SUPPRESS)
 		br_recalculate_neigh_suppress_enabled(br);
+
+	if (mask & BR_OFFLOAD)
+		br_offload_port_state(p);
 }
 
 bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index d8263c4849c1..b606ca06ff2d 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -22,6 +22,7 @@
 #include <linux/rculist.h>
 #include "br_private.h"
 #include "br_private_tunnel.h"
+#include "br_private_offload.h"
 
 static int
 br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -164,6 +165,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 			dst->used = now;
 		br_forward(dst->dst, skb, local_rcv, false);
 	} else {
+		br_offload_skb_disable(skb);
 		if (!mcast_hit)
 			br_flood(br, skb, pkt_type, local_rcv, false);
 		else
@@ -293,6 +295,9 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 
 	memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
 
+	if (br_offload_input(p, skb))
+		return RX_HANDLER_CONSUMED;
+
 	p = br_port_get_rcu(skb->dev);
 	if (p->flags & BR_VLAN_TUNNEL)
 		br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p));
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 11215c55adc2..994aca4b633a 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -19,6 +19,7 @@
 #include "br_private_cfm.h"
 #include "br_private_tunnel.h"
 #include "br_private_mcast_eht.h"
+#include "br_private_offload.h"
 
 static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
 				u32 filter_mask)
@@ -185,6 +186,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_SUPPRESS */
 		+ nla_total_size(1)	/* IFLA_BRPORT_ISOLATED */
 		+ nla_total_size(1)	/* IFLA_BRPORT_BPDU_FILTER */
+		+ nla_total_size(1)	/* IFLA_BRPORT_OFFLOAD */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -271,7 +273,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 	    nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN,
 		       !!(p->flags & BR_MRP_LOST_IN_CONT)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_BPDU_FILTER, !!(p->flags & BR_BPDU_FILTER)))
+	    nla_put_u8(skb, IFLA_BRPORT_BPDU_FILTER, !!(p->flags & BR_BPDU_FILTER)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_OFFLOAD, !!(p->flags & BR_OFFLOAD)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -832,6 +835,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
 	[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
 	[IFLA_BRPORT_BPDU_FILTER] = { .type = NLA_U8 },
+	[IFLA_BRPORT_OFFLOAD] = { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -897,6 +901,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
 	br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS);
 	br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
 	br_set_port_flag(p, tb, IFLA_BRPORT_BPDU_FILTER, BR_BPDU_FILTER);
+	br_set_port_flag(p, tb, IFLA_BRPORT_OFFLOAD, BR_OFFLOAD);
 
 	changed_mask = old_flags ^ p->flags;
 
@@ -1165,6 +1170,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
 	[IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
 	[IFLA_BR_MULTI_BOOLOPT] =
 		NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)),
+	[IFLA_BR_OFFLOAD_CACHE_SIZE] = { .type = NLA_U32 },
+	[IFLA_BR_OFFLOAD_CACHE_RESERVED] = { .type = NLA_U32 },
 };
 
 static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1424,6 +1431,19 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
 		br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
 	}
 #endif
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	if (data[IFLA_BR_OFFLOAD_CACHE_SIZE]) {
+		u32 val = nla_get_u32(data[IFLA_BR_OFFLOAD_CACHE_SIZE]);
+
+		br_offload_set_cache_size(br, val);
+	}
+
+	if (data[IFLA_BR_OFFLOAD_CACHE_RESERVED]) {
+		u32 val = nla_get_u32(data[IFLA_BR_OFFLOAD_CACHE_RESERVED]);
+
+		br_offload_set_cache_reserved(br, val);
+	}
+#endif
 
 	if (data[IFLA_BR_MULTI_BOOLOPT]) {
 		struct br_boolopt_multi *bm;
@@ -1512,6 +1532,10 @@ static size_t br_get_size(const struct net_device *brdev)
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_NF_CALL_IPTABLES */
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_NF_CALL_IP6TABLES */
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_NF_CALL_ARPTABLES */
+#endif
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_OFFLOAD_CACHE_SIZE */
+	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_OFFLOAD_CACHE_RESERVED */
 #endif
 	       nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */
 	       0;
@@ -1636,6 +1660,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 		       br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0))
 		return -EMSGSIZE;
 #endif
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	if (nla_put_u32(skb, IFLA_BR_OFFLOAD_CACHE_SIZE, br->offload_cache_size) ||
+	    nla_put_u32(skb, IFLA_BR_OFFLOAD_CACHE_RESERVED, br->offload_cache_reserved))
+		return -EMSGSIZE;
+#endif
 
 	return 0;
 }
diff --git a/net/bridge/br_offload.c b/net/bridge/br_offload.c
new file mode 100644
index 000000000000..8cb9266e6cf9
--- /dev/null
+++ b/net/bridge/br_offload.c
@@ -0,0 +1,466 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include "br_private.h"
+#include "br_private_offload.h"
+
+static DEFINE_SPINLOCK(offload_lock);
+
+struct bridge_flow_key {
+	u8 dest[ETH_ALEN];
+	u8 src[ETH_ALEN];
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	u16 vlan_tag;
+	bool vlan_present;
+#endif
+};
+
+struct bridge_flow {
+	struct net_bridge_port *port;
+	struct rhash_head node;
+	struct bridge_flow_key key;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	bool vlan_out_present;
+	u16 vlan_out;
+#endif
+
+	unsigned long used;
+	struct net_bridge_fdb_entry *fdb_in, *fdb_out;
+	struct hlist_node fdb_list_in, fdb_list_out;
+
+	struct rcu_head rcu;
+};
+
+static const struct rhashtable_params flow_params = {
+	.automatic_shrinking = true,
+	.head_offset = offsetof(struct bridge_flow, node),
+	.key_len = sizeof(struct bridge_flow_key),
+	.key_offset = offsetof(struct bridge_flow, key),
+};
+
+static struct kmem_cache *offload_cache __read_mostly;
+
+static void
+flow_rcu_free(struct rcu_head *head)
+{
+	struct bridge_flow *flow;
+
+	flow = container_of(head, struct bridge_flow, rcu);
+	kmem_cache_free(offload_cache, flow);
+}
+
+static void
+__br_offload_flow_free(struct bridge_flow *flow)
+{
+	flow->used = 0;
+	hlist_del(&flow->fdb_list_in);
+	hlist_del(&flow->fdb_list_out);
+
+	call_rcu(&flow->rcu, flow_rcu_free);
+}
+
+static void
+br_offload_flow_free(struct bridge_flow *flow)
+{
+	if (rhashtable_remove_fast(&flow->port->offload.rht, &flow->node,
+				   flow_params) != 0)
+		return;
+
+	__br_offload_flow_free(flow);
+}
+
+static bool
+br_offload_flow_fdb_refresh_time(struct bridge_flow *flow,
+				 struct net_bridge_fdb_entry *fdb)
+{
+	if (!time_after(flow->used, fdb->updated))
+		return false;
+
+	fdb->updated = flow->used;
+
+	return true;
+}
+
+
+static void
+br_offload_flow_refresh_time(struct bridge_flow *flow)
+{
+	br_offload_flow_fdb_refresh_time(flow, flow->fdb_in);
+	br_offload_flow_fdb_refresh_time(flow, flow->fdb_out);
+}
+
+static void
+br_offload_destroy_cb(void *ptr, void *arg)
+{
+	struct bridge_flow *flow = ptr;
+
+	br_offload_flow_refresh_time(flow);
+	__br_offload_flow_free(flow);
+}
+
+static bool
+br_offload_need_gc(struct net_bridge_port *p)
+{
+	return (atomic_read(&p->offload.rht.nelems) +
+	        p->br->offload_cache_reserved) >= p->br->offload_cache_size;
+}
+
+static void
+br_offload_gc_work(struct work_struct *work)
+{
+	struct rhashtable_iter hti;
+	struct net_bridge_port *p;
+	struct bridge_flow *gc_flow = NULL;
+	struct bridge_flow *flow;
+	unsigned long gc_used;
+
+	p = container_of(work, struct net_bridge_port, offload.gc_work);
+
+	if (!br_offload_need_gc(p))
+		return;
+
+	rhashtable_walk_enter(&p->offload.rht, &hti);
+	rhashtable_walk_start(&hti);
+	while ((flow = rhashtable_walk_next(&hti)) != NULL) {
+		unsigned long used;
+
+		if (IS_ERR(flow))
+			continue;
+
+		used = READ_ONCE(flow->used);
+		if (!used)
+			continue;
+
+		if (gc_flow && !time_before(used, gc_used))
+			continue;
+
+		gc_flow = flow;
+		gc_used = used;
+	}
+	rhashtable_walk_stop(&hti);
+	rhashtable_walk_exit(&hti);
+
+	if (!gc_flow)
+		return;
+
+	spin_lock_bh(&offload_lock);
+	if (br_offload_need_gc(p) && gc_flow &&
+	    gc_flow->used == gc_used)
+		br_offload_flow_free(gc_flow);
+	if (p->offload.enabled && br_offload_need_gc(p))
+		queue_work(system_long_wq, work);
+	spin_unlock_bh(&offload_lock);
+
+}
+
+void br_offload_port_state(struct net_bridge_port *p)
+{
+	struct net_bridge_port_offload *o = &p->offload;
+	bool enabled = true;
+	bool flush = false;
+
+	if (p->state != BR_STATE_FORWARDING ||
+	    !(p->flags & BR_OFFLOAD))
+		enabled = false;
+
+	spin_lock_bh(&offload_lock);
+	if (o->enabled == enabled)
+		goto out;
+
+	if (enabled) {
+		if (!o->gc_work.func)
+			INIT_WORK(&o->gc_work, br_offload_gc_work);
+		rhashtable_init(&o->rht, &flow_params);
+	} else {
+		flush = true;
+		rhashtable_free_and_destroy(&o->rht, br_offload_destroy_cb, o);
+	}
+
+	o->enabled = enabled;
+
+out:
+	spin_unlock_bh(&offload_lock);
+
+	if (flush)
+		flush_work(&o->gc_work);
+}
+
+void br_offload_fdb_update(const struct net_bridge_fdb_entry *fdb)
+{
+	struct bridge_flow *f;
+	struct hlist_node *tmp;
+
+	spin_lock_bh(&offload_lock);
+
+	hlist_for_each_entry_safe(f, tmp, &fdb->offload_in, fdb_list_in) {
+		br_offload_flow_refresh_time(f);
+		br_offload_flow_free(f);
+	}
+	hlist_for_each_entry_safe(f, tmp, &fdb->offload_out, fdb_list_out) {
+		br_offload_flow_refresh_time(f);
+		br_offload_flow_free(f);
+	}
+
+	spin_unlock_bh(&offload_lock);
+}
+
+bool br_offload_fdb_refresh_time(struct net_bridge *br,
+				 struct net_bridge_fdb_entry *fdb)
+{
+	unsigned long timeout = jiffies - br->ageing_time;
+	struct bridge_flow *f;
+	struct hlist_node *tmp;
+	bool ret = false;
+
+	spin_lock_bh(&offload_lock);
+
+	hlist_for_each_entry_safe(f, tmp, &fdb->offload_in, fdb_list_in) {
+		if (br_offload_flow_fdb_refresh_time(f, fdb))
+			ret = true;
+		if (time_before(f->used, timeout))
+			br_offload_flow_free(f);
+	}
+
+	hlist_for_each_entry_safe(f, tmp, &fdb->offload_out, fdb_list_out) {
+		if (br_offload_flow_fdb_refresh_time(f, fdb))
+			ret = true;
+		if (time_before(f->used, timeout))
+			br_offload_flow_free(f);
+	}
+
+	spin_unlock_bh(&offload_lock);
+
+	return ret;
+}
+
+static void
+br_offload_prepare_key(struct net_bridge_port *p, struct bridge_flow_key *key,
+		       struct sk_buff *skb)
+{
+	memset(key, 0, sizeof(*key));
+	memcpy(key, eth_hdr(skb), 2 * ETH_ALEN);
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	if (!br_opt_get(p->br, BROPT_VLAN_ENABLED))
+		return;
+
+	if (!skb_vlan_tag_present(skb) || skb->vlan_proto != p->br->vlan_proto)
+		return;
+
+	key->vlan_present = true;
+	key->vlan_tag = skb_vlan_tag_get_id(skb);
+#endif
+}
+
+void br_offload_output(struct sk_buff *skb)
+{
+	struct net_bridge_port_offload *o;
+	struct br_input_skb_cb *cb = (struct br_input_skb_cb *)skb->cb;
+	struct net_bridge_port *p, *inp;
+	struct net_device *dev;
+	struct net_bridge_fdb_entry *fdb_in, *fdb_out;
+	struct net_bridge_vlan_group *vg;
+	struct bridge_flow_key key;
+	struct bridge_flow *flow;
+	u16 vlan;
+
+	if (!cb->offload)
+		return;
+
+	rcu_read_lock();
+
+	p = br_port_get_rcu(skb->dev);
+	if (!p)
+		goto out;
+
+	o = &p->offload;
+	if (!o->enabled)
+		goto out;
+
+	if (atomic_read(&p->offload.rht.nelems) >= p->br->offload_cache_size)
+		goto out;
+
+	dev = dev_get_by_index_rcu(dev_net(p->br->dev), cb->input_ifindex);
+	if (!dev)
+		goto out;
+
+	inp = br_port_get_rcu(dev);
+	if (!p)
+		goto out;
+
+	vg = nbp_vlan_group_rcu(inp);
+	vlan = cb->input_vlan_present ? cb->input_vlan_tag : br_get_pvid(vg);
+	fdb_in = br_fdb_find_rcu(p->br, eth_hdr(skb)->h_source, vlan);
+	if (!fdb_in)
+		goto out;
+
+	vg = nbp_vlan_group_rcu(p);
+	vlan = skb_vlan_tag_present(skb) ? skb_vlan_tag_get_id(skb) : br_get_pvid(vg);
+	fdb_out = br_fdb_find_rcu(p->br, eth_hdr(skb)->h_dest, vlan);
+	if (!fdb_out)
+		goto out;
+
+	br_offload_prepare_key(p, &key, skb);
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	key.vlan_present = cb->input_vlan_present;
+	key.vlan_tag = cb->input_vlan_tag;
+#endif
+
+	flow = kmem_cache_alloc(offload_cache, GFP_ATOMIC);
+	flow->port = fdb_in->dst;
+	memcpy(&flow->key, &key, sizeof(key));
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	flow->vlan_out_present = skb_vlan_tag_present(skb);
+	flow->vlan_out = skb_vlan_tag_get(skb);
+#endif
+
+	flow->fdb_in = fdb_in;
+	flow->fdb_out = fdb_out;
+	flow->used = jiffies;
+
+	spin_lock_bh(&offload_lock);
+	if (!o->enabled ||
+	    atomic_read(&p->offload.rht.nelems) >= p->br->offload_cache_size ||
+	    rhashtable_insert_fast(&flow->port->offload.rht, &flow->node, flow_params)) {
+		kmem_cache_free(offload_cache, flow);
+		goto out_unlock;
+	}
+
+	hlist_add_head(&flow->fdb_list_in, &fdb_in->offload_in);
+	hlist_add_head(&flow->fdb_list_out, &fdb_out->offload_out);
+
+	if (br_offload_need_gc(p))
+		queue_work(system_long_wq, &p->offload.gc_work);
+
+out_unlock:
+	spin_unlock_bh(&offload_lock);
+
+out:
+	rcu_read_unlock();
+}
+
+bool br_offload_input(struct net_bridge_port *p, struct sk_buff *skb)
+{
+	struct net_bridge_port_offload *o = &p->offload;
+	struct br_input_skb_cb *cb = (struct br_input_skb_cb *)skb->cb;
+	struct bridge_flow_key key;
+	struct net_bridge_port *dst;
+	struct bridge_flow *flow;
+	unsigned long now = jiffies;
+	bool ret = false;
+
+	if (skb->len < sizeof(key))
+		return false;
+
+	if (!o->enabled)
+		return false;
+
+	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
+		return false;
+
+	br_offload_prepare_key(p, &key, skb);
+
+	rcu_read_lock();
+	flow = rhashtable_lookup(&o->rht, &key, flow_params);
+	if (!flow) {
+		cb->offload = 1;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+		cb->input_vlan_present = key.vlan_present != 0;
+		cb->input_vlan_tag = key.vlan_tag;
+		cb->input_ifindex = p->dev->ifindex;
+#endif
+		goto out;
+	}
+
+	if (flow->fdb_in->dst != p)
+		goto out;
+
+	dst = flow->fdb_out->dst;
+	if (!dst)
+		goto out;
+
+	ret = true;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	if (!flow->vlan_out_present && key.vlan_present) {
+		__vlan_hwaccel_clear_tag(skb);
+	} else if (flow->vlan_out_present) {
+		if (skb_vlan_tag_present(skb) &&
+		    skb->vlan_proto != p->br->vlan_proto) {
+			/* Protocol-mismatch, empty out vlan_tci for new tag */
+			skb_push(skb, ETH_HLEN);
+			skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+							skb_vlan_tag_get(skb));
+			if (unlikely(!skb))
+				goto out;
+
+			skb_pull(skb, ETH_HLEN);
+			skb_reset_mac_len(skb);
+		}
+
+		__vlan_hwaccel_put_tag(skb, p->br->vlan_proto,
+				       flow->vlan_out);
+	}
+#endif
+
+	skb->dev = dst->dev;
+	skb_push(skb, ETH_HLEN);
+
+	if (skb_warn_if_lro(skb) || !is_skb_forwardable(skb->dev, skb)) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	if (flow->used != now)
+		flow->used = now;
+	skb_forward_csum(skb);
+	dev_queue_xmit(skb);
+
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+static void
+br_offload_check_gc(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+
+	spin_lock_bh(&br->lock);
+	list_for_each_entry(p, &br->port_list, list)
+		if (br_offload_need_gc(p))
+			queue_work(system_long_wq, &p->offload.gc_work);
+	spin_unlock_bh(&br->lock);
+}
+
+
+int br_offload_set_cache_size(struct net_bridge *br, unsigned long val)
+{
+	br->offload_cache_size = val;
+	br_offload_check_gc(br);
+
+	return 0;
+}
+
+int br_offload_set_cache_reserved(struct net_bridge *br, unsigned long val)
+{
+	br->offload_cache_reserved = val;
+	br_offload_check_gc(br);
+
+	return 0;
+}
+
+int __init br_offload_init(void)
+{
+	offload_cache = kmem_cache_create("bridge_offload_cache",
+					  sizeof(struct bridge_flow),
+					  0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!offload_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void br_offload_fini(void)
+{
+	kmem_cache_destroy(offload_cache);
+}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2661dda1a92b..40021fe4b8c8 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -268,7 +268,15 @@ struct net_bridge_fdb_entry {
 	unsigned long			updated ____cacheline_aligned_in_smp;
 	unsigned long			used;
 
-	struct rcu_head			rcu;
+	union {
+#ifdef CONFIG_BRIDGE_OFFLOAD
+		struct {
+			struct hlist_head		offload_in;
+			struct hlist_head		offload_out;
+		};
+#endif
+		struct rcu_head			rcu;
+	};
 };
 
 #define MDB_PG_FLAGS_PERMANENT	BIT(0)
@@ -343,6 +351,12 @@ struct net_bridge_mdb_entry {
 	struct rcu_head			rcu;
 };
 
+struct net_bridge_port_offload {
+	struct rhashtable		rht;
+	struct work_struct		gc_work;
+	bool				enabled;
+};
+
 struct net_bridge_port {
 	struct net_bridge		*br;
 	struct net_device		*dev;
@@ -404,6 +418,9 @@ struct net_bridge_port {
 	u16				backup_redirected_cnt;
 
 	struct bridge_stp_xstats	stp_xstats;
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	struct net_bridge_port_offload	offload;
+#endif
 };
 
 #define kobj_to_brport(obj)	container_of(obj, struct net_bridge_port, kobj)
@@ -555,6 +572,11 @@ struct br_input_skb_cb {
 	u8 br_netfilter_broute:1;
 #endif
 
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	u32				offload_cache_size;
+	u32				offload_cache_reserved;
+#endif
+
 #ifdef CONFIG_NET_SWITCHDEV
 	/* Set if TX data plane offloading is used towards at least one
 	 * hardware domain.
@@ -580,6 +602,12 @@ struct br_input_skb_cb {
 #else
 # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb)	(0)
 #endif
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	u8 offload:1;
+	u8 input_vlan_present:1;
+	u16 input_vlan_tag;
+	int input_ifindex;
+#endif
 
 #define br_printk(level, br, format, args...)	\
 	printk(level "%s: " format, (br)->dev->name, ##args)
diff --git a/net/bridge/br_private_offload.h b/net/bridge/br_private_offload.h
new file mode 100644
index 000000000000..f66edd0539ab
--- /dev/null
+++ b/net/bridge/br_private_offload.h
@@ -0,0 +1,53 @@
+#ifndef __BR_OFFLOAD_H
+#define __BR_OFFLOAD_H
+
+#ifdef CONFIG_BRIDGE_OFFLOAD
+bool br_offload_input(struct net_bridge_port *p, struct sk_buff *skb);
+void br_offload_output(struct sk_buff *skb);
+void br_offload_port_state(struct net_bridge_port *p);
+void br_offload_fdb_update(const struct net_bridge_fdb_entry *fdb);
+bool br_offload_fdb_refresh_time(struct net_bridge *br,
+				 struct net_bridge_fdb_entry *fdb);
+int br_offload_init(void);
+void br_offload_fini(void);
+int br_offload_set_cache_size(struct net_bridge *br, unsigned long val);
+int br_offload_set_cache_reserved(struct net_bridge *br, unsigned long val);
+#else
+static inline bool br_offload_input(struct net_bridge_port *p, struct sk_buff *skb)
+{
+	return false;
+}
+static inline void br_offload_output(struct sk_buff *skb)
+{
+}
+static inline void br_offload_port_state(struct net_bridge_port *p)
+{
+}
+static inline void br_offload_fdb_update(const struct net_bridge_fdb_entry *fdb)
+{
+}
+static inline bool br_offload_fdb_refresh_time(struct net_bridge *br,
+					       struct net_bridge_fdb_entry *fdb)
+{
+	return false;
+}
+static inline int br_offload_init(void)
+{
+	return 0;
+}
+static inline void br_offload_fini(void)
+{
+}
+#endif
+
+static inline void br_offload_skb_disable(struct sk_buff *skb)
+{
+#ifdef CONFIG_BRIDGE_OFFLOAD
+	struct br_input_skb_cb *cb = (struct br_input_skb_cb *)skb->cb;
+
+	if (cb->offload)
+		cb->offload = 0;
+#endif
+}
+
+#endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 1d80f34a139c..b57788b53d24 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -12,6 +12,7 @@
 
 #include "br_private.h"
 #include "br_private_stp.h"
+#include "br_private_offload.h"
 
 /* since time values in bpdu are in jiffies and then scaled (1/256)
  * before sending, make sure that is at least one STP tick.
@@ -52,6 +53,8 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
 				(unsigned int) p->port_no, p->dev->name,
 				br_port_state_names[p->state]);
 
+	br_offload_port_state(p);
+
 	if (p->br->stp_enabled == BR_KERNEL_STP) {
 		switch (p->state) {
 		case BR_STATE_BLOCKING:
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index 6399a8a69d07..ffc65dc4eea8 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -15,6 +15,7 @@
 
 #include "br_private.h"
 #include "br_private_tunnel.h"
+#include "br_private_offload.h"
 
 static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
 				    const void *ptr)
@@ -180,6 +181,7 @@ void br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
 	skb_dst_drop(skb);
 
 	__vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
+	br_offload_skb_disable(skb);
 }
 
 int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
@@ -201,6 +203,7 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
 	if (err)
 		return err;
 
+	br_offload_skb_disable(skb);
 	tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst);
 	if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst))
 		skb_dst_set(skb, &tunnel_dst->dst);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 00328f0dd22b..da8d3b72a77e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -55,7 +55,7 @@
 #include <net/net_namespace.h>
 
 #define RTNL_MAX_TYPE		50
-#define RTNL_SLAVE_MAX_TYPE	40
+#define RTNL_SLAVE_MAX_TYPE	41
 
 struct rtnl_link {
 	rtnl_doit_func		doit;
-- 
2.32.0 (Apple Git-132)


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port
  2022-02-10 14:24 [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port Felix Fietkau
  2022-02-10 14:24 ` [RFC 2/2] net: bridge: add a software fast-path implementation Felix Fietkau
@ 2022-02-10 14:55 ` Nikolay Aleksandrov
  2022-02-10 16:06   ` Felix Fietkau
  2022-02-11 17:01   ` Stephen Hemminger
  1 sibling, 2 replies; 12+ messages in thread
From: Nikolay Aleksandrov @ 2022-02-10 14:55 UTC (permalink / raw)
  To: Felix Fietkau, netdev

On 10/02/2022 16:24, Felix Fietkau wrote:
> Some devices (e.g. wireless APs) can't have devices behind them be part of
> a bridge topology with redundant links, due to address limitations.
> Additionally, broadcast traffic on these devices is somewhat expensive, due to
> the low data rate and wakeups of clients in powersave mode.
> This knob can be used to ensure that BPDU packets are never sent or forwarded
> to/from these devices
> 
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
>  include/linux/if_bridge.h    | 1 +
>  include/uapi/linux/if_link.h | 1 +
>  net/bridge/br_forward.c      | 5 +++++
>  net/bridge/br_input.c        | 2 ++
>  net/bridge/br_netlink.c      | 6 +++++-
>  net/bridge/br_stp_bpdu.c     | 9 +++++++--
>  net/core/rtnetlink.c         | 4 +++-
>  7 files changed, 24 insertions(+), 4 deletions(-)
> 

Why can't netfilter or tc be used to filter these frames?



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 2/2] net: bridge: add a software fast-path implementation
  2022-02-10 14:24 ` [RFC 2/2] net: bridge: add a software fast-path implementation Felix Fietkau
@ 2022-02-10 15:02   ` Nikolay Aleksandrov
  2022-02-10 16:53     ` Felix Fietkau
  0 siblings, 1 reply; 12+ messages in thread
From: Nikolay Aleksandrov @ 2022-02-10 15:02 UTC (permalink / raw)
  To: Felix Fietkau, netdev

On 10/02/2022 16:24, Felix Fietkau wrote:
> This opt-in feature creates a per-port cache of dest_mac+src_mac+vlan tuples
> with enough information to quickly push frames to the correct destination port.
> It can be enabled per-port
> 
> Cache entries are automatically created when a skb is forwarded from one port
> to another, and only if there is room and both ports have the offload flag set.
> 
> Whenever a fdb entry changes, all corresponding cache entries associated with
> it are automatically flushed.
> 
> In my test on MT7622 when bridging 1.85 Gbit/s from Ethernet to WLAN, this
> significantly improves bridging performance, especially with VLAN filtering
> enabled:
> 
> CPU usage:
> - no offload, no VLAN: 79%
> - no offload, VLAN: 84%
> - offload, no VLAN: 73-74%
> - offload, VLAN: 74%
> 
> MT7622 has support for hardware offloading of packets from LAN to WLAN, both
> routed and bridged. For bridging it needs source/destination MAC address entries
> like the ones stored in this offload cache. This code will be extended later
> in order to create appropriate flow_offload rules to handle this
> 
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
>  include/linux/if_bridge.h       |   1 +
>  include/uapi/linux/if_link.h    |   3 +
>  net/bridge/Kconfig              |  10 +
>  net/bridge/Makefile             |   1 +
>  net/bridge/br.c                 |   8 +
>  net/bridge/br_device.c          |   4 +
>  net/bridge/br_fdb.c             |  20 +-
>  net/bridge/br_forward.c         |   3 +
>  net/bridge/br_if.c              |   4 +
>  net/bridge/br_input.c           |   5 +
>  net/bridge/br_netlink.c         |  31 ++-
>  net/bridge/br_offload.c         | 466 ++++++++++++++++++++++++++++++++
>  net/bridge/br_private.h         |  30 +-
>  net/bridge/br_private_offload.h |  53 ++++
>  net/bridge/br_stp.c             |   3 +
>  net/bridge/br_vlan_tunnel.c     |   3 +
>  net/core/rtnetlink.c            |   2 +-
>  17 files changed, 641 insertions(+), 6 deletions(-)
>  create mode 100644 net/bridge/br_offload.c
>  create mode 100644 net/bridge/br_private_offload.h
> 

Hi Felix,
that looks kind of familiar. :) I've been thinking about a similar optimization for
quite some time and generally love the idea, but I thought we'd allow this to be
implemented via eBPF flow speedup with some bridge helpers. There's also a lot of low
hanging fruit about optimizations in bridge's fast-path.

Also from your commit message it seems you don't need to store this in the bridge at
all but can use the notifications that others currently use and program these flows
in the interested driver. I think it'd be better to do the software flow cache via
ebpf, and do the hardware offload in the specific driver.

I can't do a thorough review of the patch right now and obviously it will have to be
broken up in smaller pieces. When I get a chance I'll comment on the details.

Thank you,
 Nik

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port
  2022-02-10 14:55 ` [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port Nikolay Aleksandrov
@ 2022-02-10 16:06   ` Felix Fietkau
  2022-02-11  8:16     ` Nikolay Aleksandrov
  2022-02-11 17:01   ` Stephen Hemminger
  1 sibling, 1 reply; 12+ messages in thread
From: Felix Fietkau @ 2022-02-10 16:06 UTC (permalink / raw)
  To: Nikolay Aleksandrov, netdev


On 10.02.22 15:55, Nikolay Aleksandrov wrote:
> On 10/02/2022 16:24, Felix Fietkau wrote:
>> Some devices (e.g. wireless APs) can't have devices behind them be part of
>> a bridge topology with redundant links, due to address limitations.
>> Additionally, broadcast traffic on these devices is somewhat expensive, due to
>> the low data rate and wakeups of clients in powersave mode.
>> This knob can be used to ensure that BPDU packets are never sent or forwarded
>> to/from these devices
>> 
>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>> ---
>>  include/linux/if_bridge.h    | 1 +
>>  include/uapi/linux/if_link.h | 1 +
>>  net/bridge/br_forward.c      | 5 +++++
>>  net/bridge/br_input.c        | 2 ++
>>  net/bridge/br_netlink.c      | 6 +++++-
>>  net/bridge/br_stp_bpdu.c     | 9 +++++++--
>>  net/core/rtnetlink.c         | 4 +++-
>>  7 files changed, 24 insertions(+), 4 deletions(-)
>> 
> 
> Why can't netfilter or tc be used to filter these frames?
netfilter is slow as hell, and even adding a tc rule that has to look at 
all frames to check for useless BPDU packets costs a lot more CPU cycles 
than simply suppressing them at the source.

- Felix

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 2/2] net: bridge: add a software fast-path implementation
  2022-02-10 15:02   ` Nikolay Aleksandrov
@ 2022-02-10 16:53     ` Felix Fietkau
  2022-02-11  8:50       ` Nikolay Aleksandrov
  0 siblings, 1 reply; 12+ messages in thread
From: Felix Fietkau @ 2022-02-10 16:53 UTC (permalink / raw)
  To: Nikolay Aleksandrov, netdev


On 10.02.22 16:02, Nikolay Aleksandrov wrote:
> Hi Felix,
> that looks kind of familiar. :) I've been thinking about a similar optimization for
> quite some time and generally love the idea, but I thought we'd allow this to be
> implemented via eBPF flow speedup with some bridge helpers. There's also a lot of low
> hanging fruit about optimizations in bridge's fast-path.
> 
> Also from your commit message it seems you don't need to store this in the bridge at
> all but can use the notifications that others currently use and program these flows
> in the interested driver. I think it'd be better to do the software flow cache via
> ebpf, and do the hardware offload in the specific driver.
To be honest, I have no idea how to handle this in a clean way in the 
driver, because this offloading path crosses several driver/subsystem 
boundaries.

Right now we have support for a packet processing engine (PPE) in the 
MT7622 SoC, which can handle offloading IPv4 NAT/routing and IPv6 routing.
The hardware can also handle forwarding of src-mac/destination-mac 
tuples, but that is currently unused because it's not needed for 
ethernet-only forwarding.

When adding WLAN to the mix, it gets more complex. The PPE has an output 
port that connects to a special block called Wireless Ethernet Dispatch, 
which can be configured to intercept DMA between the WLAN driver (mt76) 
and a PCIe device with MT7615 or MT7915 in order to inject extra packets.

I already have working NAT/routing offload support for this, which I 
will post soon. In order to figure out the path to WLAN, the offloading 
code calls the .ndo_fill_forward_path op, which mac80211 supports.
This allows the mt76 driver to fill in required metadata which gets 
stored in the PPE flowtable.

On MT7622, traffic can only flow from ethernet to WLAN in this manner; 
on newer SoCs, offloading can work in the other direction as well.

So when looking at fdb entries and flows between them, the ethernet 
driver will have to figure out:
- which port number to use in the DSA tag on the ethernet side
- which VLAN to use on the ethernet side, with the extra gotcha that 
ingress traffic will be tagged, but egress won't
- which entries sit behind mac80211 vifs that support offloading through 
WED.
I would also need to add a way to push the notifications through DSA to 
the ethernet driver, because there is a switch inbetween that is not 
involved in the offloading path (PPE handles DSA tagging/untagging).

By the way, all of this is needed for offloading a fairly standard 
configuration on these devices, so not just for weird exotic settings.

If I let the bridge code track flows, I can easily handle this by using 
the same kind of infrastructure that netfilter flowtable uses. If I push 
this to the driver, it becomes a lot more complex and messy, in my 
opinion...

- Felix

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port
  2022-02-10 16:06   ` Felix Fietkau
@ 2022-02-11  8:16     ` Nikolay Aleksandrov
  0 siblings, 0 replies; 12+ messages in thread
From: Nikolay Aleksandrov @ 2022-02-11  8:16 UTC (permalink / raw)
  To: Felix Fietkau, netdev

(resending, for some reason my first email didn't make it to the mailing list)

On 10/02/2022 18:06, Felix Fietkau wrote:
> 
> On 10.02.22 15:55, Nikolay Aleksandrov wrote:
>> On 10/02/2022 16:24, Felix Fietkau wrote:
>>> Some devices (e.g. wireless APs) can't have devices behind them be part of
>>> a bridge topology with redundant links, due to address limitations.
>>> Additionally, broadcast traffic on these devices is somewhat expensive, due to
>>> the low data rate and wakeups of clients in powersave mode.
>>> This knob can be used to ensure that BPDU packets are never sent or forwarded
>>> to/from these devices
>>>
>>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>>> ---
>>>  include/linux/if_bridge.h    | 1 +
>>>  include/uapi/linux/if_link.h | 1 +
>>>  net/bridge/br_forward.c      | 5 +++++
>>>  net/bridge/br_input.c        | 2 ++
>>>  net/bridge/br_netlink.c      | 6 +++++-
>>>  net/bridge/br_stp_bpdu.c     | 9 +++++++--
>>>  net/core/rtnetlink.c         | 4 +++-
>>>  7 files changed, 24 insertions(+), 4 deletions(-)
>>>
>>
>> Why can't netfilter or tc be used to filter these frames?
> netfilter is slow as hell, and even adding a tc rule that has to look at all frames to check for useless BPDU packets costs a lot more CPU cycles than simply suppressing them at the source.
> 
> - Felix

You can use XDP, should be much faster. I don't want new tests in the fast path
for something that is already solved and doesn't need anything bridge-specific.
 
Tomorrow someone will try the same with some other packet type,
sorry but absolutely unacceptable.

Cheers,
 Nik


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 2/2] net: bridge: add a software fast-path implementation
  2022-02-10 16:53     ` Felix Fietkau
@ 2022-02-11  8:50       ` Nikolay Aleksandrov
  2022-03-28 15:15         ` Felix Fietkau
  0 siblings, 1 reply; 12+ messages in thread
From: Nikolay Aleksandrov @ 2022-02-11  8:50 UTC (permalink / raw)
  To: Felix Fietkau, netdev

On 10/02/2022 18:53, Felix Fietkau wrote:
> 
> On 10.02.22 16:02, Nikolay Aleksandrov wrote:
>> Hi Felix,
>> that looks kind of familiar. :) I've been thinking about a similar optimization for
>> quite some time and generally love the idea, but I thought we'd allow this to be
>> implemented via eBPF flow speedup with some bridge helpers. There's also a lot of low
>> hanging fruit about optimizations in bridge's fast-path.
>>
>> Also from your commit message it seems you don't need to store this in the bridge at
>> all but can use the notifications that others currently use and program these flows
>> in the interested driver. I think it'd be better to do the software flow cache via
>> ebpf, and do the hardware offload in the specific driver.
> To be honest, I have no idea how to handle this in a clean way in the driver, because this offloading path crosses several driver/subsystem boundaries.
> 
> Right now we have support for a packet processing engine (PPE) in the MT7622 SoC, which can handle offloading IPv4 NAT/routing and IPv6 routing.
> The hardware can also handle forwarding of src-mac/destination-mac tuples, but that is currently unused because it's not needed for ethernet-only forwarding.
> 
> When adding WLAN to the mix, it gets more complex. The PPE has an output port that connects to a special block called Wireless Ethernet Dispatch, which can be configured to intercept DMA between the WLAN driver (mt76) and a PCIe device with MT7615 or MT7915 in order to inject extra packets.
> 
> I already have working NAT/routing offload support for this, which I will post soon. In order to figure out the path to WLAN, the offloading code calls the .ndo_fill_forward_path op, which mac80211 supports.
> This allows the mt76 driver to fill in required metadata which gets stored in the PPE flowtable.
> 
> On MT7622, traffic can only flow from ethernet to WLAN in this manner; on newer SoCs, offloading can work in the other direction as well.
> 

That's really not a bridge problem and not an argument to add so much new infrastructure
which we should later maintain and fix. I'd prefer all that complexity to be kept where
it is needed. Especially for something that (minus the offload/hw support) can already be
done much more efficiently by using existing tools (flow marking/matching and offloading
using xdp/ebpf).

> So when looking at fdb entries and flows between them, the ethernet driver will have to figure out:
> - which port number to use in the DSA tag on the ethernet side
> - which VLAN to use on the ethernet side, with the extra gotcha that ingress traffic will be tagged, but egress won't
> - which entries sit behind mac80211 vifs that support offloading through WED.
> I would also need to add a way to push the notifications through DSA to the ethernet driver, because there is a switch in between that is not involved in the offloading path (PPE handles DSA tagging/untagging).
> 

Add the absolute minimum infrastructure (if any at all) on the bridge side to achieve it.
As I mentioned above caching flows can already be achieved by using ebpf with some
extra care and maybe help from user-space. We don't need to maintain such new complex and
very fragile infrastructure. The new "fast" path is far from ideal, you've taken care
of a few cases only, there are many more that can and should affect it, in addition any new
features which get added will have to consider it. It will be a big headache to get correct in
the first place and to maintain in the future, while at the same time we can already do
it through ebpf and we can even make it easily available if new ebpf helpers are accepted.
I don't see any value in adding this to the bridge, a flow cache done through xdp would
be much faster for the software case.

> By the way, all of this is needed for offloading a fairly standard configuration on these devices, so not just for weird exotic settings.
> 
> If I let the bridge code track flows, I can easily handle this by using the same kind of infrastructure that netfilter flowtable uses. If I push this to the driver, it becomes a lot more complex and messy, in my opinion...
> 
> - Felix

I've seen these arguments many times over, it'll be easier to do device-specific
(unrelated to bridge in most cases) feature Y in the bridge so let's stick it there even
though it doesn't fit the bridge model and similar functionality can already be
achieved by re-using existing tools.
I'm sure there are many ways to achieve that flow tracking, you can think about a netfilter
solution to get these flows (maybe through some nftables/ebtables rules), you can think about
marking the flows at ingress and picking them up at egress, surely there are many more
solutions for your device to track the flows that go through the bridge and offload them.

Thanks,
 Nik




^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port
  2022-02-10 14:55 ` [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port Nikolay Aleksandrov
  2022-02-10 16:06   ` Felix Fietkau
@ 2022-02-11 17:01   ` Stephen Hemminger
  1 sibling, 0 replies; 12+ messages in thread
From: Stephen Hemminger @ 2022-02-11 17:01 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: Felix Fietkau, netdev

On Thu, 10 Feb 2022 16:55:40 +0200
Nikolay Aleksandrov <nikolay@nvidia.com> wrote:

> On 10/02/2022 16:24, Felix Fietkau wrote:
> > Some devices (e.g. wireless APs) can't have devices behind them be part of
> > a bridge topology with redundant links, due to address limitations.
> > Additionally, broadcast traffic on these devices is somewhat expensive, due to
> > the low data rate and wakeups of clients in powersave mode.
> > This knob can be used to ensure that BPDU packets are never sent or forwarded
> > to/from these devices
> > 
> > Signed-off-by: Felix Fietkau <nbd@nbd.name>
> > ---
> >  include/linux/if_bridge.h    | 1 +
> >  include/uapi/linux/if_link.h | 1 +
> >  net/bridge/br_forward.c      | 5 +++++
> >  net/bridge/br_input.c        | 2 ++
> >  net/bridge/br_netlink.c      | 6 +++++-
> >  net/bridge/br_stp_bpdu.c     | 9 +++++++--
> >  net/core/rtnetlink.c         | 4 +++-
> >  7 files changed, 24 insertions(+), 4 deletions(-)
> >   
> 
> Why can't netfilter or tc be used to filter these frames?
> 
> 

It could but this looks better.
BPDU filter matches what other hardware switch vendors offer and
has the benefit of not adding another layer of complexity into
user configurations.

Adding one rule into a complex firewall or starting to have to
configure tc is a mess.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 2/2] net: bridge: add a software fast-path implementation
  2022-02-11  8:50       ` Nikolay Aleksandrov
@ 2022-03-28 15:15         ` Felix Fietkau
  2022-03-28 18:20           ` Nikolay Aleksandrov
  0 siblings, 1 reply; 12+ messages in thread
From: Felix Fietkau @ 2022-03-28 15:15 UTC (permalink / raw)
  To: Nikolay Aleksandrov, netdev


Hi Nik,

I'd like to follow up on our discussion regarding bridge offloading.
I managed to come up with a user space + eBPF implementation that 
replaces this code and shows mostly the same performance gain as my 
previous kernel space implementation.

At first I tried to use generic XDP, but after getting it working, 
performance was pretty bad (due to headroom issues) and I was told that 
this is by design and nobody should use it in production.

Then I reworked the code to use tc classifier instead and it worked much 
better.

It's not fully ready yet, I need to add some more tests for incompatible 
features, but I'm getting there...
The code is here: https://github.com/nbd168/bridger

There's one thing I haven't been able to figure out yet: What's the 
proper way to keep bridge fdb entries alive from user space without 
modifying them in any other way?

- Felix

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 2/2] net: bridge: add a software fast-path implementation
  2022-03-28 15:15         ` Felix Fietkau
@ 2022-03-28 18:20           ` Nikolay Aleksandrov
  2022-03-29 11:07             ` Felix Fietkau
  0 siblings, 1 reply; 12+ messages in thread
From: Nikolay Aleksandrov @ 2022-03-28 18:20 UTC (permalink / raw)
  To: Felix Fietkau, Nikolay Aleksandrov, netdev

On 28/03/2022 18:15, Felix Fietkau wrote:
> 
> Hi Nik,
> 
> I'd like to follow up on our discussion regarding bridge offloading.
> I managed to come up with a user space + eBPF implementation that replaces this code and shows mostly the same performance gain as my previous kernel space implementation.
> 
> At first I tried to use generic XDP, but after getting it working, performance was pretty bad (due to headroom issues) and I was told that this is by design and nobody should use it in production.
> 
> Then I reworked the code to use tc classifier instead and it worked much better.
> 
> It's not fully ready yet, I need to add some more tests for incompatible features, but I'm getting there...
> The code is here: https://github.com/nbd168/bridger
> 
> There's one thing I haven't been able to figure out yet: What's the proper way to keep bridge fdb entries alive from user space without modifying them in any other way?
> 
> - Felix

Hi Felix,
That's very nice! Interesting work. One way it's usually done is through periodic NTF_USE (refresh),
another would be to mark them externally learned and delete them yourself (user-space aging).
It really depends on the exact semantics you'd like.

Cheers,
 Nik

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC 2/2] net: bridge: add a software fast-path implementation
  2022-03-28 18:20           ` Nikolay Aleksandrov
@ 2022-03-29 11:07             ` Felix Fietkau
  0 siblings, 0 replies; 12+ messages in thread
From: Felix Fietkau @ 2022-03-29 11:07 UTC (permalink / raw)
  To: Nikolay Aleksandrov, netdev


On 28.03.22 20:20, Nikolay Aleksandrov wrote:
> On 28/03/2022 18:15, Felix Fietkau wrote:
>> 
>> Hi Nik,
>> 
>> I'd like to follow up on our discussion regarding bridge offloading.
>> I managed to come up with a user space + eBPF implementation that replaces this code and shows mostly the same performance gain as my previous kernel space implementation.
>> 
>> At first I tried to use generic XDP, but after getting it working, performance was pretty bad (due to headroom issues) and I was told that this is by design and nobody should use it in production.
>> 
>> Then I reworked the code to use tc classifier instead and it worked much better.
>> 
>> It's not fully ready yet, I need to add some more tests for incompatible features, but I'm getting there...
>> The code is here: https://github.com/nbd168/bridger
>> 
>> There's one thing I haven't been able to figure out yet: What's the proper way to keep bridge fdb entries alive from user space without modifying them in any other way?
>> 
>> - Felix
> 
> Hi Felix,
> That's very nice! Interesting work. One way it's usually done is through periodic NTF_USE (refresh),
> another would be to mark them externally learned and delete them yourself (user-space aging).
> It really depends on the exact semantics you'd like.
I will try NTF_USE, thanks. I really just want to keep the bridge fdb 
entries alive while there is activity on the offloaded flows.

- Felix


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2022-03-29 11:07 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-10 14:24 [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port Felix Fietkau
2022-02-10 14:24 ` [RFC 2/2] net: bridge: add a software fast-path implementation Felix Fietkau
2022-02-10 15:02   ` Nikolay Aleksandrov
2022-02-10 16:53     ` Felix Fietkau
2022-02-11  8:50       ` Nikolay Aleksandrov
2022-03-28 15:15         ` Felix Fietkau
2022-03-28 18:20           ` Nikolay Aleksandrov
2022-03-29 11:07             ` Felix Fietkau
2022-02-10 14:55 ` [RFC 1/2] net: bridge: add knob for filtering rx/tx BPDU packets on a port Nikolay Aleksandrov
2022-02-10 16:06   ` Felix Fietkau
2022-02-11  8:16     ` Nikolay Aleksandrov
2022-02-11 17:01   ` Stephen Hemminger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.