All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups
@ 2018-04-29 18:07 David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup David Ahern
                   ` (9 more replies)
  0 siblings, 10 replies; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Provide a helper for doing a FIB and neighbor lookups in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet is expected to continue up the stack
for full processing.

Patches 1-6 do some more refactoring to IPv6 with the end goal of
extracting a FIB lookup function that aligns with fib_lookup for IPv4,
basically returning a fib6_info without creating a dst based entry.

Patch 7 adds lookup functions to the ipv6 stub. These are needed since
bpf is built into the kernel and ipv6 may not be built or loaded.

Patch 8 adds the bpf helper and 9 adds a sample program.

v2
- fixed use of foward helper from cls_act as noted by Daniel
- in patch 1 rename fib6_lookup_1 as well for consistency

David Ahern (9):
  net/ipv6: Rename fib6_lookup to fib6_node_lookup
  net/ipv6: Rename rt6_multipath_select
  net/ipv6: Extract table lookup from ip6_pol_route
  net/ipv6: Refactor fib6_rule_action
  net/ipv6: Add fib6_lookup
  net/ipv6: Update fib6 tracepoint to take fib6_info
  net/ipv6: Add fib lookup stubs for use in bpf helper
  bpf: Provide helper to do lookups in kernel FIB table
  samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP

 include/net/addrconf.h                    |  14 ++
 include/net/ip6_fib.h                     |  21 ++-
 include/trace/events/fib6.h               |  14 +-
 include/uapi/linux/bpf.h                  |  83 +++++++++-
 net/core/filter.c                         | 263 ++++++++++++++++++++++++++++++
 net/ipv6/addrconf_core.c                  |  33 +++-
 net/ipv6/af_inet6.c                       |   6 +-
 net/ipv6/fib6_rules.c                     | 138 +++++++++++++---
 net/ipv6/ip6_fib.c                        |  21 ++-
 net/ipv6/route.c                          |  76 +++++----
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 110 +++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 +++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 14 files changed, 847 insertions(+), 75 deletions(-)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

-- 
2.11.0

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 2/9] net/ipv6: Rename rt6_multipath_select David Ahern
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Rename fib6_lookup to fib6_node_lookup to better reflect what it
returns. The fib6_lookup name will be used in a later patch for
an IPv6 equivalent to IPv4's fib_lookup.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  6 +++---
 net/ipv6/ip6_fib.c    | 14 ++++++++------
 net/ipv6/route.c      |  8 ++++----
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1af450d4e923..5a16630179cb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
-struct fib6_node *fib6_lookup(struct fib6_node *root,
-			      const struct in6_addr *daddr,
-			      const struct in6_addr *saddr);
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr);
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
 			      const struct in6_addr *daddr, int dst_len,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6421c893466e..4cfffa0f676e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1354,8 +1354,8 @@ struct lookup_args {
 	const struct in6_addr	*addr;		/* search key			*/
 };
 
-static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
-				       struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+					    struct lookup_args *args)
 {
 	struct fib6_node *fn;
 	__be32 dir;
@@ -1400,7 +1400,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 #ifdef CONFIG_IPV6_SUBTREES
 				if (subtree) {
 					struct fib6_node *sfn;
-					sfn = fib6_lookup_1(subtree, args + 1);
+					sfn = fib6_node_lookup_1(subtree,
+								 args + 1);
 					if (!sfn)
 						goto backtrack;
 					fn = sfn;
@@ -1422,8 +1423,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 
 /* called with rcu_read_lock() held
  */
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
-			      const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr)
 {
 	struct fib6_node *fn;
 	struct lookup_args args[] = {
@@ -1442,7 +1444,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
 		}
 	};
 
-	fn = fib6_lookup_1(root, daddr ? args : args + 1);
+	fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
 	if (!fn || fn->fn_flags & RTN_TL_ROOT)
 		fn = root;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7ee0a34fba46..d903db30dfff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1006,7 +1006,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 		pn = rcu_dereference(fn->parent);
 		sn = FIB6_SUBTREE(pn);
 		if (sn && sn != fn)
-			fn = fib6_lookup(sn, NULL, saddr);
+			fn = fib6_node_lookup(sn, NULL, saddr);
 		else
 			fn = pn;
 		if (fn->fn_flags & RTN_RTINFO)
@@ -1059,7 +1059,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		flags &= ~RT6_LOOKUP_F_IFACE;
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	f6i = rcu_dereference(fn->leaf);
 	if (!f6i) {
@@ -1815,7 +1815,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	rcu_read_lock();
 
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
 
 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
@@ -2420,7 +2420,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	 */
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 2/9] net/ipv6: Rename rt6_multipath_select
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 3/9] net/ipv6: Extract table lookup from ip6_pol_route David Ahern
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Rename rt6_multipath_select to fib6_multipath_select and export it.
A later patch wants access to it similar to IPv4's fib_select_path.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/route.c      | 17 +++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5a16630179cb..80d76d8dc683 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb, int strict);
+
 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
 				   const struct in6_addr *daddr,
 				   const struct in6_addr *saddr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d903db30dfff..58af969f3a2c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -419,11 +419,11 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-static struct fib6_info *rt6_multipath_select(const struct net *net,
-					      struct fib6_info *match,
-					     struct flowi6 *fl6, int oif,
-					     const struct sk_buff *skb,
-					     int strict)
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb,
+					int strict)
 {
 	struct fib6_info *sibling, *next_sibling;
 
@@ -1068,8 +1068,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
-			f6i = rt6_multipath_select(net, f6i, fl6,
-						   fl6->flowi6_oif, skb, flags);
+			f6i = fib6_multipath_select(net, f6i, fl6,
+						    fl6->flowi6_oif, skb,
+						    flags);
 	}
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1824,7 +1825,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
 	if (f6i->fib6_nsiblings)
-		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 3/9] net/ipv6: Extract table lookup from ip6_pol_route
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 2/9] net/ipv6: Rename rt6_multipath_select David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 4/9] net/ipv6: Refactor fib6_rule_action David Ahern
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

ip6_pol_route is used for ingress and egress FIB lookups. Refactor it
moving the table lookup into a separate fib6_table_lookup that can be
invoked separately and export the new function.

ip6_pol_route now calls fib6_table_lookup and uses the result to generate
a dst based rt6_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  4 ++++
 net/ipv6/route.c      | 39 +++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 80d76d8dc683..4f7b8f59ea6d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; caller needs to select path */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict);
+
 struct fib6_info *fib6_multipath_select(const struct net *net,
 					struct fib6_info *match,
 					struct flowi6 *fl6, int oif,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 58af969f3a2c..d0ace0c5c3e9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1800,21 +1800,12 @@ void rt6_age_exceptions(struct fib6_info *rt,
 	rcu_read_unlock_bh();
 }
 
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
-			       int oif, struct flowi6 *fl6,
-			       const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict)
 {
 	struct fib6_node *fn, *saved_fn;
 	struct fib6_info *f6i;
-	struct rt6_info *rt;
-	int strict = 0;
-
-	strict |= flags & RT6_LOOKUP_F_IFACE;
-	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
-	if (net->ipv6.devconf_all->forwarding == 0)
-		strict |= RT6_LOOKUP_F_REACHABLE;
-
-	rcu_read_lock();
 
 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
@@ -1824,8 +1815,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
-	if (f6i->fib6_nsiblings)
-		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
@@ -1838,6 +1827,28 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6,
+			       const struct sk_buff *skb, int flags)
+{
+	struct fib6_info *f6i;
+	struct rt6_info *rt;
+	int strict = 0;
+
+	strict |= flags & RT6_LOOKUP_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
+
+	rcu_read_lock();
+
+	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+	if (f6i->fib6_nsiblings)
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
 	if (f6i == net->ipv6.fib6_null_entry) {
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 4/9] net/ipv6: Refactor fib6_rule_action
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
                   ` (2 preceding siblings ...)
  2018-04-29 18:07 ` [RFC v2 bpf-next 3/9] net/ipv6: Extract table lookup from ip6_pol_route David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup David Ahern
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Move source address lookup from fib6_rule_action to a helper. It will be
used in a later patch by a second variant for fib6_rule_action.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/fib6_rules.c | 52 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 6547fc6491a6..d040c4bff3a0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -96,6 +96,31 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &net->ipv6.ip6_null_entry->dst;
 }
 
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+			   struct flowi6 *flp6, const struct net_device *dev)
+{
+	struct fib6_rule *r = (struct fib6_rule *)rule;
+
+	/* If we need to find a source address for this traffic,
+	 * we check the result if it meets requirement of the rule.
+	 */
+	if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+	    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+		struct in6_addr saddr;
+
+		if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+				       rt6_flags2srcprefs(flags), &saddr))
+			return -EAGAIN;
+
+		if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+			return -EAGAIN;
+
+		flp6->saddr = saddr;
+	}
+
+	return 0;
+}
+
 static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
 {
@@ -134,27 +159,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 
 	rt = lookup(net, table, flp6, arg->lookup_data, flags);
 	if (rt != net->ipv6.ip6_null_entry) {
-		struct fib6_rule *r = (struct fib6_rule *)rule;
-
-		/*
-		 * If we need to find a source address for this traffic,
-		 * we check the result if it meets requirement of the rule.
-		 */
-		if ((rule->flags & FIB_RULE_FIND_SADDR) &&
-		    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
-			struct in6_addr saddr;
-
-			if (ipv6_dev_get_saddr(net,
-					       ip6_dst_idev(&rt->dst)->dev,
-					       &flp6->daddr,
-					       rt6_flags2srcprefs(flags),
-					       &saddr))
-				goto again;
-			if (!ipv6_prefix_equal(&saddr, &r->src.addr,
-					       r->src.plen))
-				goto again;
-			flp6->saddr = saddr;
-		}
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      ip6_dst_idev(&rt->dst)->dev);
+
+		if (err == -EAGAIN)
+			goto again;
+
 		err = rt->dst.error;
 		if (err != -EAGAIN)
 			goto out;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
                   ` (3 preceding siblings ...)
  2018-04-29 18:07 ` [RFC v2 bpf-next 4/9] net/ipv6: Refactor fib6_rule_action David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-05-01 18:15   ` Vincent Bernat
  2018-04-29 18:07 ` [RFC v2 bpf-next 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info David Ahern
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules,
but returns a FIB entry, fib6_info, rather than a dst based rt6_info.
fib6_lookup is on the order of 40% faster than any of the dst based
lookup methods without custom rules and 25% faster with custom rules.

Since the lookup function has a completely different signature,
fib6_rule_action is split into 2 paths: the existing one is
renamed __fib6_rule_action and a new one for the fib6_info path
is added. fib6_rule_action decides which to call based on the
lookup_ptr. If it is fib6_table_lookup then the new path is taken.

Caller must hold rcu lock as no reference is taken on the returned
fib entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  6 ++++
 net/ipv6/fib6_rules.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6_fib.c    |  7 +++++
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 4f7b8f59ea6d..d920dd00139b 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; can return error pointer
+ * caller needs to select path
+ */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags);
+
 /* called with rcu lock held; caller needs to select path */
 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 				    int oif, struct flowi6 *fl6, int strict);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d040c4bff3a0..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
 	return fib_rules_seq_read(net, AF_INET6);
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	struct fib6_info *f6i;
+	int err;
+
+	if (net->ipv6.fib6_has_custom_rules) {
+		struct fib_lookup_arg arg = {
+			.lookup_ptr = fib6_table_lookup,
+			.lookup_data = &oif,
+			.flags = FIB_LOOKUP_NOREF,
+		};
+
+		l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+		err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+				       flowi6_to_flowi(fl6), flags, &arg);
+		if (err)
+			return ERR_PTR(err);
+
+		f6i = arg.result ? : net->ipv6.fib6_null_entry;
+	} else {
+		f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+					oif, fl6, flags);
+		if (!f6i || f6i == net->ipv6.fib6_null_entry)
+			f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+						oif, fl6, flags);
+	}
+
+	return f6i;
+}
+
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup)
@@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
 	return 0;
 }
 
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
-			    int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+				int flags, struct fib_lookup_arg *arg)
+{
+	struct flowi6 *flp6 = &flp->u.ip6;
+	struct net *net = rule->fr_net;
+	struct fib6_table *table;
+	struct fib6_info *f6i;
+	int err = -EAGAIN, *oif;
+	u32 tb_id;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	tb_id = fib_rule_get_table(rule, arg);
+	table = fib6_get_table(net, tb_id);
+	if (!table)
+		return -EAGAIN;
+
+	oif = (int *)arg->lookup_data;
+	f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+	if (f6i != net->ipv6.fib6_null_entry) {
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      fib6_info_nh_dev(f6i));
+
+		if (likely(!err))
+			arg->result = f6i;
+	}
+
+	return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			      int flags, struct fib_lookup_arg *arg)
 {
 	struct flowi6 *flp6 = &flp->u.ip6;
 	struct rt6_info *rt = NULL;
@@ -182,6 +255,15 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 	return err;
 }
 
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	if (arg->lookup_ptr == fib6_table_lookup)
+		return fib6_rule_action_alt(rule, flp, flags, arg);
+
+	return __fib6_rule_action(rule, flp, flags, arg);
+}
+
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4cfffa0f676e..0b94c0a631cb 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &rt->dst;
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
 static void __net_init fib6_tables_init(struct net *net)
 {
 	fib6_link_table(net, net->ipv6.fib6_main_tbl);
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
                   ` (4 preceding siblings ...)
  2018-04-29 18:07 ` [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper David Ahern
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Similar to IPv4, IPv6 should use the FIB lookup result in the
tracepoint.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/trace/events/fib6.h | 14 +++++++-------
 net/ipv6/route.c            | 14 ++++++--------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 7e8d48a81b91..1b8d951e3c12 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,10 +12,10 @@
 
 TRACE_EVENT(fib6_table_lookup,
 
-	TP_PROTO(const struct net *net, const struct rt6_info *rt,
+	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
 		 struct fib6_table *table, const struct flowi6 *flp),
 
-	TP_ARGS(net, rt, table, flp),
+	TP_ARGS(net, f6i, table, flp),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
@@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup,
 		in6 = (struct in6_addr *)__entry->dst;
 		*in6 = flp->daddr;
 
-		if (rt->rt6i_idev) {
-			__assign_str(name, rt->rt6i_idev->dev->name);
+		if (f6i->fib6_nh.nh_dev) {
+			__assign_str(name, f6i->fib6_nh.nh_dev);
 		} else {
 			__assign_str(name, "");
 		}
-		if (rt == net->ipv6.ip6_null_entry) {
+		if (f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
 
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = in6_zero;
 
-		} else if (rt) {
+		} else if (f6i) {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = rt->rt6i_gateway;
+			*in6 = f6i->fib6_nh.nh_gw;
 		}
 	),
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d0ace0c5c3e9..cf8de6899581 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1078,6 +1078,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 			goto restart;
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	/* Search through exception table */
 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
 	if (rt) {
@@ -1096,8 +1098,6 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-
 	return rt;
 }
 
@@ -1827,6 +1827,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	return f6i;
 }
 
@@ -1853,7 +1855,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	}
 
@@ -1864,7 +1865,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_use_noref(&rt->dst, jiffies);
 
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
@@ -1890,9 +1890,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_hold(&uncached_rt->dst);
 		}
 
-		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
 		return uncached_rt;
-
 	} else {
 		/* Get a percpu copy */
 
@@ -1906,7 +1904,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 		local_bh_enable();
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
+
 		return pcpu_rt;
 	}
 }
@@ -2486,7 +2484,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, ret, table, fl6);
+	trace_fib6_table_lookup(net, rt, table, fl6);
 	return ret;
 };
 
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
                   ` (5 preceding siblings ...)
  2018-04-29 18:07 ` [RFC v2 bpf-next 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-04-29 18:07 ` [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table David Ahern
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Add stubs to retrieve a handle to an IPv6 FIB table, fib6_get_table,
a stub to do a lookup in a specific table, fib6_table_lookup, and
a stub for a full route lookup.

The stubs are needed for core bpf code to handle the case when the
IPv6 module is not builtin.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/addrconf.h   | 14 ++++++++++++++
 net/ipv6/addrconf_core.c | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/af_inet6.c      |  6 +++++-
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8312cc25a3af..ff766ab207e0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,6 +223,20 @@ struct ipv6_stub {
 				 const struct in6_addr *addr);
 	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
 			       struct dst_entry **dst, struct flowi6 *fl6);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..2fe754fd4f5e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
 	return -EAFNOSUPPORT;
 }
 
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6, int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			 int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+				   struct flowi6 *fl6, int oif,
+				   const struct sk_buff *skb, int strict)
+{
+	return f6i;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
-	.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
+	.fib6_get_table    = eafnosupport_fib6_get_table,
+	.fib6_table_lookup = eafnosupport_fib6_table_lookup,
+	.fib6_lookup       = eafnosupport_fib6_lookup,
+	.fib6_multipath_select = eafnosupport_fib6_multipath_select,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 36d622c477b1..c0e8255d50bb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -887,7 +887,11 @@ static struct pernet_operations inet6_net_ops = {
 static const struct ipv6_stub ipv6_stub_impl = {
 	.ipv6_sock_mc_join = ipv6_sock_mc_join,
 	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
-	.ipv6_dst_lookup = ip6_dst_lookup,
+	.ipv6_dst_lookup   = ip6_dst_lookup,
+	.fib6_get_table	   = fib6_get_table,
+	.fib6_table_lookup = fib6_table_lookup,
+	.fib6_lookup       = fib6_lookup,
+	.fib6_multipath_select = fib6_multipath_select,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
                   ` (6 preceding siblings ...)
  2018-04-29 18:07 ` [RFC v2 bpf-next 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-04-29 23:36   ` Alexei Starovoitov
  2018-05-02 11:27   ` Jesper Dangaard Brouer
  2018-04-29 18:07 ` [RFC v2 bpf-next 9/9] samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP David Ahern
  2018-05-01 14:20 ` [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Miller
  9 siblings, 2 replies; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet continues up the stack.

If it is to be forwarded, the forwarding can be done directly if the
neighbor is already known. If the neighbor does not exist, the first
few packets go up the stack for neighbor resolution. Once resolved, the
xdp program provides the fast path.

On successful lookup the nexthop dmac, current device smac and egress
device index are returned.

The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
are implemented in this patch. The API includes layer 4 parameters if
the XDP program chooses to do deep packet inspection to allow compare
against ACLs implemented as FIB rules.

Header rewrite is left to the XDP program.

The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
  straight to the table associated with the device (expert setting for
  those looking to maximize throughput)

- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
  Default is an ingress lookup.

Initial performance numbers collected by Jesper, forwarded packets/sec:

       Full stack    XDP FIB lookup    XDP Direct lookup
IPv4   1,947,969       7,074,156          7,415,333
IPv6   1,728,000       6,165,504          7,262,720

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/bpf.h |  83 ++++++++++++++-
 net/core/filter.c        | 263 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 345 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23b334bba1a6..52652507113e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -10,6 +10,8 @@
 
 #include <linux/types.h>
 #include <linux/bpf_common.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
 
 /* Extended instruction set based on top of classic BPF */
 
@@ -1801,6 +1803,33 @@ union bpf_attr {
  * 	Return
  * 		a non-negative value equal to or less than size on success, or
  * 		a negative error in case of failure.
+ *
+ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ *	Description
+ *		Do FIB lookup in kernel tables using parameters in *params*.
+ *		If lookup is successful and result shows packets is to be
+ *		forwarded, the neighbor tables are searched for the nexthop.
+ *		If successful (ie., FIB lookup shows forwarding and nexthop
+ *		is resolved), the nexthop address is returned in ipv4_dst,
+ *		ipv6_dst or mpls_out based on family, smac is set to mac
+ *		address of egress device, dmac is set to nexthop mac address,
+ *		rt_metric is set to metric from route.
+ *
+ *		*plen* argument is the size of the passed in struct.
+ *		*flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *
+ *		**BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
+ *		full lookup using FIB rules
+ *		**BPF_FIB_LOOKUP_OUTPUT** mmeans do lookup from an egress
+ *		perspective (default is ingress)
+ *
+ *		*ctx* is either **struct xdp_md** for XDP programs or
+ *		**struct sk_buff** tc cls_act programs.
+ *
+ *	Return
+ *		Egress device index on success, 0 if packet needs to continue
+ *		up the stack for further processing or a negative error in case
+ *		of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1870,7 +1899,8 @@ union bpf_attr {
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
-	FN(get_stack),
+	FN(get_stack),			\
+	FN(fib_lookup),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2278,4 +2308,55 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+struct bpf_fib_lookup {
+	/* input */
+	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+	__u32	ifindex;  /* L3 device index for lookup */
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowlabel;	/* AF_INET6 */
+
+		/* output: metric of fib result */
+		__u32 rt_metric;
+	};
+
+	union {
+		__be32		mpls_in;
+		__be32		ipv4_src;
+		struct in6_addr	ipv6_src;
+	};
+
+	/* input to bpf_fib_lookup, *dst is destination address.
+	 * output: bpf_fib_lookup sets to gateway address
+	 */
+	union {
+		/* return for MPLS lookups */
+		__be32		mpls_out[4];  /* support up to 4 labels */
+		__be32		ipv4_dst;
+		struct in6_addr	ipv6_dst;
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[ETH_ALEN];
+	__u8	dmac[ETH_ALEN];
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index d3781daa26ab..c34ba2675a98 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -59,6 +59,10 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3788,6 +3792,261 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	params->h_vlan_TCI = 0;
+	params->h_vlan_proto = 0;
+
+	return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct in_device *in_dev;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib_result res;
+	struct fib_nh *nh;
+	struct flowi4 fl4;
+	int err;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	/* verify forwarding is enabled on this interface */
+	in_dev = __in_dev_get_rcu(dev);
+	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl4.flowi4_iif = 1;
+		fl4.flowi4_oif = params->ifindex;
+	} else {
+		fl4.flowi4_iif = params->ifindex;
+		fl4.flowi4_oif = 0;
+	}
+	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+
+	fl4.flowi4_proto = params->l4_protocol;
+	fl4.daddr = params->ipv4_dst;
+	fl4.saddr = params->ipv4_src;
+	fl4.fl4_sport = params->sport;
+	fl4.fl4_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib_table *tb;
+
+		tb = fib_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	} else {
+		fl4.flowi4_mark = 0;
+		fl4.flowi4_secid = 0;
+		fl4.flowi4_tun_key.tun_id = 0;
+		fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+	}
+
+	if (err || res.type != RTN_UNICAST)
+		return 0;
+
+	if (res.fi->fib_nhs > 1)
+		fib_select_path(net, &res, &fl4, NULL);
+
+	nh = &res.fi->fib_nh[res.nh_sel];
+
+	/* do not handle lwt encaps right now */
+	if (nh->nh_lwtstate)
+		return 0;
+
+	dev = nh->nh_dev;
+	if (unlikely(!dev))
+		return 0;
+
+	if (nh->nh_gw)
+		params->ipv4_dst = nh->nh_gw;
+
+	params->rt_metric = res.fi->fib_priority;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so
+	 * rcu_read_lock_bh is not needed here
+	 */
+	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib6_info *f6i;
+	struct flowi6 fl6;
+	int strict = 0;
+	int oif;
+
+	/* link local addresses are never forwarded */
+	if (rt6_need_strict(&params->ipv6_dst) ||
+	    rt6_need_strict(&params->ipv6_src))
+		return 0;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl6.flowi6_iif = 1;
+		oif = fl6.flowi6_oif = params->ifindex;
+	} else {
+		oif = fl6.flowi6_iif = params->ifindex;
+		fl6.flowi6_oif = 0;
+		strict = RT6_LOOKUP_F_HAS_SADDR;
+	}
+	fl6.flowlabel = params->flowlabel;
+	fl6.flowi6_scope = 0;
+	fl6.flowi6_flags = 0;
+	fl6.mp_hash = 0;
+
+	fl6.flowi6_proto = params->l4_protocol;
+	fl6.daddr = params->ipv6_dst;
+	fl6.saddr = params->ipv6_src;
+	fl6.fl6_sport = params->sport;
+	fl6.fl6_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib6_table *tb;
+
+		tb = ipv6_stub->fib6_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+	} else {
+		fl6.flowi6_mark = 0;
+		fl6.flowi6_secid = 0;
+		fl6.flowi6_tun_key.tun_id = 0;
+		fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+	}
+
+	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+		return 0;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+	    f6i->fib6_type != RTN_UNICAST))
+		return 0;
+
+	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+						       fl6.flowi6_oif, NULL,
+						       strict);
+
+	if (f6i->fib6_nh.nh_lwtstate)
+		return 0;
+
+	if (f6i->fib6_flags & RTF_GATEWAY)
+		params->ipv6_dst = f6i->fib6_nh.nh_gw;
+
+	dev = f6i->fib6_nh.nh_dev;
+	params->rt_metric = f6i->fib6_metric;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
+	 * because we need to get nd_tbl via the stub
+	 */
+	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				      ndisc_hashfn, &params->ipv6_dst, dev);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+	}
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+	.func		= bpf_xdp_fib_lookup,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+	}
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+	.func		= bpf_skb_fib_lookup,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3933,6 +4192,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
+	case BPF_FUNC_fib_lookup:
+		return &bpf_skb_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -3958,6 +4219,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_map_proto;
 	case BPF_FUNC_xdp_adjust_tail:
 		return &bpf_xdp_adjust_tail_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_xdp_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [RFC v2 bpf-next 9/9] samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
                   ` (7 preceding siblings ...)
  2018-04-29 18:07 ` [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table David Ahern
@ 2018-04-29 18:07 ` David Ahern
  2018-05-02 11:13   ` Jesper Dangaard Brouer
  2018-05-01 14:20 ` [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Miller
  9 siblings, 1 reply; 21+ messages in thread
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 110 ++++++++++++++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 ++++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 4 files changed, 253 insertions(+)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 5e31770ac087..393dac1c43f4 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -45,6 +45,7 @@ hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
+hostprogs-y += xdp_fwd
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -98,6 +99,7 @@ xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
 xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
+xdp_fwd-objs := bpf_load.o $(LIBBPF) xdp_fwd_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -151,6 +153,7 @@ always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
+always += xdp_fwd_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -197,6 +200,7 @@ HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
 HOSTLOADLIBES_xdp_adjust_tail += -lelf
+HOSTLOADLIBES_xdp_fwd += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000000..4012c073fc45
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include "bpf_helpers.h"
+
+#define IPV6_FLOWINFO_MASK              cpu_to_be32(0x0FFFFFFF)
+
+struct bpf_map_def SEC("maps") tx_port = {
+	.type = BPF_MAP_TYPE_DEVMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 64,
+};
+
+static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct bpf_fib_lookup fib_params;
+	struct ethhdr *eth = data;
+	int out_index;
+	u16 h_proto;
+	u64 nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+
+	__builtin_memset(&fib_params, 0, sizeof(fib_params));
+
+	h_proto = eth->h_proto;
+	if (h_proto == htons(ETH_P_IP)) {
+		struct iphdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET;
+		fib_params.tos		= iph->tos;
+		fib_params.l4_protocol	= iph->protocol;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->tot_len);
+		fib_params.ipv4_src	= iph->saddr;
+		fib_params.ipv4_dst	= iph->daddr;
+	} else if (h_proto == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET6;
+		fib_params.flowlabel	= *(__be32 *)iph & IPV6_FLOWINFO_MASK;
+		fib_params.l4_protocol	= iph->nexthdr;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->payload_len);
+		fib_params.ipv6_src	= iph->saddr;
+		fib_params.ipv6_dst	= iph->daddr;
+	} else {
+		return XDP_PASS;
+	}
+
+	fib_params.ifindex = ctx->ingress_ifindex;
+
+	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+
+	/* verify egress index has xdp support */
+	// TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
+	//       cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+	if (out_index > 0) {
+		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+		return bpf_redirect_map(&tx_port, out_index, 0);
+	}
+
+	return XDP_PASS;
+}
+
+SEC("xdp_fwd")
+int xdp_fwd_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, 0);
+}
+
+SEC("xdp_fwd_direct")
+int xdp_fwd_direct_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000000..9c6606f57126
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/limits.h>
+#include <net/if.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "libbpf.h"
+
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"usage: %s [OPTS] interface-list\n"
+		"\nOPTS:\n"
+		"    -d    detach program\n"
+		"    -D    direct table lookups (skip fib rules)\n",
+		prog);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[PATH_MAX];
+	int opt, i, idx, err;
+	int prog_id = 0;
+	int attach = 1;
+	int ret = 0;
+
+	while ((opt = getopt(argc, argv, ":dD")) != -1) {
+		switch (opt) {
+		case 'd':
+			attach = 0;
+			break;
+		case 'D':
+			prog_id = 1;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	if (attach) {
+		snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+		if (access(filename, O_RDONLY) < 0) {
+			printf("error accessing file %s: %s\n",
+				filename, strerror(errno));
+			return 1;
+		}
+
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+
+		if (!prog_fd[prog_id]) {
+			printf("load_bpf_file: %s\n", strerror(errno));
+			return 1;
+		}
+	}
+	if (attach) {
+		for (i = 1; i < 64; ++i)
+			bpf_map_update_elem(map_fd[0], &i, &i, 0);
+	}
+
+	for (i = optind; i < argc; ++i) {
+		idx = if_nametoindex(argv[i]);
+		if (!idx)
+			idx = strtoul(argv[i], NULL, 0);
+
+		if (!idx) {
+			fprintf(stderr, "Invalid arg\n");
+			return 1;
+		}
+		if (!attach) {
+			err = do_detach(idx, argv[i]);
+			if (err)
+				ret = err;
+		} else {
+			err = do_attach(idx, prog_fd[prog_id], argv[i]);
+			if (err)
+				ret = err;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 265f8e0e8ada..2375d06c706b 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -103,6 +103,9 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 	(void *) BPF_FUNC_skb_get_xfrm_state;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 	(void *) BPF_FUNC_get_stack;
+static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
+			     int plen, __u32 flags) =
+	(void *) BPF_FUNC_fib_lookup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table
  2018-04-29 18:07 ` [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table David Ahern
@ 2018-04-29 23:36   ` Alexei Starovoitov
  2018-04-30  1:13     ` David Ahern
  2018-05-02 11:27   ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 21+ messages in thread
From: Alexei Starovoitov @ 2018-04-29 23:36 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, borkmann, ast, davem, shm, roopa, brouer, toke, john.fastabend

On Sun, Apr 29, 2018 at 11:07:51AM -0700, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
> 
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
> 
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
> 
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
> 
> Header rewrite is left to the XDP program.
> 
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
>   straight to the table associated with the device (expert setting for
>   those looking to maximize throughput)
> 
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
>   Default is an ingress lookup.
> 
> Initial performance numbers collected by Jesper, forwarded packets/sec:
> 
>        Full stack    XDP FIB lookup    XDP Direct lookup
> IPv4   1,947,969       7,074,156          7,415,333
> IPv6   1,728,000       6,165,504          7,262,720
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
>  include/uapi/linux/bpf.h |  83 ++++++++++++++-
>  net/core/filter.c        | 263 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 345 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 23b334bba1a6..52652507113e 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>  
>  #include <linux/types.h>
>  #include <linux/bpf_common.h>
> +#include <linux/if_ether.h>
> +#include <linux/in6.h>
>  
>  /* Extended instruction set based on top of classic BPF */
>  
> @@ -1801,6 +1803,33 @@ union bpf_attr {
>   * 	Return
>   * 		a non-negative value equal to or less than size on success, or
>   * 		a negative error in case of failure.
> + *
> + * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
> + *	Description
> + *		Do FIB lookup in kernel tables using parameters in *params*.
> + *		If lookup is successful and result shows packets is to be
> + *		forwarded, the neighbor tables are searched for the nexthop.
> + *		If successful (ie., FIB lookup shows forwarding and nexthop
> + *		is resolved), the nexthop address is returned in ipv4_dst,
> + *		ipv6_dst or mpls_out based on family, smac is set to mac
> + *		address of egress device, dmac is set to nexthop mac address,
> + *		rt_metric is set to metric from route.
> + *
> + *		*plen* argument is the size of the passed in struct.
> + *		*flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
> + *
> + *		**BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
> + *		full lookup using FIB rules
> + *		**BPF_FIB_LOOKUP_OUTPUT** mmeans do lookup from an egress
> + *		perspective (default is ingress)
> + *
> + *		*ctx* is either **struct xdp_md** for XDP programs or
> + *		**struct sk_buff** tc cls_act programs.
> + *
> + *	Return
> + *		Egress device index on success, 0 if packet needs to continue
> + *		up the stack for further processing or a negative error in case
> + *		of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -1870,7 +1899,8 @@ union bpf_attr {
>  	FN(bind),			\
>  	FN(xdp_adjust_tail),		\
>  	FN(skb_get_xfrm_state),		\
> -	FN(get_stack),
> +	FN(get_stack),			\
> +	FN(fib_lookup),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2278,4 +2308,55 @@ struct bpf_raw_tracepoint_args {
>  	__u64 args[0];
>  };
>  
> +/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
> + * OUTPUT:  Do lookup from egress perspective; default is ingress
> + */
> +#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
> +#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
> +
> +struct bpf_fib_lookup {
> +	/* input */
> +	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
> +
> +	/* set if lookup is to consider L4 data - e.g., FIB rules */
> +	__u8	l4_protocol;
> +	__be16	sport;
> +	__be16	dport;
> +
> +	/* total length of packet from network header - used for MTU check */
> +	__u16	tot_len;
> +	__u32	ifindex;  /* L3 device index for lookup */
> +
> +	union {
> +		/* inputs to lookup */
> +		__u8	tos;		/* AF_INET  */
> +		__be32	flowlabel;	/* AF_INET6 */
> +
> +		/* output: metric of fib result */
> +		__u32 rt_metric;
> +	};
> +
> +	union {
> +		__be32		mpls_in;
> +		__be32		ipv4_src;
> +		struct in6_addr	ipv6_src;
> +	};
> +
> +	/* input to bpf_fib_lookup, *dst is destination address.
> +	 * output: bpf_fib_lookup sets to gateway address
> +	 */
> +	union {
> +		/* return for MPLS lookups */
> +		__be32		mpls_out[4];  /* support up to 4 labels */
> +		__be32		ipv4_dst;
> +		struct in6_addr	ipv6_dst;
> +	};
> +
> +	/* output */
> +	__be16	h_vlan_proto;
> +	__be16	h_vlan_TCI;
> +	__u8	smac[ETH_ALEN];
> +	__u8	dmac[ETH_ALEN];
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index d3781daa26ab..c34ba2675a98 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -59,6 +59,10 @@
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
>  #include <linux/bpf_trace.h>
> +#include <linux/inetdevice.h>
> +#include <net/ip_fib.h>
> +#include <net/flow.h>
> +#include <net/arp.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -3788,6 +3792,261 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
> +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> +				  const struct neighbour *neigh,
> +				  const struct net_device *dev)
> +{
> +	memcpy(params->dmac, neigh->ha, ETH_ALEN);
> +	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> +	params->h_vlan_TCI = 0;
> +	params->h_vlan_proto = 0;
> +
> +	return dev->ifindex;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> +			       u32 flags)
> +{
> +	struct in_device *in_dev;
> +	struct neighbour *neigh;
> +	struct net_device *dev;
> +	struct fib_result res;
> +	struct fib_nh *nh;
> +	struct flowi4 fl4;
> +	int err;
> +
> +	dev = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!dev))
> +		return -ENODEV;
> +
> +	/* verify forwarding is enabled on this interface */
> +	in_dev = __in_dev_get_rcu(dev);
> +	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> +		return 0;
> +
> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +		fl4.flowi4_iif = 1;
> +		fl4.flowi4_oif = params->ifindex;
> +	} else {
> +		fl4.flowi4_iif = params->ifindex;
> +		fl4.flowi4_oif = 0;
> +	}
> +	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
> +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
> +	fl4.flowi4_flags = 0;
> +
> +	fl4.flowi4_proto = params->l4_protocol;
> +	fl4.daddr = params->ipv4_dst;
> +	fl4.saddr = params->ipv4_src;
> +	fl4.fl4_sport = params->sport;
> +	fl4.fl4_dport = params->dport;
> +
> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +		struct fib_table *tb;
> +
> +		tb = fib_get_table(net, tbid);
> +		if (unlikely(!tb))
> +			return 0;
> +
> +		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> +	} else {
> +		fl4.flowi4_mark = 0;
> +		fl4.flowi4_secid = 0;
> +		fl4.flowi4_tun_key.tun_id = 0;
> +		fl4.flowi4_uid = sock_net_uid(net, NULL);
> +
> +		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
> +	}
> +
> +	if (err || res.type != RTN_UNICAST)
> +		return 0;

should this be an error returned to the user instead of zero?
Seems useful to indicate.

> +
> +	if (res.fi->fib_nhs > 1)
> +		fib_select_path(net, &res, &fl4, NULL);
> +
> +	nh = &res.fi->fib_nh[res.nh_sel];
> +
> +	/* do not handle lwt encaps right now */
> +	if (nh->nh_lwtstate)
> +		return 0;

adn return enotsupp here?

> +
> +	dev = nh->nh_dev;
> +	if (unlikely(!dev))
> +		return 0;

enodev ?

> +
> +	if (nh->nh_gw)
> +		params->ipv4_dst = nh->nh_gw;
> +
> +	params->rt_metric = res.fi->fib_priority;
> +
> +	/* xdp and cls_bpf programs are run in RCU-bh so
> +	 * rcu_read_lock_bh is not needed here
> +	 */
> +	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> +	if (neigh)
> +		return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +	return 0;

Even this return 0 doesn't quite fit to what doc says:
"0 if packet needs to continue  up the stack for further processing"
What stack suppose to do ?
It will hit the same condition and packet will be dropped, right?
Isn't it better to report all errors back to bpf prog and let
the program make decision instead of 'return 0' almost everywhere?

> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> +			       u32 flags)
> +{
> +	struct neighbour *neigh;
> +	struct net_device *dev;
> +	struct fib6_info *f6i;
> +	struct flowi6 fl6;
> +	int strict = 0;
> +	int oif;
> +
> +	/* link local addresses are never forwarded */
> +	if (rt6_need_strict(&params->ipv6_dst) ||
> +	    rt6_need_strict(&params->ipv6_src))
> +		return 0;
> +
> +	dev = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!dev))
> +		return -ENODEV;
> +
> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +		fl6.flowi6_iif = 1;
> +		oif = fl6.flowi6_oif = params->ifindex;
> +	} else {
> +		oif = fl6.flowi6_iif = params->ifindex;
> +		fl6.flowi6_oif = 0;
> +		strict = RT6_LOOKUP_F_HAS_SADDR;
> +	}
> +	fl6.flowlabel = params->flowlabel;
> +	fl6.flowi6_scope = 0;
> +	fl6.flowi6_flags = 0;
> +	fl6.mp_hash = 0;
> +
> +	fl6.flowi6_proto = params->l4_protocol;
> +	fl6.daddr = params->ipv6_dst;
> +	fl6.saddr = params->ipv6_src;
> +	fl6.fl6_sport = params->sport;
> +	fl6.fl6_dport = params->dport;
> +
> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +		struct fib6_table *tb;
> +
> +		tb = ipv6_stub->fib6_get_table(net, tbid);
> +		if (unlikely(!tb))
> +			return 0;
> +
> +		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
> +	} else {
> +		fl6.flowi6_mark = 0;
> +		fl6.flowi6_secid = 0;
> +		fl6.flowi6_tun_key.tun_id = 0;
> +		fl6.flowi6_uid = sock_net_uid(net, NULL);
> +
> +		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
> +	}
> +
> +	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> +		return 0;
> +
> +	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> +	    f6i->fib6_type != RTN_UNICAST))
> +		return 0;
> +
> +	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
> +		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> +						       fl6.flowi6_oif, NULL,
> +						       strict);
> +
> +	if (f6i->fib6_nh.nh_lwtstate)
> +		return 0;
> +
> +	if (f6i->fib6_flags & RTF_GATEWAY)
> +		params->ipv6_dst = f6i->fib6_nh.nh_gw;
> +
> +	dev = f6i->fib6_nh.nh_dev;
> +	params->rt_metric = f6i->fib6_metric;
> +
> +	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> +	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
> +	 * because we need to get nd_tbl via the stub
> +	 */
> +	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
> +				      ndisc_hashfn, &params->ipv6_dst, dev);
> +	if (neigh)
> +		return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +	return 0;
> +}
> +#endif
> +
> +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
> +	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
> +{
> +	if (plen < sizeof(*params))
> +		return -EINVAL;

there should be a check here that only two bits are used in 'flags'
Otherwise flags will not be extendable.

> +
> +	switch (params->family) {
> +#if IS_ENABLED(CONFIG_INET)
> +	case AF_INET:
> +		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
> +					   flags);
> +#endif
> +#if IS_ENABLED(CONFIG_IPV6)
> +	case AF_INET6:
> +		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
> +					   flags);
> +#endif
> +	}
> +	return -ENOTSUPP;
> +}
> +
> +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
> +	.func		= bpf_xdp_fib_lookup,
> +	.gpl_only	= true,

do you really want to force all users of this helper to be gpl only?
I don't mind at all. That's your choice as the author of this helper.
Just the rest of networking helpers don't require gpl-ness.

> +	.pkt_access	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_PTR_TO_MEM,
> +	.arg3_type      = ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
> +	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
> +{
> +	if (plen < sizeof(*params))
> +		return -EINVAL;
> +
> +	switch (params->family) {
> +#if IS_ENABLED(CONFIG_INET)
> +	case AF_INET:
> +		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
> +#endif
> +#if IS_ENABLED(CONFIG_IPV6)
> +	case AF_INET6:
> +		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
> +#endif
> +	}
> +	return -ENOTSUPP;
> +}
> +
> +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
> +	.func		= bpf_skb_fib_lookup,
> +	.gpl_only	= true,
> +	.pkt_access	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_PTR_TO_MEM,
> +	.arg3_type      = ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  bpf_base_func_proto(enum bpf_func_id func_id)
>  {
> @@ -3933,6 +4192,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  	case BPF_FUNC_skb_get_xfrm_state:
>  		return &bpf_skb_get_xfrm_state_proto;
>  #endif
> +	case BPF_FUNC_fib_lookup:
> +		return &bpf_skb_fib_lookup_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> @@ -3958,6 +4219,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_xdp_redirect_map_proto;
>  	case BPF_FUNC_xdp_adjust_tail:
>  		return &bpf_xdp_adjust_tail_proto;
> +	case BPF_FUNC_fib_lookup:
> +		return &bpf_xdp_fib_lookup_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> -- 
> 2.11.0
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table
  2018-04-29 23:36   ` Alexei Starovoitov
@ 2018-04-30  1:13     ` David Ahern
  2018-05-15  3:46       ` David Ahern
  0 siblings, 1 reply; 21+ messages in thread
From: David Ahern @ 2018-04-30  1:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: netdev, borkmann, ast, davem, shm, roopa, brouer, toke, john.fastabend

On 4/29/18 5:36 PM, Alexei Starovoitov wrote:
>> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
>> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
>> +		struct fib_table *tb;
>> +
>> +		tb = fib_get_table(net, tbid);
>> +		if (unlikely(!tb))
>> +			return 0;
>> +
>> +		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
>> +	} else {
>> +		fl4.flowi4_mark = 0;
>> +		fl4.flowi4_secid = 0;
>> +		fl4.flowi4_tun_key.tun_id = 0;
>> +		fl4.flowi4_uid = sock_net_uid(net, NULL);
>> +
>> +		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
>> +	}
>> +
>> +	if (err || res.type != RTN_UNICAST)
>> +		return 0;
> 
> should this be an error returned to the user instead of zero?
> Seems useful to indicate.

res.type != UNICAST is not an error; it means other delivery type (e.g.,
local).

err < 0 means unreachable, prohibit, blackhole, etc. Arguably the error
could be returned to the xdp program, but it is more complicated than
that. Blackhole is a common default route or policy, but RTN_BLACKHOLE
== -EINVAL which is also the error code if the user passes invalid
arguments to the program.

> 
>> +
>> +	if (res.fi->fib_nhs > 1)
>> +		fib_select_path(net, &res, &fl4, NULL);
>> +
>> +	nh = &res.fi->fib_nh[res.nh_sel];
>> +
>> +	/* do not handle lwt encaps right now */
>> +	if (nh->nh_lwtstate)
>> +		return 0;
> 
> adn return enotsupp here?

see below

> 
>> +
>> +	dev = nh->nh_dev;
>> +	if (unlikely(!dev))
>> +		return 0;
> 
> enodev ?

see below

> 
>> +
>> +	if (nh->nh_gw)
>> +		params->ipv4_dst = nh->nh_gw;
>> +
>> +	params->rt_metric = res.fi->fib_priority;
>> +
>> +	/* xdp and cls_bpf programs are run in RCU-bh so
>> +	 * rcu_read_lock_bh is not needed here
>> +	 */
>> +	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
>> +	if (neigh)
>> +		return bpf_fib_set_fwd_params(params, neigh, dev);
>> +
>> +	return 0;
> 
> Even this return 0 doesn't quite fit to what doc says:
> "0 if packet needs to continue  up the stack for further processing"
> What stack suppose to do ?

First packet on a route the nexthop may not be resolved. Without punting
to the stack it never has an impetus to resolve that neighbor.


> It will hit the same condition and packet will be dropped, right?

no. It can resolve the neighbor so follow up packets can be forwarded in
the fast path.

> Isn't it better to report all errors back to bpf prog and let
> the program make decision instead of 'return 0' almost everywhere?

The idea here is to fast pass packets that fit a supported profile and
are to be forwarded. Everything else should continue up the stack as it
has wider capabilities. The helper and XDP programs should make no
assumptions on what the broader kernel and userspace might be monitoring
or want to do with packets that can not be forwarded in the fast path.
This is very similar to hardware forwarding when it punts packets to the
CPU for control plane assistance.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups
  2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
                   ` (8 preceding siblings ...)
  2018-04-29 18:07 ` [RFC v2 bpf-next 9/9] samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP David Ahern
@ 2018-05-01 14:20 ` David Miller
  9 siblings, 0 replies; 21+ messages in thread
From: David Miller @ 2018-05-01 14:20 UTC (permalink / raw)
  To: dsahern; +Cc: netdev, borkmann, ast, shm, roopa, brouer, toke, john.fastabend

From: David Ahern <dsahern@gmail.com>
Date: Sun, 29 Apr 2018 11:07:43 -0700

> Provide a helper for doing a FIB and neighbor lookups in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet is expected to continue up the stack
> for full processing.
> 
> Patches 1-6 do some more refactoring to IPv6 with the end goal of
> extracting a FIB lookup function that aligns with fib_lookup for IPv4,
> basically returning a fib6_info without creating a dst based entry.
> 
> Patch 7 adds lookup functions to the ipv6 stub. These are needed since
> bpf is built into the kernel and ipv6 may not be built or loaded.
> 
> Patch 8 adds the bpf helper and 9 adds a sample program.
> 
> v2
> - fixed use of foward helper from cls_act as noted by Daniel
> - in patch 1 rename fib6_lookup_1 as well for consistency

I've reviewed this and generally I agree with the semantic choices
wrt. resolution.

We really can't do neigh resolution without an SKB, so at least in
the xdp case we must push the packet up into the full stack path.

I guess we could do the neigh resolve in the cls_bpf case, but I
wonder how helpful that would be.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup
  2018-04-29 18:07 ` [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup David Ahern
@ 2018-05-01 18:15   ` Vincent Bernat
  2018-05-01 18:25     ` David Ahern
  0 siblings, 1 reply; 21+ messages in thread
From: Vincent Bernat @ 2018-05-01 18:15 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, borkmann, ast, davem, shm, roopa, brouer, toke, john.fastabend

 ❦ 29 avril 2018 11:07 -0700, David Ahern <dsahern@gmail.com> :

> +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
> +			      int flags)

Maybe an EXPORT_SYMBOL_GPL? There is one for __fib_lookup (fib_lookup is
an inline function).
-- 
Use the "telephone test" for readability.
            - The Elements of Programming Style (Kernighan & Plauger)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup
  2018-05-01 18:15   ` Vincent Bernat
@ 2018-05-01 18:25     ` David Ahern
  0 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-05-01 18:25 UTC (permalink / raw)
  To: Vincent Bernat
  Cc: netdev, borkmann, ast, davem, shm, roopa, brouer, toke, john.fastabend

On 5/1/18 12:15 PM, Vincent Bernat wrote:
>  ❦ 29 avril 2018 11:07 -0700, David Ahern <dsahern@gmail.com> :
> 
>> +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
>> +			      int flags)
> 
> Maybe an EXPORT_SYMBOL_GPL? There is one for __fib_lookup (fib_lookup is
> an inline function).
> 

There needs to be an in-tree user relying on it. My intention is to
convert rt6_lookup to fib6_lookup; as I recall all of those just want
the egress device and do not need a dst based response. That might
provide the in-tree user - depending on how the conversion is done.

And I presume you are asking for your benchmark modules. I can send you
my diffs and results.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 9/9] samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP
  2018-04-29 18:07 ` [RFC v2 bpf-next 9/9] samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP David Ahern
@ 2018-05-02 11:13   ` Jesper Dangaard Brouer
  2018-05-02 15:40     ` David Ahern
  0 siblings, 1 reply; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2018-05-02 11:13 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, borkmann, ast, davem, shm, roopa, toke, john.fastabend,
	brouer, Tariq Toukan


On Sun, 29 Apr 2018 11:07:52 -0700 David Ahern <dsahern@gmail.com> wrote:

> +	/* verify egress index has xdp support */
> +	// TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
> +	//       cannot pass map_type 14 into func bpf_map_lookup_elem#1:

I just want to point out that I/we are aware of this "usability"
problem with the sample program, but I don't want to block the FIB
helper going upstream, we can fix this problem later.

The problem is that if you load this bpf/xdp prog, then all incoming
traffic (on that interface), will be forward using XDP, regardless
whether the egress ifindex support XDP or not.  And if not supported,
then packets are dropped hard (only detectable via tracepoints).

If the bpf prog could tell/know that the egress ifindex doesn't
support XDP xmit, then it could simply return XDP_PASS to fallback to
the normal network stack.
-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table
  2018-04-29 18:07 ` [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table David Ahern
  2018-04-29 23:36   ` Alexei Starovoitov
@ 2018-05-02 11:27   ` Jesper Dangaard Brouer
  2018-05-02 15:37     ` David Ahern
  1 sibling, 1 reply; 21+ messages in thread
From: Jesper Dangaard Brouer @ 2018-05-02 11:27 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, borkmann, ast, davem, shm, roopa, toke, john.fastabend,
	brouer, Vincent Bernat

On Sun, 29 Apr 2018 11:07:51 -0700
David Ahern <dsahern@gmail.com> wrote:

> Initial performance numbers collected by Jesper, forwarded packets/sec:
> 
>        Full stack    XDP FIB lookup    XDP Direct lookup
> IPv4   1,947,969       7,074,156          7,415,333
> IPv6   1,728,000       6,165,504          7,262,720

Do notice these number is single CPU core forwarding performance!
On a Broadwell E5-1650 v4 @ 3.60GHz.

Another interesting data point is that xdp_redirect_map performance is
13,365,161 pps, which allow us to calculate/isolate the overhead/cost
of the FIB lookup.

(1/13365161-1/7074156)*10^9 = -66.5 ns
(1/13365161-1/7415333)*10^9 = -60.0 ns

Which is very close to the measured 50 ns cost of the FIB lookup, done
by Vincent Bernat.
  See: https://vincent.bernat.im/en/blog/2017-ipv4-route-lookup-linux



Another way I calculate this is by (ran a new benchmark):

Performance: 7641593 (7,641,593) <= tx_unicast /sec
 * Packet-gap: (1/7641593*10^9)  = 130.86 ns

Find all FIB related lookup functions in perf-report::

 Samples: 93K of event 'cycles:ppp', Event count (approx.): 88553104553
   Overhead   Cost      CPU  Command      Symbol
     20.63 %  26.99 ns  002  ksoftirqd/2  [k] fib_table_lookup
     12.92 %  16.90 ns  002  ksoftirqd/2  [k] bpf_fib_lookup
      2.40 %   3.14 ns  002  ksoftirqd/2  [k] fib_select_path
      0.83 %   1.09 ns  002  ksoftirqd/2  [k] fib_get_table
      0.40 %   0.52 ns  002  ksoftirqd/2  [k] l3mdev_fib_table_rcu
 -----------
 Tot:37.18 % (20.63+12.92+2.40+0.83+0.40)
 ===========

Cost of FIB lookup:
 - 130.86/100*37.18 = 48.65 ns overhead by FIB lookup.

Again very close to Vincent's IPv4 measurements of ~50 ns.



Notice that the IPv6 measurements does not match up:
 https://vincent.bernat.im/en/blog/2017-ipv6-route-lookup-linux
This is because, we/I'm just testing the IPv6 route cache here...

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table
  2018-05-02 11:27   ` Jesper Dangaard Brouer
@ 2018-05-02 15:37     ` David Ahern
  2018-05-02 17:00       ` David Miller
  0 siblings, 1 reply; 21+ messages in thread
From: David Ahern @ 2018-05-02 15:37 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, borkmann, ast, davem, shm, roopa, toke, john.fastabend,
	Vincent Bernat

On 5/2/18 5:27 AM, Jesper Dangaard Brouer wrote:
> On Sun, 29 Apr 2018 11:07:51 -0700
> David Ahern <dsahern@gmail.com> wrote:
> 
>> Initial performance numbers collected by Jesper, forwarded packets/sec:
>>
>>        Full stack    XDP FIB lookup    XDP Direct lookup
>> IPv4   1,947,969       7,074,156          7,415,333
>> IPv6   1,728,000       6,165,504          7,262,720
> 
> Do notice these number is single CPU core forwarding performance!
> On a Broadwell E5-1650 v4 @ 3.60GHz.

I'll add that context to the commit message. Thanks,

> 
> Another interesting data point is that xdp_redirect_map performance is
> 13,365,161 pps, which allow us to calculate/isolate the overhead/cost
> of the FIB lookup.
> 
> (1/13365161-1/7074156)*10^9 = -66.5 ns
> (1/13365161-1/7415333)*10^9 = -60.0 ns
> 
> Which is very close to the measured 50 ns cost of the FIB lookup, done
> by Vincent Bernat.
>   See: https://vincent.bernat.im/en/blog/2017-ipv4-route-lookup-linux
> 
> 
> 
> Another way I calculate this is by (ran a new benchmark):
> 
> Performance: 7641593 (7,641,593) <= tx_unicast /sec
>  * Packet-gap: (1/7641593*10^9)  = 130.86 ns
> 
> Find all FIB related lookup functions in perf-report::
> 
>  Samples: 93K of event 'cycles:ppp', Event count (approx.): 88553104553
>    Overhead   Cost      CPU  Command      Symbol
>      20.63 %  26.99 ns  002  ksoftirqd/2  [k] fib_table_lookup
>      12.92 %  16.90 ns  002  ksoftirqd/2  [k] bpf_fib_lookup
>       2.40 %   3.14 ns  002  ksoftirqd/2  [k] fib_select_path
>       0.83 %   1.09 ns  002  ksoftirqd/2  [k] fib_get_table
>       0.40 %   0.52 ns  002  ksoftirqd/2  [k] l3mdev_fib_table_rcu
>  -----------
>  Tot:37.18 % (20.63+12.92+2.40+0.83+0.40)
>  ===========
> 
> Cost of FIB lookup:
>  - 130.86/100*37.18 = 48.65 ns overhead by FIB lookup.
> 
> Again very close to Vincent's IPv4 measurements of ~50 ns.
> 
> 
> 
> Notice that the IPv6 measurements does not match up:
>  https://vincent.bernat.im/en/blog/2017-ipv6-route-lookup-linux
> This is because, we/I'm just testing the IPv6 route cache here...
> 

Vincent's blog is before recent changes -- 4.15 getting the rcu locking,
net-next getting separate fib entries and now this set adding a FIB
lookup without the dst.

To share numbers from recent testing I did using Vincent's modules,
lookup times in nsec (using local_clock) with MULTIPLE_TABLES config
disabled for IPv4 and IPv6

            IPv4    IPv6-dst    IPv6-fib6
baseline     49        126         52

I have other cases with combinations of configs and rules, but this
shows the best possible case.

IPv6 needs some more work to improve speeds with MULTIPLE_TABLES enabled
(separate local and main tables unlike IPv4) and IPV6_SUBTREES enabled.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 9/9] samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP
  2018-05-02 11:13   ` Jesper Dangaard Brouer
@ 2018-05-02 15:40     ` David Ahern
  0 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-05-02 15:40 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, borkmann, ast, davem, shm, roopa, toke, john.fastabend,
	Tariq Toukan

On 5/2/18 5:13 AM, Jesper Dangaard Brouer wrote:
> 
> On Sun, 29 Apr 2018 11:07:52 -0700 David Ahern <dsahern@gmail.com> wrote:
> 
>> +	/* verify egress index has xdp support */
>> +	// TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
>> +	//       cannot pass map_type 14 into func bpf_map_lookup_elem#1:
> 
> I just want to point out that I/we are aware of this "usability"
> problem with the sample program, but I don't want to block the FIB
> helper going upstream, we can fix this problem later.
> 
> The problem is that if you load this bpf/xdp prog, then all incoming
> traffic (on that interface), will be forward using XDP, regardless
> whether the egress ifindex support XDP or not.  And if not supported,
> then packets are dropped hard (only detectable via tracepoints).
> 
> If the bpf prog could tell/know that the egress ifindex doesn't
> support XDP xmit, then it could simply return XDP_PASS to fallback to
> the normal network stack.
> 

That's what the above lookup is intending but DEVMAP type does not
handle lookups from xdp programs. Any chance of fixing that?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table
  2018-05-02 15:37     ` David Ahern
@ 2018-05-02 17:00       ` David Miller
  0 siblings, 0 replies; 21+ messages in thread
From: David Miller @ 2018-05-02 17:00 UTC (permalink / raw)
  To: dsahern
  Cc: brouer, netdev, borkmann, ast, shm, roopa, toke, john.fastabend, bernat

From: David Ahern <dsahern@gmail.com>
Date: Wed, 2 May 2018 09:37:21 -0600

> To share numbers from recent testing I did using Vincent's modules,
> lookup times in nsec (using local_clock) with MULTIPLE_TABLES config
> disabled for IPv4 and IPv6
> 
>             IPv4    IPv6-dst    IPv6-fib6
> baseline     49        126         52
> 
> I have other cases with combinations of configs and rules, but this
> shows the best possible case.
> 
> IPv6 needs some more work to improve speeds with MULTIPLE_TABLES enabled
> (separate local and main tables unlike IPv4) and IPV6_SUBTREES enabled.

Yes, like for ipv4 sharing local and main tables will help a lot.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table
  2018-04-30  1:13     ` David Ahern
@ 2018-05-15  3:46       ` David Ahern
  0 siblings, 0 replies; 21+ messages in thread
From: David Ahern @ 2018-05-15  3:46 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: netdev, borkmann, ast, davem, shm, roopa, brouer, toke, john.fastabend

On 4/29/18 7:13 PM, David Ahern wrote:
> 
> The idea here is to fast pass packets that fit a supported profile and
> are to be forwarded. Everything else should continue up the stack as it
> has wider capabilities. The helper and XDP programs should make no
> assumptions on what the broader kernel and userspace might be monitoring
> or want to do with packets that can not be forwarded in the fast path.
> This is very similar to hardware forwarding when it punts packets to the
> CPU for control plane assistance.
> 

Thinking about this some more and how to return more information to the
bpf program about the FIB lookup.

bpf_fib_lookup struct is 64-bytes. It can not be expanded without
hurting performance. I could do another union on an input parameter and
return flags indicating why the returned index is 0. Something like this:

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 360a1168c353..75591522444c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2314,6 +2314,12 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)

+#define BPF_FIB_LKUP_RET_NO_FWD      BIT(0)  /* pkt is not fwded */
+#define BPF_FIB_LKUP_RET_UNSUPP_LWT  BIT(1)  /* fwd requires unsupp
encap */
+#define BPF_FIB_LKUP_RET_NO_NHDEV    BIT(2)  /* nh device does not exist */
+#define BPF_FIB_LKUP_RET_NO_NEIGH    BIT(3)  /* no neigh entry for nh */
+#define BPF_FIB_LKUP_RET_FRAG_NEEDED BIT(4)  /* pkt too big to fwd */
+
 struct bpf_fib_lookup {
        /* input */
        __u8    family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
@@ -2325,7 +2331,11 @@ struct bpf_fib_lookup {

        /* total length of packet from network header - used for MTU
check */
        __u16   tot_len;
-       __u32   ifindex;  /* L3 device index for lookup */
+
+       union {
+               __u32   ifindex;   /* in: L3 device index for lookup */
+               __u32   ret_flags; /* out: BPF_FIB_LOOKUP_RET flags */
+       }

        union {
                /* inputs to lookup */


Similarly for the fib result, it could be returned with a union on say
family:
    union {
        __u8 family;   /* in: network family, AF_INET, AF_INET6, AF_MPLS */
        __u8 rt_type;  /* out: FIB lookup route type */
    };

Then if the fib result is -EINVAL/-EHOSTUNREACH/-EACCES, rt_type is set
to RTN_BLACKHOLE/RTN_UNREACHABLE/RTN_PROHIBIT allowing the XDP program
to make an informed decision on dropping the packet.

To avoid performance hits on the forwarding path, these return values
would *only* set if the ifindex returned is 0.

^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2018-05-15  3:46 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-29 18:07 [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Ahern
2018-04-29 18:07 ` [RFC v2 bpf-next 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup David Ahern
2018-04-29 18:07 ` [RFC v2 bpf-next 2/9] net/ipv6: Rename rt6_multipath_select David Ahern
2018-04-29 18:07 ` [RFC v2 bpf-next 3/9] net/ipv6: Extract table lookup from ip6_pol_route David Ahern
2018-04-29 18:07 ` [RFC v2 bpf-next 4/9] net/ipv6: Refactor fib6_rule_action David Ahern
2018-04-29 18:07 ` [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup David Ahern
2018-05-01 18:15   ` Vincent Bernat
2018-05-01 18:25     ` David Ahern
2018-04-29 18:07 ` [RFC v2 bpf-next 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info David Ahern
2018-04-29 18:07 ` [RFC v2 bpf-next 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper David Ahern
2018-04-29 18:07 ` [RFC v2 bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table David Ahern
2018-04-29 23:36   ` Alexei Starovoitov
2018-04-30  1:13     ` David Ahern
2018-05-15  3:46       ` David Ahern
2018-05-02 11:27   ` Jesper Dangaard Brouer
2018-05-02 15:37     ` David Ahern
2018-05-02 17:00       ` David Miller
2018-04-29 18:07 ` [RFC v2 bpf-next 9/9] samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP David Ahern
2018-05-02 11:13   ` Jesper Dangaard Brouer
2018-05-02 15:40     ` David Ahern
2018-05-01 14:20 ` [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.