netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
@ 2019-01-24 19:34 Peter Oskolkov
  2019-01-24 19:34 ` [PATCH bpf-next v2 1/3] bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap Peter Oskolkov
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Peter Oskolkov @ 2019-01-24 19:34 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Willem de Bruijn, Peter Oskolkov

This patchset implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with a different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

V2 changes: Added flowi-based route lookup, IPv6 encapping, and
encapping on ingress.


Peter Oskolkov (3):
  bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap
  bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
  selftests: bpf: add test_lwt_ip_encap selftest

 include/net/lwtunnel.h                        |   3 +
 include/uapi/linux/bpf.h                      |  23 +-
 net/core/filter.c                             |  47 ++-
 net/core/lwt_bpf.c                            | 142 ++++++++
 tools/testing/selftests/bpf/Makefile          |   5 +-
 .../testing/selftests/bpf/test_lwt_ip_encap.c | 125 +++++++
 .../selftests/bpf/test_lwt_ip_encap.sh        | 316 ++++++++++++++++++
 7 files changed, 652 insertions(+), 9 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lwt_ip_encap.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh

-- 
2.20.1.321.g9e740568ce-goog


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH bpf-next v2 1/3] bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap
  2019-01-24 19:34 [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
@ 2019-01-24 19:34 ` Peter Oskolkov
  2019-01-24 19:34 ` [PATCH bpf-next v2 2/3] bpf: implement BPF_LWT_ENCAP_IP mode " Peter Oskolkov
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Peter Oskolkov @ 2019-01-24 19:34 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Willem de Bruijn, Peter Oskolkov

This patch adds all needed plumbing in preparation to allowing
bpf programs to do IP encapping via bpf_lwt_push_encap. Actual
implementation is added in the next patch in the patchset.

Of note:
- bpf_lwt_push_encap can now be called from BPF_PROG_TYPE_LWT_XMIT
  prog types in addition to BPF_PROG_TYPE_LWT_IN;
- as route lookups are different for ingress vs egress, the single
  external bpf_lwt_push_encap BPF helper is routed internally to
  either bpf_lwt_in_push_encap or bpf_lwt_xmit_push_encap BPF_CALLs,
  depending on prog type.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/uapi/linux/bpf.h | 23 ++++++++++++++++++--
 net/core/filter.c        | 46 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2940a9854f6d..d4852feb14e4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2014,6 +2014,16 @@ union bpf_attr {
  *			Only works if *skb* contains an IPv6 packet. Insert a
  *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
  *			the IPv6 header.
+ *		**BPF_LWT_ENCAP_IP**
+ *			IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ *			must be IPv4 or IPv6, followed by zero or more
+ *			additional headers, up to LWT_BPF_MAX_HEADROOM total
+ *			bytes in all prepended headers.
+ *
+ *		BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
+ *		type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
+ *		by bpf programs of types BPF_PROG_TYPE_LWT_IN and
+ *		BPF_PROG_TYPE_LWT_XMIT.
  *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2494,7 +2504,8 @@ enum bpf_hdr_start_off {
 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
 enum bpf_lwt_encap_mode {
 	BPF_LWT_ENCAP_SEG6,
-	BPF_LWT_ENCAP_SEG6_INLINE
+	BPF_LWT_ENCAP_SEG6_INLINE,
+	BPF_LWT_ENCAP_IP,
 };
 
 #define __bpf_md_ptr(type, name)	\
@@ -2582,7 +2593,15 @@ enum bpf_ret_code {
 	BPF_DROP = 2,
 	/* 3-6 reserved */
 	BPF_REDIRECT = 7,
-	/* >127 are reserved for prog type specific return codes */
+	/* >127 are reserved for prog type specific return codes.
+	 *
+	 * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
+	 *    BPF_PROG_TYPE_LWT_XMIT to indicate that skb's dst
+	 *    has changed and appropriate dst_input() or dst_output()
+	 *    action has to be taken (this is an L3 redirect, as
+	 *    opposed to L2 redirect represented by BPF_REDIRECT above).
+	 */
+	BPF_LWT_REROUTE = 128,
 };
 
 struct bpf_sock {
diff --git a/net/core/filter.c b/net/core/filter.c
index 8e587dd1da20..fd3ae092d3d7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4793,7 +4793,13 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 }
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
-BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+			     bool ingress)
+{
+	return -EINVAL;  /* Implemented in the next patch. */
+}
+
+BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	   u32, len)
 {
 	switch (type) {
@@ -4801,14 +4807,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	case BPF_LWT_ENCAP_SEG6:
 	case BPF_LWT_ENCAP_SEG6_INLINE:
 		return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+	case BPF_LWT_ENCAP_IP:
+		return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
 #endif
 	default:
 		return -EINVAL;
 	}
 }
 
-static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
-	.func		= bpf_lwt_push_encap,
+BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
+	   void *, hdr, u32, len)
+{
+	switch (type) {
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+	case BPF_LWT_ENCAP_IP:
+		return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
+#endif
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
+	.func		= bpf_lwt_in_push_encap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
+	.func		= bpf_lwt_xmit_push_encap,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
@@ -5274,7 +5307,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_lwt_seg6_adjust_srh ||
 	    func == bpf_lwt_seg6_action ||
 #endif
-	    func == bpf_lwt_push_encap)
+	    func == bpf_lwt_in_push_encap ||
+	    func == bpf_lwt_xmit_push_encap)
 		return true;
 
 	return false;
@@ -5652,7 +5686,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_lwt_push_encap:
-		return &bpf_lwt_push_encap_proto;
+		return &bpf_lwt_in_push_encap_proto;
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
@@ -5688,6 +5722,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_l4_csum_replace_proto;
 	case BPF_FUNC_set_hash_invalid:
 		return &bpf_set_hash_invalid_proto;
+	case BPF_FUNC_lwt_push_encap:
+		return &bpf_lwt_xmit_push_encap_proto;
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
-- 
2.20.1.321.g9e740568ce-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH bpf-next v2 2/3] bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
  2019-01-24 19:34 [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
  2019-01-24 19:34 ` [PATCH bpf-next v2 1/3] bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap Peter Oskolkov
@ 2019-01-24 19:34 ` Peter Oskolkov
  2019-01-28 20:31   ` David Ahern
  2019-01-24 19:34 ` [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest Peter Oskolkov
  2019-01-26  0:29 ` [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern
  3 siblings, 1 reply; 9+ messages in thread
From: Peter Oskolkov @ 2019-01-24 19:34 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Willem de Bruijn, Peter Oskolkov

This patch implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with a different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/net/lwtunnel.h |   3 +
 net/core/filter.c      |   3 +-
 net/core/lwt_bpf.c     | 142 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33fd9ba7e0e5..f0973eca8036 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int lwtunnel_input(struct sk_buff *skb);
 int lwtunnel_xmit(struct sk_buff *skb);
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+			  bool ingress);
 
 static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 {
@@ -138,6 +140,7 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 		dst->input = lwtunnel_input;
 	}
 }
+
 #else
 
 static inline void lwtstate_free(struct lwtunnel_state *lws)
diff --git a/net/core/filter.c b/net/core/filter.c
index fd3ae092d3d7..81d18660c38b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -73,6 +73,7 @@
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
+#include <net/lwtunnel.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4796,7 +4797,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
 			     bool ingress)
 {
-	return -EINVAL;  /* Implemented in the next patch. */
+	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
 }
 
 BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 3e85437f7106..a3f79bff3776 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/ip6_route.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -55,6 +56,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
@@ -97,6 +99,8 @@ static int bpf_input(struct sk_buff *skb)
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return dst_input(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
@@ -168,6 +172,13 @@ static int bpf_xmit(struct sk_buff *skb)
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			ret = dst_output(dev_net(skb_dst(skb)->dev),
+					 skb->sk, skb);
+			if (unlikely(ret))
+				return ret;
+			/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+			return LWTUNNEL_XMIT_DONE;
 		default:
 			return ret;
 		}
@@ -389,6 +400,137 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
 	.owner		= THIS_MODULE,
 };
 
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+	struct dst_entry *dst = NULL;
+	struct iphdr *iph;
+	bool ipv4;
+	int err;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+		return -EINVAL;
+
+	/* validate protocol and length */
+	iph = (struct iphdr *)hdr;
+	if (iph->version == 4) {
+		ipv4 = true;
+		if (iph->ihl * 4 > len)
+			return -EINVAL;
+	} else if (iph->version == 6) {
+		ipv4 = false;
+		if (unlikely(len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+	} else {
+		return -EINVAL;
+	}
+
+	/* allocate enough space for the encap headers + L2 hdr */
+	if (ingress) {
+		err = skb_cow_head(skb, len + skb->mac_len);
+		if (unlikely(err))
+			return err;
+	} else {
+		/* ip_route_input_noref below does route lookup and dst
+		 * drop/set for ingress. There is no similar function for
+		 * egress, so we need to do route lookup and replace skb's
+		 * dst in this function.
+		 */
+		struct sock *sk;
+		struct net *net;
+
+		sk = sk_to_full_sk(skb->sk);
+		if (sk)
+			net = sock_net(sk);
+		else
+			net = dev_net(skb_dst(skb)->dev);
+
+		if (ipv4) {
+			struct flowi4 fl4 = {0};
+			struct rtable *rt;
+
+			fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
+			fl4.flowi4_mark = skb->mark;
+			fl4.flowi4_uid = sock_net_uid(net, sk);
+			fl4.flowi4_tos = RT_TOS(iph->tos);
+			fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+			fl4.flowi4_proto = iph->protocol;
+			fl4.daddr = iph->daddr;
+			fl4.saddr = iph->saddr;
+
+			rt = ip_route_output_key(net, &fl4);
+			if (IS_ERR(rt) || rt->dst.error)
+				return -EINVAL;
+			dst = &rt->dst;
+		} else {
+			struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
+			struct flowi6 fl6 = {0};
+
+			fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
+			fl6.flowi6_mark = skb->mark;
+			fl6.flowi6_uid = sock_net_uid(net, sk);
+			fl6.flowlabel = ip6_flowinfo(iph6);
+			fl6.flowi6_proto = iph6->nexthdr;
+			fl6.daddr = iph6->daddr;
+			fl6.saddr = iph6->saddr;
+
+			dst = ip6_route_output(net, skb->sk, &fl6);
+			if (IS_ERR(dst) || dst->error)
+				return -EINVAL;
+		}
+
+		err = skb_cow_head(skb, len + LL_RESERVED_SPACE(dst->dev));
+		if (unlikely(err))
+			return err;
+	}
+
+	/* push the encap headers and fix pointers */
+	skb_reset_inner_headers(skb);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	if (ingress)
+		skb_postpush_rcsum(skb, iph, len);
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	memcpy(iph, hdr, len);
+	bpf_compute_data_pointers(skb);
+
+	/* final skb touches + routing */
+	if (ipv4) {
+		skb->protocol = htons(ETH_P_IP);
+		if (iph->ihl * 4 < len)
+			skb_set_transport_header(skb, iph->ihl * 4);
+
+		if (!iph->check)
+			iph->check = ip_fast_csum((unsigned char *)iph,
+						  iph->ihl);
+
+		if (ingress) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb_dst(skb)->dev);
+			if (err)
+				return err;
+		} else {
+			skb_dst_drop(skb);
+			skb_dst_set(skb, dst);
+		}
+	} else {
+		skb->protocol = htons(ETH_P_IPV6);
+		if (sizeof(struct ipv6hdr) < len)
+			skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+		if (ingress) {
+			ip6_route_input(skb);
+			if (skb_dst(skb)->error)
+				return skb_dst(skb)->error;
+		} else {
+			skb_dst_drop(skb);
+			skb_dst_set(skb, dst);
+		}
+	}
+
+	return 0;
+}
+
 static int __init bpf_lwt_init(void)
 {
 	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
-- 
2.20.1.321.g9e740568ce-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest
  2019-01-24 19:34 [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
  2019-01-24 19:34 ` [PATCH bpf-next v2 1/3] bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap Peter Oskolkov
  2019-01-24 19:34 ` [PATCH bpf-next v2 2/3] bpf: implement BPF_LWT_ENCAP_IP mode " Peter Oskolkov
@ 2019-01-24 19:34 ` Peter Oskolkov
  2019-01-26 21:46   ` Alexei Starovoitov
  2019-01-28 20:31   ` David Ahern
  2019-01-26  0:29 ` [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern
  3 siblings, 2 replies; 9+ messages in thread
From: Peter Oskolkov @ 2019-01-24 19:34 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Willem de Bruijn, Peter Oskolkov

This patch adds a bpf self-test to cover BPF_LWT_ENCAP_IP mode
in bpf_lwt_push_encap.

Covered:
- encapping in LWT_IN and LWT_XMIT
- IPv4 and IPv6

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 tools/testing/selftests/bpf/Makefile          |   5 +-
 .../testing/selftests/bpf/test_lwt_ip_encap.c | 125 +++++++
 .../selftests/bpf/test_lwt_ip_encap.sh        | 316 ++++++++++++++++++
 3 files changed, 444 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lwt_ip_encap.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 70229de510f5..407c51cc5f07 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -39,7 +39,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
 	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o \
 	test_sk_lookup_kern.o test_xdp_vlan.o test_queue_map.o test_stack_map.o \
-	xdp_dummy.o test_map_in_map.o
+	xdp_dummy.o test_map_in_map.o test_lwt_ip_encap.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -53,7 +53,8 @@ TEST_PROGS := test_kmod.sh \
 	test_lirc_mode2.sh \
 	test_skb_cgroup_id.sh \
 	test_flow_dissector.sh \
-	test_xdp_vlan.sh
+	test_xdp_vlan.sh \
+	test_lwt_ip_encap.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
 	tcp_client.py \
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
new file mode 100644
index 000000000000..a7014277f3fe
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <string.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define BPF_LWT_ENCAP_IP 2
+#define BPF_LWT_REROUTE 128
+
+struct iphdr {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u8	ihl:4,
+		version:4;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	__u8	version:4,
+		ihl:4;
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+	__u8	tos;
+	__be16	tot_len;
+	__be16	id;
+	__be16	frag_off;
+	__u8	ttl;
+	__u8	protocol;
+	__sum16	check;
+	__be32	saddr;
+	__be32	daddr;
+};
+
+struct ipv6hdr {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u8	priority:4,
+		version:4;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	__u8	version:4,
+		priority:4;
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+	__u8	flow_lbl[3];
+
+	__be16	payload_len;
+	__u8	nexthdr;
+	__u8	hop_limit;
+
+	__u8	saddr[16];
+	__u8	daddr[16];
+};
+
+struct grehdr {
+	__be16 flags;
+	__be16 protocol;
+};
+
+SEC("encap_gre")
+int bpf_lwt_encap_gre(struct __sk_buff *skb)
+{
+	struct encap_hdr {
+		struct iphdr iph;
+		struct grehdr greh;
+	} hdr;
+	int err;
+
+	memset(&hdr, 0, sizeof(struct encap_hdr));
+
+	hdr.iph.ihl = 5;
+	hdr.iph.version = 4;
+	hdr.iph.ttl = 0x40;
+	hdr.iph.protocol = 47;  /* IPPROTO_GRE */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	hdr.iph.saddr = 0x640110ac;  /* 172.16.1.100 */
+	hdr.iph.daddr = 0x641010ac;  /* 172.16.16.100 */
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	hdr.iph.saddr = 0xac100164;  /* 172.16.1.100 */
+	hdr.iph.daddr = 0xac101064;  /* 172.16.16.100 */
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+	hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr));
+
+	hdr.greh.protocol = skb->protocol;
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+				 sizeof(struct encap_hdr));
+	if (err)
+		return BPF_DROP;
+	return BPF_LWT_REROUTE;
+}
+
+SEC("encap_gre6")
+int bpf_lwt_encap_gre6(struct __sk_buff *skb)
+{
+	struct encap_hdr {
+		struct ipv6hdr ip6hdr;
+		struct grehdr greh;
+	} hdr;
+	int err;
+
+	memset(&hdr, 0, sizeof(struct encap_hdr));
+
+	hdr.ip6hdr.version = 6;
+	hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(struct grehdr));
+	hdr.ip6hdr.nexthdr = 47;  /* IPPROTO_GRE */
+	hdr.ip6hdr.hop_limit = 0x40;
+	/* fb01::1 */
+	hdr.ip6hdr.saddr[0] = 0xfb;
+	hdr.ip6hdr.saddr[1] = 1;
+	hdr.ip6hdr.saddr[15] = 1;
+	/* fb10::1 */
+	hdr.ip6hdr.daddr[0] = 0xfb;
+	hdr.ip6hdr.daddr[1] = 0x10;
+	hdr.ip6hdr.daddr[15] = 1;
+
+	hdr.greh.protocol = skb->protocol;
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+				 sizeof(struct encap_hdr));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_LWT_REROUTE;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
new file mode 100755
index 000000000000..4f511587bb74
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
@@ -0,0 +1,316 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Setup/topology:
+#
+#    NS1             NS2             NS3
+#   veth1 <---> veth2   veth3 <---> veth4 (the top route)
+#   veth5 <---> veth6   veth7 <---> veth8 (the bottom route)
+#
+#   each vethN gets IPv[4|6]_N address
+#
+#   IPv*_SRC = IPv*_1
+#   IPv*_DST = IPv*_4
+#
+#   all tests test pings from IPv*_SRC to IPv*_DST
+#
+#   by default, routes are configured to allow packets to go
+#   IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route)
+#
+#   a GRE device is installed in NS3 with IPv*_GRE, and
+#   NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8
+#   (the bottom route)
+#
+# Tests:
+#
+#   1. routes NS2->IPv*_DST are brought down, so the only way a ping
+#      from IP*_SRC to IP*_DST can work is via IPv*_GRE
+#
+#   2a. in an egress test, a bpf LWT_XMIT program is installed on veth1
+#       that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+#       ping: SRC->[encap at veth1:egress]->GRE:decap->DST
+#       ping replies go DST->SRC directly
+#
+#   2b. in an ingress test, a bpf LWT_IN program is installed on veth2
+#       that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+#       ping: SRC->[encap at veth2:ingress]->GRE:decap->DST
+#       ping replies go DST->SRC directly
+
+set +x  # debug OFF
+set -e  # exit on error
+
+if [[ $EUID -ne 0 ]]; then
+	echo "This script must be run as root"
+	echo "FAIL"
+	exit 1
+fi
+
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
+readonly NS3="ns3-$(mktemp -u XXXXXX)"
+
+readonly IPv4_1="172.16.1.100"
+readonly IPv4_2="172.16.2.100"
+readonly IPv4_3="172.16.3.100"
+readonly IPv4_4="172.16.4.100"
+readonly IPv4_5="172.16.5.100"
+readonly IPv4_6="172.16.6.100"
+readonly IPv4_7="172.16.7.100"
+readonly IPv4_8="172.16.8.100"
+readonly IPv4_GRE="172.16.16.100"
+
+readonly IPv4_SRC=$IPv4_1
+readonly IPv4_DST=$IPv4_4
+
+readonly IPv6_1="fb01::1"
+readonly IPv6_2="fb02::1"
+readonly IPv6_3="fb03::1"
+readonly IPv6_4="fb04::1"
+readonly IPv6_5="fb05::1"
+readonly IPv6_6="fb06::1"
+readonly IPv6_7="fb07::1"
+readonly IPv6_8="fb08::1"
+readonly IPv6_GRE="fb10::1"
+
+readonly IPv6_SRC=$IPv6_1
+readonly IPv6_DST=$IPv6_4
+
+setup() {
+set -e  # exit on error
+	# create devices and namespaces
+	ip netns add "${NS1}"
+	ip netns add "${NS2}"
+	ip netns add "${NS3}"
+
+	ip link add veth1 type veth peer name veth2
+	ip link add veth3 type veth peer name veth4
+	ip link add veth5 type veth peer name veth6
+	ip link add veth7 type veth peer name veth8
+
+	ip netns exec ${NS2} sysctl -w net.ipv4.ip_forward=1 2>&1 > /dev/null
+	ip netns exec ${NS2} sysctl -w net.ipv6.conf.all.forwarding=1 2>&1 > /dev/null
+
+	ip link set veth1 netns ${NS1}
+	ip link set veth2 netns ${NS2}
+	ip link set veth3 netns ${NS2}
+	ip link set veth4 netns ${NS3}
+	ip link set veth5 netns ${NS1}
+	ip link set veth6 netns ${NS2}
+	ip link set veth7 netns ${NS2}
+	ip link set veth8 netns ${NS3}
+
+	# configure addesses: the top route (1-2-3-4)
+	ip -netns ${NS1}    addr add ${IPv4_1}/24  dev veth1
+	ip -netns ${NS2}    addr add ${IPv4_2}/24  dev veth2
+	ip -netns ${NS2}    addr add ${IPv4_3}/24  dev veth3
+	ip -netns ${NS3}    addr add ${IPv4_4}/24  dev veth4
+	ip -netns ${NS1} -6 addr add ${IPv6_1}/128 dev veth1
+	ip -netns ${NS2} -6 addr add ${IPv6_2}/128 dev veth2
+	ip -netns ${NS2} -6 addr add ${IPv6_3}/128 dev veth3
+	ip -netns ${NS3} -6 addr add ${IPv6_4}/128 dev veth4
+
+	# configure addresses: the bottom route (5-6-7-8)
+	ip -netns ${NS1}    addr add ${IPv4_5}/24  dev veth5
+	ip -netns ${NS2}    addr add ${IPv4_6}/24  dev veth6
+	ip -netns ${NS2}    addr add ${IPv4_7}/24  dev veth7
+	ip -netns ${NS3}    addr add ${IPv4_8}/24  dev veth8
+	ip -netns ${NS1} -6 addr add ${IPv6_5}/128 dev veth5
+	ip -netns ${NS2} -6 addr add ${IPv6_6}/128 dev veth6
+	ip -netns ${NS2} -6 addr add ${IPv6_7}/128 dev veth7
+	ip -netns ${NS3} -6 addr add ${IPv6_8}/128 dev veth8
+
+
+	ip -netns ${NS1} link set dev veth1 up
+	ip -netns ${NS2} link set dev veth2 up
+	ip -netns ${NS2} link set dev veth3 up
+	ip -netns ${NS3} link set dev veth4 up
+	ip -netns ${NS1} link set dev veth5 up
+	ip -netns ${NS2} link set dev veth6 up
+	ip -netns ${NS2} link set dev veth7 up
+	ip -netns ${NS3} link set dev veth8 up
+
+	# configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default;
+	# the bottom route to specific bottom addresses
+
+	# NS1
+	# top route
+	ip -netns ${NS1}    route add ${IPv4_2}/32  dev veth1
+	ip -netns ${NS1}    route add default dev veth1 via ${IPv4_2}  # go top by default
+	ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1
+	ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2}  # go top by default
+	# bottom route
+	ip -netns ${NS1}    route add ${IPv4_6}/32  dev veth5
+	ip -netns ${NS1}    route add ${IPv4_7}/32  dev veth5 via ${IPv4_6}
+	ip -netns ${NS1}    route add ${IPv4_8}/32  dev veth5 via ${IPv4_6}
+	ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5
+	ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6}
+	ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6}
+
+	# NS2
+	# top route
+	ip -netns ${NS2}    route add ${IPv4_1}/32  dev veth2
+	ip -netns ${NS2}    route add ${IPv4_4}/32  dev veth3
+	ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2
+	ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3
+	# bottom route
+	ip -netns ${NS2}    route add ${IPv4_5}/32  dev veth6
+	ip -netns ${NS2}    route add ${IPv4_8}/32  dev veth7
+	ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6
+	ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7
+
+	# NS3
+	# top route
+	ip -netns ${NS3}    route add ${IPv4_3}/32  dev veth4
+	ip -netns ${NS3}    route add ${IPv4_1}/32  dev veth4 via ${IPv4_3}
+	ip -netns ${NS3}    route add ${IPv4_2}/32  dev veth4 via ${IPv4_3}
+	ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4
+	ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3}
+	ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3}
+	# bottom route
+	ip -netns ${NS3}    route add ${IPv4_7}/32  dev veth8
+	ip -netns ${NS3}    route add ${IPv4_5}/32  dev veth8 via ${IPv4_7}
+	ip -netns ${NS3}    route add ${IPv4_6}/32  dev veth8 via ${IPv4_7}
+	ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8
+	ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7}
+	ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7}
+
+	# configure IPv4 GRE device in NS3, and a route to it via the "bottom" route
+	ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255
+	ip -netns ${NS3} link set gre_dev up
+	ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev
+	ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6}
+	ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8}
+
+
+	# configure IPv6 GRE device in NS3, and a route to it via the "bottom" route
+	ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255
+	ip -netns ${NS3} link set gre6_dev up
+	ip -netns ${NS3} -6 addr add ${IPv6_GRE} dev gre6_dev
+	ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6}
+	ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8}
+
+	# rp_filter gets confused by what these tests are doing, so disable it
+	ip netns exec ${NS1} sysctl -w net.ipv4.conf.all.rp_filter=0 2>&1 > /dev/null
+	ip netns exec ${NS2} sysctl -w net.ipv4.conf.all.rp_filter=0 2>&1 > /dev/null
+	ip netns exec ${NS3} sysctl -w net.ipv4.conf.all.rp_filter=0 2>&1 > /dev/null
+}
+
+cleanup() {
+	ip netns del ${NS1} 2> /dev/null
+	ip netns del ${NS2} 2> /dev/null
+	ip netns del ${NS3} 2> /dev/null
+}
+
+trap cleanup EXIT
+
+test_ping() {
+	local readonly PROTO=$1
+	local readonly EXPECTED=$2
+	local RET=0
+
+	set +e
+	if [ "${PROTO}" == "IPv4" ] ; then
+		ip netns exec ${NS1} ping  -c 1 -W 1 -I ${IPv4_SRC} ${IPv4_DST} 2>&1 > /dev/null
+		# ip netns exec ${NS1} ping  -c 1 -W 10 -I ${IPv4_SRC} ${IPv4_DST}
+		RET=$?
+	elif [ "${PROTO}" == "IPv6" ] ; then
+		ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST} 2>&1 > /dev/null
+		# ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST}
+		RET=$?
+	else
+		echo "test_ping: unknown PROTO: ${PROTO}"
+		exit 1
+	fi
+	set -e
+
+	if [ "0" != "${RET}" ]; then
+		RET=1
+	fi
+
+	if [ "${EXPECTED}" != "${RET}" ] ; then
+		echo "FAIL: test_ping: ${RET}"
+		exit 1
+	fi
+}
+
+test_egress() {
+	local readonly ENCAP=$1
+	echo "starting egress ${ENCAP} encap test"
+	setup
+
+	# need to wait a bit for IPv6 to autoconf, otherwise
+	# ping6 sometimes fails with "unable to bind to address"
+	sleep 1
+
+	# by default, pings work
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	# remove NS2->DST routes, ping fails
+	ip -netns ${NS2}    route del ${IPv4_DST}/32  dev veth3
+	ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
+	test_ping IPv4 1
+	test_ping IPv6 1
+
+	# install replacement routes (LWT/eBPF), pings succeed
+	if [ "${ENCAP}" == "IPv4" ] ; then
+		ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+		ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+	elif [ "${ENCAP}" == "IPv6" ] ; then
+		ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
+		ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
+	else
+		echo "FAIL: unknown encap ${ENCAP}"
+	fi
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	cleanup
+	echo "PASS"
+}
+
+test_ingress() {
+	local readonly ENCAP=$1
+	echo "starting ingress ${ENCAP} encap test"
+	setup
+
+	# need to wait a bit for IPv6 to autoconf, otherwise
+	# ping6 sometimes fails with "unable to bind to address"
+	sleep 1
+
+	# by default, pings work
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	# remove NS2->DST routes, pings fail
+	ip -netns ${NS2}    route del ${IPv4_DST}/32  dev veth3
+	ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
+	test_ping IPv4 1
+	test_ping IPv6 1
+
+	# install replacement routes (LWT/eBPF), pings succeed
+	if [ "${ENCAP}" == "IPv4" ] ; then
+		ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
+		ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
+	elif [ "${ENCAP}" == "IPv6" ] ; then
+		ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
+		ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
+	else
+		echo "FAIL: unknown encap ${ENCAP}"
+	fi
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	cleanup
+	echo "PASS"
+}
+
+test_egress IPv4
+test_egress IPv6
+
+test_ingress IPv4
+test_ingress IPv6
+
+echo "all tests passed"
-- 
2.20.1.321.g9e740568ce-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2019-01-24 19:34 [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
                   ` (2 preceding siblings ...)
  2019-01-24 19:34 ` [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest Peter Oskolkov
@ 2019-01-26  0:29 ` David Ahern
  3 siblings, 0 replies; 9+ messages in thread
From: David Ahern @ 2019-01-26  0:29 UTC (permalink / raw)
  To: Peter Oskolkov, Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, Willem de Bruijn

On 1/24/19 12:34 PM, Peter Oskolkov wrote:
> This patchset implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
> BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
> and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
> to packets (e.g. IP/GRE, GUE, IPIP).
> 
> This is useful when thousands of different short-lived flows should be
> encapped, each with different and dynamically determined destination.
> Although lwtunnels can be used in some of these scenarios, the ability
> to dynamically generate encap headers adds more flexibility, e.g.
> when routing depends on the state of the host (reflected in global bpf
> maps).
> 
> V2 changes: Added flowi-based route lookup, IPv6 encapping, and
> encapping on ingress.
> 
> 

I skimmed over the patches and nothing really jumps out. I do question
whether it will work with VRFs. I hope to find time to test it; most
likely that will not happen before Monday.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest
  2019-01-24 19:34 ` [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest Peter Oskolkov
@ 2019-01-26 21:46   ` Alexei Starovoitov
  2019-01-26 21:47     ` Alexei Starovoitov
  2019-01-28 20:31   ` David Ahern
  1 sibling, 1 reply; 9+ messages in thread
From: Alexei Starovoitov @ 2019-01-26 21:46 UTC (permalink / raw)
  To: Peter Oskolkov
  Cc: Alexei Starovoitov, Daniel Borkmann, Network Development,
	Peter Oskolkov, David Ahern, Willem de Bruijn

On Thu, Jan 24, 2019 at 11:34 AM Peter Oskolkov <posk@google.com> wrote:
>
> This patch adds a bpf self-test to cover BPF_LWT_ENCAP_IP mode
> in bpf_lwt_push_encap.
>
> Covered:
> - encapping in LWT_IN and LWT_XMIT
> - IPv4 and IPv6
>
> Signed-off-by: Peter Oskolkov <posk@google.com>
> ---
>  tools/testing/selftests/bpf/Makefile          |   5 +-

no longer applies cleanly.
please rebase.
thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest
  2019-01-26 21:46   ` Alexei Starovoitov
@ 2019-01-26 21:47     ` Alexei Starovoitov
  0 siblings, 0 replies; 9+ messages in thread
From: Alexei Starovoitov @ 2019-01-26 21:47 UTC (permalink / raw)
  To: Peter Oskolkov
  Cc: Alexei Starovoitov, Daniel Borkmann, Network Development,
	Peter Oskolkov, David Ahern, Willem de Bruijn,
	Stanislav Fomichev

On Sat, Jan 26, 2019 at 1:46 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Jan 24, 2019 at 11:34 AM Peter Oskolkov <posk@google.com> wrote:
> >
> > This patch adds a bpf self-test to cover BPF_LWT_ENCAP_IP mode
> > in bpf_lwt_push_encap.
> >
> > Covered:
> > - encapping in LWT_IN and LWT_XMIT
> > - IPv4 and IPv6
> >
> > Signed-off-by: Peter Oskolkov <posk@google.com>
> > ---
> >  tools/testing/selftests/bpf/Makefile          |   5 +-
>
> no longer applies cleanly.
> please rebase.
> thanks

oops. wrong thread.
Meant to say that Stanislav's patch doesn't apply.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest
  2019-01-24 19:34 ` [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest Peter Oskolkov
  2019-01-26 21:46   ` Alexei Starovoitov
@ 2019-01-28 20:31   ` David Ahern
  1 sibling, 0 replies; 9+ messages in thread
From: David Ahern @ 2019-01-28 20:31 UTC (permalink / raw)
  To: Peter Oskolkov, Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, Willem de Bruijn

On 1/24/19 12:34 PM, Peter Oskolkov wrote:
> This patch adds a bpf self-test to cover BPF_LWT_ENCAP_IP mode
> in bpf_lwt_push_encap.
> 
> Covered:
> - encapping in LWT_IN and LWT_XMIT
> - IPv4 and IPv6
> 
> Signed-off-by: Peter Oskolkov <posk@google.com>
> ---
>  tools/testing/selftests/bpf/Makefile          |   5 +-
>  .../testing/selftests/bpf/test_lwt_ip_encap.c | 125 +++++++
>  .../selftests/bpf/test_lwt_ip_encap.sh        | 316 ++++++++++++++++++
>  3 files changed, 444 insertions(+), 2 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/test_lwt_ip_encap.c
>  create mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh
> 
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index 70229de510f5..407c51cc5f07 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -39,7 +39,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
>  	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
>  	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o \
>  	test_sk_lookup_kern.o test_xdp_vlan.o test_queue_map.o test_stack_map.o \
> -	xdp_dummy.o test_map_in_map.o
> +	xdp_dummy.o test_map_in_map.o test_lwt_ip_encap.o
>  
>  # Order correspond to 'make run_tests' order
>  TEST_PROGS := test_kmod.sh \
> @@ -53,7 +53,8 @@ TEST_PROGS := test_kmod.sh \
>  	test_lirc_mode2.sh \
>  	test_skb_cgroup_id.sh \
>  	test_flow_dissector.sh \
> -	test_xdp_vlan.sh
> +	test_xdp_vlan.sh \
> +	test_lwt_ip_encap.sh
>  
>  TEST_PROGS_EXTENDED := with_addr.sh \
>  	tcp_client.py \
> diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
> new file mode 100644
> index 000000000000..a7014277f3fe
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
> @@ -0,0 +1,125 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/bpf.h>
> +#include <string.h>
> +#include "bpf_helpers.h"
> +#include "bpf_endian.h"
> +
> +#define BPF_LWT_ENCAP_IP 2
> +#define BPF_LWT_REROUTE 128

if you sync bpf.h header to tools/include/uapi/linux/bpf.h you should
not need to hard code those.


> +
> +struct iphdr {
> +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> +	__u8	ihl:4,
> +		version:4;
> +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
> +	__u8	version:4,
> +		ihl:4;
> +#else
> +#error "Fix your compiler's __BYTE_ORDER__?!"
> +#endif
> +	__u8	tos;
> +	__be16	tot_len;
> +	__be16	id;
> +	__be16	frag_off;
> +	__u8	ttl;
> +	__u8	protocol;
> +	__sum16	check;
> +	__be32	saddr;
> +	__be32	daddr;
> +};
> +
> +struct ipv6hdr {
> +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> +	__u8	priority:4,
> +		version:4;
> +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
> +	__u8	version:4,
> +		priority:4;
> +#else
> +#error "Fix your compiler's __BYTE_ORDER__?!"
> +#endif
> +	__u8	flow_lbl[3];
> +
> +	__be16	payload_len;
> +	__u8	nexthdr;
> +	__u8	hop_limit;
> +
> +	__u8	saddr[16];
> +	__u8	daddr[16];
> +};

you should be using proper header files for iphdr and ipv6hdr


> +
> +struct grehdr {
> +	__be16 flags;
> +	__be16 protocol;
> +};
> +
> +SEC("encap_gre")
> +int bpf_lwt_encap_gre(struct __sk_buff *skb)
> +{
> +	struct encap_hdr {
> +		struct iphdr iph;
> +		struct grehdr greh;
> +	} hdr;
> +	int err;
> +
> +	memset(&hdr, 0, sizeof(struct encap_hdr));
> +
> +	hdr.iph.ihl = 5;
> +	hdr.iph.version = 4;
> +	hdr.iph.ttl = 0x40;
> +	hdr.iph.protocol = 47;  /* IPPROTO_GRE */
> +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> +	hdr.iph.saddr = 0x640110ac;  /* 172.16.1.100 */
> +	hdr.iph.daddr = 0x641010ac;  /* 172.16.16.100 */
> +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
> +	hdr.iph.saddr = 0xac100164;  /* 172.16.1.100 */
> +	hdr.iph.daddr = 0xac101064;  /* 172.16.16.100 */
> +#else
> +#error "Fix your compiler's __BYTE_ORDER__?!"
> +#endif
> +	hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr));
> +
> +	hdr.greh.protocol = skb->protocol;
> +
> +	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
> +				 sizeof(struct encap_hdr));
> +	if (err)
> +		return BPF_DROP;
> +	return BPF_LWT_REROUTE;
> +}
> +
> +SEC("encap_gre6")
> +int bpf_lwt_encap_gre6(struct __sk_buff *skb)
> +{
> +	struct encap_hdr {
> +		struct ipv6hdr ip6hdr;
> +		struct grehdr greh;
> +	} hdr;
> +	int err;
> +
> +	memset(&hdr, 0, sizeof(struct encap_hdr));
> +
> +	hdr.ip6hdr.version = 6;
> +	hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(struct grehdr));
> +	hdr.ip6hdr.nexthdr = 47;  /* IPPROTO_GRE */
> +	hdr.ip6hdr.hop_limit = 0x40;
> +	/* fb01::1 */
> +	hdr.ip6hdr.saddr[0] = 0xfb;
> +	hdr.ip6hdr.saddr[1] = 1;
> +	hdr.ip6hdr.saddr[15] = 1;
> +	/* fb10::1 */
> +	hdr.ip6hdr.daddr[0] = 0xfb;
> +	hdr.ip6hdr.daddr[1] = 0x10;
> +	hdr.ip6hdr.daddr[15] = 1;
> +
> +	hdr.greh.protocol = skb->protocol;
> +
> +	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
> +				 sizeof(struct encap_hdr));
> +	if (err)
> +		return BPF_DROP;
> +
> +	return BPF_LWT_REROUTE;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
> new file mode 100755
> index 000000000000..4f511587bb74
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
> @@ -0,0 +1,316 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Setup/topology:
> +#
> +#    NS1             NS2             NS3
> +#   veth1 <---> veth2   veth3 <---> veth4 (the top route)
> +#   veth5 <---> veth6   veth7 <---> veth8 (the bottom route)
> +#
> +#   each vethN gets IPv[4|6]_N address
> +#
> +#   IPv*_SRC = IPv*_1
> +#   IPv*_DST = IPv*_4
> +#
> +#   all tests test pings from IPv*_SRC to IPv*_DST
> +#
> +#   by default, routes are configured to allow packets to go
> +#   IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route)
> +#
> +#   a GRE device is installed in NS3 with IPv*_GRE, and
> +#   NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8
> +#   (the bottom route)
> +#
> +# Tests:
> +#
> +#   1. routes NS2->IPv*_DST are brought down, so the only way a ping
> +#      from IP*_SRC to IP*_DST can work is via IPv*_GRE
> +#
> +#   2a. in an egress test, a bpf LWT_XMIT program is installed on veth1
> +#       that encaps the packets with an IP/GRE header to route to IPv*_GRE
> +#
> +#       ping: SRC->[encap at veth1:egress]->GRE:decap->DST
> +#       ping replies go DST->SRC directly
> +#
> +#   2b. in an ingress test, a bpf LWT_IN program is installed on veth2
> +#       that encaps the packets with an IP/GRE header to route to IPv*_GRE
> +#
> +#       ping: SRC->[encap at veth2:ingress]->GRE:decap->DST
> +#       ping replies go DST->SRC directly
> +
> +set +x  # debug OFF

please don't do that. Being able to run a shell script with bash -x is
really helpful.

> +set -e  # exit on error
> +
> +if [[ $EUID -ne 0 ]]; then
> +	echo "This script must be run as root"
> +	echo "FAIL"
> +	exit 1
> +fi
> +
> +readonly NS1="ns1-$(mktemp -u XXXXXX)"
> +readonly NS2="ns2-$(mktemp -u XXXXXX)"
> +readonly NS3="ns3-$(mktemp -u XXXXXX)"
> +
> +readonly IPv4_1="172.16.1.100"
> +readonly IPv4_2="172.16.2.100"
> +readonly IPv4_3="172.16.3.100"
> +readonly IPv4_4="172.16.4.100"
> +readonly IPv4_5="172.16.5.100"
> +readonly IPv4_6="172.16.6.100"
> +readonly IPv4_7="172.16.7.100"
> +readonly IPv4_8="172.16.8.100"
> +readonly IPv4_GRE="172.16.16.100"
> +
> +readonly IPv4_SRC=$IPv4_1
> +readonly IPv4_DST=$IPv4_4
> +
> +readonly IPv6_1="fb01::1"
> +readonly IPv6_2="fb02::1"
> +readonly IPv6_3="fb03::1"
> +readonly IPv6_4="fb04::1"
> +readonly IPv6_5="fb05::1"
> +readonly IPv6_6="fb06::1"
> +readonly IPv6_7="fb07::1"
> +readonly IPv6_8="fb08::1"
> +readonly IPv6_GRE="fb10::1"
> +
> +readonly IPv6_SRC=$IPv6_1
> +readonly IPv6_DST=$IPv6_4
> +
> +setup() {
> +set -e  # exit on error
> +	# create devices and namespaces
> +	ip netns add "${NS1}"
> +	ip netns add "${NS2}"
> +	ip netns add "${NS3}"
> +
> +	ip link add veth1 type veth peer name veth2
> +	ip link add veth3 type veth peer name veth4
> +	ip link add veth5 type veth peer name veth6
> +	ip link add veth7 type veth peer name veth8
> +
> +	ip netns exec ${NS2} sysctl -w net.ipv4.ip_forward=1 2>&1 > /dev/null
> +	ip netns exec ${NS2} sysctl -w net.ipv6.conf.all.forwarding=1 2>&1 > /dev/null

-q on the sysctl will squash the output. Errors are worth seeing

> +
> +	ip link set veth1 netns ${NS1}
> +	ip link set veth2 netns ${NS2}
> +	ip link set veth3 netns ${NS2}
> +	ip link set veth4 netns ${NS3}
> +	ip link set veth5 netns ${NS1}
> +	ip link set veth6 netns ${NS2}
> +	ip link set veth7 netns ${NS2}
> +	ip link set veth8 netns ${NS3}
> +
> +	# configure addresses: the top route (1-2-3-4)
> +	ip -netns ${NS1}    addr add ${IPv4_1}/24  dev veth1
> +	ip -netns ${NS2}    addr add ${IPv4_2}/24  dev veth2
> +	ip -netns ${NS2}    addr add ${IPv4_3}/24  dev veth3
> +	ip -netns ${NS3}    addr add ${IPv4_4}/24  dev veth4
> +	ip -netns ${NS1} -6 addr add ${IPv6_1}/128 dev veth1
> +	ip -netns ${NS2} -6 addr add ${IPv6_2}/128 dev veth2
> +	ip -netns ${NS2} -6 addr add ${IPv6_3}/128 dev veth3
> +	ip -netns ${NS3} -6 addr add ${IPv6_4}/128 dev veth4
> +
> +	# configure addresses: the bottom route (5-6-7-8)
> +	ip -netns ${NS1}    addr add ${IPv4_5}/24  dev veth5
> +	ip -netns ${NS2}    addr add ${IPv4_6}/24  dev veth6
> +	ip -netns ${NS2}    addr add ${IPv4_7}/24  dev veth7
> +	ip -netns ${NS3}    addr add ${IPv4_8}/24  dev veth8
> +	ip -netns ${NS1} -6 addr add ${IPv6_5}/128 dev veth5
> +	ip -netns ${NS2} -6 addr add ${IPv6_6}/128 dev veth6
> +	ip -netns ${NS2} -6 addr add ${IPv6_7}/128 dev veth7
> +	ip -netns ${NS3} -6 addr add ${IPv6_8}/128 dev veth8
> +
> +
> +	ip -netns ${NS1} link set dev veth1 up
> +	ip -netns ${NS2} link set dev veth2 up
> +	ip -netns ${NS2} link set dev veth3 up
> +	ip -netns ${NS3} link set dev veth4 up
> +	ip -netns ${NS1} link set dev veth5 up
> +	ip -netns ${NS2} link set dev veth6 up
> +	ip -netns ${NS2} link set dev veth7 up
> +	ip -netns ${NS3} link set dev veth8 up
> +
> +	# configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default;
> +	# the bottom route to specific bottom addresses
> +
> +	# NS1
> +	# top route
> +	ip -netns ${NS1}    route add ${IPv4_2}/32  dev veth1
> +	ip -netns ${NS1}    route add default dev veth1 via ${IPv4_2}  # go top by default
> +	ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1
> +	ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2}  # go top by default
> +	# bottom route
> +	ip -netns ${NS1}    route add ${IPv4_6}/32  dev veth5
> +	ip -netns ${NS1}    route add ${IPv4_7}/32  dev veth5 via ${IPv4_6}
> +	ip -netns ${NS1}    route add ${IPv4_8}/32  dev veth5 via ${IPv4_6}
> +	ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5
> +	ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6}
> +	ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6}
> +
> +	# NS2
> +	# top route
> +	ip -netns ${NS2}    route add ${IPv4_1}/32  dev veth2
> +	ip -netns ${NS2}    route add ${IPv4_4}/32  dev veth3
> +	ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2
> +	ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3
> +	# bottom route
> +	ip -netns ${NS2}    route add ${IPv4_5}/32  dev veth6
> +	ip -netns ${NS2}    route add ${IPv4_8}/32  dev veth7
> +	ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6
> +	ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7
> +
> +	# NS3
> +	# top route
> +	ip -netns ${NS3}    route add ${IPv4_3}/32  dev veth4
> +	ip -netns ${NS3}    route add ${IPv4_1}/32  dev veth4 via ${IPv4_3}
> +	ip -netns ${NS3}    route add ${IPv4_2}/32  dev veth4 via ${IPv4_3}
> +	ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4
> +	ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3}
> +	ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3}
> +	# bottom route
> +	ip -netns ${NS3}    route add ${IPv4_7}/32  dev veth8
> +	ip -netns ${NS3}    route add ${IPv4_5}/32  dev veth8 via ${IPv4_7}
> +	ip -netns ${NS3}    route add ${IPv4_6}/32  dev veth8 via ${IPv4_7}
> +	ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8
> +	ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7}
> +	ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7}
> +
> +	# configure IPv4 GRE device in NS3, and a route to it via the "bottom" route
> +	ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255
> +	ip -netns ${NS3} link set gre_dev up
> +	ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev
> +	ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6}
> +	ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8}
> +
> +
> +	# configure IPv6 GRE device in NS3, and a route to it via the "bottom" route
> +	ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255
> +	ip -netns ${NS3} link set gre6_dev up
> +	ip -netns ${NS3} -6 addr add ${IPv6_GRE} dev gre6_dev
> +	ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6}
> +	ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8}
> +
> +	# rp_filter gets confused by what these tests are doing, so disable it
> +	ip netns exec ${NS1} sysctl -w net.ipv4.conf.all.rp_filter=0 2>&1 > /dev/null
> +	ip netns exec ${NS2} sysctl -w net.ipv4.conf.all.rp_filter=0 2>&1 > /dev/null
> +	ip netns exec ${NS3} sysctl -w net.ipv4.conf.all.rp_filter=0 2>&1 > /dev/null
> +}
> +
> +cleanup() {
> +	ip netns del ${NS1} 2> /dev/null
> +	ip netns del ${NS2} 2> /dev/null
> +	ip netns del ${NS3} 2> /dev/null
> +}
> +
> +trap cleanup EXIT
> +
> +test_ping() {
> +	local readonly PROTO=$1
> +	local readonly EXPECTED=$2
> +	local RET=0
> +
> +	set +e
> +	if [ "${PROTO}" == "IPv4" ] ; then
> +		ip netns exec ${NS1} ping  -c 1 -W 1 -I ${IPv4_SRC} ${IPv4_DST} 2>&1 > /dev/null
> +		# ip netns exec ${NS1} ping  -c 1 -W 10 -I ${IPv4_SRC} ${IPv4_DST}
> +		RET=$?
> +	elif [ "${PROTO}" == "IPv6" ] ; then
> +		ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST} 2>&1 > /dev/null
> +		# ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST}
> +		RET=$?
> +	else
> +		echo "test_ping: unknown PROTO: ${PROTO}"
> +		exit 1
> +	fi
> +	set -e
> +
> +	if [ "0" != "${RET}" ]; then
> +		RET=1
> +	fi
> +
> +	if [ "${EXPECTED}" != "${RET}" ] ; then
> +		echo "FAIL: test_ping: ${RET}"
> +		exit 1
> +	fi
> +}
> +
> +test_egress() {
> +	local readonly ENCAP=$1
> +	echo "starting egress ${ENCAP} encap test"
> +	setup
> +
> +	# need to wait a bit for IPv6 to autoconf, otherwise
> +	# ping6 sometimes fails with "unable to bind to address"
> +	sleep 1

My attempts to run this test script fail:
$ ./test_lwt_ip_encap.sh
starting egress IPv4 encap test
PASS
starting egress IPv6 encap test
ping: bind icmp socket: Cannot assign requested address
FAIL: test_ping: 1

Adding 'nodad' to the IPv6 addresses avoids the race where the ping6 is
attempted before DAD completes. And it removes the need for the sleep 1.


> +
> +	# by default, pings work
> +	test_ping IPv4 0
> +	test_ping IPv6 0
> +
> +	# remove NS2->DST routes, ping fails
> +	ip -netns ${NS2}    route del ${IPv4_DST}/32  dev veth3
> +	ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
> +	test_ping IPv4 1
> +	test_ping IPv6 1
> +
> +	# install replacement routes (LWT/eBPF), pings succeed
> +	if [ "${ENCAP}" == "IPv4" ] ; then
> +		ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
> +		ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
> +	elif [ "${ENCAP}" == "IPv6" ] ; then
> +		ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
> +		ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
> +	else
> +		echo "FAIL: unknown encap ${ENCAP}"
> +	fi
> +	test_ping IPv4 0
> +	test_ping IPv6 0
> +
> +	cleanup
> +	echo "PASS"
> +}
> +
> +test_ingress() {
> +	local readonly ENCAP=$1
> +	echo "starting ingress ${ENCAP} encap test"
> +	setup
> +
> +	# need to wait a bit for IPv6 to autoconf, otherwise
> +	# ping6 sometimes fails with "unable to bind to address"
> +	sleep 1
> +
> +	# by default, pings work
> +	test_ping IPv4 0
> +	test_ping IPv6 0
> +
> +	# remove NS2->DST routes, pings fail
> +	ip -netns ${NS2}    route del ${IPv4_DST}/32  dev veth3
> +	ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
> +	test_ping IPv4 1
> +	test_ping IPv6 1
> +
> +	# install replacement routes (LWT/eBPF), pings succeed
> +	if [ "${ENCAP}" == "IPv4" ] ; then
> +		ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
> +		ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
> +	elif [ "${ENCAP}" == "IPv6" ] ; then
> +		ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
> +		ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
> +	else
> +		echo "FAIL: unknown encap ${ENCAP}"
> +	fi
> +	test_ping IPv4 0
> +	test_ping IPv6 0
> +
> +	cleanup
> +	echo "PASS"
> +}
> +
> +test_egress IPv4
> +test_egress IPv6
> +
> +test_ingress IPv4
> +test_ingress IPv6
> +
> +echo "all tests passed"
> 

And adding a VRF version of this test fails as I suspected. See comments
on patch 2.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next v2 2/3] bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
  2019-01-24 19:34 ` [PATCH bpf-next v2 2/3] bpf: implement BPF_LWT_ENCAP_IP mode " Peter Oskolkov
@ 2019-01-28 20:31   ` David Ahern
  0 siblings, 0 replies; 9+ messages in thread
From: David Ahern @ 2019-01-28 20:31 UTC (permalink / raw)
  To: Peter Oskolkov, Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, Willem de Bruijn

On 1/24/19 12:34 PM, Peter Oskolkov wrote:
> This patch implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
> BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
> and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
> to packets (e.g. IP/GRE, GUE, IPIP).
> 
> This is useful when thousands of different short-lived flows should be
> encapped, each with different and dynamically determined destination.
> Although lwtunnels can be used in some of these scenarios, the ability
> to dynamically generate encap headers adds more flexibility, e.g.
> when routing depends on the state of the host (reflected in global bpf
> maps).
> 
> Signed-off-by: Peter Oskolkov <posk@google.com>
> ---
>  include/net/lwtunnel.h |   3 +
>  net/core/filter.c      |   3 +-
>  net/core/lwt_bpf.c     | 142 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 147 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
> index 33fd9ba7e0e5..f0973eca8036 100644
> --- a/include/net/lwtunnel.h
> +++ b/include/net/lwtunnel.h
> @@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
>  int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
>  int lwtunnel_input(struct sk_buff *skb);
>  int lwtunnel_xmit(struct sk_buff *skb);
> +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
> +			  bool ingress);
>  
>  static inline void lwtunnel_set_redirect(struct dst_entry *dst)
>  {
> @@ -138,6 +140,7 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst)
>  		dst->input = lwtunnel_input;
>  	}
>  }
> +
>  #else
>  
>  static inline void lwtstate_free(struct lwtunnel_state *lws)
> diff --git a/net/core/filter.c b/net/core/filter.c
> index fd3ae092d3d7..81d18660c38b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -73,6 +73,7 @@
>  #include <linux/seg6_local.h>
>  #include <net/seg6.h>
>  #include <net/seg6_local.h>
> +#include <net/lwtunnel.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -4796,7 +4797,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
>  static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
>  			     bool ingress)
>  {
> -	return -EINVAL;  /* Implemented in the next patch. */
> +	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
>  }
>  
>  BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
> diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
> index 3e85437f7106..a3f79bff3776 100644
> --- a/net/core/lwt_bpf.c
> +++ b/net/core/lwt_bpf.c
> @@ -16,6 +16,7 @@
>  #include <linux/types.h>
>  #include <linux/bpf.h>
>  #include <net/lwtunnel.h>
> +#include <net/ip6_route.h>
>  
>  struct bpf_lwt_prog {
>  	struct bpf_prog *prog;
> @@ -55,6 +56,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
>  
>  	switch (ret) {
>  	case BPF_OK:
> +	case BPF_LWT_REROUTE:
>  		break;
>  
>  	case BPF_REDIRECT:
> @@ -97,6 +99,8 @@ static int bpf_input(struct sk_buff *skb)
>  		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
>  		if (ret < 0)
>  			return ret;
> +		if (ret == BPF_LWT_REROUTE)
> +			return dst_input(skb);
>  	}
>  
>  	if (unlikely(!dst->lwtstate->orig_input)) {
> @@ -168,6 +172,13 @@ static int bpf_xmit(struct sk_buff *skb)
>  			return LWTUNNEL_XMIT_CONTINUE;
>  		case BPF_REDIRECT:
>  			return LWTUNNEL_XMIT_DONE;
> +		case BPF_LWT_REROUTE:
> +			ret = dst_output(dev_net(skb_dst(skb)->dev),
> +					 skb->sk, skb);
> +			if (unlikely(ret))
> +				return ret;
> +			/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
> +			return LWTUNNEL_XMIT_DONE;
>  		default:
>  			return ret;
>  		}
> @@ -389,6 +400,137 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
>  	.owner		= THIS_MODULE,
>  };
>  
> +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
> +{
> +	struct dst_entry *dst = NULL;
> +	struct iphdr *iph;
> +	bool ipv4;
> +	int err;
> +
> +	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
> +		return -EINVAL;
> +
> +	/* validate protocol and length */
> +	iph = (struct iphdr *)hdr;
> +	if (iph->version == 4) {
> +		ipv4 = true;
> +		if (iph->ihl * 4 > len)
> +			return -EINVAL;
> +	} else if (iph->version == 6) {
> +		ipv4 = false;
> +		if (unlikely(len < sizeof(struct ipv6hdr)))
> +			return -EINVAL;
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	/* allocate enough space for the encap headers + L2 hdr */
> +	if (ingress) {
> +		err = skb_cow_head(skb, len + skb->mac_len);
> +		if (unlikely(err))
> +			return err;
> +	} else {
> +		/* ip_route_input_noref below does route lookup and dst
> +		 * drop/set for ingress. There is no similar function for
> +		 * egress, so we need to do route lookup and replace skb's
> +		 * dst in this function.
> +		 */
> +		struct sock *sk;
> +		struct net *net;
> +
> +		sk = sk_to_full_sk(skb->sk);
> +		if (sk)
> +			net = sock_net(sk);
> +		else
> +			net = dev_net(skb_dst(skb)->dev);

This delta gets VRF tests to pass too. Also, you should be able to
always get net from the device.

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 526b7cfc6d52..79feebd6da34 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -436,20 +436,24 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb,
void *hdr, u32 len, bool ingress)
                 * egress, so we need to do route lookup and replace skb's
                 * dst in this function.
                 */
+               struct net_device *l3mdev =
l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+               int oif = l3mdev ? l3mdev->ifindex : 0;
                struct sock *sk;
                struct net *net;

                sk = sk_to_full_sk(skb->sk);
-               if (sk)
+               if (sk) {
+                       if (sk->sk_bound_dev_if)
+                               oif = sk->sk_bound_dev_if;
                        net = sock_net(sk);
-               else
+               } else
                        net = dev_net(skb_dst(skb)->dev);

                if (ipv4) {
                        struct flowi4 fl4 = {0};
                        struct rtable *rt;

-                       fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
+                       fl4.flowi4_oif = oif;
                        fl4.flowi4_mark = skb->mark;
                        fl4.flowi4_uid = sock_net_uid(net, sk);
                        fl4.flowi4_tos = RT_TOS(iph->tos);
@@ -466,7 +470,7 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void
*hdr, u32 len, bool ingress)
                        struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
                        struct flowi6 fl6 = {0};

-                       fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
+                       fl6.flowi6_oif = oif;
                        fl6.flowi6_mark = skb->mark;
                        fl6.flowi6_uid = sock_net_uid(net, sk);
                        fl6.flowlabel = ip6_flowinfo(iph6);

> +
> +		if (ipv4) {
> +			struct flowi4 fl4 = {0};
> +			struct rtable *rt;
> +
> +			fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
> +			fl4.flowi4_mark = skb->mark;
> +			fl4.flowi4_uid = sock_net_uid(net, sk);
> +			fl4.flowi4_tos = RT_TOS(iph->tos);
> +			fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
> +			fl4.flowi4_proto = iph->protocol;
> +			fl4.daddr = iph->daddr;
> +			fl4.saddr = iph->saddr;
> +
> +			rt = ip_route_output_key(net, &fl4);
> +			if (IS_ERR(rt) || rt->dst.error)
> +				return -EINVAL;
> +			dst = &rt->dst;
> +		} else {
> +			struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
> +			struct flowi6 fl6 = {0};
> +
> +			fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
> +			fl6.flowi6_mark = skb->mark;
> +			fl6.flowi6_uid = sock_net_uid(net, sk);
> +			fl6.flowlabel = ip6_flowinfo(iph6);
> +			fl6.flowi6_proto = iph6->nexthdr;
> +			fl6.daddr = iph6->daddr;
> +			fl6.saddr = iph6->saddr;
> +
> +			dst = ip6_route_output(net, skb->sk, &fl6);
> +			if (IS_ERR(dst) || dst->error)
> +				return -EINVAL;
> +		}
> +
> +		err = skb_cow_head(skb, len + LL_RESERVED_SPACE(dst->dev));
> +		if (unlikely(err))
> +			return err;
> +	}
> +
> +	/* push the encap headers and fix pointers */
> +	skb_reset_inner_headers(skb);
> +	skb->encapsulation = 1;
> +	skb_push(skb, len);
> +	if (ingress)
> +		skb_postpush_rcsum(skb, iph, len);
> +	skb_reset_network_header(skb);
> +	iph = ip_hdr(skb);
> +	memcpy(iph, hdr, len);

Calling it iph and using ip_hdr seems wrong given that hdr can also be
an IPv6 header. Why not just use skb_network_header?

> +	bpf_compute_data_pointers(skb);
> +
> +	/* final skb touches + routing */
> +	if (ipv4) {
> +		skb->protocol = htons(ETH_P_IP);
> +		if (iph->ihl * 4 < len)
> +			skb_set_transport_header(skb, iph->ihl * 4);
> +
> +		if (!iph->check)
> +			iph->check = ip_fast_csum((unsigned char *)iph,
> +						  iph->ihl);
> +
> +		if (ingress) {
> +			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> +						   iph->tos, skb_dst(skb)->dev);
> +			if (err)
> +				return err;
> +		} else {
> +			skb_dst_drop(skb);
> +			skb_dst_set(skb, dst);
> +		}
> +	} else {
> +		skb->protocol = htons(ETH_P_IPV6);
> +		if (sizeof(struct ipv6hdr) < len)
> +			skb_set_transport_header(skb, sizeof(struct ipv6hdr));
> +
> +		if (ingress) {
> +			ip6_route_input(skb);
> +			if (skb_dst(skb)->error)
> +				return skb_dst(skb)->error;
> +		} else {
> +			skb_dst_drop(skb);
> +			skb_dst_set(skb, dst);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int __init bpf_lwt_init(void)
>  {
>  	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
> 


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-01-28 20:31 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-24 19:34 [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
2019-01-24 19:34 ` [PATCH bpf-next v2 1/3] bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap Peter Oskolkov
2019-01-24 19:34 ` [PATCH bpf-next v2 2/3] bpf: implement BPF_LWT_ENCAP_IP mode " Peter Oskolkov
2019-01-28 20:31   ` David Ahern
2019-01-24 19:34 ` [PATCH bpf-next v2 3/3] selftests: bpf: add test_lwt_ip_encap selftest Peter Oskolkov
2019-01-26 21:46   ` Alexei Starovoitov
2019-01-26 21:47     ` Alexei Starovoitov
2019-01-28 20:31   ` David Ahern
2019-01-26  0:29 ` [PATCH bpf-next v2 0/3] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).