[PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap

* [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
@ 2018-11-29  0:22 Peter Oskolkov
  2018-11-29  0:22 ` [PATCH bpf-next 2/2] selftests/bpf: add test_lwt_ip_encap selftest Peter Oskolkov
  2018-11-29  0:47 ` [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern
  0 siblings, 2 replies; 10+ messages in thread
From: Peter Oskolkov @ 2018-11-29  0:22 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, Peter Oskolkov

This patch enables BPF programs (specifically, of LWT_XMIT type)
to add IP encapsulation headers to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with a different, dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

A future patch will enable IPv6 encapping (and IPv4/IPv6 cross-routing).

Tested: see the second patch in the series.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/net/lwtunnel.h   |  2 ++
 include/uapi/linux/bpf.h |  7 ++++-
 net/core/filter.c        | 58 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33fd9ba7e0e5..6a1c5c2f16d5 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -16,6 +16,8 @@
 #define LWTUNNEL_STATE_INPUT_REDIRECT	BIT(1)
 #define LWTUNNEL_STATE_XMIT_REDIRECT	BIT(2)
 
+#define LWTUNNEL_MAX_ENCAP_HSIZE	80	/* max BPF encap hdr bytes */
+
 enum {
 	LWTUNNEL_XMIT_DONE,
 	LWTUNNEL_XMIT_CONTINUE,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 597afdbc1ab9..6f2efe2dca9f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1998,6 +1998,10 @@ union bpf_attr {
  *			Only works if *skb* contains an IPv6 packet. Insert a
  *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
  *			the IPv6 header.
+ *		**BPF_LWT_ENCAP_IP**
+ *			Only works if *skb* contains an IPv4 packet. Prepend IP
+ *			encapsulation headers (GRE/GUE/IPIP/etc): an outer IPv4
+ *			header followed by zero or more additional headers.
  *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2444,7 +2448,8 @@ enum bpf_hdr_start_off {
 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
 enum bpf_lwt_encap_mode {
 	BPF_LWT_ENCAP_SEG6,
-	BPF_LWT_ENCAP_SEG6_INLINE
+	BPF_LWT_ENCAP_SEG6_INLINE,
+	BPF_LWT_ENCAP_IP,	/* outer header must be IPv4 */
 };
 
 /* user accessible mirror of in-kernel sk_buff.
diff --git a/net/core/filter.c b/net/core/filter.c
index bd0df75dc7b6..17f3c37218e5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4793,6 +4793,60 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 }
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
+/* Push @len bytes of encap headers at @hdr (outer IPv4 first) onto @skb and
+ * route it to the outer daddr. Returns 0, -EINVAL or skb_cow_head()'s error.
+ */
+static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len)
+{
+	struct iphdr *iph = (struct iphdr *)hdr;
+	struct rtable *rt;
+	struct net *net;
+	int err = -EINVAL;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return -EINVAL;  /* ETH_P_IPV6 not yet supported */
+	if (unlikely(len < sizeof(struct iphdr) || len > LWTUNNEL_MAX_ENCAP_HSIZE))
+		return -EINVAL;
+	/* ihl < 5 cannot describe a valid IPv4 header (RFC 791) */
+	if (unlikely(iph->version != 4 || iph->ihl < 5 || iph->ihl * 4 > len))
+		return -EINVAL;
+
+	net = skb->sk ? sock_net(skb->sk) : dev_net(skb_dst(skb)->dev);
+	rt = ip_route_output(net, iph->daddr, 0, 0, 0);
+	if (IS_ERR(rt))
+		return -EINVAL;
+	if (rt->dst.error)
+		goto err_put;
+
+	/* Grow headroom before touching skb state: a failure leaves skb as is. */
+	err = skb_cow_head(skb, len + LL_RESERVED_SPACE(rt->dst.dev));
+	if (unlikely(err))
+		goto err_put;
+
+	skb_reset_inner_headers(skb);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	memcpy(iph, hdr, len);
+
+	bpf_compute_data_pointers(skb);
+	if (iph->ihl * 4 < len)
+		skb_set_transport_header(skb, iph->ihl * 4);
+	skb->protocol = htons(ETH_P_IP);
+	if (!iph->check)
+		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+	/* skb takes over the reference obtained from ip_route_output(). */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	return 0;
+
+err_put:
+	ip_rt_put(rt);
+	return err;
+}
+
 BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	   u32, len)
 {
@@ -4802,6 +4856,8 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	case BPF_LWT_ENCAP_SEG6_INLINE:
 		return bpf_push_seg6_encap(skb, type, hdr, len);
 #endif
+	case BPF_LWT_ENCAP_IP:
+		return bpf_push_ip_encap(skb, hdr, len);
 	default:
 		return -EINVAL;
 	}
@@ -5687,6 +5743,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_l4_csum_replace_proto;
 	case BPF_FUNC_set_hash_invalid:
 		return &bpf_set_hash_invalid_proto;
+	case BPF_FUNC_lwt_push_encap:
+		return &bpf_lwt_push_encap_proto;
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
-- 
2.20.0.rc0.387.gc7a69e6b6c-goog

^ permalink raw reply related	[flat|nested] 10+ messages in thread