All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
@ 2018-11-29  0:22 Peter Oskolkov
  2018-11-29  0:22 ` [PATCH bpf-next 2/2] selftests/bpf: add test_lwt_ip_encap selftest Peter Oskolkov
  2018-11-29  0:47 ` [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern
  0 siblings, 2 replies; 10+ messages in thread
From: Peter Oskolkov @ 2018-11-29  0:22 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, Peter Oskolkov

This patch enables BPF programs (specifically, of LWT_XMIT type)
to add IP encapsulation headers to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

A future patch will enable IPv6 encapping (and IPv4/IPv6 cross-routing).

Tested: see the second patch in the series.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/net/lwtunnel.h   |  2 ++
 include/uapi/linux/bpf.h |  7 ++++-
 net/core/filter.c        | 58 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33fd9ba7e0e5..6a1c5c2f16d5 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -16,6 +16,8 @@
 #define LWTUNNEL_STATE_INPUT_REDIRECT	BIT(1)
 #define LWTUNNEL_STATE_XMIT_REDIRECT	BIT(2)
 
+#define LWTUNNEL_MAX_ENCAP_HSIZE	80
+
 enum {
 	LWTUNNEL_XMIT_DONE,
 	LWTUNNEL_XMIT_CONTINUE,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 597afdbc1ab9..6f2efe2dca9f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1998,6 +1998,10 @@ union bpf_attr {
  *			Only works if *skb* contains an IPv6 packet. Insert a
  *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
  *			the IPv6 header.
+ *		**BPF_LWT_ENCAP_IP**
+ *			IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ *			must be IPv4, followed by zero, one, or more additional
+ *			headers.
  *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2444,7 +2448,8 @@ enum bpf_hdr_start_off {
 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
 enum bpf_lwt_encap_mode {
 	BPF_LWT_ENCAP_SEG6,
-	BPF_LWT_ENCAP_SEG6_INLINE
+	BPF_LWT_ENCAP_SEG6_INLINE,
+	BPF_LWT_ENCAP_IP,
 };
 
 /* user accessible mirror of in-kernel sk_buff.
diff --git a/net/core/filter.c b/net/core/filter.c
index bd0df75dc7b6..17f3c37218e5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4793,6 +4793,60 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 }
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
+static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len)
+{
+	struct dst_entry *dst;
+	struct rtable *rt;
+	struct iphdr *iph;
+	struct net *net;
+	int err;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return -EINVAL;  /* ETH_P_IPV6 not yet supported */
+
+	iph = (struct iphdr *)hdr;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWTUNNEL_MAX_ENCAP_HSIZE))
+		return -EINVAL;
+	if (unlikely(iph->version != 4 || iph->ihl * 4 > len))
+		return -EINVAL;
+
+	if (skb->sk)
+		net = sock_net(skb->sk);
+	else {
+		net = dev_net(skb_dst(skb)->dev);
+	}
+	rt = ip_route_output(net, iph->daddr, 0, 0, 0);
+	if (IS_ERR(rt) || rt->dst.error)
+		return -EINVAL;
+	dst = &rt->dst;
+
+	skb_reset_inner_headers(skb);
+	skb->encapsulation = 1;
+
+	err = skb_cow_head(skb, len + LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		return err;
+
+	skb_push(skb, len);
+	skb_reset_network_header(skb);
+
+	iph = ip_hdr(skb);
+	memcpy(iph, hdr, len);
+
+	bpf_compute_data_pointers(skb);
+	if (iph->ihl * 4 < len)
+		skb_set_transport_header(skb, iph->ihl * 4);
+	skb->protocol = htons(ETH_P_IP);
+	if (!iph->check)
+		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+	skb_dst_drop(skb);
+	dst_hold(dst);
+	skb_dst_set(skb, dst);
+	return 0;
+}
+
 BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	   u32, len)
 {
@@ -4802,6 +4856,8 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	case BPF_LWT_ENCAP_SEG6_INLINE:
 		return bpf_push_seg6_encap(skb, type, hdr, len);
 #endif
+	case BPF_LWT_ENCAP_IP:
+		return bpf_push_ip_encap(skb, hdr, len);
 	default:
 		return -EINVAL;
 	}
@@ -5687,6 +5743,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_l4_csum_replace_proto;
 	case BPF_FUNC_set_hash_invalid:
 		return &bpf_set_hash_invalid_proto;
+	case BPF_FUNC_lwt_push_encap:
+		return &bpf_lwt_push_encap_proto;
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
-- 
2.20.0.rc0.387.gc7a69e6b6c-goog

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH bpf-next 2/2] selftests/bpf: add test_lwt_ip_encap selftest
  2018-11-29  0:22 [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
@ 2018-11-29  0:22 ` Peter Oskolkov
  2018-11-29  0:47 ` [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern
  1 sibling, 0 replies; 10+ messages in thread
From: Peter Oskolkov @ 2018-11-29  0:22 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, Peter Oskolkov

This patch adds a sample/selftest that covers BPF_LWT_ENCAP_IP option
added in the first patch in the series.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 tools/testing/selftests/bpf/Makefile          |   5 +-
 .../testing/selftests/bpf/test_lwt_ip_encap.c |  65 ++++++++++
 .../selftests/bpf/test_lwt_ip_encap.sh        | 114 ++++++++++++++++++
 3 files changed, 182 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lwt_ip_encap.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 73aa6d8f4a2f..044fcdbc9864 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -39,7 +39,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
 	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o \
 	test_sk_lookup_kern.o test_xdp_vlan.o test_queue_map.o test_stack_map.o \
-	xdp_dummy.o test_map_in_map.o
+	xdp_dummy.o test_map_in_map.o test_lwt_ip_encap.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -53,7 +53,8 @@ TEST_PROGS := test_kmod.sh \
 	test_lirc_mode2.sh \
 	test_skb_cgroup_id.sh \
 	test_flow_dissector.sh \
-	test_xdp_vlan.sh
+	test_xdp_vlan.sh \
+	test_lwt_ip_encap.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh
 
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
new file mode 100644
index 000000000000..967db922dcc6
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <string.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define BPF_LWT_ENCAP_IP 2
+
+struct iphdr {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u8	ihl:4,
+		version:4;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	__u8	version:4,
+		ihl:4;
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+	__u8	tos;
+	__be16	tot_len;
+	__be16	id;
+	__be16	frag_off;
+	__u8	ttl;
+	__u8	protocol;
+	__sum16	check;
+	__be32	saddr;
+	__be32	daddr;
+};
+
+struct grehdr {
+	__be16 flags;
+	__be16 protocol;
+};
+
+SEC("encap_gre")
+int bpf_lwt_encap_gre(struct __sk_buff *skb)
+{
+	char encap_header[24];
+	int err;
+	struct iphdr *iphdr = (struct iphdr *)encap_header;
+	struct grehdr *greh = (struct grehdr *)(encap_header + sizeof(struct iphdr));
+
+	memset(encap_header, 0, sizeof(encap_header));
+
+	iphdr->ihl = 5;
+	iphdr->version = 4;
+	iphdr->tos = 0;
+	iphdr->ttl = 0x40;
+	iphdr->protocol = 47;  /* IPPROTO_GRE */
+	iphdr->saddr = 0x640110ac;  /* 172.16.1.100 */
+	iphdr->daddr = 0x640310ac;  /* 172.16.5.100 */
+	iphdr->check = 0;
+	iphdr->tot_len = bpf_htons(skb->len + sizeof(encap_header));
+
+	greh->protocol = bpf_htons(0x800);
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, (void *)encap_header,
+				 sizeof(encap_header));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
new file mode 100755
index 000000000000..4c32b754bf96
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Setup:
+# - create VETH1/VETH2 veth
+# - VETH1 gets IP_SRC
+# - create netns NS
+# - move VETH2 to NS, add IP_DST
+# - in NS, create gre tunnel GREDEV, add IP_GRE
+# - in NS, configure GREDEV to route to IP_DST from IP_SRC
+# - configure route to IP_GRE via VETH1
+#   (note: there is no route to IP_DST from root/init ns)
+#
+# Test:
+# - listen on IP_DST
+# - send a packet to IP_DST: the listener does not get it
+# - add LWT_XMIT bpf to IP_DST that gre-encaps all packets to IP_GRE
+# - send a packet to IP_DST: the listener gets it
+
+
+# set -x  # debug ON
+set +x  # debug OFF
+set -e  # exit on error
+
+if [[ $EUID -ne 0 ]]; then
+	echo "This script must be run as root"
+	echo "FAIL"
+	exit 1
+fi
+
+readonly NS="ns-ip-encap-$(mktemp -u XXXXXX)"
+readonly OUT=$(mktemp /tmp/test_lwt_ip_incap.XXXXXX)
+
+readonly NET_SRC="172.16.1.0"
+
+readonly IP_SRC="172.16.1.100"
+readonly IP_DST="172.16.2.100"
+readonly IP_GRE="172.16.3.100"
+
+readonly PORT=5555
+readonly MSG="foo_bar"
+
+PID1=0
+PID2=0
+
+setup() {
+	ip link add veth1 type veth peer name veth2
+
+	ip netns add "${NS}"
+	ip link set veth2 netns ${NS}
+
+	ip link set dev veth1 up
+	ip -netns ${NS} link set dev veth2 up
+
+	ip addr add dev veth1 ${IP_SRC}/24
+	ip -netns ${NS} addr add dev veth2 ${IP_DST}/24
+
+	ip -netns ${NS} tunnel add gre_dev mode gre remote ${IP_SRC} local ${IP_GRE} ttl 255
+	ip -netns ${NS} link set gre_dev up
+	ip -netns ${NS} addr add ${IP_GRE} dev gre_dev
+	ip -netns ${NS} route add ${NET_SRC}/24 dev gre_dev
+
+	ip route add ${IP_GRE}/32 dev veth1
+}
+
+cleanup() {
+	ip link del veth1
+	ip netns del ${NS}
+	if [ $PID1 -ne 0 ] ; then kill $PID1 ; fi
+	if [ $PID2 -ne 0 ] ; then kill $PID2 ; fi
+	rm $OUT
+}
+
+trap cleanup EXIT
+setup
+
+# start the listener
+ip netns exec ${NS} nc -ul ${IP_DST} $PORT > $OUT &
+PID1=$!
+usleep 100000
+
+# send a packet
+echo -ne "${MSG}" | nc -u ${IP_DST} $PORT &
+PID2=$!
+usleep 1000000
+kill $PID2
+PID2=0
+
+# confirm the packet was not delivered
+if [ "$(<$OUT)" != "" ]; then
+	echo "FAIL: unexpected packet"
+	exit 1
+fi
+
+# install an lwt/bpf encap prog
+ip route add ${IP_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+usleep 100000
+
+# send a packet
+echo -ne "${MSG}" | nc -u ${IP_DST} $PORT &
+PID2=$!
+usleep 1000000
+kill $PID2
+PID2=0
+kill $PID1
+PID1=0
+
+if [ "$(<$OUT)" != "$MSG" ]; then
+	echo "FAIL"
+	exit 1
+fi
+
+echo "PASS"
+
-- 
2.20.0.rc0.387.gc7a69e6b6c-goog

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2018-11-29  0:22 [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
  2018-11-29  0:22 ` [PATCH bpf-next 2/2] selftests/bpf: add test_lwt_ip_encap selftest Peter Oskolkov
@ 2018-11-29  0:47 ` David Ahern
  2018-11-29  1:34   ` Peter Oskolkov
  1 sibling, 1 reply; 10+ messages in thread
From: David Ahern @ 2018-11-29  0:47 UTC (permalink / raw)
  To: Peter Oskolkov, Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov

On 11/28/18 5:22 PM, Peter Oskolkov wrote:
> diff --git a/net/core/filter.c b/net/core/filter.c
> index bd0df75dc7b6..17f3c37218e5 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4793,6 +4793,60 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
>  }
>  #endif /* CONFIG_IPV6_SEG6_BPF */
>  
> +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len)
> +{
> +	struct dst_entry *dst;
> +	struct rtable *rt;
> +	struct iphdr *iph;
> +	struct net *net;
> +	int err;
> +
> +	if (skb->protocol != htons(ETH_P_IP))
> +		return -EINVAL;  /* ETH_P_IPV6 not yet supported */
> +
> +	iph = (struct iphdr *)hdr;
> +
> +	if (unlikely(len < sizeof(struct iphdr) || len > LWTUNNEL_MAX_ENCAP_HSIZE))
> +		return -EINVAL;
> +	if (unlikely(iph->version != 4 || iph->ihl * 4 > len))
> +		return -EINVAL;
> +
> +	if (skb->sk)
> +		net = sock_net(skb->sk);
> +	else {
> +		net = dev_net(skb_dst(skb)->dev);
> +	}
> +	rt = ip_route_output(net, iph->daddr, 0, 0, 0);

That is a very limited use case. e.g., oif = 0 means you are not
considering any kind of policy routing (e.g., VRF).

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2018-11-29  0:47 ` [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern
@ 2018-11-29  1:34   ` Peter Oskolkov
  2018-11-30 20:08     ` David Ahern
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Oskolkov @ 2018-11-29  1:34 UTC (permalink / raw)
  To: David Ahern; +Cc: ast, daniel, netdev, posk.devel

On Wed, Nov 28, 2018 at 4:47 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 11/28/18 5:22 PM, Peter Oskolkov wrote:
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index bd0df75dc7b6..17f3c37218e5 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -4793,6 +4793,60 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
> >  }
> >  #endif /* CONFIG_IPV6_SEG6_BPF */
> >
> > +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len)
> > +{
> > +     struct dst_entry *dst;
> > +     struct rtable *rt;
> > +     struct iphdr *iph;
> > +     struct net *net;
> > +     int err;
> > +
> > +     if (skb->protocol != htons(ETH_P_IP))
> > +             return -EINVAL;  /* ETH_P_IPV6 not yet supported */
> > +
> > +     iph = (struct iphdr *)hdr;
> > +
> > +     if (unlikely(len < sizeof(struct iphdr) || len > LWTUNNEL_MAX_ENCAP_HSIZE))
> > +             return -EINVAL;
> > +     if (unlikely(iph->version != 4 || iph->ihl * 4 > len))
> > +             return -EINVAL;
> > +
> > +     if (skb->sk)
> > +             net = sock_net(skb->sk);
> > +     else {
> > +             net = dev_net(skb_dst(skb)->dev);
> > +     }
> > +     rt = ip_route_output(net, iph->daddr, 0, 0, 0);
>
> That is a very limited use case. e.g., oif = 0 means you are not
> considering any kind of policy routing (e.g., VRF).

Hi David! Could you be a bit more specific re: what you would like to
see here? Thanks!

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2018-11-29  1:34   ` Peter Oskolkov
@ 2018-11-30 20:08     ` David Ahern
  2018-11-30 23:35       ` Peter Oskolkov
  0 siblings, 1 reply; 10+ messages in thread
From: David Ahern @ 2018-11-30 20:08 UTC (permalink / raw)
  To: Peter Oskolkov; +Cc: ast, daniel, netdev, posk.devel

On 11/28/18 6:34 PM, Peter Oskolkov wrote:
> On Wed, Nov 28, 2018 at 4:47 PM David Ahern <dsahern@gmail.com> wrote:
>>
>> On 11/28/18 5:22 PM, Peter Oskolkov wrote:
>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>> index bd0df75dc7b6..17f3c37218e5 100644
>>> --- a/net/core/filter.c
>>> +++ b/net/core/filter.c
>>> @@ -4793,6 +4793,60 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
>>>  }
>>>  #endif /* CONFIG_IPV6_SEG6_BPF */
>>>
>>> +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len)
>>> +{
>>> +     struct dst_entry *dst;
>>> +     struct rtable *rt;
>>> +     struct iphdr *iph;
>>> +     struct net *net;
>>> +     int err;
>>> +
>>> +     if (skb->protocol != htons(ETH_P_IP))
>>> +             return -EINVAL;  /* ETH_P_IPV6 not yet supported */
>>> +
>>> +     iph = (struct iphdr *)hdr;
>>> +
>>> +     if (unlikely(len < sizeof(struct iphdr) || len > LWTUNNEL_MAX_ENCAP_HSIZE))
>>> +             return -EINVAL;
>>> +     if (unlikely(iph->version != 4 || iph->ihl * 4 > len))
>>> +             return -EINVAL;
>>> +
>>> +     if (skb->sk)
>>> +             net = sock_net(skb->sk);
>>> +     else {
>>> +             net = dev_net(skb_dst(skb)->dev);
>>> +     }
>>> +     rt = ip_route_output(net, iph->daddr, 0, 0, 0);
>>
>> That is a very limited use case. e.g., oif = 0 means you are not
>> considering any kind of policy routing (e.g., VRF).
> 
> Hi David! Could you be a bit more specific re: what you would like to
> see here? Thanks!
> 

Is the encap happening on ingress or egress? Seems like the current code
does not assume either direction for lwt (BPF_PROG_TYPE_LWT_IN vs
BPF_PROG_TYPE_LWT_OUT), yet your change does - output only. Basically,
you should be filling in a flow struct and doing a proper lookup.

When the potentially custom encap header is pushed on, seems to me skb
marks should still be honored for the route lookup. If not, you should
handle that in the API.

>From there skb->dev at a minimum should be used as either iif (ingress)
or oif (egress).

The iph is already set so you have quick access to the tos.

Also, this should implement IPv6 as well before going in.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2018-11-30 20:08     ` David Ahern
@ 2018-11-30 23:35       ` Peter Oskolkov
  2018-11-30 23:51         ` David Ahern
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Oskolkov @ 2018-11-30 23:35 UTC (permalink / raw)
  To: David Ahern; +Cc: ast, daniel, netdev, posk.devel

Thanks, David! This is for egress only, so I'll add an appropriate
check. I'll also address your other comments/concerns in a v2 version
of this patchset.
On Fri, Nov 30, 2018 at 12:08 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 11/28/18 6:34 PM, Peter Oskolkov wrote:
> > On Wed, Nov 28, 2018 at 4:47 PM David Ahern <dsahern@gmail.com> wrote:
> >>
> >> On 11/28/18 5:22 PM, Peter Oskolkov wrote:
> >>> diff --git a/net/core/filter.c b/net/core/filter.c
> >>> index bd0df75dc7b6..17f3c37218e5 100644
> >>> --- a/net/core/filter.c
> >>> +++ b/net/core/filter.c
> >>> @@ -4793,6 +4793,60 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
> >>>  }
> >>>  #endif /* CONFIG_IPV6_SEG6_BPF */
> >>>
> >>> +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len)
> >>> +{
> >>> +     struct dst_entry *dst;
> >>> +     struct rtable *rt;
> >>> +     struct iphdr *iph;
> >>> +     struct net *net;
> >>> +     int err;
> >>> +
> >>> +     if (skb->protocol != htons(ETH_P_IP))
> >>> +             return -EINVAL;  /* ETH_P_IPV6 not yet supported */
> >>> +
> >>> +     iph = (struct iphdr *)hdr;
> >>> +
> >>> +     if (unlikely(len < sizeof(struct iphdr) || len > LWTUNNEL_MAX_ENCAP_HSIZE))
> >>> +             return -EINVAL;
> >>> +     if (unlikely(iph->version != 4 || iph->ihl * 4 > len))
> >>> +             return -EINVAL;
> >>> +
> >>> +     if (skb->sk)
> >>> +             net = sock_net(skb->sk);
> >>> +     else {
> >>> +             net = dev_net(skb_dst(skb)->dev);
> >>> +     }
> >>> +     rt = ip_route_output(net, iph->daddr, 0, 0, 0);
> >>
> >> That is a very limited use case. e.g., oif = 0 means you are not
> >> considering any kind of policy routing (e.g., VRF).
> >
> > Hi David! Could you be a bit more specific re: what you would like to
> > see here? Thanks!
> >
>
> Is the encap happening on ingress or egress? Seems like the current code
> does not assume either direction for lwt (BPF_PROG_TYPE_LWT_IN vs
> BPF_PROG_TYPE_LWT_OUT), yet your change does - output only. Basically,
> you should be filling in a flow struct and doing a proper lookup.
>
> When the potentially custom encap header is pushed on, seems to me skb
> marks should still be honored for the route lookup. If not, you should
> handle that in the API.
>
> From there skb->dev at a minimum should be used as either iif (ingress)
> or oif (egress).
>
> The iph is already set so you have quick access to the tos.
>
> Also, this should implement IPv6 as well before going in.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2018-11-30 23:35       ` Peter Oskolkov
@ 2018-11-30 23:51         ` David Ahern
  2018-12-01  0:14           ` Peter Oskolkov
  0 siblings, 1 reply; 10+ messages in thread
From: David Ahern @ 2018-11-30 23:51 UTC (permalink / raw)
  To: Peter Oskolkov; +Cc: ast, daniel, netdev, posk.devel

On 11/30/18 4:35 PM, Peter Oskolkov wrote:
> Thanks, David! This is for egress only, so I'll add an appropriate
> check. I'll also address your other comments/concerns in a v2 version
> of this patchset.

Why are you limiting this to egress only?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2018-11-30 23:51         ` David Ahern
@ 2018-12-01  0:14           ` Peter Oskolkov
  2018-12-03 16:04             ` David Ahern
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Oskolkov @ 2018-12-01  0:14 UTC (permalink / raw)
  To: David Ahern; +Cc: ast, daniel, netdev, posk.devel

On Fri, Nov 30, 2018 at 3:52 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 11/30/18 4:35 PM, Peter Oskolkov wrote:
> > Thanks, David! This is for egress only, so I'll add an appropriate
> > check. I'll also address your other comments/concerns in a v2 version
> > of this patchset.
>
> Why are you limiting this to egress only?

I'm focusing on egress because this is a use case that we have and
understand well, and would like to have a solution for sooner rather
than later.

Without understanding why anybody would want to do lwt-bpf encap on
ingress, I don't know, for example, what a good test would look like;
but I'd be happy to look into a specific ingress use case if you have
one.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2018-12-01  0:14           ` Peter Oskolkov
@ 2018-12-03 16:04             ` David Ahern
  2018-12-03 16:55               ` Peter Oskolkov
  0 siblings, 1 reply; 10+ messages in thread
From: David Ahern @ 2018-12-03 16:04 UTC (permalink / raw)
  To: Peter Oskolkov; +Cc: ast, daniel, netdev, posk.devel

On 11/30/18 5:14 PM, Peter Oskolkov wrote:
> On Fri, Nov 30, 2018 at 3:52 PM David Ahern <dsahern@gmail.com> wrote:
>>
>> On 11/30/18 4:35 PM, Peter Oskolkov wrote:
>>> Thanks, David! This is for egress only, so I'll add an appropriate
>>> check. I'll also address your other comments/concerns in a v2 version
>>> of this patchset.
>>
>> Why are you limiting this to egress only?
> 
> I'm focusing on egress because this is a use case that we have and
> understand well, and would like to have a solution for sooner rather
> than later.
> 
> Without understanding why anybody would want to do lwt-bpf encap on
> ingress, I don't know, for example, what a good test would look like;
> but I'd be happy to look into a specific ingress use case if you have
> one.
> 

We can not have proliferation of helpers for a lot of one off use cases.
A little thought now makes this helper useful for more than just your 1
use case. And, IPv6 parity should be a minimal requirement for helpers.

Based on your route lookup I presume your use case is capturing certain
local traffic, pushing a custom header and sending that packet else
where. The same could be done on the ingress path.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
  2018-12-03 16:04             ` David Ahern
@ 2018-12-03 16:55               ` Peter Oskolkov
  0 siblings, 0 replies; 10+ messages in thread
From: Peter Oskolkov @ 2018-12-03 16:55 UTC (permalink / raw)
  To: David Ahern; +Cc: ast, daniel, netdev, posk.devel

Our case is a bit different - it is more like using the SRH header in
IPv6 to route packets via a non-default intermediate hop. But I see
your point - I'll expand the patchset to cover IPv6 and the ingress
path.
On Mon, Dec 3, 2018 at 8:04 AM David Ahern <dsahern@gmail.com> wrote:
>
> On 11/30/18 5:14 PM, Peter Oskolkov wrote:
> > On Fri, Nov 30, 2018 at 3:52 PM David Ahern <dsahern@gmail.com> wrote:
> >>
> >> On 11/30/18 4:35 PM, Peter Oskolkov wrote:
> >>> Thanks, David! This is for egress only, so I'll add an appropriate
> >>> check. I'll also address your other comments/concerns in a v2 version
> >>> of this patchset.
> >>
> >> Why are you limiting this to egress only?
> >
> > I'm focusing on egress because this is a use case that we have and
> > understand well, and would like to have a solution for sooner rather
> > than later.
> >
> > Without understanding why anybody would want to do lwt-bpf encap on
> > ingress, I don't know, for example, what a good test would look like;
> > but I'd be happy to look into a specific ingress use case if you have
> > one.
> >
>
> We can not have proliferation of helpers for a lot of one off use cases.
> A little thought now makes this helper useful for more than just your 1
> use case. And, IPv6 parity should be a minimal requirement for helpers.
>
> Based on your route lookup I presume your use case is capturing certain
> local traffic, pushing a custom header and sending that packet else
> where. The same could be done on the ingress path.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2018-12-03 16:56 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-29  0:22 [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
2018-11-29  0:22 ` [PATCH bpf-next 2/2] selftests/bpf: add test_lwt_ip_encap selftest Peter Oskolkov
2018-11-29  0:47 ` [PATCH bpf-next 1/2] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern
2018-11-29  1:34   ` Peter Oskolkov
2018-11-30 20:08     ` David Ahern
2018-11-30 23:35       ` Peter Oskolkov
2018-11-30 23:51         ` David Ahern
2018-12-01  0:14           ` Peter Oskolkov
2018-12-03 16:04             ` David Ahern
2018-12-03 16:55               ` Peter Oskolkov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.