linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH
@ 2021-03-03 12:33 Xuesen Huang
  2021-03-03 12:48 ` Xuesen Huang
  2021-03-03 18:53 ` Willem de Bruijn
  0 siblings, 2 replies; 5+ messages in thread
From: Xuesen Huang @ 2021-03-03 12:33 UTC (permalink / raw)
  To: daniel
  Cc: davem, bpf, willemdebruijn.kernel, netdev, linux-kernel,
	xiyou.wangcong, Xuesen Huang, Willem de Bruijn, Zhiyong Cheng,
	Li Wang

From: Xuesen Huang <huangxuesen@kuaishou.com>

bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
encapsulation. But that is not appropriate when pushing Ethernet header.

Add an option to further specify encap L2 type and set the inner_protocol
as ETH_P_TEB.

Update test_tc_tunnel to verify adding vxlan encapsulation works with
this flag.

Suggested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Xuesen Huang <huangxuesen@kuaishou.com>
Signed-off-by: Zhiyong Cheng <chengzhiyong@kuaishou.com>
Signed-off-by: Li Wang <wangli09@kuaishou.com>
---
 include/uapi/linux/bpf.h                           |   5 +
 net/core/filter.c                                  |  11 ++-
 tools/include/uapi/linux/bpf.h                     |   5 +
 tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 107 ++++++++++++++++++---
 tools/testing/selftests/bpf/test_tc_tunnel.sh      |  15 ++-
 5 files changed, 124 insertions(+), 19 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77d7c1b..d791596 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1751,6 +1751,10 @@ struct bpf_stack_build_id {
  *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
  *		  type; *len* is the length of the inner MAC header.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ *		  L2 type as Ethernet.
+ *
  * 		A call to this helper is susceptible to change the underlying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -4088,6 +4092,7 @@ enum {
 	BPF_F_ADJ_ROOM_ENCAP_L4_GRE	= (1ULL << 3),
 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
+	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
 };
 
 enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index 255aeee..8d1fb61 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3412,6 +3412,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
 					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
 					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
 					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+					 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
 					 BPF_F_ADJ_ROOM_ENCAP_L2( \
 					  BPF_ADJ_ROOM_ENCAP_L2_MASK))
 
@@ -3448,6 +3449,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
 			return -EINVAL;
 
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+		    inner_mac_len < ETH_HLEN)
+			return -EINVAL;
+
 		if (skb->encapsulation)
 			return -EALREADY;
 
@@ -3466,7 +3471,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 		skb->inner_mac_header = inner_net - inner_mac_len;
 		skb->inner_network_header = inner_net;
 		skb->inner_transport_header = inner_trans;
-		skb_set_inner_protocol(skb, skb->protocol);
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+			skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+		else
+			skb_set_inner_protocol(skb, skb->protocol);
 
 		skb->encapsulation = 1;
 		skb_set_network_header(skb, mac_len);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 77d7c1b..d791596 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1751,6 +1751,10 @@ struct bpf_stack_build_id {
  *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
  *		  type; *len* is the length of the inner MAC header.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ *		  L2 type as Ethernet.
+ *
  * 		A call to this helper is susceptible to change the underlying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -4088,6 +4092,7 @@ enum {
 	BPF_F_ADJ_ROOM_ENCAP_L4_GRE	= (1ULL << 3),
 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
+	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
 };
 
 enum {
diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
index 37bce7a..6e144db 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
@@ -20,6 +20,14 @@
 #include <bpf/bpf_endian.h>
 #include <bpf/bpf_helpers.h>
 
+#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
+
+#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
+
+#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
+
+#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
+
 static const int cfg_port = 8000;
 
 static const int cfg_udp_src = 20000;
@@ -27,11 +35,24 @@
 #define	UDP_PORT		5555
 #define	MPLS_OVER_UDP_PORT	6635
 #define	ETH_OVER_UDP_PORT	7777
+#define	VXLAN_UDP_PORT		8472
+
+#define	EXTPROTO_VXLAN	0x1
+
+#define	VXLAN_N_VID     (1u << 24)
+#define	VXLAN_VNI_MASK	bpf_htonl((VXLAN_N_VID - 1) << 8)
+#define	VXLAN_FLAGS     0x8
+#define	VXLAN_VNI       1
 
 /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
 static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
 						     MPLS_LS_S_MASK | 0xff);
 
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+} __attribute__((packed));
+
 struct gre_hdr {
 	__be16 flags;
 	__be16 protocol;
@@ -45,13 +66,13 @@ struct gre_hdr {
 struct v4hdr {
 	struct iphdr ip;
 	union l4hdr l4hdr;
-	__u8 pad[16];			/* enough space for L2 header */
+	__u8 pad[24];			/* space for L2 header / vxlan header ... */
 } __attribute__((packed));
 
 struct v6hdr {
 	struct ipv6hdr ip;
 	union l4hdr l4hdr;
-	__u8 pad[16];			/* enough space for L2 header */
+	__u8 pad[24];			/* space for L2 header / vxlan header ... */
 } __attribute__((packed));
 
 static __always_inline void set_ipv4_csum(struct iphdr *iph)
@@ -69,14 +90,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph)
 	iph->check = ~((csum & 0xffff) + (csum >> 16));
 }
 
-static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
-				      __u16 l2_proto)
+static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
+				      __u16 l2_proto, __u16 ext_proto)
 {
 	__u16 udp_dst = UDP_PORT;
 	struct iphdr iph_inner;
 	struct v4hdr h_outer;
 	struct tcphdr tcph;
 	int olen, l2_len;
+	__u8 *l2_hdr = NULL;
 	int tcp_off;
 	__u64 flags;
 
@@ -141,7 +163,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
 		break;
 	case ETH_P_TEB:
 		l2_len = ETH_HLEN;
-		udp_dst = ETH_OVER_UDP_PORT;
+		if (ext_proto & EXTPROTO_VXLAN) {
+			udp_dst = VXLAN_UDP_PORT;
+			l2_len += sizeof(struct vxlanhdr);
+		} else
+			udp_dst = ETH_OVER_UDP_PORT;
 		break;
 	}
 	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
@@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
 	}
 
 	/* add L2 encap (if specified) */
+	l2_hdr = (__u8 *)&h_outer + olen;
 	switch (l2_proto) {
 	case ETH_P_MPLS_UC:
-		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+		*(__u32 *)l2_hdr = mpls_label;
 		break;
 	case ETH_P_TEB:
-		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
-				       ETH_HLEN))
+		flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
+
+		if (ext_proto & EXTPROTO_VXLAN) {
+			struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
+
+			vxlan_hdr->vx_flags = VXLAN_FLAGS;
+			vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
+
+			l2_hdr += sizeof(struct vxlanhdr);
+		}
+
+		if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
 			return TC_ACT_SHOT;
+
 		break;
 	}
 	olen += l2_len;
@@ -214,14 +252,15 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
 	return TC_ACT_OK;
 }
 
-static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
-				      __u16 l2_proto)
+static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
+				      __u16 l2_proto, __u16 ext_proto)
 {
 	__u16 udp_dst = UDP_PORT;
 	struct ipv6hdr iph_inner;
 	struct v6hdr h_outer;
 	struct tcphdr tcph;
 	int olen, l2_len;
+	__u8 *l2_hdr = NULL;
 	__u16 tot_len;
 	__u64 flags;
 
@@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
 		break;
 	case ETH_P_TEB:
 		l2_len = ETH_HLEN;
-		udp_dst = ETH_OVER_UDP_PORT;
+		if (ext_proto & EXTPROTO_VXLAN) {
+			udp_dst = VXLAN_UDP_PORT;
+			l2_len += sizeof(struct vxlanhdr);
+		} else
+			udp_dst = ETH_OVER_UDP_PORT;
 		break;
 	}
 	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
@@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
 		h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
 		h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
 		tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
-			  sizeof(h_outer.l4hdr.udp);
+			  sizeof(h_outer.l4hdr.udp) + l2_len;
 		h_outer.l4hdr.udp.check = 0;
 		h_outer.l4hdr.udp.len = bpf_htons(tot_len);
 		break;
@@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
 	}
 
 	/* add L2 encap (if specified) */
+	l2_hdr = (__u8 *)&h_outer + olen;
 	switch (l2_proto) {
 	case ETH_P_MPLS_UC:
-		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+		*(__u32 *)l2_hdr = mpls_label;
 		break;
 	case ETH_P_TEB:
-		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
-				       ETH_HLEN))
+		flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
+
+		if (ext_proto & EXTPROTO_VXLAN) {
+			struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
+
+			vxlan_hdr->vx_flags = VXLAN_FLAGS;
+			vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
+
+			l2_hdr += sizeof(struct vxlanhdr);
+		}
+
+		if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
 			return TC_ACT_SHOT;
 		break;
 	}
@@ -372,6 +426,16 @@ int __encap_udp_eth(struct __sk_buff *skb)
 		return TC_ACT_OK;
 }
 
+SEC("encap_vxlan_eth")
+int __encap_vxlan_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+		return encap_ipv4_with_ext_proto(skb, IPPROTO_UDP,
+				ETH_P_TEB, EXTPROTO_VXLAN);
+	else
+		return TC_ACT_OK;
+}
+
 SEC("encap_sit_none")
 int __encap_sit_none(struct __sk_buff *skb)
 {
@@ -444,6 +508,16 @@ int __encap_ip6udp_eth(struct __sk_buff *skb)
 		return TC_ACT_OK;
 }
 
+SEC("encap_ip6vxlan_eth")
+int __encap_ip6vxlan_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
+		return encap_ipv6_with_ext_proto(skb, IPPROTO_UDP,
+				ETH_P_TEB, EXTPROTO_VXLAN);
+	else
+		return TC_ACT_OK;
+}
+
 static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
 {
 	char buf[sizeof(struct v6hdr)];
@@ -479,6 +553,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
 		case ETH_OVER_UDP_PORT:
 			olen += ETH_HLEN;
 			break;
+		case VXLAN_UDP_PORT:
+			olen += ETH_HLEN + sizeof(struct vxlanhdr);
+			break;
 		}
 		break;
 	default:
diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
index 7c76b84..c9dde9b 100755
--- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
@@ -44,8 +44,8 @@ setup() {
 	# clamp route to reserve room for tunnel headers
 	ip -netns "${ns1}" -4 route flush table main
 	ip -netns "${ns1}" -6 route flush table main
-	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1
-	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1
+	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1
+	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1
 
 	sleep 1
 
@@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then
 	echo "sit"
 	$0 ipv6 sit none 100
 
+	echo "ip4 vxlan"
+	$0 ipv4 vxlan eth 2000
+
+	echo "ip6 vxlan"
+	$0 ipv6 ip6vxlan eth 2000
+
 	for mac in none mpls eth ; do
 		echo "ip gre $mac"
 		$0 ipv4 gre $mac 100
@@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then
 	targs="encap fou encap-sport auto encap-dport $dport"
 elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
 	ttype=$gretaptype
+elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
+	ttype="vxlan"
+	targs="id 1 dstport 8472 udp6zerocsumrx"
 else
 	ttype=$tuntype
 	targs=""
@@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then
 elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then
 	# No support for TEB fou tunnel; expect failure.
 	expect_tun_fail=1
-elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
+elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then
 	# Share ethernet address between tunnel/veth2 so L2 decap works.
 	ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \
 		  awk '/ether/ { print $2 }')
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH
  2021-03-03 12:33 [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH Xuesen Huang
@ 2021-03-03 12:48 ` Xuesen Huang
  2021-03-03 18:53 ` Willem de Bruijn
  1 sibling, 0 replies; 5+ messages in thread
From: Xuesen Huang @ 2021-03-03 12:48 UTC (permalink / raw)
  To: xiyou.wangcong
  Cc: Daniel Borkmann, David Miller, bpf, Willem de Bruijn,
	Network Development, LKML, xiyou.wangcong, Willem de Bruijn,
	Zhiyong Cheng, Li Wang

Thanks Cong!

Thanks to your suggestion, I try to add a simple test case to test_tc_tunnel. It works 
for me :)

Thanks for your review.

> 2021年3月3日 下午8:33,Xuesen Huang <hxseverything@gmail.com> 写道:
> 
> From: Xuesen Huang <huangxuesen@kuaishou.com>
> 
> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
> encapsulation. But that is not appropriate when pushing Ethernet header.
> 
> Add an option to further specify encap L2 type and set the inner_protocol
> as ETH_P_TEB.
> 
> Update test_tc_tunnel to verify adding vxlan encapsulation works with
> this flag.
> 
> Suggested-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Xuesen Huang <huangxuesen@kuaishou.com>
> Signed-off-by: Zhiyong Cheng <chengzhiyong@kuaishou.com>
> Signed-off-by: Li Wang <wangli09@kuaishou.com>
> ---
> include/uapi/linux/bpf.h                           |   5 +
> net/core/filter.c                                  |  11 ++-
> tools/include/uapi/linux/bpf.h                     |   5 +
> tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 107 ++++++++++++++++++---
> tools/testing/selftests/bpf/test_tc_tunnel.sh      |  15 ++-
> 5 files changed, 124 insertions(+), 19 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 77d7c1b..d791596 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1751,6 +1751,10 @@ struct bpf_stack_build_id {
>  *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
>  *		  type; *len* is the length of the inner MAC header.
>  *
> + *		* **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
> + *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
> + *		  L2 type as Ethernet.
> + *
>  * 		A call to this helper is susceptible to change the underlying
>  * 		packet buffer. Therefore, at load time, all checks on pointers
>  * 		previously done by the verifier are invalidated and must be
> @@ -4088,6 +4092,7 @@ enum {
> 	BPF_F_ADJ_ROOM_ENCAP_L4_GRE	= (1ULL << 3),
> 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
> 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
> +	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
> };
> 
> enum {
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 255aeee..8d1fb61 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3412,6 +3412,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
> 					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
> 					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
> 					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
> +					 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
> 					 BPF_F_ADJ_ROOM_ENCAP_L2( \
> 					  BPF_ADJ_ROOM_ENCAP_L2_MASK))
> 
> @@ -3448,6 +3449,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
> 		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
> 			return -EINVAL;
> 
> +		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
> +		    inner_mac_len < ETH_HLEN)
> +			return -EINVAL;
> +
> 		if (skb->encapsulation)
> 			return -EALREADY;
> 
> @@ -3466,7 +3471,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
> 		skb->inner_mac_header = inner_net - inner_mac_len;
> 		skb->inner_network_header = inner_net;
> 		skb->inner_transport_header = inner_trans;
> -		skb_set_inner_protocol(skb, skb->protocol);
> +
> +		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
> +			skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +		else
> +			skb_set_inner_protocol(skb, skb->protocol);
> 
> 		skb->encapsulation = 1;
> 		skb_set_network_header(skb, mac_len);
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 77d7c1b..d791596 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1751,6 +1751,10 @@ struct bpf_stack_build_id {
>  *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
>  *		  type; *len* is the length of the inner MAC header.
>  *
> + *		* **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
> + *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
> + *		  L2 type as Ethernet.
> + *
>  * 		A call to this helper is susceptible to change the underlying
>  * 		packet buffer. Therefore, at load time, all checks on pointers
>  * 		previously done by the verifier are invalidated and must be
> @@ -4088,6 +4092,7 @@ enum {
> 	BPF_F_ADJ_ROOM_ENCAP_L4_GRE	= (1ULL << 3),
> 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
> 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
> +	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
> };
> 
> enum {
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> index 37bce7a..6e144db 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> @@ -20,6 +20,14 @@
> #include <bpf/bpf_endian.h>
> #include <bpf/bpf_helpers.h>
> 
> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
> +
> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
> +
> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
> +
> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
> +
> static const int cfg_port = 8000;
> 
> static const int cfg_udp_src = 20000;
> @@ -27,11 +35,24 @@
> #define	UDP_PORT		5555
> #define	MPLS_OVER_UDP_PORT	6635
> #define	ETH_OVER_UDP_PORT	7777
> +#define	VXLAN_UDP_PORT		8472
> +
> +#define	EXTPROTO_VXLAN	0x1
> +
> +#define	VXLAN_N_VID     (1u << 24)
> +#define	VXLAN_VNI_MASK	bpf_htonl((VXLAN_N_VID - 1) << 8)
> +#define	VXLAN_FLAGS     0x8
> +#define	VXLAN_VNI       1
> 
> /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
> static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
> 						     MPLS_LS_S_MASK | 0xff);
> 
> +struct vxlanhdr {
> +	__be32 vx_flags;
> +	__be32 vx_vni;
> +} __attribute__((packed));
> +
> struct gre_hdr {
> 	__be16 flags;
> 	__be16 protocol;
> @@ -45,13 +66,13 @@ struct gre_hdr {
> struct v4hdr {
> 	struct iphdr ip;
> 	union l4hdr l4hdr;
> -	__u8 pad[16];			/* enough space for L2 header */
> +	__u8 pad[24];			/* space for L2 header / vxlan header ... */
> } __attribute__((packed));
> 
> struct v6hdr {
> 	struct ipv6hdr ip;
> 	union l4hdr l4hdr;
> -	__u8 pad[16];			/* enough space for L2 header */
> +	__u8 pad[24];			/* space for L2 header / vxlan header ... */
> } __attribute__((packed));
> 
> static __always_inline void set_ipv4_csum(struct iphdr *iph)
> @@ -69,14 +90,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph)
> 	iph->check = ~((csum & 0xffff) + (csum >> 16));
> }
> 
> -static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> -				      __u16 l2_proto)
> +static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> +				      __u16 l2_proto, __u16 ext_proto)
> {
> 	__u16 udp_dst = UDP_PORT;
> 	struct iphdr iph_inner;
> 	struct v4hdr h_outer;
> 	struct tcphdr tcph;
> 	int olen, l2_len;
> +	__u8 *l2_hdr = NULL;
> 	int tcp_off;
> 	__u64 flags;
> 
> @@ -141,7 +163,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> 		break;
> 	case ETH_P_TEB:
> 		l2_len = ETH_HLEN;
> -		udp_dst = ETH_OVER_UDP_PORT;
> +		if (ext_proto & EXTPROTO_VXLAN) {
> +			udp_dst = VXLAN_UDP_PORT;
> +			l2_len += sizeof(struct vxlanhdr);
> +		} else
> +			udp_dst = ETH_OVER_UDP_PORT;
> 		break;
> 	}
> 	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> 	}
> 
> 	/* add L2 encap (if specified) */
> +	l2_hdr = (__u8 *)&h_outer + olen;
> 	switch (l2_proto) {
> 	case ETH_P_MPLS_UC:
> -		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> +		*(__u32 *)l2_hdr = mpls_label;
> 		break;
> 	case ETH_P_TEB:
> -		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> -				       ETH_HLEN))
> +		flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
> +
> +		if (ext_proto & EXTPROTO_VXLAN) {
> +			struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
> +
> +			vxlan_hdr->vx_flags = VXLAN_FLAGS;
> +			vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
> +
> +			l2_hdr += sizeof(struct vxlanhdr);
> +		}
> +
> +		if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
> 			return TC_ACT_SHOT;
> +
> 		break;
> 	}
> 	olen += l2_len;
> @@ -214,14 +252,15 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> 	return TC_ACT_OK;
> }
> 
> -static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> -				      __u16 l2_proto)
> +static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> +				      __u16 l2_proto, __u16 ext_proto)
> {
> 	__u16 udp_dst = UDP_PORT;
> 	struct ipv6hdr iph_inner;
> 	struct v6hdr h_outer;
> 	struct tcphdr tcph;
> 	int olen, l2_len;
> +	__u8 *l2_hdr = NULL;
> 	__u16 tot_len;
> 	__u64 flags;
> 
> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> 		break;
> 	case ETH_P_TEB:
> 		l2_len = ETH_HLEN;
> -		udp_dst = ETH_OVER_UDP_PORT;
> +		if (ext_proto & EXTPROTO_VXLAN) {
> +			udp_dst = VXLAN_UDP_PORT;
> +			l2_len += sizeof(struct vxlanhdr);
> +		} else
> +			udp_dst = ETH_OVER_UDP_PORT;
> 		break;
> 	}
> 	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> 		h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
> 		h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
> 		tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
> -			  sizeof(h_outer.l4hdr.udp);
> +			  sizeof(h_outer.l4hdr.udp) + l2_len;
> 		h_outer.l4hdr.udp.check = 0;
> 		h_outer.l4hdr.udp.len = bpf_htons(tot_len);
> 		break;
> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> 	}
> 
> 	/* add L2 encap (if specified) */
> +	l2_hdr = (__u8 *)&h_outer + olen;
> 	switch (l2_proto) {
> 	case ETH_P_MPLS_UC:
> -		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> +		*(__u32 *)l2_hdr = mpls_label;
> 		break;
> 	case ETH_P_TEB:
> -		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> -				       ETH_HLEN))
> +		flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
> +
> +		if (ext_proto & EXTPROTO_VXLAN) {
> +			struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
> +
> +			vxlan_hdr->vx_flags = VXLAN_FLAGS;
> +			vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
> +
> +			l2_hdr += sizeof(struct vxlanhdr);
> +		}
> +
> +		if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
> 			return TC_ACT_SHOT;
> 		break;
> 	}
> @@ -372,6 +426,16 @@ int __encap_udp_eth(struct __sk_buff *skb)
> 		return TC_ACT_OK;
> }
> 
> +SEC("encap_vxlan_eth")
> +int __encap_vxlan_eth(struct __sk_buff *skb)
> +{
> +	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
> +		return encap_ipv4_with_ext_proto(skb, IPPROTO_UDP,
> +				ETH_P_TEB, EXTPROTO_VXLAN);
> +	else
> +		return TC_ACT_OK;
> +}
> +
> SEC("encap_sit_none")
> int __encap_sit_none(struct __sk_buff *skb)
> {
> @@ -444,6 +508,16 @@ int __encap_ip6udp_eth(struct __sk_buff *skb)
> 		return TC_ACT_OK;
> }
> 
> +SEC("encap_ip6vxlan_eth")
> +int __encap_ip6vxlan_eth(struct __sk_buff *skb)
> +{
> +	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
> +		return encap_ipv6_with_ext_proto(skb, IPPROTO_UDP,
> +				ETH_P_TEB, EXTPROTO_VXLAN);
> +	else
> +		return TC_ACT_OK;
> +}
> +
> static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
> {
> 	char buf[sizeof(struct v6hdr)];
> @@ -479,6 +553,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
> 		case ETH_OVER_UDP_PORT:
> 			olen += ETH_HLEN;
> 			break;
> +		case VXLAN_UDP_PORT:
> +			olen += ETH_HLEN + sizeof(struct vxlanhdr);
> +			break;
> 		}
> 		break;
> 	default:
> diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
> index 7c76b84..c9dde9b 100755
> --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
> +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
> @@ -44,8 +44,8 @@ setup() {
> 	# clamp route to reserve room for tunnel headers
> 	ip -netns "${ns1}" -4 route flush table main
> 	ip -netns "${ns1}" -6 route flush table main
> -	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1
> -	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1
> +	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1
> +	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1
> 
> 	sleep 1
> 
> @@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then
> 	echo "sit"
> 	$0 ipv6 sit none 100
> 
> +	echo "ip4 vxlan"
> +	$0 ipv4 vxlan eth 2000
> +
> +	echo "ip6 vxlan"
> +	$0 ipv6 ip6vxlan eth 2000
> +
> 	for mac in none mpls eth ; do
> 		echo "ip gre $mac"
> 		$0 ipv4 gre $mac 100
> @@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then
> 	targs="encap fou encap-sport auto encap-dport $dport"
> elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
> 	ttype=$gretaptype
> +elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
> +	ttype="vxlan"
> +	targs="id 1 dstport 8472 udp6zerocsumrx"
> else
> 	ttype=$tuntype
> 	targs=""
> @@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then
> elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then
> 	# No support for TEB fou tunnel; expect failure.
> 	expect_tun_fail=1
> -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
> +elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then
> 	# Share ethernet address between tunnel/veth2 so L2 decap works.
> 	ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \
> 		  awk '/ether/ { print $2 }')
> -- 
> 1.8.3.1
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH
  2021-03-03 12:33 [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH Xuesen Huang
  2021-03-03 12:48 ` Xuesen Huang
@ 2021-03-03 18:53 ` Willem de Bruijn
  2021-03-04  3:22   ` Xuesen Huang
  1 sibling, 1 reply; 5+ messages in thread
From: Willem de Bruijn @ 2021-03-03 18:53 UTC (permalink / raw)
  To: Xuesen Huang
  Cc: Daniel Borkmann, David Miller, bpf, Willem de Bruijn,
	Network Development, linux-kernel, Cong Wang, Xuesen Huang,
	Zhiyong Cheng, Li Wang

On Wed, Mar 3, 2021 at 7:33 AM Xuesen Huang <hxseverything@gmail.com> wrote:
>
> From: Xuesen Huang <huangxuesen@kuaishou.com>
>
> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
> encapsulation. But that is not appropriate when pushing Ethernet header.
>
> Add an option to further specify encap L2 type and set the inner_protocol
> as ETH_P_TEB.
>
> Update test_tc_tunnel to verify adding vxlan encapsulation works with
> this flag.
>
> Suggested-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Xuesen Huang <huangxuesen@kuaishou.com>
> Signed-off-by: Zhiyong Cheng <chengzhiyong@kuaishou.com>
> Signed-off-by: Li Wang <wangli09@kuaishou.com>

Thanks for adding the test. Perhaps that is better in a separate patch?

Overall looks great to me.

The patch has not (yet?) arrived on patchwork.

>  enum {
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> index 37bce7a..6e144db 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> @@ -20,6 +20,14 @@
>  #include <bpf/bpf_endian.h>
>  #include <bpf/bpf_helpers.h>
>
> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
> +
> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
> +
> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
> +
> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
> +

Instead of untyped macros, I'd define encap_ipv4 as a function that
calls __encap_ipv4.

And no need for encap_ipv4_with_ext_proto equivalent to __encap_ipv4.

>  static const int cfg_port = 8000;
>
>  static const int cfg_udp_src = 20000;
> @@ -27,11 +35,24 @@
>  #define        UDP_PORT                5555
>  #define        MPLS_OVER_UDP_PORT      6635
>  #define        ETH_OVER_UDP_PORT       7777
> +#define        VXLAN_UDP_PORT          8472
> +
> +#define        EXTPROTO_VXLAN  0x1
> +
> +#define        VXLAN_N_VID     (1u << 24)
> +#define        VXLAN_VNI_MASK  bpf_htonl((VXLAN_N_VID - 1) << 8)
> +#define        VXLAN_FLAGS     0x8
> +#define        VXLAN_VNI       1
>
>  /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
>  static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
>                                                      MPLS_LS_S_MASK | 0xff);
>
> +struct vxlanhdr {
> +       __be32 vx_flags;
> +       __be32 vx_vni;
> +} __attribute__((packed));
> +
>  struct gre_hdr {
>         __be16 flags;
>         __be16 protocol;
> @@ -45,13 +66,13 @@ struct gre_hdr {
>  struct v4hdr {
>         struct iphdr ip;
>         union l4hdr l4hdr;
> -       __u8 pad[16];                   /* enough space for L2 header */
> +       __u8 pad[24];                   /* space for L2 header / vxlan header ... */

could we use something like sizeof(..) instead of a constant?

> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
>         }
>
>         /* add L2 encap (if specified) */
> +       l2_hdr = (__u8 *)&h_outer + olen;
>         switch (l2_proto) {
>         case ETH_P_MPLS_UC:
> -               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> +               *(__u32 *)l2_hdr = mpls_label;
>                 break;
>         case ETH_P_TEB:
> -               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> -                                      ETH_HLEN))

This is non-standard indentation? Here and elsewhere.

> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>                 break;
>         case ETH_P_TEB:
>                 l2_len = ETH_HLEN;
> -               udp_dst = ETH_OVER_UDP_PORT;
> +               if (ext_proto & EXTPROTO_VXLAN) {
> +                       udp_dst = VXLAN_UDP_PORT;
> +                       l2_len += sizeof(struct vxlanhdr);
> +               } else
> +                       udp_dst = ETH_OVER_UDP_PORT;
>                 break;
>         }
>         flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>                 h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
>                 h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
>                 tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
> -                         sizeof(h_outer.l4hdr.udp);
> +                         sizeof(h_outer.l4hdr.udp) + l2_len;

Was this a bug previously?

>                 h_outer.l4hdr.udp.check = 0;
>                 h_outer.l4hdr.udp.len = bpf_htons(tot_len);
>                 break;
> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>         }
>
>         /* add L2 encap (if specified) */
> +       l2_hdr = (__u8 *)&h_outer + olen;
>         switch (l2_proto) {
>         case ETH_P_MPLS_UC:
> -               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> +               *(__u32 *)l2_hdr = mpls_label;
>                 break;
>         case ETH_P_TEB:
> -               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> -                                      ETH_HLEN))
> +               flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;

This is a change also for the existing case. Correctly so, I imagine.
But the test used to pass with the wrong protocol?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH
  2021-03-03 18:53 ` Willem de Bruijn
@ 2021-03-04  3:22   ` Xuesen Huang
  2021-03-04  3:40     ` Willem de Bruijn
  0 siblings, 1 reply; 5+ messages in thread
From: Xuesen Huang @ 2021-03-04  3:22 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Daniel Borkmann, David Miller, bpf, Network Development,
	linux-kernel, Cong Wang, Zhiyong Cheng, Li Wang



> 2021年3月4日 上午2:53,Willem de Bruijn <willemdebruijn.kernel@gmail.com> 写道:
> 
> On Wed, Mar 3, 2021 at 7:33 AM Xuesen Huang <hxseverything@gmail.com> wrote:
>> 
>> From: Xuesen Huang <huangxuesen@kuaishou.com>
>> 
>> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
>> encapsulation. But that is not appropriate when pushing Ethernet header.
>> 
>> Add an option to further specify encap L2 type and set the inner_protocol
>> as ETH_P_TEB.
>> 
>> Update test_tc_tunnel to verify adding vxlan encapsulation works with
>> this flag.
>> 
>> Suggested-by: Willem de Bruijn <willemb@google.com>
>> Signed-off-by: Xuesen Huang <huangxuesen@kuaishou.com>
>> Signed-off-by: Zhiyong Cheng <chengzhiyong@kuaishou.com>
>> Signed-off-by: Li Wang <wangli09@kuaishou.com>
> 
> Thanks for adding the test. Perhaps that is better in a separate patch?
> 
> Overall looks great to me.
> 
> The patch has not (yet?) arrived on patchwork.
> 
Thanks Willem, I will separate it into two patch.

I will send patch/v5 with only that new flag addition, lol.

>> enum {
>> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> index 37bce7a..6e144db 100644
>> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> @@ -20,6 +20,14 @@
>> #include <bpf/bpf_endian.h>
>> #include <bpf/bpf_helpers.h>
>> 
>> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
>> +
>> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
>> +
>> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
>> +
>> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
>> +
> 
> Instead of untyped macros, I'd define encap_ipv4 as a function that
> calls __encap_ipv4.
> 
> And no need for encap_ipv4_with_ext_proto equivalent to __encap_ipv4.
> 
I defined these macros to try to keep the existing  invocation for encap_ipv4/6
as the same, if we define this as a function all invocation should be modified?

>> static const int cfg_port = 8000;
>> 
>> static const int cfg_udp_src = 20000;
>> @@ -27,11 +35,24 @@
>> #define        UDP_PORT                5555
>> #define        MPLS_OVER_UDP_PORT      6635
>> #define        ETH_OVER_UDP_PORT       7777
>> +#define        VXLAN_UDP_PORT          8472
>> +
>> +#define        EXTPROTO_VXLAN  0x1
>> +
>> +#define        VXLAN_N_VID     (1u << 24)
>> +#define        VXLAN_VNI_MASK  bpf_htonl((VXLAN_N_VID - 1) << 8)
>> +#define        VXLAN_FLAGS     0x8
>> +#define        VXLAN_VNI       1
>> 
>> /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
>> static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
>>                                                     MPLS_LS_S_MASK | 0xff);
>> 
>> +struct vxlanhdr {
>> +       __be32 vx_flags;
>> +       __be32 vx_vni;
>> +} __attribute__((packed));
>> +
>> struct gre_hdr {
>>        __be16 flags;
>>        __be16 protocol;
>> @@ -45,13 +66,13 @@ struct gre_hdr {
>> struct v4hdr {
>>        struct iphdr ip;
>>        union l4hdr l4hdr;
>> -       __u8 pad[16];                   /* enough space for L2 header */
>> +       __u8 pad[24];                   /* space for L2 header / vxlan header ... */
> 
> could we use something like sizeof(..) instead of a constant?
> 
Thanks, I will try to fix this.

>> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
>>        }
>> 
>>        /* add L2 encap (if specified) */
>> +       l2_hdr = (__u8 *)&h_outer + olen;
>>        switch (l2_proto) {
>>        case ETH_P_MPLS_UC:
>> -               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
>> +               *(__u32 *)l2_hdr = mpls_label;
>>                break;
>>        case ETH_P_TEB:
>> -               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
>> -                                      ETH_HLEN))
> 
> This is non-standard indentation? Here and elsewhere.
I thinks it’s a previous issue.

> 
>> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>>                break;
>>        case ETH_P_TEB:
>>                l2_len = ETH_HLEN;
>> -               udp_dst = ETH_OVER_UDP_PORT;
>> +               if (ext_proto & EXTPROTO_VXLAN) {
>> +                       udp_dst = VXLAN_UDP_PORT;
>> +                       l2_len += sizeof(struct vxlanhdr);
>> +               } else
>> +                       udp_dst = ETH_OVER_UDP_PORT;
>>                break;
>>        }
>>        flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
>> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>>                h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
>>                h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
>>                tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
>> -                         sizeof(h_outer.l4hdr.udp);
>> +                         sizeof(h_outer.l4hdr.udp) + l2_len;
> 
> Was this a bug previously?
> 
Yes, a tiny bug.

>>                h_outer.l4hdr.udp.check = 0;
>>                h_outer.l4hdr.udp.len = bpf_htons(tot_len);
>>                break;
>> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>>        }
>> 
>>        /* add L2 encap (if specified) */
>> +       l2_hdr = (__u8 *)&h_outer + olen;
>>        switch (l2_proto) {
>>        case ETH_P_MPLS_UC:
>> -               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
>> +               *(__u32 *)l2_hdr = mpls_label;
>>                break;
>>        case ETH_P_TEB:
>> -               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
>> -                                      ETH_HLEN))
>> +               flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
> 
> This is a change also for the existing case. Correctly so, I imagine.
> But the test used to pass with the wrong protocol?
Yes all tests pass. I’m not sure should we add this flag for the existing tests
which encap eth as the l2 header or only for the Vxlan test?  

Waiting for your suggestion.
Thanks.



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH
  2021-03-04  3:22   ` Xuesen Huang
@ 2021-03-04  3:40     ` Willem de Bruijn
  0 siblings, 0 replies; 5+ messages in thread
From: Willem de Bruijn @ 2021-03-04  3:40 UTC (permalink / raw)
  To: Xuesen Huang
  Cc: Willem de Bruijn, Daniel Borkmann, David Miller, bpf,
	Network Development, linux-kernel, Cong Wang, Zhiyong Cheng,
	Li Wang

> > Instead of untyped macros, I'd define encap_ipv4 as a function that
> > calls __encap_ipv4.
> >
> > And no need for encap_ipv4_with_ext_proto equivalent to __encap_ipv4.
> >
> I defined these macros to try to keep the existing  invocation for encap_ipv4/6
> as the same, if we define this as a function all invocation should be modified?

You can leave the existing invocations the same and make the new
callers caller __encap_ipv4 directly, which takes one extra argument?
Adding a __ prefixed variant with extra args is a common pattern.

> >>        /* add L2 encap (if specified) */
> >> +       l2_hdr = (__u8 *)&h_outer + olen;
> >>        switch (l2_proto) {
> >>        case ETH_P_MPLS_UC:
> >> -               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> >> +               *(__u32 *)l2_hdr = mpls_label;
> >>                break;
> >>        case ETH_P_TEB:
> >> -               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> >> -                                      ETH_HLEN))
> >
> > This is non-standard indentation? Here and elsewhere.
> I thinks it’s a previous issue.

Ah right. Bad example. How about in __encap_vxlan_eth

+               return encap_ipv4_with_ext_proto(skb, IPPROTO_UDP,
+                               ETH_P_TEB, EXTPROTO_VXLAN);

> >> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> >>        }
> >>
> >>        /* add L2 encap (if specified) */
> >> +       l2_hdr = (__u8 *)&h_outer + olen;
> >>        switch (l2_proto) {
> >>        case ETH_P_MPLS_UC:
> >> -               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> >> +               *(__u32 *)l2_hdr = mpls_label;
> >>                break;
> >>        case ETH_P_TEB:
> >> -               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> >> -                                      ETH_HLEN))
> >> +               flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
> >
> > This is a change also for the existing case. Correctly so, I imagine.
> > But the test used to pass with the wrong protocol?
> Yes all tests pass. I’m not sure should we add this flag for the existing tests
> which encap eth as the l2 header or only for the Vxlan test?

It is correct in both cases. If it does not break anything, I would do both.

Thanks,

  Willem

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-03-04  3:42 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-03 12:33 [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH Xuesen Huang
2021-03-03 12:48 ` Xuesen Huang
2021-03-03 18:53 ` Willem de Bruijn
2021-03-04  3:22   ` Xuesen Huang
2021-03-04  3:40     ` Willem de Bruijn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).