Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
To: Xuesen Huang <hxseverything@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>,
	David Miller <davem@davemloft.net>, bpf <bpf@vger.kernel.org>,
	Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
	Network Development <netdev@vger.kernel.org>,
	linux-kernel <linux-kernel@vger.kernel.org>,
	Cong Wang <xiyou.wangcong@gmail.com>,
	Xuesen Huang <huangxuesen@kuaishou.com>,
	Zhiyong Cheng <chengzhiyong@kuaishou.com>,
	Li Wang <wangli09@kuaishou.com>
Subject: Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH
Date: Wed, 3 Mar 2021 13:53:34 -0500	[thread overview]
Message-ID: <CA+FuTSfY0y7Y2XSKO-rqPY5mX83NWgAWbQeVukFA94eJVu2B2g@mail.gmail.com> (raw)
In-Reply-To: <20210303123338.99089-1-hxseverything@gmail.com>

On Wed, Mar 3, 2021 at 7:33 AM Xuesen Huang <hxseverything@gmail.com> wrote:
>
> From: Xuesen Huang <huangxuesen@kuaishou.com>
>
> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
> encapsulation. But that is not appropriate when pushing Ethernet header.
>
> Add an option to further specify encap L2 type and set the inner_protocol
> as ETH_P_TEB.
>
> Update test_tc_tunnel to verify adding vxlan encapsulation works with
> this flag.
>
> Suggested-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Xuesen Huang <huangxuesen@kuaishou.com>
> Signed-off-by: Zhiyong Cheng <chengzhiyong@kuaishou.com>
> Signed-off-by: Li Wang <wangli09@kuaishou.com>

Thanks for adding the test. Perhaps that is better in a separate patch?

Overall looks great to me.

The patch has not (yet?) arrived on patchwork.

>  enum {
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> index 37bce7a..6e144db 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> @@ -20,6 +20,14 @@
>  #include <bpf/bpf_endian.h>
>  #include <bpf/bpf_helpers.h>
>
> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
> +
> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
> +
> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
> +
> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
> +

Instead of untyped macros, I'd define encap_ipv4 as a function that
calls __encap_ipv4.

And there is no need for encap_ipv4_with_ext_proto, which is equivalent to __encap_ipv4.

>  static const int cfg_port = 8000;
>
>  static const int cfg_udp_src = 20000;
> @@ -27,11 +35,24 @@
>  #define        UDP_PORT                5555
>  #define        MPLS_OVER_UDP_PORT      6635
>  #define        ETH_OVER_UDP_PORT       7777
> +#define        VXLAN_UDP_PORT          8472
> +
> +#define        EXTPROTO_VXLAN  0x1
> +
> +#define        VXLAN_N_VID     (1u << 24)
> +#define        VXLAN_VNI_MASK  bpf_htonl((VXLAN_N_VID - 1) << 8)
> +#define        VXLAN_FLAGS     0x8
> +#define        VXLAN_VNI       1
>
>  /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
>  static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
>                                                      MPLS_LS_S_MASK | 0xff);
>
> +struct vxlanhdr {
> +       __be32 vx_flags;
> +       __be32 vx_vni;
> +} __attribute__((packed));
> +
>  struct gre_hdr {
>         __be16 flags;
>         __be16 protocol;
> @@ -45,13 +66,13 @@ struct gre_hdr {
>  struct v4hdr {
>         struct iphdr ip;
>         union l4hdr l4hdr;
> -       __u8 pad[16];                   /* enough space for L2 header */
> +       __u8 pad[24];                   /* space for L2 header / vxlan header ... */

Could we use something like sizeof(..) instead of a constant?

> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
>         }
>
>         /* add L2 encap (if specified) */
> +       l2_hdr = (__u8 *)&h_outer + olen;
>         switch (l2_proto) {
>         case ETH_P_MPLS_UC:
> -               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> +               *(__u32 *)l2_hdr = mpls_label;
>                 break;
>         case ETH_P_TEB:
> -               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> -                                      ETH_HLEN))

This is non-standard indentation? Here and elsewhere.

> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>                 break;
>         case ETH_P_TEB:
>                 l2_len = ETH_HLEN;
> -               udp_dst = ETH_OVER_UDP_PORT;
> +               if (ext_proto & EXTPROTO_VXLAN) {
> +                       udp_dst = VXLAN_UDP_PORT;
> +                       l2_len += sizeof(struct vxlanhdr);
> +               } else
> +                       udp_dst = ETH_OVER_UDP_PORT;
>                 break;
>         }
>         flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>                 h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
>                 h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
>                 tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
> -                         sizeof(h_outer.l4hdr.udp);
> +                         sizeof(h_outer.l4hdr.udp) + l2_len;

Was this a bug previously?

>                 h_outer.l4hdr.udp.check = 0;
>                 h_outer.l4hdr.udp.len = bpf_htons(tot_len);
>                 break;
> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>         }
>
>         /* add L2 encap (if specified) */
> +       l2_hdr = (__u8 *)&h_outer + olen;
>         switch (l2_proto) {
>         case ETH_P_MPLS_UC:
> -               *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> +               *(__u32 *)l2_hdr = mpls_label;
>                 break;
>         case ETH_P_TEB:
> -               if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> -                                      ETH_HLEN))
> +               flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;

This also changes the existing case. Correctly so, I imagine.
But did the test use to pass with the wrong protocol?