Re: [PATCH bpf-next 07/13] bpf: tcp: Support tcp_congestion_ops in bpf

From: Yonghong Song <yhs@fb.com>
To: Martin Lau <kafai@fb.com>, "bpf@vger.kernel.org" <bpf@vger.kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	David Miller <davem@davemloft.net>,
	Kernel Team <Kernel-team@fb.com>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>
Subject: Re: [PATCH bpf-next 07/13] bpf: tcp: Support tcp_congestion_ops in bpf
Date: Tue, 17 Dec 2019 17:36:13 +0000	[thread overview]
Message-ID: <bb218f08-79ea-5029-e0a2-57d8a6d07d51@fb.com> (raw)
In-Reply-To: <20191214004753.1653075-1-kafai@fb.com>

On 12/13/19 4:47 PM, Martin KaFai Lau wrote:
> This patch makes "struct tcp_congestion_ops" to be the first user
> of BPF STRUCT_OPS.  It allows implementing a tcp_congestion_ops
> in bpf.
> 
> The BPF implemented tcp_congestion_ops can be used like
> regular kernel tcp-cc through sysctl and setsockopt.  e.g.
> [root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion
> net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic
> net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic
> net.ipv4.tcp_congestion_control = bpf_cubic
> 
> There has been attempt to move the TCP CC to the user space
> (e.g. CCP in TCP).   The common arguments are faster turn around,
> get away from long-tail kernel versions in production...etc,
> which are legit points.
> 
> BPF has been the continuous effort to join both kernel and
> userspace upsides together (e.g. XDP to gain the performance
> advantage without bypassing the kernel).  The recent BPF
> advancements (in particular BTF-aware verifier, BPF trampoline,
> BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc)
> possible in BPF.  It allows a faster turnaround for testing algorithm
> in the production while leveraging the existing (and continue growing)
> BPF feature/framework instead of building one specifically for
> userspace TCP CC.
> 
> This patch allows write access to a few fields in tcp-sock
> (in bpf_tcp_ca_btf_struct_access()).
> 
> The optional "get_info" is unsupported now.  It can be added
> later.  One possible way is to output the info with a btf-id
> to describe the content.
> 
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>   include/linux/filter.h            |   2 +
>   include/net/tcp.h                 |   1 +
>   kernel/bpf/bpf_struct_ops_types.h |   7 +-
>   net/core/filter.c                 |   2 +-
>   net/ipv4/Makefile                 |   4 +
>   net/ipv4/bpf_tcp_ca.c             | 225 ++++++++++++++++++++++++++++++
>   net/ipv4/tcp_cong.c               |  14 +-
>   net/ipv4/tcp_ipv4.c               |   6 +-
>   net/ipv4/tcp_minisocks.c          |   4 +-
>   net/ipv4/tcp_output.c             |   4 +-
>   10 files changed, 254 insertions(+), 15 deletions(-)
>   create mode 100644 net/ipv4/bpf_tcp_ca.c
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 37ac7025031d..7c22c5e6528d 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -844,6 +844,8 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
>   int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
>   			      bpf_aux_classic_check_t trans, bool save_orig);
>   void bpf_prog_destroy(struct bpf_prog *fp);
> +const struct bpf_func_proto *
> +bpf_base_func_proto(enum bpf_func_id func_id);
>   
>   int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
>   int sk_attach_bpf(u32 ufd, struct sock *sk);
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 86b9a8766648..fd87fa1df603 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1007,6 +1007,7 @@ enum tcp_ca_ack_event_flags {
>   #define TCP_CONG_NON_RESTRICTED 0x1
>   /* Requires ECN/ECT set on all packets */
>   #define TCP_CONG_NEEDS_ECN	0x2
> +#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
>   
>   union tcp_cc_info;
>   
> diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
> index 7bb13ff49ec2..066d83ea1c99 100644
> --- a/kernel/bpf/bpf_struct_ops_types.h
> +++ b/kernel/bpf/bpf_struct_ops_types.h
> @@ -1,4 +1,9 @@
>   /* SPDX-License-Identifier: GPL-2.0 */
>   /* internal file - do not include directly */
>   
> -/* To be filled in a later patch */
> +#ifdef CONFIG_BPF_JIT
> +#ifdef CONFIG_INET
> +#include <net/tcp.h>
> +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
> +#endif
> +#endif
> diff --git a/net/core/filter.c b/net/core/filter.c
> index a411f7835dee..fbb3698026bd 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5975,7 +5975,7 @@ bool bpf_helper_changes_pkt_data(void *func)
>   	return false;
>   }
>   
> -static const struct bpf_func_proto *
> +const struct bpf_func_proto *
>   bpf_base_func_proto(enum bpf_func_id func_id)
>   {
>   	switch (func_id) {
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index d57ecfaf89d4..7360d9b3eaad 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
>   
>   obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
>   		      xfrm4_output.o xfrm4_protocol.o
> +
> +ifeq ($(CONFIG_BPF_SYSCALL),y)
> +obj-$(CONFIG_BPF_JIT) += bpf_tcp_ca.o
> +endif
> diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
> new file mode 100644
> index 000000000000..967af987bc26
> --- /dev/null
> +++ b/net/ipv4/bpf_tcp_ca.c
> @@ -0,0 +1,225 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2019 Facebook  */
> +
> +#include <linux/types.h>
> +#include <linux/bpf_verifier.h>
> +#include <linux/bpf.h>
> +#include <linux/btf.h>
> +#include <linux/filter.h>
> +#include <net/tcp.h>
> +
> +static u32 optional_ops[] = {
> +	offsetof(struct tcp_congestion_ops, init),
> +	offsetof(struct tcp_congestion_ops, release),
> +	offsetof(struct tcp_congestion_ops, set_state),
> +	offsetof(struct tcp_congestion_ops, cwnd_event),
> +	offsetof(struct tcp_congestion_ops, in_ack_event),
> +	offsetof(struct tcp_congestion_ops, pkts_acked),
> +	offsetof(struct tcp_congestion_ops, min_tso_segs),
> +	offsetof(struct tcp_congestion_ops, sndbuf_expand),
> +	offsetof(struct tcp_congestion_ops, cong_control),
> +};
> +
> +static u32 unsupported_ops[] = {
> +	offsetof(struct tcp_congestion_ops, get_info),
> +};

Could you elaborate by adding some comments at least how
required fields are handled?

> +
> +static const struct btf_type *tcp_sock_type;
> +static u32 tcp_sock_id, sock_id;
> +
> +static int bpf_tcp_ca_init(struct btf *_btf_vmlinux)
> +{
> +	s32 type_id;
> +
> +	type_id = btf_find_by_name_kind(_btf_vmlinux, "sock", BTF_KIND_STRUCT);
> +	if (type_id < 0)
> +		return -EINVAL;
> +	sock_id = type_id;
> +
> +	type_id = btf_find_by_name_kind(_btf_vmlinux, "tcp_sock",
> +					BTF_KIND_STRUCT);
> +	if (type_id < 0)
> +		return -EINVAL;
> +	tcp_sock_id = type_id;
> +	tcp_sock_type = btf_type_by_id(_btf_vmlinux, tcp_sock_id);
> +
> +	return 0;
> +}
> +
> +static bool check_optional(u32 member_offset)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(optional_ops); i++) {
> +		if (member_offset == optional_ops[i])
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static bool check_unsupported(u32 member_offset)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) {
> +		if (member_offset == unsupported_ops[i])
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
[...]
> +
> +static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
> +	.get_func_proto		= bpf_tcp_ca_get_func_proto,
> +	.is_valid_access	= bpf_tcp_ca_is_valid_access,
> +	.btf_struct_access	= bpf_tcp_ca_btf_struct_access,
> +};
> +
> +static int bpf_tcp_ca_init_member(const struct btf_type *t,
> +				  const struct btf_member *member,
> +				  void *kdata, const void *udata)
> +{
> +	const struct tcp_congestion_ops *utcp_ca;
> +	struct tcp_congestion_ops *tcp_ca;
> +	size_t tcp_ca_name_len;
> +	int prog_fd;
> +	u32 moff;
> +
> +	utcp_ca = (const struct tcp_congestion_ops *)udata;
> +	tcp_ca = (struct tcp_congestion_ops *)kdata;
> +
> +	moff = btf_member_bit_offset(t, member) / 8;
> +	switch (moff) {
> +	case offsetof(struct tcp_congestion_ops, flags):
> +		if (utcp_ca->flags & ~TCP_CONG_MASK)
> +			return -EINVAL;
> +		tcp_ca->flags = utcp_ca->flags;
> +		return 1;
> +	case offsetof(struct tcp_congestion_ops, name):
> +		tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name));
> +		if (!tcp_ca_name_len ||
> +		    tcp_ca_name_len == sizeof(utcp_ca->name))
> +			return -EINVAL;
> +		memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name));
> +		return 1;
> +	}
> +
> +	if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL))
> +		return 0;
> +
> +	prog_fd = (int)(*(unsigned long *)(udata + moff));
> +	if (!prog_fd && !check_optional(moff) && !check_unsupported(moff))
> +		return -EINVAL;

So if a member is option or unsupported, we will return -EINVAL?
I probably miss something here.

> +
> +	return 0;
> +}
> +
> +static int bpf_tcp_ca_check_member(const struct btf_type *t,
> +				   const struct btf_member *member)
> +{
> +	if (check_unsupported(btf_member_bit_offset(t, member) / 8))
> +		return -ENOTSUPP;
> +	return 0;
> +}
> +
> +static int bpf_tcp_ca_reg(void *kdata)
> +{
> +	return tcp_register_congestion_control(kdata);
> +}
> +
> +static void bpf_tcp_ca_unreg(void *kdata)
> +{
> +	tcp_unregister_congestion_control(kdata);
> +}
> +
[...]