Re: [PATCH bpf-next v7 3/4] bpf: try to avoid kzalloc in cgroup/{s,g}etsockopt

From: Stanislav Fomichev <sdf@google.com>
To: Martin KaFai Lau <kafai@fb.com>
Cc: Netdev <netdev@vger.kernel.org>, bpf <bpf@vger.kernel.org>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Song Liu <songliubraving@fb.com>
Subject: Re: [PATCH bpf-next v7 3/4] bpf: try to avoid kzalloc in cgroup/{s,g}etsockopt
Date: Wed, 13 Jan 2021 11:08:30 -0800	[thread overview]
Message-ID: <CAKH8qBu3EVP7EBCO56SemGM-jR6ZMnggidEO16teLqFDDqTStg@mail.gmail.com> (raw)
In-Reply-To: <20210113190342.dzqylb6oqrkfhccv@kafai-mbp.dhcp.thefacebook.com>

On Wed, Jan 13, 2021 at 11:03 AM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Tue, Jan 12, 2021 at 02:38:46PM -0800, Stanislav Fomichev wrote:
> > When we attach a bpf program to cgroup/getsockopt any other getsockopt()
> > syscall starts incurring kzalloc/kfree cost.
> >
> > Let's add a small buffer on the stack and use it for small (majority)
> > {s,g}etsockopt values. The buffer is small enough to fit into
> > the cache line and cover the majority of simple options (most
> > of them are 4 byte ints).
> >
> > It seems natural to do the same for setsockopt, but it's a bit more
> > involved when the BPF program modifies the data (where we have to
> > kmalloc). The assumption is that for the majority of setsockopt
> > calls (which are doing pure BPF options or apply policy) this
> > will bring some benefit as well.
> >
> > Without this patch (we remove about 1% __kmalloc):
> >      3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
> >             |
> >              --3.30%--__cgroup_bpf_run_filter_getsockopt
> >                        |
> >                         --0.81%--__kmalloc
> >
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > Cc: Martin KaFai Lau <kafai@fb.com>
> > Cc: Song Liu <songliubraving@fb.com>
> > ---
> >  include/linux/filter.h |  5 ++++
> >  kernel/bpf/cgroup.c    | 52 ++++++++++++++++++++++++++++++++++++------
> >  2 files changed, 50 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > index 29c27656165b..8739f1d4cac4 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -1281,6 +1281,11 @@ struct bpf_sysctl_kern {
> >       u64 tmp_reg;
> >  };
> >
> > +#define BPF_SOCKOPT_KERN_BUF_SIZE    32
> > +struct bpf_sockopt_buf {
> > +     u8              data[BPF_SOCKOPT_KERN_BUF_SIZE];
> > +};
> > +
> >  struct bpf_sockopt_kern {
> >       struct sock     *sk;
> >       u8              *optval;
> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > index 416e7738981b..dbeef7afbbf9 100644
> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -1298,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
> >       return empty;
> >  }
> >
> > -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> > +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> > +                          struct bpf_sockopt_buf *buf)
> >  {
> >       if (unlikely(max_optlen < 0))
> >               return -EINVAL;
> > @@ -1310,6 +1311,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> >               max_optlen = PAGE_SIZE;
> >       }
> >
> > +     if (max_optlen <= sizeof(buf->data)) {
> > +             /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
> > +              * bytes avoid the cost of kzalloc.
> > +              */
> > +             ctx->optval = buf->data;
> > +             ctx->optval_end = ctx->optval + max_optlen;
> > +             return max_optlen;
> > +     }
> > +
> >       ctx->optval = kzalloc(max_optlen, GFP_USER);
> >       if (!ctx->optval)
> >               return -ENOMEM;
> > @@ -1319,16 +1329,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> >       return max_optlen;
> >  }
> >
> > -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
> > +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> > +                          struct bpf_sockopt_buf *buf)
> >  {
> > +     if (ctx->optval == buf->data)
> > +             return;
> >       kfree(ctx->optval);
> >  }
> >
> > +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
> > +                               struct bpf_sockopt_buf *buf)
> > +{
> > +     return ctx->optval != buf->data;
> > +}
> > +
> >  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> >                                      int *optname, char __user *optval,
> >                                      int *optlen, char **kernel_optval)
> >  {
> >       struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> > +     struct bpf_sockopt_buf buf = {};
> >       struct bpf_sockopt_kern ctx = {
> >               .sk = sk,
> >               .level = *level,
> > @@ -1350,7 +1370,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> >        */
> >       max_optlen = max_t(int, 16, *optlen);
> >
> > -     max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
> > +     max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> >       if (max_optlen < 0)
> >               return max_optlen;
> >
> > @@ -1390,14 +1410,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> >                */
> >               if (ctx.optlen != 0) {
> >                       *optlen = ctx.optlen;
> > -                     *kernel_optval = ctx.optval;
> > +                     /* We've used the on-stack bpf_sockopt_buf as intermediary
> > +                      * storage, but the BPF program indicates that we need
> > +                      * to pass this data to the kernel setsockopt handler.
> > +                      * No way to export on-stack buf, have to allocate a
> > +                      * new buffer.
> > +                      */
> > +                     if (!sockopt_buf_allocated(&ctx, &buf)) {
> > +                             void *p = kzalloc(ctx.optlen, GFP_USER);
> nit: zeroing is unnecessary since memcpy() will be done later.
Sounds good, will switch to kmalloc(), thanks!

> Acked-by: Martin KaFai Lau <kafai@fb.com>