From: Yonghong Song <yhs@fb.com>
To: Martin KaFai Lau <kafai@fb.com>, <bpf@vger.kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Eric Dumazet <edumazet@google.com>, <kernel-team@fb.com>,
	Neal Cardwell <ncardwell@google.com>, <netdev@vger.kernel.org>,
	Yuchung Cheng <ycheng@google.com>
Subject: Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
Date: Tue, 29 Jun 2021 10:27:17 -0700
Message-ID: <6be60772-4d2a-30b0-5ebb-f857db31c037@fb.com>
In-Reply-To: <20210625200523.726854-1-kafai@fb.com>



On 6/25/21 1:05 PM, Martin KaFai Lau wrote:
> This patch does batching and lock_sock for the bpf tcp iter.
> It does not affect the proc fs iteration.
> 
> With bpf-tcp-cc, new algorithm rollouts happen more often.  Instead of
> restarting the application to pick up the new tcp-cc, the next patch
> will allow a bpf iter with CAP_NET_ADMIN to do setsockopt(TCP_CONGESTION).
> This requires locking the sock.
> 
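For context, a minimal sketch of the use case described above,
assuming the bpf_setsockopt() support that patch 7/8 adds (the
program name and the cc name here are hypothetical, not from this
series):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Plain macros, so not emitted into vmlinux.h. */
#define SOL_TCP		6
#define TCP_CONGESTION	13

char _license[] SEC("license") = "GPL";

/* Switch every tcp_sock the iter walks over to a new tcp-cc,
 * without restarting the applications.
 */
SEC("iter/tcp")
int rollout_new_cc(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;
	struct tcp_sock *tp;
	char cc[] = "bpf_cubic";

	if (!sk_common)
		return 0;

	/* Get a btf-typed tcp_sock that bpf_setsockopt() accepts. */
	tp = bpf_skc_to_tcp_sock(sk_common);
	if (!tp)
		return 0;

	/* Needs the sock lock, hence the lock_sock_fast() this
	 * patch takes before show().
	 */
	bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
	return 0;
}
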
> Also, unlike the proc iteration (cat /proc/net/tcp[6]), the bpf iter
> can inspect all fields of a tcp_sock.  It will be useful to have a
> consistent view on some of the fields (e.g. the ones reported in
> tcp_get_info(), which also acquires the sock lock).
> 
> Double lock: locking the bucket first and then locking the sock could
> lead to deadlock.  This patch takes a batching approach similar to
> inet_diag.  While holding the bucket lock, it first batches a number
> of sockets into an array and then unlocks the bucket.  Before doing
> show(), it then calls lock_sock_fast().
> 
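To make the lock ordering concrete: the pattern being described is
roughly the following (a simplified sketch only; the actual batching
logic lives in bpf_iter_tcp_batch() further below and also handles
the listening hash, bucket resume, and batch array growth):

	/* Phase 1: under the bucket lock, only grab refcounts;
	 * do not take any sock lock here.
	 */
	spin_lock(bucket_lock);
	sk_nulls_for_each(sk, node, &head->chain) {
		sock_hold(sk);
		iter->batch[iter->end_sk++] = sk;
	}
	spin_unlock(bucket_lock);

	/* Phase 2: the bucket lock is dropped, so taking the sock
	 * lock can no longer deadlock against a path that locks
	 * the sock first and the bucket second.
	 */
	while (iter->cur_sk < iter->end_sk) {
		sk = iter->batch[iter->cur_sk++];
		slow = lock_sock_fast(sk);
		/* ... seq_show() runs the bpf prog here ... */
		unlock_sock_fast(sk, slow);
		sock_put(sk);
	}
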
> On a machine with ~400k connections, the maximum number of
> sk in a bucket of the established hashtable is 7.  0.02% of
> the established connections fall into buckets of this size.
> 
> For the listen hash (port+addr lhash2), the bucket is usually very
> small as well, except for the SO_REUSEPORT use case, in which
> userspace could have one SO_REUSEPORT socket per thread.
> 
> When the whole bucket is batched, batching also minimizes the chance
> of missing a sock in the setsockopt use case.  This patch starts with
> a batch array of INIT_BATCH_SZ (16), which is enough for the most
> common cases.  bpf_iter_tcp_batch() will try to realloc to a larger
> array to handle the exceptional cases (e.g. the SO_REUSEPORT case in
> lhash2).
> 
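The caller of bpf_iter_tcp_realloc_batch() is elided below; as an
illustration, the grow-and-retry presumably looks something like this
(the exact growth factor is a guess on my part):

	if (iter->end_sk == iter->max_sk) {
		/* The bucket overflowed the batch array (e.g. a big
		 * SO_REUSEPORT group): get a bigger array and
		 * re-batch the whole bucket.
		 */
		if (bpf_iter_tcp_realloc_batch(iter, iter->max_sk * 2))
			return ERR_PTR(-ENOMEM);
	}
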
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>   net/ipv4/tcp_ipv4.c | 236 ++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 230 insertions(+), 6 deletions(-)
> 
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 0d851289a89e..856144d33f52 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2687,6 +2687,15 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
>   }
>   
>   #ifdef CONFIG_BPF_SYSCALL
> +struct bpf_tcp_iter_state {
> +	struct tcp_iter_state state;
> +	unsigned int cur_sk;
> +	unsigned int end_sk;
> +	unsigned int max_sk;
> +	struct sock **batch;
> +	bool st_bucket_done;
> +};
> +
>   struct bpf_iter__tcp {
>   	__bpf_md_ptr(struct bpf_iter_meta *, meta);
>   	__bpf_md_ptr(struct sock_common *, sk_common);
> @@ -2705,16 +2714,203 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
>   	return bpf_iter_run_prog(prog, &ctx);
>   }
>   
> +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
> +{
> +	while (iter->cur_sk < iter->end_sk)
> +		sock_put(iter->batch[iter->cur_sk++]);
> +}
> +
> +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
> +				      unsigned int new_batch_sz)
> +{
> +	struct sock **new_batch;
> +
> +	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);

Since we return -ENOMEM below, should we have __GFP_NOWARN in the
kvmalloc flags?
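Something like this, since the caller already copes with the
allocation failure:

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);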

> +	if (!new_batch)
> +		return -ENOMEM;
> +
> +	bpf_iter_tcp_put_batch(iter);
> +	kvfree(iter->batch);
> +	iter->batch = new_batch;
> +	iter->max_sk = new_batch_sz;
> +
> +	return 0;
> +}
> +
[...]
> +
>   static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
>   {
>   	struct bpf_iter_meta meta;
>   	struct bpf_prog *prog;
>   	struct sock *sk = v;
> +	bool slow;
>   	uid_t uid;
> +	int ret;
>   
>   	if (v == SEQ_START_TOKEN)
>   		return 0;
>   
> +	if (sk_fullsock(sk))
> +		slow = lock_sock_fast(sk);
> +
> +	if (unlikely(sk_unhashed(sk))) {
> +		ret = SEQ_SKIP;
> +		goto unlock;
> +	}

I am not a tcp expert, so maybe a naive question:
is it possible to do setsockopt() on a listening socket?
What will happen if the listening sock is unhashed after the
above check?

> +
>   	if (sk->sk_state == TCP_TIME_WAIT) {
>   		uid = 0;
>   	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
> @@ -2728,11 +2924,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
>   
>   	meta.seq = seq;
>   	prog = bpf_iter_get_info(&meta, false);
> -	return tcp_prog_seq_show(prog, &meta, v, uid);
> +	ret = tcp_prog_seq_show(prog, &meta, v, uid);
> +
> +unlock:
> +	if (sk_fullsock(sk))
> +		unlock_sock_fast(sk, slow);
> +	return ret;
> +
>   }
>   
>   static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
>   {
> +	struct bpf_tcp_iter_state *iter = seq->private;
>   	struct bpf_iter_meta meta;
>   	struct bpf_prog *prog;
>   
> @@ -2743,13 +2946,16 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
>   			(void)tcp_prog_seq_show(prog, &meta, v, 0);
>   	}
>   
> -	tcp_seq_stop(seq, v);
> +	if (iter->cur_sk < iter->end_sk) {
> +		bpf_iter_tcp_put_batch(iter);
> +		iter->st_bucket_done = false;
> +	}
>   }
>   
[...]
