From: Yonghong Song <yhs@fb.com>
To: Martin KaFai Lau <kafai@fb.com>, <bpf@vger.kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Eric Dumazet <edumazet@google.com>, <kernel-team@fb.com>,
	Neal Cardwell <ncardwell@google.com>, <netdev@vger.kernel.org>,
	Yuchung Cheng <ycheng@google.com>
Subject: Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
Date: Tue, 29 Jun 2021 10:27:17 -0700
Message-ID: <6be60772-4d2a-30b0-5ebb-f857db31c037@fb.com>
In-Reply-To: <20210625200523.726854-1-kafai@fb.com>



On 6/25/21 1:05 PM, Martin KaFai Lau wrote:
> This patch does batching and lock_sock for the bpf tcp iter.
> It does not affect the proc fs iteration.
> 
> With bpf-tcp-cc, new algorithm rollouts happen more often.  Instead of
> restarting the application to pick up the new tcp-cc, the next patch
> will allow a bpf iter with CAP_NET_ADMIN to do setsockopt(TCP_CONGESTION).
> This requires locking the sock.
> 
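For context, a minimal sketch of the use case described above,
assuming the bpf_setsockopt() support that patch 7/8 adds (the
program name and the cc name here are hypothetical, not from this
series):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Plain macros, so not emitted into vmlinux.h. */
#define SOL_TCP		6
#define TCP_CONGESTION	13

char _license[] SEC("license") = "GPL";

/* Switch every tcp_sock the iter walks over to a new tcp-cc,
 * without restarting the applications.
 */
SEC("iter/tcp")
int rollout_new_cc(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;
	struct tcp_sock *tp;
	char cc[] = "bpf_cubic";

	if (!sk_common)
		return 0;

	/* Get a btf-typed tcp_sock that bpf_setsockopt() accepts. */
	tp = bpf_skc_to_tcp_sock(sk_common);
	if (!tp)
		return 0;

	/* Needs the sock lock, hence the lock_sock_fast() this
	 * patch takes before show().
	 */
	bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
	return 0;
}
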
> Also, unlike the proc iteration (cat /proc/net/tcp[6]), the bpf iter
> can inspect all fields of a tcp_sock.  It will be useful to have a
> consistent view on some of the fields (e.g. the ones reported in
> tcp_get_info(), which also acquires the sock lock).
> 
> Double lock: locking the bucket first and then locking the sock could
> lead to deadlock.  This patch takes a batching approach similar to
> inet_diag.  While holding the bucket lock, it first batches a number
> of sockets into an array and then unlocks the bucket.  Before doing
> show(), it then calls lock_sock_fast().
> 
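To make the lock ordering concrete: the pattern being described is
roughly the following (a simplified sketch only; the actual batching
logic lives in bpf_iter_tcp_batch() further below and also handles
the listening hash, bucket resume, and batch array growth):

	/* Phase 1: under the bucket lock, only grab refcounts;
	 * do not take any sock lock here.
	 */
	spin_lock(bucket_lock);
	sk_nulls_for_each(sk, node, &head->chain) {
		sock_hold(sk);
		iter->batch[iter->end_sk++] = sk;
	}
	spin_unlock(bucket_lock);

	/* Phase 2: the bucket lock is dropped, so taking the sock
	 * lock can no longer deadlock against a path that locks
	 * the sock first and the bucket second.
	 */
	while (iter->cur_sk < iter->end_sk) {
		sk = iter->batch[iter->cur_sk++];
		slow = lock_sock_fast(sk);
		/* ... seq_show() runs the bpf prog here ... */
		unlock_sock_fast(sk, slow);
		sock_put(sk);
	}
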
> On a machine with ~400k connections, the maximum number of
> sk in a bucket of the established hashtable is 7.  0.02% of
> the established connections fall into buckets of this size.
> 
> For the listen hash (port+addr lhash2), the bucket is usually very
> small as well, except for the SO_REUSEPORT use case, in which
> userspace could have one SO_REUSEPORT socket per thread.
> 
> When the whole bucket is batched, batching also minimizes the chance
> of missing a sock in the setsockopt use case.  This patch starts with
> a batch array of INIT_BATCH_SZ (16), which is enough for the most
> common cases.  bpf_iter_tcp_batch() will try to realloc to a larger
> array to handle the exceptional cases (e.g. the SO_REUSEPORT case in
> lhash2).
> 
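The caller of bpf_iter_tcp_realloc_batch() is elided below; as an
illustration, the grow-and-retry presumably looks something like this
(the exact growth factor is a guess on my part):

	if (iter->end_sk == iter->max_sk) {
		/* The bucket overflowed the batch array (e.g. a big
		 * SO_REUSEPORT group): get a bigger array and
		 * re-batch the whole bucket.
		 */
		if (bpf_iter_tcp_realloc_batch(iter, iter->max_sk * 2))
			return ERR_PTR(-ENOMEM);
	}
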
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>   net/ipv4/tcp_ipv4.c | 236 ++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 230 insertions(+), 6 deletions(-)
> 
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 0d851289a89e..856144d33f52 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2687,6 +2687,15 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
>   }
>   
>   #ifdef CONFIG_BPF_SYSCALL
> +struct bpf_tcp_iter_state {
> +	struct tcp_iter_state state;
> +	unsigned int cur_sk;
> +	unsigned int end_sk;
> +	unsigned int max_sk;
> +	struct sock **batch;
> +	bool st_bucket_done;
> +};
> +
>   struct bpf_iter__tcp {
>   	__bpf_md_ptr(struct bpf_iter_meta *, meta);
>   	__bpf_md_ptr(struct sock_common *, sk_common);
> @@ -2705,16 +2714,203 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
>   	return bpf_iter_run_prog(prog, &ctx);
>   }
>   
> +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
> +{
> +	while (iter->cur_sk < iter->end_sk)
> +		sock_put(iter->batch[iter->cur_sk++]);
> +}
> +
> +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
> +				      unsigned int new_batch_sz)
> +{
> +	struct sock **new_batch;
> +
> +	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);

Since we return -ENOMEM below, should we have __GFP_NOWARN in the
kvmalloc flags?
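Something like this, since the caller already copes with the
allocation failure:

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);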

> +	if (!new_batch)
> +		return -ENOMEM;
> +
> +	bpf_iter_tcp_put_batch(iter);
> +	kvfree(iter->batch);
> +	iter->batch = new_batch;
> +	iter->max_sk = new_batch_sz;
> +
> +	return 0;
> +}
> +
[...]
> +
>   static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
>   {
>   	struct bpf_iter_meta meta;
>   	struct bpf_prog *prog;
>   	struct sock *sk = v;
> +	bool slow;
>   	uid_t uid;
> +	int ret;
>   
>   	if (v == SEQ_START_TOKEN)
>   		return 0;
>   
> +	if (sk_fullsock(sk))
> +		slow = lock_sock_fast(sk);
> +
> +	if (unlikely(sk_unhashed(sk))) {
> +		ret = SEQ_SKIP;
> +		goto unlock;
> +	}

I am not a tcp expert, so maybe a naive question:
is it possible to do setsockopt() on a listening socket?
What will happen if the listening sock is unhashed after the
above check?

> +
>   	if (sk->sk_state == TCP_TIME_WAIT) {
>   		uid = 0;
>   	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
> @@ -2728,11 +2924,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
>   
>   	meta.seq = seq;
>   	prog = bpf_iter_get_info(&meta, false);
> -	return tcp_prog_seq_show(prog, &meta, v, uid);
> +	ret = tcp_prog_seq_show(prog, &meta, v, uid);
> +
> +unlock:
> +	if (sk_fullsock(sk))
> +		unlock_sock_fast(sk, slow);
> +	return ret;
> +
>   }
>   
>   static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
>   {
> +	struct bpf_tcp_iter_state *iter = seq->private;
>   	struct bpf_iter_meta meta;
>   	struct bpf_prog *prog;
>   
> @@ -2743,13 +2946,16 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
>   			(void)tcp_prog_seq_show(prog, &meta, v, 0);
>   	}
>   
> -	tcp_seq_stop(seq, v);
> +	if (iter->cur_sk < iter->end_sk) {
> +		bpf_iter_tcp_put_batch(iter);
> +		iter->st_bucket_done = false;
> +	}
>   }
>   
[...]
