From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
To: "David S . Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>,
Eric Dumazet <edumazet@google.com>,
Alexei Starovoitov <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
Andrii Nakryiko <andrii@kernel.org>,
Martin KaFai Lau <kafai@fb.com>
Cc: Benjamin Herrenschmidt <benh@amazon.com>,
Kuniyuki Iwashima <kuniyu@amazon.co.jp>,
Kuniyuki Iwashima <kuni1840@gmail.com>, <bpf@vger.kernel.org>,
<netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Subject: [PATCH v3 bpf-next 04/11] tcp: Add reuseport_migrate_sock() to select a new listener.
Date: Wed, 21 Apr 2021 00:41:33 +0900 [thread overview]
Message-ID: <20210420154140.80034-5-kuniyu@amazon.co.jp> (raw)
In-Reply-To: <20210420154140.80034-1-kuniyu@amazon.co.jp>
reuseport_migrate_sock() does the same check done in
reuseport_listen_stop_sock(). If the reuseport group is capable of
migration, reuseport_migrate_sock() selects a new listener by the child
socket hash and increments the listener's sk_refcnt beforehand. Thus, if we
fail in the migration, we have to decrement it later.
We will support migration by eBPF in the later commits.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
include/net/sock_reuseport.h | 3 ++
net/core/sock_reuseport.c | 78 +++++++++++++++++++++++++++++-------
2 files changed, 67 insertions(+), 14 deletions(-)
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 1333d0cddfbc..473b0b0fa4ab 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -37,6 +37,9 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
u32 hash,
struct sk_buff *skb,
int hdr_len);
+struct sock *reuseport_migrate_sock(struct sock *sk,
+ struct sock *migrating_sk,
+ struct sk_buff *skb);
extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
extern int reuseport_detach_prog(struct sock *sk);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index d5fb0ad12e87..a2bca39ec0e3 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -44,7 +44,7 @@ static void __reuseport_add_sock(struct sock *sk,
struct sock_reuseport *reuse)
{
reuse->socks[reuse->num_socks] = sk;
- /* paired with smp_rmb() in reuseport_select_sock() */
+ /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
smp_wmb();
reuse->num_socks++;
}
@@ -435,6 +435,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
return reuse->socks[index];
}
+static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
+ u32 hash, u16 num_socks)
+{
+ int i, j;
+
+ i = j = reciprocal_scale(hash, num_socks);
+ while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+ i++;
+ if (i >= num_socks)
+ i = 0;
+ if (i == j)
+ return NULL;
+ }
+
+ return reuse->socks[i];
+}
+
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
@@ -478,19 +495,8 @@ struct sock *reuseport_select_sock(struct sock *sk,
select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
- if (!sk2) {
- int i, j;
-
- i = j = reciprocal_scale(hash, socks);
- while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
- i++;
- if (i >= socks)
- i = 0;
- if (i == j)
- goto out;
- }
- sk2 = reuse->socks[i];
- }
+ if (!sk2)
+ sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
}
out:
@@ -499,6 +505,50 @@ struct sock *reuseport_select_sock(struct sock *sk,
}
EXPORT_SYMBOL(reuseport_select_sock);
+/**
+ * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: close()ed or shutdown()ed socket in the group.
+ * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
+ * NEW_SYN_RECV request socket during 3WHS.
+ * @skb: skb to run through BPF filter.
+ * Returns a socket (with sk_refcnt +1) that should accept the child socket
+ * (or NULL on error).
+ */
+struct sock *reuseport_migrate_sock(struct sock *sk,
+ struct sock *migrating_sk,
+ struct sk_buff *skb)
+{
+ struct sock_reuseport *reuse;
+ struct sock *nsk = NULL;
+ u16 socks;
+ u32 hash;
+
+ rcu_read_lock();
+
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+ if (!reuse)
+ goto out;
+
+ socks = READ_ONCE(reuse->num_socks);
+ if (unlikely(!socks))
+ goto out;
+
+ /* paired with smp_wmb() in __reuseport_add_sock() */
+ smp_rmb();
+
+ hash = migrating_sk->sk_hash;
+ if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+ nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
+
+ if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
+ nsk = NULL;
+
+out:
+ rcu_read_unlock();
+ return nsk;
+}
+EXPORT_SYMBOL(reuseport_migrate_sock);
+
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
--
2.30.2
next prev parent reply other threads:[~2021-04-20 15:44 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-04-20 15:41 [PATCH v3 bpf-next 00/11] Socket migration for SO_REUSEPORT Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 01/11] net: Introduce net.ipv4.tcp_migrate_req Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 02/11] tcp: Add num_closed_socks to struct sock_reuseport Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 03/11] tcp: Keep TCP_CLOSE sockets in the reuseport group Kuniyuki Iwashima
2021-04-20 15:41 ` Kuniyuki Iwashima [this message]
2021-04-20 15:41 ` [PATCH v3 bpf-next 05/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 06/11] tcp: Migrate TCP_NEW_SYN_RECV requests at retransmitting SYN+ACKs Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 07/11] tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 08/11] bpf: Support BPF_FUNC_get_socket_cookie() for BPF_PROG_TYPE_SK_REUSEPORT Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 09/11] bpf: Support socket migration by eBPF Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 10/11] libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT Kuniyuki Iwashima
2021-04-20 15:41 ` [PATCH v3 bpf-next 11/11] bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE Kuniyuki Iwashima
2021-04-20 22:57 ` Andrii Nakryiko
2021-04-21 11:34 ` Kuniyuki Iwashima
2021-04-20 16:43 ` [PATCH v3 bpf-next 00/11] Socket migration for SO_REUSEPORT Eric Dumazet
2021-04-21 11:30 ` Kuniyuki Iwashima
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210420154140.80034-5-kuniyu@amazon.co.jp \
--to=kuniyu@amazon.co.jp \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=benh@amazon.com \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kafai@fb.com \
--cc=kuba@kernel.org \
--cc=kuni1840@gmail.com \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).