All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
To: "David S . Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>,
	Eric Dumazet <edumazet@google.com>,
	Neal Cardwell <ncardwell@google.com>,
	Yuchung Cheng <ycheng@google.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <kafai@fb.com>
Cc: Benjamin Herrenschmidt <benh@amazon.com>,
	Kuniyuki Iwashima <kuniyu@amazon.co.jp>,
	Kuniyuki Iwashima <kuni1840@gmail.com>, <bpf@vger.kernel.org>,
	<netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Subject: [PATCH v8 bpf-next 05/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.
Date: Sat, 12 Jun 2021 21:32:18 +0900	[thread overview]
Message-ID: <20210612123224.12525-6-kuniyu@amazon.co.jp> (raw)
In-Reply-To: <20210612123224.12525-1-kuniyu@amazon.co.jp>

When we call close() or shutdown() for listening sockets, each child socket
in the accept queue is freed in inet_csk_listen_stop(). If we can get a
new listener by reuseport_migrate_sock() and clone the request by
inet_reqsk_clone(), we try to add it into the new listener's accept queue
by inet_csk_reqsk_queue_add(). If it fails, we have to call __reqsk_free()
to call sock_put() for its listener and free the cloned request.

After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets
ireq_opt/pktopts in struct inet_request_sock to NULL, but ipv6_opt can still
be non-NULL. So, we have to set ipv6_opt of the old request to NULL to avoid
a double free.

Note that we do not update req->rsk_listener and instead clone the req to
migrate because another path may reference the original request. If we
protected it by RCU, we would need to add rcu_read_lock() in many places.

Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv4/inet_connection_sock.c | 70 ++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fa806e9167ec..08878ef1bc70 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -695,6 +695,52 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
 }
 EXPORT_SYMBOL(inet_rtx_syn_ack);
 
+static struct request_sock *inet_reqsk_clone(struct request_sock *req,
+					     struct sock *sk)
+{
+	struct sock *req_sk, *nreq_sk;
+	struct request_sock *nreq;
+
+	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+	if (!nreq) {
+		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+		sock_put(sk);
+		return NULL;
+	}
+
+	req_sk = req_to_sk(req);
+	nreq_sk = req_to_sk(nreq);
+
+	memcpy(nreq_sk, req_sk,
+	       offsetof(struct sock, sk_dontcopy_begin));
+	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+	sk_node_init(&nreq_sk->sk_node);
+	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_XPS
+	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+
+	nreq->rsk_listener = sk;
+
+	/* We need not acquire fastopenq->lock
+	 * because the child socket is locked in inet_csk_listen_stop().
+	 */
+	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
+		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+	return nreq;
+}
+
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	inet_rsk(req)->ipv6_opt = NULL;
+#endif
+}
+
 /* return true if req was found in the ehash table */
 static bool reqsk_queue_unlink(struct request_sock *req)
 {
@@ -1036,14 +1082,36 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * of the variants now.			--ANK
 	 */
 	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
-		struct sock *child = req->sk;
+		struct sock *child = req->sk, *nsk;
+		struct request_sock *nreq;
 
 		local_bh_disable();
 		bh_lock_sock(child);
 		WARN_ON(sock_owned_by_user(child));
 		sock_hold(child);
 
+		nsk = reuseport_migrate_sock(sk, child, NULL);
+		if (nsk) {
+			nreq = inet_reqsk_clone(req, nsk);
+			if (nreq) {
+				refcount_set(&nreq->rsk_refcnt, 1);
+
+				if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+					reqsk_migrate_reset(req);
+				} else {
+					reqsk_migrate_reset(nreq);
+					__reqsk_free(nreq);
+				}
+
+				/* inet_csk_reqsk_queue_add() has already
+				 * called inet_child_forget() on failure case.
+				 */
+				goto skip_child_forget;
+			}
+		}
+
 		inet_child_forget(sk, req, child);
+skip_child_forget:
 		reqsk_put(req);
 		bh_unlock_sock(child);
 		local_bh_enable();
-- 
2.30.2


  parent reply	other threads:[~2021-06-12 12:34 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-12 12:32 [PATCH v8 bpf-next 00/11] Socket migration for SO_REUSEPORT Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 01/11] net: Introduce net.ipv4.tcp_migrate_req Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 02/11] tcp: Add num_closed_socks to struct sock_reuseport Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 03/11] tcp: Keep TCP_CLOSE sockets in the reuseport group Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 04/11] tcp: Add reuseport_migrate_sock() to select a new listener Kuniyuki Iwashima
2021-06-12 12:32 ` Kuniyuki Iwashima [this message]
2021-06-12 12:32 ` [PATCH v8 bpf-next 06/11] tcp: Migrate TCP_NEW_SYN_RECV requests at retransmitting SYN+ACKs Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 07/11] tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 08/11] bpf: Support BPF_FUNC_get_socket_cookie() for BPF_PROG_TYPE_SK_REUSEPORT Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 09/11] bpf: Support socket migration by eBPF Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 10/11] libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT Kuniyuki Iwashima
2021-06-12 12:32 ` [PATCH v8 bpf-next 11/11] bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE Kuniyuki Iwashima
2021-06-15 15:35 ` [PATCH v8 bpf-next 00/11] Socket migration for SO_REUSEPORT Eric Dumazet
2021-06-15 16:03   ` Kuniyuki Iwashima
2021-06-15 16:19     ` Daniel Borkmann

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210612123224.12525-6-kuniyu@amazon.co.jp \
    --to=kuniyu@amazon.co.jp \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=benh@amazon.com \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=kafai@fb.com \
    --cc=kuba@kernel.org \
    --cc=kuni1840@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=ycheng@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.