From: Wenjia Zhang <wenjia@linux.ibm.com>
To: Kai Shen <KaiShen@linux.alibaba.com>,
	kgraul@linux.ibm.com, jaka@linux.ibm.com, kuba@kernel.org,
	davem@davemloft.net, dsahern@kernel.org
Cc: netdev@vger.kernel.org, linux-s390@vger.kernel.org,
	linux-rdma@vger.kernel.org
Subject: Re: [PATCH net-next] net/smc: introduce shadow sockets for fallback connections
Date: Wed, 22 Mar 2023 18:09:41 +0100	[thread overview]
Message-ID: <170b35d9-2071-caf3-094e-6abfb7cefa75@linux.ibm.com> (raw)
In-Reply-To: <20230321071959.87786-1-KaiShen@linux.alibaba.com>



On 21.03.23 08:19, Kai Shen wrote:
> SMC-R currently performs poorly in fallback situations, especially
> for short-lived connections on the server side. We are planning to
> make SMC-R widely used, and handling this fallback performance
> issue is really crucial to us. Here we introduce a shadow socket
> method to try to relieve this problem.
> 
Could you please elaborate on the problem?
> Basically, we use two more accept queues to hold incoming
> connections, one for fallback connections and the other for SMC-R
> connections. We implement this method by using two additional
> 'shadow' sockets, making the connection path of fallback connections
> almost the same as that of normal TCP connections.
> 
> Now the SMC-R accept path is like:
>    1. incoming connection
>    2. schedule work to smc sock alloc, tcp accept and push to smc
>       acceptq
>    3. wake up user to accept
> 
> When fallback happens on the server, the accept path is the same,
> which costs more than the normal TCP accept path. In fallback
> situations, step 2 above is not necessary and the smc sock is
> also not needed. So we create two shadow sockets when an smc
> socket starts listening. When a new connection comes in, we pop
> the req to the fallback socket acceptq or the non-fallback socket
> acceptq according to its syn_smc flag. As a result, when fallback
> happens we can graft the user socket onto a normal TCP sock
> instead of an smc sock, avoiding the cost of step 2 and of
> releasing the smc sock.
> 
>                 +-----> non-fallback socket acceptq
>                 |
> incoming req --+
>                 |
>                 +-----> fallback socket acceptq
> 
> With the help of shadow sockets, we achieve performance similar to
> plain TCP connections in short-lived-connection nginx server
> fallback scenarios, as illustrated below.
> 
> The benchmark command was "./wrk http://x.x.x.x:x/
> 	-H 'Connection: Close' -c 1600 -t 32 -d 20 --latency"
> 
> TCP:
>      Requests/sec: 145438.65
>      Transfer/sec:     21.64MB
> 
> Server fallback occasions on original SMC-R:
>      Requests/sec: 114192.82
>      Transfer/sec:     16.99MB
> 
> Server fallback occasions on SMC-R with shadow sockets:
>      Requests/sec: 143528.11
>      Transfer/sec:     21.35MB
> 

Generally, I don't have a good feeling about the two non-listening 
sockets, and I cannot see why it is necessary to introduce the socket 
actsock instead of using the clcsock itself. Maybe you can convince me 
with a good reason.

> On the other hand, as a result of using another accept queue, the
> fastopenq lock is not the right lock to access when accepting. So
> we need to find the right fastopenq lock in inet_csk_accept.
> 
> Signed-off-by: Kai Shen <KaiShen@linux.alibaba.com>
> ---
>   net/ipv4/inet_connection_sock.c |  13 ++-
>   net/smc/af_smc.c                | 143 ++++++++++++++++++++++++++++++--
>   net/smc/smc.h                   |   2 +
>   3 files changed, 150 insertions(+), 8 deletions(-)
> 
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 65ad4251f6fd..ba2ec5ad4c04 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -658,6 +658,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
>   {
>   	struct inet_connection_sock *icsk = inet_csk(sk);
>   	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
> +	spinlock_t *fastopenq_lock = &queue->fastopenq.lock;
>   	struct request_sock *req;
>   	struct sock *newsk;
>   	int error;
> @@ -689,7 +690,15 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
>   
>   	if (sk->sk_protocol == IPPROTO_TCP &&
>   	    tcp_rsk(req)->tfo_listener) {
> -		spin_lock_bh(&queue->fastopenq.lock);
> +#if IS_ENABLED(CONFIG_SMC)
> +		if (tcp_sk(sk)->syn_smc) {
> +			struct request_sock_queue *orig_queue;
> +
> +			orig_queue = &inet_csk(req->rsk_listener)->icsk_accept_queue;
> +			fastopenq_lock = &orig_queue->fastopenq.lock;
> +		}
> +#endif
> +		spin_lock_bh(fastopenq_lock);
>   		if (tcp_rsk(req)->tfo_listener) {
>   			/* We are still waiting for the final ACK from 3WHS
>   			 * so can't free req now. Instead, we set req->sk to
> @@ -700,7 +709,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
>   			req->sk = NULL;
>   			req = NULL;
>   		}
> -		spin_unlock_bh(&queue->fastopenq.lock);
> +		spin_unlock_bh(fastopenq_lock);
>   	}
>   
>   out:
> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index a4cccdfdc00a..ad6c3b9ec9a6 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -126,7 +126,9 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
>   
>   	smc = smc_clcsock_user_data(sk);
>   
> -	if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
> +	if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs)
> +			+ READ_ONCE(smc->actsock->sk->sk_ack_backlog)
> +			+ READ_ONCE(smc->fbsock->sk->sk_ack_backlog) >
>   				sk->sk_max_ack_backlog)
>   		goto drop;
>   
> @@ -286,6 +288,10 @@ static int __smc_release(struct smc_sock *smc)
>   				/* wake up clcsock accept */
>   				rc = kernel_sock_shutdown(smc->clcsock,
>   							  SHUT_RDWR);
> +				if (smc->fbsock)
> +					sock_release(smc->fbsock);
> +				if (smc->actsock)
> +					sock_release(smc->actsock);
>   			}
>   			sk->sk_state = SMC_CLOSED;
>   			sk->sk_state_change(sk);
> @@ -1681,7 +1687,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
>   
>   	mutex_lock(&lsmc->clcsock_release_lock);
>   	if (lsmc->clcsock)
> -		rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
> +		rc = kernel_accept(lsmc->actsock, &new_clcsock, SOCK_NONBLOCK);
>   	mutex_unlock(&lsmc->clcsock_release_lock);
>   	lock_sock(lsk);
>   	if  (rc < 0 && rc != -EAGAIN)
> @@ -2486,9 +2492,46 @@ static void smc_tcp_listen_work(struct work_struct *work)
>   	sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
>   }
>   
> +#define SMC_LINK 1
> +#define FALLBACK_LINK 2
> +static inline int smc_sock_pop_to_another_acceptq(struct smc_sock *lsmc)
> +{
> +	struct sock *lsk = lsmc->clcsock->sk;
> +	struct inet_connection_sock *icsk = inet_csk(lsk);
> +	struct inet_connection_sock *dest_icsk;
> +	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
> +	struct request_sock_queue *dest_queue;
> +	struct request_sock *req;
> +	struct sock *dst_sock;
> +	int ret;
> +
> +	req = reqsk_queue_remove(queue, lsk);
> +	if (!req)
> +		return -EINVAL;
> +
> +	if (tcp_sk(req->sk)->syn_smc || lsmc->sockopt_defer_accept) {
> +		dst_sock = lsmc->actsock->sk;
> +		ret = SMC_LINK;
> +	} else {
> +		dst_sock = lsmc->fbsock->sk;
> +		ret = FALLBACK_LINK;
> +	}
> +
> +	dest_icsk = inet_csk(dst_sock);
> +	dest_queue = &dest_icsk->icsk_accept_queue;
> +
> +	spin_lock_bh(&dest_queue->rskq_lock);
> +	WRITE_ONCE(req->dl_next, dest_queue->rskq_accept_head);
> +	sk_acceptq_added(dst_sock);
> +	dest_queue->rskq_accept_head = req;
> +	spin_unlock_bh(&dest_queue->rskq_lock);
> +	return ret;
> +}
> +
>   static void smc_clcsock_data_ready(struct sock *listen_clcsock)
>   {
>   	struct smc_sock *lsmc;
> +	int ret;
>   
>   	read_lock_bh(&listen_clcsock->sk_callback_lock);
>   	lsmc = smc_clcsock_user_data(listen_clcsock);
> @@ -2496,14 +2539,41 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock)
>   		goto out;
>   	lsmc->clcsk_data_ready(listen_clcsock);
>   	if (lsmc->sk.sk_state == SMC_LISTEN) {
> -		sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
> -		if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
> -			sock_put(&lsmc->sk);
> +		ret = smc_sock_pop_to_another_acceptq(lsmc);
> +		if (ret == SMC_LINK) {
> +			sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
> +			if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
> +				sock_put(&lsmc->sk);
> +		} else if (ret == FALLBACK_LINK) {
> +			lsmc->sk.sk_data_ready(&lsmc->sk);
> +		}
>   	}
>   out:
>   	read_unlock_bh(&listen_clcsock->sk_callback_lock);
>   }
>   
> +static void smc_shadow_socket_init(struct socket *sock)
> +{
> +	struct inet_connection_sock *icsk = inet_csk(sock->sk);
> +	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
> +
> +	tcp_set_state(sock->sk, TCP_LISTEN);
> +	sock->sk->sk_ack_backlog = 0;
> +
> +	inet_csk_delack_init(sock->sk);
> +
> +	spin_lock_init(&queue->rskq_lock);
> +
> +	spin_lock_init(&queue->fastopenq.lock);
> +	queue->fastopenq.rskq_rst_head = NULL;
> +	queue->fastopenq.rskq_rst_tail = NULL;
> +	queue->fastopenq.qlen = 0;
> +
> +	queue->rskq_accept_head = NULL;
> +
> +	tcp_sk(sock->sk)->syn_smc = 1;
> +}
> +
>   static int smc_listen(struct socket *sock, int backlog)
>   {
>   	struct sock *sk = sock->sk;
> @@ -2551,6 +2621,18 @@ static int smc_listen(struct socket *sock, int backlog)
>   	if (smc->limit_smc_hs)
>   		tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
>   
> +	rc = sock_create_kern(sock_net(sk), PF_INET, SOCK_STREAM, IPPROTO_TCP,
> +			      &smc->fbsock);
> +	if (rc)
> +		goto out;
> +	smc_shadow_socket_init(smc->fbsock);
> +
> +	rc = sock_create_kern(sock_net(sk), PF_INET, SOCK_STREAM, IPPROTO_TCP,
> +			      &smc->actsock);
> +	if (rc)
> +		goto out;
> +	smc_shadow_socket_init(smc->actsock);
> +
>   	rc = kernel_listen(smc->clcsock, backlog);
>   	if (rc) {
>   		write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
> @@ -2569,6 +2651,30 @@ static int smc_listen(struct socket *sock, int backlog)
>   	return rc;
>   }
>   
> +static inline bool tcp_reqsk_queue_empty(struct sock *sk)
> +{
> +	struct inet_connection_sock *icsk = inet_csk(sk);
> +	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
> +
> +	return reqsk_queue_empty(queue);
> +}
> +
Since this is only used by SMC, I'd like to suggest using 
smc_tcp_reqsk_queue_empty instead of tcp_reqsk_queue_empty.

> +static inline void
> +smc_restore_fbsock_protocol_family(struct socket *new_sock, struct socket *sock)
> +{
> +	struct smc_sock *lsmc = smc_sk(sock->sk);
> +
> +	new_sock->sk->sk_data_ready = lsmc->fbsock->sk->sk_data_ready;
> +	new_sock->ops = lsmc->fbsock->ops;
> +	new_sock->type = lsmc->fbsock->type;
> +
> +	module_put(sock->ops->owner);
> +	__module_get(new_sock->ops->owner);
> +
> +	if (tcp_sk(new_sock->sk)->syn_smc)
> +		pr_err("new sock is not fallback.\n");
> +}
> +
>   static int smc_accept(struct socket *sock, struct socket *new_sock,
>   		      int flags, bool kern)
>   {
> @@ -2579,6 +2685,18 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
>   	int rc = 0;
>   
>   	lsmc = smc_sk(sk);
> +	/* There is a lock in inet_csk_accept, so to make a fast path we do not lock_sock here */
> +	if (lsmc->sk.sk_state == SMC_LISTEN && !tcp_reqsk_queue_empty(lsmc->fbsock->sk)) {
> +		rc = lsmc->clcsock->ops->accept(lsmc->fbsock, new_sock, O_NONBLOCK, true);
> +		if (rc == -EAGAIN)
> +			goto normal_path;
> +		if (rc < 0)
> +			return rc;
> +		smc_restore_fbsock_protocol_family(new_sock, sock);
> +		return rc;
> +	}
> +
> +normal_path:
>   	sock_hold(sk); /* sock_put below */
>   	lock_sock(sk);
>   
> @@ -2593,6 +2711,18 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
>   	add_wait_queue_exclusive(sk_sleep(sk), &wait);
>   	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
>   		set_current_state(TASK_INTERRUPTIBLE);
> +		if (!tcp_reqsk_queue_empty(lsmc->fbsock->sk)) {
> +			rc = lsmc->clcsock->ops->accept(lsmc->fbsock, new_sock, O_NONBLOCK, true);
> +			if (rc == -EAGAIN)
> +				goto next_round;
> +			if (rc < 0)
> +				break;
> +
> +			smc_restore_fbsock_protocol_family(new_sock, sock);
> +			nsk = new_sock->sk;
> +			break;
> +		}
> +next_round:
>   		if (!timeo) {
>   			rc = -EAGAIN;
>   			break;
> @@ -2731,7 +2861,8 @@ static __poll_t smc_accept_poll(struct sock *parent)
>   	__poll_t mask = 0;
>   
>   	spin_lock(&isk->accept_q_lock);
> -	if (!list_empty(&isk->accept_q))
> +	if (!list_empty(&isk->accept_q) ||
> +	    !reqsk_queue_empty(&inet_csk(isk->fbsock->sk)->icsk_accept_queue))
>   		mask = EPOLLIN | EPOLLRDNORM;
>   	spin_unlock(&isk->accept_q_lock);
>   
> diff --git a/net/smc/smc.h b/net/smc/smc.h
> index 5ed765ea0c73..9a62c8f37e26 100644
> --- a/net/smc/smc.h
> +++ b/net/smc/smc.h
> @@ -241,6 +241,8 @@ struct smc_connection {
>   struct smc_sock {				/* smc sock container */
>   	struct sock		sk;
>   	struct socket		*clcsock;	/* internal tcp socket */
> +	struct socket		*fbsock;	/* socket for fallback connection */
> +	struct socket		*actsock;	/* socket for non-fallback connection */
>   	void			(*clcsk_state_change)(struct sock *sk);
>   						/* original stat_change fct. */
>   	void			(*clcsk_data_ready)(struct sock *sk);


Thread overview: 7+ messages
2023-03-21  7:19 [PATCH net-next] net/smc: introduce shadow sockets for fallback connections Kai Shen
2023-03-22 13:08 ` Paolo Abeni
2023-03-24  8:21   ` Kai
2023-03-22 17:09 ` Wenjia Zhang [this message]
2023-03-24  7:26   ` Kai
2023-03-29  9:41     ` Wenjia Zhang
2023-04-03 10:18       ` Kai
