All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [MPTCP] [RFC PATCH v3 16/16] mptcp: Make connection_list a real list of subflows
@ 2018-10-09 22:15 Christoph Paasch
  0 siblings, 0 replies; 4+ messages in thread
From: Christoph Paasch @ 2018-10-09 22:15 UTC (permalink / raw)
  To: mptcp

[-- Attachment #1: Type: text/plain, Size: 15479 bytes --]

On 09/10/18 - 14:57:55, Mat Martineau wrote:
> On Mon, 8 Oct 2018, Christoph Paasch wrote:
> 
> > On 05/10/18 - 15:59:18, Mat Martineau wrote:
> > > From: Peter Krystad <peter.krystad(a)intel.com>
> > > 
> > > Signed-off-by: Peter Krystad <peter.krystad(a)intel.com>
> > > ---
> > >  include/net/mptcp.h  |   8 +-
> > >  net/mptcp/protocol.c | 193 +++++++++++++++++++++++++++++--------------
> > >  2 files changed, 139 insertions(+), 62 deletions(-)
> > > 
> > > diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> > > index 6c9516360de4..ff8db00bcafc 100644
> > > --- a/include/net/mptcp.h
> > > +++ b/include/net/mptcp.h
> > > @@ -17,6 +17,7 @@
> > >  #define __NET_MPTCP_H
> > > 
> > >  #include <linux/tcp.h>
> > > +#include <linux/spinlock.h>
> > > 
> > >  /* MPTCP option subtypes */
> > > 
> > > @@ -44,10 +45,14 @@ struct mptcp_sock {
> > >  	u64		write_seq;
> > >  	atomic64_t	ack_seq;
> > >  	u32		token;
> > > -	struct socket	*connection_list; /* @@ needs to be a list */
> > > +	spinlock_t	conn_list_lock;
> > > +	struct hlist_head conn_list;
> > >  	struct socket	*subflow; /* outgoing connect, listener or !mp_capable */
> > >  };
> > > 
> > > +#define mptcp_for_each_subflow(__msk, __subflow)			\
> > > +	hlist_for_each_entry_rcu(__subflow, &((__msk)->conn_list), node)
> > > +
> > >  static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
> > >  {
> > >  	return (struct mptcp_sock *)sk;
> > > @@ -77,6 +82,7 @@ static inline struct mptcp_skb_cb *mptcp_skb_priv_cb(struct sk_buff *skb)
> > >  struct subflow_sock {
> > >  	/* tcp_sock must be the first member */
> > >  	struct	tcp_sock sk;
> > > +	struct	hlist_node node;// conn_list of subflows
> > >  	u64	local_key;
> > >  	u64	map_seq;
> > >  	u32	map_subflow_seq;
> > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > > index 911428122c9c..29f4d22fdad2 100644
> > > --- a/net/mptcp/protocol.c
> > > +++ b/net/mptcp/protocol.c
> > > @@ -70,35 +70,45 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> > >  	size_t psize;
> > > 
> > >  	pr_debug("msk=%p", msk);
> > > -	if (!msk->connection_list && msk->subflow) {
> > > +	if (msk->subflow) {
> > >  		pr_debug("fallback passthrough");
> > >  		return sock_sendmsg(msk->subflow, msg);
> > >  	}
> > > 
> > > +	rcu_read_lock();
> > > +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
> > > +	subflow = hlist_entry(node, struct subflow_sock, node);
> > > +	ssk = sock_sk(subflow);
> > > +	sock_hold(ssk);
> > > +	rcu_read_unlock();
> > > +
> > >  	if (!msg_data_left(msg)) {
> > >  		pr_debug("empty send");
> > > -		return sock_sendmsg(msk->connection_list, msg);
> > > +		ret = sock_sendmsg(sock_sk(subflow)->sk_socket, msg);
> > > +		goto put_out;
> > >  	}
> > > 
> > > -	ssk = msk->connection_list->sk;
> > > -
> > >  	pr_debug("conn_list->subflow=%p", subflow);
> > > 
> > > -	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
> > > -		return -ENOTSUPP;
> > > +	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) {
> > > +		ret = -ENOTSUPP;
> > > +		goto put_out;
> > > +	}
> > > 
> > >  	/* Initial experiment: new page per send.  Real code will
> > >  	 * maintain list of active pages and DSS mappings, append to the
> > >  	 * end and honor zerocopy
> > >  	 */
> > >  	page = alloc_page(GFP_KERNEL);
> > > -	if (!page)
> > > -		return -ENOMEM;
> > > +	if (!page) {
> > > +		ret = -ENOMEM;
> > > +		goto put_out;
> > > +	}
> > > 
> > >  	mcb = kzalloc(sizeof(*mcb), GFP_KERNEL);
> > >  	if (!mcb) {
> > > -		put_page(page);
> > > -		return -ENOMEM;
> > > +		ret = -ENOMEM;
> > > +		goto put_out;
> > >  	}
> > > 
> > >  	/* Copy to page */
> > > @@ -110,9 +120,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> > >  	pr_debug("left=%ld", msg_data_left(msg));
> > > 
> > >  	if (!psize) {
> > > -		put_page(page);
> > > -		kfree(mcb);
> > > -		return -EINVAL;
> > > +		ret = -EINVAL;
> > > +		goto put_out;
> > >  	}
> > > 
> > >  	lock_sock(sk);
> > > @@ -129,9 +138,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> > > 
> > >  	ret = do_tcp_sendpages(ssk, page, poffset, min_t(int, size_goal, psize),
> > >  			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
> > > -	put_page(page);
> > >  	if (ret <= 0)
> > > -		goto error_out;
> > > +		goto release_out;
> > > 
> > >  	if (skb == tcp_write_queue_tail(ssk))
> > >  		pr_err("no new skb %p/%p", sk, ssk);
> > > @@ -167,10 +175,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> > > 
> > >  	tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
> > > 
> > > -error_out:
> > > +release_out:
> > >  	release_sock(ssk);
> > >  	release_sock(sk);
> > > 
> > > +put_out:
> > > +	if (page)
> > > +		put_page(page);
> > > +
> > > +	sock_put(ssk);
> > >  	kfree(mcb);
> > >  	return ret;
> > >  }
> > > @@ -290,19 +303,25 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> > >  	struct mptcp_sock *msk = mptcp_sk(sk);
> > >  	struct subflow_sock *subflow;
> > >  	struct mptcp_read_arg arg;
> > > +	struct hlist_node *node;
> > >  	read_descriptor_t desc;
> > >  	struct tcp_sock *tp;
> > >  	struct sock *ssk;
> > >  	int copied = 0;
> > >  	long timeo;
> > > 
> > > -	if (!msk->connection_list) {
> > > +	if (msk->subflow) {
> > >  		pr_debug("fallback-read subflow=%p", msk->subflow->sk);
> > >  		return sock_recvmsg(msk->subflow, msg, flags);
> > >  	}
> > > 
> > > -	ssk = msk->connection_list->sk;
> > > -	subflow = subflow_sk(ssk);
> > > +	rcu_read_lock();
> > > +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
> > > +	subflow = hlist_entry(node, struct subflow_sock, node);
> > > +	ssk = sock_sk(subflow);
> > > +	sock_hold(ssk);
> > > +	rcu_read_unlock();
> > > +
> > >  	tp = tcp_sk(ssk);
> > > 
> > >  	desc.arg.data = &arg;
> > > @@ -430,6 +449,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> > >  	release_sock(ssk);
> > >  	release_sock(sk);
> > > 
> > > +	sock_put(ssk);
> > > +
> > >  	return copied;
> > >  }
> > > 
> > > @@ -439,22 +460,50 @@ static int mptcp_init_sock(struct sock *sk)
> > > 
> > >  	pr_debug("msk=%p", msk);
> > > 
> > > +	INIT_HLIST_HEAD(&msk->conn_list);
> > > +	spin_lock_init(&msk->conn_list_lock);
> > > +
> > >  	return 0;
> > >  }
> > > 
> > >  static void mptcp_close(struct sock *sk, long timeout)
> > >  {
> > >  	struct mptcp_sock *msk = mptcp_sk(sk);
> > > +	struct hlist_node *node;
> > > +	struct socket *ssk = NULL;
> > > +	struct subflow_sock *subflow;
> > > 
> > > +	spin_lock_bh(&msk->conn_list_lock);
> > >  	if (msk->subflow) {
> > > -		pr_debug("subflow=%p", msk->subflow->sk);
> > > -		sock_release(msk->subflow);
> > > +		ssk = msk->subflow;
> > > +		msk->subflow = NULL;
> > >  	}
> > > -
> > > -	if (msk->connection_list) {
> > > -		pr_debug("conn_list->subflow=%p", msk->connection_list->sk);
> > > -		sock_release(msk->connection_list);
> > > +	spin_unlock_bh(&msk->conn_list_lock);
> > > +	if (ssk != NULL) {
> > > +		pr_debug("subflow=%p", ssk->sk);
> > > +		sock_release(ssk);
> > >  	}
> > > +
> > > +	do {
> > > +		spin_lock_bh(&msk->conn_list_lock);
> > > +		/* The spin lock was just acquired, so tell
> > > +		 * rcu_dereference_check() this access to the list is properly
> > > +		 * protected even though the rcu_read_lock is not held.
> > > +		 */
> > > +		node = rcu_dereference_check(
> > > +			hlist_first_rcu(&msk->conn_list),
> > > +			spin_is_locked(&msk->conn_list_lock) ||
> > > +			!IS_ENABLED(CONFIG_SMP));
> > > +		if (node == NULL)
> > > +			break;
> > > +		subflow = hlist_entry(node, struct subflow_sock, node);
> > > +		hlist_del_rcu(node);
> > > +		spin_unlock_bh(&msk->conn_list_lock);
> > > +		synchronize_rcu();
> > > +		pr_debug("conn_list->subflow=%p", subflow);
> > > +		sock_release(sock_sk(subflow)->sk_socket);
> > 
> > Same here as for the shutdown. We need to trigger the closing in the right
> > sequence. First, we need to close the MPTCP-layer and then the subflows.
> 
> Is that to get the DATA_FINs out, or are there other reasons too?

The DATA_FIN and all the other data that might be pending. All of that first
needs to be acknowledged before closing subflows.
Otherwise, not all the data will be delivered to the peer.


(there is an optimization where one could combine DATA_FIN + FIN in certain
conditions - detailed in the RFC - but I think we can skip such an
optimization)


Christoph

> 
> 
> Mat
> 
> 
> > 
> > > +	} while (1);
> > > +	spin_unlock_bh(&msk->conn_list_lock);
> > >  }
> > > 
> > >  static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
> > > @@ -493,7 +542,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
> > >  		msk->token = subflow->token;
> > >  		pr_debug("token=%u", msk->token);
> > >  		token_update_accept(new_sock->sk, mp->sk);
> > > -		msk->connection_list = new_sock;
> > > +		spin_lock_bh(&msk->conn_list_lock);
> > > +		hlist_add_head_rcu(&subflow->node, &msk->conn_list);
> > > +		msk->subflow = NULL;
> > > +		spin_unlock_bh(&msk->conn_list_lock);
> > > 
> > >  		crypto_key_sha1(msk->remote_key, NULL, &ack_seq);
> > >  		msk->write_seq = subflow->idsn + 1;
> > > @@ -524,46 +576,44 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,
> > >  			    char __user *uoptval, unsigned int optlen)
> > >  {
> > >  	struct mptcp_sock *msk = mptcp_sk(sk);
> > > -	struct socket *subflow;
> > >  	char __kernel *optval;
> > > 
> > > -	pr_debug("msk=%p", msk);
> > > -	if (msk->connection_list) {
> > > -		subflow = msk->connection_list;
> > > -		pr_debug("conn_list->subflow=%p", subflow->sk);
> > > -	} else {
> > > -		subflow = msk->subflow;
> > > -		pr_debug("subflow=%p", subflow->sk);
> > > -	}
> > > -
> > >  	/* will be treated as __user in subflow_setsockopt */
> > >  	optval = (char __kernel __force *) uoptval;
> > > 
> > > -	return kernel_setsockopt(subflow, level, optname, optval, optlen);
> > > +	pr_debug("msk=%p", msk);
> > > +	if (msk->subflow) {
> > > +		pr_debug("subflow=%p", msk->subflow->sk);
> > > +		return kernel_setsockopt(msk->subflow, level, optname, optval, optlen);
> > > +	}
> > > +
> > > +	/* @@ the meaning of setsockopt() when the socket is connected and
> > > +	 * there are multiple subflows is not defined.
> > > +	 */
> > > +	return 0;
> > >  }
> > > 
> > >  static int mptcp_getsockopt(struct sock *sk, int level, int optname,
> > >  			    char __user *uoptval, int __user *uoption)
> > >  {
> > >  	struct mptcp_sock *msk = mptcp_sk(sk);
> > > -	struct socket *subflow;
> > >  	char __kernel *optval;
> > >  	int __kernel *option;
> > > 
> > > -	pr_debug("msk=%p", msk);
> > > -	if (msk->connection_list) {
> > > -		subflow = msk->connection_list;
> > > -		pr_debug("conn_list->subflow=%p", subflow->sk);
> > > -	} else {
> > > -		subflow = msk->subflow;
> > > -		pr_debug("subflow=%p", subflow->sk);
> > > -	}
> > > -
> > >  	/* will be treated as __user in subflow_getsockopt */
> > >  	optval = (char __kernel __force *) uoptval;
> > >  	option = (int __kernel __force *) uoption;
> > > 
> > > -	return kernel_getsockopt(subflow, level, optname, optval, option);
> > > +	pr_debug("msk=%p", msk);
> > > +	if (msk->subflow) {
> > > +		pr_debug("subflow=%p", msk->subflow->sk);
> > > +		return kernel_getsockopt(msk->subflow, level, optname, optval, option);
> > > +	}
> > > +
> > > +	/* @@ the meaning of setsockopt() when the socket is connected and
> > > +	 * there are multiple subflows is not defined.
> > > +	 */
> > > +	return 0;
> > >  }
> > > 
> > >  static int mptcp_get_port(struct sock *sk, unsigned short snum)
> > > @@ -590,8 +640,10 @@ void mptcp_finish_connect(struct sock *sk, int mp_capable)
> > >  		msk->local_key = subflow->local_key;
> > >  		msk->token = subflow->token;
> > >  		pr_debug("token=%u", msk->token);
> > > -		msk->connection_list = msk->subflow;
> > > +		spin_lock_bh(&msk->conn_list_lock);
> > > +		hlist_add_head_rcu(&subflow->node, &msk->conn_list);
> > >  		msk->subflow = NULL;
> > > +		spin_unlock_bh(&msk->conn_list_lock);
> > > 
> > >  		crypto_key_sha1(msk->remote_key, NULL, &ack_seq);
> > >  		msk->write_seq = subflow->idsn + 1;
> > > @@ -659,17 +711,32 @@ int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
> > >  int mptcp_stream_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
> > >  {
> > >  	struct mptcp_sock *msk = mptcp_sk(sock->sk);
> > > -	struct socket *subflow;
> > > -	int err = -EPERM;
> > > +	struct subflow_sock *subflow;
> > > +	struct hlist_node *node;
> > > +	struct sock *ssk;
> > > +	int ret;
> > > 
> > > -	if (msk->connection_list)
> > > -		subflow = msk->connection_list;
> > > -	else
> > > -		subflow = msk->subflow;
> > > +	pr_debug("msk=%p", msk);
> > > 
> > > -	err = inet_getname(subflow, uaddr, peer);
> > > +	if (msk->subflow) {
> > > +		pr_debug("subflow=%p", msk->subflow->sk);
> > > +		return inet_getname(msk->subflow, uaddr, peer);
> > > +	}
> > > 
> > > -	return err;
> > > +	/* @@ the meaning of getname() for the remote peer when the socket
> > > +	 * is connected and there are multiple subflows is not defined.
> > > +	 * For now just use the first subflow on the list.
> > > +	 */
> > > +	rcu_read_lock();
> > > +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
> > > +	subflow = hlist_entry(node, struct subflow_sock, node);
> > > +	ssk = sock_sk(subflow);
> > > +	sock_hold(ssk);
> > > +	rcu_read_unlock();
> > > +
> > > +	ret = inet_getname(sock_sk(subflow)->sk_socket, uaddr, peer);
> > > +	sock_put(ssk);
> > > +	return ret;
> > >  }
> > > 
> > >  int mptcp_stream_listen(struct socket *sock, int backlog)
> > > @@ -703,18 +770,22 @@ int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags,
> > >  int mptcp_stream_shutdown(struct socket *sock, int how)
> > >  {
> > >  	struct mptcp_sock *msk = mptcp_sk(sock->sk);
> > > +	struct subflow_sock *subflow;
> > >  	int ret = 0;
> > > 
> > >  	pr_debug("sk=%p, how=%d", msk, how);
> > > 
> > >  	if (msk->subflow) {
> > >  		pr_debug("subflow=%p", msk->subflow->sk);
> > > -		ret = kernel_sock_shutdown(msk->subflow, how);
> > > +		return kernel_sock_shutdown(msk->subflow, how);
> > >  	}
> > > -	if (msk->connection_list) {
> > > -		pr_debug("conn_list->subflow=%p", msk->connection_list->sk);
> > > -		ret = kernel_sock_shutdown(msk->connection_list, how);
> > > +
> > > +	rcu_read_lock();
> > > +	mptcp_for_each_subflow(msk, subflow) {
> > > +		pr_debug("conn_list->subflow=%p", subflow);
> > > +		ret = kernel_sock_shutdown(sock_sk(subflow)->sk_socket, how);
> > >  	}
> > 
> > Subflows first need to get drained before they can be shutdown. Otherwise,
> > we will end up not delivering all the data.
> > 
> > 
> > Christoph
> > 
> > > +	rcu_read_unlock();
> > > 
> > >  	return ret;
> > >  }
> > > --
> > > 2.19.1
> > > 
> > > _______________________________________________
> > > mptcp mailing list
> > > mptcp(a)lists.01.org
> > > https://lists.01.org/mailman/listinfo/mptcp
> > 
> 
> --
> Mat Martineau
> Intel OTC

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [MPTCP] [RFC PATCH v3 16/16] mptcp: Make connection_list a real list of subflows
@ 2018-10-09 21:57 Mat Martineau
  0 siblings, 0 replies; 4+ messages in thread
From: Mat Martineau @ 2018-10-09 21:57 UTC (permalink / raw)
  To: mptcp

[-- Attachment #1: Type: text/plain, Size: 13741 bytes --]

On Mon, 8 Oct 2018, Christoph Paasch wrote:

> On 05/10/18 - 15:59:18, Mat Martineau wrote:
>> From: Peter Krystad <peter.krystad(a)intel.com>
>>
>> Signed-off-by: Peter Krystad <peter.krystad(a)intel.com>
>> ---
>>  include/net/mptcp.h  |   8 +-
>>  net/mptcp/protocol.c | 193 +++++++++++++++++++++++++++++--------------
>>  2 files changed, 139 insertions(+), 62 deletions(-)
>>
>> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
>> index 6c9516360de4..ff8db00bcafc 100644
>> --- a/include/net/mptcp.h
>> +++ b/include/net/mptcp.h
>> @@ -17,6 +17,7 @@
>>  #define __NET_MPTCP_H
>>
>>  #include <linux/tcp.h>
>> +#include <linux/spinlock.h>
>>
>>  /* MPTCP option subtypes */
>>
>> @@ -44,10 +45,14 @@ struct mptcp_sock {
>>  	u64		write_seq;
>>  	atomic64_t	ack_seq;
>>  	u32		token;
>> -	struct socket	*connection_list; /* @@ needs to be a list */
>> +	spinlock_t	conn_list_lock;
>> +	struct hlist_head conn_list;
>>  	struct socket	*subflow; /* outgoing connect, listener or !mp_capable */
>>  };
>>
>> +#define mptcp_for_each_subflow(__msk, __subflow)			\
>> +	hlist_for_each_entry_rcu(__subflow, &((__msk)->conn_list), node)
>> +
>>  static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
>>  {
>>  	return (struct mptcp_sock *)sk;
>> @@ -77,6 +82,7 @@ static inline struct mptcp_skb_cb *mptcp_skb_priv_cb(struct sk_buff *skb)
>>  struct subflow_sock {
>>  	/* tcp_sock must be the first member */
>>  	struct	tcp_sock sk;
>> +	struct	hlist_node node;// conn_list of subflows
>>  	u64	local_key;
>>  	u64	map_seq;
>>  	u32	map_subflow_seq;
>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>> index 911428122c9c..29f4d22fdad2 100644
>> --- a/net/mptcp/protocol.c
>> +++ b/net/mptcp/protocol.c
>> @@ -70,35 +70,45 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>  	size_t psize;
>>
>>  	pr_debug("msk=%p", msk);
>> -	if (!msk->connection_list && msk->subflow) {
>> +	if (msk->subflow) {
>>  		pr_debug("fallback passthrough");
>>  		return sock_sendmsg(msk->subflow, msg);
>>  	}
>>
>> +	rcu_read_lock();
>> +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
>> +	subflow = hlist_entry(node, struct subflow_sock, node);
>> +	ssk = sock_sk(subflow);
>> +	sock_hold(ssk);
>> +	rcu_read_unlock();
>> +
>>  	if (!msg_data_left(msg)) {
>>  		pr_debug("empty send");
>> -		return sock_sendmsg(msk->connection_list, msg);
>> +		ret = sock_sendmsg(sock_sk(subflow)->sk_socket, msg);
>> +		goto put_out;
>>  	}
>>
>> -	ssk = msk->connection_list->sk;
>> -
>>  	pr_debug("conn_list->subflow=%p", subflow);
>>
>> -	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
>> -		return -ENOTSUPP;
>> +	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) {
>> +		ret = -ENOTSUPP;
>> +		goto put_out;
>> +	}
>>
>>  	/* Initial experiment: new page per send.  Real code will
>>  	 * maintain list of active pages and DSS mappings, append to the
>>  	 * end and honor zerocopy
>>  	 */
>>  	page = alloc_page(GFP_KERNEL);
>> -	if (!page)
>> -		return -ENOMEM;
>> +	if (!page) {
>> +		ret = -ENOMEM;
>> +		goto put_out;
>> +	}
>>
>>  	mcb = kzalloc(sizeof(*mcb), GFP_KERNEL);
>>  	if (!mcb) {
>> -		put_page(page);
>> -		return -ENOMEM;
>> +		ret = -ENOMEM;
>> +		goto put_out;
>>  	}
>>
>>  	/* Copy to page */
>> @@ -110,9 +120,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>  	pr_debug("left=%ld", msg_data_left(msg));
>>
>>  	if (!psize) {
>> -		put_page(page);
>> -		kfree(mcb);
>> -		return -EINVAL;
>> +		ret = -EINVAL;
>> +		goto put_out;
>>  	}
>>
>>  	lock_sock(sk);
>> @@ -129,9 +138,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>
>>  	ret = do_tcp_sendpages(ssk, page, poffset, min_t(int, size_goal, psize),
>>  			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
>> -	put_page(page);
>>  	if (ret <= 0)
>> -		goto error_out;
>> +		goto release_out;
>>
>>  	if (skb == tcp_write_queue_tail(ssk))
>>  		pr_err("no new skb %p/%p", sk, ssk);
>> @@ -167,10 +175,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>
>>  	tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
>>
>> -error_out:
>> +release_out:
>>  	release_sock(ssk);
>>  	release_sock(sk);
>>
>> +put_out:
>> +	if (page)
>> +		put_page(page);
>> +
>> +	sock_put(ssk);
>>  	kfree(mcb);
>>  	return ret;
>>  }
>> @@ -290,19 +303,25 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>>  	struct mptcp_sock *msk = mptcp_sk(sk);
>>  	struct subflow_sock *subflow;
>>  	struct mptcp_read_arg arg;
>> +	struct hlist_node *node;
>>  	read_descriptor_t desc;
>>  	struct tcp_sock *tp;
>>  	struct sock *ssk;
>>  	int copied = 0;
>>  	long timeo;
>>
>> -	if (!msk->connection_list) {
>> +	if (msk->subflow) {
>>  		pr_debug("fallback-read subflow=%p", msk->subflow->sk);
>>  		return sock_recvmsg(msk->subflow, msg, flags);
>>  	}
>>
>> -	ssk = msk->connection_list->sk;
>> -	subflow = subflow_sk(ssk);
>> +	rcu_read_lock();
>> +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
>> +	subflow = hlist_entry(node, struct subflow_sock, node);
>> +	ssk = sock_sk(subflow);
>> +	sock_hold(ssk);
>> +	rcu_read_unlock();
>> +
>>  	tp = tcp_sk(ssk);
>>
>>  	desc.arg.data = &arg;
>> @@ -430,6 +449,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>>  	release_sock(ssk);
>>  	release_sock(sk);
>>
>> +	sock_put(ssk);
>> +
>>  	return copied;
>>  }
>>
>> @@ -439,22 +460,50 @@ static int mptcp_init_sock(struct sock *sk)
>>
>>  	pr_debug("msk=%p", msk);
>>
>> +	INIT_HLIST_HEAD(&msk->conn_list);
>> +	spin_lock_init(&msk->conn_list_lock);
>> +
>>  	return 0;
>>  }
>>
>>  static void mptcp_close(struct sock *sk, long timeout)
>>  {
>>  	struct mptcp_sock *msk = mptcp_sk(sk);
>> +	struct hlist_node *node;
>> +	struct socket *ssk = NULL;
>> +	struct subflow_sock *subflow;
>>
>> +	spin_lock_bh(&msk->conn_list_lock);
>>  	if (msk->subflow) {
>> -		pr_debug("subflow=%p", msk->subflow->sk);
>> -		sock_release(msk->subflow);
>> +		ssk = msk->subflow;
>> +		msk->subflow = NULL;
>>  	}
>> -
>> -	if (msk->connection_list) {
>> -		pr_debug("conn_list->subflow=%p", msk->connection_list->sk);
>> -		sock_release(msk->connection_list);
>> +	spin_unlock_bh(&msk->conn_list_lock);
>> +	if (ssk != NULL) {
>> +		pr_debug("subflow=%p", ssk->sk);
>> +		sock_release(ssk);
>>  	}
>> +
>> +	do {
>> +		spin_lock_bh(&msk->conn_list_lock);
>> +		/* The spin lock was just acquired, so tell
>> +		 * rcu_dereference_check() this access to the list is properly
>> +		 * protected even though the rcu_read_lock is not held.
>> +		 */
>> +		node = rcu_dereference_check(
>> +			hlist_first_rcu(&msk->conn_list),
>> +			spin_is_locked(&msk->conn_list_lock) ||
>> +			!IS_ENABLED(CONFIG_SMP));
>> +		if (node == NULL)
>> +			break;
>> +		subflow = hlist_entry(node, struct subflow_sock, node);
>> +		hlist_del_rcu(node);
>> +		spin_unlock_bh(&msk->conn_list_lock);
>> +		synchronize_rcu();
>> +		pr_debug("conn_list->subflow=%p", subflow);
>> +		sock_release(sock_sk(subflow)->sk_socket);
>
> Same here as for the shutdown. We need to trigger the closing in the right
> sequence. First, we need to close the MPTCP-layer and then the subflows.

Is that to get the DATA_FINs out, or are there other reasons too?


Mat


>
>> +	} while (1);
>> +	spin_unlock_bh(&msk->conn_list_lock);
>>  }
>>
>>  static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
>> @@ -493,7 +542,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
>>  		msk->token = subflow->token;
>>  		pr_debug("token=%u", msk->token);
>>  		token_update_accept(new_sock->sk, mp->sk);
>> -		msk->connection_list = new_sock;
>> +		spin_lock_bh(&msk->conn_list_lock);
>> +		hlist_add_head_rcu(&subflow->node, &msk->conn_list);
>> +		msk->subflow = NULL;
>> +		spin_unlock_bh(&msk->conn_list_lock);
>>
>>  		crypto_key_sha1(msk->remote_key, NULL, &ack_seq);
>>  		msk->write_seq = subflow->idsn + 1;
>> @@ -524,46 +576,44 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,
>>  			    char __user *uoptval, unsigned int optlen)
>>  {
>>  	struct mptcp_sock *msk = mptcp_sk(sk);
>> -	struct socket *subflow;
>>  	char __kernel *optval;
>>
>> -	pr_debug("msk=%p", msk);
>> -	if (msk->connection_list) {
>> -		subflow = msk->connection_list;
>> -		pr_debug("conn_list->subflow=%p", subflow->sk);
>> -	} else {
>> -		subflow = msk->subflow;
>> -		pr_debug("subflow=%p", subflow->sk);
>> -	}
>> -
>>  	/* will be treated as __user in subflow_setsockopt */
>>  	optval = (char __kernel __force *) uoptval;
>>
>> -	return kernel_setsockopt(subflow, level, optname, optval, optlen);
>> +	pr_debug("msk=%p", msk);
>> +	if (msk->subflow) {
>> +		pr_debug("subflow=%p", msk->subflow->sk);
>> +		return kernel_setsockopt(msk->subflow, level, optname, optval, optlen);
>> +	}
>> +
>> +	/* @@ the meaning of setsockopt() when the socket is connected and
>> +	 * there are multiple subflows is not defined.
>> +	 */
>> +	return 0;
>>  }
>>
>>  static int mptcp_getsockopt(struct sock *sk, int level, int optname,
>>  			    char __user *uoptval, int __user *uoption)
>>  {
>>  	struct mptcp_sock *msk = mptcp_sk(sk);
>> -	struct socket *subflow;
>>  	char __kernel *optval;
>>  	int __kernel *option;
>>
>> -	pr_debug("msk=%p", msk);
>> -	if (msk->connection_list) {
>> -		subflow = msk->connection_list;
>> -		pr_debug("conn_list->subflow=%p", subflow->sk);
>> -	} else {
>> -		subflow = msk->subflow;
>> -		pr_debug("subflow=%p", subflow->sk);
>> -	}
>> -
>>  	/* will be treated as __user in subflow_getsockopt */
>>  	optval = (char __kernel __force *) uoptval;
>>  	option = (int __kernel __force *) uoption;
>>
>> -	return kernel_getsockopt(subflow, level, optname, optval, option);
>> +	pr_debug("msk=%p", msk);
>> +	if (msk->subflow) {
>> +		pr_debug("subflow=%p", msk->subflow->sk);
>> +		return kernel_getsockopt(msk->subflow, level, optname, optval, option);
>> +	}
>> +
>> +	/* @@ the meaning of setsockopt() when the socket is connected and
>> +	 * there are multiple subflows is not defined.
>> +	 */
>> +	return 0;
>>  }
>>
>>  static int mptcp_get_port(struct sock *sk, unsigned short snum)
>> @@ -590,8 +640,10 @@ void mptcp_finish_connect(struct sock *sk, int mp_capable)
>>  		msk->local_key = subflow->local_key;
>>  		msk->token = subflow->token;
>>  		pr_debug("token=%u", msk->token);
>> -		msk->connection_list = msk->subflow;
>> +		spin_lock_bh(&msk->conn_list_lock);
>> +		hlist_add_head_rcu(&subflow->node, &msk->conn_list);
>>  		msk->subflow = NULL;
>> +		spin_unlock_bh(&msk->conn_list_lock);
>>
>>  		crypto_key_sha1(msk->remote_key, NULL, &ack_seq);
>>  		msk->write_seq = subflow->idsn + 1;
>> @@ -659,17 +711,32 @@ int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
>>  int mptcp_stream_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
>>  {
>>  	struct mptcp_sock *msk = mptcp_sk(sock->sk);
>> -	struct socket *subflow;
>> -	int err = -EPERM;
>> +	struct subflow_sock *subflow;
>> +	struct hlist_node *node;
>> +	struct sock *ssk;
>> +	int ret;
>>
>> -	if (msk->connection_list)
>> -		subflow = msk->connection_list;
>> -	else
>> -		subflow = msk->subflow;
>> +	pr_debug("msk=%p", msk);
>>
>> -	err = inet_getname(subflow, uaddr, peer);
>> +	if (msk->subflow) {
>> +		pr_debug("subflow=%p", msk->subflow->sk);
>> +		return inet_getname(msk->subflow, uaddr, peer);
>> +	}
>>
>> -	return err;
>> +	/* @@ the meaning of getname() for the remote peer when the socket
>> +	 * is connected and there are multiple subflows is not defined.
>> +	 * For now just use the first subflow on the list.
>> +	 */
>> +	rcu_read_lock();
>> +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
>> +	subflow = hlist_entry(node, struct subflow_sock, node);
>> +	ssk = sock_sk(subflow);
>> +	sock_hold(ssk);
>> +	rcu_read_unlock();
>> +
>> +	ret = inet_getname(sock_sk(subflow)->sk_socket, uaddr, peer);
>> +	sock_put(ssk);
>> +	return ret;
>>  }
>>
>>  int mptcp_stream_listen(struct socket *sock, int backlog)
>> @@ -703,18 +770,22 @@ int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags,
>>  int mptcp_stream_shutdown(struct socket *sock, int how)
>>  {
>>  	struct mptcp_sock *msk = mptcp_sk(sock->sk);
>> +	struct subflow_sock *subflow;
>>  	int ret = 0;
>>
>>  	pr_debug("sk=%p, how=%d", msk, how);
>>
>>  	if (msk->subflow) {
>>  		pr_debug("subflow=%p", msk->subflow->sk);
>> -		ret = kernel_sock_shutdown(msk->subflow, how);
>> +		return kernel_sock_shutdown(msk->subflow, how);
>>  	}
>> -	if (msk->connection_list) {
>> -		pr_debug("conn_list->subflow=%p", msk->connection_list->sk);
>> -		ret = kernel_sock_shutdown(msk->connection_list, how);
>> +
>> +	rcu_read_lock();
>> +	mptcp_for_each_subflow(msk, subflow) {
>> +		pr_debug("conn_list->subflow=%p", subflow);
>> +		ret = kernel_sock_shutdown(sock_sk(subflow)->sk_socket, how);
>>  	}
>
> Subflows first need to get drained before they can be shutdown. Otherwise,
> we will end up not delivering all the data.
>
>
> Christoph
>
>> +	rcu_read_unlock();
>>
>>  	return ret;
>>  }
>> --
>> 2.19.1
>>
>> _______________________________________________
>> mptcp mailing list
>> mptcp(a)lists.01.org
>> https://lists.01.org/mailman/listinfo/mptcp
>

--
Mat Martineau
Intel OTC

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [MPTCP] [RFC PATCH v3 16/16] mptcp: Make connection_list a real list of subflows
@ 2018-10-08 18:28 Christoph Paasch
  0 siblings, 0 replies; 4+ messages in thread
From: Christoph Paasch @ 2018-10-08 18:28 UTC (permalink / raw)
  To: mptcp

[-- Attachment #1: Type: text/plain, Size: 13244 bytes --]

On 05/10/18 - 15:59:18, Mat Martineau wrote:
> From: Peter Krystad <peter.krystad(a)intel.com>
> 
> Signed-off-by: Peter Krystad <peter.krystad(a)intel.com>
> ---
>  include/net/mptcp.h  |   8 +-
>  net/mptcp/protocol.c | 193 +++++++++++++++++++++++++++++--------------
>  2 files changed, 139 insertions(+), 62 deletions(-)
> 
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index 6c9516360de4..ff8db00bcafc 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -17,6 +17,7 @@
>  #define __NET_MPTCP_H
>  
>  #include <linux/tcp.h>
> +#include <linux/spinlock.h>
>  
>  /* MPTCP option subtypes */
>  
> @@ -44,10 +45,14 @@ struct mptcp_sock {
>  	u64		write_seq;
>  	atomic64_t	ack_seq;
>  	u32		token;
> -	struct socket	*connection_list; /* @@ needs to be a list */
> +	spinlock_t	conn_list_lock;
> +	struct hlist_head conn_list;
>  	struct socket	*subflow; /* outgoing connect, listener or !mp_capable */
>  };
>  
> +#define mptcp_for_each_subflow(__msk, __subflow)			\
> +	hlist_for_each_entry_rcu(__subflow, &((__msk)->conn_list), node)
> +
>  static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
>  {
>  	return (struct mptcp_sock *)sk;
> @@ -77,6 +82,7 @@ static inline struct mptcp_skb_cb *mptcp_skb_priv_cb(struct sk_buff *skb)
>  struct subflow_sock {
>  	/* tcp_sock must be the first member */
>  	struct	tcp_sock sk;
> +	struct	hlist_node node;// conn_list of subflows
>  	u64	local_key;
>  	u64	map_seq;
>  	u32	map_subflow_seq;
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 911428122c9c..29f4d22fdad2 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -70,35 +70,45 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  	size_t psize;
>  
>  	pr_debug("msk=%p", msk);
> -	if (!msk->connection_list && msk->subflow) {
> +	if (msk->subflow) {
>  		pr_debug("fallback passthrough");
>  		return sock_sendmsg(msk->subflow, msg);
>  	}
>  
> +	rcu_read_lock();
> +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
> +	subflow = hlist_entry(node, struct subflow_sock, node);
> +	ssk = sock_sk(subflow);
> +	sock_hold(ssk);
> +	rcu_read_unlock();
> +
>  	if (!msg_data_left(msg)) {
>  		pr_debug("empty send");
> -		return sock_sendmsg(msk->connection_list, msg);
> +		ret = sock_sendmsg(sock_sk(subflow)->sk_socket, msg);
> +		goto put_out;
>  	}
>  
> -	ssk = msk->connection_list->sk;
> -
>  	pr_debug("conn_list->subflow=%p", subflow);
>  
> -	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
> -		return -ENOTSUPP;
> +	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) {
> +		ret = -ENOTSUPP;
> +		goto put_out;
> +	}
>  
>  	/* Initial experiment: new page per send.  Real code will
>  	 * maintain list of active pages and DSS mappings, append to the
>  	 * end and honor zerocopy
>  	 */
>  	page = alloc_page(GFP_KERNEL);
> -	if (!page)
> -		return -ENOMEM;
> +	if (!page) {
> +		ret = -ENOMEM;
> +		goto put_out;
> +	}
>  
>  	mcb = kzalloc(sizeof(*mcb), GFP_KERNEL);
>  	if (!mcb) {
> -		put_page(page);
> -		return -ENOMEM;
> +		ret = -ENOMEM;
> +		goto put_out;
>  	}
>  
>  	/* Copy to page */
> @@ -110,9 +120,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  	pr_debug("left=%ld", msg_data_left(msg));
>  
>  	if (!psize) {
> -		put_page(page);
> -		kfree(mcb);
> -		return -EINVAL;
> +		ret = -EINVAL;
> +		goto put_out;
>  	}
>  
>  	lock_sock(sk);
> @@ -129,9 +138,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  
>  	ret = do_tcp_sendpages(ssk, page, poffset, min_t(int, size_goal, psize),
>  			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
> -	put_page(page);
>  	if (ret <= 0)
> -		goto error_out;
> +		goto release_out;
>  
>  	if (skb == tcp_write_queue_tail(ssk))
>  		pr_err("no new skb %p/%p", sk, ssk);
> @@ -167,10 +175,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  
>  	tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
>  
> -error_out:
> +release_out:
>  	release_sock(ssk);
>  	release_sock(sk);
>  
> +put_out:
> +	if (page)
> +		put_page(page);
> +
> +	sock_put(ssk);
>  	kfree(mcb);
>  	return ret;
>  }
> @@ -290,19 +303,25 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  	struct mptcp_sock *msk = mptcp_sk(sk);
>  	struct subflow_sock *subflow;
>  	struct mptcp_read_arg arg;
> +	struct hlist_node *node;
>  	read_descriptor_t desc;
>  	struct tcp_sock *tp;
>  	struct sock *ssk;
>  	int copied = 0;
>  	long timeo;
>  
> -	if (!msk->connection_list) {
> +	if (msk->subflow) {
>  		pr_debug("fallback-read subflow=%p", msk->subflow->sk);
>  		return sock_recvmsg(msk->subflow, msg, flags);
>  	}
>  
> -	ssk = msk->connection_list->sk;
> -	subflow = subflow_sk(ssk);
> +	rcu_read_lock();
> +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
> +	subflow = hlist_entry(node, struct subflow_sock, node);
> +	ssk = sock_sk(subflow);
> +	sock_hold(ssk);
> +	rcu_read_unlock();
> +
>  	tp = tcp_sk(ssk);
>  
>  	desc.arg.data = &arg;
> @@ -430,6 +449,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  	release_sock(ssk);
>  	release_sock(sk);
>  
> +	sock_put(ssk);
> +
>  	return copied;
>  }
>  
> @@ -439,22 +460,50 @@ static int mptcp_init_sock(struct sock *sk)
>  
>  	pr_debug("msk=%p", msk);
>  
> +	INIT_HLIST_HEAD(&msk->conn_list);
> +	spin_lock_init(&msk->conn_list_lock);
> +
>  	return 0;
>  }
>  
>  static void mptcp_close(struct sock *sk, long timeout)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
> +	struct hlist_node *node;
> +	struct socket *ssk = NULL;
> +	struct subflow_sock *subflow;
>  
> +	spin_lock_bh(&msk->conn_list_lock);
>  	if (msk->subflow) {
> -		pr_debug("subflow=%p", msk->subflow->sk);
> -		sock_release(msk->subflow);
> +		ssk = msk->subflow;
> +		msk->subflow = NULL;
>  	}
> -
> -	if (msk->connection_list) {
> -		pr_debug("conn_list->subflow=%p", msk->connection_list->sk);
> -		sock_release(msk->connection_list);
> +	spin_unlock_bh(&msk->conn_list_lock);
> +	if (ssk != NULL) {
> +		pr_debug("subflow=%p", ssk->sk);
> +		sock_release(ssk);
>  	}
> +
> +	do {
> +		spin_lock_bh(&msk->conn_list_lock);
> +		/* The spin lock was just acquired, so tell
> +		 * rcu_dereference_check() this access to the list is properly
> +		 * protected even though the rcu_read_lock is not held.
> +		 */
> +		node = rcu_dereference_check(
> +			hlist_first_rcu(&msk->conn_list),
> +			spin_is_locked(&msk->conn_list_lock) ||
> +			!IS_ENABLED(CONFIG_SMP));
> +		if (node == NULL)
> +			break;
> +		subflow = hlist_entry(node, struct subflow_sock, node);
> +		hlist_del_rcu(node);
> +		spin_unlock_bh(&msk->conn_list_lock);
> +		synchronize_rcu();
> +		pr_debug("conn_list->subflow=%p", subflow);
> +		sock_release(sock_sk(subflow)->sk_socket);

Same here as for the shutdown. We need to trigger the closing in the right
sequence. First, we need to close the MPTCP-layer and then the subflows.

> +	} while (1);
> +	spin_unlock_bh(&msk->conn_list_lock);
>  }
>  
>  static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
> @@ -493,7 +542,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
>  		msk->token = subflow->token;
>  		pr_debug("token=%u", msk->token);
>  		token_update_accept(new_sock->sk, mp->sk);
> -		msk->connection_list = new_sock;
> +		spin_lock_bh(&msk->conn_list_lock);
> +		hlist_add_head_rcu(&subflow->node, &msk->conn_list);
> +		msk->subflow = NULL;
> +		spin_unlock_bh(&msk->conn_list_lock);
>  
>  		crypto_key_sha1(msk->remote_key, NULL, &ack_seq);
>  		msk->write_seq = subflow->idsn + 1;
> @@ -524,46 +576,44 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,
>  			    char __user *uoptval, unsigned int optlen)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
> -	struct socket *subflow;
>  	char __kernel *optval;
>  
> -	pr_debug("msk=%p", msk);
> -	if (msk->connection_list) {
> -		subflow = msk->connection_list;
> -		pr_debug("conn_list->subflow=%p", subflow->sk);
> -	} else {
> -		subflow = msk->subflow;
> -		pr_debug("subflow=%p", subflow->sk);
> -	}
> -
>  	/* will be treated as __user in subflow_setsockopt */
>  	optval = (char __kernel __force *) uoptval;
>  
> -	return kernel_setsockopt(subflow, level, optname, optval, optlen);
> +	pr_debug("msk=%p", msk);
> +	if (msk->subflow) {
> +		pr_debug("subflow=%p", msk->subflow->sk);
> +		return kernel_setsockopt(msk->subflow, level, optname, optval, optlen);
> +	}
> +
> +	/* @@ the meaning of setsockopt() when the socket is connected and
> +	 * there are multiple subflows is not defined.
> +	 */
> +	return 0;
>  }
>  
>  static int mptcp_getsockopt(struct sock *sk, int level, int optname,
>  			    char __user *uoptval, int __user *uoption)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
> -	struct socket *subflow;
>  	char __kernel *optval;
>  	int __kernel *option;
>  
> -	pr_debug("msk=%p", msk);
> -	if (msk->connection_list) {
> -		subflow = msk->connection_list;
> -		pr_debug("conn_list->subflow=%p", subflow->sk);
> -	} else {
> -		subflow = msk->subflow;
> -		pr_debug("subflow=%p", subflow->sk);
> -	}
> -
>  	/* will be treated as __user in subflow_getsockopt */
>  	optval = (char __kernel __force *) uoptval;
>  	option = (int __kernel __force *) uoption;
>  
> -	return kernel_getsockopt(subflow, level, optname, optval, option);
> +	pr_debug("msk=%p", msk);
> +	if (msk->subflow) {
> +		pr_debug("subflow=%p", msk->subflow->sk);
> +		return kernel_getsockopt(msk->subflow, level, optname, optval, option);
> +	}
> +
> +	/* @@ the meaning of setsockopt() when the socket is connected and
> +	 * there are multiple subflows is not defined.
> +	 */
> +	return 0;
>  }
>  
>  static int mptcp_get_port(struct sock *sk, unsigned short snum)
> @@ -590,8 +640,10 @@ void mptcp_finish_connect(struct sock *sk, int mp_capable)
>  		msk->local_key = subflow->local_key;
>  		msk->token = subflow->token;
>  		pr_debug("token=%u", msk->token);
> -		msk->connection_list = msk->subflow;
> +		spin_lock_bh(&msk->conn_list_lock);
> +		hlist_add_head_rcu(&subflow->node, &msk->conn_list);
>  		msk->subflow = NULL;
> +		spin_unlock_bh(&msk->conn_list_lock);
>  
>  		crypto_key_sha1(msk->remote_key, NULL, &ack_seq);
>  		msk->write_seq = subflow->idsn + 1;
> @@ -659,17 +711,32 @@ int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
>  int mptcp_stream_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sock->sk);
> -	struct socket *subflow;
> -	int err = -EPERM;
> +	struct subflow_sock *subflow;
> +	struct hlist_node *node;
> +	struct sock *ssk;
> +	int ret;
>  
> -	if (msk->connection_list)
> -		subflow = msk->connection_list;
> -	else
> -		subflow = msk->subflow;
> +	pr_debug("msk=%p", msk);
>  
> -	err = inet_getname(subflow, uaddr, peer);
> +	if (msk->subflow) {
> +		pr_debug("subflow=%p", msk->subflow->sk);
> +		return inet_getname(msk->subflow, uaddr, peer);
> +	}
>  
> -	return err;
> +	/* @@ the meaning of getname() for the remote peer when the socket
> +	 * is connected and there are multiple subflows is not defined.
> +	 * For now just use the first subflow on the list.
> +	 */
> +	rcu_read_lock();
> +	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
> +	subflow = hlist_entry(node, struct subflow_sock, node);
> +	ssk = sock_sk(subflow);
> +	sock_hold(ssk);
> +	rcu_read_unlock();
> +
> +	ret = inet_getname(sock_sk(subflow)->sk_socket, uaddr, peer);
> +	sock_put(ssk);
> +	return ret;
>  }
>  
>  int mptcp_stream_listen(struct socket *sock, int backlog)
> @@ -703,18 +770,22 @@ int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags,
>  int mptcp_stream_shutdown(struct socket *sock, int how)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sock->sk);
> +	struct subflow_sock *subflow;
>  	int ret = 0;
>  
>  	pr_debug("sk=%p, how=%d", msk, how);
>  
>  	if (msk->subflow) {
>  		pr_debug("subflow=%p", msk->subflow->sk);
> -		ret = kernel_sock_shutdown(msk->subflow, how);
> +		return kernel_sock_shutdown(msk->subflow, how);
>  	}
> -	if (msk->connection_list) {
> -		pr_debug("conn_list->subflow=%p", msk->connection_list->sk);
> -		ret = kernel_sock_shutdown(msk->connection_list, how);
> +
> +	rcu_read_lock();
> +	mptcp_for_each_subflow(msk, subflow) {
> +		pr_debug("conn_list->subflow=%p", subflow);
> +		ret = kernel_sock_shutdown(sock_sk(subflow)->sk_socket, how);
>  	}

Subflows first need to get drained before they can be shutdown. Otherwise,
we will end up not delivering all the data.


Christoph

> +	rcu_read_unlock();
>  
>  	return ret;
>  }
> -- 
> 2.19.1
> 
> _______________________________________________
> mptcp mailing list
> mptcp(a)lists.01.org
> https://lists.01.org/mailman/listinfo/mptcp

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [MPTCP] [RFC PATCH v3 16/16] mptcp: Make connection_list a real list of subflows
@ 2018-10-05 22:59 Mat Martineau
  0 siblings, 0 replies; 4+ messages in thread
From: Mat Martineau @ 2018-10-05 22:59 UTC (permalink / raw)
  To: mptcp

[-- Attachment #1: Type: text/plain, Size: 11958 bytes --]

From: Peter Krystad <peter.krystad(a)intel.com>

Signed-off-by: Peter Krystad <peter.krystad(a)intel.com>
---
 include/net/mptcp.h  |   8 +-
 net/mptcp/protocol.c | 193 +++++++++++++++++++++++++++++--------------
 2 files changed, 139 insertions(+), 62 deletions(-)

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 6c9516360de4..ff8db00bcafc 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -17,6 +17,7 @@
 #define __NET_MPTCP_H
 
 #include <linux/tcp.h>
+#include <linux/spinlock.h>
 
 /* MPTCP option subtypes */
 
@@ -44,10 +45,14 @@ struct mptcp_sock {
 	u64		write_seq;
 	atomic64_t	ack_seq;
 	u32		token;
-	struct socket	*connection_list; /* @@ needs to be a list */
+	spinlock_t	conn_list_lock;
+	struct hlist_head conn_list;
 	struct socket	*subflow; /* outgoing connect, listener or !mp_capable */
 };
 
+#define mptcp_for_each_subflow(__msk, __subflow)			\
+	hlist_for_each_entry_rcu(__subflow, &((__msk)->conn_list), node)
+
 static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
 {
 	return (struct mptcp_sock *)sk;
@@ -77,6 +82,7 @@ static inline struct mptcp_skb_cb *mptcp_skb_priv_cb(struct sk_buff *skb)
 struct subflow_sock {
 	/* tcp_sock must be the first member */
 	struct	tcp_sock sk;
+	struct	hlist_node node;// conn_list of subflows
 	u64	local_key;
 	u64	map_seq;
 	u32	map_subflow_seq;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 911428122c9c..29f4d22fdad2 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -70,35 +70,45 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	size_t psize;
 
 	pr_debug("msk=%p", msk);
-	if (!msk->connection_list && msk->subflow) {
+	if (msk->subflow) {
 		pr_debug("fallback passthrough");
 		return sock_sendmsg(msk->subflow, msg);
 	}
 
+	rcu_read_lock();
+	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
+	subflow = hlist_entry(node, struct subflow_sock, node);
+	ssk = sock_sk(subflow);
+	sock_hold(ssk);
+	rcu_read_unlock();
+
 	if (!msg_data_left(msg)) {
 		pr_debug("empty send");
-		return sock_sendmsg(msk->connection_list, msg);
+		ret = sock_sendmsg(sock_sk(subflow)->sk_socket, msg);
+		goto put_out;
 	}
 
-	ssk = msk->connection_list->sk;
-
 	pr_debug("conn_list->subflow=%p", subflow);
 
-	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
-		return -ENOTSUPP;
+	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) {
+		ret = -ENOTSUPP;
+		goto put_out;
+	}
 
 	/* Initial experiment: new page per send.  Real code will
 	 * maintain list of active pages and DSS mappings, append to the
 	 * end and honor zerocopy
 	 */
 	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
+	if (!page) {
+		ret = -ENOMEM;
+		goto put_out;
+	}
 
 	mcb = kzalloc(sizeof(*mcb), GFP_KERNEL);
 	if (!mcb) {
-		put_page(page);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto put_out;
 	}
 
 	/* Copy to page */
@@ -110,9 +120,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	pr_debug("left=%ld", msg_data_left(msg));
 
 	if (!psize) {
-		put_page(page);
-		kfree(mcb);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto put_out;
 	}
 
 	lock_sock(sk);
@@ -129,9 +138,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	ret = do_tcp_sendpages(ssk, page, poffset, min_t(int, size_goal, psize),
 			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
-	put_page(page);
 	if (ret <= 0)
-		goto error_out;
+		goto release_out;
 
 	if (skb == tcp_write_queue_tail(ssk))
 		pr_err("no new skb %p/%p", sk, ssk);
@@ -167,10 +175,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
 
-error_out:
+release_out:
 	release_sock(ssk);
 	release_sock(sk);
 
+put_out:
+	if (page)
+		put_page(page);
+
+	sock_put(ssk);
 	kfree(mcb);
 	return ret;
 }
@@ -290,19 +303,25 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct subflow_sock *subflow;
 	struct mptcp_read_arg arg;
+	struct hlist_node *node;
 	read_descriptor_t desc;
 	struct tcp_sock *tp;
 	struct sock *ssk;
 	int copied = 0;
 	long timeo;
 
-	if (!msk->connection_list) {
+	if (msk->subflow) {
 		pr_debug("fallback-read subflow=%p", msk->subflow->sk);
 		return sock_recvmsg(msk->subflow, msg, flags);
 	}
 
-	ssk = msk->connection_list->sk;
-	subflow = subflow_sk(ssk);
+	rcu_read_lock();
+	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
+	subflow = hlist_entry(node, struct subflow_sock, node);
+	ssk = sock_sk(subflow);
+	sock_hold(ssk);
+	rcu_read_unlock();
+
 	tp = tcp_sk(ssk);
 
 	desc.arg.data = &arg;
@@ -430,6 +449,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	release_sock(ssk);
 	release_sock(sk);
 
+	sock_put(ssk);
+
 	return copied;
 }
 
@@ -439,22 +460,50 @@ static int mptcp_init_sock(struct sock *sk)
 
 	pr_debug("msk=%p", msk);
 
+	INIT_HLIST_HEAD(&msk->conn_list);
+	spin_lock_init(&msk->conn_list_lock);
+
 	return 0;
 }
 
 static void mptcp_close(struct sock *sk, long timeout)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct hlist_node *node;
+	struct socket *ssk = NULL;
+	struct subflow_sock *subflow;
 
+	spin_lock_bh(&msk->conn_list_lock);
 	if (msk->subflow) {
-		pr_debug("subflow=%p", msk->subflow->sk);
-		sock_release(msk->subflow);
+		ssk = msk->subflow;
+		msk->subflow = NULL;
 	}
-
-	if (msk->connection_list) {
-		pr_debug("conn_list->subflow=%p", msk->connection_list->sk);
-		sock_release(msk->connection_list);
+	spin_unlock_bh(&msk->conn_list_lock);
+	if (ssk != NULL) {
+		pr_debug("subflow=%p", ssk->sk);
+		sock_release(ssk);
 	}
+
+	do {
+		spin_lock_bh(&msk->conn_list_lock);
+		/* The spin lock was just acquired, so tell
+		 * rcu_dereference_check() this access to the list is properly
+		 * protected even though the rcu_read_lock is not held.
+		 */
+		node = rcu_dereference_check(
+			hlist_first_rcu(&msk->conn_list),
+			spin_is_locked(&msk->conn_list_lock) ||
+			!IS_ENABLED(CONFIG_SMP));
+		if (node == NULL)
+			break;
+		subflow = hlist_entry(node, struct subflow_sock, node);
+		hlist_del_rcu(node);
+		spin_unlock_bh(&msk->conn_list_lock);
+		synchronize_rcu();
+		pr_debug("conn_list->subflow=%p", subflow);
+		sock_release(sock_sk(subflow)->sk_socket);
+	} while (1);
+	spin_unlock_bh(&msk->conn_list_lock);
 }
 
 static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
@@ -493,7 +542,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 		msk->token = subflow->token;
 		pr_debug("token=%u", msk->token);
 		token_update_accept(new_sock->sk, mp->sk);
-		msk->connection_list = new_sock;
+		spin_lock_bh(&msk->conn_list_lock);
+		hlist_add_head_rcu(&subflow->node, &msk->conn_list);
+		msk->subflow = NULL;
+		spin_unlock_bh(&msk->conn_list_lock);
 
 		crypto_key_sha1(msk->remote_key, NULL, &ack_seq);
 		msk->write_seq = subflow->idsn + 1;
@@ -524,46 +576,44 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,
 			    char __user *uoptval, unsigned int optlen)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct socket *subflow;
 	char __kernel *optval;
 
-	pr_debug("msk=%p", msk);
-	if (msk->connection_list) {
-		subflow = msk->connection_list;
-		pr_debug("conn_list->subflow=%p", subflow->sk);
-	} else {
-		subflow = msk->subflow;
-		pr_debug("subflow=%p", subflow->sk);
-	}
-
 	/* will be treated as __user in subflow_setsockopt */
 	optval = (char __kernel __force *) uoptval;
 
-	return kernel_setsockopt(subflow, level, optname, optval, optlen);
+	pr_debug("msk=%p", msk);
+	if (msk->subflow) {
+		pr_debug("subflow=%p", msk->subflow->sk);
+		return kernel_setsockopt(msk->subflow, level, optname, optval, optlen);
+	}
+
+	/* @@ the meaning of setsockopt() when the socket is connected and
+	 * there are multiple subflows is not defined.
+	 */
+	return 0;
 }
 
 static int mptcp_getsockopt(struct sock *sk, int level, int optname,
 			    char __user *uoptval, int __user *uoption)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct socket *subflow;
 	char __kernel *optval;
 	int __kernel *option;
 
-	pr_debug("msk=%p", msk);
-	if (msk->connection_list) {
-		subflow = msk->connection_list;
-		pr_debug("conn_list->subflow=%p", subflow->sk);
-	} else {
-		subflow = msk->subflow;
-		pr_debug("subflow=%p", subflow->sk);
-	}
-
 	/* will be treated as __user in subflow_getsockopt */
 	optval = (char __kernel __force *) uoptval;
 	option = (int __kernel __force *) uoption;
 
-	return kernel_getsockopt(subflow, level, optname, optval, option);
+	pr_debug("msk=%p", msk);
+	if (msk->subflow) {
+		pr_debug("subflow=%p", msk->subflow->sk);
+		return kernel_getsockopt(msk->subflow, level, optname, optval, option);
+	}
+
+	/* @@ the meaning of setsockopt() when the socket is connected and
+	 * there are multiple subflows is not defined.
+	 */
+	return 0;
 }
 
 static int mptcp_get_port(struct sock *sk, unsigned short snum)
@@ -590,8 +640,10 @@ void mptcp_finish_connect(struct sock *sk, int mp_capable)
 		msk->local_key = subflow->local_key;
 		msk->token = subflow->token;
 		pr_debug("token=%u", msk->token);
-		msk->connection_list = msk->subflow;
+		spin_lock_bh(&msk->conn_list_lock);
+		hlist_add_head_rcu(&subflow->node, &msk->conn_list);
 		msk->subflow = NULL;
+		spin_unlock_bh(&msk->conn_list_lock);
 
 		crypto_key_sha1(msk->remote_key, NULL, &ack_seq);
 		msk->write_seq = subflow->idsn + 1;
@@ -659,17 +711,32 @@ int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 int mptcp_stream_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
 {
 	struct mptcp_sock *msk = mptcp_sk(sock->sk);
-	struct socket *subflow;
-	int err = -EPERM;
+	struct subflow_sock *subflow;
+	struct hlist_node *node;
+	struct sock *ssk;
+	int ret;
 
-	if (msk->connection_list)
-		subflow = msk->connection_list;
-	else
-		subflow = msk->subflow;
+	pr_debug("msk=%p", msk);
 
-	err = inet_getname(subflow, uaddr, peer);
+	if (msk->subflow) {
+		pr_debug("subflow=%p", msk->subflow->sk);
+		return inet_getname(msk->subflow, uaddr, peer);
+	}
 
-	return err;
+	/* @@ the meaning of getname() for the remote peer when the socket
+	 * is connected and there are multiple subflows is not defined.
+	 * For now just use the first subflow on the list.
+	 */
+	rcu_read_lock();
+	node = rcu_dereference(hlist_first_rcu(&msk->conn_list));
+	subflow = hlist_entry(node, struct subflow_sock, node);
+	ssk = sock_sk(subflow);
+	sock_hold(ssk);
+	rcu_read_unlock();
+
+	ret = inet_getname(sock_sk(subflow)->sk_socket, uaddr, peer);
+	sock_put(ssk);
+	return ret;
 }
 
 int mptcp_stream_listen(struct socket *sock, int backlog)
@@ -703,18 +770,22 @@ int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags,
 int mptcp_stream_shutdown(struct socket *sock, int how)
 {
 	struct mptcp_sock *msk = mptcp_sk(sock->sk);
+	struct subflow_sock *subflow;
 	int ret = 0;
 
 	pr_debug("sk=%p, how=%d", msk, how);
 
 	if (msk->subflow) {
 		pr_debug("subflow=%p", msk->subflow->sk);
-		ret = kernel_sock_shutdown(msk->subflow, how);
+		return kernel_sock_shutdown(msk->subflow, how);
 	}
-	if (msk->connection_list) {
-		pr_debug("conn_list->subflow=%p", msk->connection_list->sk);
-		ret = kernel_sock_shutdown(msk->connection_list, how);
+
+	rcu_read_lock();
+	mptcp_for_each_subflow(msk, subflow) {
+		pr_debug("conn_list->subflow=%p", subflow);
+		ret = kernel_sock_shutdown(sock_sk(subflow)->sk_socket, how);
 	}
+	rcu_read_unlock();
 
 	return ret;
 }
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2018-10-09 22:15 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-09 22:15 [MPTCP] [RFC PATCH v3 16/16] mptcp: Make connection_list a real list of subflows Christoph Paasch
  -- strict thread matches above, loose matches on Subject: below --
2018-10-09 21:57 Mat Martineau
2018-10-08 18:28 Christoph Paasch
2018-10-05 22:59 Mat Martineau

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.