All of lore.kernel.org
 help / color / mirror / Atom feed
* [MPTCP] [PRE-RFC 4/6] mptcp: queue data for mptcp level retransmission
@ 2019-08-16 16:48 Paolo Abeni
  0 siblings, 0 replies; 3+ messages in thread
From: Paolo Abeni @ 2019-08-16 16:48 UTC (permalink / raw)
  To: mptcp

[-- Attachment #1: Type: text/plain, Size: 7095 bytes --]

keep the send page fragment on an MPTCP level retransmission queue.
the queue entries are allocated inside the page frag allocator,
acquiring an additional reference to the page for each list entry.

Also switch to a custom page frag refill function, to ensure that
the current page fragment can always host an MPTCP rtx queue entry.

The MPTCP rtx queue is flushed at socket destroy() time.

Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
---
 net/mptcp/protocol.c | 79 ++++++++++++++++++++++++++++++++++++++++----
 net/mptcp/protocol.h | 19 +++++++++++
 2 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index b7468f770d25..dcfffb71b545 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -67,6 +67,14 @@ static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
 	return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
 }
 
+static inline bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
+					      const struct page_frag *pfrag,
+					      const struct mptcp_data_frag *df)
+{
+	return df && pfrag->page == df->page &&
+		df->data_seq + df->data_len == msk->write_seq;
+}
+
 static u64 mptcp_update_msk_una(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -101,14 +109,45 @@ static void mptcp_update_subflow_una(struct sock *sk, struct sock *ssk)
 		subflow->snd_una = snd_una;
 }
 
+/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
+ * data
+ */
+bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
+					pfrag, sk->sk_allocation)))
+		return true;
+
+	sk->sk_prot->enter_memory_pressure(sk);
+	sk_stream_moderate_sndbuf(sk);
+	return false;
+}
+
+static inline struct mptcp_data_frag *
+mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
+		      int offset)
+{
+	struct mptcp_data_frag *dfrag;
+
+	offset = ALIGN(offset, BITS_PER_LONG / 8);
+	dfrag = (struct mptcp_data_frag *)(pfrag->page + offset);
+	dfrag->data_len = 0;
+	dfrag->data_seq = msk->write_seq;
+	dfrag->offset = offset + sizeof(struct mptcp_data_frag);
+	dfrag->page = pfrag->page;
+
+	return dfrag;
+}
+
 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			      struct msghdr *msg, long *timeo, int *pmss_now,
 			      int *ps_goal)
 {
-	int mss_now, avail_size, size_goal, ret;
+	bool dfrag_collapsed, collapsed, can_collapse = false;
+	int mss_now, avail_size, size_goal, offset, ret;
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	bool collapsed, can_collapse = false;
 	struct mptcp_ext *mpext = NULL;
+	struct mptcp_data_frag *dfrag;
 	struct page_frag *pfrag;
 	struct sk_buff *skb;
 	size_t psize;
@@ -117,7 +156,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	 * from one substream to another, but do per subflow memory accounting
 	 */
 	pfrag = sk_page_frag(sk);
-	while (!sk_page_frag_refill(ssk, pfrag)) {
+	while (!mptcp_page_frag_refill(ssk, pfrag)) {
 		ret = sk_stream_wait_memory(ssk, timeo);
 		if (ret)
 			return ret;
@@ -147,11 +186,22 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		else
 			can_collapse = false;
 	}
-	psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
+
+	/* reuse tail pfrag, if possible, or carve a new one from the page
+	 * allocator
+	 */
+	dfrag = mptcp_rtx_tail(sk);
+	offset = pfrag->offset;
+	dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+	if (!dfrag_collapsed) {
+		dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
+		offset = dfrag->offset;
+	}
+	psize = min_t(size_t, pfrag->size - offset, avail_size);
 
 	/* Copy to page */
 	pr_debug("left=%zu", msg_data_left(msg));
-	psize = copy_page_from_iter(pfrag->page, pfrag->offset,
+	psize = copy_page_from_iter(pfrag->page, offset,
 				    min_t(size_t, msg_data_left(msg), psize),
 				    &msg->msg_iter);
 	pr_debug("left=%zu", msg_data_left(msg));
@@ -161,13 +211,22 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	/* tell the TCP stack to delay the push so that we can safely
 	 * access the skb after the sendpages call
 	 */
-	ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
+	ret = do_tcp_sendpages(ssk, pfrag->page, offset, psize,
 			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
 	if (ret <= 0)
 		return ret;
 	if (unlikely(ret < psize))
 		iov_iter_revert(&msg->msg_iter, psize - ret);
 
+	/* send successful, keep track of sent data for mptcp-level
+	 * retransmission
+	 */
+	dfrag->data_len += ret;
+	if (!dfrag_collapsed) {
+		get_page(dfrag->page);
+		list_add_tail(&dfrag->list, &msk->rtx_queue);
+	}
+
 	collapsed = skb == tcp_write_queue_tail(ssk);
 	BUG_ON(collapsed && !can_collapse);
 	if (collapsed) {
@@ -615,6 +674,7 @@ static int __mptcp_init_sock(struct sock *sk)
 	pr_debug("msk=%p", msk);
 
 	INIT_LIST_HEAD(&msk->conn_list);
+	INIT_LIST_HEAD(&msk->rtx_queue);
 
 	return 0;
 }
@@ -651,6 +711,7 @@ static void mptcp_close(struct sock *sk, long timeout)
 		pr_debug("conn_list->subflow=%p", subflow);
 		sock_release(mptcp_subflow_tcp_socket(subflow));
 	}
+
 	release_sock(sk);
 
 	sock_orphan(sk);
@@ -731,10 +792,16 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 static void mptcp_destroy(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_data_frag *dtmp, *dfrag;
 
 	pr_debug("msk=%p, subflow=%p", sk, msk->subflow->sk);
 
 	token_destroy(msk->token);
+
+	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
+		list_del(&dfrag->list);
+		put_page(dfrag->page);
+	}
 }
 
 static int mptcp_setsockopt(struct sock *sk, int level, int optname,
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 9220afe0d362..889974a4022c 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -108,6 +108,14 @@ struct mptcp_pm_data {
 	u32	token;
 };
 
+struct mptcp_data_frag {
+	struct list_head list;
+	u64 data_seq;
+	int data_len;
+	int offset;
+	struct page *page;
+};
+
 /* MPTCP connection sock */
 struct mptcp_sock {
 	/* inet_connection_sock must be the first member */
@@ -120,6 +128,7 @@ struct mptcp_sock {
 	u32		token;
 	u16		dport;
 	struct list_head conn_list;
+	struct list_head rtx_queue;
 	struct socket	*subflow; /* outgoing connect/listener/!mp_capable */
 	struct mptcp_pm_data	pm;
 	u8		addr_signal;
@@ -133,6 +142,16 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
 	return (struct mptcp_sock *)sk;
 }
 
+struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (list_empty(&msk->rtx_queue))
+		return NULL;
+
+	return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
 struct subflow_request_sock {
 	struct	tcp_request_sock sk;
 	u8	mp_capable : 1,
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [MPTCP] [PRE-RFC 4/6] mptcp: queue data for mptcp level retransmission
@ 2019-08-19 12:46 Paolo Abeni
  0 siblings, 0 replies; 3+ messages in thread
From: Paolo Abeni @ 2019-08-19 12:46 UTC (permalink / raw)
  To: mptcp

[-- Attachment #1: Type: text/plain, Size: 1353 bytes --]

On Mon, 2019-08-19 at 13:12 +0200, Florian Westphal wrote:
> Paolo Abeni <pabeni(a)redhat.com> wrote:
> > keep the send page fragment on an MPTCP level retransmission queue.
> > the queue entries are allocated inside the page frag allocator,
> > acquiring an additional reference to the page for each list entry.
> > 
> > Also switch to a custom page frag refill function, to ensure that
> > the current page fragment can always host an MPTCP rtx queue entry.
> > 
> > The MPTCP rtx queue is flushed at socket destroy() time.
> > +/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
> > + * data
> > + */
> > +bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
> > +{
> > +	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
> > +					pfrag, sk->sk_allocation)))
> 
> Where is that 32U coming from?

From sk_page_frag_refill(). mptcp_page_frag_refill() is almost the
same, it just changes the amount of memory reserved.

Another option would have been renaming sk_page_frag_refill() to
__sk_page_frag_refill(), adding an additional argument for the reserved
memory, and implementing sk_page_frag_refill() and
mptcp_page_frag_refill() on top of that.

I can't recall why I explicitly avoided such an option ;) (possibly to
avoid touching the networking core more)

/P


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [MPTCP] [PRE-RFC 4/6] mptcp: queue data for mptcp level retransmission
@ 2019-08-19 11:12 Florian Westphal
  0 siblings, 0 replies; 3+ messages in thread
From: Florian Westphal @ 2019-08-19 11:12 UTC (permalink / raw)
  To: mptcp

[-- Attachment #1: Type: text/plain, Size: 1152 bytes --]

Paolo Abeni <pabeni(a)redhat.com> wrote:
> keep the send page fragment on an MPTCP level retransmission queue.
> the queue entries are allocated inside the page frag allocator,
> acquiring an additional reference to the page for each list entry.
> 
> Also switch to a custom page frag refill function, to ensure that
> the current page fragment can always host an MPTCP rtx queue entry.
> 
> The MPTCP rtx queue is flushed at socket destroy() time.

> +/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
> + * data
> + */
> +bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
> +{
> +	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
> +					pfrag, sk->sk_allocation)))

Where is that 32U coming from?

> +static inline struct mptcp_data_frag *
> +mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
> +		      int offset)
> +{
> +	struct mptcp_data_frag *dfrag;
> +
> +	offset = ALIGN(offset, BITS_PER_LONG / 8);

sizeof(long)?

The concept of keeping page frags for rexmit looks good to me, thanks Paolo
for working on this.

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2019-08-19 12:46 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-16 16:48 [MPTCP] [PRE-RFC 4/6] mptcp: queue data for mptcp level retransmission Paolo Abeni
2019-08-19 11:12 Florian Westphal
2019-08-19 12:46 Paolo Abeni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.