All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mat Martineau <mathew.j.martineau@linux.intel.com>
To: Paolo Abeni <pabeni@redhat.com>
Cc: mptcp@lists.linux.dev
Subject: Re: [PATCH v2 mptcp-next 2/4] mptcp: stop relaying on tcp_tx_skb_cache.
Date: Thu, 2 Sep 2021 17:30:02 -0700 (PDT)	[thread overview]
Message-ID: <14eebc85-4164-7c1c-dece-9cf3dc9d8d9c@linux.intel.com> (raw)
In-Reply-To: <ae127a2e3d2fb31e6f1687cb96128d39490f6be0.1630595171.git.pabeni@redhat.com>

On Thu, 2 Sep 2021, Paolo Abeni wrote:

> We want to revert the skb TX cache, but MPTCP is currently
> using it unconditionally.
>
> Rework the MPTCP tx code, so that tcp_tx_skb_cache is not
> needed anymore: do the whole coalescing check, skb allocation,
> and skb initialization/update inside mptcp_sendmsg_frag(), much
> like the current TCP code.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v1 -> v2:
> - hopefully fix OoB, fetching nr_frags on new skbs
> ---
> net/mptcp/protocol.c | 132 +++++++++++++++++++++++++------------------
> 1 file changed, 77 insertions(+), 55 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index faf6e7000d18..101e61bb2a80 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1224,6 +1224,7 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
> 		if (likely(__mptcp_add_ext(skb, gfp))) {
> 			skb_reserve(skb, MAX_TCP_HEADER);
> 			skb->reserved_tailroom = skb->end - skb->tail;
> +			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);

Is this related to tx_skb_cache? Looks like it could be a -net fix.

> 			return skb;
> 		}
> 		__kfree_skb(skb);
> @@ -1233,31 +1234,23 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
> 	return NULL;
> }
>
> -static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
> +static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
> {
> 	struct sk_buff *skb;
>
> -	if (ssk->sk_tx_skb_cache) {
> -		skb = ssk->sk_tx_skb_cache;
> -		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
> -			     !__mptcp_add_ext(skb, gfp)))
> -			return false;
> -		return true;
> -	}
> -
> 	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
> 	if (!skb)
> -		return false;
> +		return NULL;
>
> 	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
> -		ssk->sk_tx_skb_cache = skb;
> -		return true;
> +		skb_entail(ssk, skb);
> +		return skb;
> 	}
> 	kfree_skb(skb);
> -	return false;
> +	return NULL;
> }
>
> -static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
> +static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
> {
> 	gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
>
> @@ -1287,23 +1280,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
> 			      struct mptcp_sendmsg_info *info)
> {
> 	u64 data_seq = dfrag->data_seq + info->sent;
> +	int offset = dfrag->offset + info->sent;
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> 	bool zero_window_probe = false;
> 	struct mptcp_ext *mpext = NULL;
> -	struct sk_buff *skb, *tail;
> -	bool must_collapse = false;
> -	int size_bias = 0;
> -	int avail_size;
> -	size_t ret = 0;
> +	bool can_coalesce = false;
> +	bool reuse_skb = true;
> +	struct sk_buff *skb;
> +	size_t copy;
> +	int i;
>
> 	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
> 		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
>
> +	if (WARN_ON_ONCE(info->sent > info->limit ||
> +			 info->limit > dfrag->data_len))
> +		return 0;
> +
> 	/* compute send limit */
> 	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
> -	avail_size = info->size_goal;
> +	copy = info->size_goal;
> +
> 	skb = tcp_write_queue_tail(ssk);
> -	if (skb) {
> +	if (skb && (copy > skb->len)) {
> 		/* Limit the write to the size available in the
> 		 * current skb, if any, so that we create at most a new skb.
> 		 * Explicitly tells TCP internals to avoid collapsing on later
> @@ -1316,53 +1315,76 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
> 			goto alloc_skb;
> 		}
>
> -		must_collapse = (info->size_goal - skb->len > 0) &&
> -				(skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
> -		if (must_collapse) {
> -			size_bias = skb->len;
> -			avail_size = info->size_goal - skb->len;
> +		i = skb_shinfo(skb)->nr_frags;
> +		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
> +		if (!can_coalesce && i >= sysctl_max_skb_frags) {
> +			tcp_mark_push(tcp_sk(ssk), skb);
> +			goto alloc_skb;
> 		}
> -	}
>
> +		copy -= skb->len;
> +	} else {
> alloc_skb:
> -	if (!must_collapse && !ssk->sk_tx_skb_cache &&
> -	    !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
> -		return 0;
> +		skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
> +		if (!skb)
> +			return -ENOMEM;
> +
> +		i = skb_shinfo(skb)->nr_frags;
> +		reuse_skb = false;
> +		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
> +	}
>
> 	/* Zero window and all data acked? Probe. */
> -	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
> -	if (avail_size == 0) {
> +	copy = mptcp_check_allowed_size(msk, data_seq, copy);
> +	if (copy == 0) {
> 		u64 snd_una = READ_ONCE(msk->snd_una);
>
> -		if (skb || snd_una != msk->snd_nxt)
> +		if (skb || snd_una != msk->snd_nxt) {
> +			tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
> 			return 0;
> +		}
> +
> 		zero_window_probe = true;
> 		data_seq = snd_una - 1;
> -		avail_size = 1;
> -	}
> +		copy = 1;
>
> -	if (WARN_ON_ONCE(info->sent > info->limit ||
> -			 info->limit > dfrag->data_len))
> -		return 0;
> +		/* all mptcp-level data is acked, no skbs should be present into the
> +		 * ssk write queue
> +		 */
> +		WARN_ON_ONCE(reuse_skb);
> +	}
>
> -	ret = info->limit - info->sent;
> -	tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
> -			      dfrag->page, dfrag->offset + info->sent, &ret);
> -	if (!tail) {
> -		tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
> +	copy = min_t(size_t, copy, info->limit - info->sent);
> +	if (!sk_wmem_schedule(ssk, copy)) {
> +		tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
> 		return -ENOMEM;
> 	}
>
> -	/* if the tail skb is still the cached one, collapsing really happened.
> -	 */
> -	if (skb == tail) {
> -		TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
> -		mpext->data_len += ret;
> +	if (can_coalesce) {
> +		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
> +	} else {
> +		get_page(dfrag->page);
> +		skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
> +	}
> +
> +	skb->len += copy;
> +	skb->data_len += copy;
> +	skb->truesize += copy;
> +	sk_wmem_queued_add(ssk, copy);
> +	sk_mem_charge(ssk, copy);
> +	skb->ip_summed = CHECKSUM_PARTIAL;
> +	WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
> +	TCP_SKB_CB(skb)->end_seq += copy;
> +	tcp_skb_pcount_set(skb, 0);
> +
> +	/* on skb reuse we just need to update the DSS len */
> +	if (reuse_skb) {
> +		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
> +		mpext->data_len += copy;

The WARN_ON_ONCE(!mpext) check should come before this block - while it 
shouldn't happen, a NULL mpext is slightly more plausible in the reuse_skb case.

I'll start some tests running.


Mat


> 		WARN_ON_ONCE(zero_window_probe);
> 		goto out;
> 	}
>
> -	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
> 	if (WARN_ON_ONCE(!mpext)) {
> 		/* should never reach here, stream corrupted */
> 		return -EINVAL;
> @@ -1371,7 +1393,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
> 	memset(mpext, 0, sizeof(*mpext));
> 	mpext->data_seq = data_seq;
> 	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
> -	mpext->data_len = ret;
> +	mpext->data_len = copy;
> 	mpext->use_map = 1;
> 	mpext->dsn64 = 1;
>
> @@ -1380,18 +1402,18 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
> 		 mpext->dsn64);
>
> 	if (zero_window_probe) {
> -		mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
> +		mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
> 		mpext->frozen = 1;
> 		if (READ_ONCE(msk->csum_enabled))
> -			mptcp_update_data_checksum(tail, ret);
> +			mptcp_update_data_checksum(skb, copy);
> 		tcp_push_pending_frames(ssk);
> 		return 0;
> 	}
> out:
> 	if (READ_ONCE(msk->csum_enabled))
> -		mptcp_update_data_checksum(tail, ret);
> -	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
> -	return ret;
> +		mptcp_update_data_checksum(skb, copy);
> +	mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
> +	return copy;
> }
>
> #define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
> -- 
> 2.26.3
>
>
>

--
Mat Martineau
Intel

  reply	other threads:[~2021-09-03  0:30 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-02 15:52 [PATCH v2 mptcp-next 0/4] mptcp: just another xmit path refactor Paolo Abeni
2021-09-02 15:52 ` [PATCH v2 mptcp-next 1/4] tcp: expose the tcp_mark_push() and skb_entail() helpers Paolo Abeni
2021-09-02 15:52 ` [PATCH v2 mptcp-next 2/4] mptcp: stop relaying on tcp_tx_skb_cache Paolo Abeni
2021-09-03  0:30   ` Mat Martineau [this message]
2021-09-03 13:58     ` Paolo Abeni
2021-09-03 17:07       ` Mat Martineau
2021-09-03 11:28   ` Matthieu Baerts
2021-09-03 17:18     ` Paolo Abeni
2021-09-03 17:47       ` Matthieu Baerts
2021-09-04  8:00   ` Matthieu Baerts
2021-09-06  7:10     ` Paolo Abeni
2021-09-02 15:52 ` [PATCH v2 mptcp-next 3/4] Partially revert "tcp: factor out tcp_build_frag()" Paolo Abeni
2021-09-02 15:52 ` [PATCH v2 mptcp-next 4/4] tcp: remove sk_{tr}x_skb_cache Paolo Abeni
2021-09-03 20:11 ` [PATCH v2 mptcp-next 0/4] mptcp: just another xmit path refactor Mat Martineau
2021-09-04  8:00   ` Matthieu Baerts

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=14eebc85-4164-7c1c-dece-9cf3dc9d8d9c@linux.intel.com \
    --to=mathew.j.martineau@linux.intel.com \
    --cc=mptcp@lists.linux.dev \
    --cc=pabeni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.