All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mat Martineau <mathew.j.martineau@linux.intel.com>
To: Paolo Abeni <pabeni@redhat.com>
Cc: mptcp@lists.linux.dev
Subject: Re: [PATCH v2 mptcp-next 2/4] mptcp: stop relaying on tcp_tx_skb_cache.
Date: Thu, 2 Sep 2021 17:30:02 -0700 (PDT)	[thread overview]
Message-ID: <14eebc85-4164-7c1c-dece-9cf3dc9d8d9c@linux.intel.com> (raw)
In-Reply-To: <ae127a2e3d2fb31e6f1687cb96128d39490f6be0.1630595171.git.pabeni@redhat.com>

On Thu, 2 Sep 2021, Paolo Abeni wrote:

> We want to revert the skb TX cache, but MPTCP is currently
> using it unconditionally.
>
> Rework the MPTCP tx code, so that tcp_tx_skb_cache is not
> needed anymore: do the whole coalescing check, skb allocation,
> and skb initialization/update inside mptcp_sendmsg_frag(), much
> like the current TCP code.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v1 -> v2:
> - hopefully fix OoB, fetching nr_frags on new skbs
> ---
> net/mptcp/protocol.c | 132 +++++++++++++++++++++++++------------------
> 1 file changed, 77 insertions(+), 55 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index faf6e7000d18..101e61bb2a80 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1224,6 +1224,7 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
> 		if (likely(__mptcp_add_ext(skb, gfp))) {
> 			skb_reserve(skb, MAX_TCP_HEADER);
> 			skb->reserved_tailroom = skb->end - skb->tail;
> +			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);

Is this related to tx_skb_cache? Looks like it could be a -net fix.

> 			return skb;
> 		}
> 		__kfree_skb(skb);
> @@ -1233,31 +1234,23 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
> 	return NULL;
> }
>
> -static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
> +static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
> {
> 	struct sk_buff *skb;
>
> -	if (ssk->sk_tx_skb_cache) {
> -		skb = ssk->sk_tx_skb_cache;
> -		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
> -			     !__mptcp_add_ext(skb, gfp)))
> -			return false;
> -		return true;
> -	}
> -
> 	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
> 	if (!skb)
> -		return false;
> +		return NULL;
>
> 	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
> -		ssk->sk_tx_skb_cache = skb;
> -		return true;
> +		skb_entail(ssk, skb);
> +		return skb;
> 	}
> 	kfree_skb(skb);
> -	return false;
> +	return NULL;
> }
>
> -static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
> +static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
> {
> 	gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
>
> @@ -1287,23 +1280,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
> 			      struct mptcp_sendmsg_info *info)
> {
> 	u64 data_seq = dfrag->data_seq + info->sent;
> +	int offset = dfrag->offset + info->sent;
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> 	bool zero_window_probe = false;
> 	struct mptcp_ext *mpext = NULL;
> -	struct sk_buff *skb, *tail;
> -	bool must_collapse = false;
> -	int size_bias = 0;
> -	int avail_size;
> -	size_t ret = 0;
> +	bool can_coalesce = false;
> +	bool reuse_skb = true;
> +	struct sk_buff *skb;
> +	size_t copy;
> +	int i;
>
> 	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
> 		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
>
> +	if (WARN_ON_ONCE(info->sent > info->limit ||
> +			 info->limit > dfrag->data_len))
> +		return 0;
> +
> 	/* compute send limit */
> 	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
> -	avail_size = info->size_goal;
> +	copy = info->size_goal;
> +
> 	skb = tcp_write_queue_tail(ssk);
> -	if (skb) {
> +	if (skb && (copy > skb->len)) {
> 		/* Limit the write to the size available in the
> 		 * current skb, if any, so that we create at most a new skb.
> 		 * Explicitly tells TCP internals to avoid collapsing on later
> @@ -1316,53 +1315,76 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
> 			goto alloc_skb;
> 		}
>
> -		must_collapse = (info->size_goal - skb->len > 0) &&
> -				(skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
> -		if (must_collapse) {
> -			size_bias = skb->len;
> -			avail_size = info->size_goal - skb->len;
> +		i = skb_shinfo(skb)->nr_frags;
> +		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
> +		if (!can_coalesce && i >= sysctl_max_skb_frags) {
> +			tcp_mark_push(tcp_sk(ssk), skb);
> +			goto alloc_skb;
> 		}
> -	}
>
> +		copy -= skb->len;
> +	} else {
> alloc_skb:
> -	if (!must_collapse && !ssk->sk_tx_skb_cache &&
> -	    !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
> -		return 0;
> +		skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
> +		if (!skb)
> +			return -ENOMEM;
> +
> +		i = skb_shinfo(skb)->nr_frags;
> +		reuse_skb = false;
> +		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
> +	}
>
> 	/* Zero window and all data acked? Probe. */
> -	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
> -	if (avail_size == 0) {
> +	copy = mptcp_check_allowed_size(msk, data_seq, copy);
> +	if (copy == 0) {
> 		u64 snd_una = READ_ONCE(msk->snd_una);
>
> -		if (skb || snd_una != msk->snd_nxt)
> +		if (skb || snd_una != msk->snd_nxt) {
> +			tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
> 			return 0;
> +		}
> +
> 		zero_window_probe = true;
> 		data_seq = snd_una - 1;
> -		avail_size = 1;
> -	}
> +		copy = 1;
>
> -	if (WARN_ON_ONCE(info->sent > info->limit ||
> -			 info->limit > dfrag->data_len))
> -		return 0;
> +		/* all mptcp-level data is acked, no skbs should be present into the
> +		 * ssk write queue
> +		 */
> +		WARN_ON_ONCE(reuse_skb);
> +	}
>
> -	ret = info->limit - info->sent;
> -	tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
> -			      dfrag->page, dfrag->offset + info->sent, &ret);
> -	if (!tail) {
> -		tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
> +	copy = min_t(size_t, copy, info->limit - info->sent);
> +	if (!sk_wmem_schedule(ssk, copy)) {
> +		tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
> 		return -ENOMEM;
> 	}
>
> -	/* if the tail skb is still the cached one, collapsing really happened.
> -	 */
> -	if (skb == tail) {
> -		TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
> -		mpext->data_len += ret;
> +	if (can_coalesce) {
> +		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
> +	} else {
> +		get_page(dfrag->page);
> +		skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
> +	}
> +
> +	skb->len += copy;
> +	skb->data_len += copy;
> +	skb->truesize += copy;
> +	sk_wmem_queued_add(ssk, copy);
> +	sk_mem_charge(ssk, copy);
> +	skb->ip_summed = CHECKSUM_PARTIAL;
> +	WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
> +	TCP_SKB_CB(skb)->end_seq += copy;
> +	tcp_skb_pcount_set(skb, 0);
> +
> +	/* on skb reuse we just need to update the DSS len */
> +	if (reuse_skb) {
> +		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
> +		mpext->data_len += copy;

The WARN_ON_ONCE(!mpext) check should come before this block - while it 
shouldn't happen, a NULL mpext is slightly more plausible in the reuse_skb case.

I'll start some tests running.


Mat


> 		WARN_ON_ONCE(zero_window_probe);
> 		goto out;
> 	}
>
> -	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
> 	if (WARN_ON_ONCE(!mpext)) {
> 		/* should never reach here, stream corrupted */
> 		return -EINVAL;
> @@ -1371,7 +1393,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
> 	memset(mpext, 0, sizeof(*mpext));
> 	mpext->data_seq = data_seq;
> 	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
> -	mpext->data_len = ret;
> +	mpext->data_len = copy;
> 	mpext->use_map = 1;
> 	mpext->dsn64 = 1;
>
> @@ -1380,18 +1402,18 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
> 		 mpext->dsn64);
>
> 	if (zero_window_probe) {
> -		mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
> +		mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
> 		mpext->frozen = 1;
> 		if (READ_ONCE(msk->csum_enabled))
> -			mptcp_update_data_checksum(tail, ret);
> +			mptcp_update_data_checksum(skb, copy);
> 		tcp_push_pending_frames(ssk);
> 		return 0;
> 	}
> out:
> 	if (READ_ONCE(msk->csum_enabled))
> -		mptcp_update_data_checksum(tail, ret);
> -	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
> -	return ret;
> +		mptcp_update_data_checksum(skb, copy);
> +	mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
> +	return copy;
> }
>
> #define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
> -- 
> 2.26.3
>
>
>

--
Mat Martineau
Intel

  reply	other threads:[~2021-09-03  0:30 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-02 15:52 [PATCH v2 mptcp-next 0/4] mptcp: just another xmit path refactor Paolo Abeni
2021-09-02 15:52 ` [PATCH v2 mptcp-next 1/4] tcp: expose the tcp_mark_push() and skb_entail() helpers Paolo Abeni
2021-09-02 15:52 ` [PATCH v2 mptcp-next 2/4] mptcp: stop relaying on tcp_tx_skb_cache Paolo Abeni
2021-09-03  0:30   ` Mat Martineau [this message]
2021-09-03 13:58     ` Paolo Abeni
2021-09-03 17:07       ` Mat Martineau
2021-09-03 11:28   ` Matthieu Baerts
2021-09-03 17:18     ` Paolo Abeni
2021-09-03 17:47       ` Matthieu Baerts
2021-09-04  8:00   ` Matthieu Baerts
2021-09-06  7:10     ` Paolo Abeni
2021-09-02 15:52 ` [PATCH v2 mptcp-next 3/4] Partially revert "tcp: factor out tcp_build_frag()" Paolo Abeni
2021-09-02 15:52 ` [PATCH v2 mptcp-next 4/4] tcp: remove sk_{tr}x_skb_cache Paolo Abeni
2021-09-03 20:11 ` [PATCH v2 mptcp-next 0/4] mptcp: just another xmit path refactor Mat Martineau
2021-09-04  8:00   ` Matthieu Baerts

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=14eebc85-4164-7c1c-dece-9cf3dc9d8d9c@linux.intel.com \
    --to=mathew.j.martineau@linux.intel.com \
    --cc=mptcp@lists.linux.dev \
    --cc=pabeni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.