* [PATCH mptcp] mptcp: do not shrink snd_nxt when recovering
@ 2021-08-26 23:43 Florian Westphal
2021-08-27 14:33 ` Paolo Abeni
2021-08-28 0:45 ` Mat Martineau
0 siblings, 2 replies; 3+ messages in thread
From: Florian Westphal @ 2021-08-26 23:43 UTC (permalink / raw)
To: mptcp; +Cc: Florian Westphal
When recovering after a link failure, snd_nxt should not be set to a
lower value. Else, update of snd_nxt is broken because:
msk->snd_nxt += ret; (where ret is number of bytes sent)
assumes that snd_nxt always moves forward.
After reduction, it's possible that snd_nxt update gets out of sync:
dfrag we just sent might have had a data sequence number even past
recovery_snd_nxt.
This change factors the common msk state update to a helper
and updates snd_nxt based on the current dfrag data sequence number.
The conditional is required for the recovery phase where we may
re-transmit old dfrags that are before current snd_nxt.
After this change, snd_nxt only moves forward and covers all in-sequence
data that was transmitted.
recovery_snd_nxt is retained to detect when recovery has completed.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
Paolo, I think this is in line with what we discussed earlier today.
This does NOT resolve issues/226, but as far as I can tell the
failure rate is cut in half: 500 runs of mptcp_join -l had 24 vs. 7
failures.
I also no longer see
ack_update_msk: invalid una (96072 or 424988 bytes, 1 below write_seq, cnt 22715
debug log from ack_update_msk(), printed when we ignore advertised
new_snd_una while msk->recovery flag is set.
net/mptcp/options.c | 8 +++-----
net/mptcp/protocol.c | 45 ++++++++++++++++++++++++++++++++------------
2 files changed, 36 insertions(+), 17 deletions(-)
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index f5fa8e7efd9c..12fcf7553d08 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1019,11 +1019,9 @@ static void ack_update_msk(struct mptcp_sock *msk,
old_snd_una = msk->snd_una;
new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
- /* ACK for data not even sent yet and even above recovery bound? Ignore.*/
- if (unlikely(after64(new_snd_una, snd_nxt))) {
- if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt))
- new_snd_una = old_snd_una;
- }
+ /* ACK for data not even sent yet? Ignore.*/
+ if (unlikely(after64(new_snd_una, snd_nxt)))
+ new_snd_una = old_snd_una;
new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index af8f677162f3..385405cf4626 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1507,6 +1507,34 @@ static void mptcp_push_release(struct sock *sk, struct sock *ssk,
release_sock(ssk);
}
+static void mptcp_update_post_push(struct mptcp_sock *msk,
+ struct mptcp_data_frag *dfrag,
+ u32 sent)
+{
+ u64 snd_nxt_new = dfrag->data_seq;
+
+ dfrag->already_sent += sent;
+
+ msk->snd_burst -= sent;
+ msk->tx_pending_data -= sent;
+
+ snd_nxt_new += dfrag->already_sent;
+
+ /* snd_nxt_new can be smaller than snd_nxt in case mptcp
+ * is recovering after a failover. In that event, this re-sends
+ * old segments.
+ *
+ * Thus compute snd_nxt_new candidate based on
+ * the dfrag->data_seq that was sent and the data
+ * that has been handed to the subflow for transmission
+ * and skip update in case it was old dfrag.
+ */
+ if (likely(after64(snd_nxt_new, msk->snd_nxt)))
+ msk->snd_nxt = snd_nxt_new;
+
+ WARN_ON_ONCE(after64(msk->snd_nxt, msk->write_seq));
+}
+
void __mptcp_push_pending(struct sock *sk, unsigned int flags)
{
struct sock *prev_ssk = NULL, *ssk = NULL;
@@ -1550,12 +1578,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
}
info.sent += ret;
- dfrag->already_sent += ret;
- msk->snd_nxt += ret;
- msk->snd_burst -= ret;
- msk->tx_pending_data -= ret;
copied += ret;
len -= ret;
+
+ mptcp_update_post_push(msk, dfrag, ret);
}
WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
@@ -1608,13 +1634,11 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
goto out;
info.sent += ret;
- dfrag->already_sent += ret;
- msk->snd_nxt += ret;
- msk->snd_burst -= ret;
- msk->tx_pending_data -= ret;
copied += ret;
len -= ret;
first = false;
+
+ mptcp_update_post_push(msk, dfrag, ret);
}
WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
@@ -2212,15 +2236,12 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
return false;
}
- /* will accept ack for reijected data before re-sending them */
- if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt))
- msk->recovery_snd_nxt = msk->snd_nxt;
+ msk->recovery_snd_nxt = msk->snd_nxt;
msk->recovery = true;
mptcp_data_unlock(sk);
msk->first_pending = rtx_head;
msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq;
- msk->snd_nxt = rtx_head->data_seq;
msk->snd_burst = 0;
/* be sure to clear the "sent status" on all re-injected fragments */
--
2.31.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH mptcp] mptcp: do not shrink snd_nxt when recovering
2021-08-26 23:43 [PATCH mptcp] mptcp: do not shrink snd_nxt when recovering Florian Westphal
@ 2021-08-27 14:33 ` Paolo Abeni
2021-08-28 0:45 ` Mat Martineau
1 sibling, 0 replies; 3+ messages in thread
From: Paolo Abeni @ 2021-08-27 14:33 UTC (permalink / raw)
To: Florian Westphal, mptcp
On Fri, 2021-08-27 at 01:43 +0200, Florian Westphal wrote:
> When recovering after a link failure, snd_nxt should not be set to a
> lower value. Else, update of snd_nxt is broken because:
>
> msk->snd_nxt += ret; (where ret is number of bytes sent)
>
> assumes that snd_nxt always moves forward.
> After reduction, its possible that snd_nxt update gets out of sync:
> dfrag we just sent might have had a data sequence number even past
> snd_nxt_recovery.
>
> This change factors the common msk state update to a helper
> and updates snd_nxt based on the current dfrag data sequence number.
>
> The conditional is required for the recovery phase where we may
> re-transmit old dfrags that are before current snd_nxt.
>
> After this change, snd_nxt only moves forward and covers all in-sequence
> data that was transmitted.
>
> recovery_snd_nxt is retained to detect when recovery has completed.
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> Paolo, I think this is in line with what we discussed earlier today.
>
> This does NOT resolve issues/226, but as far as i can tell the
> failure rate is cut in half: 500 runs of mptcp_join -l had 24 vs. 7
> failures.
>
> I also no longer see
>
> ack_update_msk: invalid una (96072 or 424988 bytes, 1 below write_seq, cnt 22715
>
> debug log from ack_update_msk(), printed when we ignore advertised
> new_snd_una while msk->recovery flag is set.
>
> net/mptcp/options.c | 8 +++-----
> net/mptcp/protocol.c | 45 ++++++++++++++++++++++++++++++++------------
> 2 files changed, 36 insertions(+), 17 deletions(-)
>
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index f5fa8e7efd9c..12fcf7553d08 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -1019,11 +1019,9 @@ static void ack_update_msk(struct mptcp_sock *msk,
> old_snd_una = msk->snd_una;
> new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
>
> - /* ACK for data not even sent yet and even above recovery bound? Ignore.*/
> - if (unlikely(after64(new_snd_una, snd_nxt))) {
> - if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt))
> - new_snd_una = old_snd_una;
> - }
> + /* ACK for data not even sent yet? Ignore.*/
> + if (unlikely(after64(new_snd_una, snd_nxt)))
> + new_snd_una = old_snd_una;
>
> new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index af8f677162f3..385405cf4626 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1507,6 +1507,34 @@ static void mptcp_push_release(struct sock *sk, struct sock *ssk,
> release_sock(ssk);
> }
>
> +static void mptcp_update_post_push(struct mptcp_sock *msk,
> + struct mptcp_data_frag *dfrag,
> + u32 sent)
> +{
> + u64 snd_nxt_new = dfrag->data_seq;
> +
> + dfrag->already_sent += sent;
> +
> + msk->snd_burst -= sent;
> + msk->tx_pending_data -= sent;
> +
> + snd_nxt_new += dfrag->already_sent;
> +
> + /* snd_nxt_new can be smaller than snd_nxt in case mptcp
> + * is recovering after a failover. In that event, this re-sends
> + * old segments.
> + *
> + * Thus compute snd_nxt_new candidate based on
> + * the dfrag->data_seq that was sent and the data
> + * that has been handed to the subflow for transmission
> + * and skip update in case it was old dfrag.
> + */
> + if (likely(after64(snd_nxt_new, msk->snd_nxt)))
> + msk->snd_nxt = snd_nxt_new;
> +
> + WARN_ON_ONCE(after64(msk->snd_nxt, msk->write_seq));
I think we could drop this WARN_ON(), as 'snd_nxt_new' never exceeds
the maximum sequence number in the MPTCP tx queue, which in turn never
exceeds write_seq. In any case, no big objections on my side.
> +}
> +
> void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> {
> struct sock *prev_ssk = NULL, *ssk = NULL;
> @@ -1550,12 +1578,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> }
>
> info.sent += ret;
> - dfrag->already_sent += ret;
> - msk->snd_nxt += ret;
> - msk->snd_burst -= ret;
> - msk->tx_pending_data -= ret;
> copied += ret;
> len -= ret;
> +
> + mptcp_update_post_push(msk, dfrag, ret);
> }
> WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> }
> @@ -1608,13 +1634,11 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
> goto out;
>
> info.sent += ret;
> - dfrag->already_sent += ret;
> - msk->snd_nxt += ret;
> - msk->snd_burst -= ret;
> - msk->tx_pending_data -= ret;
> copied += ret;
> len -= ret;
> first = false;
> +
> + mptcp_update_post_push(msk, dfrag, ret);
> }
> WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> }
> @@ -2212,15 +2236,12 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
> return false;
> }
>
> - /* will accept ack for reijected data before re-sending them */
> - if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt))
> - msk->recovery_snd_nxt = msk->snd_nxt;
> + msk->recovery_snd_nxt = msk->snd_nxt;
> msk->recovery = true;
> mptcp_data_unlock(sk);
>
> msk->first_pending = rtx_head;
> msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq;
> - msk->snd_nxt = rtx_head->data_seq;
> msk->snd_burst = 0;
>
> /* be sure to clear the "sent status" on all re-injected fragments */
otherwise LGTM! Thanks Florian!
Paolo
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH mptcp] mptcp: do not shrink snd_nxt when recovering
2021-08-26 23:43 [PATCH mptcp] mptcp: do not shrink snd_nxt when recovering Florian Westphal
2021-08-27 14:33 ` Paolo Abeni
@ 2021-08-28 0:45 ` Mat Martineau
1 sibling, 0 replies; 3+ messages in thread
From: Mat Martineau @ 2021-08-28 0:45 UTC (permalink / raw)
To: Florian Westphal; +Cc: mptcp
On Fri, 27 Aug 2021, Florian Westphal wrote:
> When recovering after a link failure, snd_nxt should not be set to a
> lower value. Else, update of snd_nxt is broken because:
>
> msk->snd_nxt += ret; (where ret is number of bytes sent)
>
> assumes that snd_nxt always moves forward.
> After reduction, its possible that snd_nxt update gets out of sync:
> dfrag we just sent might have had a data sequence number even past
> snd_nxt_recovery.
Do you mean recovery_snd_nxt?
>
> This change factors the common msk state update to a helper
> and updates snd_nxt based on the current dfrag data sequence number.
>
> The conditional is required for the recovery phase where we may
> re-transmit old dfrags that are before current snd_nxt.
>
> After this change, snd_nxt only moves forward and covers all in-sequence
> data that was transmitted.
>
> recovery_snd_nxt is retained to detect when recovery has completed.
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
Will need a Fixes tag for the -net tree.
Code looks good though, I don't have any changes to suggest.
-Mat
> ---
> Paolo, I think this is in line with what we discussed earlier today.
>
> This does NOT resolve issues/226, but as far as i can tell the
> failure rate is cut in half: 500 runs of mptcp_join -l had 24 vs. 7
> failures.
>
> I also no longer see
>
> ack_update_msk: invalid una (96072 or 424988 bytes, 1 below write_seq, cnt 22715
>
> debug log from ack_update_msk(), printed when we ignore advertised
> new_snd_una while msk->recovery flag is set.
>
> net/mptcp/options.c | 8 +++-----
> net/mptcp/protocol.c | 45 ++++++++++++++++++++++++++++++++------------
> 2 files changed, 36 insertions(+), 17 deletions(-)
>
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index f5fa8e7efd9c..12fcf7553d08 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -1019,11 +1019,9 @@ static void ack_update_msk(struct mptcp_sock *msk,
> old_snd_una = msk->snd_una;
> new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
>
> - /* ACK for data not even sent yet and even above recovery bound? Ignore.*/
> - if (unlikely(after64(new_snd_una, snd_nxt))) {
> - if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt))
> - new_snd_una = old_snd_una;
> - }
> + /* ACK for data not even sent yet? Ignore.*/
> + if (unlikely(after64(new_snd_una, snd_nxt)))
> + new_snd_una = old_snd_una;
>
> new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index af8f677162f3..385405cf4626 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1507,6 +1507,34 @@ static void mptcp_push_release(struct sock *sk, struct sock *ssk,
> release_sock(ssk);
> }
>
> +static void mptcp_update_post_push(struct mptcp_sock *msk,
> + struct mptcp_data_frag *dfrag,
> + u32 sent)
> +{
> + u64 snd_nxt_new = dfrag->data_seq;
> +
> + dfrag->already_sent += sent;
> +
> + msk->snd_burst -= sent;
> + msk->tx_pending_data -= sent;
> +
> + snd_nxt_new += dfrag->already_sent;
> +
> + /* snd_nxt_new can be smaller than snd_nxt in case mptcp
> + * is recovering after a failover. In that event, this re-sends
> + * old segments.
> + *
> + * Thus compute snd_nxt_new candidate based on
> + * the dfrag->data_seq that was sent and the data
> + * that has been handed to the subflow for transmission
> + * and skip update in case it was old dfrag.
> + */
> + if (likely(after64(snd_nxt_new, msk->snd_nxt)))
> + msk->snd_nxt = snd_nxt_new;
> +
> + WARN_ON_ONCE(after64(msk->snd_nxt, msk->write_seq));
> +}
> +
> void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> {
> struct sock *prev_ssk = NULL, *ssk = NULL;
> @@ -1550,12 +1578,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> }
>
> info.sent += ret;
> - dfrag->already_sent += ret;
> - msk->snd_nxt += ret;
> - msk->snd_burst -= ret;
> - msk->tx_pending_data -= ret;
> copied += ret;
> len -= ret;
> +
> + mptcp_update_post_push(msk, dfrag, ret);
> }
> WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> }
> @@ -1608,13 +1634,11 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
> goto out;
>
> info.sent += ret;
> - dfrag->already_sent += ret;
> - msk->snd_nxt += ret;
> - msk->snd_burst -= ret;
> - msk->tx_pending_data -= ret;
> copied += ret;
> len -= ret;
> first = false;
> +
> + mptcp_update_post_push(msk, dfrag, ret);
> }
> WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> }
> @@ -2212,15 +2236,12 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
> return false;
> }
>
> - /* will accept ack for reijected data before re-sending them */
> - if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt))
> - msk->recovery_snd_nxt = msk->snd_nxt;
> + msk->recovery_snd_nxt = msk->snd_nxt;
> msk->recovery = true;
> mptcp_data_unlock(sk);
>
> msk->first_pending = rtx_head;
> msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq;
> - msk->snd_nxt = rtx_head->data_seq;
> msk->snd_burst = 0;
>
> /* be sure to clear the "sent status" on all re-injected fragments */
> --
> 2.31.1
>
>
>
--
Mat Martineau
Intel
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2021-08-28 0:45 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-26 23:43 [PATCH mptcp] mptcp: do not shrink snd_nxt when recovering Florian Westphal
2021-08-27 14:33 ` Paolo Abeni
2021-08-28 0:45 ` Mat Martineau
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).