* [MPTCP] Re: [mptcp-next] mptcp: track window announced to peer
@ 2020-10-15 13:46 Paolo Abeni
0 siblings, 0 replies; 3+ messages in thread
From: Paolo Abeni @ 2020-10-15 13:46 UTC (permalink / raw)
To: mptcp
[-- Attachment #1: Type: text/plain, Size: 7131 bytes --]
On Thu, 2020-10-15 at 14:45 +0200, Florian Westphal wrote:
> OoO handling attempts to detect when packet is out-of-window by testing
> current ack sequence and remaining space vs. sequence number.
>
> This doesn't work reliably. This stores the highest allowed sequence
> number that we've sent and uses it to detect oow packets.
>
> Do this during mptcp option writing, this handles ACKs sent on
> behalf of tcp_cleanup_rbuf(). We also need to update when a new
> window is chosen, because option writing occurs earlier than this:
>
> 1. mptcp_established_options_dss advances ack_seq
> 2. ack_seq + win is stored
> 3. __tcp_transmit_skb calls tcp_select_window
>
> This would lead to detection of oow packets as saved value was based on
> outdated tp->rcv_wnd.
>
> Signed-off-by: Florian Westphal <fw(a)strlen.de>
> ---
> include/net/mptcp.h | 3 +++
> net/ipv4/tcp_output.c | 2 ++
> net/mptcp/options.c | 13 +++++++++++--
> net/mptcp/protocol.c | 10 +++++-----
> net/mptcp/protocol.h | 1 +
> net/mptcp/subflow.c | 10 ++++++++++
> 6 files changed, 32 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index 6e706d838e4e..177735615ef6 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -63,6 +63,8 @@ extern struct request_sock_ops mptcp_subflow_request_sock_ops;
>
> void mptcp_init(void);
>
> +void mptcp_set_rwin(const struct sock *ssk, u32 rw);
> +
> static inline bool sk_is_mptcp(const struct sock *sk)
> {
> return tcp_sk(sk)->is_mptcp;
> @@ -220,6 +222,7 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
>
> static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { }
> static inline void mptcp_seq_show(struct seq_file *seq) { }
> +static inline void mptcp_set_rwin(const struct sock *ssk, u32 rw) { }
>
> static inline int mptcp_subflow_init_cookie_req(struct request_sock *req,
> const struct sock *sk_listener,
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 9abf5a0358d5..89946e451365 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -279,6 +279,8 @@ static u16 tcp_select_window(struct sock *sk)
> }
> tp->rcv_wnd = new_win;
> tp->rcv_wup = tp->rcv_nxt;
> + if (sk_is_mptcp(sk))
> + mptcp_set_rwin(sk, new_win);
>
> /* Make sure we do not exceed the maximum possible
> * scaled window.
I'm wondering if we could avoid this extra hook moving option writing
after tcp wind selection, passing the tcp_sock to mptcp_write_options
and fetch rcv_wup there.
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index d601dd1ebe5e..0babe13780cc 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -489,6 +489,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
> struct mptcp_ext *mpext;
> unsigned int ack_size;
> bool ret = false;
> + u64 ack_seq;
>
> mpext = skb ? mptcp_get_ext(skb) : NULL;
> snd_data_fin_enable = mptcp_data_fin_enabled(msk);
> @@ -517,15 +518,23 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
> return ret;
> }
>
> + ack_seq = READ_ONCE(msk->ack_seq);
> +
> if (READ_ONCE(msk->use_64bit_ack)) {
> ack_size = TCPOLEN_MPTCP_DSS_ACK64;
> - opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
> + opts->ext_copy.data_ack = ack_seq;
> opts->ext_copy.ack64 = 1;
> } else {
> ack_size = TCPOLEN_MPTCP_DSS_ACK32;
> - opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq);
> + opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
> opts->ext_copy.ack64 = 0;
> }
> +
> + ack_seq += tcp_sk(sk)->rcv_wnd;
> +
> + if (after64(ack_seq, msk->rcv_wnd_sent))
> + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> +
Don't we need to READ_ONCE() the msk wnd ? For TCP-generated acks or
TCP retransmissions, here we could be outside the msk socket lock.
> opts->ext_copy.use_ack = 1;
>
> /* Add kind/length/subtype/flag overhead if mapping is not populated */
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 95121d9116cb..6917ed13c8d5 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -168,19 +168,17 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
> struct rb_node **p, *parent;
> u64 seq, end_seq, max_seq;
> struct sk_buff *skb1;
> - int space;
>
> seq = MPTCP_SKB_CB(skb)->map_seq;
> end_seq = MPTCP_SKB_CB(skb)->end_seq;
> - space = tcp_space(sk);
> - max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq;
> + max_seq = READ_ONCE(msk->rcv_wnd_sent);
>
> pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
> RB_EMPTY_ROOT(&msk->out_of_order_queue));
> - if (after64(seq, max_seq)) {
> + if (after64(end_seq, max_seq)) {
> /* out of window */
> mptcp_drop(sk, skb);
> - pr_info("oow by %ld\n", (unsigned long)seq - (unsigned long)max_seq);
> + pr_info("oow by %ld, rcv_wnd_sent %lu\n", (unsigned long)end_seq - (unsigned long)max_seq, (long)msk->rcv_wnd_sent);
> MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
> return;
> }
> @@ -2273,6 +2271,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
> mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
> ack_seq++;
> WRITE_ONCE(msk->ack_seq, ack_seq);
> + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> }
>
> #if !IS_ENABLED(CONFIG_KASAN)
> @@ -2582,6 +2581,7 @@ void mptcp_finish_connect(struct sock *ssk)
> WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
> WRITE_ONCE(msk->snd_nxt, msk->write_seq);
> WRITE_ONCE(msk->ack_seq, ack_seq);
> + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> WRITE_ONCE(msk->can_ack, 1);
> atomic64_set(&msk->snd_una, msk->write_seq);
>
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 2f2e0b64ec58..ea8d6d9a6224 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -211,6 +211,7 @@ struct mptcp_sock {
> u64 write_seq;
> u64 snd_nxt;
> u64 ack_seq;
> + u64 rcv_wnd_sent;
> u64 rcv_data_fin_seq;
> struct sock *last_snd;
> int snd_burst;
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 00040dee323e..66a1ec338fdc 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -961,6 +961,16 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
> *full_space = tcp_full_space(sk);
> }
>
> +void mptcp_set_rwin(const struct sock *ssk, u32 rwin)
> +{
> + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
> + struct mptcp_sock *msk = mptcp_sk(subflow->conn);
> + u64 ack_seq = READ_ONCE(msk->ack_seq) + rwin;
> +
> + if (after64(ack_seq, msk->rcv_wnd_sent))
> + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> +}
> +
> static void subflow_data_ready(struct sock *sk)
> {
> struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
Looks very good to me. I'm trying a few iteration of simult xmit self
test to see how it goes...
/P
^ permalink raw reply [flat|nested] 3+ messages in thread
* [MPTCP] Re: [mptcp-next] mptcp: track window announced to peer
@ 2020-10-15 14:49 Paolo Abeni
0 siblings, 0 replies; 3+ messages in thread
From: Paolo Abeni @ 2020-10-15 14:49 UTC (permalink / raw)
To: mptcp
[-- Attachment #1: Type: text/plain, Size: 2480 bytes --]
On Thu, 2020-10-15 at 15:59 +0200, Florian Westphal wrote:
> Paolo Abeni <pabeni(a)redhat.com> wrote:
> > > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> > > index 9abf5a0358d5..89946e451365 100644
> > > --- a/net/ipv4/tcp_output.c
> > > +++ b/net/ipv4/tcp_output.c
> > > @@ -279,6 +279,8 @@ static u16 tcp_select_window(struct sock *sk)
> > > }
> > > tp->rcv_wnd = new_win;
> > > tp->rcv_wup = tp->rcv_nxt;
> > > + if (sk_is_mptcp(sk))
> > > + mptcp_set_rwin(sk, new_win);
> > >
> > > /* Make sure we do not exceed the maximum possible
> > > * scaled window.
> >
> > I'm wondering if we could avoid this extra hook moving option writing
> > after tcp wind selection, passing the tcp_sock to mptcp_write_options
> > and fetch rcv_wup there.
>
> Maybe, I agree the extra hooking is stupid.
> I'll have a look.
>
> > > if (READ_ONCE(msk->use_64bit_ack)) {
> > > ack_size = TCPOLEN_MPTCP_DSS_ACK64;
> > > - opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
> > > + opts->ext_copy.data_ack = ack_seq;
> > > opts->ext_copy.ack64 = 1;
> > > } else {
> > > ack_size = TCPOLEN_MPTCP_DSS_ACK32;
> > > - opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq);
> > > + opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
> > > opts->ext_copy.ack64 = 0;
> > > }
> > > +
> > > + ack_seq += tcp_sk(sk)->rcv_wnd;
> > > +
> > > + if (after64(ack_seq, msk->rcv_wnd_sent))
> > > + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> > > +
> >
> > Don't we need to READ_ONCE() the msk wnd ?
>
> wnd? You mean tcp_sk()->rcv_wnd?
Yep, that is!
> > For TCP-generated acks or
> > TCP retransmissions, here we could be outside the msk socket lock.
>
> Yes, we can't assume msk lock is held.
>
> > Looks very good to me. I'm trying a few iteration of simult xmit self
> > test to see how it goes...
>
> The alternative would be to relax the OOW test and allow a cushion,
> maybe based on msk rmem space, before deciding we have OoW packet and
> dropping it.
This code is working greatly! No more OoW here and self-test are much
more stable. There are still quite rare outlier but I think they are
unrelated to mptcp code (likely something else scheduled on the CPU
running the VM - still to be investigated).
Trimming the extra hook is not critical, I think, but would be nice if
doable without much larger changes. Otherwise I'm more than fine also
this way!
Cheers,
Paolo
^ permalink raw reply [flat|nested] 3+ messages in thread
* [MPTCP] Re: [mptcp-next] mptcp: track window announced to peer
@ 2020-10-15 13:59 Florian Westphal
0 siblings, 0 replies; 3+ messages in thread
From: Florian Westphal @ 2020-10-15 13:59 UTC (permalink / raw)
To: mptcp
[-- Attachment #1: Type: text/plain, Size: 1860 bytes --]
Paolo Abeni <pabeni(a)redhat.com> wrote:
> > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> > index 9abf5a0358d5..89946e451365 100644
> > --- a/net/ipv4/tcp_output.c
> > +++ b/net/ipv4/tcp_output.c
> > @@ -279,6 +279,8 @@ static u16 tcp_select_window(struct sock *sk)
> > }
> > tp->rcv_wnd = new_win;
> > tp->rcv_wup = tp->rcv_nxt;
> > + if (sk_is_mptcp(sk))
> > + mptcp_set_rwin(sk, new_win);
> >
> > /* Make sure we do not exceed the maximum possible
> > * scaled window.
>
> I'm wondering if we could avoid this extra hook moving option writing
> after tcp wind selection, passing the tcp_sock to mptcp_write_options
> and fetch rcv_wup there.
Maybe, I agree the extra hooking is stupid.
I'll have a look.
> > if (READ_ONCE(msk->use_64bit_ack)) {
> > ack_size = TCPOLEN_MPTCP_DSS_ACK64;
> > - opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
> > + opts->ext_copy.data_ack = ack_seq;
> > opts->ext_copy.ack64 = 1;
> > } else {
> > ack_size = TCPOLEN_MPTCP_DSS_ACK32;
> > - opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq);
> > + opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
> > opts->ext_copy.ack64 = 0;
> > }
> > +
> > + ack_seq += tcp_sk(sk)->rcv_wnd;
> > +
> > + if (after64(ack_seq, msk->rcv_wnd_sent))
> > + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> > +
>
> Don't we need to READ_ONCE() the msk wnd ?
wnd? You mean tcp_sk()->rcv_wnd?
> For TCP-generated acks or
> TCP retransmissions, here we could be outside the msk socket lock.
Yes, we can't assume msk lock is held.
> Looks very good to me. I'm trying a few iteration of simult xmit self
> test to see how it goes...
The alternative would be to relax the OOW test and allow a cushion,
maybe based on msk rmem space, before deciding we have OoW packet and
dropping it.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2020-10-15 14:49 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-15 13:46 [MPTCP] Re: [mptcp-next] mptcp: track window announced to peer Paolo Abeni
2020-10-15 13:59 Florian Westphal
2020-10-15 14:49 Paolo Abeni
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.