* [PATCH mptcp-next v5 00/11] refactor push pending
@ 2022-10-06 12:17 Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers" Geliang Tang
                   ` (12 more replies)
  0 siblings, 13 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

v5:
- address Mat's comments in v4.

v4:
- update __mptcp_subflow_push_pending as Mat suggested.
- add more patches from "BPF redundant scheduler" series.

v3:
- add a cleanup patch.
- remove msk->last_snd in mptcp_subflow_get_send().
- add the loop that calls the scheduler again in __mptcp_push_pending().

v2:
- add snd_burst check in dfrags loop as Mat suggested.

Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
remove duplicate code and to make it easier to support the redundant
scheduler in __mptcp_subflow_push_pending().

Geliang Tang (11):
  Squash to "mptcp: add get_subflow wrappers"
  mptcp: 'first' argument for subflow_push_pending
  mptcp: refactor push_pending logic
  mptcp: drop last_snd for burst scheduler
  mptcp: simplify push_pending
  mptcp: multi subflows push_pending
  mptcp: use msk instead of mptcp_sk
  mptcp: refactor subflow_push_pending logic
  mptcp: simplify subflow_push_pending
  mptcp: multi subflows subflow_push_pending
  mptcp: multi subflows retrans support

 net/mptcp/pm.c         |   9 +-
 net/mptcp/pm_netlink.c |   3 -
 net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
 net/mptcp/protocol.h   |   5 +-
 net/mptcp/sched.c      |  61 +++++----
 5 files changed, 184 insertions(+), 179 deletions(-)

-- 
2.35.3


* [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers"
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 02/11] mptcp: 'first' argument for subflow_push_pending Geliang Tang
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

Please update the commit log:

'''
This patch defines two new wrappers, mptcp_sched_get_send() and
mptcp_sched_get_retrans(), which invoke the get_subflow() callback of
msk->sched.

Set the subflow pointers array in struct mptcp_sched_data before invoking
get_subflow(), so that it can be used by get_subflow() in BPF contexts.

Check the subflows' scheduled flags to determine which subflow or
subflows were picked by the scheduler.

Move sock_owned_by_me() and the fallback check code from
mptcp_subflow_get_send/retrans() into the wrappers.
'''
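
For reference, this is roughly how callers consume the new int-returning
wrappers once the later patches of this series are applied (a sketch
condensed from patch 6, not part of this patch):

/* mptcp_sched_get_send() returns 0 on success and marks the picked
 * subflow(s) via the 'scheduled' flag, so callers walk the subflow
 * list instead of receiving a single ssk pointer.
 */
if (!mptcp_sched_get_send(msk)) {
	struct mptcp_subflow_context *subflow;

	mptcp_for_each_subflow(msk, subflow) {
		if (READ_ONCE(subflow->scheduled)) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			/* ... push pending data on ssk ... */
			mptcp_subflow_set_scheduled(subflow, false);
		}
	}
}
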
---
 net/mptcp/protocol.c |  8 +++---
 net/mptcp/protocol.h |  4 +--
 net/mptcp/sched.c    | 59 +++++++++++++++++++++-----------------------
 3 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8feb684408f7..d500a00fa778 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1547,7 +1547,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 			int ret = 0;
 
 			prev_ssk = ssk;
-			ssk = mptcp_sched_get_send(msk);
+			ssk = mptcp_subflow_get_send(msk);
 
 			/* First check. If the ssk has changed since
 			 * the last round, release prev_ssk
@@ -1616,7 +1616,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 			 * check for a different subflow usage only after
 			 * spooling the first chunk of data
 			 */
-			xmit_ssk = first ? ssk : mptcp_sched_get_send(mptcp_sk(sk));
+			xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
 			if (!xmit_ssk)
 				goto out;
 			if (xmit_ssk != ssk) {
@@ -2478,7 +2478,7 @@ static void __mptcp_retrans(struct sock *sk)
 	mptcp_clean_una_wakeup(sk);
 
 	/* first check ssk: need to kick "stale" logic */
-	ssk = mptcp_sched_get_retrans(msk);
+	ssk = mptcp_subflow_get_retrans(msk);
 	dfrag = mptcp_rtx_head(sk);
 	if (!dfrag) {
 		if (mptcp_data_fin_enabled(msk)) {
@@ -3196,7 +3196,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 		return;
 
 	if (!sock_owned_by_user(sk)) {
-		struct sock *xmit_ssk = mptcp_sched_get_send(mptcp_sk(sk));
+		struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
 
 		if (xmit_ssk == ssk)
 			__mptcp_subflow_push_pending(sk, ssk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 93c535440a5c..e81399debff9 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -640,8 +640,8 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
 				 bool scheduled);
 struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
 struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
-struct sock *mptcp_sched_get_send(struct mptcp_sock *msk);
-struct sock *mptcp_sched_get_retrans(struct mptcp_sock *msk);
+int mptcp_sched_get_send(struct mptcp_sock *msk);
+int mptcp_sched_get_retrans(struct mptcp_sock *msk);
 
 static inline bool __tcp_can_send(const struct sock *ssk)
 {
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index 044c5ec8bbfb..d1cad1eefb35 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -114,67 +114,64 @@ static int mptcp_sched_data_init(struct mptcp_sock *msk, bool reinject,
 	for (; i < MPTCP_SUBFLOWS_MAX; i++)
 		data->contexts[i] = NULL;
 
+	msk->snd_burst = 0;
+
 	return 0;
 }
 
-struct sock *mptcp_sched_get_send(struct mptcp_sock *msk)
+int mptcp_sched_get_send(struct mptcp_sock *msk)
 {
 	struct mptcp_sched_data data;
 	struct sock *ssk = NULL;
-	int i;
 
 	sock_owned_by_me((struct sock *)msk);
 
 	/* the following check is moved out of mptcp_subflow_get_send */
 	if (__mptcp_check_fallback(msk)) {
-		if (!msk->first)
-			return NULL;
-		return __tcp_can_send(msk->first) &&
-		       sk_stream_memory_free(msk->first) ? msk->first : NULL;
+		if (msk->first &&
+		    __tcp_can_send(msk->first) &&
+		    sk_stream_memory_free(msk->first)) {
+			mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
+			return 0;
+		}
+		return -EINVAL;
 	}
 
-	if (!msk->sched)
-		return mptcp_subflow_get_send(msk);
+	if (!msk->sched) {
+		ssk = mptcp_subflow_get_send(msk);
+		if (!ssk)
+			return -EINVAL;
+		mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+		return 0;
+	}
 
 	mptcp_sched_data_init(msk, false, &data);
 	msk->sched->get_subflow(msk, &data);
 
-	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
-		if (data.contexts[i] && READ_ONCE(data.contexts[i]->scheduled)) {
-			ssk = data.contexts[i]->tcp_sock;
-			msk->last_snd = ssk;
-			break;
-		}
-	}
-
-	return ssk;
+	return 0;
 }
 
-struct sock *mptcp_sched_get_retrans(struct mptcp_sock *msk)
+int mptcp_sched_get_retrans(struct mptcp_sock *msk)
 {
 	struct mptcp_sched_data data;
 	struct sock *ssk = NULL;
-	int i;
 
 	sock_owned_by_me((const struct sock *)msk);
 
 	/* the following check is moved out of mptcp_subflow_get_retrans */
 	if (__mptcp_check_fallback(msk))
-		return NULL;
+		return -EINVAL;
 
-	if (!msk->sched)
-		return mptcp_subflow_get_retrans(msk);
+	if (!msk->sched) {
+		ssk = mptcp_subflow_get_retrans(msk);
+		if (!ssk)
+			return -EINVAL;
+		mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+		return 0;
+	}
 
 	mptcp_sched_data_init(msk, true, &data);
 	msk->sched->get_subflow(msk, &data);
 
-	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
-		if (data.contexts[i] && READ_ONCE(data.contexts[i]->scheduled)) {
-			ssk = data.contexts[i]->tcp_sock;
-			msk->last_snd = ssk;
-			break;
-		}
-	}
-
-	return ssk;
+	return 0;
 }
-- 
2.35.3


* [PATCH mptcp-next v5 02/11] mptcp: 'first' argument for subflow_push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers" Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic Geliang Tang
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

The function mptcp_subflow_process_delegated() uses the input ssk first,
while __mptcp_check_push() invokes the packet scheduler first.

So this patch adds a new argument named 'first' to the function
__mptcp_subflow_push_pending() to deal with these two cases separately.

With this change, the code that invokes the packet scheduler in the
function __mptcp_check_push() can be removed and replaced by invoking
__mptcp_subflow_push_pending() directly.
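
Condensed, the two call sites then look like this (simplified from the
diff below, not new code):

/* mptcp_subflow_process_delegated(): ssk was picked via delegation,
 * so spool the first chunk on it before consulting the scheduler.
 */
__mptcp_subflow_push_pending(sk, ssk, true);

/* __mptcp_check_push(): no subflow picked yet, let
 * __mptcp_subflow_push_pending() invoke the scheduler itself.
 */
__mptcp_subflow_push_pending(sk, ssk, false);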

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d500a00fa778..84d33393d24e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1593,7 +1593,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 		__mptcp_check_send_data_fin(sk);
 }
 
-static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
+static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
+					 bool first)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_sendmsg_info info = {
@@ -1602,7 +1603,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 	struct mptcp_data_frag *dfrag;
 	struct sock *xmit_ssk;
 	int len, copied = 0;
-	bool first = true;
 
 	info.flags = 0;
 	while ((dfrag = mptcp_send_head(sk))) {
@@ -1612,8 +1612,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 		while (len > 0) {
 			int ret = 0;
 
-			/* the caller already invoked the packet scheduler,
-			 * check for a different subflow usage only after
+			/* check for a different subflow usage only after
 			 * spooling the first chunk of data
 			 */
 			xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
@@ -3195,16 +3194,10 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 	if (!mptcp_send_head(sk))
 		return;
 
-	if (!sock_owned_by_user(sk)) {
-		struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
-
-		if (xmit_ssk == ssk)
-			__mptcp_subflow_push_pending(sk, ssk);
-		else if (xmit_ssk)
-			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND);
-	} else {
+	if (!sock_owned_by_user(sk))
+		__mptcp_subflow_push_pending(sk, ssk, false);
+	else
 		__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
-	}
 }
 
 #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
@@ -3295,7 +3288,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
 	if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) {
 		mptcp_data_lock(sk);
 		if (!sock_owned_by_user(sk))
-			__mptcp_subflow_push_pending(sk, ssk);
+			__mptcp_subflow_push_pending(sk, ssk, true);
 		else
 			__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
 		mptcp_data_unlock(sk);
-- 
2.35.3


* [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers" Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 02/11] mptcp: 'first' argument for subflow_push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-11  0:38   ` Mat Martineau
  2022-10-06 12:17 ` [PATCH mptcp-next v5 04/11] mptcp: drop last_snd for burst scheduler Geliang Tang
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

To support redundant packet schedulers more easily, this patch refactors
the __mptcp_push_pending() logic from:

For each dfrag:
	While sends succeed:
		Call the scheduler (selects subflow and msk->snd_burst)
		Update subflow locks (push/release/acquire as needed)
		Send the dfrag data with mptcp_sendmsg_frag()
		Update already_sent, snd_nxt, snd_burst
	Update msk->first_pending
Push/release on final subflow

to:

While the scheduler selects one subflow:
	Lock the subflow
	For each pending dfrag:
		While sends succeed:
			Send the dfrag data with mptcp_sendmsg_frag()
			Update already_sent, snd_nxt, snd_burst
		Update msk->first_pending
		Break if required by msk->snd_burst / etc
	Push and release the subflow

This patch also moves the burst check conditions out of the function
mptcp_subflow_get_send() and checks them in __mptcp_push_pending() and
__mptcp_subflow_push_pending(), in the inner "for each pending dfrag"
loop.
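
Concretely, the moved burst check looks like this in the new
__mptcp_push_pending() loop (simplified from the diff below):

/* stop using this subflow once the burst allowance is consumed, the
 * subflow runs out of write memory, or it is no longer active; the
 * outer loop then re-runs the scheduler to pick a (possibly
 * different) subflow.
 */
if (msk->snd_burst <= 0 ||
    !sk_stream_memory_free(ssk) ||
    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
	mptcp_push_release(ssk, &info);
	goto again;
}
mptcp_set_timeout(sk);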

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 86 ++++++++++++++++++++------------------------
 1 file changed, 39 insertions(+), 47 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 84d33393d24e..bf77defbc546 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1417,14 +1417,6 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	u64 linger_time;
 	long tout = 0;
 
-	/* re-use last subflow, if the burst allow that */
-	if (msk->last_snd && msk->snd_burst > 0 &&
-	    sk_stream_memory_free(msk->last_snd) &&
-	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
-		mptcp_set_timeout(sk);
-		return msk->last_snd;
-	}
-
 	/* pick the subflow with the lower wmem/wspace ratio */
 	for (i = 0; i < SSK_MODE_MAX; ++i) {
 		send_info[i].ssk = NULL;
@@ -1530,60 +1522,53 @@ void mptcp_check_and_set_pending(struct sock *sk)
 
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
-	struct sock *prev_ssk = NULL, *ssk = NULL;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_sendmsg_info info = {
 				.flags = flags,
 	};
 	bool do_check_data_fin = false;
 	struct mptcp_data_frag *dfrag;
+	struct sock *ssk;
 	int len;
 
-	while ((dfrag = mptcp_send_head(sk))) {
-		info.sent = dfrag->already_sent;
-		info.limit = dfrag->data_len;
-		len = dfrag->data_len - dfrag->already_sent;
-		while (len > 0) {
-			int ret = 0;
-
-			prev_ssk = ssk;
-			ssk = mptcp_subflow_get_send(msk);
-
-			/* First check. If the ssk has changed since
-			 * the last round, release prev_ssk
-			 */
-			if (ssk != prev_ssk && prev_ssk)
-				mptcp_push_release(prev_ssk, &info);
-			if (!ssk)
-				goto out;
+again:
+	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
+		lock_sock(ssk);
 
-			/* Need to lock the new subflow only if different
-			 * from the previous one, otherwise we are still
-			 * helding the relevant lock
-			 */
-			if (ssk != prev_ssk)
-				lock_sock(ssk);
+		while ((dfrag = mptcp_send_head(sk))) {
+			info.sent = dfrag->already_sent;
+			info.limit = dfrag->data_len;
+			len = dfrag->data_len - dfrag->already_sent;
+			while (len > 0) {
+				int ret = 0;
+
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0) {
+					if (ret == -EAGAIN)
+						goto again;
+					mptcp_push_release(ssk, &info);
+					goto out;
+				}
+
+				do_check_data_fin = true;
+				info.sent += ret;
+				len -= ret;
+
+				mptcp_update_post_push(msk, dfrag, ret);
+			}
+			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 
-			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-			if (ret <= 0) {
-				if (ret == -EAGAIN)
-					continue;
+			if (msk->snd_burst <= 0 ||
+			    !sk_stream_memory_free(ssk) ||
+			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
 				mptcp_push_release(ssk, &info);
-				goto out;
+				goto again;
 			}
-
-			do_check_data_fin = true;
-			info.sent += ret;
-			len -= ret;
-
-			mptcp_update_post_push(msk, dfrag, ret);
+			mptcp_set_timeout(sk);
 		}
-		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
-	}
 
-	/* at this point we held the socket lock for the last subflow we used */
-	if (ssk)
 		mptcp_push_release(ssk, &info);
+	}
 
 out:
 	/* ensure the rtx timer is running */
@@ -1636,6 +1621,13 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 			mptcp_update_post_push(msk, dfrag, ret);
 		}
 		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+
+		if (msk->snd_burst <= 0 ||
+		    !sk_stream_memory_free(ssk) ||
+		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+			goto out;
+		}
+		mptcp_set_timeout(sk);
 	}
 
 out:
-- 
2.35.3


* [PATCH mptcp-next v5 04/11] mptcp: drop last_snd for burst scheduler
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (2 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 05/11] mptcp: simplify push_pending Geliang Tang
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

msk->last_snd is no longer used by the burst scheduler; it is now used
only by the round-robin BPF MPTCP scheduler. This patch removes the
last_snd related code from the burst scheduler.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/pm.c         |  9 +--------
 net/mptcp/pm_netlink.c |  3 ---
 net/mptcp/protocol.c   | 11 +----------
 net/mptcp/protocol.h   |  1 -
 net/mptcp/sched.c      |  2 ++
 5 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 45e2a48397b9..cdeb7280ac76 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -282,15 +282,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
 
 	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
 	msk = mptcp_sk(sk);
-	if (subflow->backup != bkup) {
+	if (subflow->backup != bkup)
 		subflow->backup = bkup;
-		mptcp_data_lock(sk);
-		if (!sock_owned_by_user(sk))
-			msk->last_snd = NULL;
-		else
-			__set_bit(MPTCP_RESET_SCHEDULER,  &msk->cb_flags);
-		mptcp_data_unlock(sk);
-	}
 
 	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
 }
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 9813ed0fde9b..1f2da4aedcb4 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -475,9 +475,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con
 
 	slow = lock_sock_fast(ssk);
 	if (prio) {
-		if (subflow->backup != backup)
-			msk->last_snd = NULL;
-
 		subflow->send_mp_prio = 1;
 		subflow->backup = backup;
 		subflow->request_bkup = backup;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index bf77defbc546..8708e1e4ba16 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1469,16 +1469,13 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 
 	burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
 	wmem = READ_ONCE(ssk->sk_wmem_queued);
-	if (!burst) {
-		msk->last_snd = NULL;
+	if (!burst)
 		return ssk;
-	}
 
 	subflow = mptcp_subflow_ctx(ssk);
 	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
 					   READ_ONCE(ssk->sk_pacing_rate) * burst,
 					   burst + wmem);
-	msk->last_snd = ssk;
 	msk->snd_burst = burst;
 	return ssk;
 }
@@ -2343,9 +2340,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		msk->first = NULL;
 
 out:
-	if (ssk == msk->last_snd)
-		msk->last_snd = NULL;
-
 	if (need_push)
 		__mptcp_push_pending(sk, 0);
 }
@@ -2978,7 +2972,6 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 	 * subflow
 	 */
 	mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
-	msk->last_snd = NULL;
 	WRITE_ONCE(msk->flags, 0);
 	msk->cb_flags = 0;
 	msk->push_pending = 0;
@@ -3239,8 +3232,6 @@ static void mptcp_release_cb(struct sock *sk)
 			__mptcp_set_connected(sk);
 		if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
 			__mptcp_error_report(sk);
-		if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
-			msk->last_snd = NULL;
 	}
 
 	__mptcp_update_rmem(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e81399debff9..05f4c6fd0cd8 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -124,7 +124,6 @@
 #define MPTCP_RETRANSMIT	4
 #define MPTCP_FLUSH_JOIN_LIST	5
 #define MPTCP_CONNECTED		6
-#define MPTCP_RESET_SCHEDULER	7
 
 static inline bool before64(__u64 seq1, __u64 seq2)
 {
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index d1cad1eefb35..3d805760ae99 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -67,6 +67,7 @@ int mptcp_init_sched(struct mptcp_sock *msk,
 	msk->sched = sched;
 	if (msk->sched->init)
 		msk->sched->init(msk);
+	msk->last_snd = NULL;
 
 	pr_debug("sched=%s", msk->sched->name);
 
@@ -84,6 +85,7 @@ void mptcp_release_sched(struct mptcp_sock *msk)
 	msk->sched = NULL;
 	if (sched->release)
 		sched->release(msk);
+	msk->last_snd = NULL;
 
 	bpf_module_put(sched, sched->owner);
 }
-- 
2.35.3


* [PATCH mptcp-next v5 05/11] mptcp: simplify push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (3 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 04/11] mptcp: drop last_snd for burst scheduler Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 06/11] mptcp: multi subflows push_pending Geliang Tang
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch moves the duplicate code from __mptcp_push_pending() and
__mptcp_subflow_push_pending() into a new helper function named
__subflow_push_pending(), and simplifies __mptcp_push_pending() by
invoking this helper.
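
The helper's return convention, as inferred from the diff below: a
positive value is the number of bytes spooled on this subflow (tcp_push()
has already been called), while a non-positive value means nothing was
spooled, with -EAGAIN specifically asking the caller to re-run the packet
scheduler. The resulting caller loop, condensed from the diff:

again:
	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
		lock_sock(ssk);
		ret = __subflow_push_pending(sk, ssk, &info);
		release_sock(ssk);

		if (ret <= 0) {
			if (ret == -EAGAIN)
				goto again;	/* pick the next subflow */
			goto out;
		}
	}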

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 95 +++++++++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8708e1e4ba16..29905214103e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1480,12 +1480,6 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	return ssk;
 }
 
-static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
-{
-	tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
-	release_sock(ssk);
-}
-
 static void mptcp_update_post_push(struct mptcp_sock *msk,
 				   struct mptcp_data_frag *dfrag,
 				   u32 sent)
@@ -1517,61 +1511,80 @@ void mptcp_check_and_set_pending(struct sock *sk)
 		mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
 }
 
+static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
+				  struct mptcp_sendmsg_info *info)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_data_frag *dfrag;
+	int len, copied = 0, err = 0;
+
+	while ((dfrag = mptcp_send_head(sk))) {
+		info->sent = dfrag->already_sent;
+		info->limit = dfrag->data_len;
+		len = dfrag->data_len - dfrag->already_sent;
+		while (len > 0) {
+			int ret = 0;
+
+			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
+			if (ret <= 0) {
+				err = ret;
+				goto out;
+			}
+
+			info->sent += ret;
+			copied += ret;
+			len -= ret;
+
+			mptcp_update_post_push(msk, dfrag, ret);
+		}
+		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+
+		if (msk->snd_burst <= 0 ||
+		    !sk_stream_memory_free(ssk) ||
+		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+			err = -EAGAIN;
+			goto out;
+		}
+		mptcp_set_timeout(sk);
+	}
+
+out:
+	if (copied) {
+		tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle,
+			 info->size_goal);
+		err = copied;
+	}
+
+	return err;
+}
+
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_sendmsg_info info = {
 				.flags = flags,
 	};
-	bool do_check_data_fin = false;
-	struct mptcp_data_frag *dfrag;
 	struct sock *ssk;
-	int len;
+	int ret = 0;
 
 again:
 	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
 		lock_sock(ssk);
+		ret = __subflow_push_pending(sk, ssk, &info);
+		release_sock(ssk);
 
-		while ((dfrag = mptcp_send_head(sk))) {
-			info.sent = dfrag->already_sent;
-			info.limit = dfrag->data_len;
-			len = dfrag->data_len - dfrag->already_sent;
-			while (len > 0) {
-				int ret = 0;
-
-				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-				if (ret <= 0) {
-					if (ret == -EAGAIN)
-						goto again;
-					mptcp_push_release(ssk, &info);
-					goto out;
-				}
-
-				do_check_data_fin = true;
-				info.sent += ret;
-				len -= ret;
-
-				mptcp_update_post_push(msk, dfrag, ret);
-			}
-			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
-
-			if (msk->snd_burst <= 0 ||
-			    !sk_stream_memory_free(ssk) ||
-			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
-				mptcp_push_release(ssk, &info);
+		if (ret <= 0) {
+			if (ret == -EAGAIN)
 				goto again;
-			}
-			mptcp_set_timeout(sk);
+			goto out;
 		}
-
-		mptcp_push_release(ssk, &info);
 	}
 
 out:
 	/* ensure the rtx timer is running */
 	if (!mptcp_timer_pending(sk))
 		mptcp_reset_timer(sk);
-	if (do_check_data_fin)
+	if (ret > 0)
 		__mptcp_check_send_data_fin(sk);
 }
 
-- 
2.35.3


* [PATCH mptcp-next v5 06/11] mptcp: multi subflows push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (4 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 05/11] mptcp: simplify push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 07/11] mptcp: use msk instead of mptcp_sk Geliang Tang
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch adds multiple-subflow support to __mptcp_push_pending(),
using the mptcp_sched_get_send() wrapper instead of
mptcp_subflow_get_send().

Check the subflows' scheduled flags to determine which subflow or
subflows were picked by the scheduler, and use them to send data.
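
As an illustration of the "subflow or subflows" case, a hypothetical
redundant scheduler callback could mark several subflows at once; the
loop added below would then send the same data on each of them. The
callback here is an illustration only (its signature is assumed from how
get_subflow() is invoked in sched.c), not part of this series:

/* hypothetical BPF-style callback: schedule every known subflow so
 * that the push loop transmits the pending data redundantly.
 */
static int redundant_get_subflow(struct mptcp_sock *msk,
				 struct mptcp_sched_data *data)
{
	int i;

	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
		if (data->contexts[i])
			mptcp_subflow_set_scheduled(data->contexts[i], true);
	}

	return 0;
}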

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 29905214103e..fdb879e09a32 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1561,22 +1561,31 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct mptcp_sendmsg_info info = {
-				.flags = flags,
-	};
-	struct sock *ssk;
 	int ret = 0;
 
 again:
-	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
-		lock_sock(ssk);
-		ret = __subflow_push_pending(sk, ssk, &info);
-		release_sock(ssk);
+	while (mptcp_send_head(sk) && !mptcp_sched_get_send(msk)) {
+		struct mptcp_subflow_context *subflow;
+		struct mptcp_sendmsg_info info = {
+			.flags = flags,
+		};
 
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				goto again;
-			goto out;
+		mptcp_for_each_subflow(msk, subflow) {
+			if (READ_ONCE(subflow->scheduled)) {
+				struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+				lock_sock(ssk);
+				ret = __subflow_push_pending(sk, ssk, &info);
+				release_sock(ssk);
+
+				if (ret <= 0) {
+					if (ret == -EAGAIN)
+						goto again;
+					goto out;
+				}
+				msk->last_snd = ssk;
+				mptcp_subflow_set_scheduled(subflow, false);
+			}
 		}
 	}
 
-- 
2.35.3


* [PATCH mptcp-next v5 07/11] mptcp: use msk instead of mptcp_sk
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (5 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 06/11] mptcp: multi subflows push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 08/11] mptcp: refactor subflow_push_pending logic Geliang Tang
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

Use msk instead of mptcp_sk(sk) in the functions where the variable
"msk = mptcp_sk(sk)" has been defined.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index fdb879e09a32..b15b97e8cbf7 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1619,7 +1619,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 			/* check for a different subflow usage only after
 			 * spooling the first chunk of data
 			 */
-			xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
+			xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
 			if (!xmit_ssk)
 				goto out;
 			if (xmit_ssk != ssk) {
@@ -2249,7 +2249,7 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
 	struct mptcp_data_frag *cur, *rtx_head;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
-	if (__mptcp_check_fallback(mptcp_sk(sk)))
+	if (__mptcp_check_fallback(msk))
 		return false;
 
 	if (tcp_rtx_and_write_queues_empty(sk))
@@ -2928,7 +2928,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
 
 	sock_hold(sk);
 	pr_debug("msk=%p state=%d", sk, sk->sk_state);
-	if (mptcp_sk(sk)->token)
+	if (msk->token)
 		mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
 
 	if (sk->sk_state == TCP_CLOSE) {
@@ -2987,8 +2987,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 	mptcp_stop_timer(sk);
 	sk_stop_timer(sk, &sk->sk_timer);
 
-	if (mptcp_sk(sk)->token)
-		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+	if (msk->token)
+		mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
 
 	/* msk->subflow is still intact, the following will not free the first
 	 * subflow
-- 
2.35.3


* [PATCH mptcp-next v5 08/11] mptcp: refactor subflow_push_pending logic
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (6 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 07/11] mptcp: use msk instead of mptcp_sk Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 09/11] mptcp: simplify subflow_push_pending Geliang Tang
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch refactors the __mptcp_subflow_push_pending() logic from:

For each dfrag:
	While sends succeed:
		Call the scheduler (selects subflow and msk->snd_burst)
		If a different subflow is selected, delegate to it with
			mptcp_subflow_delegate() and break
		Send the dfrag data with mptcp_sendmsg_frag()
		Update dfrag->already_sent, msk->snd_nxt, msk->snd_burst
	Update msk->first_pending

to:

While first_pending isn't empty:
	Call the scheduler (selects subflow and msk->snd_burst)
	If a different subflow is selected, delegate to it with
		mptcp_subflow_delegate() and break
	For each pending dfrag:
		While sends succeed:
			Send the dfrag data with mptcp_sendmsg_frag()
			Update already_sent, snd_nxt, snd_burst
		Update msk->first_pending
		Break if required by msk->snd_burst / etc

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 66 +++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index b15b97e8cbf7..e2f47e8ca63e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1609,44 +1609,46 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 	int len, copied = 0;
 
 	info.flags = 0;
-	while ((dfrag = mptcp_send_head(sk))) {
-		info.sent = dfrag->already_sent;
-		info.limit = dfrag->data_len;
-		len = dfrag->data_len - dfrag->already_sent;
-		while (len > 0) {
-			int ret = 0;
+	while (mptcp_send_head(sk)) {
+		/* check for a different subflow usage only after
+		 * spooling the first chunk of data
+		 */
+		xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
+		if (!xmit_ssk)
+			goto out;
+		if (xmit_ssk != ssk) {
+			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
+					       MPTCP_DELEGATE_SEND);
+			goto out;
+		}
 
-			/* check for a different subflow usage only after
-			 * spooling the first chunk of data
-			 */
-			xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
-			if (!xmit_ssk)
-				goto out;
-			if (xmit_ssk != ssk) {
-				mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
-						       MPTCP_DELEGATE_SEND);
-				goto out;
-			}
+		while ((dfrag = mptcp_send_head(sk))) {
+			info.sent = dfrag->already_sent;
+			info.limit = dfrag->data_len;
+			len = dfrag->data_len - dfrag->already_sent;
+			while (len > 0) {
+				int ret = 0;
 
-			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-			if (ret <= 0)
-				goto out;
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0)
+					goto out;
 
-			info.sent += ret;
-			copied += ret;
-			len -= ret;
-			first = false;
+				info.sent += ret;
+				copied += ret;
+				len -= ret;
+				first = false;
 
-			mptcp_update_post_push(msk, dfrag, ret);
-		}
-		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+				mptcp_update_post_push(msk, dfrag, ret);
+			}
+			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 
-		if (msk->snd_burst <= 0 ||
-		    !sk_stream_memory_free(ssk) ||
-		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
-			goto out;
+			if (msk->snd_burst <= 0 ||
+			    !sk_stream_memory_free(ssk) ||
+			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+				goto out;
+			}
+			mptcp_set_timeout(sk);
 		}
-		mptcp_set_timeout(sk);
 	}
 
 out:
-- 
2.35.3


* [PATCH mptcp-next v5 09/11] mptcp: simplify subflow_push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (7 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 08/11] mptcp: refactor subflow_push_pending logic Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 10/11] mptcp: multi subflows subflow_push_pending Geliang Tang
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch simplifies __mptcp_subflow_push_pending() by invoking the
__subflow_push_pending() helper.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 39 ++++++++-------------------------------
 1 file changed, 8 insertions(+), 31 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e2f47e8ca63e..66436885b749 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1604,10 +1604,10 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 	struct mptcp_sendmsg_info info = {
 		.data_lock_held = true,
 	};
-	struct mptcp_data_frag *dfrag;
 	struct sock *xmit_ssk;
-	int len, copied = 0;
+	int ret = 0;
 
+again:
 	info.flags = 0;
 	while (mptcp_send_head(sk)) {
 		/* check for a different subflow usage only after
@@ -1622,32 +1622,11 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 			goto out;
 		}
 
-		while ((dfrag = mptcp_send_head(sk))) {
-			info.sent = dfrag->already_sent;
-			info.limit = dfrag->data_len;
-			len = dfrag->data_len - dfrag->already_sent;
-			while (len > 0) {
-				int ret = 0;
-
-				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-				if (ret <= 0)
-					goto out;
-
-				info.sent += ret;
-				copied += ret;
-				len -= ret;
-				first = false;
-
-				mptcp_update_post_push(msk, dfrag, ret);
-			}
-			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
-
-			if (msk->snd_burst <= 0 ||
-			    !sk_stream_memory_free(ssk) ||
-			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
-				goto out;
-			}
-			mptcp_set_timeout(sk);
+		ret = __subflow_push_pending(sk, ssk, &info);
+		if (ret <= 0) {
+			if (ret == -EAGAIN)
+				goto again;
+			break;
 		}
 	}
 
@@ -1655,9 +1634,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 	/* __mptcp_alloc_tx_skb could have released some wmem and we are
 	 * not going to flush it via release_sock()
 	 */
-	if (copied) {
-		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
-			 info.size_goal);
+	if (ret > 0) {
 		if (!mptcp_timer_pending(sk))
 			mptcp_reset_timer(sk);
 
-- 
2.35.3


* [PATCH mptcp-next v5 10/11] mptcp: multi subflows subflow_push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (8 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 09/11] mptcp: simplify subflow_push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support Geliang Tang
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch adds multiple-subflow support to
__mptcp_subflow_push_pending(), using the mptcp_sched_get_send() wrapper
instead of mptcp_subflow_get_send().

Check the subflows' scheduled flags to determine which subflow or
subflows were picked by the scheduler, and use them to send data.
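
Condensed, the scheduled-subflow walk distinguishes the already-locked
ssk from any other pick (simplified from the diff below):

mptcp_for_each_subflow(msk, subflow) {
	if (READ_ONCE(subflow->scheduled)) {
		struct sock *xmit_ssk = mptcp_subflow_tcp_sock(subflow);

		if (xmit_ssk != ssk) {
			/* only ssk is locked in this context: hand the
			 * other pick to its delegated action instead
			 */
			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
					       MPTCP_DELEGATE_SEND);
			mptcp_subflow_set_scheduled(subflow, false);
			continue;
		}

		/* push on the locked ssk via __subflow_push_pending() */
	}
}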

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 50 ++++++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 66436885b749..ecea2a400e6b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1601,10 +1601,10 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 					 bool first)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *subflow;
 	struct mptcp_sendmsg_info info = {
 		.data_lock_held = true,
 	};
-	struct sock *xmit_ssk;
 	int ret = 0;
 
 again:
@@ -1613,20 +1613,44 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 		/* check for a different subflow usage only after
 		 * spooling the first chunk of data
 		 */
-		xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
-		if (!xmit_ssk)
-			goto out;
-		if (xmit_ssk != ssk) {
-			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
-					       MPTCP_DELEGATE_SEND);
-			goto out;
+		if (first) {
+			ret = __subflow_push_pending(sk, ssk, &info);
+			if (ret <= 0) {
+				if (ret == -EAGAIN)
+					goto again;
+				break;
+			}
+			first = false;
+			msk->last_snd = ssk;
+			continue;
 		}
 
-		ret = __subflow_push_pending(sk, ssk, &info);
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				goto again;
-			break;
+		if (mptcp_sched_get_send(msk))
+			goto out;
+
+		mptcp_for_each_subflow(msk, subflow) {
+			if (READ_ONCE(subflow->scheduled)) {
+				struct sock *xmit_ssk = mptcp_subflow_tcp_sock(subflow);
+
+				if (xmit_ssk != ssk) {
+					if (mptcp_subflow_has_delegated_action(subflow))
+						goto out;
+					mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
+							       MPTCP_DELEGATE_SEND);
+					msk->last_snd = xmit_ssk;
+					mptcp_subflow_set_scheduled(subflow, false);
+					continue;
+				}
+
+				ret = __subflow_push_pending(sk, ssk, &info);
+				if (ret <= 0) {
+					if (ret == -EAGAIN)
+						goto again;
+					goto out;
+				}
+				msk->last_snd = ssk;
+				mptcp_subflow_set_scheduled(subflow, false);
+			}
 		}
 	}
 
-- 
2.35.3


* [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (9 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 10/11] mptcp: multi subflows subflow_push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 17:53   ` mptcp: multi subflows retrans support: Tests Results MPTCP CI
  2022-10-07  0:16 ` [PATCH mptcp-next v5 00/11] refactor push pending Mat Martineau
  2022-10-11  1:01 ` Mat Martineau
  12 siblings, 1 reply; 20+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch adds multiple-subflow support to __mptcp_retrans(), using the
mptcp_sched_get_retrans() wrapper instead of
mptcp_subflow_get_retrans().

Iterate over each subflow of the msk and check its scheduled flag to see
whether it was picked by the scheduler. If so, use it to retransmit data.
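
Per scheduled subflow, the retransmit path keeps the existing limit
logic (simplified from the diff below):

mptcp_for_each_subflow(msk, subflow) {
	if (READ_ONCE(subflow->scheduled)) {
		ssk = mptcp_subflow_tcp_sock(subflow);
		lock_sock(ssk);

		/* limit retransmission to the bytes already sent on
		 * some subflows
		 */
		info.sent = 0;
		info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
							    dfrag->already_sent;
		/* ... call mptcp_sendmsg_frag() until info.sent reaches
		 * info.limit ...
		 */

		release_sock(ssk);
		mptcp_subflow_set_scheduled(subflow, false);
	}
}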

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 62 ++++++++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index ecea2a400e6b..b422d5de435b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2479,16 +2479,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 static void __mptcp_retrans(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *subflow;
 	struct mptcp_sendmsg_info info = {};
 	struct mptcp_data_frag *dfrag;
-	size_t copied = 0;
 	struct sock *ssk;
-	int ret;
+	int ret, err;
+	u16 len = 0;
 
 	mptcp_clean_una_wakeup(sk);
 
 	/* first check ssk: need to kick "stale" logic */
-	ssk = mptcp_subflow_get_retrans(msk);
+	err = mptcp_sched_get_retrans(msk);
 	dfrag = mptcp_rtx_head(sk);
 	if (!dfrag) {
 		if (mptcp_data_fin_enabled(msk)) {
@@ -2507,31 +2508,46 @@ static void __mptcp_retrans(struct sock *sk)
 		goto reset_timer;
 	}
 
-	if (!ssk)
+	if (err)
 		goto reset_timer;
 
-	lock_sock(ssk);
+	mptcp_for_each_subflow(msk, subflow) {
+		if (READ_ONCE(subflow->scheduled)) {
+			u16 copied = 0;
+
+			ssk = mptcp_subflow_tcp_sock(subflow);
+			if (!ssk)
+				goto reset_timer;
+
+			lock_sock(ssk);
+
+			/* limit retransmission to the bytes already sent on some subflows */
+			info.sent = 0;
+			info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
+								    dfrag->already_sent;
+			while (info.sent < info.limit) {
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0)
+					break;
+
+				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+				copied += ret;
+				info.sent += ret;
+			}
+			if (copied) {
+				len = max(copied, len);
+				tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+					 info.size_goal);
+				WRITE_ONCE(msk->allow_infinite_fallback, false);
+			}
 
-	/* limit retransmission to the bytes already sent on some subflows */
-	info.sent = 0;
-	info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
-	while (info.sent < info.limit) {
-		ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-		if (ret <= 0)
-			break;
+			release_sock(ssk);
 
-		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
-		copied += ret;
-		info.sent += ret;
-	}
-	if (copied) {
-		dfrag->already_sent = max(dfrag->already_sent, info.sent);
-		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
-			 info.size_goal);
-		WRITE_ONCE(msk->allow_infinite_fallback, false);
+			msk->last_snd = ssk;
+			mptcp_subflow_set_scheduled(subflow, false);
+		}
 	}
-
-	release_sock(ssk);
+	dfrag->already_sent = max(dfrag->already_sent, len);
 
 reset_timer:
 	mptcp_check_and_set_pending(sk);
-- 
2.35.3


* Re: mptcp: multi subflows retrans support: Tests Results
  2022-10-06 12:17 ` [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support Geliang Tang
@ 2022-10-06 17:53   ` MPTCP CI
  0 siblings, 0 replies; 20+ messages in thread
From: MPTCP CI @ 2022-10-06 17:53 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

Hi Geliang,

Thank you for your modifications, that's great!

Our CI did some validations and here is its report:

- KVM Validation: normal:
  - Unstable: 2 failed test(s): packetdrill_add_addr selftest_mptcp_join 🔴:
  - Task: https://cirrus-ci.com/task/5053437011296256
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/5053437011296256/summary/summary.txt

- KVM Validation: debug:
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/6179336918138880
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/6179336918138880/summary/summary.txt

Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/acfdc20f458a


If there are some issues, you can reproduce them using the same environment as
the one used by the CI thanks to a docker image, e.g.:

    $ cd [kernel source code]
    $ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
        --pull always mptcp/mptcp-upstream-virtme-docker:latest \
        auto-debug

For more details:

    https://github.com/multipath-tcp/mptcp-upstream-virtme-docker


Please note that, despite all the efforts already made to keep the test
suite stable when executed on a public CI like this one, it is possible
that some reported issues are not due to your modifications. Still, do
not hesitate to help us improve that ;-)

Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (Tessares)

* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (10 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support Geliang Tang
@ 2022-10-07  0:16 ` Mat Martineau
  2022-10-07  9:00   ` Geliang Tang
  2022-10-11  1:01 ` Mat Martineau
  12 siblings, 1 reply; 20+ messages in thread
From: Mat Martineau @ 2022-10-07  0:16 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

On Thu, 6 Oct 2022, Geliang Tang wrote:

> v5:
> - address Mat's comments in v4.

Hi Geliang -

Thanks for the v5. I haven't finished looking over all the patches in 
detail yet, but two things I do want to reply to right now:

  * Thanks for explaining in patch 4 that last_snd is still useful for 
round robin. I had forgotten about that, and it looked like a "write-only" 
variable in the kernel code.

  * In the meeting today Paolo suggested that a good test for the new 
scheduler loop would be to modify simult_flows.sh to use much larger 
files, then see if the modified code slowed down any of the simult_flows 
tests.

He suggested making the test file 10x larger in simult_flows.sh:

-	size=$((2 * 2048 * 4096))
+	size=$((2 * 2048 * 4096 * 10))

Can you compare the test times between the export branch and this series, 
with both of them using the larger file size?


Thanks,

Mat


>
> v4:
> - update __mptcp_subflow_push_pending as Mat suggested.
> - add more patches from "BPF redundant scheduler" series.
>
> v3:
> - add a cleanup patch.
> - remove msk->last_snd in mptcp_subflow_get_send().
> - add the loop that calls the scheduler again in __mptcp_push_pending().
>
> v2:
> - add snd_burst check in dfrags loop as Mat suggested.
>
> Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
> remove duplicate code and support redundant scheduler more easily in
> __mptcp_subflow_push_pending().
>
> Geliang Tang (11):
>  Squash to "mptcp: add get_subflow wrappers"
>  mptcp: 'first' argument for subflow_push_pending
>  mptcp: refactor push_pending logic
>  mptcp: drop last_snd for burst scheduler
>  mptcp: simplify push_pending
>  mptcp: multi subflows push_pending
>  mptcp: use msk instead of mptcp_sk
>  mptcp: refactor subflow_push_pending logic
>  mptcp: simplify subflow_push_pending
>  mptcp: multi subflows subflow_push_pending
>  mptcp: multi subflows retrans support
>
> net/mptcp/pm.c         |   9 +-
> net/mptcp/pm_netlink.c |   3 -
> net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
> net/mptcp/protocol.h   |   5 +-
> net/mptcp/sched.c      |  61 +++++----
> 5 files changed, 184 insertions(+), 179 deletions(-)
>
> -- 
> 2.35.3
>
>
>

--
Mat Martineau
Intel

* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-07  0:16 ` [PATCH mptcp-next v5 00/11] refactor push pending Mat Martineau
@ 2022-10-07  9:00   ` Geliang Tang
  2022-10-10 15:05     ` Geliang Tang
  0 siblings, 1 reply; 20+ messages in thread
From: Geliang Tang @ 2022-10-07  9:00 UTC (permalink / raw)
  To: Mat Martineau; +Cc: mptcp

[-- Attachment #1: Type: text/plain, Size: 3689 bytes --]

On Thu, Oct 06, 2022 at 05:16:55PM -0700, Mat Martineau wrote:
> On Thu, 6 Oct 2022, Geliang Tang wrote:
> 
> > v5:
> > - address Mat's comments in v4.
> 
> Hi Geliang -
> 
> Thanks for the v5. I haven't finished looking over all the patches in detail
> yet, but two things I do want to reply to right now:
> 
>  * Thanks for explaining in patch 4 that last_snd is still useful for round
> robin. I had forgotten about that, and it looked like a "write-only"
> variable in the kernel code.
> 
>  * In the meeting today Paolo suggested that a good test for the new
> scheduler loop would be to modify simult_flows.sh to use much larger files,
> then see if the modified code slowed down any of the simult_flows tests.
> 
> He suggested making the test file 10x larger in simult_flows.sh:
> 
> -	size=$((2 * 2048 * 4096))
> +	size=$((2 * 2048 * 4096 * 10))
> 
> Can you compare the test times between the export branch and this series,
> with both of them using the larger file size?

The 10x larger size failed in my tests with a timeout, so I changed it
to 5x larger.

Here's the patch:

@@ -52,7 +52,7 @@ setup()
 	sout=$(mktemp)
 	cout=$(mktemp)
 	capout=$(mktemp)
-	size=$((2 * 2048 * 4096))
+	size=$((2 * 2048 * 4096 * 5))

 	dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
 	dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
@@ -210,13 +210,13 @@ do_transfer()
 	fi

 	echo " [ fail ]"
-	echo "client exit code $retc, server $rets" 1>&2
-	echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
-	ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
-	echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
-	ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
-	ls -l $sin $cout
-	ls -l $cin $sout
+	#echo "client exit code $retc, server $rets" 1>&2
+	#echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
+	#ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
+	#echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
+	#ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
+	#ls -l $sin $cout
+	#ls -l $cin $sout

 	cat "$capout"
 	return 1

All logs are attached. "5x_10times_export.log" is for the export branch
and "5x_10times_refactor_v5.log" is for this series.

Thanks,
-Geliang

> 
> 
> Thanks,
> 
> Mat
> 
> 
> > 
> > v4:
> > - update __mptcp_subflow_push_pending as Mat suggested.
> > - add more patches from "BPF redundant scheduler" series.
> > 
> > v3:
> > - add a cleanup patch.
> > - remove msk->last_snd in mptcp_subflow_get_send().
> > - add the loop that calls the scheduler again in __mptcp_push_pending().
> > 
> > v2:
> > - add snd_burst check in dfrags loop as Mat suggested.
> > 
> > Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
> > remove duplicate code and support redundant scheduler more easily in
> > __mptcp_subflow_push_pending().
> > 
> > Geliang Tang (11):
> >  Squash to "mptcp: add get_subflow wrappers"
> >  mptcp: 'first' argument for subflow_push_pending
> >  mptcp: refactor push_pending logic
> >  mptcp: drop last_snd for burst scheduler
> >  mptcp: simplify push_pending
> >  mptcp: multi subflows push_pending
> >  mptcp: use msk instead of mptcp_sk
> >  mptcp: refactor subflow_push_pending logic
> >  mptcp: simplify subflow_push_pending
> >  mptcp: multi subflows subflow_push_pending
> >  mptcp: multi subflows retrans support
> > 
> > net/mptcp/pm.c         |   9 +-
> > net/mptcp/pm_netlink.c |   3 -
> > net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
> > net/mptcp/protocol.h   |   5 +-
> > net/mptcp/sched.c      |  61 +++++----
> > 5 files changed, 184 insertions(+), 179 deletions(-)
> > 
> > -- 
> > 2.35.3
> > 
> > 
> > 
> 
> --
> Mat Martineau
> Intel

[-- Attachment #2: 5x_10times_export.log --]
[-- Type: text/plain, Size: 13848 bytes --]

1

balanced bwidth                                             transfer slower than expected! runtime 36150 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36135 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36087 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36103 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18173 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18284 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18160 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction 18169 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18349 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction18170 max 18227      [ OK ]

2

balanced bwidth                                             transfer slower than expected! runtime 36085 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36095 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36104 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36106 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18364 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18272 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18246 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction 18150 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            18216 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction18199 max 18227      [ OK ]

3

balanced bwidth                                             transfer slower than expected! runtime 36114 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36082 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36125 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36093 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18198 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18431 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18312 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction 18197 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18284 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse directiontransfer slower than expected! runtime 18450 ms, expected 18227 ms max 18227       [ fail ]

4

balanced bwidth                                             transfer slower than expected! runtime 36111 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36112 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36126 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36106 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18217 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       18214 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18366 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18347 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            18218 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18459 ms, expected 18227 ms max 18227       [ fail ]

5

balanced bwidth                                             transfer slower than expected! runtime 36128 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36106 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36125 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36108 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18172 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18288 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18312 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18265 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18344 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction 18133 max 18227      [ OK ]

6

balanced bwidth                                             transfer slower than expected! runtime 36114 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36085 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36156 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36109 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18345 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18409 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18167 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18317 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            18178 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction 18163 max 18227      [ OK ]

7

balanced bwidth                                             transfer slower than expected! runtime 36150 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36087 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36108 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36082 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18292 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18442 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18283 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction 18203 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            18218 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18383 ms, expected 18227 ms max 18227       [ fail ]

8

balanced bwidth                                             transfer slower than expected! runtime 36090 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36089 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36106 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36101 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18271 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18267 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18330 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18323 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18332 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18357 ms, expected 18227 ms max 18227       [ fail ]

9

balanced bwidth                                             transfer slower than expected! runtime 36134 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36108 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36110 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36077 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18256 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18232 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18265 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction 18211 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18389 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction 18165 max 18227      [ OK ]

10

balanced bwidth                                             transfer slower than expected! runtime 36113 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36089 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36092 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36103 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18413 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       18162 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay                     18139 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18349 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18256 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18540 ms, expected 18227 ms max 18227       [ fail ]

[-- Attachment #3: 5x_10times_refactor_v5.log --]
[-- Type: text/plain, Size: 14553 bytes --]


1

balanced bwidth                                             transfer slower than expected! runtime 36095 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36168 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36112 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36120 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18235 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18445 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18310 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18299 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18388 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18493 ms, expected 18227 ms max 18227       [ fail ]

2

balanced bwidth                                             transfer slower than expected! runtime 36134 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36202 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36174 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36145 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18144 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18329 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18281 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18512 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18297 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18361 ms, expected 18227 ms max 18227       [ fail ]

3

balanced bwidth                                             transfer slower than expected! runtime 36186 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36166 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36157 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36185 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18207 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18270 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18148 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18247 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18405 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18495 ms, expected 18227 ms max 18227       [ fail ]

4

balanced bwidth                                             transfer slower than expected! runtime 36128 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36145 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36176 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36109 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18355 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18521 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18203 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18282 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18473 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18415 ms, expected 18227 ms max 18227       [ fail ]

5

balanced bwidth                                             transfer slower than expected! runtime 36131 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36167 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36173 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36143 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18182 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18377 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18184 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18274 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            18151 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18450 ms, expected 18227 ms max 18227       [ fail ]

6

balanced bwidth                                             transfer slower than expected! runtime 36129 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36199 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36133 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36158 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18133 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18488 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18334 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18457 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18379 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18271 ms, expected 18227 ms max 18227       [ fail ]

7

balanced bwidth                                             transfer slower than expected! runtime 36135 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36119 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36131 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36112 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18344 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18378 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18145 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18434 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18612 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18535 ms, expected 18227 ms max 18227       [ fail ]

8

balanced bwidth                                             transfer slower than expected! runtime 36158 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36180 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36108 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36136 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18183 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18352 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18151 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18423 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18333 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18373 ms, expected 18227 ms max 18227       [ fail ]

9

balanced bwidth                                             transfer slower than expected! runtime 36123 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36152 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36112 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36128 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18267 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18311 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18167 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18294 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18279 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18392 ms, expected 18227 ms max 18227       [ fail ]

10

balanced bwidth                                             transfer slower than expected! runtime 36152 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36164 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36172 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36170 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18345 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18317 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18367 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18407 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18444 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18489 ms, expected 18227 ms max 18227       [ fail ]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-07  9:00   ` Geliang Tang
@ 2022-10-10 15:05     ` Geliang Tang
  2022-10-11  0:12       ` Mat Martineau
  0 siblings, 1 reply; 20+ messages in thread
From: Geliang Tang @ 2022-10-10 15:05 UTC (permalink / raw)
  To: Geliang Tang; +Cc: Mat Martineau, mptcp

On Fri, Oct 7, 2022 at 16:59, Geliang Tang <geliang.tang@suse.com> wrote:
>
> On Thu, Oct 06, 2022 at 05:16:55PM -0700, Mat Martineau wrote:
> > On Thu, 6 Oct 2022, Geliang Tang wrote:
> >
> > > v5:
> > > - address Mat's comments in v4.
> >
> > Hi Geliang -
> >
> > Thanks for the v5. I haven't finished looking over all the patches in detail
> > yet, but two things I do want to reply to right now:
> >
> >  * Thanks for explaining in patch 4 that last_snd is still useful for round
> > robin. I had forgotten about that, and it looked like a "write-only"
> > variable in the kernel code.
> >
> >  * In the meeting today Paolo suggested that a good test for the new
> > scheduler loop would be to modify simult_flows.sh to use much larger files,
> > then see if the modified code slowed down any of the simult_flows tests.
> >
> > He suggested making the test file 10x larger in simult_flows.sh:
> >
> > -     size=$((2 * 2048 * 4096))
> > +     size=$((2 * 2048 * 4096 * 10))
> >
> > Can you compare the test times between the export branch and this series,
> > with both of them using the larger file size?
>
> 10x larger failed in my tests with a timeout, so I changed it to 5x
> larger.
>
> Here's the patch:
>
> @@ -52,7 +52,7 @@ setup()
>         sout=$(mktemp)
>         cout=$(mktemp)
>         capout=$(mktemp)
> -       size=$((2 * 2048 * 4096))
> +       size=$((2 * 2048 * 4096 * 5))
>
>         dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
>         dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
> @@ -210,13 +210,13 @@ do_transfer()
>         fi
>
>         echo " [ fail ]"
> -       echo "client exit code $retc, server $rets" 1>&2
> -       echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
> -       ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
> -       echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
> -       ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
> -       ls -l $sin $cout
> -       ls -l $cin $sout
> +       #echo "client exit code $retc, server $rets" 1>&2
> +       #echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
> +       #ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
> +       #echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
> +       #ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
> +       #ls -l $sin $cout
> +       #ls -l $cin $sout
>
>         cat "$capout"
>         return 1
>
> All logs are attached. "5x_10times_export.log" is for the export branch
> and "5x_10times_refactor_v5.log" is for this series.

I compared the data in the two logs. The total test time of this series
is indeed longer than that of the export branch, but the difference is
very small.

I stripped the extraneous information from the two logs and retained
only the running times and expected times. The result looks like this:

> cat 5x_export.log | head
36150 max 36005
36135 max 36005
36087 max 36005
36103 max 36005
18284 max 18227
18349 max 18227

Adding up all the running times and all the expected times:

> awk '{ sum += $1 }; END { print sum }' 5x_export.log
2540920
> awk '{ sum += $3 }; END { print sum }' 5x_export.log
2533820

The ratio of the total running time to the total expected time is
1.00280209 (2540920/2533820).
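
The same ratio can also be computed in a single pass; a small sketch,
assuming the stripped "runtime max expected" line format shown above:

> awk '{ r += $1; e += $3 }; END { printf "%.8f\n", r / e }' 5x_export.log
1.00280209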

Using the same method on the data in 5x_10times_refactor_v5.log:

> cat 5x_refactor_v5.log | head
36095 max 36005
36168 max 36005
36112 max 36005
36120 max 36005
18235 max 18227
18445 max 18227

> awk '{ sum += $1 }; END { print sum }' 5x_refactor_v5.log
2546024
> awk '{ sum += $3 }; END { print sum }' 5x_refactor_v5.log
2533820

The ratio is 1.00481644, about 0.002 higher than for the export branch.
I think this difference is acceptable.
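
Per test, that works out to a mean slowdown of about 51 ms, which can be
checked in one line; a quick sketch, assuming the two stripped logs list
the same 100 tests in the same order:

> paste 5x_export.log 5x_refactor_v5.log | awk '{ d += $4 - $1; n++ }; END { printf "%.1f ms mean delta\n", d / n }'
51.0 ms mean delta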

Thanks,
-Geliang

>
> Thanks,
> -Geliang
>
> >
> >
> > Thanks,
> >
> > Mat
> >
> >
> > >
> > > v4:
> > > - update __mptcp_subflow_push_pending as Mat suggested.
> > > - add more patches from "BPF redundant scheduler" series.
> > >
> > > v3:
> > > - add a cleanup patch.
> > > - remove msk->last_snd in mptcp_subflow_get_send().
> > > - add the loop that calls the scheduler again in __mptcp_push_pending().
> > >
> > > v2:
> > > - add snd_burst check in dfrags loop as Mat suggested.
> > >
> > > Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
> > > remove duplicate code and support redundant scheduler more easily in
> > > __mptcp_subflow_push_pending().
> > >
> > > Geliang Tang (11):
> > >  Squash to "mptcp: add get_subflow wrappers"
> > >  mptcp: 'first' argument for subflow_push_pending
> > >  mptcp: refactor push_pending logic
> > >  mptcp: drop last_snd for burst scheduler
> > >  mptcp: simplify push_pending
> > >  mptcp: multi subflows push_pending
> > >  mptcp: use msk instead of mptcp_sk
> > >  mptcp: refactor subflow_push_pending logic
> > >  mptcp: simplify subflow_push_pending
> > >  mptcp: multi subflows subflow_push_pending
> > >  mptcp: multi subflows retrans support
> > >
> > > net/mptcp/pm.c         |   9 +-
> > > net/mptcp/pm_netlink.c |   3 -
> > > net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
> > > net/mptcp/protocol.h   |   5 +-
> > > net/mptcp/sched.c      |  61 +++++----
> > > 5 files changed, 184 insertions(+), 179 deletions(-)
> > >
> > > --
> > > 2.35.3
> > >
> > >
> > >
> >
> > --
> > Mat Martineau
> > Intel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-10 15:05     ` Geliang Tang
@ 2022-10-11  0:12       ` Mat Martineau
  0 siblings, 0 replies; 20+ messages in thread
From: Mat Martineau @ 2022-10-11  0:12 UTC (permalink / raw)
  To: Geliang Tang; +Cc: Geliang Tang, mptcp, Paolo Abeni

[-- Attachment #1: Type: text/plain, Size: 4064 bytes --]

On Mon, 10 Oct 2022, Geliang Tang wrote:

> On Fri, Oct 7, 2022 at 16:59, Geliang Tang <geliang.tang@suse.com> wrote:
>>
>> On Thu, Oct 06, 2022 at 05:16:55PM -0700, Mat Martineau wrote:
>>> On Thu, 6 Oct 2022, Geliang Tang wrote:
>>>
>>>> v5:
>>>> - address Mat's comments in v4.
>>>
>>> Hi Geliang -
>>>
>>> Thanks for the v5. I haven't finished looking over all the patches in detail
>>> yet, but two things I do want to reply to right now:
>>>
>>>  * Thanks for explaining in patch 4 that last_snd is still useful for round
>>> robin. I had forgotten about that, and it looked like a "write-only"
>>> variable in the kernel code.
>>>
>>>  * In the meeting today Paolo suggested that a good test for the new
>>> scheduler loop would be to modify simult_flows.sh to use much larger files,
>>> then see if the modified code slowed down any of the simult_flows tests.
>>>
>>> He suggested making the test file 10x larger in simult_flows.sh:
>>>
>>> -     size=$((2 * 2048 * 4096))
>>> +     size=$((2 * 2048 * 4096 * 10))
>>>
>>> Can you compare the test times between the export branch and this series,
>>> with both of them using the larger file size?
>>
>> 10x larger failed in my tests with a timeout, so I changed it to 5x
>> larger.
>>
>> Here's the patch:
>>
>> @@ -52,7 +52,7 @@ setup()
>>         sout=$(mktemp)
>>         cout=$(mktemp)
>>         capout=$(mktemp)
>> -       size=$((2 * 2048 * 4096))
>> +       size=$((2 * 2048 * 4096 * 5))
>>
>>         dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
>>         dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
>> @@ -210,13 +210,13 @@ do_transfer()
>>         fi
>>
>>         echo " [ fail ]"
>> -       echo "client exit code $retc, server $rets" 1>&2
>> -       echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
>> -       ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
>> -       echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
>> -       ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
>> -       ls -l $sin $cout
>> -       ls -l $cin $sout
>> +       #echo "client exit code $retc, server $rets" 1>&2
>> +       #echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
>> +       #ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
>> +       #echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
>> +       #ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
>> +       #ls -l $sin $cout
>> +       #ls -l $cin $sout
>>
>>         cat "$capout"
>>         return 1
>>
>> All logs are attached. "5x_10times_export.log" is for the export branch
>> and "5x_10times_refactor_v5.log" is for this series.
>
> I compared the data in the two logs. The total test time of this series
> is indeed longer than that of the export branch, but the difference is
> very small.
>
> I stripped the extraneous information from the two logs and retained
> only the running times and expected times. The result looks like this:
>
>> cat 5x_export.log | head
> 36150 max 36005
> 36135 max 36005
> 36087 max 36005
> 36103 max 36005
> 18284 max 18227
> 18349 max 18227
>
> Adding up all the running times and all the expected times:
>
>> awk '{ sum += $1 }; END { print sum }' 5x_export.log
> 2540920
>> awk '{ sum += $3 }; END { print sum }' 5x_export.log
> 2533820
>
> The ratio of the total running time to the total expected time is
> 1.00280209 (2540920/2533820).
>
> Using the same method on the data in 5x_10times_refactor_v5.log:
>
>> cat 5x_refactor_v5.log | head
> 36095 max 36005
> 36168 max 36005
> 36112 max 36005
> 36120 max 36005
> 18235 max 18227
> 18445 max 18227
>
>> awk '{ sum += $1 }; END { print sum }' 5x_refactor_v5.log
> 2546024
>> awk '{ sum += $3 }; END { print sum }' 5x_refactor_v5.log
> 2533820
>
> The ratio is 1.00481644, about 0.002 higher than for the export branch.
> I think this difference is acceptable.
>

Thanks for taking a look at the data, Geliang. To me it seems like the 0.2% 
difference is well within the margin of error, so there doesn't seem to be 
a significant performance regression in your data.

Paolo, how does this performance look to you?


--
Mat Martineau
Intel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic
  2022-10-06 12:17 ` [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic Geliang Tang
@ 2022-10-11  0:38   ` Mat Martineau
  0 siblings, 0 replies; 20+ messages in thread
From: Mat Martineau @ 2022-10-11  0:38 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

On Thu, 6 Oct 2022, Geliang Tang wrote:

> To support redundant packet schedulers more easily, this patch refactors
> __mptcp_push_pending() logic from:
>
> For each dfrag:
> 	While sends succeed:
> 		Call the scheduler (selects subflow and msk->snd_burst)
> 		Update subflow locks (push/release/acquire as needed)
> 		Send the dfrag data with mptcp_sendmsg_frag()
> 		Update already_sent, snd_nxt, snd_burst
> 	Update msk->first_pending
> Push/release on final subflow
>
> to:
>
> While the scheduler selects one subflow:
> 	Lock the subflow
> 	For each pending dfrag:
> 		While sends succeed:
> 			Send the dfrag data with mptcp_sendmsg_frag()
> 			Update already_sent, snd_nxt, snd_burst
> 		Update msk->first_pending
> 		Break if required by msk->snd_burst / etc
> 	Push and release the subflow
>
> This patch also moves the burst check conditions out of the function
> mptcp_subflow_get_send() and checks them in __mptcp_push_pending() and
> __mptcp_subflow_push_pending(), in the inner "for each pending dfrag"
> loop.
>
> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> ---
> net/mptcp/protocol.c | 86 ++++++++++++++++++++------------------------
> 1 file changed, 39 insertions(+), 47 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 84d33393d24e..bf77defbc546 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1417,14 +1417,6 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
> 	u64 linger_time;
> 	long tout = 0;
>
> -	/* re-use last subflow, if the burst allow that */
> -	if (msk->last_snd && msk->snd_burst > 0 &&
> -	    sk_stream_memory_free(msk->last_snd) &&
> -	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
> -		mptcp_set_timeout(sk);
> -		return msk->last_snd;
> -	}
> -
> 	/* pick the subflow with the lower wmem/wspace ratio */
> 	for (i = 0; i < SSK_MODE_MAX; ++i) {
> 		send_info[i].ssk = NULL;
> @@ -1530,60 +1522,53 @@ void mptcp_check_and_set_pending(struct sock *sk)
>
> void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> {
> -	struct sock *prev_ssk = NULL, *ssk = NULL;
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> 	struct mptcp_sendmsg_info info = {
> 				.flags = flags,
> 	};
> 	bool do_check_data_fin = false;
> 	struct mptcp_data_frag *dfrag;
> +	struct sock *ssk;
> 	int len;
>
> -	while ((dfrag = mptcp_send_head(sk))) {
> -		info.sent = dfrag->already_sent;
> -		info.limit = dfrag->data_len;
> -		len = dfrag->data_len - dfrag->already_sent;
> -		while (len > 0) {
> -			int ret = 0;
> -
> -			prev_ssk = ssk;
> -			ssk = mptcp_subflow_get_send(msk);
> -
> -			/* First check. If the ssk has changed since
> -			 * the last round, release prev_ssk
> -			 */
> -			if (ssk != prev_ssk && prev_ssk)
> -				mptcp_push_release(prev_ssk, &info);
> -			if (!ssk)
> -				goto out;
> +again:
> +	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
> +		lock_sock(ssk);
>
> -			/* Need to lock the new subflow only if different
> -			 * from the previous one, otherwise we are still
> -			 * helding the relevant lock
> -			 */
> -			if (ssk != prev_ssk)
> -				lock_sock(ssk);
> +		while ((dfrag = mptcp_send_head(sk))) {
> +			info.sent = dfrag->already_sent;
> +			info.limit = dfrag->data_len;
> +			len = dfrag->data_len - dfrag->already_sent;
> +			while (len > 0) {
> +				int ret = 0;
> +
> +				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
> +				if (ret <= 0) {
> +					if (ret == -EAGAIN)
> +						goto again;

ssk is still locked here, so jumping back to lock_sock(ssk) could 
deadlock, or leave this ssk locked when mptcp_subflow_get_send() 
selects a different subflow.
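
One way to keep the locking balanced (just a sketch, untested) would be
to release ssk before every retry:

	ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
	if (ret <= 0) {
		/* drop the subflow lock before calling the scheduler again */
		mptcp_push_release(ssk, &info);
		if (ret == -EAGAIN)
			goto again;
		goto out;
	}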


- Mat

> +					mptcp_push_release(ssk, &info);
> +					goto out;
> +				}
> +
> +				do_check_data_fin = true;
> +				info.sent += ret;
> +				len -= ret;
> +
> +				mptcp_update_post_push(msk, dfrag, ret);
> +			}
> +			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
>
> -			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
> -			if (ret <= 0) {
> -				if (ret == -EAGAIN)
> -					continue;
> +			if (msk->snd_burst <= 0 ||
> +			    !sk_stream_memory_free(ssk) ||
> +			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
> 				mptcp_push_release(ssk, &info);
> -				goto out;
> +				goto again;
> 			}
> -
> -			do_check_data_fin = true;
> -			info.sent += ret;
> -			len -= ret;
> -
> -			mptcp_update_post_push(msk, dfrag, ret);
> +			mptcp_set_timeout(sk);
> 		}
> -		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> -	}
>
> -	/* at this point we held the socket lock for the last subflow we used */
> -	if (ssk)
> 		mptcp_push_release(ssk, &info);
> +	}
>
> out:
> 	/* ensure the rtx timer is running */
> @@ -1636,6 +1621,13 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
> 			mptcp_update_post_push(msk, dfrag, ret);
> 		}
> 		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> +
> +		if (msk->snd_burst <= 0 ||
> +		    !sk_stream_memory_free(ssk) ||
> +		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
> +			goto out;
> +		}
> +		mptcp_set_timeout(sk);
> 	}
>
> out:
> -- 
> 2.35.3
>
>
>

--
Mat Martineau
Intel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (11 preceding siblings ...)
  2022-10-07  0:16 ` [PATCH mptcp-next v5 00/11] refactor push pending Mat Martineau
@ 2022-10-11  1:01 ` Mat Martineau
  12 siblings, 0 replies; 20+ messages in thread
From: Mat Martineau @ 2022-10-11  1:01 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

On Thu, 6 Oct 2022, Geliang Tang wrote:

> v5:
> - address Mat's comments in v4.
>

Hi Geliang -

I do think these changes are getting closer to being ready to merge; 
there is still a bisectability issue, which I mention in patch 3, and 
the feedback on the squash-to patches to address.

- Mat

> v4:
> - update __mptcp_subflow_push_pending as Mat suggested.
> - add more patches from "BPF redundant scheduler" series.
>
> v3:
> - add a cleanup patch.
> - remove msk->last_snd in mptcp_subflow_get_send().
> - add the loop that calls the scheduler again in __mptcp_push_pending().
>
> v2:
> - add snd_burst check in dfrags loop as Mat suggested.
>
> Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
> remove duplicate code and support redundant scheduler more easily in
> __mptcp_subflow_push_pending().
>
> Geliang Tang (11):
>  Squash to "mptcp: add get_subflow wrappers"
>  mptcp: 'first' argument for subflow_push_pending
>  mptcp: refactor push_pending logic
>  mptcp: drop last_snd for burst scheduler
>  mptcp: simplify push_pending
>  mptcp: multi subflows push_pending
>  mptcp: use msk instead of mptcp_sk
>  mptcp: refactor subflow_push_pending logic
>  mptcp: simplify subflow_push_pending
>  mptcp: multi subflows subflow_push_pending
>  mptcp: multi subflows retrans support
>
> net/mptcp/pm.c         |   9 +-
> net/mptcp/pm_netlink.c |   3 -
> net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
> net/mptcp/protocol.h   |   5 +-
> net/mptcp/sched.c      |  61 +++++----
> 5 files changed, 184 insertions(+), 179 deletions(-)
>
> -- 
> 2.35.3
>
>
>

--
Mat Martineau
Intel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: mptcp: multi subflows retrans support: Tests Results
  2022-10-02 14:25 [PATCH mptcp-next v4 11/11] mptcp: multi subflows retrans support Geliang Tang
@ 2022-10-06 15:43 ` MPTCP CI
  0 siblings, 0 replies; 20+ messages in thread
From: MPTCP CI @ 2022-10-06 15:43 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

Hi Geliang,

Thank you for your modifications; that's great!

Our CI did some validations and here is its report:

- KVM Validation: normal:
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/6082444804751360
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/6082444804751360/summary/summary.txt

- {"code":404,"message":
  - "Can't find artifacts containing file conclusion.txt"}:
  - Task: https://cirrus-ci.com/task/5519494851330048
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/5519494851330048/summary/summary.txt

Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/bc515f11df5d


If there are any issues, you can reproduce them using the same environment as
the one used by the CI, thanks to a docker image, e.g.:

    $ cd [kernel source code]
    $ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
        --pull always mptcp/mptcp-upstream-virtme-docker:latest \
        auto-debug

For more details:

    https://github.com/multipath-tcp/mptcp-upstream-virtme-docker


Please note that despite all the efforts already made to keep the test suite
stable when executed on a public CI like this one, it is possible that some
reported issues are not due to your modifications. Still, do not hesitate to
help us improve that ;-)

Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (Tessares)

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-10-11  1:01 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers" Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 02/11] mptcp: 'first' argument for subflow_push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic Geliang Tang
2022-10-11  0:38   ` Mat Martineau
2022-10-06 12:17 ` [PATCH mptcp-next v5 04/11] mptcp: drop last_snd for burst scheduler Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 05/11] mptcp: simplify push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 06/11] mptcp: multi subflows push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 07/11] mptcp: use msk instead of mptcp_sk Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 08/11] mptcp: refactor subflow_push_pending logic Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 09/11] mptcp: simplify subflow_push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 10/11] mptcp: multi subflows subflow_push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support Geliang Tang
2022-10-06 17:53   ` mptcp: multi subflows retrans support: Tests Results MPTCP CI
2022-10-07  0:16 ` [PATCH mptcp-next v5 00/11] refactor push pending Mat Martineau
2022-10-07  9:00   ` Geliang Tang
2022-10-10 15:05     ` Geliang Tang
2022-10-11  0:12       ` Mat Martineau
2022-10-11  1:01 ` Mat Martineau
  -- strict thread matches above, loose matches on Subject: below --
2022-10-02 14:25 [PATCH mptcp-next v4 11/11] mptcp: multi subflows retrans support Geliang Tang
2022-10-06 15:43 ` mptcp: multi subflows retrans support: Tests Results MPTCP CI
