* [PATCH mptcp-next v5 00/11] refactor push pending
@ 2022-10-06 12:17 Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers" Geliang Tang
                   ` (12 more replies)
  0 siblings, 13 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

v5:
- address Mat's comments in v4.

v4:
- update __mptcp_subflow_push_pending as Mat suggested.
- add more patches from "BPF redundant scheduler" series.

v3:
- add a cleanup patch.
- remove msk->last_snd in mptcp_subflow_get_send().
- add the loop that calls the scheduler again in __mptcp_push_pending().

v2:
- add snd_burst check in dfrags loop as Mat suggested.

Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
remove duplicate code and to make it easier to support redundant
schedulers in __mptcp_subflow_push_pending().

Geliang Tang (11):
  Squash to "mptcp: add get_subflow wrappers"
  mptcp: 'first' argument for subflow_push_pending
  mptcp: refactor push_pending logic
  mptcp: drop last_snd for burst scheduler
  mptcp: simplify push_pending
  mptcp: multi subflows push_pending
  mptcp: use msk instead of mptcp_sk
  mptcp: refactor subflow_push_pending logic
  mptcp: simplify subflow_push_pending
  mptcp: multi subflows subflow_push_pending
  mptcp: multi subflows retrans support

 net/mptcp/pm.c         |   9 +-
 net/mptcp/pm_netlink.c |   3 -
 net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
 net/mptcp/protocol.h   |   5 +-
 net/mptcp/sched.c      |  61 +++++----
 5 files changed, 184 insertions(+), 179 deletions(-)

-- 
2.35.3



* [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers"
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 02/11] mptcp: 'first' argument for subflow_push_pending Geliang Tang
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

Please update the commit log:

'''
This patch defines two new wrappers, mptcp_sched_get_send() and
mptcp_sched_get_retrans(), which invoke get_subflow() of msk->sched.

Set the subflow pointer array in struct mptcp_sched_data before invoking
get_subflow(), so that it can be used by get_subflow() in BPF contexts.

Check the subflow scheduled flags to test which subflow or subflows were
picked by the scheduler.

Move sock_owned_by_me() and the fallback check code from
mptcp_subflow_get_send/retrans() into the wrappers.
'''
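
As a minimal sketch of the intended calling convention after this change
(it mirrors the call sites added later in this series; illustrative only,
not part of this patch):

	/* the wrapper now returns 0 on success and marks the picked
	 * subflow(s) via the 'scheduled' flag, instead of returning a
	 * single ssk
	 */
	if (!mptcp_sched_get_send(msk)) {
		mptcp_for_each_subflow(msk, subflow) {
			if (READ_ONCE(subflow->scheduled)) {
				struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

				/* push pending data on ssk, then clear
				 * the flag
				 */
				mptcp_subflow_set_scheduled(subflow, false);
			}
		}
	}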
---
 net/mptcp/protocol.c |  8 +++---
 net/mptcp/protocol.h |  4 +--
 net/mptcp/sched.c    | 59 +++++++++++++++++++++-----------------------
 3 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8feb684408f7..d500a00fa778 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1547,7 +1547,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 			int ret = 0;
 
 			prev_ssk = ssk;
-			ssk = mptcp_sched_get_send(msk);
+			ssk = mptcp_subflow_get_send(msk);
 
 			/* First check. If the ssk has changed since
 			 * the last round, release prev_ssk
@@ -1616,7 +1616,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 			 * check for a different subflow usage only after
 			 * spooling the first chunk of data
 			 */
-			xmit_ssk = first ? ssk : mptcp_sched_get_send(mptcp_sk(sk));
+			xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
 			if (!xmit_ssk)
 				goto out;
 			if (xmit_ssk != ssk) {
@@ -2478,7 +2478,7 @@ static void __mptcp_retrans(struct sock *sk)
 	mptcp_clean_una_wakeup(sk);
 
 	/* first check ssk: need to kick "stale" logic */
-	ssk = mptcp_sched_get_retrans(msk);
+	ssk = mptcp_subflow_get_retrans(msk);
 	dfrag = mptcp_rtx_head(sk);
 	if (!dfrag) {
 		if (mptcp_data_fin_enabled(msk)) {
@@ -3196,7 +3196,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 		return;
 
 	if (!sock_owned_by_user(sk)) {
-		struct sock *xmit_ssk = mptcp_sched_get_send(mptcp_sk(sk));
+		struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
 
 		if (xmit_ssk == ssk)
 			__mptcp_subflow_push_pending(sk, ssk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 93c535440a5c..e81399debff9 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -640,8 +640,8 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
 				 bool scheduled);
 struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
 struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
-struct sock *mptcp_sched_get_send(struct mptcp_sock *msk);
-struct sock *mptcp_sched_get_retrans(struct mptcp_sock *msk);
+int mptcp_sched_get_send(struct mptcp_sock *msk);
+int mptcp_sched_get_retrans(struct mptcp_sock *msk);
 
 static inline bool __tcp_can_send(const struct sock *ssk)
 {
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index 044c5ec8bbfb..d1cad1eefb35 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -114,67 +114,64 @@ static int mptcp_sched_data_init(struct mptcp_sock *msk, bool reinject,
 	for (; i < MPTCP_SUBFLOWS_MAX; i++)
 		data->contexts[i] = NULL;
 
+	msk->snd_burst = 0;
+
 	return 0;
 }
 
-struct sock *mptcp_sched_get_send(struct mptcp_sock *msk)
+int mptcp_sched_get_send(struct mptcp_sock *msk)
 {
 	struct mptcp_sched_data data;
 	struct sock *ssk = NULL;
-	int i;
 
 	sock_owned_by_me((struct sock *)msk);
 
 	/* the following check is moved out of mptcp_subflow_get_send */
 	if (__mptcp_check_fallback(msk)) {
-		if (!msk->first)
-			return NULL;
-		return __tcp_can_send(msk->first) &&
-		       sk_stream_memory_free(msk->first) ? msk->first : NULL;
+		if (msk->first &&
+		    __tcp_can_send(msk->first) &&
+		    sk_stream_memory_free(msk->first)) {
+			mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
+			return 0;
+		}
+		return -EINVAL;
 	}
 
-	if (!msk->sched)
-		return mptcp_subflow_get_send(msk);
+	if (!msk->sched) {
+		ssk = mptcp_subflow_get_send(msk);
+		if (!ssk)
+			return -EINVAL;
+		mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+		return 0;
+	}
 
 	mptcp_sched_data_init(msk, false, &data);
 	msk->sched->get_subflow(msk, &data);
 
-	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
-		if (data.contexts[i] && READ_ONCE(data.contexts[i]->scheduled)) {
-			ssk = data.contexts[i]->tcp_sock;
-			msk->last_snd = ssk;
-			break;
-		}
-	}
-
-	return ssk;
+	return 0;
 }
 
-struct sock *mptcp_sched_get_retrans(struct mptcp_sock *msk)
+int mptcp_sched_get_retrans(struct mptcp_sock *msk)
 {
 	struct mptcp_sched_data data;
 	struct sock *ssk = NULL;
-	int i;
 
 	sock_owned_by_me((const struct sock *)msk);
 
 	/* the following check is moved out of mptcp_subflow_get_retrans */
 	if (__mptcp_check_fallback(msk))
-		return NULL;
+		return -EINVAL;
 
-	if (!msk->sched)
-		return mptcp_subflow_get_retrans(msk);
+	if (!msk->sched) {
+		ssk = mptcp_subflow_get_retrans(msk);
+		if (!ssk)
+			return -EINVAL;
+		mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+		return 0;
+	}
 
 	mptcp_sched_data_init(msk, true, &data);
 	msk->sched->get_subflow(msk, &data);
 
-	for (i = 0; i < MPTCP_SUBFLOWS_MAX; i++) {
-		if (data.contexts[i] && READ_ONCE(data.contexts[i]->scheduled)) {
-			ssk = data.contexts[i]->tcp_sock;
-			msk->last_snd = ssk;
-			break;
-		}
-	}
-
-	return ssk;
+	return 0;
 }
-- 
2.35.3



* [PATCH mptcp-next v5 02/11] mptcp: 'first' argument for subflow_push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers" Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic Geliang Tang
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

The function mptcp_subflow_process_delegated() uses the input ssk first,
while __mptcp_check_push() invokes the packet scheduler first.

So this patch adds a new argument, named 'first', to
__mptcp_subflow_push_pending() to handle these two cases separately.

With this change, the code that invokes the packet scheduler in
__mptcp_check_push() can be removed and replaced by a direct call to
__mptcp_subflow_push_pending().
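
The two resulting call sites then look like this (condensed from the diff
below):

	/* mptcp_subflow_process_delegated(): the caller already picked
	 * ssk, so spool the first chunk on it before re-invoking the
	 * scheduler
	 */
	__mptcp_subflow_push_pending(sk, ssk, true);

	/* __mptcp_check_push(): no subflow picked yet, let the push
	 * helper invoke the packet scheduler from the start
	 */
	__mptcp_subflow_push_pending(sk, ssk, false);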

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d500a00fa778..84d33393d24e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1593,7 +1593,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 		__mptcp_check_send_data_fin(sk);
 }
 
-static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
+static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
+					 bool first)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_sendmsg_info info = {
@@ -1602,7 +1603,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 	struct mptcp_data_frag *dfrag;
 	struct sock *xmit_ssk;
 	int len, copied = 0;
-	bool first = true;
 
 	info.flags = 0;
 	while ((dfrag = mptcp_send_head(sk))) {
@@ -1612,8 +1612,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 		while (len > 0) {
 			int ret = 0;
 
-			/* the caller already invoked the packet scheduler,
-			 * check for a different subflow usage only after
+			/* check for a different subflow usage only after
 			 * spooling the first chunk of data
 			 */
 			xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
@@ -3195,16 +3194,10 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 	if (!mptcp_send_head(sk))
 		return;
 
-	if (!sock_owned_by_user(sk)) {
-		struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
-
-		if (xmit_ssk == ssk)
-			__mptcp_subflow_push_pending(sk, ssk);
-		else if (xmit_ssk)
-			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND);
-	} else {
+	if (!sock_owned_by_user(sk))
+		__mptcp_subflow_push_pending(sk, ssk, false);
+	else
 		__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
-	}
 }
 
 #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
@@ -3295,7 +3288,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
 	if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) {
 		mptcp_data_lock(sk);
 		if (!sock_owned_by_user(sk))
-			__mptcp_subflow_push_pending(sk, ssk);
+			__mptcp_subflow_push_pending(sk, ssk, true);
 		else
 			__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
 		mptcp_data_unlock(sk);
-- 
2.35.3



* [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers" Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 02/11] mptcp: 'first' argument for subflow_push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-11  0:38   ` Mat Martineau
  2022-10-06 12:17 ` [PATCH mptcp-next v5 04/11] mptcp: drop last_snd for burst scheduler Geliang Tang
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

To support redundant packet schedulers more easily, this patch refactors
__mptcp_push_pending() logic from:

For each dfrag:
	While sends succeed:
		Call the scheduler (selects subflow and msk->snd_burst)
		Update subflow locks (push/release/acquire as needed)
		Send the dfrag data with mptcp_sendmsg_frag()
		Update already_sent, snd_nxt, snd_burst
	Update msk->first_pending
Push/release on final subflow

to:

While the scheduler selects one subflow:
	Lock the subflow
	For each pending dfrag:
		While sends succeed:
			Send the dfrag data with mptcp_sendmsg_frag()
			Update already_sent, snd_nxt, snd_burst
		Update msk->first_pending
		Break if required by msk->snd_burst / etc
	Push and release the subflow

This patch also moves the burst check conditions out of
mptcp_subflow_get_send(), checking them instead in __mptcp_push_pending()
and __mptcp_subflow_push_pending(), in the inner "for each pending dfrag"
loop.
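
Concretely, the moved burst check gates the inner dfrag loop as follows
(condensed from the diff below; when it triggers, the current subflow is
released and the scheduler is invoked again):

	if (msk->snd_burst <= 0 ||
	    !sk_stream_memory_free(ssk) ||
	    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
		mptcp_push_release(ssk, &info);
		goto again;	/* re-run the scheduler */
	}
	mptcp_set_timeout(sk);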

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 86 ++++++++++++++++++++------------------------
 1 file changed, 39 insertions(+), 47 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 84d33393d24e..bf77defbc546 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1417,14 +1417,6 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	u64 linger_time;
 	long tout = 0;
 
-	/* re-use last subflow, if the burst allow that */
-	if (msk->last_snd && msk->snd_burst > 0 &&
-	    sk_stream_memory_free(msk->last_snd) &&
-	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
-		mptcp_set_timeout(sk);
-		return msk->last_snd;
-	}
-
 	/* pick the subflow with the lower wmem/wspace ratio */
 	for (i = 0; i < SSK_MODE_MAX; ++i) {
 		send_info[i].ssk = NULL;
@@ -1530,60 +1522,53 @@ void mptcp_check_and_set_pending(struct sock *sk)
 
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
-	struct sock *prev_ssk = NULL, *ssk = NULL;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_sendmsg_info info = {
 				.flags = flags,
 	};
 	bool do_check_data_fin = false;
 	struct mptcp_data_frag *dfrag;
+	struct sock *ssk;
 	int len;
 
-	while ((dfrag = mptcp_send_head(sk))) {
-		info.sent = dfrag->already_sent;
-		info.limit = dfrag->data_len;
-		len = dfrag->data_len - dfrag->already_sent;
-		while (len > 0) {
-			int ret = 0;
-
-			prev_ssk = ssk;
-			ssk = mptcp_subflow_get_send(msk);
-
-			/* First check. If the ssk has changed since
-			 * the last round, release prev_ssk
-			 */
-			if (ssk != prev_ssk && prev_ssk)
-				mptcp_push_release(prev_ssk, &info);
-			if (!ssk)
-				goto out;
+again:
+	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
+		lock_sock(ssk);
 
-			/* Need to lock the new subflow only if different
-			 * from the previous one, otherwise we are still
-			 * helding the relevant lock
-			 */
-			if (ssk != prev_ssk)
-				lock_sock(ssk);
+		while ((dfrag = mptcp_send_head(sk))) {
+			info.sent = dfrag->already_sent;
+			info.limit = dfrag->data_len;
+			len = dfrag->data_len - dfrag->already_sent;
+			while (len > 0) {
+				int ret = 0;
+
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0) {
+					if (ret == -EAGAIN)
+						goto again;
+					mptcp_push_release(ssk, &info);
+					goto out;
+				}
+
+				do_check_data_fin = true;
+				info.sent += ret;
+				len -= ret;
+
+				mptcp_update_post_push(msk, dfrag, ret);
+			}
+			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 
-			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-			if (ret <= 0) {
-				if (ret == -EAGAIN)
-					continue;
+			if (msk->snd_burst <= 0 ||
+			    !sk_stream_memory_free(ssk) ||
+			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
 				mptcp_push_release(ssk, &info);
-				goto out;
+				goto again;
 			}
-
-			do_check_data_fin = true;
-			info.sent += ret;
-			len -= ret;
-
-			mptcp_update_post_push(msk, dfrag, ret);
+			mptcp_set_timeout(sk);
 		}
-		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
-	}
 
-	/* at this point we held the socket lock for the last subflow we used */
-	if (ssk)
 		mptcp_push_release(ssk, &info);
+	}
 
 out:
 	/* ensure the rtx timer is running */
@@ -1636,6 +1621,13 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 			mptcp_update_post_push(msk, dfrag, ret);
 		}
 		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+
+		if (msk->snd_burst <= 0 ||
+		    !sk_stream_memory_free(ssk) ||
+		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+			goto out;
+		}
+		mptcp_set_timeout(sk);
 	}
 
 out:
-- 
2.35.3



* [PATCH mptcp-next v5 04/11] mptcp: drop last_snd for burst scheduler
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (2 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 05/11] mptcp: simplify push_pending Geliang Tang
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

msk->last_snd is no longer used by the burst scheduler; it is now used
only by the round-robin BPF MPTCP scheduler. This patch removes the
last_snd related code from the burst scheduler.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/pm.c         |  9 +--------
 net/mptcp/pm_netlink.c |  3 ---
 net/mptcp/protocol.c   | 11 +----------
 net/mptcp/protocol.h   |  1 -
 net/mptcp/sched.c      |  2 ++
 5 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 45e2a48397b9..cdeb7280ac76 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -282,15 +282,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
 
 	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
 	msk = mptcp_sk(sk);
-	if (subflow->backup != bkup) {
+	if (subflow->backup != bkup)
 		subflow->backup = bkup;
-		mptcp_data_lock(sk);
-		if (!sock_owned_by_user(sk))
-			msk->last_snd = NULL;
-		else
-			__set_bit(MPTCP_RESET_SCHEDULER,  &msk->cb_flags);
-		mptcp_data_unlock(sk);
-	}
 
 	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
 }
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 9813ed0fde9b..1f2da4aedcb4 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -475,9 +475,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con
 
 	slow = lock_sock_fast(ssk);
 	if (prio) {
-		if (subflow->backup != backup)
-			msk->last_snd = NULL;
-
 		subflow->send_mp_prio = 1;
 		subflow->backup = backup;
 		subflow->request_bkup = backup;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index bf77defbc546..8708e1e4ba16 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1469,16 +1469,13 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 
 	burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
 	wmem = READ_ONCE(ssk->sk_wmem_queued);
-	if (!burst) {
-		msk->last_snd = NULL;
+	if (!burst)
 		return ssk;
-	}
 
 	subflow = mptcp_subflow_ctx(ssk);
 	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
 					   READ_ONCE(ssk->sk_pacing_rate) * burst,
 					   burst + wmem);
-	msk->last_snd = ssk;
 	msk->snd_burst = burst;
 	return ssk;
 }
@@ -2343,9 +2340,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		msk->first = NULL;
 
 out:
-	if (ssk == msk->last_snd)
-		msk->last_snd = NULL;
-
 	if (need_push)
 		__mptcp_push_pending(sk, 0);
 }
@@ -2978,7 +2972,6 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 	 * subflow
 	 */
 	mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
-	msk->last_snd = NULL;
 	WRITE_ONCE(msk->flags, 0);
 	msk->cb_flags = 0;
 	msk->push_pending = 0;
@@ -3239,8 +3232,6 @@ static void mptcp_release_cb(struct sock *sk)
 			__mptcp_set_connected(sk);
 		if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
 			__mptcp_error_report(sk);
-		if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
-			msk->last_snd = NULL;
 	}
 
 	__mptcp_update_rmem(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e81399debff9..05f4c6fd0cd8 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -124,7 +124,6 @@
 #define MPTCP_RETRANSMIT	4
 #define MPTCP_FLUSH_JOIN_LIST	5
 #define MPTCP_CONNECTED		6
-#define MPTCP_RESET_SCHEDULER	7
 
 static inline bool before64(__u64 seq1, __u64 seq2)
 {
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index d1cad1eefb35..3d805760ae99 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -67,6 +67,7 @@ int mptcp_init_sched(struct mptcp_sock *msk,
 	msk->sched = sched;
 	if (msk->sched->init)
 		msk->sched->init(msk);
+	msk->last_snd = NULL;
 
 	pr_debug("sched=%s", msk->sched->name);
 
@@ -84,6 +85,7 @@ void mptcp_release_sched(struct mptcp_sock *msk)
 	msk->sched = NULL;
 	if (sched->release)
 		sched->release(msk);
+	msk->last_snd = NULL;
 
 	bpf_module_put(sched, sched->owner);
 }
-- 
2.35.3



* [PATCH mptcp-next v5 05/11] mptcp: simplify push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (3 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 04/11] mptcp: drop last_snd for burst scheduler Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 06/11] mptcp: multi subflows push_pending Geliang Tang
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch moves the duplicate code from __mptcp_push_pending() and
__mptcp_subflow_push_pending() into a new helper function, named
__subflow_push_pending(), and simplifies __mptcp_push_pending() by
invoking this helper.
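
The helper's return convention, as used by the rewritten caller (condensed
from the diff below): a positive value is the number of bytes pushed,
-EAGAIN asks the caller to invoke the scheduler again, and any other
non-positive value stops the push:

	ret = __subflow_push_pending(sk, ssk, &info);
	release_sock(ssk);

	if (ret <= 0) {
		if (ret == -EAGAIN)
			goto again;	/* pick a (possibly different) subflow */
		goto out;
	}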

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 95 +++++++++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8708e1e4ba16..29905214103e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1480,12 +1480,6 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	return ssk;
 }
 
-static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
-{
-	tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
-	release_sock(ssk);
-}
-
 static void mptcp_update_post_push(struct mptcp_sock *msk,
 				   struct mptcp_data_frag *dfrag,
 				   u32 sent)
@@ -1517,61 +1511,80 @@ void mptcp_check_and_set_pending(struct sock *sk)
 		mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
 }
 
+static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
+				  struct mptcp_sendmsg_info *info)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_data_frag *dfrag;
+	int len, copied = 0, err = 0;
+
+	while ((dfrag = mptcp_send_head(sk))) {
+		info->sent = dfrag->already_sent;
+		info->limit = dfrag->data_len;
+		len = dfrag->data_len - dfrag->already_sent;
+		while (len > 0) {
+			int ret = 0;
+
+			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
+			if (ret <= 0) {
+				err = ret;
+				goto out;
+			}
+
+			info->sent += ret;
+			copied += ret;
+			len -= ret;
+
+			mptcp_update_post_push(msk, dfrag, ret);
+		}
+		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+
+		if (msk->snd_burst <= 0 ||
+		    !sk_stream_memory_free(ssk) ||
+		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+			err = -EAGAIN;
+			goto out;
+		}
+		mptcp_set_timeout(sk);
+	}
+
+out:
+	if (copied) {
+		tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle,
+			 info->size_goal);
+		err = copied;
+	}
+
+	return err;
+}
+
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_sendmsg_info info = {
 				.flags = flags,
 	};
-	bool do_check_data_fin = false;
-	struct mptcp_data_frag *dfrag;
 	struct sock *ssk;
-	int len;
+	int ret = 0;
 
 again:
 	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
 		lock_sock(ssk);
+		ret = __subflow_push_pending(sk, ssk, &info);
+		release_sock(ssk);
 
-		while ((dfrag = mptcp_send_head(sk))) {
-			info.sent = dfrag->already_sent;
-			info.limit = dfrag->data_len;
-			len = dfrag->data_len - dfrag->already_sent;
-			while (len > 0) {
-				int ret = 0;
-
-				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-				if (ret <= 0) {
-					if (ret == -EAGAIN)
-						goto again;
-					mptcp_push_release(ssk, &info);
-					goto out;
-				}
-
-				do_check_data_fin = true;
-				info.sent += ret;
-				len -= ret;
-
-				mptcp_update_post_push(msk, dfrag, ret);
-			}
-			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
-
-			if (msk->snd_burst <= 0 ||
-			    !sk_stream_memory_free(ssk) ||
-			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
-				mptcp_push_release(ssk, &info);
+		if (ret <= 0) {
+			if (ret == -EAGAIN)
 				goto again;
-			}
-			mptcp_set_timeout(sk);
+			goto out;
 		}
-
-		mptcp_push_release(ssk, &info);
 	}
 
 out:
 	/* ensure the rtx timer is running */
 	if (!mptcp_timer_pending(sk))
 		mptcp_reset_timer(sk);
-	if (do_check_data_fin)
+	if (ret > 0)
 		__mptcp_check_send_data_fin(sk);
 }
 
-- 
2.35.3



* [PATCH mptcp-next v5 06/11] mptcp: multi subflows push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (4 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 05/11] mptcp: simplify push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 07/11] mptcp: use msk instead of mptcp_sk Geliang Tang
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch adds multiple-subflow support to __mptcp_push_pending(), using
the mptcp_sched_get_send() wrapper instead of mptcp_subflow_get_send().

Check the subflow scheduled flags to test which subflow or subflows were
picked by the scheduler, and use them to send data.
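
The scheduled flag is what makes multiple picks possible: a get_subflow()
callback (for instance a hypothetical redundant BPF scheduler) may mark
more than one context, and the loop added below simply walks all of the
flagged ones:

	/* hypothetical get_subflow() callback: a redundant scheduler
	 * could flag several contexts instead of just one
	 */
	for (i = 0; i < MPTCP_SUBFLOWS_MAX && data->contexts[i]; i++)
		mptcp_subflow_set_scheduled(data->contexts[i], true);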

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 29905214103e..fdb879e09a32 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1561,22 +1561,31 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct mptcp_sendmsg_info info = {
-				.flags = flags,
-	};
-	struct sock *ssk;
 	int ret = 0;
 
 again:
-	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
-		lock_sock(ssk);
-		ret = __subflow_push_pending(sk, ssk, &info);
-		release_sock(ssk);
+	while (mptcp_send_head(sk) && !mptcp_sched_get_send(msk)) {
+		struct mptcp_subflow_context *subflow;
+		struct mptcp_sendmsg_info info = {
+			.flags = flags,
+		};
 
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				goto again;
-			goto out;
+		mptcp_for_each_subflow(msk, subflow) {
+			if (READ_ONCE(subflow->scheduled)) {
+				struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+				lock_sock(ssk);
+				ret = __subflow_push_pending(sk, ssk, &info);
+				release_sock(ssk);
+
+				if (ret <= 0) {
+					if (ret == -EAGAIN)
+						goto again;
+					goto out;
+				}
+				msk->last_snd = ssk;
+				mptcp_subflow_set_scheduled(subflow, false);
+			}
 		}
 	}
 
-- 
2.35.3



* [PATCH mptcp-next v5 07/11] mptcp: use msk instead of mptcp_sk
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (5 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 06/11] mptcp: multi subflows push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 08/11] mptcp: refactor subflow_push_pending logic Geliang Tang
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

Use msk instead of mptcp_sk(sk) in the functions where the variable
"msk = mptcp_sk(sk)" has been defined.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index fdb879e09a32..b15b97e8cbf7 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1619,7 +1619,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 			/* check for a different subflow usage only after
 			 * spooling the first chunk of data
 			 */
-			xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
+			xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
 			if (!xmit_ssk)
 				goto out;
 			if (xmit_ssk != ssk) {
@@ -2249,7 +2249,7 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
 	struct mptcp_data_frag *cur, *rtx_head;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
-	if (__mptcp_check_fallback(mptcp_sk(sk)))
+	if (__mptcp_check_fallback(msk))
 		return false;
 
 	if (tcp_rtx_and_write_queues_empty(sk))
@@ -2928,7 +2928,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
 
 	sock_hold(sk);
 	pr_debug("msk=%p state=%d", sk, sk->sk_state);
-	if (mptcp_sk(sk)->token)
+	if (msk->token)
 		mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
 
 	if (sk->sk_state == TCP_CLOSE) {
@@ -2987,8 +2987,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 	mptcp_stop_timer(sk);
 	sk_stop_timer(sk, &sk->sk_timer);
 
-	if (mptcp_sk(sk)->token)
-		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+	if (msk->token)
+		mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
 
 	/* msk->subflow is still intact, the following will not free the first
 	 * subflow
-- 
2.35.3



* [PATCH mptcp-next v5 08/11] mptcp: refactor subflow_push_pending logic
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (6 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 07/11] mptcp: use msk instead of mptcp_sk Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 09/11] mptcp: simplify subflow_push_pending Geliang Tang
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch refactors __mptcp_subflow_push_pending logic from:

For each dfrag:
	While sends succeed:
		Call the scheduler (selects subflow and msk->snd_burst)
		Delegate the send with mptcp_subflow_delegate() and break,
		  if a different subflow was selected
		Send the dfrag data with mptcp_sendmsg_frag()
		Update dfrag->already_sent, msk->snd_nxt, msk->snd_burst
	Update msk->first_pending

to:

While first_pending isn't empty:
	Call the scheduler (selects subflow and msk->snd_burst)
	Delegate the send with mptcp_subflow_delegate() and break,
	  if a different subflow was selected
	For each pending dfrag:
		While sends succeed:
			Send the dfrag data with mptcp_sendmsg_frag()
			Update already_sent, snd_nxt, snd_burst
		Update msk->first_pending
		Break if required by msk->snd_burst / etc
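
In code, the reordered scheduler step of the outer loop reads (condensed
from the diff below):

	while (mptcp_send_head(sk)) {
		xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
		if (!xmit_ssk)
			goto out;
		if (xmit_ssk != ssk) {
			/* hand the send over to the scheduled subflow */
			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
					       MPTCP_DELEGATE_SEND);
			goto out;
		}
		/* ... inner "for each pending dfrag" loop ... */
	}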

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 66 +++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index b15b97e8cbf7..e2f47e8ca63e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1609,44 +1609,46 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 	int len, copied = 0;
 
 	info.flags = 0;
-	while ((dfrag = mptcp_send_head(sk))) {
-		info.sent = dfrag->already_sent;
-		info.limit = dfrag->data_len;
-		len = dfrag->data_len - dfrag->already_sent;
-		while (len > 0) {
-			int ret = 0;
+	while (mptcp_send_head(sk)) {
+		/* check for a different subflow usage only after
+		 * spooling the first chunk of data
+		 */
+		xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
+		if (!xmit_ssk)
+			goto out;
+		if (xmit_ssk != ssk) {
+			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
+					       MPTCP_DELEGATE_SEND);
+			goto out;
+		}
 
-			/* check for a different subflow usage only after
-			 * spooling the first chunk of data
-			 */
-			xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
-			if (!xmit_ssk)
-				goto out;
-			if (xmit_ssk != ssk) {
-				mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
-						       MPTCP_DELEGATE_SEND);
-				goto out;
-			}
+		while ((dfrag = mptcp_send_head(sk))) {
+			info.sent = dfrag->already_sent;
+			info.limit = dfrag->data_len;
+			len = dfrag->data_len - dfrag->already_sent;
+			while (len > 0) {
+				int ret = 0;
 
-			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-			if (ret <= 0)
-				goto out;
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0)
+					goto out;
 
-			info.sent += ret;
-			copied += ret;
-			len -= ret;
-			first = false;
+				info.sent += ret;
+				copied += ret;
+				len -= ret;
+				first = false;
 
-			mptcp_update_post_push(msk, dfrag, ret);
-		}
-		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+				mptcp_update_post_push(msk, dfrag, ret);
+			}
+			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 
-		if (msk->snd_burst <= 0 ||
-		    !sk_stream_memory_free(ssk) ||
-		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
-			goto out;
+			if (msk->snd_burst <= 0 ||
+			    !sk_stream_memory_free(ssk) ||
+			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+				goto out;
+			}
+			mptcp_set_timeout(sk);
 		}
-		mptcp_set_timeout(sk);
 	}
 
 out:
-- 
2.35.3



* [PATCH mptcp-next v5 09/11] mptcp: simplify subflow_push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (7 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 08/11] mptcp: refactor subflow_push_pending logic Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 10/11] mptcp: multi subflows subflow_push_pending Geliang Tang
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch simplifies __mptcp_subflow_push_pending() by invoking
__subflow_push_pending() helper.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 39 ++++++++-------------------------------
 1 file changed, 8 insertions(+), 31 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e2f47e8ca63e..66436885b749 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1604,10 +1604,10 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 	struct mptcp_sendmsg_info info = {
 		.data_lock_held = true,
 	};
-	struct mptcp_data_frag *dfrag;
 	struct sock *xmit_ssk;
-	int len, copied = 0;
+	int ret = 0;
 
+again:
 	info.flags = 0;
 	while (mptcp_send_head(sk)) {
 		/* check for a different subflow usage only after
@@ -1622,32 +1622,11 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 			goto out;
 		}
 
-		while ((dfrag = mptcp_send_head(sk))) {
-			info.sent = dfrag->already_sent;
-			info.limit = dfrag->data_len;
-			len = dfrag->data_len - dfrag->already_sent;
-			while (len > 0) {
-				int ret = 0;
-
-				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-				if (ret <= 0)
-					goto out;
-
-				info.sent += ret;
-				copied += ret;
-				len -= ret;
-				first = false;
-
-				mptcp_update_post_push(msk, dfrag, ret);
-			}
-			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
-
-			if (msk->snd_burst <= 0 ||
-			    !sk_stream_memory_free(ssk) ||
-			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
-				goto out;
-			}
-			mptcp_set_timeout(sk);
+		ret = __subflow_push_pending(sk, ssk, &info);
+		if (ret <= 0) {
+			if (ret == -EAGAIN)
+				goto again;
+			break;
 		}
 	}
 
@@ -1655,9 +1634,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 	/* __mptcp_alloc_tx_skb could have released some wmem and we are
 	 * not going to flush it via release_sock()
 	 */
-	if (copied) {
-		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
-			 info.size_goal);
+	if (ret > 0) {
 		if (!mptcp_timer_pending(sk))
 			mptcp_reset_timer(sk);
 
-- 
2.35.3



* [PATCH mptcp-next v5 10/11] mptcp: multi subflows subflow_push_pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (8 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 09/11] mptcp: simplify subflow_push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 12:17 ` [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support Geliang Tang
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch adds multiple-subflow support to
__mptcp_subflow_push_pending(), using the mptcp_sched_get_send() wrapper
instead of mptcp_subflow_get_send().

Check the subflow scheduled flags to test which subflow or subflows were
picked by the scheduler, and use them to send data.
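
The per-subflow decision then becomes (condensed from the diff below,
error handling omitted): push directly when the scheduler picked the
already-locked ssk, delegate otherwise:

	mptcp_for_each_subflow(msk, subflow) {
		if (READ_ONCE(subflow->scheduled)) {
			struct sock *xmit_ssk = mptcp_subflow_tcp_sock(subflow);

			if (xmit_ssk != ssk)
				mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
						       MPTCP_DELEGATE_SEND);
			else
				ret = __subflow_push_pending(sk, ssk, &info);
			mptcp_subflow_set_scheduled(subflow, false);
		}
	}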

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 50 ++++++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 66436885b749..ecea2a400e6b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1601,10 +1601,10 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 					 bool first)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *subflow;
 	struct mptcp_sendmsg_info info = {
 		.data_lock_held = true,
 	};
-	struct sock *xmit_ssk;
 	int ret = 0;
 
 again:
@@ -1613,20 +1613,44 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
 		/* check for a different subflow usage only after
 		 * spooling the first chunk of data
 		 */
-		xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
-		if (!xmit_ssk)
-			goto out;
-		if (xmit_ssk != ssk) {
-			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
-					       MPTCP_DELEGATE_SEND);
-			goto out;
+		if (first) {
+			ret = __subflow_push_pending(sk, ssk, &info);
+			if (ret <= 0) {
+				if (ret == -EAGAIN)
+					goto again;
+				break;
+			}
+			first = false;
+			msk->last_snd = ssk;
+			continue;
 		}
 
-		ret = __subflow_push_pending(sk, ssk, &info);
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				goto again;
-			break;
+		if (mptcp_sched_get_send(msk))
+			goto out;
+
+		mptcp_for_each_subflow(msk, subflow) {
+			if (READ_ONCE(subflow->scheduled)) {
+				struct sock *xmit_ssk = mptcp_subflow_tcp_sock(subflow);
+
+				if (xmit_ssk != ssk) {
+					if (mptcp_subflow_has_delegated_action(subflow))
+						goto out;
+					mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
+							       MPTCP_DELEGATE_SEND);
+					msk->last_snd = xmit_ssk;
+					mptcp_subflow_set_scheduled(subflow, false);
+					continue;
+				}
+
+				ret = __subflow_push_pending(sk, ssk, &info);
+				if (ret <= 0) {
+					if (ret == -EAGAIN)
+						goto again;
+					goto out;
+				}
+				msk->last_snd = ssk;
+				mptcp_subflow_set_scheduled(subflow, false);
+			}
 		}
 	}
 
-- 
2.35.3



* [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (9 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 10/11] mptcp: multi subflows subflow_push_pending Geliang Tang
@ 2022-10-06 12:17 ` Geliang Tang
  2022-10-06 17:53   ` mptcp: multi subflows retrans support: Tests Results MPTCP CI
  2022-10-07  0:16 ` [PATCH mptcp-next v5 00/11] refactor push pending Mat Martineau
  2022-10-11  1:01 ` Mat Martineau
  12 siblings, 1 reply; 19+ messages in thread
From: Geliang Tang @ 2022-10-06 12:17 UTC (permalink / raw)
  To: mptcp; +Cc: Geliang Tang

This patch adds multiple-subflow support to __mptcp_retrans(), using the
mptcp_sched_get_retrans() wrapper instead of
mptcp_subflow_get_retrans().

Iterate over each subflow of the msk and check the scheduled flag to test
whether it was picked by the scheduler. If so, use it to retransmit data.
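
A condensed view of the resulting retransmit loop (from the diff below,
error handling omitted):

	if (!mptcp_sched_get_retrans(msk)) {
		mptcp_for_each_subflow(msk, subflow) {
			if (READ_ONCE(subflow->scheduled)) {
				ssk = mptcp_subflow_tcp_sock(subflow);

				/* resend the bytes already sent on some
				 * subflow, then clear the flag
				 */
				mptcp_subflow_set_scheduled(subflow, false);
			}
		}
	}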

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 62 ++++++++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index ecea2a400e6b..b422d5de435b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2479,16 +2479,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 static void __mptcp_retrans(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *subflow;
 	struct mptcp_sendmsg_info info = {};
 	struct mptcp_data_frag *dfrag;
-	size_t copied = 0;
 	struct sock *ssk;
-	int ret;
+	int ret, err;
+	u16 len = 0;
 
 	mptcp_clean_una_wakeup(sk);
 
 	/* first check ssk: need to kick "stale" logic */
-	ssk = mptcp_subflow_get_retrans(msk);
+	err = mptcp_sched_get_retrans(msk);
 	dfrag = mptcp_rtx_head(sk);
 	if (!dfrag) {
 		if (mptcp_data_fin_enabled(msk)) {
@@ -2507,31 +2508,46 @@ static void __mptcp_retrans(struct sock *sk)
 		goto reset_timer;
 	}
 
-	if (!ssk)
+	if (err)
 		goto reset_timer;
 
-	lock_sock(ssk);
+	mptcp_for_each_subflow(msk, subflow) {
+		if (READ_ONCE(subflow->scheduled)) {
+			u16 copied = 0;
+
+			ssk = mptcp_subflow_tcp_sock(subflow);
+			if (!ssk)
+				goto reset_timer;
+
+			lock_sock(ssk);
+
+			/* limit retransmission to the bytes already sent on some subflows */
+			info.sent = 0;
+			info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
+								    dfrag->already_sent;
+			while (info.sent < info.limit) {
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0)
+					break;
+
+				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+				copied += ret;
+				info.sent += ret;
+			}
+			if (copied) {
+				len = max(copied, len);
+				tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+					 info.size_goal);
+				WRITE_ONCE(msk->allow_infinite_fallback, false);
+			}
 
-	/* limit retransmission to the bytes already sent on some subflows */
-	info.sent = 0;
-	info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
-	while (info.sent < info.limit) {
-		ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-		if (ret <= 0)
-			break;
+			release_sock(ssk);
 
-		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
-		copied += ret;
-		info.sent += ret;
-	}
-	if (copied) {
-		dfrag->already_sent = max(dfrag->already_sent, info.sent);
-		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
-			 info.size_goal);
-		WRITE_ONCE(msk->allow_infinite_fallback, false);
+			msk->last_snd = ssk;
+			mptcp_subflow_set_scheduled(subflow, false);
+		}
 	}
-
-	release_sock(ssk);
+	dfrag->already_sent = max(dfrag->already_sent, len);
 
 reset_timer:
 	mptcp_check_and_set_pending(sk);
-- 
2.35.3



* Re: mptcp: multi subflows retrans support: Tests Results
  2022-10-06 12:17 ` [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support Geliang Tang
@ 2022-10-06 17:53   ` MPTCP CI
  0 siblings, 0 replies; 19+ messages in thread
From: MPTCP CI @ 2022-10-06 17:53 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

Hi Geliang,

Thank you for your modifications, that's great!

Our CI did some validations and here is its report:

- KVM Validation: normal:
  - Unstable: 2 failed test(s): packetdrill_add_addr selftest_mptcp_join 🔴:
  - Task: https://cirrus-ci.com/task/5053437011296256
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/5053437011296256/summary/summary.txt

- KVM Validation: debug:
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/6179336918138880
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/6179336918138880/summary/summary.txt

Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/acfdc20f458a


If there are some issues, you can reproduce them using the same environment as
the one used by the CI thanks to a docker image, e.g.:

    $ cd [kernel source code]
    $ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
        --pull always mptcp/mptcp-upstream-virtme-docker:latest \
        auto-debug

For more details:

    https://github.com/multipath-tcp/mptcp-upstream-virtme-docker


Please note that despite all the efforts already made to have a stable test
suite when executed on a public CI like this one, it is possible that some
reported issues are not due to your modifications. Still, do not hesitate to
help us improve that ;-)

Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (Tessares)


* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (10 preceding siblings ...)
  2022-10-06 12:17 ` [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support Geliang Tang
@ 2022-10-07  0:16 ` Mat Martineau
  2022-10-07  9:00   ` Geliang Tang
  2022-10-11  1:01 ` Mat Martineau
  12 siblings, 1 reply; 19+ messages in thread
From: Mat Martineau @ 2022-10-07  0:16 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

On Thu, 6 Oct 2022, Geliang Tang wrote:

> v5:
> - address Mat's comments in v4.

Hi Geliang -

Thanks for the v5. I haven't finished looking over all the patches in 
detail yet, but two things I do want to reply to right now:

  * Thanks for explaining in patch 4 that last_snd is still useful for 
round robin. I had forgotten about that, and it looked like a "write-only" 
variable in the kernel code.

  * In the meeting today Paolo suggested that a good test for the new 
scheduler loop would be to modify simult_flows.sh to use much larger 
files, then see if the modified code slowed down any of the simult_flows 
tests.

He suggested making the test file 10x larger in simult_flows.sh:

-	size=$((2 * 2048 * 4096))
+	size=$((2 * 2048 * 4096 * 10))

Can you compare the test times between the export branch and this series, 
with both of them using the larger file size?


Thanks,

Mat


>
> v4:
> - update __mptcp_subflow_push_pending as Mat suggested.
> - add more patches from "BPF redundant scheduler" series.
>
> v3:
> - add a cleanup patch.
> - remove msk->last_snd in mptcp_subflow_get_send().
> - add the loop that calls the scheduler again in __mptcp_push_pending().
>
> v2:
> - add snd_burst check in dfrags loop as Mat suggested.
>
> Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
> remove duplicate code and support redundant scheduler more easily in
> __mptcp_subflow_push_pending().
>
> Geliang Tang (11):
>  Squash to "mptcp: add get_subflow wrappers"
>  mptcp: 'first' argument for subflow_push_pending
>  mptcp: refactor push_pending logic
>  mptcp: drop last_snd for burst scheduler
>  mptcp: simplify push_pending
>  mptcp: multi subflows push_pending
>  mptcp: use msk instead of mptcp_sk
>  mptcp: refactor subflow_push_pending logic
>  mptcp: simplify subflow_push_pending
>  mptcp: multi subflows subflow_push_pending
>  mptcp: multi subflows retrans support
>
> net/mptcp/pm.c         |   9 +-
> net/mptcp/pm_netlink.c |   3 -
> net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
> net/mptcp/protocol.h   |   5 +-
> net/mptcp/sched.c      |  61 +++++----
> 5 files changed, 184 insertions(+), 179 deletions(-)
>
> -- 
> 2.35.3
>
>
>

--
Mat Martineau
Intel


* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-07  0:16 ` [PATCH mptcp-next v5 00/11] refactor push pending Mat Martineau
@ 2022-10-07  9:00   ` Geliang Tang
  2022-10-10 15:05     ` Geliang Tang
  0 siblings, 1 reply; 19+ messages in thread
From: Geliang Tang @ 2022-10-07  9:00 UTC (permalink / raw)
  To: Mat Martineau; +Cc: mptcp

[-- Attachment #1: Type: text/plain, Size: 3689 bytes --]

On Thu, Oct 06, 2022 at 05:16:55PM -0700, Mat Martineau wrote:
> On Thu, 6 Oct 2022, Geliang Tang wrote:
> 
> > v5:
> > - address Mat's comments in v4.
> 
> Hi Geliang -
> 
> Thanks for the v5. I haven't finished looking over all the patches in detail
> yet, but two things I do want to reply to right now:
> 
>  * Thanks for explaining in patch 4 that last_snd is still useful for round
> robin. I had forgotten about that, and it looked like a "write-only"
> variable in the kernel code.
> 
>  * In the meeting today Paolo suggested that a good test for the new
> scheduler loop would be to modify simult_flows.sh to use much larger files,
> then see if the modified code slowed down any of the simult_flows tests.
> 
> He suggested making the test file 10x larger in simult_flows.sh:
> 
> -	size=$((2 * 2048 * 4096))
> +	size=$((2 * 2048 * 4096 * 10))
> 
> Can you compare the test times between the export branch and this series,
> with both of them using the larger file size?

The 10x larger size failed in my tests with a timeout, so I changed it to
5x larger.

Here's the patch:

@@ -52,7 +52,7 @@ setup()
 	sout=$(mktemp)
 	cout=$(mktemp)
 	capout=$(mktemp)
-	size=$((2 * 2048 * 4096))
+	size=$((2 * 2048 * 4096 * 5))

 	dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
 	dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
@@ -210,13 +210,13 @@ do_transfer()
 	fi

 	echo " [ fail ]"
-	echo "client exit code $retc, server $rets" 1>&2
-	echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
-	ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
-	echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
-	ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
-	ls -l $sin $cout
-	ls -l $cin $sout
+	#echo "client exit code $retc, server $rets" 1>&2
+	#echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
+	#ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
+	#echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
+	#ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
+	#ls -l $sin $cout
+	#ls -l $cin $sout

 	cat "$capout"
 	return 1

All logs are attached. "5x_10times_export.log" is for the export branch
and "5x_10times_refactor_v5.log" is for this series.

Thanks,
-Geliang

> 
> 
> Thanks,
> 
> Mat
> 
> 
> > 
> > v4:
> > - update __mptcp_subflow_push_pending as Mat suggested.
> > - add more patches from "BPF redundant scheduler" series.
> > 
> > v3:
> > - add a cleanup patch.
> > - remove msk->last_snd in mptcp_subflow_get_send().
> > - add the loop that calls the scheduler again in __mptcp_push_pending().
> > 
> > v2:
> > - add snd_burst check in dfrags loop as Mat suggested.
> > 
> > Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
> > remove duplicate code and support redundant scheduler more easily in
> > __mptcp_subflow_push_pending().
> > 
> > Geliang Tang (11):
> >  Squash to "mptcp: add get_subflow wrappers"
> >  mptcp: 'first' argument for subflow_push_pending
> >  mptcp: refactor push_pending logic
> >  mptcp: drop last_snd for burst scheduler
> >  mptcp: simplify push_pending
> >  mptcp: multi subflows push_pending
> >  mptcp: use msk instead of mptcp_sk
> >  mptcp: refactor subflow_push_pending logic
> >  mptcp: simplify subflow_push_pending
> >  mptcp: multi subflows subflow_push_pending
> >  mptcp: multi subflows retrans support
> > 
> > net/mptcp/pm.c         |   9 +-
> > net/mptcp/pm_netlink.c |   3 -
> > net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
> > net/mptcp/protocol.h   |   5 +-
> > net/mptcp/sched.c      |  61 +++++----
> > 5 files changed, 184 insertions(+), 179 deletions(-)
> > 
> > -- 
> > 2.35.3
> > 
> > 
> > 
> 
> --
> Mat Martineau
> Intel

[-- Attachment #2: 5x_10times_export.log --]
[-- Type: text/plain, Size: 13848 bytes --]

1

balanced bwidth                                             transfer slower than expected! runtime 36150 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36135 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36087 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36103 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18173 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18284 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18160 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction 18169 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18349 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction18170 max 18227      [ OK ]

2

balanced bwidth                                             transfer slower than expected! runtime 36085 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36095 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36104 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36106 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18364 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18272 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18246 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction 18150 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            18216 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction18199 max 18227      [ OK ]

3

balanced bwidth                                             transfer slower than expected! runtime 36114 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36082 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36125 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36093 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18198 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18431 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18312 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction 18197 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18284 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse directiontransfer slower than expected! runtime 18450 ms, expected 18227 ms max 18227       [ fail ]

4

balanced bwidth                                             transfer slower than expected! runtime 36111 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36112 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36126 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36106 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18217 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       18214 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18366 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18347 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            18218 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18459 ms, expected 18227 ms max 18227       [ fail ]

5

balanced bwidth                                             transfer slower than expected! runtime 36128 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36106 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36125 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36108 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18172 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18288 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18312 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18265 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18344 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction 18133 max 18227      [ OK ]

6

balanced bwidth                                             transfer slower than expected! runtime 36114 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36085 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36156 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36109 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18345 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18409 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18167 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18317 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            18178 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction 18163 max 18227      [ OK ]

7

balanced bwidth                                             transfer slower than expected! runtime 36150 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36087 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36108 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36082 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18292 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18442 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18283 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction 18203 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            18218 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18383 ms, expected 18227 ms max 18227       [ fail ]

8

balanced bwidth                                             transfer slower than expected! runtime 36090 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36089 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36106 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36101 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18271 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18267 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18330 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18323 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18332 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18357 ms, expected 18227 ms max 18227       [ fail ]

9

balanced bwidth                                             transfer slower than expected! runtime 36134 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36108 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36110 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36077 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18256 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18232 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18265 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction 18211 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18389 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction 18165 max 18227      [ OK ]

10

balanced bwidth                                             transfer slower than expected! runtime 36113 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36089 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36092 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36103 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18413 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       18162 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay                     18139 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18349 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18256 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18540 ms, expected 18227 ms max 18227       [ fail ]

[-- Attachment #3: 5x_10times_refactor_v5.log --]
[-- Type: text/plain, Size: 14553 bytes --]


1

balanced bwidth                                             transfer slower than expected! runtime 36095 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36168 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36112 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36120 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18235 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18445 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18310 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18299 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18388 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18493 ms, expected 18227 ms max 18227       [ fail ]

2

balanced bwidth                                             transfer slower than expected! runtime 36134 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36202 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36174 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36145 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18144 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18329 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18281 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18512 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18297 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18361 ms, expected 18227 ms max 18227       [ fail ]

3

balanced bwidth                                             transfer slower than expected! runtime 36186 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36166 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36157 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36185 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18207 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18270 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18148 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18247 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18405 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18495 ms, expected 18227 ms max 18227       [ fail ]

4

balanced bwidth                                             transfer slower than expected! runtime 36128 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36145 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36176 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36109 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18355 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18521 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18203 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18282 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18473 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18415 ms, expected 18227 ms max 18227       [ fail ]

5

balanced bwidth                                             transfer slower than expected! runtime 36131 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36167 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36173 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36143 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18182 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18377 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18184 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18274 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            18151 max 18227      [ OK ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18450 ms, expected 18227 ms max 18227       [ fail ]

6

balanced bwidth                                             transfer slower than expected! runtime 36129 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36199 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36133 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36158 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18133 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18488 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18334 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18457 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18379 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18271 ms, expected 18227 ms max 18227       [ fail ]

7

balanced bwidth                                             transfer slower than expected! runtime 36135 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36119 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36131 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36112 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18344 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18378 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18145 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18434 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18612 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18535 ms, expected 18227 ms max 18227       [ fail ]

8

balanced bwidth                                             transfer slower than expected! runtime 36158 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36180 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36108 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36136 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           18183 max 18227      [ OK ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18352 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18151 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18423 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18333 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18373 ms, expected 18227 ms max 18227       [ fail ]

9

balanced bwidth                                             transfer slower than expected! runtime 36123 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36152 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36112 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36128 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18267 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18311 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     18167 max 18227      [ OK ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18294 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18279 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18392 ms, expected 18227 ms max 18227       [ fail ]

10

balanced bwidth                                             transfer slower than expected! runtime 36152 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth - reverse direction                         transfer slower than expected! runtime 36164 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay                       transfer slower than expected! runtime 36172 ms, expected 36005 ms max 36005       [ fail ]
balanced bwidth with unbalanced delay - reverse direction   transfer slower than expected! runtime 36170 ms, expected 36005 ms max 36005       [ fail ]
unbalanced bwidth                                           transfer slower than expected! runtime 18345 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth - reverse direction                       transfer slower than expected! runtime 18317 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay                     transfer slower than expected! runtime 18367 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with unbalanced delay - reverse direction transfer slower than expected! runtime 18407 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay            transfer slower than expected! runtime 18444 ms, expected 18227 ms max 18227       [ fail ]
unbalanced bwidth with opposed, unbalanced delay - reverse direction transfer slower than expected! runtime 18489 ms, expected 18227 ms max 18227       [ fail ]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-07  9:00   ` Geliang Tang
@ 2022-10-10 15:05     ` Geliang Tang
  2022-10-11  0:12       ` Mat Martineau
  0 siblings, 1 reply; 19+ messages in thread
From: Geliang Tang @ 2022-10-10 15:05 UTC (permalink / raw)
  To: Geliang Tang; +Cc: Mat Martineau, mptcp

On Fri, 7 Oct 2022 at 16:59, Geliang Tang <geliang.tang@suse.com> wrote:
>
> On Thu, Oct 06, 2022 at 05:16:55PM -0700, Mat Martineau wrote:
> > On Thu, 6 Oct 2022, Geliang Tang wrote:
> >
> > > v5:
> > > - address Mat's comments in v4.
> >
> > Hi Geliang -
> >
> > Thanks for the v5. I haven't finished looking over all the patches in detail
> > yet, but two things I do want to reply to right now:
> >
> >  * Thanks for explaining in patch 4 that last_snd is still useful for round
> > robin. I had forgotten about that, and it looked like a "write-only"
> > variable in the kernel code.
> >
> >  * In the meeting today Paolo suggested that a good test for the new
> > scheduler loop would be to modify simult_flows.sh to use much larger files,
> > then see if the modified code slowed down any of the simult_flows tests.
> >
> > He suggested making the test file 10x larger in simult_flows.sh:
> >
> > -     size=$((2 * 2048 * 4096))
> > +     size=$((2 * 2048 * 4096 * 10))
> >
> > Can you compare the test times between the export branch and this series,
> > with both of them using the larger file size?
>
> 10x larger failed in my tests with timeout. So I changed it to 5x
> larger.
>
> Here's the patch:
>
> @@ -52,7 +52,7 @@ setup()
>         sout=$(mktemp)
>         cout=$(mktemp)
>         capout=$(mktemp)
> -       size=$((2 * 2048 * 4096))
> +       size=$((2 * 2048 * 4096 * 5))
>
>         dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
>         dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
> @@ -210,13 +210,13 @@ do_transfer()
>         fi
>
>         echo " [ fail ]"
> -       echo "client exit code $retc, server $rets" 1>&2
> -       echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
> -       ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
> -       echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
> -       ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
> -       ls -l $sin $cout
> -       ls -l $cin $sout
> +       #echo "client exit code $retc, server $rets" 1>&2
> +       #echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
> +       #ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
> +       #echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
> +       #ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
> +       #ls -l $sin $cout
> +       #ls -l $cin $sout
>
>         cat "$capout"
>         return 1
>
> All logs are attached. "5x_10times_export.log" is for the export branch
> and "5x_10times_refactor_v5.log" is for this series.

I compared the data in the two logs. The test time of this series is
indeed longer than that of the export branch, but the difference is
very small.

I stripped the extraneous information from the two logs, keeping only
the running time and the expected time. It looks like this:

> cat 5x_export.log | head
36150 max 36005
36135 max 36005
36087 max 36005
36103 max 36005
18284 max 18227
18349 max 18227

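(For reference, the stripping can be reproduced with a sed sketch along
these lines -- the exact command may have differed, but 5x_export.log is
meant as the stripped copy of 5x_10times_export.log:)

  # Keep only "<runtime> max <expected>": the first expression handles
  # the "transfer slower than expected!" lines, the second the [ OK ]
  # lines.
  sed -n \
      -e 's/.*runtime \([0-9][0-9]*\) ms.*max \([0-9][0-9]*\).*/\1 max \2/p' \
      -e 's/.* \([0-9][0-9]*\) max \([0-9][0-9]*\).*OK.*/\1 max \2/p' \
      5x_10times_export.log > 5x_export.log
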
Adding up all the running times and expected times:

> awk '{ sum += $1 }; END { print sum }' 5x_export.log
2540920
> awk '{ sum += $3 }; END { print sum }' 5x_export.log
2533820

The ratio of the total running time to the total expected time is
1.00280209 (2540920/2533820).

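(Equivalently, both sums and the ratio can be computed in one awk pass
over the stripped log -- a sketch:

  awk '{ run += $1; max += $3 }
       END { printf "%d / %d = %.8f\n", run, max, run / max }' 5x_export.log

which prints "2540920 / 2533820 = 1.00280209" here.)
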
Applying the same method to the data from 5x_10times_refactor_v5.log:

> cat 5x_refactor_v5.log | head
36095 max 36005
36168 max 36005
36112 max 36005
36120 max 36005
18235 max 18227
18445 max 18227

> awk '{ sum += $1 }; END { print sum }' 5x_refactor_v5.log
2546024
> awk '{ sum += $3 }; END { print sum }' 5x_refactor_v5.log
2533820

The ratio is 1.00481644, which is about 0.002 higher than the ratio for
the export branch. I think this difference is acceptable.

Thanks,
-Geliang

>
> Thanks,
> -Geliang
>
> >
> >
> > Thanks,
> >
> > Mat
> >
> >
> > >
> > > v4:
> > > - update __mptcp_subflow_push_pending as Mat suggested.
> > > - add more patches from "BPF redundant scheduler" series.
> > >
> > > v3:
> > > - add a cleanup patch.
> > > - remove msk->last_snd in mptcp_subflow_get_send().
> > > - add the loop that calls the scheduler again in __mptcp_push_pending().
> > >
> > > v2:
> > > - add snd_burst check in dfrags loop as Mat suggested.
> > >
> > > Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
> > > remove duplicate code and support redundant scheduler more easily in
> > > __mptcp_subflow_push_pending().
> > >
> > > Geliang Tang (11):
> > >  Squash to "mptcp: add get_subflow wrappers"
> > >  mptcp: 'first' argument for subflow_push_pending
> > >  mptcp: refactor push_pending logic
> > >  mptcp: drop last_snd for burst scheduler
> > >  mptcp: simplify push_pending
> > >  mptcp: multi subflows push_pending
> > >  mptcp: use msk instead of mptcp_sk
> > >  mptcp: refactor subflow_push_pending logic
> > >  mptcp: simplify subflow_push_pending
> > >  mptcp: multi subflows subflow_push_pending
> > >  mptcp: multi subflows retrans support
> > >
> > > net/mptcp/pm.c         |   9 +-
> > > net/mptcp/pm_netlink.c |   3 -
> > > net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
> > > net/mptcp/protocol.h   |   5 +-
> > > net/mptcp/sched.c      |  61 +++++----
> > > 5 files changed, 184 insertions(+), 179 deletions(-)
> > >
> > > --
> > > 2.35.3
> > >
> > >
> > >
> >
> > --
> > Mat Martineau
> > Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-10 15:05     ` Geliang Tang
@ 2022-10-11  0:12       ` Mat Martineau
  0 siblings, 0 replies; 19+ messages in thread
From: Mat Martineau @ 2022-10-11  0:12 UTC (permalink / raw)
  To: Geliang Tang; +Cc: Geliang Tang, mptcp, Paolo Abeni

[-- Attachment #1: Type: text/plain, Size: 4064 bytes --]

On Mon, 10 Oct 2022, Geliang Tang wrote:

> On Fri, 7 Oct 2022 at 16:59, Geliang Tang <geliang.tang@suse.com> wrote:
>>
>> On Thu, Oct 06, 2022 at 05:16:55PM -0700, Mat Martineau wrote:
>>> On Thu, 6 Oct 2022, Geliang Tang wrote:
>>>
>>>> v5:
>>>> - address Mat's comments in v4.
>>>
>>> Hi Geliang -
>>>
>>> Thanks for the v5. I haven't finished looking over all the patches in detail
>>> yet, but two things I do want to reply to right now:
>>>
>>>  * Thanks for explaining in patch 4 that last_snd is still useful for round
>>> robin. I had forgotten about that, and it looked like a "write-only"
>>> variable in the kernel code.
>>>
>>>  * In the meeting today Paolo suggested that a good test for the new
>>> scheduler loop would be to modify simult_flows.sh to use much larger files,
>>> then see if the modified code slowed down any of the simult_flows tests.
>>>
>>> He suggested making the test file 10x larger in simult_flows.sh:
>>>
>>> -     size=$((2 * 2048 * 4096))
>>> +     size=$((2 * 2048 * 4096 * 10))
>>>
>>> Can you compare the test times between the export branch and this series,
>>> with both of them using the larger file size?
>>
>> 10x larger failed in my tests with timeout. So I changed it to 5x
>> larger.
>>
>> Here's the patch:
>>
>> @@ -52,7 +52,7 @@ setup()
>>         sout=$(mktemp)
>>         cout=$(mktemp)
>>         capout=$(mktemp)
>> -       size=$((2 * 2048 * 4096))
>> +       size=$((2 * 2048 * 4096 * 5))
>>
>>         dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
>>         dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
>> @@ -210,13 +210,13 @@ do_transfer()
>>         fi
>>
>>         echo " [ fail ]"
>> -       echo "client exit code $retc, server $rets" 1>&2
>> -       echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
>> -       ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
>> -       echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
>> -       ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
>> -       ls -l $sin $cout
>> -       ls -l $cin $sout
>> +       #echo "client exit code $retc, server $rets" 1>&2
>> +       #echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
>> +       #ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
>> +       #echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
>> +       #ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
>> +       #ls -l $sin $cout
>> +       #ls -l $cin $sout
>>
>>         cat "$capout"
>>         return 1
>>
>> All logs are attached. "5x_10times_export.log" is for the export branch
>> and "5x_10times_refactor_v5.log" is for this series.
>
> I compared the data in the two logs. The test time of this series is
> indeed longer than that of the export branch, but the difference is
> very small.
>
> I stripped the extraneous information from the two logs, keeping only
> the running time and the expected time. It looks like this:
>
>> cat 5x_export.log | head
> 36150 max 36005
> 36135 max 36005
> 36087 max 36005
> 36103 max 36005
> 18284 max 18227
> 18349 max 18227
>
> Adding up all the running times and expected times:
>
>> awk '{ sum += $1 }; END { print sum }' 5x_export.log
> 2540920
>> awk '{ sum += $3 }; END { print sum }' 5x_export.log
> 2533820
>
> The ratio of the total running time to the total expected time is
> 1.00280209 (2540920/2533820).
>
> Applying the same method to the data from 5x_10times_refactor_v5.log:
>
>> cat 5x_refactor_v5.log | head
> 36095 max 36005
> 36168 max 36005
> 36112 max 36005
> 36120 max 36005
> 18235 max 18227
> 18445 max 18227
>
>> awk '{ sum += $1 }; END { print sum }' 5x_refactor_v5.log
> 2546024
>> awk '{ sum += $3 }; END { print sum }' 5x_refactor_v5.log
> 2533820
>
> The ratio is 1.00481644, which is about 0.002 higher than the ratio for
> the export branch. I think this difference is acceptable.
>

Thanks for taking a look at the data, Geliang. To me the 0.2%
difference seems well within the margin of error, so there doesn't
appear to be a significant performance regression in your data.

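(As a quick sanity check -- a sketch over the stripped logs from your
previous mail -- the per-test overhead can also be printed directly:

  awk '{ printf "%+.2f%%\n", 100 * ($1 - $3) / $3 }' 5x_refactor_v5.log

and the individual values stay within a couple of percent either way.)
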
Paolo, how does this performance look to you?


--
Mat Martineau
Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic
  2022-10-06 12:17 ` [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic Geliang Tang
@ 2022-10-11  0:38   ` Mat Martineau
  0 siblings, 0 replies; 19+ messages in thread
From: Mat Martineau @ 2022-10-11  0:38 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

On Thu, 6 Oct 2022, Geliang Tang wrote:

> To support redundant packet schedulers more easily, this patch refactors
> __mptcp_push_pending() logic from:
>
> For each dfrag:
> 	While sends succeed:
> 		Call the scheduler (selects subflow and msk->snd_burst)
> 		Update subflow locks (push/release/acquire as needed)
> 		Send the dfrag data with mptcp_sendmsg_frag()
> 		Update already_sent, snd_nxt, snd_burst
> 	Update msk->first_pending
> Push/release on final subflow
>
> to:
>
> While the scheduler selects one subflow:
> 	Lock the subflow
> 	For each pending dfrag:
> 		While sends succeed:
> 			Send the dfrag data with mptcp_sendmsg_frag()
> 			Update already_sent, snd_nxt, snd_burst
> 		Update msk->first_pending
> 		Break if required by msk->snd_burst / etc
> 	Push and release the subflow
>
> This patch also moves the burst check conditions out of
> mptcp_subflow_get_send() and checks them in __mptcp_push_pending() and
> __mptcp_subflow_push_pending(), in the inner "for each pending dfrag"
> loop.
>
> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> ---
> net/mptcp/protocol.c | 86 ++++++++++++++++++++------------------------
> 1 file changed, 39 insertions(+), 47 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 84d33393d24e..bf77defbc546 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1417,14 +1417,6 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
> 	u64 linger_time;
> 	long tout = 0;
>
> -	/* re-use last subflow, if the burst allow that */
> -	if (msk->last_snd && msk->snd_burst > 0 &&
> -	    sk_stream_memory_free(msk->last_snd) &&
> -	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
> -		mptcp_set_timeout(sk);
> -		return msk->last_snd;
> -	}
> -
> 	/* pick the subflow with the lower wmem/wspace ratio */
> 	for (i = 0; i < SSK_MODE_MAX; ++i) {
> 		send_info[i].ssk = NULL;
> @@ -1530,60 +1522,53 @@ void mptcp_check_and_set_pending(struct sock *sk)
>
> void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> {
> -	struct sock *prev_ssk = NULL, *ssk = NULL;
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> 	struct mptcp_sendmsg_info info = {
> 				.flags = flags,
> 	};
> 	bool do_check_data_fin = false;
> 	struct mptcp_data_frag *dfrag;
> +	struct sock *ssk;
> 	int len;
>
> -	while ((dfrag = mptcp_send_head(sk))) {
> -		info.sent = dfrag->already_sent;
> -		info.limit = dfrag->data_len;
> -		len = dfrag->data_len - dfrag->already_sent;
> -		while (len > 0) {
> -			int ret = 0;
> -
> -			prev_ssk = ssk;
> -			ssk = mptcp_subflow_get_send(msk);
> -
> -			/* First check. If the ssk has changed since
> -			 * the last round, release prev_ssk
> -			 */
> -			if (ssk != prev_ssk && prev_ssk)
> -				mptcp_push_release(prev_ssk, &info);
> -			if (!ssk)
> -				goto out;
> +again:
> +	while (mptcp_send_head(sk) && (ssk = mptcp_subflow_get_send(msk))) {
> +		lock_sock(ssk);
>
> -			/* Need to lock the new subflow only if different
> -			 * from the previous one, otherwise we are still
> -			 * helding the relevant lock
> -			 */
> -			if (ssk != prev_ssk)
> -				lock_sock(ssk);
> +		while ((dfrag = mptcp_send_head(sk))) {
> +			info.sent = dfrag->already_sent;
> +			info.limit = dfrag->data_len;
> +			len = dfrag->data_len - dfrag->already_sent;
> +			while (len > 0) {
> +				int ret = 0;
> +
> +				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
> +				if (ret <= 0) {
> +					if (ret == -EAGAIN)
> +						goto again;

ssk is still locked here, so the lock_sock(ssk) at the top of the outer
loop could deadlock, or leave this ssk locked when
mptcp_subflow_get_send() selects a different subflow.
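
Something along these lines would keep the locking balanced (just a
sketch reusing the names from this patch, not necessarily how the final
fix should look):

	ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
	if (ret <= 0) {
		/* Release the subflow before leaving the inner loops,
		 * so that the lock_sock(ssk) after the "again" label
		 * never runs with this ssk still locked.
		 */
		mptcp_push_release(ssk, &info);
		if (ret == -EAGAIN)
			goto again;
		goto out;
	}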


- Mat

> +					mptcp_push_release(ssk, &info);
> +					goto out;
> +				}
> +
> +				do_check_data_fin = true;
> +				info.sent += ret;
> +				len -= ret;
> +
> +				mptcp_update_post_push(msk, dfrag, ret);
> +			}
> +			WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
>
> -			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
> -			if (ret <= 0) {
> -				if (ret == -EAGAIN)
> -					continue;
> +			if (msk->snd_burst <= 0 ||
> +			    !sk_stream_memory_free(ssk) ||
> +			    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
> 				mptcp_push_release(ssk, &info);
> -				goto out;
> +				goto again;
> 			}
> -
> -			do_check_data_fin = true;
> -			info.sent += ret;
> -			len -= ret;
> -
> -			mptcp_update_post_push(msk, dfrag, ret);
> +			mptcp_set_timeout(sk);
> 		}
> -		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> -	}
>
> -	/* at this point we held the socket lock for the last subflow we used */
> -	if (ssk)
> 		mptcp_push_release(ssk, &info);
> +	}
>
> out:
> 	/* ensure the rtx timer is running */
> @@ -1636,6 +1621,13 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk,
> 			mptcp_update_post_push(msk, dfrag, ret);
> 		}
> 		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> +
> +		if (msk->snd_burst <= 0 ||
> +		    !sk_stream_memory_free(ssk) ||
> +		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
> +			goto out;
> +		}
> +		mptcp_set_timeout(sk);
> 	}
>
> out:
> -- 
> 2.35.3
>
>
>

--
Mat Martineau
Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH mptcp-next v5 00/11] refactor push pending
  2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
                   ` (11 preceding siblings ...)
  2022-10-07  0:16 ` [PATCH mptcp-next v5 00/11] refactor push pending Mat Martineau
@ 2022-10-11  1:01 ` Mat Martineau
  12 siblings, 0 replies; 19+ messages in thread
From: Mat Martineau @ 2022-10-11  1:01 UTC (permalink / raw)
  To: Geliang Tang; +Cc: mptcp

On Thu, 6 Oct 2022, Geliang Tang wrote:

> v5:
> - address Mat's comments in v4.
>

Hi Geliang -

I do think these changes are getting closer to being ready to merge.
There is still the bisectability issue I mention in patch 3 to address,
plus the feedback on the squash-to patches.

- Mat

> v4:
> - update __mptcp_subflow_push_pending as Mat suggested.
> - add more patches from "BPF redundant scheduler" series.
>
> v3:
> - add a cleanup patch.
> - remove msk->last_snd in mptcp_subflow_get_send().
> - add the loop that calls the scheduler again in __mptcp_push_pending().
>
> v2:
> - add snd_burst check in dfrags loop as Mat suggested.
>
> Refactor __mptcp_push_pending() and __mptcp_subflow_push_pending() to
> remove duplicate code and support redundant scheduler more easily in
> __mptcp_subflow_push_pending().
>
> Geliang Tang (11):
>  Squash to "mptcp: add get_subflow wrappers"
>  mptcp: 'first' argument for subflow_push_pending
>  mptcp: refactor push_pending logic
>  mptcp: drop last_snd for burst scheduler
>  mptcp: simplify push_pending
>  mptcp: multi subflows push_pending
>  mptcp: use msk instead of mptcp_sk
>  mptcp: refactor subflow_push_pending logic
>  mptcp: simplify subflow_push_pending
>  mptcp: multi subflows subflow_push_pending
>  mptcp: multi subflows retrans support
>
> net/mptcp/pm.c         |   9 +-
> net/mptcp/pm_netlink.c |   3 -
> net/mptcp/protocol.c   | 285 ++++++++++++++++++++++-------------------
> net/mptcp/protocol.h   |   5 +-
> net/mptcp/sched.c      |  61 +++++----
> 5 files changed, 184 insertions(+), 179 deletions(-)
>
> -- 
> 2.35.3
>
>
>

--
Mat Martineau
Intel

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2022-10-11  1:01 UTC | newest]

Thread overview: 19+ messages
2022-10-06 12:17 [PATCH mptcp-next v5 00/11] refactor push pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 01/11] Squash to "mptcp: add get_subflow wrappers" Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 02/11] mptcp: 'first' argument for subflow_push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 03/11] mptcp: refactor push_pending logic Geliang Tang
2022-10-11  0:38   ` Mat Martineau
2022-10-06 12:17 ` [PATCH mptcp-next v5 04/11] mptcp: drop last_snd for burst scheduler Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 05/11] mptcp: simplify push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 06/11] mptcp: multi subflows push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 07/11] mptcp: use msk instead of mptcp_sk Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 08/11] mptcp: refactor subflow_push_pending logic Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 09/11] mptcp: simplify subflow_push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 10/11] mptcp: multi subflows subflow_push_pending Geliang Tang
2022-10-06 12:17 ` [PATCH mptcp-next v5 11/11] mptcp: multi subflows retrans support Geliang Tang
2022-10-06 17:53   ` mptcp: multi subflows retrans support: Tests Results MPTCP CI
2022-10-07  0:16 ` [PATCH mptcp-next v5 00/11] refactor push pending Mat Martineau
2022-10-07  9:00   ` Geliang Tang
2022-10-10 15:05     ` Geliang Tang
2022-10-11  0:12       ` Mat Martineau
2022-10-11  1:01 ` Mat Martineau
