* [PATCH mptcp-next v16 1/8] mptcp: add struct mptcp_sched_ops
2022-04-27 1:56 [PATCH mptcp-next v16 0/8] BPF packet scheduler Geliang Tang
@ 2022-04-27 1:56 ` Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 2/8] mptcp: register default scheduler Geliang Tang
` (6 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 1:56 UTC (permalink / raw)
To: mptcp; +Cc: Geliang Tang
This patch defines struct mptcp_sched_ops, which has three struct members,
name, owner and list, and three function pointers, init, release and
get_subflow.
Add the scheduler registering, unregistering and finding functions to add,
delete and find a packet scheduler on the global list mptcp_sched_list.
For supporting a "redundant" packet scheduler in the future, this patch
adds a flag in struct mptcp_sched_data named call_again to indicate that
get_subflow() function needs to be called again.
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
include/net/mptcp.h | 19 +++++++
net/mptcp/Makefile | 2 +-
net/mptcp/protocol.h | 3 +
net/mptcp/sched.c | 56 +++++++++++++++++++
.../testing/selftests/bpf/bpf_mptcp_helpers.h | 18 ++++++
5 files changed, 97 insertions(+), 1 deletion(-)
create mode 100644 net/mptcp/sched.c
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 4d761ad530c9..dd4ee7a77567 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -95,6 +95,25 @@ struct mptcp_out_options {
#endif
};
+#define MPTCP_SCHED_NAME_MAX 16
+
+struct mptcp_sched_data {
+ struct sock *sock;
+ bool call_again;
+};
+
+struct mptcp_sched_ops {
+ void (*get_subflow)(struct mptcp_sock *msk, bool reinject,
+ struct mptcp_sched_data *data);
+
+ char name[MPTCP_SCHED_NAME_MAX];
+ struct module *owner;
+ struct list_head list;
+
+ void (*init)(struct mptcp_sock *msk);
+ void (*release)(struct mptcp_sock *msk);
+} ____cacheline_aligned_in_smp;
+
#ifdef CONFIG_MPTCP
extern struct request_sock_ops mptcp_subflow_request_sock_ops;
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index 6e7df47c9584..8a7f68efa35f 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
- mib.o pm_netlink.o sockopt.o pm_userspace.o
+ mib.o pm_netlink.o sockopt.o pm_userspace.o sched.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 4672901d0dfe..221bb6b9860e 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -621,6 +621,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct sockaddr_storage *addr,
unsigned short family);
+struct mptcp_sched_ops *mptcp_sched_find(const char *name);
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
new file mode 100644
index 000000000000..c5d3bbafba71
--- /dev/null
+++ b/net/mptcp/sched.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, SUSE.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include "protocol.h"
+
+static DEFINE_SPINLOCK(mptcp_sched_list_lock);
+static LIST_HEAD(mptcp_sched_list);
+
+/* Must be called with rcu read lock held */
+struct mptcp_sched_ops *mptcp_sched_find(const char *name)
+{
+ struct mptcp_sched_ops *sched, *ret = NULL;
+
+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+ if (!strcmp(sched->name, name)) {
+ ret = sched;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (!sched->get_subflow)
+ return -EINVAL;
+
+ spin_lock(&mptcp_sched_list_lock);
+ if (mptcp_sched_find(sched->name)) {
+ spin_unlock(&mptcp_sched_list_lock);
+ return -EEXIST;
+ }
+ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
+ spin_unlock(&mptcp_sched_list_lock);
+
+ pr_debug("%s registered", sched->name);
+ return 0;
+}
+
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
+{
+ spin_lock(&mptcp_sched_list_lock);
+ list_del_rcu(&sched->list);
+ spin_unlock(&mptcp_sched_list_lock);
+}
diff --git a/tools/testing/selftests/bpf/bpf_mptcp_helpers.h b/tools/testing/selftests/bpf/bpf_mptcp_helpers.h
index b5a43b108982..de20fbdb6d98 100644
--- a/tools/testing/selftests/bpf/bpf_mptcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_mptcp_helpers.h
@@ -6,6 +6,24 @@
#include "bpf_tcp_helpers.h"
+#define MPTCP_SCHED_NAME_MAX 16
+
+struct mptcp_sched_data {
+ struct sock *sock;
+ bool call_again;
+};
+
+struct mptcp_sched_ops {
+ char name[MPTCP_SCHED_NAME_MAX];
+
+ void (*init)(struct mptcp_sock *msk);
+ void (*release)(struct mptcp_sock *msk);
+
+ void (*get_subflow)(struct mptcp_sock *msk, bool reinject,
+ struct mptcp_sched_data *data);
+ void *owner;
+};
+
struct mptcp_sock {
struct inet_connection_sock sk;
--
2.34.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH mptcp-next v16 2/8] mptcp: register default scheduler
2022-04-27 1:56 [PATCH mptcp-next v16 0/8] BPF packet scheduler Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 1/8] mptcp: add struct mptcp_sched_ops Geliang Tang
@ 2022-04-27 1:56 ` Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 3/8] mptcp: add a new sysctl scheduler Geliang Tang
` (5 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 1:56 UTC (permalink / raw)
To: mptcp; +Cc: Geliang Tang
This patch defines the default packet scheduler mptcp_sched_default,
register it in mptcp_sched_init(), which is invoked in mptcp_proto_init().
Skip deleting this default scheduler in mptcp_unregister_scheduler().
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
net/mptcp/protocol.c | 9 +++++++++
net/mptcp/protocol.h | 3 +++
net/mptcp/sched.c | 14 ++++++++++++++
3 files changed, 26 insertions(+)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 653757ea0aca..bf9d5bd8dfa6 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2259,6 +2259,14 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
return min_stale_count > 1 ? backup : NULL;
}
+void mptcp_get_subflow_default(struct mptcp_sock *msk, bool reinject,
+ struct mptcp_sched_data *data)
+{
+ data->sock = reinject ? mptcp_subflow_get_retrans(msk) :
+ mptcp_subflow_get_send(msk);
+ data->call_again = 0;
+}
+
static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
{
if (msk->subflow) {
@@ -3813,6 +3821,7 @@ void __init mptcp_proto_init(void)
mptcp_subflow_init();
mptcp_pm_init();
+ mptcp_sched_init();
mptcp_token_init();
if (proto_register(&mptcp_prot, MPTCP_USE_SLAB) != 0)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 221bb6b9860e..65a765c8d2be 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -624,6 +624,9 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct mptcp_sched_ops *mptcp_sched_find(const char *name);
int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_get_subflow_default(struct mptcp_sock *msk, bool reinject,
+ struct mptcp_sched_data *data);
+void mptcp_sched_init(void);
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index c5d3bbafba71..bd0beff8cac8 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -13,6 +13,12 @@
#include <linux/spinlock.h>
#include "protocol.h"
+static struct mptcp_sched_ops mptcp_sched_default = {
+ .get_subflow = mptcp_get_subflow_default,
+ .name = "default",
+ .owner = THIS_MODULE,
+};
+
static DEFINE_SPINLOCK(mptcp_sched_list_lock);
static LIST_HEAD(mptcp_sched_list);
@@ -50,7 +56,15 @@ int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
{
+ if (sched == &mptcp_sched_default)
+ return;
+
spin_lock(&mptcp_sched_list_lock);
list_del_rcu(&sched->list);
spin_unlock(&mptcp_sched_list_lock);
}
+
+void mptcp_sched_init(void)
+{
+ mptcp_register_scheduler(&mptcp_sched_default);
+}
--
2.34.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH mptcp-next v16 3/8] mptcp: add a new sysctl scheduler
2022-04-27 1:56 [PATCH mptcp-next v16 0/8] BPF packet scheduler Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 1/8] mptcp: add struct mptcp_sched_ops Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 2/8] mptcp: register default scheduler Geliang Tang
@ 2022-04-27 1:56 ` Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 4/8] mptcp: add sched in mptcp_sock Geliang Tang
` (4 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 1:56 UTC (permalink / raw)
To: mptcp; +Cc: Geliang Tang
This patch adds a new sysctl, named scheduler, to support selection
of different schedulers. Export mptcp_get_scheduler helper to get this
sysctl.
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
Documentation/networking/mptcp-sysctl.rst | 8 ++++++++
net/mptcp/ctrl.c | 14 ++++++++++++++
net/mptcp/protocol.h | 1 +
3 files changed, 23 insertions(+)
diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst
index e263dfcc4b40..d9e69fdc7ea3 100644
--- a/Documentation/networking/mptcp-sysctl.rst
+++ b/Documentation/networking/mptcp-sysctl.rst
@@ -75,3 +75,11 @@ stale_loss_cnt - INTEGER
This is a per-namespace sysctl.
Default: 4
+
+scheduler - STRING
+ Select the scheduler of your choice.
+
+ Support for selection of different schedulers. This is a per-namespace
+ sysctl.
+
+ Default: "default"
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index ae20b7d92e28..c46c22a84d23 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -32,6 +32,7 @@ struct mptcp_pernet {
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
u8 pm_type;
+ char scheduler[MPTCP_SCHED_NAME_MAX];
};
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
@@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net)
return mptcp_get_pernet(net)->pm_type;
}
+const char *mptcp_get_scheduler(const struct net *net)
+{
+ return mptcp_get_pernet(net)->scheduler;
+}
+
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
@@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
+ strcpy(pernet->scheduler, "default");
}
#ifdef CONFIG_SYSCTL
@@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &mptcp_pm_type_max
},
+ {
+ .procname = "scheduler",
+ .maxlen = MPTCP_SCHED_NAME_MAX,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
{}
};
@@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type;
+ table[6].data = &pernet->scheduler;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 65a765c8d2be..4842a28f34c2 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -596,6 +596,7 @@ int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
+const char *mptcp_get_scheduler(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
--
2.34.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH mptcp-next v16 4/8] mptcp: add sched in mptcp_sock
2022-04-27 1:56 [PATCH mptcp-next v16 0/8] BPF packet scheduler Geliang Tang
` (2 preceding siblings ...)
2022-04-27 1:56 ` [PATCH mptcp-next v16 3/8] mptcp: add a new sysctl scheduler Geliang Tang
@ 2022-04-27 1:56 ` Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 5/8] mptcp: add get_subflow wrapper Geliang Tang
` (3 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 1:56 UTC (permalink / raw)
To: mptcp; +Cc: Geliang Tang
This patch adds a new struct member sched in struct mptcp_sock.
And two helpers mptcp_init_sched() and mptcp_release_sched() to
init and release it.
Init it with the sysctl scheduler in mptcp_init_sock(), copy the
scheduler from the parent in mptcp_sk_clone(), and release it in
__mptcp_destroy_sock().
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
net/mptcp/protocol.c | 7 ++++
net/mptcp/protocol.h | 4 +++
net/mptcp/sched.c | 34 +++++++++++++++++++
.../testing/selftests/bpf/bpf_mptcp_helpers.h | 1 +
4 files changed, 46 insertions(+)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index bf9d5bd8dfa6..7590e2d29f39 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2673,6 +2673,11 @@ static int mptcp_init_sock(struct sock *sk)
if (ret)
return ret;
+ ret = mptcp_init_sched(mptcp_sk(sk),
+ mptcp_sched_find(mptcp_get_scheduler(net)));
+ if (ret)
+ return ret;
+
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
* propagate the correct value
*/
@@ -2832,6 +2837,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
sk_stop_timer(sk, &sk->sk_timer);
mptcp_data_unlock(sk);
msk->pm.status = 0;
+ mptcp_release_sched(msk);
/* clears msk->subflow, allowing the following loop to close
* even the initial subflow
@@ -3009,6 +3015,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->snd_una = msk->write_seq;
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
+ mptcp_init_sched(msk, mptcp_sk(sk)->sched);
if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) {
msk->can_ack = true;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 4842a28f34c2..22f3f41e1e32 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -298,6 +298,7 @@ struct mptcp_sock {
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
struct mptcp_pm_data pm;
+ struct mptcp_sched_ops *sched;
struct {
u32 space; /* bytes copied in last measurement window */
u32 copied; /* bytes copied in this measurement window */
@@ -628,6 +629,9 @@ void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
void mptcp_get_subflow_default(struct mptcp_sock *msk, bool reinject,
struct mptcp_sched_data *data);
void mptcp_sched_init(void);
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched);
+void mptcp_release_sched(struct mptcp_sock *msk);
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index bd0beff8cac8..8025dc51fbe9 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -68,3 +68,37 @@ void mptcp_sched_init(void)
{
mptcp_register_scheduler(&mptcp_sched_default);
}
+
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched)
+{
+ struct mptcp_sched_ops *sched_init = &mptcp_sched_default;
+
+ if (sched)
+ sched_init = sched;
+
+ if (!bpf_try_module_get(sched_init, sched_init->owner))
+ return -EBUSY;
+
+ msk->sched = sched_init;
+ if (msk->sched->init)
+ msk->sched->init(msk);
+
+ pr_debug("sched=%s", msk->sched->name);
+
+ return 0;
+}
+
+void mptcp_release_sched(struct mptcp_sock *msk)
+{
+ struct mptcp_sched_ops *sched = msk->sched;
+
+ if (!sched)
+ return;
+
+ msk->sched = NULL;
+ if (sched->release)
+ sched->release(msk);
+
+ bpf_module_put(sched, sched->owner);
+}
diff --git a/tools/testing/selftests/bpf/bpf_mptcp_helpers.h b/tools/testing/selftests/bpf/bpf_mptcp_helpers.h
index de20fbdb6d98..a0b83fbe8133 100644
--- a/tools/testing/selftests/bpf/bpf_mptcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_mptcp_helpers.h
@@ -29,6 +29,7 @@ struct mptcp_sock {
__u32 token;
struct sock *first;
+ struct mptcp_sched_ops *sched;
char ca_name[TCP_CA_NAME_MAX];
} __attribute__((preserve_access_index));
--
2.34.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH mptcp-next v16 5/8] mptcp: add get_subflow wrapper
2022-04-27 1:56 [PATCH mptcp-next v16 0/8] BPF packet scheduler Geliang Tang
` (3 preceding siblings ...)
2022-04-27 1:56 ` [PATCH mptcp-next v16 4/8] mptcp: add sched in mptcp_sock Geliang Tang
@ 2022-04-27 1:56 ` Geliang Tang
2022-04-27 8:27 ` Paolo Abeni
2022-04-27 1:56 ` [PATCH mptcp-next v16 6/8] mptcp: add bpf_mptcp_sched_ops Geliang Tang
` (2 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 1:56 UTC (permalink / raw)
To: mptcp; +Cc: Geliang Tang
This patch defines a new wrapper mptcp_sched_get_subflow(), invoke
get_subflow() of msk->sched in it. Use the wrapper instead of using
mptcp_subflow_get_send() directly.
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
net/mptcp/protocol.c | 9 ++++-----
net/mptcp/protocol.h | 16 ++++++++++++++++
2 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 7590e2d29f39..c6e963848b18 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1515,7 +1515,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
READ_ONCE(ssk->sk_pacing_rate) * burst,
burst + wmem);
- msk->last_snd = ssk;
msk->snd_burst = burst;
return ssk;
}
@@ -1575,7 +1574,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
int ret = 0;
prev_ssk = ssk;
- ssk = mptcp_subflow_get_send(msk);
+ ssk = mptcp_sched_get_subflow(msk, false);
/* First check. If the ssk has changed since
* the last round, release prev_ssk
@@ -1644,7 +1643,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
* check for a different subflow usage only after
* spooling the first chunk of data
*/
- xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
+ xmit_ssk = first ? ssk : mptcp_sched_get_subflow(mptcp_sk(sk), false);
if (!xmit_ssk)
goto out;
if (xmit_ssk != ssk) {
@@ -2489,7 +2488,7 @@ static void __mptcp_retrans(struct sock *sk)
mptcp_clean_una_wakeup(sk);
/* first check ssk: need to kick "stale" logic */
- ssk = mptcp_subflow_get_retrans(msk);
+ ssk = mptcp_sched_get_subflow(msk, true);
dfrag = mptcp_rtx_head(sk);
if (!dfrag) {
if (mptcp_data_fin_enabled(msk)) {
@@ -3154,7 +3153,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
return;
if (!sock_owned_by_user(sk)) {
- struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
+ struct sock *xmit_ssk = mptcp_sched_get_subflow(mptcp_sk(sk), false);
if (xmit_ssk == ssk)
__mptcp_subflow_push_pending(sk, ssk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 22f3f41e1e32..91512fc25128 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -633,6 +633,22 @@ int mptcp_init_sched(struct mptcp_sock *msk,
struct mptcp_sched_ops *sched);
void mptcp_release_sched(struct mptcp_sock *msk);
+static inline struct sock *mptcp_sched_get_subflow(struct mptcp_sock *msk, bool reinject)
+{
+ struct mptcp_sched_data data = {
+ .sock = msk->first,
+ .call_again = 0,
+ };
+
+ msk->sched ? INDIRECT_CALL_INET_1(msk->sched->get_subflow,
+ mptcp_get_subflow_default,
+ msk, reinject, &data) :
+ mptcp_get_subflow_default(msk, reinject, &data);
+
+ msk->last_snd = data.sock;
+ return data.sock;
+}
+
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
--
2.34.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH mptcp-next v16 5/8] mptcp: add get_subflow wrapper
2022-04-27 1:56 ` [PATCH mptcp-next v16 5/8] mptcp: add get_subflow wrapper Geliang Tang
@ 2022-04-27 8:27 ` Paolo Abeni
2022-04-27 8:56 ` Paolo Abeni
2022-04-27 14:34 ` Geliang Tang
0 siblings, 2 replies; 15+ messages in thread
From: Paolo Abeni @ 2022-04-27 8:27 UTC (permalink / raw)
To: Geliang Tang, mptcp
Hello,
First of all I'm sorry for the very late feedback: I had some
difficulties to allocate time to follow this development.
On Wed, 2022-04-27 at 09:56 +0800, Geliang Tang wrote:
> This patch defines a new wrapper mptcp_sched_get_subflow(), invoke
> get_subflow() of msk->sched in it. Use the wrapper instead of using
> mptcp_subflow_get_send() directly.
>
> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> ---
> net/mptcp/protocol.c | 9 ++++-----
> net/mptcp/protocol.h | 16 ++++++++++++++++
> 2 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 7590e2d29f39..c6e963848b18 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1515,7 +1515,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
> subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
> READ_ONCE(ssk->sk_pacing_rate) * burst,
> burst + wmem);
> - msk->last_snd = ssk;
> msk->snd_burst = burst;
> return ssk;
> }
This breaks "mptcp: really share subflow snd_wnd": When the MPTCP-level
cwin is 0, we want to do 0-window probe (so mptcp_subflow_get_send()
return a 'valid' subflow) but we don't want to start a new burst for
such 0 win probe (so mptcp_subflow_get_send() clears last_snd).
With this change, when MPTCP-level cwin is 0, 'last_send' is not
cleared anymore.
A simple solution would be:
---
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 653757ea0aca..c20f5fd04cad 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1507,7 +1507,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
wmem = READ_ONCE(ssk->sk_wmem_queued);
if (!burst) {
- msk->last_snd = NULL;
+ msk->snd_burst = 0;
return ssk;
}
---
Probably a better solution would be adding 'snd_burst' to struct
mptcp_sched_data, and let mptcp_sched_get_subflow() update msk-
>snd_burst as specified by the scheduler. With this latter change the
'msk' argument in mptcp_sched_ops->schedule() could be const.
> @@ -1575,7 +1574,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> int ret = 0;
>
> prev_ssk = ssk;
> - ssk = mptcp_subflow_get_send(msk);
> + ssk = mptcp_sched_get_subflow(msk, false);
>
> /* First check. If the ssk has changed since
> * the last round, release prev_ssk
> @@ -1644,7 +1643,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
> * check for a different subflow usage only after
> * spooling the first chunk of data
> */
> - xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
> + xmit_ssk = first ? ssk : mptcp_sched_get_subflow(mptcp_sk(sk), false);
> if (!xmit_ssk)
> goto out;
> if (xmit_ssk != ssk) {
> @@ -2489,7 +2488,7 @@ static void __mptcp_retrans(struct sock *sk)
> mptcp_clean_una_wakeup(sk);
>
> /* first check ssk: need to kick "stale" logic */
> - ssk = mptcp_subflow_get_retrans(msk);
> + ssk = mptcp_sched_get_subflow(msk, true);
> dfrag = mptcp_rtx_head(sk);
> if (!dfrag) {
> if (mptcp_data_fin_enabled(msk)) {
> @@ -3154,7 +3153,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
> return;
>
> if (!sock_owned_by_user(sk)) {
> - struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
> + struct sock *xmit_ssk = mptcp_sched_get_subflow(mptcp_sk(sk), false);
>
> if (xmit_ssk == ssk)
> __mptcp_subflow_push_pending(sk, ssk);
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 22f3f41e1e32..91512fc25128 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -633,6 +633,22 @@ int mptcp_init_sched(struct mptcp_sock *msk,
> struct mptcp_sched_ops *sched);
> void mptcp_release_sched(struct mptcp_sock *msk);
>
> +static inline struct sock *mptcp_sched_get_subflow(struct mptcp_sock *msk, bool reinject)
> +{
> + struct mptcp_sched_data data = {
> + .sock = msk->first,
> + .call_again = 0,
> + };
It's not clear to me:
- why we need the 'sock' argument, since the scheduler already get
'msk' as argument?
- why we need to initialize it (looks like an 'output' argument ?!?)
- what is the goal/role of the 'call_again' argument. It looks like we
just ignore it?!?
Side note: perhaps we should move the following chunk:
---
sock_owned_by_me(sk);
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
return sk_stream_memory_free(msk->first) ? msk->first : NULL;
}
---
out of mptcp_subflow_get_send() into mptcp_sched_get_subflow(), so
that every scheduler will not have to deal with fallback sockets.
> +
> + msk->sched ? INDIRECT_CALL_INET_1(msk->sched->get_subflow,
> + mptcp_get_subflow_default,
> + msk, reinject, &data) :
> + mptcp_get_subflow_default(msk, reinject, &data);
With this we have quite a lot of conditionals for the default
scheduler. I think we can drop the INDIRECT_CALL_INET_1() wrapper, and
rework the code so that msk->sched is always NULL with the default
scheduler.
And sorry to bring the next topic so late, but why a 'reinject'
argument instead of an additional mptcp_sched_ops op? the latter option
will avoid another branch in fast-path.
Thanks!
Paolo
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH mptcp-next v16 5/8] mptcp: add get_subflow wrapper
2022-04-27 8:27 ` Paolo Abeni
@ 2022-04-27 8:56 ` Paolo Abeni
2022-04-27 14:34 ` Geliang Tang
1 sibling, 0 replies; 15+ messages in thread
From: Paolo Abeni @ 2022-04-27 8:56 UTC (permalink / raw)
To: Geliang Tang, mptcp
On Wed, 2022-04-27 at 10:27 +0200, Paolo Abeni wrote:
> > +
> > + msk->sched ? INDIRECT_CALL_INET_1(msk->sched->get_subflow,
> > + mptcp_get_subflow_default,
> > + msk, reinject, &data) :
> > + mptcp_get_subflow_default(msk, reinject, &data);
>
> With this we have quite a lot of conditionals for the default
> scheduler. I think we can drop the INDIRECT_CALL_INET_1() wrapper, and
> rework the code so that msk->sched is always NULL with the default
> scheduler.
Even better, we could drop the mptcp_get_subflow_default() wrapper and
call directly the current scheduler when msk->sched is NULL, even
before filling the 'data' struct. That should fit nicely especially if
we use 2 separate ops for plain xmit and retrans.
Overall that would lead to something alike:
---
static inline struct sock *mptcp_sched_get_send(struct mptcp_sock *msk)
{
struct mptcp_sched_data data;
sock_owned_by_me(sk);
/* the following check is moved out of mptcp_subflow_get_send */
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
return sk_stream_memory_free(msk->first) ? msk->first : NULL;
}
if (!msk->sched)
return mptcp_subflow_get_send(msk);
data.sock = msk->first; /* if needed */
data.call_again = 0; /* if needed */
msk->sched->get_send(msk, &data);
msk->last_snd = data.sock;
return data.sock;
}
---
plus something similar for retrans.
Thanks!
Paolo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH mptcp-next v16 5/8] mptcp: add get_subflow wrapper
2022-04-27 8:27 ` Paolo Abeni
2022-04-27 8:56 ` Paolo Abeni
@ 2022-04-27 14:34 ` Geliang Tang
2022-04-27 16:53 ` Matthieu Baerts
1 sibling, 1 reply; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 14:34 UTC (permalink / raw)
To: Paolo Abeni; +Cc: Geliang Tang, MPTCP Upstream
Hi Paolo,
Thanks for your review!
Paolo Abeni <pabeni@redhat.com> 于2022年4月27日周三 16:27写道:
>
> Hello,
>
> First of all I'm sorry for the very late feedback: I had some
> difficulties to allocate time to follow this development.
>
> On Wed, 2022-04-27 at 09:56 +0800, Geliang Tang wrote:
> > This patch defines a new wrapper mptcp_sched_get_subflow(), invoke
> > get_subflow() of msk->sched in it. Use the wrapper instead of using
> > mptcp_subflow_get_send() directly.
> >
> > Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> > ---
> > net/mptcp/protocol.c | 9 ++++-----
> > net/mptcp/protocol.h | 16 ++++++++++++++++
> > 2 files changed, 20 insertions(+), 5 deletions(-)
> >
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 7590e2d29f39..c6e963848b18 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -1515,7 +1515,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
> > subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
> > READ_ONCE(ssk->sk_pacing_rate) * burst,
> > burst + wmem);
> > - msk->last_snd = ssk;
> > msk->snd_burst = burst;
> > return ssk;
> > }
>
> This breaks "mptcp: really share subflow snd_wnd": When the MPTCP-level
> cwin is 0, we want to do 0-window probe (so mptcp_subflow_get_send()
> return a 'valid' subflow) but we don't want to start a new burst for
> such 0 win probe (so mptcp_subflow_get_send() clears last_snd).
>
> With this change, when MPTCP-level cwin is 0, 'last_send' is not
> cleared anymore.
>
> A simple solution would be:
>
> ---
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 653757ea0aca..c20f5fd04cad 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1507,7 +1507,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
> burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
> wmem = READ_ONCE(ssk->sk_wmem_queued);
> if (!burst) {
> - msk->last_snd = NULL;
> + msk->snd_burst = 0;
> return ssk;
> }
> ---
I think it's better to add back "msk->last_snd = ssk;" in
mptcp_subflow_get_send(), keep the original mptcp_subflow_get_send()
unchanged. And set msk->last_snd in the get_subflow() of the
scheduler, the same as v12:
https://patchwork.kernel.org/project/mptcp/patch/396f8edcbf8e4f65a1d76c228dc3d12aa4dd2f11.1650430389.git.geliang.tang@suse.com/
>
> Probably a better solution would be adding 'snd_burst' to struct
> mptcp_sched_data, and let mptcp_sched_get_subflow() update msk-
> >snd_burst as specified by the scheduler. With this latter change the
> 'msk' argument in mptcp_sched_ops->schedule() could be const.
>
> > @@ -1575,7 +1574,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> > int ret = 0;
> >
> > prev_ssk = ssk;
> > - ssk = mptcp_subflow_get_send(msk);
> > + ssk = mptcp_sched_get_subflow(msk, false);
> >
> > /* First check. If the ssk has changed since
> > * the last round, release prev_ssk
> > @@ -1644,7 +1643,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
> > * check for a different subflow usage only after
> > * spooling the first chunk of data
> > */
> > - xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
> > + xmit_ssk = first ? ssk : mptcp_sched_get_subflow(mptcp_sk(sk), false);
> > if (!xmit_ssk)
> > goto out;
> > if (xmit_ssk != ssk) {
> > @@ -2489,7 +2488,7 @@ static void __mptcp_retrans(struct sock *sk)
> > mptcp_clean_una_wakeup(sk);
> >
> > /* first check ssk: need to kick "stale" logic */
> > - ssk = mptcp_subflow_get_retrans(msk);
> > + ssk = mptcp_sched_get_subflow(msk, true);
> > dfrag = mptcp_rtx_head(sk);
> > if (!dfrag) {
> > if (mptcp_data_fin_enabled(msk)) {
> > @@ -3154,7 +3153,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
> > return;
> >
> > if (!sock_owned_by_user(sk)) {
> > - struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
> > + struct sock *xmit_ssk = mptcp_sched_get_subflow(mptcp_sk(sk), false);
> >
> > if (xmit_ssk == ssk)
> > __mptcp_subflow_push_pending(sk, ssk);
> > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> > index 22f3f41e1e32..91512fc25128 100644
> > --- a/net/mptcp/protocol.h
> > +++ b/net/mptcp/protocol.h
> > @@ -633,6 +633,22 @@ int mptcp_init_sched(struct mptcp_sock *msk,
> > struct mptcp_sched_ops *sched);
> > void mptcp_release_sched(struct mptcp_sock *msk);
> >
> > +static inline struct sock *mptcp_sched_get_subflow(struct mptcp_sock *msk, bool reinject)
> > +{
> > + struct mptcp_sched_data data = {
> > + .sock = msk->first,
> > + .call_again = 0,
> > + };
>
> It's not clear to me:
> - why we need the 'sock' argument, since the scheduler already get
> 'msk' as argument?
sock is the selected subflow to return in the scheduler.
> - why we need to initialize it (looks like an 'output' argument ?!?)
Yes, it's an output argument. No need to init it.
> - what is the goal/role of the 'call_again' argument. It looks like we
> just ignore it?!?
call_again is for supporting a "redundant" packet scheduler in the
future, to indicate that the get_subflow() function needs to be called again.
>
> Side note: perhaps we should move the following chunk:
>
> ---
> sock_owned_by_me(sk);
>
> if (__mptcp_check_fallback(msk)) {
> if (!msk->first)
> return NULL;
> return sk_stream_memory_free(msk->first) ? msk->first : NULL;
> }
> ---
>
> out of mptcp_subflow_get_send() into mptcp_sched_get_subflow(), so
> that every scheduler will not have to deal with fallback sockets.
>
> > +
> > + msk->sched ? INDIRECT_CALL_INET_1(msk->sched->get_subflow,
> > + mptcp_get_subflow_default,
> > + msk, reinject, &data) :
> > + mptcp_get_subflow_default(msk, reinject, &data);
>
> With this we have quite a lot of conditionals for the default
> scheduler. I think we can drop the INDIRECT_CALL_INET_1() wrapper, and
> rework the code so that msk->sched is always NULL with the default
> scheduler.
So I need to drop patch 2 (2/8 mptcp: register default scheduler) in
this series, right?
>
> And sorry to bring the next topic so late, but why a 'reinject'
> argument instead of an additional mptcp_sched_ops op? the latter option
> will avoid another branch in fast-path.
How about keeping the 'reinject' argument like this:
----
static inline struct sock *mptcp_sched_get_send(struct mptcp_sock *msk)
{
struct mptcp_sched_data data;
sock_owned_by_me((struct sock *)msk);
/* the following check is moved out of mptcp_subflow_get_send */
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
return sk_stream_memory_free(msk->first) ? msk->first : NULL;
}
if (!msk->sched)
return mptcp_subflow_get_send(msk);
msk->sched->get_subflow(msk, false, &data);
return data.sock;
}
static inline struct sock *mptcp_sched_get_retrans(struct mptcp_sock *msk)
{
struct mptcp_sched_data data;
sock_owned_by_me((const struct sock *)msk);
if (__mptcp_check_fallback(msk))
return NULL;
if (!msk->sched)
return mptcp_subflow_get_retrans(msk);
msk->sched->get_subflow(msk, true, &data);
return data.sock;
}
-----
Thanks,
-Geliang
>
> Thanks!
>
> Paolo
>
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH mptcp-next v16 5/8] mptcp: add get_subflow wrapper
2022-04-27 14:34 ` Geliang Tang
@ 2022-04-27 16:53 ` Matthieu Baerts
0 siblings, 0 replies; 15+ messages in thread
From: Matthieu Baerts @ 2022-04-27 16:53 UTC (permalink / raw)
To: Geliang Tang, Paolo Abeni; +Cc: Geliang Tang, MPTCP Upstream
Hi Geliang, Paolo,
On 27/04/2022 16:34, Geliang Tang wrote:
> Hi Paolo,
>
> Thanks for your review!
>
> Paolo Abeni <pabeni@redhat.com> 于2022年4月27日周三 16:27写道:
>>
>> Hello,
>>
>> First of all I'm sorry for the very late feedback: I had some
>> difficulties to allocate time to follow this development.
>>
>> On Wed, 2022-04-27 at 09:56 +0800, Geliang Tang wrote:
>>> This patch defines a new wrapper mptcp_sched_get_subflow(), invoke
>>> get_subflow() of msk->sched in it. Use the wrapper instead of using
>>> mptcp_subflow_get_send() directly.
>>>
>>> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
>>> ---
>>> net/mptcp/protocol.c | 9 ++++-----
>>> net/mptcp/protocol.h | 16 ++++++++++++++++
>>> 2 files changed, 20 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>> index 7590e2d29f39..c6e963848b18 100644
>>> --- a/net/mptcp/protocol.c
>>> +++ b/net/mptcp/protocol.c
>>> @@ -1515,7 +1515,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
>>> subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
>>> READ_ONCE(ssk->sk_pacing_rate) * burst,
>>> burst + wmem);
>>> - msk->last_snd = ssk;
>>> msk->snd_burst = burst;
>>> return ssk;
>>> }
>>
>> This breaks "mptcp: really share subflow snd_wnd": When the MPTCP-level
>> cwin is 0, we want to do 0-window probe (so mptcp_subflow_get_send()
>> return a 'valid' subflow) but we don't want to start a new burst for
>> such 0 win probe (so mptcp_subflow_get_send() clears last_snd).
>>
>> With this change, when MPTCP-level cwin is 0, 'last_send' is not
>> cleared anymore.
>>
>> A simple sulution would be:
>>
>> ---
>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>> index 653757ea0aca..c20f5fd04cad 100644
>> --- a/net/mptcp/protocol.c
>> +++ b/net/mptcp/protocol.c
>> @@ -1507,7 +1507,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
>> burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
>> wmem = READ_ONCE(ssk->sk_wmem_queued);
>> if (!burst) {
>> - msk->last_snd = NULL;
>> + msk->snd_burst = 0;
>> return ssk;
>> }
>> ---
>
> I think it's better to add back "msk->last_snd = ssk;" in
> mptcp_subflow_get_send(), keep the original mptcp_subflow_get_send()
> unchanged. And set msk->last_snd in the get_subflow() of the
> scheduler, the same as v12:
>
> https://patchwork.kernel.org/project/mptcp/patch/396f8edcbf8e4f65a1d76c228dc3d12aa4dd2f11.1650430389.git.geliang.tang@suse.com/
It might be interesting to discuss about the scheduler API at the
meeting tomorrow (sadly I will not be there).
Maybe it is interesting to separate the work in two steps like it is
done in mptcp.org: select data, then select subflow.
If the scheduler needs to know if it is selecting a subflow for a
zero window test or not: we are getting closer to the mptcp.org API :)
(or maybe just enough to reset msk->snd_burst instead of asking all
schedulers to set 'last_snd' + give access to that for BPF)
Cheers,
Matt
>
>>
>> Probably a better solution would be adding 'snd_burst' to struct
>> mptcp_sched_data, and let mptcp_sched_get_subflow() update msk-
>>> snd_burst as specified by the scheduler. With this latter change the
>> 'msk' argument in mptcp_sched_ops->schedule() could be const.
>>
>>> @@ -1575,7 +1574,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
>>> int ret = 0;
>>>
>>> prev_ssk = ssk;
>>> - ssk = mptcp_subflow_get_send(msk);
>>> + ssk = mptcp_sched_get_subflow(msk, false);
>>>
>>> /* First check. If the ssk has changed since
>>> * the last round, release prev_ssk
>>> @@ -1644,7 +1643,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
>>> * check for a different subflow usage only after
>>> * spooling the first chunk of data
>>> */
>>> - xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
>>> + xmit_ssk = first ? ssk : mptcp_sched_get_subflow(mptcp_sk(sk), false);
>>> if (!xmit_ssk)
>>> goto out;
>>> if (xmit_ssk != ssk) {
>>> @@ -2489,7 +2488,7 @@ static void __mptcp_retrans(struct sock *sk)
>>> mptcp_clean_una_wakeup(sk);
>>>
>>> /* first check ssk: need to kick "stale" logic */
>>> - ssk = mptcp_subflow_get_retrans(msk);
>>> + ssk = mptcp_sched_get_subflow(msk, true);
>>> dfrag = mptcp_rtx_head(sk);
>>> if (!dfrag) {
>>> if (mptcp_data_fin_enabled(msk)) {
>>> @@ -3154,7 +3153,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
>>> return;
>>>
>>> if (!sock_owned_by_user(sk)) {
>>> - struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
>>> + struct sock *xmit_ssk = mptcp_sched_get_subflow(mptcp_sk(sk), false);
>>>
>>> if (xmit_ssk == ssk)
>>> __mptcp_subflow_push_pending(sk, ssk);
>>> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
>>> index 22f3f41e1e32..91512fc25128 100644
>>> --- a/net/mptcp/protocol.h
>>> +++ b/net/mptcp/protocol.h
>>> @@ -633,6 +633,22 @@ int mptcp_init_sched(struct mptcp_sock *msk,
>>> struct mptcp_sched_ops *sched);
>>> void mptcp_release_sched(struct mptcp_sock *msk);
>>>
>>> +static inline struct sock *mptcp_sched_get_subflow(struct mptcp_sock *msk, bool reinject)
>>> +{
>>> + struct mptcp_sched_data data = {
>>> + .sock = msk->first,
>>> + .call_again = 0,
>>> + };
>>
>> It's not clear to me:
>> - why we need the 'sock' argument, since the scheduler already get
>> 'msk' as argument?
>
> sock is the selected subflow to return in the scheduler.
>
>> - why we need to initialize it (looks like an 'output' argument ?!?)
>
> Yes, it's an output argument. No need to init it.
>
>> - what is the goal/role of the 'call_again' argument. It looks like we
>> just ignore it?!?
>
> call_again is for supporting a "redundant" packet scheduler in the
> future; it indicates that the get_subflow() function needs to be called again.
>
>>
>> Side note: perhaps we should move the following chunk:
>>
>> ---
>> sock_owned_by_me(sk);
>>
>> if (__mptcp_check_fallback(msk)) {
>> if (!msk->first)
>> return NULL;
>> return sk_stream_memory_free(msk->first) ? msk->first : NULL;
>> }
>> ---
>>
>> out of mptcp_subflow_get_send() into mptcp_sched_get_subflow(), so
>> that every scheduler will not have to deal with fallback sockets.
>>
>>> +
>>> + msk->sched ? INDIRECT_CALL_INET_1(msk->sched->get_subflow,
>>> + mptcp_get_subflow_default,
>>> + msk, reinject, &data) :
>>> + mptcp_get_subflow_default(msk, reinject, &data);
>>
>> With this we have quite a lot of conditionals for the default
>> scheduler. I think we can drop the INDIRECT_CALL_INET_1() wrapper, and
>> rework the code so that msk->sched is always NULL with the default
>> scheduler.
>
> So I need to drop patch 2 (2/8 mptcp: register default scheduler) in
> this series, right?
>
>>
>> And sorry to bring the next topic so late, but why a 'reinject'
>> argument instead of an additional mptcp_sched_ops op? the latter option
>> will avoid another branch in fast-path.
>
> How about keeping the 'reinject' argument like this:
>
> ----
> static inline struct sock *mptcp_sched_get_send(struct mptcp_sock *msk)
> {
> struct mptcp_sched_data data;
>
> sock_owned_by_me((struct sock *)msk);
>
> /* the following check is moved out of mptcp_subflow_get_send */
> if (__mptcp_check_fallback(msk)) {
> if (!msk->first)
> return NULL;
> return sk_stream_memory_free(msk->first) ? msk->first : NULL;
> }
>
> if (!msk->sched)
> return mptcp_subflow_get_send(msk);
>
> msk->sched->get_subflow(msk, false, &data);
>
> return data.sock;
> }
>
> static inline struct sock *mptcp_sched_get_retrans(struct mptcp_sock *msk)
> {
> struct mptcp_sched_data data;
>
> sock_owned_by_me((const struct sock *)msk);
>
> if (__mptcp_check_fallback(msk))
> return NULL;
>
> if (!msk->sched)
> return mptcp_subflow_get_retrans(msk);
>
> msk->sched->get_subflow(msk, true, &data);
>
> return data.sock;
> }
> -----
>
> Thanks,
> -Geliang
>
>>
>> Thanks!
>>
>> Paolo
>>
>>
>
--
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH mptcp-next v16 6/8] mptcp: add bpf_mptcp_sched_ops
2022-04-27 1:56 [PATCH mptcp-next v16 0/8] BPF packet scheduler Geliang Tang
` (4 preceding siblings ...)
2022-04-27 1:56 ` [PATCH mptcp-next v16 5/8] mptcp: add get_subflow wrapper Geliang Tang
@ 2022-04-27 1:56 ` Geliang Tang
2022-04-27 13:29 ` kernel test robot
2022-04-27 1:56 ` [PATCH mptcp-next v16 7/8] selftests: bpf: add bpf_first scheduler Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 8/8] selftests: bpf: add bpf_first test Geliang Tang
7 siblings, 1 reply; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 1:56 UTC (permalink / raw)
To: mptcp; +Cc: Geliang Tang
This patch implements a new struct bpf_struct_ops, bpf_mptcp_sched_ops.
Register and unregister the bpf scheduler in .reg and .unreg.
This MPTCP BPF scheduler implementation is similar to BPF TCP CC. And
net/ipv4/bpf_tcp_ca.c is a frame of reference for this patch.
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
kernel/bpf/bpf_struct_ops_types.h | 4 +
net/mptcp/bpf.c | 151 ++++++++++++++++++++++++++++++
2 files changed, 155 insertions(+)
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
index 5678a9ddf817..5a6b0c0d8d3d 100644
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -8,5 +8,9 @@ BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
+#ifdef CONFIG_MPTCP
+#include <net/mptcp.h>
+BPF_STRUCT_OPS_TYPE(mptcp_sched_ops)
+#endif
#endif
#endif
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
index 535602ba2582..debd23497784 100644
--- a/net/mptcp/bpf.c
+++ b/net/mptcp/bpf.c
@@ -10,8 +10,159 @@
#define pr_fmt(fmt) "MPTCP: " fmt
#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include "protocol.h"
+#ifdef CONFIG_BPF_JIT
+extern struct bpf_struct_ops bpf_mptcp_sched_ops;
+extern struct btf *btf_vmlinux;
+static const struct btf_type *mptcp_sched_type __read_mostly;
+static u32 mptcp_sched_id;
+
+static u32 optional_ops[] = {
+ offsetof(struct mptcp_sched_ops, init),
+ offsetof(struct mptcp_sched_ops, release),
+ offsetof(struct mptcp_sched_ops, get_subflow),
+};
+
+static const struct bpf_func_proto *
+bpf_mptcp_sched_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id);
+}
+
+static int bpf_mptcp_sched_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
+{
+ size_t end;
+
+ if (atype == BPF_READ)
+ return btf_struct_access(log, btf, t, off, size, atype,
+ next_btf_id, flag);
+
+ if (t != mptcp_sched_type) {
+ bpf_log(log, "only access to mptcp_sched_data is supported\n");
+ return -EACCES;
+ }
+
+ switch (off) {
+ case offsetof(struct mptcp_sched_data, sock):
+ end = offsetofend(struct mptcp_sched_data, sock);
+ break;
+ case offsetof(struct mptcp_sched_data, call_again):
+ end = offsetofend(struct mptcp_sched_data, call_again);
+ break;
+ default:
+ bpf_log(log, "no write support to mptcp_sched_data at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log, "access beyond mptcp_sched_data at off %u size %u ended at %zu",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return NOT_INIT;
+}
+
+static const struct bpf_verifier_ops bpf_mptcp_sched_verifier_ops = {
+ .get_func_proto = bpf_mptcp_sched_get_func_proto,
+ .is_valid_access = bpf_tracing_btf_ctx_access,
+ .btf_struct_access = bpf_mptcp_sched_btf_struct_access,
+};
+
+static int bpf_mptcp_sched_reg(void *kdata)
+{
+ return mptcp_register_scheduler(kdata);
+}
+
+static void bpf_mptcp_sched_unreg(void *kdata)
+{
+ mptcp_unregister_scheduler(kdata);
+}
+
+static int bpf_mptcp_sched_check_member(const struct btf_type *t,
+ const struct btf_member *member)
+{
+ return 0;
+}
+
+static bool is_optional(u32 member_offset)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(optional_ops); i++) {
+ if (member_offset == optional_ops[i])
+ return true;
+ }
+
+ return false;
+}
+
+static int bpf_mptcp_sched_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct mptcp_sched_ops *usched;
+ struct mptcp_sched_ops *sched;
+ int prog_fd;
+ u32 moff;
+
+ usched = (const struct mptcp_sched_ops *)udata;
+ sched = (struct mptcp_sched_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct mptcp_sched_ops, name):
+ if (bpf_obj_name_cpy(sched->name, usched->name,
+ sizeof(sched->name)) <= 0)
+ return -EINVAL;
+ if (mptcp_sched_find(usched->name))
+ return -EEXIST;
+ return 1;
+ }
+
+ if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL))
+ return 0;
+
+ /* Ensure bpf_prog is provided for compulsory func ptr */
+ prog_fd = (int)(*(unsigned long *)(udata + moff));
+ if (!prog_fd && !is_optional(moff))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int bpf_mptcp_sched_init(struct btf *btf)
+{
+ mptcp_sched_id = btf_find_by_name_kind(btf, "mptcp_sched_data",
+ BTF_KIND_STRUCT);
+ if (mptcp_sched_id < 0)
+ return -EINVAL;
+ mptcp_sched_type = btf_type_by_id(btf, mptcp_sched_id);
+
+ return 0;
+}
+
+struct bpf_struct_ops bpf_mptcp_sched_ops = {
+ .verifier_ops = &bpf_mptcp_sched_verifier_ops,
+ .reg = bpf_mptcp_sched_reg,
+ .unreg = bpf_mptcp_sched_unreg,
+ .check_member = bpf_mptcp_sched_check_member,
+ .init_member = bpf_mptcp_sched_init_member,
+ .init = bpf_mptcp_sched_init,
+ .name = "mptcp_sched_ops",
+};
+#endif /* CONFIG_BPF_JIT */
+
struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk)
{
if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk))
--
2.34.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH mptcp-next v16 6/8] mptcp: add bpf_mptcp_sched_ops
2022-04-27 1:56 ` [PATCH mptcp-next v16 6/8] mptcp: add bpf_mptcp_sched_ops Geliang Tang
@ 2022-04-27 13:29 ` kernel test robot
0 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2022-04-27 13:29 UTC (permalink / raw)
To: Geliang Tang, mptcp; +Cc: kbuild-all, Geliang Tang
Hi Geliang,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on mptcp/export]
[cannot apply to bpf-next/master bpf/master linus/master v5.18-rc4 next-20220427]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/Geliang-Tang/BPF-packet-scheduler/20220427-095943
base: https://github.com/multipath-tcp/mptcp_net-next.git export
config: i386-randconfig-m021 (https://download.01.org/0day-ci/archive/20220427/202204272125.AiqJUAjR-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
smatch warnings:
net/mptcp/bpf.c:148 bpf_mptcp_sched_init() warn: unsigned 'mptcp_sched_id' is never less than zero.
vim +/mptcp_sched_id +148 net/mptcp/bpf.c
143
144 static int bpf_mptcp_sched_init(struct btf *btf)
145 {
146 mptcp_sched_id = btf_find_by_name_kind(btf, "mptcp_sched_data",
147 BTF_KIND_STRUCT);
> 148 if (mptcp_sched_id < 0)
149 return -EINVAL;
150 mptcp_sched_type = btf_type_by_id(btf, mptcp_sched_id);
151
152 return 0;
153 }
154
--
0-DAY CI Kernel Test Service
https://01.org/lkp
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH mptcp-next v16 7/8] selftests: bpf: add bpf_first scheduler
2022-04-27 1:56 [PATCH mptcp-next v16 0/8] BPF packet scheduler Geliang Tang
` (5 preceding siblings ...)
2022-04-27 1:56 ` [PATCH mptcp-next v16 6/8] mptcp: add bpf_mptcp_sched_ops Geliang Tang
@ 2022-04-27 1:56 ` Geliang Tang
2022-04-27 1:56 ` [PATCH mptcp-next v16 8/8] selftests: bpf: add bpf_first test Geliang Tang
7 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 1:56 UTC (permalink / raw)
To: mptcp; +Cc: Geliang Tang
This patch implements the simplest MPTCP scheduler, named bpf_first,
which always picks the first subflow to send data. It's a sample of
MPTCP BPF scheduler implementations.
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
.../selftests/bpf/progs/mptcp_bpf_first.c | 32 +++++++++++++++++++
1 file changed, 32 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/mptcp_bpf_first.c
diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_first.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_first.c
new file mode 100644
index 000000000000..0ca9754c078d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_first.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022, SUSE. */
+
+#include <linux/bpf.h>
+#include "bpf_mptcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops/mptcp_sched_first_init")
+void BPF_PROG(mptcp_sched_first_init, struct mptcp_sock *msk)
+{
+}
+
+SEC("struct_ops/mptcp_sched_first_release")
+void BPF_PROG(mptcp_sched_first_release, struct mptcp_sock *msk)
+{
+}
+
+void BPF_STRUCT_OPS(bpf_first_get_subflow, struct mptcp_sock *msk,
+ bool reinject, struct mptcp_sched_data *data)
+{
+ data->sock = msk->first;
+ data->call_again = 0;
+}
+
+SEC(".struct_ops")
+struct mptcp_sched_ops first = {
+ .init = (void *)mptcp_sched_first_init,
+ .release = (void *)mptcp_sched_first_release,
+ .get_subflow = (void *)bpf_first_get_subflow,
+ .name = "bpf_first",
+};
--
2.34.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH mptcp-next v16 8/8] selftests: bpf: add bpf_first test
2022-04-27 1:56 [PATCH mptcp-next v16 0/8] BPF packet scheduler Geliang Tang
` (6 preceding siblings ...)
2022-04-27 1:56 ` [PATCH mptcp-next v16 7/8] selftests: bpf: add bpf_first scheduler Geliang Tang
@ 2022-04-27 1:56 ` Geliang Tang
2022-04-27 3:30 ` selftests: bpf: add bpf_first test: Tests Results MPTCP CI
7 siblings, 1 reply; 15+ messages in thread
From: Geliang Tang @ 2022-04-27 1:56 UTC (permalink / raw)
To: mptcp; +Cc: Geliang Tang
This patch extends the MPTCP test base to support MPTCP packet
scheduler tests. Add the bpf_first scheduler test in it. Use sysctl
to set net.mptcp.scheduler to use this sched.
Some code in send_data() is from prog_tests/bpf_tcp_ca.c.
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
.../testing/selftests/bpf/prog_tests/mptcp.c | 112 ++++++++++++++++++
1 file changed, 112 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index 7e704f5aab05..44484a63e62a 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -4,6 +4,7 @@
#include <test_progs.h>
#include "cgroup_helpers.h"
#include "network_helpers.h"
+#include "mptcp_bpf_first.skel.h"
#ifndef TCP_CA_NAME_MAX
#define TCP_CA_NAME_MAX 16
@@ -19,6 +20,8 @@ struct mptcp_storage {
};
static char monitor_log_path[64];
+static const unsigned int total_bytes = 10 * 1024 * 1024;
+static int stop, duration;
static int verify_tsk(int map_fd, int client_fd)
{
@@ -251,8 +254,117 @@ void test_base(void)
close(cgroup_fd);
}
+static void *server(void *arg)
+{
+ int lfd = (int)(long)arg, err = 0, fd;
+ ssize_t nr_sent = 0, bytes = 0;
+ char batch[1500];
+
+ fd = accept(lfd, NULL, NULL);
+ while (fd == -1) {
+ if (errno == EINTR)
+ continue;
+ err = -errno;
+ goto done;
+ }
+
+ if (settimeo(fd, 0)) {
+ err = -errno;
+ goto done;
+ }
+
+ while (bytes < total_bytes && !READ_ONCE(stop)) {
+ nr_sent = send(fd, &batch,
+ MIN(total_bytes - bytes, sizeof(batch)), 0);
+ if (nr_sent == -1 && errno == EINTR)
+ continue;
+ if (nr_sent == -1) {
+ err = -errno;
+ break;
+ }
+ bytes += nr_sent;
+ }
+
+ CHECK(bytes != total_bytes, "send", "%zd != %u nr_sent:%zd errno:%d\n",
+ bytes, total_bytes, nr_sent, errno);
+
+done:
+ if (fd >= 0)
+ close(fd);
+ if (err) {
+ WRITE_ONCE(stop, 1);
+ return ERR_PTR(err);
+ }
+ return NULL;
+}
+
+static void send_data(int lfd, int fd)
+{
+ ssize_t nr_recv = 0, bytes = 0;
+ pthread_t srv_thread;
+ void *thread_ret;
+ char batch[1500];
+ int err;
+
+ WRITE_ONCE(stop, 0);
+
+ err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd);
+ if (CHECK(err != 0, "pthread_create", "err:%d errno:%d\n", err, errno))
+ return;
+
+ /* recv total_bytes */
+ while (bytes < total_bytes && !READ_ONCE(stop)) {
+ nr_recv = recv(fd, &batch,
+ MIN(total_bytes - bytes, sizeof(batch)), 0);
+ if (nr_recv == -1 && errno == EINTR)
+ continue;
+ if (nr_recv == -1)
+ break;
+ bytes += nr_recv;
+ }
+
+ CHECK(bytes != total_bytes, "recv", "%zd != %u nr_recv:%zd errno:%d\n",
+ bytes, total_bytes, nr_recv, errno);
+
+ WRITE_ONCE(stop, 1);
+
+ pthread_join(srv_thread, &thread_ret);
+ CHECK(IS_ERR(thread_ret), "pthread_join", "thread_ret:%ld",
+ PTR_ERR(thread_ret));
+}
+
+static void test_first(void)
+{
+ struct mptcp_bpf_first *first_skel;
+ int server_fd, client_fd;
+ struct bpf_link *link;
+
+ first_skel = mptcp_bpf_first__open_and_load();
+ if (CHECK(!first_skel, "bpf_first__open_and_load", "failed\n"))
+ return;
+
+ link = bpf_map__attach_struct_ops(first_skel->maps.first);
+ if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) {
+ mptcp_bpf_first__destroy(first_skel);
+ return;
+ }
+
+ system("sysctl -q net.mptcp.scheduler=bpf_first");
+ server_fd = start_mptcp_server(AF_INET, NULL, 0, 0);
+ client_fd = connect_to_mptcp_fd(server_fd, 0);
+
+ send_data(server_fd, client_fd);
+
+ close(client_fd);
+ close(server_fd);
+ bpf_link__destroy(link);
+ mptcp_bpf_first__destroy(first_skel);
+}
+
void test_mptcp(void)
{
if (test__start_subtest("base"))
test_base();
+ if (test__start_subtest("first"))
+ test_first();
}
--
2.34.1
^ permalink raw reply related [flat|nested] 15+ messages in thread