* [PATCH bpf-next v4 1/9] bpf: Retire the struct_ops map kvalue->refcnt.
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
@ 2023-03-07 23:32 ` Kui-Feng Lee
2023-03-07 23:33 ` [PATCH bpf-next v4 2/9] bpf: Create links for BPF struct_ops maps Kui-Feng Lee
` (7 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:32 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
We have replaced kvalue->refcnt with synchronize_rcu() to wait for an
RCU grace period.
Maintenance of kvalue->refcnt was a complicated task, as we had to
simultaneously keep track of two reference counts: one for
kvalue->refcnt itself and one for the reference count of bpf_map.
When the kvalue->refcnt reaches zero, we also have to reduce the
reference count on bpf_map - yet these steps are not performed in an
atomic manner and require us to be vigilant when managing them. By
eliminating kvalue->refcnt, we can make our maintenance more
straightforward as the refcount of bpf_map is now solely managed!
To prevent the trampoline image of a struct_ops from being released
while it is still in use, we wait for an RCU grace period. The
setsockopt(TCP_CONGESTION, "...") command allows you to change your
socket's congestion control algorithm and can result in releasing the
old struct_ops implementation. Moreover, since this function is
exposed through bpf_setsockopt(), it may be accessed by BPF programs
as well. To ensure that the trampoline image belonging to a struct_ops
can be safely called while its method is in use, struct_ops is
safeguarded with rcu_read_lock(). Doing so prevents any destruction of
the associated images before returning from a trampoline and requires
us to wait for an RCU grace period.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/linux/bpf.h | 2 ++
kernel/bpf/bpf_struct_ops.c | 60 ++++++++++++++++++-------------------
kernel/bpf/syscall.c | 2 +-
3 files changed, 32 insertions(+), 32 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8b5d0b4c4ada..fc47c756455e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -78,6 +78,7 @@ struct bpf_map_ops {
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
+ void (*map_free_rcu)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
@@ -1869,6 +1870,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
void bpf_map_inc(struct bpf_map *map);
void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ece9870cab68..815b5e1cf325 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -58,6 +58,8 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+static DEFINE_MUTEX(update_mutex);
+
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
@@ -249,6 +251,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
@@ -267,7 +270,9 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -491,7 +496,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- refcount_set(&kvalue->refcnt, 1);
bpf_map_inc(map);
set_memory_rox((long)st_map->image, 1);
@@ -536,8 +540,7 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -574,6 +577,19 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * Thus, a rcu grace period is needed here.
+ */
+ synchronize_rcu();
+
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links);
@@ -660,41 +676,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
-}
-
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
-{
- struct bpf_struct_ops_map *st_map;
-
- st_map = container_of(head, struct bpf_struct_ops_map, rcu);
- bpf_map_put(&st_map->map);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
void bpf_struct_ops_put(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
-
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its map->refcnt may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * Thus, a rcu grace period is needed here.
- */
- call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
- }
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ bpf_map_put(&st_map->map);
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cda8d00f3762..6351d50e3d8b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1276,7 +1276,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
}
/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH bpf-next v4 2/9] bpf: Create links for BPF struct_ops maps.
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
2023-03-07 23:32 ` [PATCH bpf-next v4 1/9] bpf: Retire the struct_ops map kvalue->refcnt Kui-Feng Lee
@ 2023-03-07 23:33 ` Kui-Feng Lee
2023-03-08 0:37 ` Andrii Nakryiko
2023-03-07 23:33 ` [PATCH bpf-next v4 3/9] net: Update an existing TCP congestion control algorithm Kui-Feng Lee
` (6 subsequent siblings)
8 siblings, 1 reply; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:33 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
BPF struct_ops maps are employed directly to register TCP Congestion
Control algorithms. Unlike other BPF programs, which terminate when
their links are gone, struct_ops did not have links until now. The
link of a BPF struct_ops map provides a uniform experience akin to
other types of BPF programs.
bpf_links are responsible for registering their associated
struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
set to create a bpf_link, while a struct_ops without this flag behaves in
the same manner as before and is registered upon updating its value.
The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
used to craft the links for BPF struct_ops programs, but also to
create links for BPF struct_ops maps themselves. Since the links of BPF
struct_ops programs are only used to create trampolines internally,
they are never seen in other contexts. Thus, they can be reused for
struct_ops maps themselves.
To maintain a reference to the map supporting this link, we add
bpf_struct_ops_link as an additional link type. The map pointer is
RCU-protected, although that protection won't be necessary until
later in the patchset.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/linux/bpf.h | 11 +++
include/uapi/linux/bpf.h | 12 +++-
kernel/bpf/bpf_struct_ops.c | 119 +++++++++++++++++++++++++++++++--
kernel/bpf/syscall.c | 23 ++++---
tools/include/uapi/linux/bpf.h | 12 +++-
5 files changed, 163 insertions(+), 14 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fc47c756455e..855b27f847eb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1481,6 +1481,7 @@ static inline void bpf_module_put(const void *data, struct module *owner)
else
module_put(owner);
}
+int bpf_struct_ops_link_create(union bpf_attr *attr);
#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration */
@@ -1521,6 +1522,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
{
return -EINVAL;
}
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
#endif
#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
@@ -2307,6 +2313,11 @@ static inline void bpf_link_put(struct bpf_link *link)
{
}
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
static inline int bpf_obj_get_user(const char __user *pathname, int flags)
{
return -EOPNOTSUPP;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 17afd2b35ee5..eb3e435c5303 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1266,6 +1267,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -6354,6 +6361,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 815b5e1cf325..dcb7a408d4e9 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -16,6 +16,7 @@ enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
+ BPF_STRUCT_OPS_STATE_READY,
};
#define BPF_STRUCT_OPS_COMMON_VALUE \
@@ -58,6 +59,11 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+};
+
static DEFINE_MUTEX(update_mutex);
#define VALUE_PREFIX "bpf_struct_ops_"
@@ -496,11 +502,19 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- bpf_map_inc(map);
-
set_memory_rox((long)st_map->image, 1);
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
+
err = st_ops->reg(kdata);
if (likely(!err)) {
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -516,7 +530,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
set_memory_nx((long)st_map->image, 1);
set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -534,6 +547,9 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
@@ -601,7 +617,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
@@ -696,3 +712,98 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)st_link->map;
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ bpf_map_put(st_link->map);
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ if (map)
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ if (map)
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock();
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (!map)
+ return -EINVAL;
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK) ||
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+ RCU_INIT_POINTER(link->map, map);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ err = st_map->st_ops->reg(st_map->kvalue.data);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto err_out;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
+}
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6351d50e3d8b..25b044fdd82b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2794,16 +2794,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
+ "link_id:\t%u\n",
bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ link->id);
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
@@ -4278,7 +4281,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -4541,6 +4545,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 17afd2b35ee5..cd0ff39981e8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1266,6 +1267,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* eBPF struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -6354,6 +6361,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 2/9] bpf: Create links for BPF struct_ops maps.
2023-03-07 23:33 ` [PATCH bpf-next v4 2/9] bpf: Create links for BPF struct_ops maps Kui-Feng Lee
@ 2023-03-08 0:37 ` Andrii Nakryiko
2023-03-08 1:11 ` Kui-Feng Lee
0 siblings, 1 reply; 25+ messages in thread
From: Andrii Nakryiko @ 2023-03-08 0:37 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>
> BPF struct_ops maps are employed directly to register TCP Congestion
> Control algorithms. Unlike other BPF programs that terminate when
> their links gone. The link of a BPF struct_ops map provides a uniform
> experience akin to other types of BPF programs.
>
> bpf_links are responsible for registering their associated
> struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
> set to create a bpf_link, while a structs without this flag behaves in
> the same manner as before and is registered upon updating its value.
>
> The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
> used to craft the links for BPF struct_ops programs, but also to
> create links for BPF struct_ops them-self. Since the links of BPF
> struct_ops programs are only used to create trampolines internally,
> they are never seen in other contexts. Thus, they can be reused for
> struct_ops themself.
>
> To maintain a reference to the map supporting this link, we add
> bpf_struct_ops_link as an additional type. The pointer of the map is
> RCU and won't be necessary until later in the patchset.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
> include/linux/bpf.h | 11 +++
> include/uapi/linux/bpf.h | 12 +++-
> kernel/bpf/bpf_struct_ops.c | 119 +++++++++++++++++++++++++++++++--
> kernel/bpf/syscall.c | 23 ++++---
> tools/include/uapi/linux/bpf.h | 12 +++-
> 5 files changed, 163 insertions(+), 14 deletions(-)
>
[...]
> +int bpf_struct_ops_link_create(union bpf_attr *attr)
> +{
> + struct bpf_struct_ops_link *link = NULL;
> + struct bpf_link_primer link_primer;
> + struct bpf_struct_ops_map *st_map;
> + struct bpf_map *map;
> + int err;
> +
> + map = bpf_map_get(attr->link_create.map_fd);
> + if (!map)
> + return -EINVAL;
> +
> + st_map = (struct bpf_struct_ops_map *)map;
> +
> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK) ||
> + /* Pair with smp_store_release() during map_update */
> + smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
> + err = -EINVAL;
> + goto err_out;
> + }
> +
> + link = kzalloc(sizeof(*link), GFP_USER);
> + if (!link) {
> + err = -ENOMEM;
> + goto err_out;
> + }
> + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
> + RCU_INIT_POINTER(link->map, map);
> +
> + err = bpf_link_prime(&link->link, &link_primer);
> + if (err)
> + goto err_out;
> +
> + err = st_map->st_ops->reg(st_map->kvalue.data);
> + if (err) {
> + bpf_link_cleanup(&link_primer);
link = NULL to avoid kfree()-ing it, see bpf_tracing_prog_attach() for
similar approach
> + goto err_out;
> + }
> +
> + return bpf_link_settle(&link_primer);
> +
> +err_out:
> + bpf_map_put(map);
> + kfree(link);
> + return err;
> +}
> +
[...]
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 2/9] bpf: Create links for BPF struct_ops maps.
2023-03-08 0:37 ` Andrii Nakryiko
@ 2023-03-08 1:11 ` Kui-Feng Lee
0 siblings, 0 replies; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-08 1:11 UTC (permalink / raw)
To: Andrii Nakryiko, Kui-Feng Lee
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On 3/7/23 16:37, Andrii Nakryiko wrote:
> On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>>
>> BPF struct_ops maps are employed directly to register TCP Congestion
>> Control algorithms. Unlike other BPF programs that terminate when
>> their links gone. The link of a BPF struct_ops map provides a uniform
>> experience akin to other types of BPF programs.
>>
>> bpf_links are responsible for registering their associated
>> struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
>> set to create a bpf_link, while a structs without this flag behaves in
>> the same manner as before and is registered upon updating its value.
>>
>> The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
>> used to craft the links for BPF struct_ops programs, but also to
>> create links for BPF struct_ops them-self. Since the links of BPF
>> struct_ops programs are only used to create trampolines internally,
>> they are never seen in other contexts. Thus, they can be reused for
>> struct_ops themself.
>>
>> To maintain a reference to the map supporting this link, we add
>> bpf_struct_ops_link as an additional type. The pointer of the map is
>> RCU and won't be necessary until later in the patchset.
>>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> include/linux/bpf.h | 11 +++
>> include/uapi/linux/bpf.h | 12 +++-
>> kernel/bpf/bpf_struct_ops.c | 119 +++++++++++++++++++++++++++++++--
>> kernel/bpf/syscall.c | 23 ++++---
>> tools/include/uapi/linux/bpf.h | 12 +++-
>> 5 files changed, 163 insertions(+), 14 deletions(-)
>>
>
> [...]
>
>> +int bpf_struct_ops_link_create(union bpf_attr *attr)
>> +{
>> + struct bpf_struct_ops_link *link = NULL;
>> + struct bpf_link_primer link_primer;
>> + struct bpf_struct_ops_map *st_map;
>> + struct bpf_map *map;
>> + int err;
>> +
>> + map = bpf_map_get(attr->link_create.map_fd);
>> + if (!map)
>> + return -EINVAL;
>> +
>> + st_map = (struct bpf_struct_ops_map *)map;
>> +
>> + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS || !(map->map_flags & BPF_F_LINK) ||
>> + /* Pair with smp_store_release() during map_update */
>> + smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
>> + err = -EINVAL;
>> + goto err_out;
>> + }
>> +
>> + link = kzalloc(sizeof(*link), GFP_USER);
>> + if (!link) {
>> + err = -ENOMEM;
>> + goto err_out;
>> + }
>> + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
>> + RCU_INIT_POINTER(link->map, map);
>> +
>> + err = bpf_link_prime(&link->link, &link_primer);
>> + if (err)
>> + goto err_out;
>> +
>> + err = st_map->st_ops->reg(st_map->kvalue.data);
>> + if (err) {
>> + bpf_link_cleanup(&link_primer);
>
> link = NULL to avoid kfree()-ing it, see bpf_tracing_prog_attach() for
> similar approach
>
Got it! Thank you!
>> + goto err_out;
>> + }
>> +
>> + return bpf_link_settle(&link_primer);
>> +
>> +err_out:
>> + bpf_map_put(map);
>> + kfree(link);
>> + return err;
>> +}
>> +
>
> [...]
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH bpf-next v4 3/9] net: Update an existing TCP congestion control algorithm.
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
2023-03-07 23:32 ` [PATCH bpf-next v4 1/9] bpf: Retire the struct_ops map kvalue->refcnt Kui-Feng Lee
2023-03-07 23:33 ` [PATCH bpf-next v4 2/9] bpf: Create links for BPF struct_ops maps Kui-Feng Lee
@ 2023-03-07 23:33 ` Kui-Feng Lee
2023-03-07 23:33 ` [PATCH bpf-next v4 4/9] bpf: Validate kdata of a struct_ops before transiting to READY Kui-Feng Lee
` (5 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:33 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
This feature lets you immediately transition to another congestion
control algorithm or implementation with the same name. Once a name
is updated, new connections will apply this new algorithm.
The 'validate' function pointer has been added to bpf_struct_ops as
well, allowing us to validate a struct_ops without having to go
through the registration process.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/linux/bpf.h | 2 ++
include/net/tcp.h | 3 ++
net/bpf/bpf_dummy_struct_ops.c | 6 ++++
net/ipv4/bpf_tcp_ca.c | 14 +++++++--
net/ipv4/tcp_cong.c | 57 +++++++++++++++++++++++++++++-----
5 files changed, 73 insertions(+), 9 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 855b27f847eb..047d2c6aba88 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1447,6 +1447,8 @@ struct bpf_struct_ops {
void *kdata, const void *udata);
int (*reg)(void *kdata);
void (*unreg)(void *kdata);
+ int (*update)(void *kdata, void *old_kdata);
+ int (*validate)(void *kdata);
const struct btf_type *type;
const struct btf_type *value_type;
const char *name;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1e..2abb755e6a3a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1117,6 +1117,9 @@ struct tcp_congestion_ops {
int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+int tcp_update_congestion_control(struct tcp_congestion_ops *type,
+ struct tcp_congestion_ops *old_type);
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca);
void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index ff4f89a2b02a..158f14e240d0 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -222,12 +222,18 @@ static void bpf_dummy_unreg(void *kdata)
{
}
+static int bpf_dummy_update(void *kdata, void *old_kdata)
+{
+ return -EOPNOTSUPP;
+}
+
struct bpf_struct_ops bpf_bpf_dummy_ops = {
.verifier_ops = &bpf_dummy_verifier_ops,
.init = bpf_dummy_init,
.check_member = bpf_dummy_ops_check_member,
.init_member = bpf_dummy_init_member,
.reg = bpf_dummy_reg,
+ .update = bpf_dummy_update,
.unreg = bpf_dummy_unreg,
.name = "bpf_dummy_ops",
};
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 13fc0c185cd9..e8b27826283e 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
sizeof(tcp_ca->name)) <= 0)
return -EINVAL;
- if (tcp_ca_find(utcp_ca->name))
- return -EEXIST;
return 1;
}
@@ -266,13 +264,25 @@ static void bpf_tcp_ca_unreg(void *kdata)
tcp_unregister_congestion_control(kdata);
}
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+ return tcp_update_congestion_control(kdata, old_kdata);
+}
+
+static int bpf_tcp_ca_validate(void *kdata)
+{
+ return tcp_validate_congestion_control(kdata);
+}
+
struct bpf_struct_ops bpf_tcp_congestion_ops = {
.verifier_ops = &bpf_tcp_ca_verifier_ops,
.reg = bpf_tcp_ca_reg,
.unreg = bpf_tcp_ca_unreg,
+ .update = bpf_tcp_ca_update,
.check_member = bpf_tcp_ca_check_member,
.init_member = bpf_tcp_ca_init_member,
.init = bpf_tcp_ca_init,
+ .validate = bpf_tcp_ca_validate,
.name = "tcp_congestion_ops",
};
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db8b4b488c31..24829390e495 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
return NULL;
}
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
- */
-int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
{
- int ret = 0;
-
/* all algorithms must implement these */
if (!ca->ssthresh || !ca->undo_cwnd ||
!(ca->cong_avoid || ca->cong_control)) {
@@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
return -EINVAL;
}
+ return 0;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+ int ret;
+
+ ret = tcp_validate_congestion_control(ca);
+ if (ret)
+ return ret;
+
ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
spin_lock(&tcp_cong_list_lock);
@@ -130,6 +138,41 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, that has been
+ * registered.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+ struct tcp_congestion_ops *existing;
+ int ret;
+
+ ret = tcp_validate_congestion_control(ca);
+ if (ret)
+ return ret;
+
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+ spin_lock(&tcp_cong_list_lock);
+ existing = tcp_ca_find_key(old_ca->key);
+ if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+ pr_notice("%s not registered or non-unique key\n",
+ ca->name);
+ ret = -EINVAL;
+ } else if (existing != old_ca) {
+ pr_notice("invalid old congestion control algorithm to replace\n");
+ ret = -EINVAL;
+ } else {
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ list_del_rcu(&existing->list);
+ pr_debug("%s updated\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ return ret;
+}
+
u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH bpf-next v4 4/9] bpf: Validate kdata of a struct_ops before transiting to READY.
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
` (2 preceding siblings ...)
2023-03-07 23:33 ` [PATCH bpf-next v4 3/9] net: Update an existing TCP congestion control algorithm Kui-Feng Lee
@ 2023-03-07 23:33 ` Kui-Feng Lee
2023-03-07 23:33 ` [PATCH bpf-next v4 5/9] libbpf: Create a bpf_link in bpf_map__attach_struct_ops() Kui-Feng Lee
` (4 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:33 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
By utilizing this check, we can avoid creating a struct_ops that
cannot be registered afterward, preventing future complications.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
kernel/bpf/bpf_struct_ops.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index dcb7a408d4e9..c71c8d73c7ad 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -504,6 +504,11 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
set_memory_rox((long)st_map->image, 1);
if (st_map->map.map_flags & BPF_F_LINK) {
+ if (st_ops->validate) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto unlock;
+ }
/* Let bpf_link handle registration & unregistration.
*
* Pair with smp_load_acquire() during lookup_elem().
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH bpf-next v4 5/9] libbpf: Create a bpf_link in bpf_map__attach_struct_ops().
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
` (3 preceding siblings ...)
2023-03-07 23:33 ` [PATCH bpf-next v4 4/9] bpf: Validate kdata of a struct_ops before transiting to READY Kui-Feng Lee
@ 2023-03-07 23:33 ` Kui-Feng Lee
2023-03-08 0:46 ` Andrii Nakryiko
2023-03-07 23:33 ` [PATCH bpf-next v4 6/9] bpf: Update the struct_ops of a bpf_link Kui-Feng Lee
` (3 subsequent siblings)
8 siblings, 1 reply; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:33 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
bpf_map__attach_struct_ops() was creating a dummy bpf_link as a
placeholder, but now it is constructing an authentic one by calling
bpf_link_create() if the map has the BPF_F_LINK flag.
You can flag a struct_ops map with BPF_F_LINK by calling
bpf_map__set_map_flags().
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
tools/lib/bpf/libbpf.c | 84 +++++++++++++++++++++++++++++++-----------
1 file changed, 62 insertions(+), 22 deletions(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 35a698eb825d..a67efc3b3763 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -115,6 +115,7 @@ static const char * const attach_type_name[] = {
[BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate",
[BPF_PERF_EVENT] = "perf_event",
[BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
+ [BPF_STRUCT_OPS] = "struct_ops",
};
static const char * const link_type_name[] = {
@@ -7677,6 +7678,26 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
return 0;
}
+static void bpf_map_prepare_vdata(const struct bpf_map *map)
+{
+ struct bpf_struct_ops *st_ops;
+ __u32 i;
+
+ st_ops = map->st_ops;
+ for (i = 0; i < btf_vlen(st_ops->type); i++) {
+ struct bpf_program *prog = st_ops->progs[i];
+ void *kern_data;
+ int prog_fd;
+
+ if (!prog)
+ continue;
+
+ prog_fd = bpf_program__fd(prog);
+ kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
+ *(unsigned long *)kern_data = prog_fd;
+ }
+}
+
static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
{
int err, i;
@@ -7728,6 +7749,10 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
btf__free(obj->btf_vmlinux);
obj->btf_vmlinux = NULL;
+ for (i = 0; i < obj->nr_maps; i++)
+ if (bpf_map__is_struct_ops(&obj->maps[i]))
+ bpf_map_prepare_vdata(&obj->maps[i]);
+
obj->loaded = true; /* doesn't matter if successfully or not */
if (err)
@@ -11429,22 +11454,34 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
return link;
}
+struct bpf_link_struct_ops {
+ struct bpf_link link;
+ int map_fd;
+};
+
static int bpf_link__detach_struct_ops(struct bpf_link *link)
{
+ struct bpf_link_struct_ops *st_link;
__u32 zero = 0;
- if (bpf_map_delete_elem(link->fd, &zero))
- return -errno;
+ st_link = container_of(link, struct bpf_link_struct_ops, link);
- return 0;
+ if (st_link->map_fd < 0) {
+ /* Fake bpf_link */
+ if (bpf_map_delete_elem(link->fd, &zero))
+ return -errno;
+ return 0;
+ }
+
+ /* Doesn't support detaching. */
+ return -EOPNOTSUPP;
}
struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
{
- struct bpf_struct_ops *st_ops;
- struct bpf_link *link;
- __u32 i, zero = 0;
- int err;
+ struct bpf_link_struct_ops *link;
+ __u32 zero = 0;
+ int err, fd;
if (!bpf_map__is_struct_ops(map) || map->fd == -1)
return libbpf_err_ptr(-EINVAL);
@@ -11453,31 +11490,34 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
if (!link)
return libbpf_err_ptr(-EINVAL);
- st_ops = map->st_ops;
- for (i = 0; i < btf_vlen(st_ops->type); i++) {
- struct bpf_program *prog = st_ops->progs[i];
- void *kern_data;
- int prog_fd;
+ /* kern_vdata should be prepared during the loading phase. */
+ err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+ if (err) {
+ err = -errno;
+ free(link);
+ return libbpf_err_ptr(err);
+ }
- if (!prog)
- continue;
- prog_fd = bpf_program__fd(prog);
- kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
- *(unsigned long *)kern_data = prog_fd;
+ if (!(map->def.map_flags & BPF_F_LINK)) {
+ /* Fake bpf_link */
+ link->link.fd = map->fd;
+ link->map_fd = -1;
+ link->link.detach = bpf_link__detach_struct_ops;
+ return &link->link;
}
- err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0);
- if (err) {
+ fd = bpf_link_create(map->fd, -1, BPF_STRUCT_OPS, NULL);
+ if (fd < 0) {
err = -errno;
free(link);
return libbpf_err_ptr(err);
}
- link->detach = bpf_link__detach_struct_ops;
- link->fd = map->fd;
+ link->link.fd = fd;
+ link->map_fd = map->fd;
- return link;
+ return &link->link;
}
typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 5/9] libbpf: Create a bpf_link in bpf_map__attach_struct_ops().
2023-03-07 23:33 ` [PATCH bpf-next v4 5/9] libbpf: Create a bpf_link in bpf_map__attach_struct_ops() Kui-Feng Lee
@ 2023-03-08 0:46 ` Andrii Nakryiko
2023-03-08 3:33 ` Kui-Feng Lee
0 siblings, 1 reply; 25+ messages in thread
From: Andrii Nakryiko @ 2023-03-08 0:46 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>
> bpf_map__attach_struct_ops() was creating a dummy bpf_link as a
> placeholder, but now it is constructing an authentic one by calling
> bpf_link_create() if the map has the BPF_F_LINK flag.
>
> You can flag a struct_ops map with BPF_F_LINK by calling
> bpf_map__set_map_flags().
>
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
> tools/lib/bpf/libbpf.c | 84 +++++++++++++++++++++++++++++++-----------
> 1 file changed, 62 insertions(+), 22 deletions(-)
>
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 35a698eb825d..a67efc3b3763 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = {
> [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate",
> [BPF_PERF_EVENT] = "perf_event",
> [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
> + [BPF_STRUCT_OPS] = "struct_ops",
> };
>
> static const char * const link_type_name[] = {
> @@ -7677,6 +7678,26 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
> return 0;
> }
>
> +static void bpf_map_prepare_vdata(const struct bpf_map *map)
> +{
> + struct bpf_struct_ops *st_ops;
> + __u32 i;
> +
> + st_ops = map->st_ops;
> + for (i = 0; i < btf_vlen(st_ops->type); i++) {
> + struct bpf_program *prog = st_ops->progs[i];
> + void *kern_data;
> + int prog_fd;
> +
> + if (!prog)
> + continue;
> +
> + prog_fd = bpf_program__fd(prog);
> + kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
> + *(unsigned long *)kern_data = prog_fd;
> + }
> +}
> +
> static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
> {
> int err, i;
> @@ -7728,6 +7749,10 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
> btf__free(obj->btf_vmlinux);
> obj->btf_vmlinux = NULL;
>
> + for (i = 0; i < obj->nr_maps; i++)
> + if (bpf_map__is_struct_ops(&obj->maps[i]))
> + bpf_map_prepare_vdata(&obj->maps[i]);
This is similar in spirit to what bpf_object_init_prog_arrays() is
doing, let's add this as a separate step.
How about bpf_object_prepare_struct_ops()?
> +
> obj->loaded = true; /* doesn't matter if successfully or not */
>
> if (err)
> @@ -11429,22 +11454,34 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
> return link;
> }
>
> +struct bpf_link_struct_ops {
> + struct bpf_link link;
> + int map_fd;
> +};
> +
> static int bpf_link__detach_struct_ops(struct bpf_link *link)
> {
> + struct bpf_link_struct_ops *st_link;
> __u32 zero = 0;
>
> - if (bpf_map_delete_elem(link->fd, &zero))
> - return -errno;
> + st_link = container_of(link, struct bpf_link_struct_ops, link);
>
> - return 0;
> + if (st_link->map_fd < 0) {
> + /* Fake bpf_link */
> + if (bpf_map_delete_elem(link->fd, &zero))
> + return -errno;
> + return 0;
just `return bpf_map_delete_elem(...)`, it will return actual error
(libbpf 1.0 simplification)
> + }
> +
> + /* Doesn't support detaching. */
> + return -EOPNOTSUPP;
> }
>
> struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
> {
> - struct bpf_struct_ops *st_ops;
> - struct bpf_link *link;
> - __u32 i, zero = 0;
> - int err;
> + struct bpf_link_struct_ops *link;
> + __u32 zero = 0;
> + int err, fd;
>
> if (!bpf_map__is_struct_ops(map) || map->fd == -1)
> return libbpf_err_ptr(-EINVAL);
> @@ -11453,31 +11490,34 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
> if (!link)
> return libbpf_err_ptr(-EINVAL);
>
> - st_ops = map->st_ops;
> - for (i = 0; i < btf_vlen(st_ops->type); i++) {
> - struct bpf_program *prog = st_ops->progs[i];
> - void *kern_data;
> - int prog_fd;
> + /* kern_vdata should be prepared during the loading phase. */
> + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
> + if (err) {
> + err = -errno;
no need to deal with -errno, err is already the error you need
> + free(link);
> + return libbpf_err_ptr(err);
> + }
>
> - if (!prog)
> - continue;
>
> - prog_fd = bpf_program__fd(prog);
> - kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
> - *(unsigned long *)kern_data = prog_fd;
> + if (!(map->def.map_flags & BPF_F_LINK)) {
> + /* Fake bpf_link */
> + link->link.fd = map->fd;
> + link->map_fd = -1;
> + link->link.detach = bpf_link__detach_struct_ops;
> + return &link->link;
> }
>
> - err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0);
> - if (err) {
> + fd = bpf_link_create(map->fd, -1, BPF_STRUCT_OPS, NULL);
> + if (fd < 0) {
> err = -errno;
same, fd is an error, it's true for all low-level libbpf APIs
> free(link);
> return libbpf_err_ptr(err);
> }
>
> - link->detach = bpf_link__detach_struct_ops;
> - link->fd = map->fd;
> + link->link.fd = fd;
> + link->map_fd = map->fd;
>
> - return link;
> + return &link->link;
> }
>
> typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 5/9] libbpf: Create a bpf_link in bpf_map__attach_struct_ops().
2023-03-08 0:46 ` Andrii Nakryiko
@ 2023-03-08 3:33 ` Kui-Feng Lee
0 siblings, 0 replies; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-08 3:33 UTC (permalink / raw)
To: Andrii Nakryiko, Kui-Feng Lee
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On 3/7/23 16:46, Andrii Nakryiko wrote:
> On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>>
>> bpf_map__attach_struct_ops() was creating a dummy bpf_link as a
>> placeholder, but now it is constructing an authentic one by calling
>> bpf_link_create() if the map has the BPF_F_LINK flag.
>>
>> You can flag a struct_ops map with BPF_F_LINK by calling
>> bpf_map__set_map_flags().
>>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> tools/lib/bpf/libbpf.c | 84 +++++++++++++++++++++++++++++++-----------
>> 1 file changed, 62 insertions(+), 22 deletions(-)
>>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index 35a698eb825d..a67efc3b3763 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = {
>> [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate",
>> [BPF_PERF_EVENT] = "perf_event",
>> [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi",
>> + [BPF_STRUCT_OPS] = "struct_ops",
>> };
>>
>> static const char * const link_type_name[] = {
>> @@ -7677,6 +7678,26 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
>> return 0;
>> }
>>
>> +static void bpf_map_prepare_vdata(const struct bpf_map *map)
>> +{
>> + struct bpf_struct_ops *st_ops;
>> + __u32 i;
>> +
>> + st_ops = map->st_ops;
>> + for (i = 0; i < btf_vlen(st_ops->type); i++) {
>> + struct bpf_program *prog = st_ops->progs[i];
>> + void *kern_data;
>> + int prog_fd;
>> +
>> + if (!prog)
>> + continue;
>> +
>> + prog_fd = bpf_program__fd(prog);
>> + kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
>> + *(unsigned long *)kern_data = prog_fd;
>> + }
>> +}
>> +
>> static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
>> {
>> int err, i;
>> @@ -7728,6 +7749,10 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
>> btf__free(obj->btf_vmlinux);
>> obj->btf_vmlinux = NULL;
>>
>> + for (i = 0; i < obj->nr_maps; i++)
>> + if (bpf_map__is_struct_ops(&obj->maps[i]))
>> + bpf_map_prepare_vdata(&obj->maps[i]);
>
> This is similar in spirit to what bpf_object_init_prog_arrays() is
> doing, let's add this as a separate step.
>
> How about bpf_object_prepare_struct_ops()?
Good!
>
>> +
>> obj->loaded = true; /* doesn't matter if successfully or not */
>>
>> if (err)
>> @@ -11429,22 +11454,34 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
>> return link;
>> }
>>
>> +struct bpf_link_struct_ops {
>> + struct bpf_link link;
>> + int map_fd;
>> +};
>> +
>> static int bpf_link__detach_struct_ops(struct bpf_link *link)
>> {
>> + struct bpf_link_struct_ops *st_link;
>> __u32 zero = 0;
>>
>> - if (bpf_map_delete_elem(link->fd, &zero))
>> - return -errno;
>> + st_link = container_of(link, struct bpf_link_struct_ops, link);
>>
>> - return 0;
>> + if (st_link->map_fd < 0) {
>> + /* Fake bpf_link */
>> + if (bpf_map_delete_elem(link->fd, &zero))
>> + return -errno;
>> + return 0;
>
> just `return bpf_map_delete_elem(...)`, it will return actual error
> (libbpf 1.0 simplification)
Got it!
>
>> + }
>> +
>> + /* Doesn't support detaching. */
>> + return -EOPNOTSUPP;
>> }
>>
>> struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
>> {
>> - struct bpf_struct_ops *st_ops;
>> - struct bpf_link *link;
>> - __u32 i, zero = 0;
>> - int err;
>> + struct bpf_link_struct_ops *link;
>> + __u32 zero = 0;
>> + int err, fd;
>>
>> if (!bpf_map__is_struct_ops(map) || map->fd == -1)
>> return libbpf_err_ptr(-EINVAL);
>> @@ -11453,31 +11490,34 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
>> if (!link)
>> return libbpf_err_ptr(-EINVAL);
>>
>> - st_ops = map->st_ops;
>> - for (i = 0; i < btf_vlen(st_ops->type); i++) {
>> - struct bpf_program *prog = st_ops->progs[i];
>> - void *kern_data;
>> - int prog_fd;
>> + /* kern_vdata should be prepared during the loading phase. */
>> + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
>> + if (err) {
>> + err = -errno;
>
> no need to deal with -errno, err is already the error you need
ok!
>
>> + free(link);
>> + return libbpf_err_ptr(err);
>> + }
>>
>> - if (!prog)
>> - continue;
>>
>> - prog_fd = bpf_program__fd(prog);
>> - kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
>> - *(unsigned long *)kern_data = prog_fd;
>> + if (!(map->def.map_flags & BPF_F_LINK)) {
>> + /* Fake bpf_link */
>> + link->link.fd = map->fd;
>> + link->map_fd = -1;
>> + link->link.detach = bpf_link__detach_struct_ops;
>> + return &link->link;
>> }
>>
>> - err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0);
>> - if (err) {
>> + fd = bpf_link_create(map->fd, -1, BPF_STRUCT_OPS, NULL);
>> + if (fd < 0) {
>> err = -errno;
>
> same, fd is an error, it's true for all low-level libbpf APIs
Good to know!
>
>> free(link);
>> return libbpf_err_ptr(err);
>> }
>>
>> - link->detach = bpf_link__detach_struct_ops;
>> - link->fd = map->fd;
>> + link->link.fd = fd;
>> + link->map_fd = map->fd;
>>
>> - return link;
>> + return &link->link;
>> }
>>
>> typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
>> --
>> 2.34.1
>>
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH bpf-next v4 6/9] bpf: Update the struct_ops of a bpf_link.
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
` (4 preceding siblings ...)
2023-03-07 23:33 ` [PATCH bpf-next v4 5/9] libbpf: Create a bpf_link in bpf_map__attach_struct_ops() Kui-Feng Lee
@ 2023-03-07 23:33 ` Kui-Feng Lee
2023-03-08 0:49 ` Andrii Nakryiko
2023-03-07 23:33 ` [PATCH bpf-next v4 7/9] libbpf: Update a bpf_link with another struct_ops Kui-Feng Lee
` (2 subsequent siblings)
8 siblings, 1 reply; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:33 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
Improving the BPF_LINK_UPDATE command of bpf() allows you to
conveniently switch between different struct_ops on a single
bpf_link. This enables smoother transitions from one struct_ops
to another.
The struct_ops maps passed along with BPF_LINK_UPDATE should have the
BPF_F_LINK flag.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 8 ++++--
kernel/bpf/bpf_struct_ops.c | 46 ++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 43 ++++++++++++++++++++++++++++---
tools/include/uapi/linux/bpf.h | 7 +++++-
5 files changed, 98 insertions(+), 7 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 047d2c6aba88..2b5f150e370e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1405,6 +1405,7 @@ struct bpf_link_ops {
void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
int (*fill_link_info)(const struct bpf_link *link,
struct bpf_link_info *info);
+ int (*update_map)(struct bpf_link *link, struct bpf_map *new_map);
};
struct bpf_tramp_link {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index eb3e435c5303..999e199ebe06 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1555,8 +1555,12 @@ union bpf_attr {
struct { /* struct used by BPF_LINK_UPDATE command */
__u32 link_fd; /* link fd */
- /* new program fd to update link with */
- __u32 new_prog_fd;
+ union {
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ /* new struct_ops map fd to update link with */
+ __u32 new_map_fd;
+ };
__u32 flags; /* extra flags */
/* expected link's program fd; is specified only if
* BPF_F_REPLACE flag is set in flags */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index c71c8d73c7ad..2b850ce11617 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -759,10 +759,56 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
return 0;
}
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *old_map;
+ int err = 0;
+
+ if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
+ !(new_map->map_flags & BPF_F_LINK))
+ return -EINVAL;
+
+ mutex_lock(&update_mutex);
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+
+ /* The new and old struct_ops must be the same type. */
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ old_map = st_link->map;
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ if (st_map->st_ops != old_st_map->st_ops ||
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ kvalue = &st_map->kvalue;
+
+ err = st_map->st_ops->update(kvalue->data, old_st_map->kvalue.data);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
+}
+
static const struct bpf_link_ops bpf_struct_ops_map_lops = {
.dealloc = bpf_struct_ops_map_link_dealloc,
.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
};
int bpf_struct_ops_link_create(union bpf_attr *attr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25b044fdd82b..94ab1336ff41 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4646,6 +4646,30 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map;
+ int ret = 0;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return -EINVAL;
+
+ if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+ ret = -EINVAL;
+ goto out_put_map;
+ }
+
+ if (link->ops->update_map)
+ ret = link->ops->update_map(link, new_map);
+ else
+ ret = -EINVAL;
+
+out_put_map:
+ bpf_map_put(new_map);
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -4658,14 +4682,25 @@ static int link_update(union bpf_attr *attr)
if (CHECK_ATTR(BPF_LINK_UPDATE))
return -EINVAL;
- flags = attr->link_update.flags;
- if (flags & ~BPF_F_REPLACE)
- return -EINVAL;
-
link = bpf_link_get_from_fd(attr->link_update.link_fd);
if (IS_ERR(link))
return PTR_ERR(link);
+ flags = attr->link_update.flags;
+
+ if (link->ops->update_map) {
+ if (flags) /* always replace the existing one */
+ ret = -EINVAL;
+ else
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
+ if (flags & ~BPF_F_REPLACE) {
+ ret = -EINVAL;
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cd0ff39981e8..259b8ab4f54e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1556,7 +1556,12 @@ union bpf_attr {
struct { /* struct used by BPF_LINK_UPDATE command */
__u32 link_fd; /* link fd */
/* new program fd to update link with */
- __u32 new_prog_fd;
+ union {
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ /* new struct_ops map fd to update link with */
+ __u32 new_map_fd;
+ };
__u32 flags; /* extra flags */
/* expected link's program fd; is specified only if
* BPF_F_REPLACE flag is set in flags */
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 6/9] bpf: Update the struct_ops of a bpf_link.
2023-03-07 23:33 ` [PATCH bpf-next v4 6/9] bpf: Update the struct_ops of a bpf_link Kui-Feng Lee
@ 2023-03-08 0:49 ` Andrii Nakryiko
2023-03-08 16:27 ` Kui-Feng Lee
0 siblings, 1 reply; 25+ messages in thread
From: Andrii Nakryiko @ 2023-03-08 0:49 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>
> By improving the BPF_LINK_UPDATE command of bpf(), it should allow you
> to conveniently switch between different struct_ops on a single
> bpf_link. This would enable smoother transitions from one struct_ops
> to another.
>
> The struct_ops maps passing along with BPF_LINK_UPDATE should have the
> BPF_F_LINK flag.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
> include/linux/bpf.h | 1 +
> include/uapi/linux/bpf.h | 8 ++++--
> kernel/bpf/bpf_struct_ops.c | 46 ++++++++++++++++++++++++++++++++++
> kernel/bpf/syscall.c | 43 ++++++++++++++++++++++++++++---
> tools/include/uapi/linux/bpf.h | 7 +++++-
> 5 files changed, 98 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 047d2c6aba88..2b5f150e370e 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1405,6 +1405,7 @@ struct bpf_link_ops {
> void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
> int (*fill_link_info)(const struct bpf_link *link,
> struct bpf_link_info *info);
> + int (*update_map)(struct bpf_link *link, struct bpf_map *new_map);
> };
>
> struct bpf_tramp_link {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index eb3e435c5303..999e199ebe06 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1555,8 +1555,12 @@ union bpf_attr {
>
> struct { /* struct used by BPF_LINK_UPDATE command */
> __u32 link_fd; /* link fd */
> - /* new program fd to update link with */
> - __u32 new_prog_fd;
> + union {
> + /* new program fd to update link with */
> + __u32 new_prog_fd;
> + /* new struct_ops map fd to update link with */
> + __u32 new_map_fd;
> + };
> __u32 flags; /* extra flags */
> /* expected link's program fd; is specified only if
> * BPF_F_REPLACE flag is set in flags */
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index c71c8d73c7ad..2b850ce11617 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -759,10 +759,56 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
> return 0;
> }
>
> +static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map)
> +{
> + struct bpf_struct_ops_value *kvalue;
> + struct bpf_struct_ops_map *st_map, *old_st_map;
> + struct bpf_struct_ops_link *st_link;
> + struct bpf_map *old_map;
> + int err = 0;
> +
> + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
> + !(new_map->map_flags & BPF_F_LINK))
> + return -EINVAL;
> +
> + mutex_lock(&update_mutex);
> +
> + st_link = container_of(link, struct bpf_struct_ops_link, link);
> +
> + /* The new and old struct_ops must be the same type. */
> + st_map = container_of(new_map, struct bpf_struct_ops_map, map);
> +
> + old_map = st_link->map;
> + old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
> + if (st_map->st_ops != old_st_map->st_ops ||
> + /* Pair with smp_store_release() during map_update */
> + smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
> + err = -EINVAL;
> + goto err_out;
> + }
> +
> + kvalue = &st_map->kvalue;
> +
> + err = st_map->st_ops->update(kvalue->data, old_st_map->kvalue.data);
> + if (err)
> + goto err_out;
> +
> + bpf_map_inc(new_map);
> + rcu_assign_pointer(st_link->map, new_map);
> +
> + bpf_map_put(old_map);
> +
> +err_out:
> + mutex_unlock(&update_mutex);
> +
> + return err;
> +}
> +
> static const struct bpf_link_ops bpf_struct_ops_map_lops = {
> .dealloc = bpf_struct_ops_map_link_dealloc,
> .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
> .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
> + .update_map = bpf_struct_ops_map_link_update,
> };
>
> int bpf_struct_ops_link_create(union bpf_attr *attr)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 25b044fdd82b..94ab1336ff41 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -4646,6 +4646,30 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
> return ret;
> }
>
> +static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
> +{
> + struct bpf_map *new_map;
> + int ret = 0;
> +
> + new_map = bpf_map_get(attr->link_update.new_map_fd);
> + if (IS_ERR(new_map))
> + return -EINVAL;
> +
> + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
> + ret = -EINVAL;
> + goto out_put_map;
> + }
> +
> + if (link->ops->update_map)
> + ret = link->ops->update_map(link, new_map);
> + else
> + ret = -EINVAL;
> +
> +out_put_map:
> + bpf_map_put(new_map);
> + return ret;
> +}
> +
> #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
>
> static int link_update(union bpf_attr *attr)
> @@ -4658,14 +4682,25 @@ static int link_update(union bpf_attr *attr)
> if (CHECK_ATTR(BPF_LINK_UPDATE))
> return -EINVAL;
>
> - flags = attr->link_update.flags;
> - if (flags & ~BPF_F_REPLACE)
> - return -EINVAL;
> -
> link = bpf_link_get_from_fd(attr->link_update.link_fd);
> if (IS_ERR(link))
> return PTR_ERR(link);
>
> + flags = attr->link_update.flags;
> +
> + if (link->ops->update_map) {
> + if (flags) /* always replace the existing one */
> + ret = -EINVAL;
> + else
> + ret = link_update_map(link, attr);
> + goto out_put_link;
umm... BPF_F_REPLACE for link_update is specifying "update only if
current prog fd matches what I specify", let's not ignore it for
struct_ops. This will create a deviation in behavior unnecessarily.
Please keep it consistent.
> + }
> +
> + if (flags & ~BPF_F_REPLACE) {
> + ret = -EINVAL;
> + goto out_put_link;
> + }
> +
> new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
> if (IS_ERR(new_prog)) {
> ret = PTR_ERR(new_prog);
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index cd0ff39981e8..259b8ab4f54e 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1556,7 +1556,12 @@ union bpf_attr {
> struct { /* struct used by BPF_LINK_UPDATE command */
> __u32 link_fd; /* link fd */
> /* new program fd to update link with */
> - __u32 new_prog_fd;
> + union {
> + /* new program fd to update link with */
> + __u32 new_prog_fd;
> + /* new struct_ops map fd to update link with */
> + __u32 new_map_fd;
> + };
> __u32 flags; /* extra flags */
> /* expected link's program fd; is specified only if
> * BPF_F_REPLACE flag is set in flags */
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 6/9] bpf: Update the struct_ops of a bpf_link.
2023-03-08 0:49 ` Andrii Nakryiko
@ 2023-03-08 16:27 ` Kui-Feng Lee
0 siblings, 0 replies; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-08 16:27 UTC (permalink / raw)
To: Andrii Nakryiko, Kui-Feng Lee
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On 3/7/23 16:49, Andrii Nakryiko wrote:
> On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>>
>> By improving the BPF_LINK_UPDATE command of bpf(), it should allow you
>> to conveniently switch between different struct_ops on a single
>> bpf_link. This would enable smoother transitions from one struct_ops
>> to another.
>>
>> The struct_ops maps passing along with BPF_LINK_UPDATE should have the
>> BPF_F_LINK flag.
>>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> include/linux/bpf.h | 1 +
>> include/uapi/linux/bpf.h | 8 ++++--
>> kernel/bpf/bpf_struct_ops.c | 46 ++++++++++++++++++++++++++++++++++
>> kernel/bpf/syscall.c | 43 ++++++++++++++++++++++++++++---
>> tools/include/uapi/linux/bpf.h | 7 +++++-
>> 5 files changed, 98 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 047d2c6aba88..2b5f150e370e 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1405,6 +1405,7 @@ struct bpf_link_ops {
>> void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
>> int (*fill_link_info)(const struct bpf_link *link,
>> struct bpf_link_info *info);
>> + int (*update_map)(struct bpf_link *link, struct bpf_map *new_map);
>> };
>>
>> struct bpf_tramp_link {
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index eb3e435c5303..999e199ebe06 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1555,8 +1555,12 @@ union bpf_attr {
>>
>> struct { /* struct used by BPF_LINK_UPDATE command */
>> __u32 link_fd; /* link fd */
>> - /* new program fd to update link with */
>> - __u32 new_prog_fd;
>> + union {
>> + /* new program fd to update link with */
>> + __u32 new_prog_fd;
>> + /* new struct_ops map fd to update link with */
>> + __u32 new_map_fd;
>> + };
>> __u32 flags; /* extra flags */
>> /* expected link's program fd; is specified only if
>> * BPF_F_REPLACE flag is set in flags */
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index c71c8d73c7ad..2b850ce11617 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -759,10 +759,56 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
>> return 0;
>> }
>>
>> +static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map)
>> +{
>> + struct bpf_struct_ops_value *kvalue;
>> + struct bpf_struct_ops_map *st_map, *old_st_map;
>> + struct bpf_struct_ops_link *st_link;
>> + struct bpf_map *old_map;
>> + int err = 0;
>> +
>> + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
>> + !(new_map->map_flags & BPF_F_LINK))
>> + return -EINVAL;
>> +
>> + mutex_lock(&update_mutex);
>> +
>> + st_link = container_of(link, struct bpf_struct_ops_link, link);
>> +
>> + /* The new and old struct_ops must be the same type. */
>> + st_map = container_of(new_map, struct bpf_struct_ops_map, map);
>> +
>> + old_map = st_link->map;
>> + old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
>> + if (st_map->st_ops != old_st_map->st_ops ||
>> + /* Pair with smp_store_release() during map_update */
>> + smp_load_acquire(&st_map->kvalue.state) != BPF_STRUCT_OPS_STATE_READY) {
>> + err = -EINVAL;
>> + goto err_out;
>> + }
>> +
>> + kvalue = &st_map->kvalue;
>> +
>> + err = st_map->st_ops->update(kvalue->data, old_st_map->kvalue.data);
>> + if (err)
>> + goto err_out;
>> +
>> + bpf_map_inc(new_map);
>> + rcu_assign_pointer(st_link->map, new_map);
>> +
>> + bpf_map_put(old_map);
>> +
>> +err_out:
>> + mutex_unlock(&update_mutex);
>> +
>> + return err;
>> +}
>> +
>> static const struct bpf_link_ops bpf_struct_ops_map_lops = {
>> .dealloc = bpf_struct_ops_map_link_dealloc,
>> .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
>> .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
>> + .update_map = bpf_struct_ops_map_link_update,
>> };
>>
>> int bpf_struct_ops_link_create(union bpf_attr *attr)
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 25b044fdd82b..94ab1336ff41 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -4646,6 +4646,30 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
>> return ret;
>> }
>>
>> +static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
>> +{
>> + struct bpf_map *new_map;
>> + int ret = 0;
>> +
>> + new_map = bpf_map_get(attr->link_update.new_map_fd);
>> + if (IS_ERR(new_map))
>> + return -EINVAL;
>> +
>> + if (new_map->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
>> + ret = -EINVAL;
>> + goto out_put_map;
>> + }
>> +
>> + if (link->ops->update_map)
>> + ret = link->ops->update_map(link, new_map);
>> + else
>> + ret = -EINVAL;
>> +
>> +out_put_map:
>> + bpf_map_put(new_map);
>> + return ret;
>> +}
>> +
>> #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
>>
>> static int link_update(union bpf_attr *attr)
>> @@ -4658,14 +4682,25 @@ static int link_update(union bpf_attr *attr)
>> if (CHECK_ATTR(BPF_LINK_UPDATE))
>> return -EINVAL;
>>
>> - flags = attr->link_update.flags;
>> - if (flags & ~BPF_F_REPLACE)
>> - return -EINVAL;
>> -
>> link = bpf_link_get_from_fd(attr->link_update.link_fd);
>> if (IS_ERR(link))
>> return PTR_ERR(link);
>>
>> + flags = attr->link_update.flags;
>> +
>> + if (link->ops->update_map) {
>> + if (flags) /* always replace the existing one */
>> + ret = -EINVAL;
>> + else
>> + ret = link_update_map(link, attr);
>> + goto out_put_link;
>
> umm... BPF_F_REPLACE for link_update is specifying "update only if
> current prog fd matches what I specify", let's not ignore it for
> struct_ops. This will create a deviation in behavior unnecessarily.
> Please keep it consistent.
Ok!
>
>
>> + }
>> +
>> + if (flags & ~BPF_F_REPLACE) {
>> + ret = -EINVAL;
>> + goto out_put_link;
>> + }
>> +
>> new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
>> if (IS_ERR(new_prog)) {
>> ret = PTR_ERR(new_prog);
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index cd0ff39981e8..259b8ab4f54e 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -1556,7 +1556,12 @@ union bpf_attr {
>> struct { /* struct used by BPF_LINK_UPDATE command */
>> __u32 link_fd; /* link fd */
>> /* new program fd to update link with */
>> - __u32 new_prog_fd;
>> + union {
>> + /* new program fd to update link with */
>> + __u32 new_prog_fd;
>> + /* new struct_ops map fd to update link with */
>> + __u32 new_map_fd;
>> + };
>> __u32 flags; /* extra flags */
>> /* expected link's program fd; is specified only if
>> * BPF_F_REPLACE flag is set in flags */
>> --
>> 2.34.1
>>
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH bpf-next v4 7/9] libbpf: Update a bpf_link with another struct_ops.
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
` (5 preceding siblings ...)
2023-03-07 23:33 ` [PATCH bpf-next v4 6/9] bpf: Update the struct_ops of a bpf_link Kui-Feng Lee
@ 2023-03-07 23:33 ` Kui-Feng Lee
2023-03-08 0:53 ` Andrii Nakryiko
2023-03-07 23:33 ` [PATCH bpf-next v4 8/9] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link Kui-Feng Lee
2023-03-07 23:33 ` [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms Kui-Feng Lee
8 siblings, 1 reply; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:33 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
Introduce bpf_link__update_struct_ops(), which will allow you to
effortlessly transition the struct_ops map of any given bpf_link into
an alternative.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
tools/lib/bpf/libbpf.c | 36 ++++++++++++++++++++++++++++++++++++
tools/lib/bpf/libbpf.h | 1 +
tools/lib/bpf/libbpf.map | 2 ++
3 files changed, 39 insertions(+)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a67efc3b3763..247de39d136f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -11520,6 +11520,42 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
return &link->link;
}
+/*
+ * Swap the back struct_ops of a link with a new struct_ops map.
+ */
+int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map)
+{
+ struct bpf_link_struct_ops *st_ops_link;
+ __u32 zero = 0;
+ int err, fd;
+
+ if (!bpf_map__is_struct_ops(map) || map->fd == -1)
+ return -EINVAL;
+
+ st_ops_link = container_of(link, struct bpf_link_struct_ops, link);
+ /* Ensure the type of a link is correct */
+ if (st_ops_link->map_fd < 0)
+ return -EINVAL;
+
+ err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+ if (err && errno != EBUSY) {
+ err = -errno;
+ free(link);
+ return err;
+ }
+
+ fd = bpf_link_update(link->fd, map->fd, NULL);
+ if (fd < 0) {
+ err = -errno;
+ free(link);
+ return err;
+ }
+
+ st_ops_link->map_fd = map->fd;
+
+ return 0;
+}
+
typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
void *private_data);
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 2efd80f6f7b9..5e62878d184c 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -695,6 +695,7 @@ bpf_program__attach_freplace(const struct bpf_program *prog,
struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
+LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
struct bpf_iter_attach_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 11c36a3c1a9f..e83571b04c19 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -384,4 +384,6 @@ LIBBPF_1.1.0 {
} LIBBPF_1.0.0;
LIBBPF_1.2.0 {
+ global:
+ bpf_link__update_map;
} LIBBPF_1.1.0;
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 7/9] libbpf: Update a bpf_link with another struct_ops.
2023-03-07 23:33 ` [PATCH bpf-next v4 7/9] libbpf: Update a bpf_link with another struct_ops Kui-Feng Lee
@ 2023-03-08 0:53 ` Andrii Nakryiko
2023-03-08 1:45 ` Kui-Feng Lee
0 siblings, 1 reply; 25+ messages in thread
From: Andrii Nakryiko @ 2023-03-08 0:53 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>
> Introduce bpf_link__update_struct_ops(), which will allow you to
update_map, not update_struct_ops, please update
> effortlessly transition the struct_ops map of any given bpf_link into
> an alternative.
This reads confusingly, tbh. Why not say "bpf_link__update_map()
allows to atomically update underlying struct_ops implementation for
given struct_ops BPF link" or something like this? Would it be
accurate?
>
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
> tools/lib/bpf/libbpf.c | 36 ++++++++++++++++++++++++++++++++++++
> tools/lib/bpf/libbpf.h | 1 +
> tools/lib/bpf/libbpf.map | 2 ++
> 3 files changed, 39 insertions(+)
>
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index a67efc3b3763..247de39d136f 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -11520,6 +11520,42 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
> return &link->link;
> }
>
> +/*
> + * Swap the back struct_ops of a link with a new struct_ops map.
> + */
> +int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map)
> +{
> + struct bpf_link_struct_ops *st_ops_link;
> + __u32 zero = 0;
> + int err, fd;
> +
> + if (!bpf_map__is_struct_ops(map) || map->fd == -1)
let's not hard-code equality like this, < 0 is better
> + return -EINVAL;
> +
> + st_ops_link = container_of(link, struct bpf_link_struct_ops, link);
> + /* Ensure the type of a link is correct */
> + if (st_ops_link->map_fd < 0)
> + return -EINVAL;
> +
> + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
> + if (err && errno != EBUSY) {
don't use errno, err is perfectly fine to rely on
> + err = -errno;
> + free(link);
why freeing the link?...
> + return err;
> + }
> +
> + fd = bpf_link_update(link->fd, map->fd, NULL);
> + if (fd < 0) {
> + err = -errno;
> + free(link);
same... please write tests that exercise both successful and
unsuccessful scenarios
> + return err;
> + }
> +
> + st_ops_link->map_fd = map->fd;
> +
> + return 0;
> +}
> +
> typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
> void *private_data);
>
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index 2efd80f6f7b9..5e62878d184c 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -695,6 +695,7 @@ bpf_program__attach_freplace(const struct bpf_program *prog,
> struct bpf_map;
>
> LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
> +LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
>
> struct bpf_iter_attach_opts {
> size_t sz; /* size of this struct for forward/backward compatibility */
> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> index 11c36a3c1a9f..e83571b04c19 100644
> --- a/tools/lib/bpf/libbpf.map
> +++ b/tools/lib/bpf/libbpf.map
> @@ -384,4 +384,6 @@ LIBBPF_1.1.0 {
> } LIBBPF_1.0.0;
>
> LIBBPF_1.2.0 {
> + global:
> + bpf_link__update_map;
please always rebase before posting new versions of patch set,
LIBBPF_1.2.0 is not empty anymore
> } LIBBPF_1.1.0;
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 7/9] libbpf: Update a bpf_link with another struct_ops.
2023-03-08 0:53 ` Andrii Nakryiko
@ 2023-03-08 1:45 ` Kui-Feng Lee
0 siblings, 0 replies; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-08 1:45 UTC (permalink / raw)
To: Andrii Nakryiko, Kui-Feng Lee
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On 3/7/23 16:53, Andrii Nakryiko wrote:
> On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>>
>> Introduce bpf_link__update_struct_ops(), which will allow you to
>
> update_map, not update_struct_ops, please update
>
>> effortlessly transition the struct_ops map of any given bpf_link into
>> an alternative.
>
> This reads confusingly, tbh. Why not say "bpf_link__update_map()
> allows to atomically update underlying struct_ops implementation for
> given struct_ops BPF link" or something like this? Would it be
> accurate?
>
Right, that wording would be better.
>>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> tools/lib/bpf/libbpf.c | 36 ++++++++++++++++++++++++++++++++++++
>> tools/lib/bpf/libbpf.h | 1 +
>> tools/lib/bpf/libbpf.map | 2 ++
>> 3 files changed, 39 insertions(+)
>>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index a67efc3b3763..247de39d136f 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -11520,6 +11520,42 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
>> return &link->link;
>> }
>>
>> +/*
>> + * Swap the back struct_ops of a link with a new struct_ops map.
>> + */
>> +int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map)
>> +{
>> + struct bpf_link_struct_ops *st_ops_link;
>> + __u32 zero = 0;
>> + int err, fd;
>> +
>> + if (!bpf_map__is_struct_ops(map) || map->fd == -1)
>
> let's not hard-code equality like this, < 0 is better
Ok!
>
>> + return -EINVAL;
>> +
>> + st_ops_link = container_of(link, struct bpf_link_struct_ops, link);
>> + /* Ensure the type of a link is correct */
>> + if (st_ops_link->map_fd < 0)
>> + return -EINVAL;
>> +
>> + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
>> + if (err && errno != EBUSY) {
>
> don't use errno, err is perfectly fine to rely on
Ok!
>
>> + err = -errno;
>> + free(link);
>
> why freeing the link?...
Urg! It is a mistake.
>
>
>> + return err;
>> + }
>> +
>> + fd = bpf_link_update(link->fd, map->fd, NULL);
>> + if (fd < 0) {
>> + err = -errno;
>> + free(link);
>
> same... please write tests that exercise both successful and
> unsuccessful scenarios
Got it!
>
>> + return err;
>> + }
>> +
>> + st_ops_link->map_fd = map->fd;
>> +
>> + return 0;
>> +}
>> +
>> typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr,
>> void *private_data);
>>
>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>> index 2efd80f6f7b9..5e62878d184c 100644
>> --- a/tools/lib/bpf/libbpf.h
>> +++ b/tools/lib/bpf/libbpf.h
>> @@ -695,6 +695,7 @@ bpf_program__attach_freplace(const struct bpf_program *prog,
>> struct bpf_map;
>>
>> LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
>> +LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
>>
>> struct bpf_iter_attach_opts {
>> size_t sz; /* size of this struct for forward/backward compatibility */
>> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
>> index 11c36a3c1a9f..e83571b04c19 100644
>> --- a/tools/lib/bpf/libbpf.map
>> +++ b/tools/lib/bpf/libbpf.map
>> @@ -384,4 +384,6 @@ LIBBPF_1.1.0 {
>> } LIBBPF_1.0.0;
>>
>> LIBBPF_1.2.0 {
>> + global:
>> + bpf_link__update_map;
>
> please always rebase before posting new versions of patch set,
> LIBBPF_1.2.0 is not empty anymore
>
>> } LIBBPF_1.1.0;
>> --
>> 2.34.1
>>
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH bpf-next v4 8/9] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link.
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
` (6 preceding siblings ...)
2023-03-07 23:33 ` [PATCH bpf-next v4 7/9] libbpf: Update a bpf_link with another struct_ops Kui-Feng Lee
@ 2023-03-07 23:33 ` Kui-Feng Lee
2023-03-08 1:07 ` Andrii Nakryiko
2023-03-07 23:33 ` [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms Kui-Feng Lee
8 siblings, 1 reply; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:33 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
Flag a struct_ops as backing a bpf_link by placing it in the
".struct_ops.link" section. Once flagged, the created
struct_ops can be used to create a bpf_link, or to update a bpf_link
that is currently backed by another struct_ops.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
tools/lib/bpf/libbpf.c | 64 +++++++++++++++++++++++++++++++++---------
1 file changed, 50 insertions(+), 14 deletions(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 247de39d136f..d66acd2fdbaa 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -467,6 +467,7 @@ struct bpf_struct_ops {
#define KCONFIG_SEC ".kconfig"
#define KSYMS_SEC ".ksyms"
#define STRUCT_OPS_SEC ".struct_ops"
+#define STRUCT_OPS_LINK_SEC ".struct_ops.link"
enum libbpf_map_type {
LIBBPF_MAP_UNSPEC,
@@ -596,6 +597,7 @@ struct elf_state {
Elf64_Ehdr *ehdr;
Elf_Data *symbols;
Elf_Data *st_ops_data;
+ Elf_Data *st_ops_link_data;
size_t shstrndx; /* section index for section name strings */
size_t strtabidx;
struct elf_sec_desc *secs;
@@ -605,6 +607,7 @@ struct elf_state {
int text_shndx;
int symbols_shndx;
int st_ops_shndx;
+ int st_ops_link_shndx;
};
struct usdt_manager;
@@ -1119,7 +1122,7 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj)
return 0;
}
-static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
+static int bpf_object__init_struct_ops_maps_link(struct bpf_object *obj, bool link)
{
const struct btf_type *type, *datasec;
const struct btf_var_secinfo *vsi;
@@ -1127,18 +1130,33 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
const char *tname, *var_name;
__s32 type_id, datasec_id;
const struct btf *btf;
+ const char *sec_name;
struct bpf_map *map;
- __u32 i;
+ __u32 i, map_flags;
+ Elf_Data *data;
+ int shndx;
- if (obj->efile.st_ops_shndx == -1)
+ if (link) {
+ sec_name = STRUCT_OPS_LINK_SEC;
+ shndx = obj->efile.st_ops_link_shndx;
+ data = obj->efile.st_ops_link_data;
+ map_flags = BPF_F_LINK;
+ } else {
+ sec_name = STRUCT_OPS_SEC;
+ shndx = obj->efile.st_ops_shndx;
+ data = obj->efile.st_ops_data;
+ map_flags = 0;
+ }
+
+ if (shndx == -1)
return 0;
btf = obj->btf;
- datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC,
+ datasec_id = btf__find_by_name_kind(btf, sec_name,
BTF_KIND_DATASEC);
if (datasec_id < 0) {
pr_warn("struct_ops init: DATASEC %s not found\n",
- STRUCT_OPS_SEC);
+ sec_name);
return -EINVAL;
}
@@ -1151,7 +1169,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
type_id = btf__resolve_type(obj->btf, vsi->type);
if (type_id < 0) {
pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n",
- vsi->type, STRUCT_OPS_SEC);
+ vsi->type, sec_name);
return -EINVAL;
}
@@ -1170,7 +1188,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
if (IS_ERR(map))
return PTR_ERR(map);
- map->sec_idx = obj->efile.st_ops_shndx;
+ map->sec_idx = shndx;
map->sec_offset = vsi->offset;
map->name = strdup(var_name);
if (!map->name)
@@ -1180,6 +1198,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
map->def.key_size = sizeof(int);
map->def.value_size = type->size;
map->def.max_entries = 1;
+ map->def.map_flags = map_flags;
map->st_ops = calloc(1, sizeof(*map->st_ops));
if (!map->st_ops)
@@ -1192,14 +1211,14 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off)
return -ENOMEM;
- if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) {
+ if (vsi->offset + type->size > data->d_size) {
pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n",
- var_name, STRUCT_OPS_SEC);
+ var_name, sec_name);
return -EINVAL;
}
memcpy(st_ops->data,
- obj->efile.st_ops_data->d_buf + vsi->offset,
+ data->d_buf + vsi->offset,
type->size);
st_ops->tname = tname;
st_ops->type = type;
@@ -1212,6 +1231,15 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
return 0;
}
+static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
+{
+ int err;
+
+ err = bpf_object__init_struct_ops_maps_link(obj, false);
+ err = err ?: bpf_object__init_struct_ops_maps_link(obj, true);
+ return err;
+}
+
static struct bpf_object *bpf_object__new(const char *path,
const void *obj_buf,
size_t obj_buf_sz,
@@ -1248,6 +1276,7 @@ static struct bpf_object *bpf_object__new(const char *path,
obj->efile.obj_buf_sz = obj_buf_sz;
obj->efile.btf_maps_shndx = -1;
obj->efile.st_ops_shndx = -1;
+ obj->efile.st_ops_link_shndx = -1;
obj->kconfig_map_idx = -1;
obj->kern_version = get_kernel_version();
@@ -1265,6 +1294,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj)
obj->efile.elf = NULL;
obj->efile.symbols = NULL;
obj->efile.st_ops_data = NULL;
+ obj->efile.st_ops_link_data = NULL;
zfree(&obj->efile.secs);
obj->efile.sec_cnt = 0;
@@ -2753,12 +2783,13 @@ static bool libbpf_needs_btf(const struct bpf_object *obj)
{
return obj->efile.btf_maps_shndx >= 0 ||
obj->efile.st_ops_shndx >= 0 ||
+ obj->efile.st_ops_link_shndx >= 0 ||
obj->nr_extern > 0;
}
static bool kernel_needs_btf(const struct bpf_object *obj)
{
- return obj->efile.st_ops_shndx >= 0;
+ return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0;
}
static int bpf_object__init_btf(struct bpf_object *obj,
@@ -3451,6 +3482,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
} else if (strcmp(name, STRUCT_OPS_SEC) == 0) {
obj->efile.st_ops_data = data;
obj->efile.st_ops_shndx = idx;
+ } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) {
+ obj->efile.st_ops_link_data = data;
+ obj->efile.st_ops_link_shndx = idx;
} else {
pr_info("elf: skipping unrecognized data section(%d) %s\n",
idx, name);
@@ -3465,6 +3499,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
/* Only do relo for section with exec instructions */
if (!section_have_execinstr(obj, targ_sec_idx) &&
strcmp(name, ".rel" STRUCT_OPS_SEC) &&
+ strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) &&
strcmp(name, ".rel" MAPS_ELF_SEC)) {
pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n",
idx, name, targ_sec_idx,
@@ -6611,7 +6646,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
return -LIBBPF_ERRNO__INTERNAL;
}
- if (idx == obj->efile.st_ops_shndx)
+ if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx)
err = bpf_object__collect_st_ops_relos(obj, shdr, data);
else if (idx == obj->efile.btf_maps_shndx)
err = bpf_object__collect_map_relos(obj, shdr, data);
@@ -8954,8 +8989,9 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
}
/* struct_ops BPF prog can be re-used between multiple
- * .struct_ops as long as it's the same struct_ops struct
- * definition and the same function pointer field
+ * .struct_ops & .struct_ops.link as long as it's the
+ * same struct_ops struct definition and the same
+ * function pointer field
*/
if (prog->attach_btf_id != st_ops->type_id ||
prog->expected_attach_type != member_idx) {
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 8/9] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link.
2023-03-07 23:33 ` [PATCH bpf-next v4 8/9] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link Kui-Feng Lee
@ 2023-03-08 1:07 ` Andrii Nakryiko
2023-03-08 4:23 ` Kui-Feng Lee
0 siblings, 1 reply; 25+ messages in thread
From: Andrii Nakryiko @ 2023-03-08 1:07 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>
> Flags a struct_ops is to back a bpf_link by putting it to the
> ".struct_ops.link" section. Once it is flagged, the created
> struct_ops can be used to create a bpf_link or update a bpf_link that
> has been backed by another struct_ops.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
> tools/lib/bpf/libbpf.c | 64 +++++++++++++++++++++++++++++++++---------
> 1 file changed, 50 insertions(+), 14 deletions(-)
>
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 247de39d136f..d66acd2fdbaa 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -467,6 +467,7 @@ struct bpf_struct_ops {
> #define KCONFIG_SEC ".kconfig"
> #define KSYMS_SEC ".ksyms"
> #define STRUCT_OPS_SEC ".struct_ops"
> +#define STRUCT_OPS_LINK_SEC ".struct_ops.link"
>
> enum libbpf_map_type {
> LIBBPF_MAP_UNSPEC,
> @@ -596,6 +597,7 @@ struct elf_state {
> Elf64_Ehdr *ehdr;
> Elf_Data *symbols;
> Elf_Data *st_ops_data;
> + Elf_Data *st_ops_link_data;
> size_t shstrndx; /* section index for section name strings */
> size_t strtabidx;
> struct elf_sec_desc *secs;
> @@ -605,6 +607,7 @@ struct elf_state {
> int text_shndx;
> int symbols_shndx;
> int st_ops_shndx;
> + int st_ops_link_shndx;
> };
>
> struct usdt_manager;
> @@ -1119,7 +1122,7 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj)
> return 0;
> }
>
> -static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
> +static int bpf_object__init_struct_ops_maps_link(struct bpf_object *obj, bool link)
let's shorten it and not use double underscores, as this is not a
public bpf_object API, just "init_struct_ops_maps" probably is fine
> {
> const struct btf_type *type, *datasec;
> const struct btf_var_secinfo *vsi;
> @@ -1127,18 +1130,33 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
> const char *tname, *var_name;
> __s32 type_id, datasec_id;
> const struct btf *btf;
> + const char *sec_name;
> struct bpf_map *map;
> - __u32 i;
> + __u32 i, map_flags;
> + Elf_Data *data;
> + int shndx;
>
> - if (obj->efile.st_ops_shndx == -1)
> + if (link) {
> + sec_name = STRUCT_OPS_LINK_SEC;
> + shndx = obj->efile.st_ops_link_shndx;
> + data = obj->efile.st_ops_link_data;
> + map_flags = BPF_F_LINK;
> + } else {
> + sec_name = STRUCT_OPS_SEC;
> + shndx = obj->efile.st_ops_shndx;
> + data = obj->efile.st_ops_data;
> + map_flags = 0;
> + }
let's pass these as function arguments instead
> +
> + if (shndx == -1)
> return 0;
>
> btf = obj->btf;
> - datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC,
> + datasec_id = btf__find_by_name_kind(btf, sec_name,
> BTF_KIND_DATASEC);
> if (datasec_id < 0) {
> pr_warn("struct_ops init: DATASEC %s not found\n",
> - STRUCT_OPS_SEC);
> + sec_name);
> return -EINVAL;
> }
>
> @@ -1151,7 +1169,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
> type_id = btf__resolve_type(obj->btf, vsi->type);
> if (type_id < 0) {
> pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n",
> - vsi->type, STRUCT_OPS_SEC);
> + vsi->type, sec_name);
> return -EINVAL;
> }
>
> @@ -1170,7 +1188,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
> if (IS_ERR(map))
> return PTR_ERR(map);
>
> - map->sec_idx = obj->efile.st_ops_shndx;
> + map->sec_idx = shndx;
> map->sec_offset = vsi->offset;
> map->name = strdup(var_name);
> if (!map->name)
> @@ -1180,6 +1198,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
> map->def.key_size = sizeof(int);
> map->def.value_size = type->size;
> map->def.max_entries = 1;
> + map->def.map_flags = map_flags;
>
> map->st_ops = calloc(1, sizeof(*map->st_ops));
> if (!map->st_ops)
> @@ -1192,14 +1211,14 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
> if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off)
> return -ENOMEM;
>
> - if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) {
> + if (vsi->offset + type->size > data->d_size) {
> pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n",
> - var_name, STRUCT_OPS_SEC);
> + var_name, sec_name);
> return -EINVAL;
> }
>
> memcpy(st_ops->data,
> - obj->efile.st_ops_data->d_buf + vsi->offset,
> + data->d_buf + vsi->offset,
> type->size);
> st_ops->tname = tname;
> st_ops->type = type;
> @@ -1212,6 +1231,15 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
> return 0;
> }
>
> +static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
let's name this bpf_object_init_struct_ops, no double underscores
> +{
> + int err;
> +
> + err = bpf_object__init_struct_ops_maps_link(obj, false);
> + err = err ?: bpf_object__init_struct_ops_maps_link(obj, true);
> + return err;
> +}
> +
> static struct bpf_object *bpf_object__new(const char *path,
> const void *obj_buf,
> size_t obj_buf_sz,
> @@ -1248,6 +1276,7 @@ static struct bpf_object *bpf_object__new(const char *path,
> obj->efile.obj_buf_sz = obj_buf_sz;
> obj->efile.btf_maps_shndx = -1;
> obj->efile.st_ops_shndx = -1;
> + obj->efile.st_ops_link_shndx = -1;
> obj->kconfig_map_idx = -1;
>
> obj->kern_version = get_kernel_version();
> @@ -1265,6 +1294,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj)
> obj->efile.elf = NULL;
> obj->efile.symbols = NULL;
> obj->efile.st_ops_data = NULL;
> + obj->efile.st_ops_link_data = NULL;
>
> zfree(&obj->efile.secs);
> obj->efile.sec_cnt = 0;
> @@ -2753,12 +2783,13 @@ static bool libbpf_needs_btf(const struct bpf_object *obj)
> {
> return obj->efile.btf_maps_shndx >= 0 ||
> obj->efile.st_ops_shndx >= 0 ||
> + obj->efile.st_ops_link_shndx >= 0 ||
> obj->nr_extern > 0;
> }
>
> static bool kernel_needs_btf(const struct bpf_object *obj)
> {
> - return obj->efile.st_ops_shndx >= 0;
> + return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0;
> }
>
> static int bpf_object__init_btf(struct bpf_object *obj,
> @@ -3451,6 +3482,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
> } else if (strcmp(name, STRUCT_OPS_SEC) == 0) {
> obj->efile.st_ops_data = data;
> obj->efile.st_ops_shndx = idx;
> + } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) {
> + obj->efile.st_ops_link_data = data;
> + obj->efile.st_ops_link_shndx = idx;
> } else {
> pr_info("elf: skipping unrecognized data section(%d) %s\n",
> idx, name);
> @@ -3465,6 +3499,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
> /* Only do relo for section with exec instructions */
> if (!section_have_execinstr(obj, targ_sec_idx) &&
> strcmp(name, ".rel" STRUCT_OPS_SEC) &&
> + strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) &&
> strcmp(name, ".rel" MAPS_ELF_SEC)) {
> pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n",
> idx, name, targ_sec_idx,
> @@ -6611,7 +6646,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
> return -LIBBPF_ERRNO__INTERNAL;
> }
>
> - if (idx == obj->efile.st_ops_shndx)
> + if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx)
> err = bpf_object__collect_st_ops_relos(obj, shdr, data);
this function calls find_struct_ops_map_by_offset() which assumes we
only have one struct_ops section. This won't work now, please double
check all this, there should be no assumption about specific section
index
> else if (idx == obj->efile.btf_maps_shndx)
> err = bpf_object__collect_map_relos(obj, shdr, data);
> @@ -8954,8 +8989,9 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
> }
>
> /* struct_ops BPF prog can be re-used between multiple
> - * .struct_ops as long as it's the same struct_ops struct
> - * definition and the same function pointer field
> + * .struct_ops & .struct_ops.link as long as it's the
> + * same struct_ops struct definition and the same
> + * function pointer field
> */
> if (prog->attach_btf_id != st_ops->type_id ||
> prog->expected_attach_type != member_idx) {
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 8/9] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link.
2023-03-08 1:07 ` Andrii Nakryiko
@ 2023-03-08 4:23 ` Kui-Feng Lee
0 siblings, 0 replies; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-08 4:23 UTC (permalink / raw)
To: Andrii Nakryiko, Kui-Feng Lee
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On 3/7/23 17:07, Andrii Nakryiko wrote:
> On Tue, Mar 7, 2023 at 3:33 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>>
>> Flag a struct_ops as backing a bpf_link by putting it in the
>> ".struct_ops.link" section. Once it is flagged, the created
>> struct_ops can be used to create a bpf_link or to update a bpf_link that
>> has been backed by another struct_ops.
>>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> tools/lib/bpf/libbpf.c | 64 +++++++++++++++++++++++++++++++++---------
>> 1 file changed, 50 insertions(+), 14 deletions(-)
>>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index 247de39d136f..d66acd2fdbaa 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -467,6 +467,7 @@ struct bpf_struct_ops {
>> #define KCONFIG_SEC ".kconfig"
>> #define KSYMS_SEC ".ksyms"
>> #define STRUCT_OPS_SEC ".struct_ops"
>> +#define STRUCT_OPS_LINK_SEC ".struct_ops.link"
>>
>> enum libbpf_map_type {
>> LIBBPF_MAP_UNSPEC,
>> @@ -596,6 +597,7 @@ struct elf_state {
>> Elf64_Ehdr *ehdr;
>> Elf_Data *symbols;
>> Elf_Data *st_ops_data;
>> + Elf_Data *st_ops_link_data;
>> size_t shstrndx; /* section index for section name strings */
>> size_t strtabidx;
>> struct elf_sec_desc *secs;
>> @@ -605,6 +607,7 @@ struct elf_state {
>> int text_shndx;
>> int symbols_shndx;
>> int st_ops_shndx;
>> + int st_ops_link_shndx;
>> };
>>
>> struct usdt_manager;
>> @@ -1119,7 +1122,7 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj)
>> return 0;
>> }
>>
>> -static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
>> +static int bpf_object__init_struct_ops_maps_link(struct bpf_object *obj, bool link)
>
> let's shorten it and not use double underscores, as this is not a
> public bpf_object API, just "init_struct_ops_maps" probably is fine
>
>> {
>> const struct btf_type *type, *datasec;
>> const struct btf_var_secinfo *vsi;
>> @@ -1127,18 +1130,33 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
>> const char *tname, *var_name;
>> __s32 type_id, datasec_id;
>> const struct btf *btf;
>> + const char *sec_name;
>> struct bpf_map *map;
>> - __u32 i;
>> + __u32 i, map_flags;
>> + Elf_Data *data;
>> + int shndx;
>>
>> - if (obj->efile.st_ops_shndx == -1)
>> + if (link) {
>> + sec_name = STRUCT_OPS_LINK_SEC;
>> + shndx = obj->efile.st_ops_link_shndx;
>> + data = obj->efile.st_ops_link_data;
>> + map_flags = BPF_F_LINK;
>> + } else {
>> + sec_name = STRUCT_OPS_SEC;
>> + shndx = obj->efile.st_ops_shndx;
>> + data = obj->efile.st_ops_data;
>> + map_flags = 0;
>> + }
>
> let's pass these as function arguments instead
>
>> +
>> + if (shndx == -1)
>> return 0;
>>
>> btf = obj->btf;
>> - datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC,
>> + datasec_id = btf__find_by_name_kind(btf, sec_name,
>> BTF_KIND_DATASEC);
>> if (datasec_id < 0) {
>> pr_warn("struct_ops init: DATASEC %s not found\n",
>> - STRUCT_OPS_SEC);
>> + sec_name);
>> return -EINVAL;
>> }
>>
>> @@ -1151,7 +1169,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
>> type_id = btf__resolve_type(obj->btf, vsi->type);
>> if (type_id < 0) {
>> pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n",
>> - vsi->type, STRUCT_OPS_SEC);
>> + vsi->type, sec_name);
>> return -EINVAL;
>> }
>>
>> @@ -1170,7 +1188,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
>> if (IS_ERR(map))
>> return PTR_ERR(map);
>>
>> - map->sec_idx = obj->efile.st_ops_shndx;
>> + map->sec_idx = shndx;
>> map->sec_offset = vsi->offset;
>> map->name = strdup(var_name);
>> if (!map->name)
>> @@ -1180,6 +1198,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
>> map->def.key_size = sizeof(int);
>> map->def.value_size = type->size;
>> map->def.max_entries = 1;
>> + map->def.map_flags = map_flags;
>>
>> map->st_ops = calloc(1, sizeof(*map->st_ops));
>> if (!map->st_ops)
>> @@ -1192,14 +1211,14 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
>> if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off)
>> return -ENOMEM;
>>
>> - if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) {
>> + if (vsi->offset + type->size > data->d_size) {
>> pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n",
>> - var_name, STRUCT_OPS_SEC);
>> + var_name, sec_name);
>> return -EINVAL;
>> }
>>
>> memcpy(st_ops->data,
>> - obj->efile.st_ops_data->d_buf + vsi->offset,
>> + data->d_buf + vsi->offset,
>> type->size);
>> st_ops->tname = tname;
>> st_ops->type = type;
>> @@ -1212,6 +1231,15 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
>> return 0;
>> }
>>
>> +static int bpf_object__init_struct_ops_maps(struct bpf_object *obj)
>
> let's name this bpf_object_init_struct_ops, no double underscores
>
>> +{
>> + int err;
>> +
>> + err = bpf_object__init_struct_ops_maps_link(obj, false);
>> + err = err ?: bpf_object__init_struct_ops_maps_link(obj, true);
>> + return err;
>> +}
>> +
>> static struct bpf_object *bpf_object__new(const char *path,
>> const void *obj_buf,
>> size_t obj_buf_sz,
>> @@ -1248,6 +1276,7 @@ static struct bpf_object *bpf_object__new(const char *path,
>> obj->efile.obj_buf_sz = obj_buf_sz;
>> obj->efile.btf_maps_shndx = -1;
>> obj->efile.st_ops_shndx = -1;
>> + obj->efile.st_ops_link_shndx = -1;
>> obj->kconfig_map_idx = -1;
>>
>> obj->kern_version = get_kernel_version();
>> @@ -1265,6 +1294,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj)
>> obj->efile.elf = NULL;
>> obj->efile.symbols = NULL;
>> obj->efile.st_ops_data = NULL;
>> + obj->efile.st_ops_link_data = NULL;
>>
>> zfree(&obj->efile.secs);
>> obj->efile.sec_cnt = 0;
>> @@ -2753,12 +2783,13 @@ static bool libbpf_needs_btf(const struct bpf_object *obj)
>> {
>> return obj->efile.btf_maps_shndx >= 0 ||
>> obj->efile.st_ops_shndx >= 0 ||
>> + obj->efile.st_ops_link_shndx >= 0 ||
>> obj->nr_extern > 0;
>> }
>>
>> static bool kernel_needs_btf(const struct bpf_object *obj)
>> {
>> - return obj->efile.st_ops_shndx >= 0;
>> + return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0;
>> }
>>
>> static int bpf_object__init_btf(struct bpf_object *obj,
>> @@ -3451,6 +3482,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
>> } else if (strcmp(name, STRUCT_OPS_SEC) == 0) {
>> obj->efile.st_ops_data = data;
>> obj->efile.st_ops_shndx = idx;
>> + } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) {
>> + obj->efile.st_ops_link_data = data;
>> + obj->efile.st_ops_link_shndx = idx;
>> } else {
>> pr_info("elf: skipping unrecognized data section(%d) %s\n",
>> idx, name);
>> @@ -3465,6 +3499,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
>> /* Only do relo for section with exec instructions */
>> if (!section_have_execinstr(obj, targ_sec_idx) &&
>> strcmp(name, ".rel" STRUCT_OPS_SEC) &&
>> + strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) &&
>> strcmp(name, ".rel" MAPS_ELF_SEC)) {
>> pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n",
>> idx, name, targ_sec_idx,
>> @@ -6611,7 +6646,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
>> return -LIBBPF_ERRNO__INTERNAL;
>> }
>>
>> - if (idx == obj->efile.st_ops_shndx)
>> + if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx)
>> err = bpf_object__collect_st_ops_relos(obj, shdr, data);
>
> this function calls find_struct_ops_map_by_offset() which assumes we
> only have one struct_ops section. This won't work now, please double
> check all this, there should be no assumption about specific section
> index
Yes, I will check the section index of maps.
>
>> else if (idx == obj->efile.btf_maps_shndx)
>> err = bpf_object__collect_map_relos(obj, shdr, data);
>> @@ -8954,8 +8989,9 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
>> }
>>
>> /* struct_ops BPF prog can be re-used between multiple
>> - * .struct_ops as long as it's the same struct_ops struct
>> - * definition and the same function pointer field
>> + * .struct_ops & .struct_ops.link as long as it's the
>> + * same struct_ops struct definition and the same
>> + * function pointer field
>> */
>> if (prog->attach_btf_id != st_ops->type_id ||
>> prog->expected_attach_type != member_idx) {
>> --
>> 2.34.1
>>
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms.
2023-03-07 23:32 [PATCH bpf-next v4 0/9] Transit between BPF TCP congestion controls Kui-Feng Lee
` (7 preceding siblings ...)
2023-03-07 23:33 ` [PATCH bpf-next v4 8/9] libbpf: Use .struct_ops.link section to indicate a struct_ops with a link Kui-Feng Lee
@ 2023-03-07 23:33 ` Kui-Feng Lee
2023-03-08 1:10 ` Andrii Nakryiko
8 siblings, 1 reply; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-07 23:33 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf; +Cc: Kui-Feng Lee
Create a pair of sockets that utilize the congestion control algorithm
under a particular name. Then switch up this congestion control
algorithm to another implementation and check whether newly created
connections using the same cc name now run the new implementation.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
.../selftests/bpf/prog_tests/bpf_tcp_ca.c | 38 ++++++++++++
.../selftests/bpf/progs/tcp_ca_update.c | 62 +++++++++++++++++++
2 files changed, 100 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_update.c
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index e980188d4124..caaa9175ee36 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -8,6 +8,7 @@
#include "bpf_dctcp.skel.h"
#include "bpf_cubic.skel.h"
#include "bpf_tcp_nogpl.skel.h"
+#include "tcp_ca_update.skel.h"
#include "bpf_dctcp_release.skel.h"
#include "tcp_ca_write_sk_pacing.skel.h"
#include "tcp_ca_incompl_cong_ops.skel.h"
@@ -381,6 +382,41 @@ static void test_unsupp_cong_op(void)
libbpf_set_print(old_print_fn);
}
+static void test_update_ca(void)
+{
+ struct tcp_ca_update *skel;
+ struct bpf_link *link;
+ int saved_ca1_cnt;
+ int err;
+
+ skel = tcp_ca_update__open();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ return;
+
+ err = tcp_ca_update__load(skel);
+ if (!ASSERT_OK(err, "load")) {
+ tcp_ca_update__destroy(skel);
+ return;
+ }
+
+ link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
+ ASSERT_OK_PTR(link, "attach_struct_ops");
+
+ do_test("tcp_ca_update", NULL);
+ saved_ca1_cnt = skel->bss->ca1_cnt;
+ ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
+
+ err = bpf_link__update_map(link, skel->maps.ca_update_2);
+ ASSERT_OK(err, "update_struct_ops");
+
+ do_test("tcp_ca_update", NULL);
+ ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
+ ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
+
+ bpf_link__destroy(link);
+ tcp_ca_update__destroy(skel);
+}
+
void test_bpf_tcp_ca(void)
{
if (test__start_subtest("dctcp"))
@@ -399,4 +435,6 @@ void test_bpf_tcp_ca(void)
test_incompl_cong_ops();
if (test__start_subtest("unsupp_cong_op"))
test_unsupp_cong_op();
+ if (test__start_subtest("update_ca"))
+ test_update_ca();
}
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
new file mode 100644
index 000000000000..36a04be95df5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int ca1_cnt = 0;
+int ca2_cnt = 0;
+
+#define USEC_PER_SEC 1000000UL
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+{
+ return (struct tcp_sock *)sk;
+}
+
+SEC("struct_ops/ca_update_1_cong_control")
+void BPF_PROG(ca_update_1_cong_control, struct sock *sk,
+ const struct rate_sample *rs)
+{
+ ca1_cnt++;
+}
+
+SEC("struct_ops/ca_update_2_cong_control")
+void BPF_PROG(ca_update_2_cong_control, struct sock *sk,
+ const struct rate_sample *rs)
+{
+ ca2_cnt++;
+}
+
+SEC("struct_ops/ca_update_ssthresh")
+__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
+{
+ return tcp_sk(sk)->snd_ssthresh;
+}
+
+SEC("struct_ops/ca_update_undo_cwnd")
+__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
+{
+ return tcp_sk(sk)->snd_cwnd;
+}
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_update_1 = {
+ .cong_control = (void *)ca_update_1_cong_control,
+ .ssthresh = (void *)ca_update_ssthresh,
+ .undo_cwnd = (void *)ca_update_undo_cwnd,
+ .name = "tcp_ca_update",
+};
+
+SEC(".struct_ops.link")
+struct tcp_congestion_ops ca_update_2 = {
+ .cong_control = (void *)ca_update_2_cong_control,
+ .ssthresh = (void *)ca_update_ssthresh,
+ .undo_cwnd = (void *)ca_update_undo_cwnd,
+ .name = "tcp_ca_update",
+};
--
2.34.1
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms.
2023-03-07 23:33 ` [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms Kui-Feng Lee
@ 2023-03-08 1:10 ` Andrii Nakryiko
2023-03-08 15:58 ` Kui-Feng Lee
0 siblings, 1 reply; 25+ messages in thread
From: Andrii Nakryiko @ 2023-03-08 1:10 UTC (permalink / raw)
To: Kui-Feng Lee; +Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On Tue, Mar 7, 2023 at 3:34 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>
> Create a pair of sockets that utilize the congestion control algorithm
> under a particular name. Then switch up this congestion control
> algorithm to another implementation and check whether newly created
> connections using the same cc name now run the new implementation.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> ---
> .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 38 ++++++++++++
> .../selftests/bpf/progs/tcp_ca_update.c | 62 +++++++++++++++++++
> 2 files changed, 100 insertions(+)
> create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_update.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> index e980188d4124..caaa9175ee36 100644
> --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> @@ -8,6 +8,7 @@
> #include "bpf_dctcp.skel.h"
> #include "bpf_cubic.skel.h"
> #include "bpf_tcp_nogpl.skel.h"
> +#include "tcp_ca_update.skel.h"
> #include "bpf_dctcp_release.skel.h"
> #include "tcp_ca_write_sk_pacing.skel.h"
> #include "tcp_ca_incompl_cong_ops.skel.h"
> @@ -381,6 +382,41 @@ static void test_unsupp_cong_op(void)
> libbpf_set_print(old_print_fn);
> }
>
> +static void test_update_ca(void)
> +{
> + struct tcp_ca_update *skel;
> + struct bpf_link *link;
> + int saved_ca1_cnt;
> + int err;
> +
> + skel = tcp_ca_update__open();
> + if (!ASSERT_OK_PTR(skel, "open"))
> + return;
> +
> + err = tcp_ca_update__load(skel);
tcp_ca_update__open_and_load()
> + if (!ASSERT_OK(err, "load")) {
> + tcp_ca_update__destroy(skel);
> + return;
> + }
> +
> + link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
I think it's time to generate link holder for each struct_ops map to
the BPF skeleton, and support auto-attach of struct_ops skeleton.
Please do that as a follow up, once this patch set lands.
> + ASSERT_OK_PTR(link, "attach_struct_ops");
> +
> + do_test("tcp_ca_update", NULL);
> + saved_ca1_cnt = skel->bss->ca1_cnt;
> + ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
> +
> + err = bpf_link__update_map(link, skel->maps.ca_update_2);
> + ASSERT_OK(err, "update_struct_ops");
> +
> + do_test("tcp_ca_update", NULL);
> + ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
> + ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
how do we know that struct_ops programs were triggered? what
guarantees that? if nothing, we are just adding another flaky
networking test
> +
> + bpf_link__destroy(link);
> + tcp_ca_update__destroy(skel);
> +}
> +
> void test_bpf_tcp_ca(void)
> {
> if (test__start_subtest("dctcp"))
> @@ -399,4 +435,6 @@ void test_bpf_tcp_ca(void)
> test_incompl_cong_ops();
> if (test__start_subtest("unsupp_cong_op"))
> test_unsupp_cong_op();
> + if (test__start_subtest("update_ca"))
> + test_update_ca();
> }
> diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
> new file mode 100644
> index 000000000000..36a04be95df5
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
> @@ -0,0 +1,62 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include "vmlinux.h"
> +
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +int ca1_cnt = 0;
> +int ca2_cnt = 0;
> +
> +#define USEC_PER_SEC 1000000UL
> +
> +#define min(a, b) ((a) < (b) ? (a) : (b))
> +
> +static inline struct tcp_sock *tcp_sk(const struct sock *sk)
> +{
> + return (struct tcp_sock *)sk;
> +}
> +
> +SEC("struct_ops/ca_update_1_cong_control")
> +void BPF_PROG(ca_update_1_cong_control, struct sock *sk,
> + const struct rate_sample *rs)
> +{
> + ca1_cnt++;
> +}
> +
> +SEC("struct_ops/ca_update_2_cong_control")
> +void BPF_PROG(ca_update_2_cong_control, struct sock *sk,
> + const struct rate_sample *rs)
> +{
> + ca2_cnt++;
> +}
> +
> +SEC("struct_ops/ca_update_ssthresh")
> +__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
> +{
> + return tcp_sk(sk)->snd_ssthresh;
> +}
> +
> +SEC("struct_ops/ca_update_undo_cwnd")
> +__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
> +{
> + return tcp_sk(sk)->snd_cwnd;
> +}
> +
> +SEC(".struct_ops.link")
> +struct tcp_congestion_ops ca_update_1 = {
> + .cong_control = (void *)ca_update_1_cong_control,
> + .ssthresh = (void *)ca_update_ssthresh,
> + .undo_cwnd = (void *)ca_update_undo_cwnd,
> + .name = "tcp_ca_update",
> +};
> +
> +SEC(".struct_ops.link")
> +struct tcp_congestion_ops ca_update_2 = {
> + .cong_control = (void *)ca_update_2_cong_control,
> + .ssthresh = (void *)ca_update_ssthresh,
> + .undo_cwnd = (void *)ca_update_undo_cwnd,
> + .name = "tcp_ca_update",
> +};
please add a test where you combine both .struct_ops and
.struct_ops.link, it's an obvious potentially problematic combination
as I mentioned in previous patches, let's also have a negative test
where bpf_link__update_map() fails
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms.
2023-03-08 1:10 ` Andrii Nakryiko
@ 2023-03-08 15:58 ` Kui-Feng Lee
2023-03-08 17:18 ` Andrii Nakryiko
0 siblings, 1 reply; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-08 15:58 UTC (permalink / raw)
To: Andrii Nakryiko, Kui-Feng Lee
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On 3/7/23 17:10, Andrii Nakryiko wrote:
> On Tue, Mar 7, 2023 at 3:34 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>>
>> Create a pair of sockets that utilize the congestion control algorithm
>> under a particular name. Then switch up this congestion control
>> algorithm to another implementation and check whether newly created
>> connections using the same cc name now run the new implementation.
>>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> ---
>> .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 38 ++++++++++++
>> .../selftests/bpf/progs/tcp_ca_update.c | 62 +++++++++++++++++++
>> 2 files changed, 100 insertions(+)
>> create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_update.c
>>
>> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
>> index e980188d4124..caaa9175ee36 100644
>> --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
>> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
>> @@ -8,6 +8,7 @@
>> #include "bpf_dctcp.skel.h"
>> #include "bpf_cubic.skel.h"
>> #include "bpf_tcp_nogpl.skel.h"
>> +#include "tcp_ca_update.skel.h"
>> #include "bpf_dctcp_release.skel.h"
>> #include "tcp_ca_write_sk_pacing.skel.h"
>> #include "tcp_ca_incompl_cong_ops.skel.h"
>> @@ -381,6 +382,41 @@ static void test_unsupp_cong_op(void)
>> libbpf_set_print(old_print_fn);
>> }
>>
>> +static void test_update_ca(void)
>> +{
>> + struct tcp_ca_update *skel;
>> + struct bpf_link *link;
>> + int saved_ca1_cnt;
>> + int err;
>> +
>> + skel = tcp_ca_update__open();
>> + if (!ASSERT_OK_PTR(skel, "open"))
>> + return;
>> +
>> + err = tcp_ca_update__load(skel);
>
> tcp_ca_update__open_and_load()
>
>> + if (!ASSERT_OK(err, "load")) {
>> + tcp_ca_update__destroy(skel);
>> + return;
>> + }
>> +
>> + link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
>
> I think it's time to generate link holder for each struct_ops map to
> the BPF skeleton, and support auto-attach of struct_ops skeleton.
> Please do that as a follow up, once this patch set lands.
Got it.
>
>> + ASSERT_OK_PTR(link, "attach_struct_ops");
>> +
>> + do_test("tcp_ca_update", NULL);
>> + saved_ca1_cnt = skel->bss->ca1_cnt;
>> + ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
>> +
>> + err = bpf_link__update_map(link, skel->maps.ca_update_2);
>> + ASSERT_OK(err, "update_struct_ops");
>> +
>> + do_test("tcp_ca_update", NULL);
>> + ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
>> + ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
>
> how do we know that struct_ops programs were triggered? what
> guarantees that? if nothing, we are just adding another flaky
> networking test
When an ack is received, the cong_control of ca_update_1 or ca_update_2
will be called if it is activated. By checking ca1_cnt & ca2_cnt, we
know which one is activated. Here, we check that ca1_cnt stays the
same and ca2_cnt increases to make sure that ca_update_2 has replaced
ca_update_1.
>
>> +
>> + bpf_link__destroy(link);
>> + tcp_ca_update__destroy(skel);
>> +}
>> +
>> void test_bpf_tcp_ca(void)
>> {
>> if (test__start_subtest("dctcp"))
>> @@ -399,4 +435,6 @@ void test_bpf_tcp_ca(void)
>> test_incompl_cong_ops();
>> if (test__start_subtest("unsupp_cong_op"))
>> test_unsupp_cong_op();
>> + if (test__start_subtest("update_ca"))
>> + test_update_ca();
>> }
>> diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
>> new file mode 100644
>> index 000000000000..36a04be95df5
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
>> @@ -0,0 +1,62 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#include "vmlinux.h"
>> +
>> +#include <bpf/bpf_helpers.h>
>> +#include <bpf/bpf_tracing.h>
>> +
>> +char _license[] SEC("license") = "GPL";
>> +
>> +int ca1_cnt = 0;
>> +int ca2_cnt = 0;
>> +
>> +#define USEC_PER_SEC 1000000UL
>> +
>> +#define min(a, b) ((a) < (b) ? (a) : (b))
>> +
>> +static inline struct tcp_sock *tcp_sk(const struct sock *sk)
>> +{
>> + return (struct tcp_sock *)sk;
>> +}
>> +
>> +SEC("struct_ops/ca_update_1_cong_control")
>> +void BPF_PROG(ca_update_1_cong_control, struct sock *sk,
>> + const struct rate_sample *rs)
>> +{
>> + ca1_cnt++;
>> +}
>> +
>> +SEC("struct_ops/ca_update_2_cong_control")
>> +void BPF_PROG(ca_update_2_cong_control, struct sock *sk,
>> + const struct rate_sample *rs)
>> +{
>> + ca2_cnt++;
>> +}
>> +
>> +SEC("struct_ops/ca_update_ssthresh")
>> +__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
>> +{
>> + return tcp_sk(sk)->snd_ssthresh;
>> +}
>> +
>> +SEC("struct_ops/ca_update_undo_cwnd")
>> +__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
>> +{
>> + return tcp_sk(sk)->snd_cwnd;
>> +}
>> +
>> +SEC(".struct_ops.link")
>> +struct tcp_congestion_ops ca_update_1 = {
>> + .cong_control = (void *)ca_update_1_cong_control,
>> + .ssthresh = (void *)ca_update_ssthresh,
>> + .undo_cwnd = (void *)ca_update_undo_cwnd,
>> + .name = "tcp_ca_update",
>> +};
>> +
>> +SEC(".struct_ops.link")
>> +struct tcp_congestion_ops ca_update_2 = {
>> + .cong_control = (void *)ca_update_2_cong_control,
>> + .ssthresh = (void *)ca_update_ssthresh,
>> + .undo_cwnd = (void *)ca_update_undo_cwnd,
>> + .name = "tcp_ca_update",
>> +};
>
> please add a test where you combine both .struct_ops and
> .struct_ops.link, it's an obvious potentially problematic combination
>
> as I mentioned in previous patches, let's also have a negative test
> where bpf_link__update_map() fails
Sure
>
>> --
>> 2.34.1
>>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms.
2023-03-08 15:58 ` Kui-Feng Lee
@ 2023-03-08 17:18 ` Andrii Nakryiko
2023-03-08 18:10 ` Kui-Feng Lee
0 siblings, 1 reply; 25+ messages in thread
From: Andrii Nakryiko @ 2023-03-08 17:18 UTC (permalink / raw)
To: Kui-Feng Lee
Cc: Kui-Feng Lee, bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On Wed, Mar 8, 2023 at 7:58 AM Kui-Feng Lee <sinquersw@gmail.com> wrote:
>
>
>
> On 3/7/23 17:10, Andrii Nakryiko wrote:
> > On Tue, Mar 7, 2023 at 3:34 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
> >>
> >> Create a pair of sockets that utilize the congestion control algorithm
> >> under a particular name. Then switch up this congestion control
> >> algorithm to another implementation and check whether newly created
> >> connections using the same cc name now run the new implementation.
> >>
> >> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> >> ---
> >> .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 38 ++++++++++++
> >> .../selftests/bpf/progs/tcp_ca_update.c | 62 +++++++++++++++++++
> >> 2 files changed, 100 insertions(+)
> >> create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >>
> >> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >> index e980188d4124..caaa9175ee36 100644
> >> --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >> @@ -8,6 +8,7 @@
> >> #include "bpf_dctcp.skel.h"
> >> #include "bpf_cubic.skel.h"
> >> #include "bpf_tcp_nogpl.skel.h"
> >> +#include "tcp_ca_update.skel.h"
> >> #include "bpf_dctcp_release.skel.h"
> >> #include "tcp_ca_write_sk_pacing.skel.h"
> >> #include "tcp_ca_incompl_cong_ops.skel.h"
> >> @@ -381,6 +382,41 @@ static void test_unsupp_cong_op(void)
> >> libbpf_set_print(old_print_fn);
> >> }
> >>
> >> +static void test_update_ca(void)
> >> +{
> >> + struct tcp_ca_update *skel;
> >> + struct bpf_link *link;
> >> + int saved_ca1_cnt;
> >> + int err;
> >> +
> >> + skel = tcp_ca_update__open();
> >> + if (!ASSERT_OK_PTR(skel, "open"))
> >> + return;
> >> +
> >> + err = tcp_ca_update__load(skel);
> >
> > tcp_ca_update__open_and_load()
> >
> >> + if (!ASSERT_OK(err, "load")) {
> >> + tcp_ca_update__destroy(skel);
> >> + return;
> >> + }
> >> +
> >> + link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
> >
> > I think it's time to generate link holder for each struct_ops map to
> > the BPF skeleton, and support auto-attach of struct_ops skeleton.
> > Please do that as a follow up, once this patch set lands.
>
> Got it.
>
> >
> >> + ASSERT_OK_PTR(link, "attach_struct_ops");
> >> +
> >> + do_test("tcp_ca_update", NULL);
> >> + saved_ca1_cnt = skel->bss->ca1_cnt;
> >> + ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
> >> +
> >> + err = bpf_link__update_map(link, skel->maps.ca_update_2);
> >> + ASSERT_OK(err, "update_struct_ops");
> >> +
> >> + do_test("tcp_ca_update", NULL);
> >> + ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
> >> + ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
> >
> > how do we know that struct_ops programs were triggered? what
> > guarantees that? if nothing, we are just adding another flaky
> > networking test
>
> When an ack is received, the cong_control of ca_update_1 or ca_update_2
> will be called if it is activated. By checking ca1_cnt & ca2_cnt, we
> know which one is activated. Here, we check that ca1_cnt stays the
> same and ca2_cnt increases to make sure that ca_update_2 has replaced
> ca_update_1.
I just don't see anything in the test ensuring that ack is
sent/received, so it seems like we are relying on some background
system activity and proper timing (unless I miss something, which is
why I'm asking), so this is fragile, as in CI environment timings and
background activity would be very different and unpredictable, causing
flakiness of the test
>
> >
> >> +
> >> + bpf_link__destroy(link);
> >> + tcp_ca_update__destroy(skel);
> >> +}
> >> +
> >> void test_bpf_tcp_ca(void)
> >> {
> >> if (test__start_subtest("dctcp"))
> >> @@ -399,4 +435,6 @@ void test_bpf_tcp_ca(void)
> >> test_incompl_cong_ops();
> >> if (test__start_subtest("unsupp_cong_op"))
> >> test_unsupp_cong_op();
> >> + if (test__start_subtest("update_ca"))
> >> + test_update_ca();
> >> }
> >> diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >> new file mode 100644
> >> index 000000000000..36a04be95df5
> >> --- /dev/null
> >> +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >> @@ -0,0 +1,62 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +
> >> +#include "vmlinux.h"
> >> +
> >> +#include <bpf/bpf_helpers.h>
> >> +#include <bpf/bpf_tracing.h>
> >> +
> >> +char _license[] SEC("license") = "GPL";
> >> +
> >> +int ca1_cnt = 0;
> >> +int ca2_cnt = 0;
> >> +
> >> +#define USEC_PER_SEC 1000000UL
> >> +
> >> +#define min(a, b) ((a) < (b) ? (a) : (b))
> >> +
> >> +static inline struct tcp_sock *tcp_sk(const struct sock *sk)
> >> +{
> >> + return (struct tcp_sock *)sk;
> >> +}
> >> +
> >> +SEC("struct_ops/ca_update_1_cong_control")
> >> +void BPF_PROG(ca_update_1_cong_control, struct sock *sk,
> >> + const struct rate_sample *rs)
> >> +{
> >> + ca1_cnt++;
> >> +}
> >> +
> >> +SEC("struct_ops/ca_update_2_cong_control")
> >> +void BPF_PROG(ca_update_2_cong_control, struct sock *sk,
> >> + const struct rate_sample *rs)
> >> +{
> >> + ca2_cnt++;
> >> +}
> >> +
> >> +SEC("struct_ops/ca_update_ssthresh")
> >> +__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
> >> +{
> >> + return tcp_sk(sk)->snd_ssthresh;
> >> +}
> >> +
> >> +SEC("struct_ops/ca_update_undo_cwnd")
> >> +__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
> >> +{
> >> + return tcp_sk(sk)->snd_cwnd;
> >> +}
> >> +
> >> +SEC(".struct_ops.link")
> >> +struct tcp_congestion_ops ca_update_1 = {
> >> + .cong_control = (void *)ca_update_1_cong_control,
> >> + .ssthresh = (void *)ca_update_ssthresh,
> >> + .undo_cwnd = (void *)ca_update_undo_cwnd,
> >> + .name = "tcp_ca_update",
> >> +};
> >> +
> >> +SEC(".struct_ops.link")
> >> +struct tcp_congestion_ops ca_update_2 = {
> >> + .cong_control = (void *)ca_update_2_cong_control,
> >> + .ssthresh = (void *)ca_update_ssthresh,
> >> + .undo_cwnd = (void *)ca_update_undo_cwnd,
> >> + .name = "tcp_ca_update",
> >> +};
> >
> > please add a test where you combine both .struct_ops and
> > .struct_ops.link, it's an obvious potentially problematic combination
> >
> > as I mentioned in previous patches, let's also have a negative test
> > where bpf_link__update_map() fails
>
> Sure
>
> >
> >> --
> >> 2.34.1
> >>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms.
2023-03-08 17:18 ` Andrii Nakryiko
@ 2023-03-08 18:10 ` Kui-Feng Lee
2023-03-08 18:43 ` Andrii Nakryiko
0 siblings, 1 reply; 25+ messages in thread
From: Kui-Feng Lee @ 2023-03-08 18:10 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Kui-Feng Lee, bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On 3/8/23 09:18, Andrii Nakryiko wrote:
> On Wed, Mar 8, 2023 at 7:58 AM Kui-Feng Lee <sinquersw@gmail.com> wrote:
>>
>>
>>
>> On 3/7/23 17:10, Andrii Nakryiko wrote:
>>> On Tue, Mar 7, 2023 at 3:34 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
>>>>
>>>> Create a pair of sockets that utilize the congestion control algorithm
>>>> under a particular name. Then switch up this congestion control
>>>> algorithm to another implementation and check whether newly created
>>>> connections using the same cc name now run the new implementation.
>>>>
>>>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>>>> ---
>>>> .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 38 ++++++++++++
>>>> .../selftests/bpf/progs/tcp_ca_update.c | 62 +++++++++++++++++++
>>>> 2 files changed, 100 insertions(+)
>>>> create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_update.c
>>>>
>>>> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
>>>> index e980188d4124..caaa9175ee36 100644
>>>> --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
>>>> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
>>>> @@ -8,6 +8,7 @@
>>>> #include "bpf_dctcp.skel.h"
>>>> #include "bpf_cubic.skel.h"
>>>> #include "bpf_tcp_nogpl.skel.h"
>>>> +#include "tcp_ca_update.skel.h"
>>>> #include "bpf_dctcp_release.skel.h"
>>>> #include "tcp_ca_write_sk_pacing.skel.h"
>>>> #include "tcp_ca_incompl_cong_ops.skel.h"
>>>> @@ -381,6 +382,41 @@ static void test_unsupp_cong_op(void)
>>>> libbpf_set_print(old_print_fn);
>>>> }
>>>>
>>>> +static void test_update_ca(void)
>>>> +{
>>>> + struct tcp_ca_update *skel;
>>>> + struct bpf_link *link;
>>>> + int saved_ca1_cnt;
>>>> + int err;
>>>> +
>>>> + skel = tcp_ca_update__open();
>>>> + if (!ASSERT_OK_PTR(skel, "open"))
>>>> + return;
>>>> +
>>>> + err = tcp_ca_update__load(skel);
>>>
>>> tcp_ca_update__open_and_load()
>>>
>>>> + if (!ASSERT_OK(err, "load")) {
>>>> + tcp_ca_update__destroy(skel);
>>>> + return;
>>>> + }
>>>> +
>>>> + link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
>>>
>>> I think it's time to generate link holder for each struct_ops map to
>>> the BPF skeleton, and support auto-attach of struct_ops skeleton.
>>> Please do that as a follow up, once this patch set lands.
>>
>> Got it.
>>
>>>
>>>> + ASSERT_OK_PTR(link, "attach_struct_ops");
>>>> +
>>>> + do_test("tcp_ca_update", NULL);
>>>> + saved_ca1_cnt = skel->bss->ca1_cnt;
>>>> + ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
>>>> +
>>>> + err = bpf_link__update_map(link, skel->maps.ca_update_2);
>>>> + ASSERT_OK(err, "update_struct_ops");
>>>> +
>>>> + do_test("tcp_ca_update", NULL);
>>>> + ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
>>>> + ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
>>>
>>> how do we know that struct_ops programs were triggered? what
>>> guarantees that? if nothing, we are just adding another flaky
>>> networking test
>>
>> When an ack is received, cong_control of ca_update_1 and ca_update_2
>> will be called if they are activated. By checking ca1_cnt & ca2_cnt, we
>> know which one is activated. Here, we check if the ca1_cnt keeps the
>> same and ca2_cnt increase to make that ca_update_2 have replaced
>> ca_update_1.
>
> I just don't see anything in the test ensuring that ack is
> sent/received, so it seems like we are relying on some background
> system activity and proper timing (unless I miss something, which is
> why I'm asking), so this is fragile, as in CI environment timings and
> background activity would be very different and unpredictable, causing
> flakiness of the test
The do_test() function creates two sockets to form a direct connection
that must receive at least one acknowledgment packet for the sockets to
progress into an ESTABLISHED state. If they don't, the connection fails
to be established and the test itself fails.
>
>>
>>>
>>>> +
>>>> + bpf_link__destroy(link);
>>>> + tcp_ca_update__destroy(skel);
>>>> +}
>>>> +
>>>> void test_bpf_tcp_ca(void)
>>>> {
>>>> if (test__start_subtest("dctcp"))
>>>> @@ -399,4 +435,6 @@ void test_bpf_tcp_ca(void)
>>>> test_incompl_cong_ops();
>>>> if (test__start_subtest("unsupp_cong_op"))
>>>> test_unsupp_cong_op();
>>>> + if (test__start_subtest("update_ca"))
>>>> + test_update_ca();
>>>> }
>>>> diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
>>>> new file mode 100644
>>>> index 000000000000..36a04be95df5
>>>> --- /dev/null
>>>> +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
>>>> @@ -0,0 +1,62 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +
>>>> +#include "vmlinux.h"
>>>> +
>>>> +#include <bpf/bpf_helpers.h>
>>>> +#include <bpf/bpf_tracing.h>
>>>> +
>>>> +char _license[] SEC("license") = "GPL";
>>>> +
>>>> +int ca1_cnt = 0;
>>>> +int ca2_cnt = 0;
>>>> +
>>>> +#define USEC_PER_SEC 1000000UL
>>>> +
>>>> +#define min(a, b) ((a) < (b) ? (a) : (b))
>>>> +
>>>> +static inline struct tcp_sock *tcp_sk(const struct sock *sk)
>>>> +{
>>>> + return (struct tcp_sock *)sk;
>>>> +}
>>>> +
>>>> +SEC("struct_ops/ca_update_1_cong_control")
>>>> +void BPF_PROG(ca_update_1_cong_control, struct sock *sk,
>>>> + const struct rate_sample *rs)
>>>> +{
>>>> + ca1_cnt++;
>>>> +}
>>>> +
>>>> +SEC("struct_ops/ca_update_2_cong_control")
>>>> +void BPF_PROG(ca_update_2_cong_control, struct sock *sk,
>>>> + const struct rate_sample *rs)
>>>> +{
>>>> + ca2_cnt++;
>>>> +}
>>>> +
>>>> +SEC("struct_ops/ca_update_ssthresh")
>>>> +__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
>>>> +{
>>>> + return tcp_sk(sk)->snd_ssthresh;
>>>> +}
>>>> +
>>>> +SEC("struct_ops/ca_update_undo_cwnd")
>>>> +__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
>>>> +{
>>>> + return tcp_sk(sk)->snd_cwnd;
>>>> +}
>>>> +
>>>> +SEC(".struct_ops.link")
>>>> +struct tcp_congestion_ops ca_update_1 = {
>>>> + .cong_control = (void *)ca_update_1_cong_control,
>>>> + .ssthresh = (void *)ca_update_ssthresh,
>>>> + .undo_cwnd = (void *)ca_update_undo_cwnd,
>>>> + .name = "tcp_ca_update",
>>>> +};
>>>> +
>>>> +SEC(".struct_ops.link")
>>>> +struct tcp_congestion_ops ca_update_2 = {
>>>> + .cong_control = (void *)ca_update_2_cong_control,
>>>> + .ssthresh = (void *)ca_update_ssthresh,
>>>> + .undo_cwnd = (void *)ca_update_undo_cwnd,
>>>> + .name = "tcp_ca_update",
>>>> +};
>>>
>>> please add a test where you combine both .struct_ops and
>>> .struct_ops.link, it's an obvious potentially problematic combination
>>>
>>> as I mentioned in previous patches, let's also have a negative test
>>> where bpf_link__update_map() fails
>>
>> Sure
>>
>>>
>>>> --
>>>> 2.34.1
>>>>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms.
2023-03-08 18:10 ` Kui-Feng Lee
@ 2023-03-08 18:43 ` Andrii Nakryiko
0 siblings, 0 replies; 25+ messages in thread
From: Andrii Nakryiko @ 2023-03-08 18:43 UTC (permalink / raw)
To: Kui-Feng Lee
Cc: Kui-Feng Lee, bpf, ast, martin.lau, song, kernel-team, andrii, sdf
On Wed, Mar 8, 2023 at 10:10 AM Kui-Feng Lee <sinquersw@gmail.com> wrote:
>
>
>
> On 3/8/23 09:18, Andrii Nakryiko wrote:
> > On Wed, Mar 8, 2023 at 7:58 AM Kui-Feng Lee <sinquersw@gmail.com> wrote:
> >>
> >>
> >>
> >> On 3/7/23 17:10, Andrii Nakryiko wrote:
> >>> On Tue, Mar 7, 2023 at 3:34 PM Kui-Feng Lee <kuifeng@meta.com> wrote:
> >>>>
> >>>> Create a pair of sockets that utilize the congestion control algorithm
> >>>> under a particular name. Then switch up this congestion control
> >>>> algorithm to another implementation and check whether newly created
> >>>> connections using the same cc name now run the new implementation.
> >>>>
> >>>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> >>>> ---
> >>>> .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 38 ++++++++++++
> >>>> .../selftests/bpf/progs/tcp_ca_update.c | 62 +++++++++++++++++++
> >>>> 2 files changed, 100 insertions(+)
> >>>> create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >>>>
> >>>> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >>>> index e980188d4124..caaa9175ee36 100644
> >>>> --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >>>> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >>>> @@ -8,6 +8,7 @@
> >>>> #include "bpf_dctcp.skel.h"
> >>>> #include "bpf_cubic.skel.h"
> >>>> #include "bpf_tcp_nogpl.skel.h"
> >>>> +#include "tcp_ca_update.skel.h"
> >>>> #include "bpf_dctcp_release.skel.h"
> >>>> #include "tcp_ca_write_sk_pacing.skel.h"
> >>>> #include "tcp_ca_incompl_cong_ops.skel.h"
> >>>> @@ -381,6 +382,41 @@ static void test_unsupp_cong_op(void)
> >>>> libbpf_set_print(old_print_fn);
> >>>> }
> >>>>
> >>>> +static void test_update_ca(void)
> >>>> +{
> >>>> + struct tcp_ca_update *skel;
> >>>> + struct bpf_link *link;
> >>>> + int saved_ca1_cnt;
> >>>> + int err;
> >>>> +
> >>>> + skel = tcp_ca_update__open();
> >>>> + if (!ASSERT_OK_PTR(skel, "open"))
> >>>> + return;
> >>>> +
> >>>> + err = tcp_ca_update__load(skel);
> >>>
> >>> tcp_ca_update__open_and_load()
> >>>
> >>>> + if (!ASSERT_OK(err, "load")) {
> >>>> + tcp_ca_update__destroy(skel);
> >>>> + return;
> >>>> + }
> >>>> +
> >>>> + link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
> >>>
> >>> I think it's time to generate link holder for each struct_ops map to
> >>> the BPF skeleton, and support auto-attach of struct_ops skeleton.
> >>> Please do that as a follow up, once this patch set lands.
> >>
> >> Got it.
> >>
> >>>
> >>>> + ASSERT_OK_PTR(link, "attach_struct_ops");
> >>>> +
> >>>> + do_test("tcp_ca_update", NULL);
> >>>> + saved_ca1_cnt = skel->bss->ca1_cnt;
> >>>> + ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
> >>>> +
> >>>> + err = bpf_link__update_map(link, skel->maps.ca_update_2);
> >>>> + ASSERT_OK(err, "update_struct_ops");
> >>>> +
> >>>> + do_test("tcp_ca_update", NULL);
> >>>> + ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
> >>>> + ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
> >>>
> >>> how do we know that struct_ops programs were triggered? what
> >>> guarantees that? if nothing, we are just adding another flaky
> >>> networking test
> >>
> >> When an ack is received, cong_control of ca_update_1 and ca_update_2
> >> will be called if they are activated. By checking ca1_cnt & ca2_cnt, we
> >> know which one is activated. Here, we check if the ca1_cnt keeps the
> >> same and ca2_cnt increase to make that ca_update_2 have replaced
> >> ca_update_1.
> >
> > I just don't see anything in the test ensuring that ack is
> > sent/received, so it seems like we are relying on some background
> > system activity and proper timing (unless I miss something, which is
> > why I'm asking), so this is fragile, as in CI environment timings and
> > background activity would be very different and unpredictable, causing
> > flakiness of the test
>
>
> The do_test() function creates two sockets to form a direct connection
> that must receive at least one acknowledgment packet for the sockets to
> progress into an ESTABLISHED state. If they don't, that means it fails
> to establish a connection.
yeah, my bad, I *completely* missed `do_test("tcp_ca_update", NULL)`
(even though I went specifically looking for something like that).
It's all good here.
>
> >
> >>
> >>>
> >>>> +
> >>>> + bpf_link__destroy(link);
> >>>> + tcp_ca_update__destroy(skel);
> >>>> +}
> >>>> +
> >>>> void test_bpf_tcp_ca(void)
> >>>> {
> >>>> if (test__start_subtest("dctcp"))
> >>>> @@ -399,4 +435,6 @@ void test_bpf_tcp_ca(void)
> >>>> test_incompl_cong_ops();
> >>>> if (test__start_subtest("unsupp_cong_op"))
> >>>> test_unsupp_cong_op();
> >>>> + if (test__start_subtest("update_ca"))
> >>>> + test_update_ca();
> >>>> }
> >>>> diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >>>> new file mode 100644
> >>>> index 000000000000..36a04be95df5
> >>>> --- /dev/null
> >>>> +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >>>> @@ -0,0 +1,62 @@
> >>>> +// SPDX-License-Identifier: GPL-2.0
> >>>> +
> >>>> +#include "vmlinux.h"
> >>>> +
> >>>> +#include <bpf/bpf_helpers.h>
> >>>> +#include <bpf/bpf_tracing.h>
> >>>> +
> >>>> +char _license[] SEC("license") = "GPL";
> >>>> +
> >>>> +int ca1_cnt = 0;
> >>>> +int ca2_cnt = 0;
> >>>> +
> >>>> +#define USEC_PER_SEC 1000000UL
> >>>> +
> >>>> +#define min(a, b) ((a) < (b) ? (a) : (b))
> >>>> +
> >>>> +static inline struct tcp_sock *tcp_sk(const struct sock *sk)
> >>>> +{
> >>>> + return (struct tcp_sock *)sk;
> >>>> +}
> >>>> +
> >>>> +SEC("struct_ops/ca_update_1_cong_control")
> >>>> +void BPF_PROG(ca_update_1_cong_control, struct sock *sk,
> >>>> + const struct rate_sample *rs)
> >>>> +{
> >>>> + ca1_cnt++;
> >>>> +}
> >>>> +
> >>>> +SEC("struct_ops/ca_update_2_cong_control")
> >>>> +void BPF_PROG(ca_update_2_cong_control, struct sock *sk,
> >>>> + const struct rate_sample *rs)
> >>>> +{
> >>>> + ca2_cnt++;
> >>>> +}
> >>>> +
> >>>> +SEC("struct_ops/ca_update_ssthresh")
> >>>> +__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
> >>>> +{
> >>>> + return tcp_sk(sk)->snd_ssthresh;
> >>>> +}
> >>>> +
> >>>> +SEC("struct_ops/ca_update_undo_cwnd")
> >>>> +__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
> >>>> +{
> >>>> + return tcp_sk(sk)->snd_cwnd;
> >>>> +}
> >>>> +
> >>>> +SEC(".struct_ops.link")
> >>>> +struct tcp_congestion_ops ca_update_1 = {
> >>>> + .cong_control = (void *)ca_update_1_cong_control,
> >>>> + .ssthresh = (void *)ca_update_ssthresh,
> >>>> + .undo_cwnd = (void *)ca_update_undo_cwnd,
> >>>> + .name = "tcp_ca_update",
> >>>> +};
> >>>> +
> >>>> +SEC(".struct_ops.link")
> >>>> +struct tcp_congestion_ops ca_update_2 = {
> >>>> + .cong_control = (void *)ca_update_2_cong_control,
> >>>> + .ssthresh = (void *)ca_update_ssthresh,
> >>>> + .undo_cwnd = (void *)ca_update_undo_cwnd,
> >>>> + .name = "tcp_ca_update",
> >>>> +};
> >>>
> >>> please add a test where you combine both .struct_ops and
> >>> .struct_ops.link, it's an obvious potentially problematic combination
> >>>
> >>> as I mentioned in previous patches, let's also have a negative test
> >>> where bpf_link__update_map() fails
> >>
> >> Sure
> >>
> >>>
> >>>> --
> >>>> 2.34.1
> >>>>
^ permalink raw reply [flat|nested] 25+ messages in thread