* [RFC bpf-next v3 1/6] bpf: Factor out a common helper free_all()
2023-04-29 10:12 [RFC bpf-next v3 0/6] Handle immediate reuse in bpf memory allocator Hou Tao
@ 2023-04-29 10:12 ` Hou Tao
2023-04-29 10:12 ` [RFC bpf-next v3 2/6] bpf: Pass bitwise flags to bpf_mem_alloc_init() Hou Tao
` (4 subsequent siblings)
5 siblings, 0 replies; 20+ messages in thread
From: Hou Tao @ 2023-04-29 10:12 UTC (permalink / raw)
To: bpf, Martin KaFai Lau, Alexei Starovoitov
Cc: Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
From: Hou Tao <houtao1@huawei.com>
Factor out a common helper free_all() to free all normal elements or
per-cpu elements on a lock-less list.
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
kernel/bpf/memalloc.c | 31 ++++++++++++++++---------------
1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 410637c225fb..0668bcd7c926 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -211,9 +211,9 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
mem_cgroup_put(memcg);
}
-static void free_one(struct bpf_mem_cache *c, void *obj)
+static void free_one(void *obj, bool percpu)
{
- if (c->percpu_size) {
+ if (percpu) {
free_percpu(((void **)obj)[1]);
kfree(obj);
return;
@@ -222,14 +222,19 @@ static void free_one(struct bpf_mem_cache *c, void *obj)
kfree(obj);
}
-static void __free_rcu(struct rcu_head *head)
+static void free_all(struct llist_node *llnode, bool percpu)
{
- struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
- struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
struct llist_node *pos, *t;
llist_for_each_safe(pos, t, llnode)
- free_one(c, pos);
+ free_one(pos, percpu);
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+
+ free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
atomic_set(&c->call_rcu_in_progress, 0);
}
@@ -432,7 +437,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
static void drain_mem_cache(struct bpf_mem_cache *c)
{
- struct llist_node *llnode, *t;
+ bool percpu = !!c->percpu_size;
/* No progs are using this bpf_mem_cache, but htab_map_free() called
* bpf_mem_cache_free() for all remaining elements and they can be in
@@ -441,14 +446,10 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
* Except for waiting_for_gp list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra))
- free_one(c, llnode);
+ free_all(__llist_del_all(&c->free_by_rcu), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp), percpu);
+ free_all(__llist_del_all(&c->free_llist), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra), percpu);
}
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
--
2.29.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC bpf-next v3 2/6] bpf: Pass bitwise flags to bpf_mem_alloc_init()
2023-04-29 10:12 [RFC bpf-next v3 0/6] Handle immediate reuse in bpf memory allocator Hou Tao
2023-04-29 10:12 ` [RFC bpf-next v3 1/6] bpf: Factor out a common helper free_all() Hou Tao
@ 2023-04-29 10:12 ` Hou Tao
2023-04-29 10:12 ` [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP Hou Tao
` (3 subsequent siblings)
5 siblings, 0 replies; 20+ messages in thread
From: Hou Tao @ 2023-04-29 10:12 UTC (permalink / raw)
To: bpf, Martin KaFai Lau, Alexei Starovoitov
Cc: Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
From: Hou Tao <houtao1@huawei.com>
Extend a boolean argument to a bitwise flags argument for
bpf_mem_alloc_init(), so more new flags can be added later.
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
include/linux/bpf_mem_alloc.h | 8 +++++++-
kernel/bpf/core.c | 2 +-
kernel/bpf/cpumask.c | 2 +-
kernel/bpf/hashtab.c | 5 +++--
kernel/bpf/memalloc.c | 8 +++++++-
5 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3929be5743f4..148347950e16 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -12,6 +12,12 @@ struct bpf_mem_alloc {
struct bpf_mem_caches __percpu *caches;
struct bpf_mem_cache __percpu *cache;
struct work_struct work;
+ unsigned int flags;
+};
+
+/* flags for bpf_mem_alloc_init() */
+enum {
+ BPF_MA_PERCPU = 1U << 0,
};
/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
@@ -21,7 +27,7 @@ struct bpf_mem_alloc {
* Alloc and free are done with bpf_mem_{alloc,free}() and the size of
* the returned object is given by the size argument of bpf_mem_alloc().
*/
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags);
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
/* kmalloc/kfree equivalent: */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7421487422d4..5c9622e8ca34 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2773,7 +2773,7 @@ static int __init bpf_global_ma_init(void)
{
int ret;
- ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+ ret = bpf_mem_alloc_init(&bpf_global_ma, 0, 0);
bpf_global_ma_set = !ret;
return ret;
}
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 7efdf5d770ca..f40636796f75 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -445,7 +445,7 @@ static int __init cpumask_kfunc_init(void)
},
};
- ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), 0);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 00c253b84bf5..93009b94ac9b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -576,12 +576,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_prealloc;
}
} else {
- err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, 0);
if (err)
goto free_map_locked;
if (percpu) {
err = bpf_mem_alloc_init(&htab->pcpu_ma,
- round_up(htab->map.value_size, 8), true);
+ round_up(htab->map.value_size, 8),
+ BPF_MA_PERCPU);
if (err)
goto free_map_locked;
}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 0668bcd7c926..072102476019 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -98,6 +98,7 @@ struct bpf_mem_cache {
int free_cnt;
int low_watermark, high_watermark, batch;
int percpu_size;
+ unsigned int flags;
struct rcu_head rcu;
struct llist_head free_by_rcu;
@@ -377,13 +378,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
* kmalloc/kfree. Max allocation size is 4096 in this case.
* This is bpf_dynptr and bpf_kptr use case.
*/
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags)
{
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
struct bpf_mem_caches *cc, __percpu *pcc;
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;
+ bool percpu = (flags & BPF_MA_PERCPU);
if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -406,9 +408,11 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->unit_size = unit_size;
c->objcg = objcg;
c->percpu_size = percpu_size;
+ c->flags = flags;
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
+ ma->flags = flags;
return 0;
}
@@ -428,10 +432,12 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c = &cc->cache[i];
c->unit_size = sizes[i];
c->objcg = objcg;
+ c->flags = flags;
prefill_mem_cache(c, cpu);
}
}
ma->caches = pcc;
+ ma->flags = flags;
return 0;
}
--
2.29.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-04-29 10:12 [RFC bpf-next v3 0/6] Handle immediate reuse in bpf memory allocator Hou Tao
2023-04-29 10:12 ` [RFC bpf-next v3 1/6] bpf: Factor out a common helper free_all() Hou Tao
2023-04-29 10:12 ` [RFC bpf-next v3 2/6] bpf: Pass bitwise flags to bpf_mem_alloc_init() Hou Tao
@ 2023-04-29 10:12 ` Hou Tao
2023-05-01 23:59 ` Martin KaFai Lau
2023-05-03 18:48 ` Alexei Starovoitov
2023-04-29 10:12 ` [RFC bpf-next v3 4/6] bpf: Introduce BPF_MA_FREE_AFTER_RCU_GP Hou Tao
` (2 subsequent siblings)
5 siblings, 2 replies; 20+ messages in thread
From: Hou Tao @ 2023-04-29 10:12 UTC (permalink / raw)
To: bpf, Martin KaFai Lau, Alexei Starovoitov
Cc: Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
From: Hou Tao <houtao1@huawei.com>
Currently the freed objects in the bpf memory allocator may be reused
immediately by new allocations, which introduces a use-after-bpf-ma-free
problem for non-preallocated hash maps and makes the lookup procedure
return incorrect results. The immediate reuse also makes introducing
new use cases more difficult (e.g. qp-trie).
So introduce BPF_MA_REUSE_AFTER_RCU_GP to solve these problems. With
BPF_MA_REUSE_AFTER_RCU_GP, the freed objects are reused only after one RCU
grace period and may be returned back to the slab system after another
RCU-tasks-trace grace period. So bpf programs which care about the reuse
problem can use bpf_rcu_read_{lock,unlock}() to access
these freed objects safely, and for those which don't care,
use-after-bpf-ma-free will be safe because these objects have not yet
been freed by the bpf memory allocator.
To make these freed elements reusable quickly, BPF_MA_REUSE_AFTER_RCU_GP
dynamically allocates memory to create many inflight RCU callbacks which
mark these freed elements as reusable. The memory used for
bpf_reuse_batch will be freed when these RCU callbacks complete. When no
memory is available, synchronize_rcu_expedited() will be used to make
these freed elements reusable. In order to reduce the risk of OOM, part
of this reusable memory will be freed through an RCU-tasks-trace grace
period. Before this memory is actually freed, it remains
available for reuse.
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
include/linux/bpf_mem_alloc.h | 1 +
kernel/bpf/memalloc.c | 353 +++++++++++++++++++++++++++++++---
2 files changed, 326 insertions(+), 28 deletions(-)
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 148347950e16..e7f68432713b 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -18,6 +18,7 @@ struct bpf_mem_alloc {
/* flags for bpf_mem_alloc_init() */
enum {
BPF_MA_PERCPU = 1U << 0,
+ BPF_MA_REUSE_AFTER_RCU_GP = 1U << 1,
};
/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 072102476019..262100f89610 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -63,6 +63,10 @@ static u8 size_index[24] __ro_after_init = {
2 /* 192 */
};
+static struct workqueue_struct *bpf_ma_wq;
+
+static void bpf_ma_prepare_reuse_work(struct work_struct *work);
+
static int bpf_mem_cache_idx(size_t size)
{
if (!size || size > 4096)
@@ -98,18 +102,36 @@ struct bpf_mem_cache {
int free_cnt;
int low_watermark, high_watermark, batch;
int percpu_size;
+ int cpu;
unsigned int flags;
+ raw_spinlock_t reuse_lock;
+ bool abort_reuse;
+ struct llist_head reuse_ready_head;
+ struct llist_node *reuse_ready_tail;
+ struct llist_head wait_for_free;
+ struct llist_head prepare_reuse_head;
+ struct llist_node *prepare_reuse_tail;
+ unsigned int prepare_reuse_cnt;
+ atomic_t reuse_cb_in_progress;
+ struct work_struct reuse_work;
+
struct rcu_head rcu;
struct llist_head free_by_rcu;
struct llist_head waiting_for_gp;
- atomic_t call_rcu_in_progress;
+ atomic_t free_cb_in_progress;
};
struct bpf_mem_caches {
struct bpf_mem_cache cache[NUM_CACHES];
};
+struct bpf_reuse_batch {
+ struct bpf_mem_cache *c;
+ struct llist_node *head, *tail;
+ struct rcu_head rcu;
+};
+
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
struct llist_node *entry, *next;
@@ -154,6 +176,45 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
#endif
}
+static void *bpf_ma_get_reusable_obj(struct bpf_mem_cache *c)
+{
+ if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP) {
+ unsigned long flags;
+ void *obj;
+
+ if (llist_empty(&c->reuse_ready_head) && llist_empty(&c->wait_for_free))
+ return NULL;
+
+ /* reuse_ready_head and wait_for_free may be manipulated by
+ * kworker and RCU callbacks.
+ */
+ raw_spin_lock_irqsave(&c->reuse_lock, flags);
+ obj = __llist_del_first(&c->reuse_ready_head);
+ if (obj) {
+ if (llist_empty(&c->reuse_ready_head))
+ c->reuse_ready_tail = NULL;
+ } else {
+ obj = __llist_del_first(&c->wait_for_free);
+ }
+ raw_spin_unlock_irqrestore(&c->reuse_lock, flags);
+ return obj;
+ }
+
+ /*
+ * free_by_rcu is only manipulated by irq work refill_work().
+ * IRQ works on the same CPU are called sequentially, so it is
+ * safe to use __llist_del_first() here. If alloc_bulk() is
+ * invoked by the initial prefill, there will be no running
+ * refill_work(), so __llist_del_first() is fine as well.
+ *
+ * In most cases, objects on free_by_rcu are from the same CPU.
+ * If some objects come from other CPUs, it doesn't incur any
+ * harm because NUMA_NO_NODE means the preference for current
+ * numa node and it is not a guarantee.
+ */
+ return __llist_del_first(&c->free_by_rcu);
+}
+
/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
{
@@ -165,19 +226,7 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
for (i = 0; i < cnt; i++) {
- /*
- * free_by_rcu is only manipulated by irq work refill_work().
- * IRQ works on the same CPU are called sequentially, so it is
- * safe to use __llist_del_first() here. If alloc_bulk() is
- * invoked by the initial prefill, there will be no running
- * refill_work(), so __llist_del_first() is fine as well.
- *
- * In most cases, objects on free_by_rcu are from the same CPU.
- * If some objects come from other CPUs, it doesn't incur any
- * harm because NUMA_NO_NODE means the preference for current
- * numa node and it is not a guarantee.
- */
- obj = __llist_del_first(&c->free_by_rcu);
+ obj = bpf_ma_get_reusable_obj(c);
if (!obj) {
/* Allocate, but don't deplete atomic reserves that typical
* GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
@@ -236,7 +285,7 @@ static void __free_rcu(struct rcu_head *head)
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
- atomic_set(&c->call_rcu_in_progress, 0);
+ atomic_set(&c->free_cb_in_progress, 0);
}
static void __free_rcu_tasks_trace(struct rcu_head *head)
@@ -264,7 +313,7 @@ static void do_call_rcu(struct bpf_mem_cache *c)
{
struct llist_node *llnode, *t;
- if (atomic_xchg(&c->call_rcu_in_progress, 1))
+ if (atomic_xchg(&c->free_cb_in_progress, 1))
return;
WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
@@ -409,6 +458,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags)
c->objcg = objcg;
c->percpu_size = percpu_size;
c->flags = flags;
+ c->cpu = cpu;
+ INIT_WORK(&c->reuse_work, bpf_ma_prepare_reuse_work);
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
@@ -433,6 +484,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags)
c->unit_size = sizes[i];
c->objcg = objcg;
c->flags = flags;
+ c->cpu = cpu;
+ INIT_WORK(&c->reuse_work, bpf_ma_prepare_reuse_work);
prefill_mem_cache(c, cpu);
}
}
@@ -444,18 +497,40 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags)
static void drain_mem_cache(struct bpf_mem_cache *c)
{
bool percpu = !!c->percpu_size;
+ struct llist_node *head[3];
+ unsigned long flags;
/* No progs are using this bpf_mem_cache, but htab_map_free() called
* bpf_mem_cache_free() for all remaining elements and they can be in
* free_by_rcu or in waiting_for_gp lists, so drain those lists now.
*
- * Except for waiting_for_gp list, there are no concurrent operations
- * on these lists, so it is safe to use __llist_del_all().
+ * Except for waiting_for_gp and free_llist_extra list, there are no
+ * concurrent operations on these lists, so it is safe to use
+ * __llist_del_all().
*/
free_all(__llist_del_all(&c->free_by_rcu), percpu);
free_all(llist_del_all(&c->waiting_for_gp), percpu);
free_all(__llist_del_all(&c->free_llist), percpu);
- free_all(__llist_del_all(&c->free_llist_extra), percpu);
+ free_all(llist_del_all(&c->free_llist_extra), percpu);
+
+ if (!(c->flags & BPF_MA_REUSE_AFTER_RCU_GP))
+ return;
+
+ raw_spin_lock_irqsave(&c->reuse_lock, flags);
+ /* Indicate kworker and RCU callback to free elements directly
+ * instead of adding new elements into these lists.
+ */
+ c->abort_reuse = true;
+ head[0] = __llist_del_all(&c->prepare_reuse_head);
+ c->prepare_reuse_tail = NULL;
+ head[1] = __llist_del_all(&c->reuse_ready_head);
+ c->reuse_ready_tail = NULL;
+ head[2] = __llist_del_all(&c->wait_for_free);
+ raw_spin_unlock_irqrestore(&c->reuse_lock, flags);
+
+ free_all(head[0], percpu);
+ free_all(head[1], percpu);
+ free_all(head[2], percpu);
}
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
@@ -466,10 +541,39 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
ma->caches = NULL;
}
+static void bpf_ma_cancel_reuse_work(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i;
+
+ if (ma->cache) {
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ cancel_work_sync(&c->reuse_work);
+ }
+ }
+ if (ma->caches) {
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ cancel_work_sync(&c->reuse_work);
+ }
+ }
+ }
+}
+
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
- /* waiting_for_gp lists was drained, but __free_rcu might
- * still execute. Wait for it now before we freeing percpu caches.
+ bool reuse_after_rcu_gp = ma->flags & BPF_MA_REUSE_AFTER_RCU_GP;
+
+ /* Cancel the inflight kworkers */
+ if (reuse_after_rcu_gp)
+ bpf_ma_cancel_reuse_work(ma);
+
+ /* For normal bpf ma, waiting_for_gp lists was drained, but __free_rcu
+ * might still execute. Wait for it now before we freeing percpu caches.
*
* rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
* but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -477,9 +581,13 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
* so if call_rcu(head, __free_rcu) is skipped due to
* rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
* using rcu_trace_implies_rcu_gp() as well.
+ *
+ * For reuse-after-rcu-gp bpf ma, use rcu_barrier_tasks_trace() to
+ * wait for the pending bpf_ma_free_reusable_cb() and use rcu_barrier()
+ * to wait for the pending bpf_ma_reuse_cb().
*/
rcu_barrier_tasks_trace();
- if (!rcu_trace_implies_rcu_gp())
+ if (reuse_after_rcu_gp || !rcu_trace_implies_rcu_gp())
rcu_barrier();
free_mem_alloc_no_barrier(ma);
}
@@ -512,6 +620,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
}
/* Defer barriers into worker to let the rest of map memory to be freed */
+ copy->flags = ma->flags;
copy->cache = ma->cache;
ma->cache = NULL;
copy->caches = ma->caches;
@@ -541,7 +650,9 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
*/
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
- rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ rcu_in_progress += atomic_read(&c->free_cb_in_progress);
+ /* Pending kworkers or RCU callbacks */
+ rcu_in_progress += atomic_read(&c->reuse_cb_in_progress);
}
/* objcg is the same across cpus */
if (c->objcg)
@@ -556,7 +667,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
c = &cc->cache[i];
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
- rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ rcu_in_progress += atomic_read(&c->free_cb_in_progress);
+ rcu_in_progress += atomic_read(&c->reuse_cb_in_progress);
}
}
if (c->objcg)
@@ -600,18 +712,183 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
return llnode;
}
+static void bpf_ma_add_to_reuse_ready_or_free(struct bpf_mem_cache *c, struct llist_node *head,
+ struct llist_node *tail)
+{
+ unsigned long flags;
+ bool abort;
+
+ raw_spin_lock_irqsave(&c->reuse_lock, flags);
+ abort = c->abort_reuse;
+ if (!abort) {
+ if (llist_empty(&c->reuse_ready_head))
+ c->reuse_ready_tail = tail;
+ __llist_add_batch(head, tail, &c->reuse_ready_head);
+ }
+ raw_spin_unlock_irqrestore(&c->reuse_lock, flags);
+
+ /* Don't move these objects to reuse_ready list and free
+ * these objects directly.
+ */
+ if (abort)
+ free_all(head, !!c->percpu_size);
+}
+
+static void bpf_ma_reuse_cb(struct rcu_head *rcu)
+{
+ struct bpf_reuse_batch *batch = container_of(rcu, struct bpf_reuse_batch, rcu);
+ struct bpf_mem_cache *c = batch->c;
+
+ bpf_ma_add_to_reuse_ready_or_free(c, batch->head, batch->tail);
+ atomic_dec(&c->reuse_cb_in_progress);
+ kfree(batch);
+}
+
+static bool bpf_ma_try_free_reuse_objs(struct bpf_mem_cache *c)
+{
+ struct llist_node *head, *tail;
+ bool do_free;
+
+ if (llist_empty(&c->reuse_ready_head))
+ return false;
+
+ do_free = !atomic_xchg(&c->free_cb_in_progress, 1);
+ if (!do_free)
+ return false;
+
+ head = __llist_del_all(&c->reuse_ready_head);
+ tail = c->reuse_ready_tail;
+ c->reuse_ready_tail = NULL;
+
+ __llist_add_batch(head, tail, &c->wait_for_free);
+
+ return true;
+}
+
+static void bpf_ma_free_reusable_cb(struct rcu_head *rcu)
+{
+ struct bpf_mem_cache *c = container_of(rcu, struct bpf_mem_cache, rcu);
+ struct llist_node *head;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&c->reuse_lock, flags);
+ head = __llist_del_all(&c->wait_for_free);
+ raw_spin_unlock_irqrestore(&c->reuse_lock, flags);
+
+ free_all(head, !!c->percpu_size);
+ atomic_set(&c->free_cb_in_progress, 0);
+}
+
+static void bpf_ma_prepare_reuse_work(struct work_struct *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, reuse_work);
+ struct llist_node *head, *tail, *llnode, *tmp;
+ struct bpf_reuse_batch *batch;
+ unsigned long flags;
+ bool do_free;
+
+ local_irq_save(flags);
+ /* When CPU is offline, the running CPU may be different with
+ * the CPU which submitted the work. When these two CPUs are the same,
+ * kworker may be interrupted by NMI, so increase active to protect
+ * again such concurrency.
+ */
+ if (c->cpu == smp_processor_id())
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ raw_spin_lock(&c->reuse_lock);
+ head = __llist_del_all(&c->prepare_reuse_head);
+ tail = c->prepare_reuse_tail;
+ c->prepare_reuse_tail = NULL;
+ c->prepare_reuse_cnt = 0;
+ if (c->cpu == smp_processor_id())
+ local_dec(&c->active);
+
+ /* Try to free elements in reusable list. Before these elements are
+ * freed in RCU cb, these element will still be available for reuse.
+ */
+ do_free = bpf_ma_try_free_reuse_objs(c);
+ raw_spin_unlock(&c->reuse_lock);
+ local_irq_restore(flags);
+
+ if (do_free)
+ call_rcu_tasks_trace(&c->rcu, bpf_ma_free_reusable_cb);
+
+ llist_for_each_safe(llnode, tmp, llist_del_all(&c->free_llist_extra)) {
+ if (!head)
+ tail = llnode;
+ llnode->next = head;
+ head = llnode->next;
+ }
+ /* Draining is in progress ? */
+ if (!head) {
+ /* kworker completes and no RCU callback */
+ atomic_dec(&c->reuse_cb_in_progress);
+ return;
+ }
+
+ batch = kmalloc(sizeof(*batch), GFP_KERNEL);
+ if (!batch) {
+ synchronize_rcu_expedited();
+ bpf_ma_add_to_reuse_ready_or_free(c, head, tail);
+ /* kworker completes and no RCU callback */
+ atomic_dec(&c->reuse_cb_in_progress);
+ return;
+ }
+
+ batch->c = c;
+ batch->head = head;
+ batch->tail = tail;
+ call_rcu(&batch->rcu, bpf_ma_reuse_cb);
+}
+
+static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /* In case a NMI-context bpf program is also freeing object. */
+ if (local_inc_return(&c->active) == 1) {
+ bool try_queue_work = false;
+
+ /* kworker may remove elements from prepare_reuse_head */
+ raw_spin_lock(&c->reuse_lock);
+ if (llist_empty(&c->prepare_reuse_head))
+ c->prepare_reuse_tail = llnode;
+ __llist_add(llnode, &c->prepare_reuse_head);
+ if (++c->prepare_reuse_cnt > c->high_watermark) {
+ /* Zero out prepare_reuse_cnt early to prevent
+ * unnecessary queue_work().
+ */
+ c->prepare_reuse_cnt = 0;
+ try_queue_work = true;
+ }
+ raw_spin_unlock(&c->reuse_lock);
+
+ if (try_queue_work && !work_pending(&c->reuse_work)) {
+ /* Use reuse_cb_in_progress to indicate there is
+ * inflight reuse kworker or reuse RCU callback.
+ */
+ atomic_inc(&c->reuse_cb_in_progress);
+ /* Already queued */
+ if (!queue_work(bpf_ma_wq, &c->reuse_work))
+ atomic_dec(&c->reuse_cb_in_progress);
+ }
+ } else {
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+}
+
/* Though 'ptr' object could have been allocated on a different cpu
* add it to the free_llist of the current cpu.
* Let kfree() logic deal with it when it's later called from irq_work.
*/
-static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+static void notrace immediate_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
{
- struct llist_node *llnode = ptr - LLIST_NODE_SZ;
unsigned long flags;
int cnt = 0;
- BUILD_BUG_ON(LLIST_NODE_SZ > 8);
-
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
__llist_add(llnode, &c->free_llist);
@@ -633,6 +910,18 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
irq_work_raise(c);
}
+static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP)
+ wait_gp_reuse_free(c, llnode);
+ else
+ immediate_reuse_free(c, llnode);
+}
+
/* Called from BPF program or from sys_bpf syscall.
* In both cases migration is disabled.
*/
@@ -724,3 +1013,11 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+
+static int __init bpf_ma_init(void)
+{
+ bpf_ma_wq = alloc_workqueue("bpf_ma", WQ_MEM_RECLAIM, 0);
+ BUG_ON(!bpf_ma_wq);
+ return 0;
+}
+late_initcall(bpf_ma_init);
--
2.29.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-04-29 10:12 ` [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP Hou Tao
@ 2023-05-01 23:59 ` Martin KaFai Lau
2023-05-03 18:48 ` Alexei Starovoitov
1 sibling, 0 replies; 20+ messages in thread
From: Martin KaFai Lau @ 2023-05-01 23:59 UTC (permalink / raw)
To: Hou Tao
Cc: Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1, bpf,
Alexei Starovoitov
On 4/29/23 3:12 AM, Hou Tao wrote:
> +static void bpf_ma_prepare_reuse_work(struct work_struct *work)
> +{
> + struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, reuse_work);
> + struct llist_node *head, *tail, *llnode, *tmp;
> + struct bpf_reuse_batch *batch;
> + unsigned long flags;
> + bool do_free;
> +
> + local_irq_save(flags);
> + /* When CPU is offline, the running CPU may be different with
> + * the CPU which submitted the work. When these two CPUs are the same,
> + * kworker may be interrupted by NMI, so increase active to protect
> + * again such concurrency.
> + */
> + if (c->cpu == smp_processor_id())
> + WARN_ON_ONCE(local_inc_return(&c->active) != 1);
> + raw_spin_lock(&c->reuse_lock);
> + head = __llist_del_all(&c->prepare_reuse_head);
> + tail = c->prepare_reuse_tail;
> + c->prepare_reuse_tail = NULL;
> + c->prepare_reuse_cnt = 0;
> + if (c->cpu == smp_processor_id())
> + local_dec(&c->active);
> +
> + /* Try to free elements in reusable list. Before these elements are
> + * freed in RCU cb, these element will still be available for reuse.
> + */
> + do_free = bpf_ma_try_free_reuse_objs(c);
> + raw_spin_unlock(&c->reuse_lock);
> + local_irq_restore(flags);
> +
> + if (do_free)
> + call_rcu_tasks_trace(&c->rcu, bpf_ma_free_reusable_cb);
> +
> + llist_for_each_safe(llnode, tmp, llist_del_all(&c->free_llist_extra)) {
> + if (!head)
> + tail = llnode;
> + llnode->next = head;
> + head = llnode->next;
> + }
> + /* Draining is in progress ? */
> + if (!head) {
> + /* kworker completes and no RCU callback */
> + atomic_dec(&c->reuse_cb_in_progress);
> + return;
> + }
> +
> + batch = kmalloc(sizeof(*batch), GFP_KERNEL);
> + if (!batch) {
> + synchronize_rcu_expedited();
> + bpf_ma_add_to_reuse_ready_or_free(c, head, tail);
> + /* kworker completes and no RCU callback */
> + atomic_dec(&c->reuse_cb_in_progress);
> + return;
> + }
> +
> + batch->c = c;
> + batch->head = head;
> + batch->tail = tail;
> + call_rcu(&batch->rcu, bpf_ma_reuse_cb);
> +}
> +
> +static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
> +{
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + /* In case a NMI-context bpf program is also freeing object. */
> + if (local_inc_return(&c->active) == 1) {
> + bool try_queue_work = false;
> +
> + /* kworker may remove elements from prepare_reuse_head */
> + raw_spin_lock(&c->reuse_lock);
> + if (llist_empty(&c->prepare_reuse_head))
> + c->prepare_reuse_tail = llnode;
> + __llist_add(llnode, &c->prepare_reuse_head);
> + if (++c->prepare_reuse_cnt > c->high_watermark) {
> + /* Zero out prepare_reuse_cnt early to prevent
> + * unnecessary queue_work().
> + */
> + c->prepare_reuse_cnt = 0;
> + try_queue_work = true;
> + }
> + raw_spin_unlock(&c->reuse_lock);
> +
> + if (try_queue_work && !work_pending(&c->reuse_work)) {
> + /* Use reuse_cb_in_progress to indicate there is
> + * inflight reuse kworker or reuse RCU callback.
> + */
> + atomic_inc(&c->reuse_cb_in_progress);
> + /* Already queued */
> + if (!queue_work(bpf_ma_wq, &c->reuse_work))
queue_work will be called from a bpf program (e.g. bpf_mem_cache_free ->
unit_free -> queue_work). Is it safe from recursion and deadlock?
eg. what if a tracing bpf prog is attached to some functions in workqueue.c
after acquiring a workqueue related spin lock and that tracing bpf prog is doing
unit_free?
Not a workqueue expert. Asking because it is not obvious to me considering there
is a lot of ground to cover in workqueue.c.
I wonder what happen to the current bpf memalloc approach to postpone work to
irq work. v2 mentioned it does not work well. Did you figure out why?
> + atomic_dec(&c->reuse_cb_in_progress);
> + }
> + } else {
> + llist_add(llnode, &c->free_llist_extra);
> + }
> + local_dec(&c->active);
> + local_irq_restore(flags);
> +}
> +
> /* Though 'ptr' object could have been allocated on a different cpu
> * add it to the free_llist of the current cpu.
> * Let kfree() logic deal with it when it's later called from irq_work.
> */
> -static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
> +static void notrace immediate_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
> {
> - struct llist_node *llnode = ptr - LLIST_NODE_SZ;
> unsigned long flags;
> int cnt = 0;
>
> - BUILD_BUG_ON(LLIST_NODE_SZ > 8);
> -
> local_irq_save(flags);
> if (local_inc_return(&c->active) == 1) {
> __llist_add(llnode, &c->free_llist);
> @@ -633,6 +910,18 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
> irq_work_raise(c);
> }
>
> +static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
> +{
> + struct llist_node *llnode = ptr - LLIST_NODE_SZ;
> +
> + BUILD_BUG_ON(LLIST_NODE_SZ > 8);
> +
> + if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP)
> + wait_gp_reuse_free(c, llnode);
> + else
> + immediate_reuse_free(c, llnode);
> +}
> +
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-04-29 10:12 ` [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP Hou Tao
2023-05-01 23:59 ` Martin KaFai Lau
@ 2023-05-03 18:48 ` Alexei Starovoitov
2023-05-03 21:57 ` Martin KaFai Lau
2023-05-04 1:35 ` Hou Tao
1 sibling, 2 replies; 20+ messages in thread
From: Alexei Starovoitov @ 2023-05-03 18:48 UTC (permalink / raw)
To: Hou Tao
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Paul E . McKenney, rcu, houtao1
On Sat, Apr 29, 2023 at 06:12:12PM +0800, Hou Tao wrote:
> +
> +static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
> +{
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + /* In case a NMI-context bpf program is also freeing object. */
> + if (local_inc_return(&c->active) == 1) {
> + bool try_queue_work = false;
> +
> + /* kworker may remove elements from prepare_reuse_head */
> + raw_spin_lock(&c->reuse_lock);
> + if (llist_empty(&c->prepare_reuse_head))
> + c->prepare_reuse_tail = llnode;
> + __llist_add(llnode, &c->prepare_reuse_head);
> + if (++c->prepare_reuse_cnt > c->high_watermark) {
> + /* Zero out prepare_reuse_cnt early to prevent
> + * unnecessary queue_work().
> + */
> + c->prepare_reuse_cnt = 0;
> + try_queue_work = true;
> + }
> + raw_spin_unlock(&c->reuse_lock);
> +
> + if (try_queue_work && !work_pending(&c->reuse_work)) {
> + /* Use reuse_cb_in_progress to indicate there is
> + * inflight reuse kworker or reuse RCU callback.
> + */
> + atomic_inc(&c->reuse_cb_in_progress);
> + /* Already queued */
> + if (!queue_work(bpf_ma_wq, &c->reuse_work))
As Martin pointed out queue_work() is not safe here.
The raw_spin_lock(&c->reuse_lock); earlier is not safe either.
For the next version please drop workers and spin_lock from unit_free/alloc paths.
If lock has to be taken it should be done from irq_work.
Under no circumstances we can use alloc_workqueue(). No new kthreads.
We can avoid adding new flag to bpf_mem_alloc to reduce the complexity
and do roughly equivalent of REUSE_AFTER_RCU_GP unconditionally in the following way:
- alloc_bulk() won't be trying to steal from c->free_by_rcu.
- do_call_rcu() does call_rcu(&c->rcu, __free_rcu) instead of task-trace version.
- rcu_trace_implies_rcu_gp() is never used.
- after RCU_GP __free_rcu() moves all waiting_for_gp elements into
a size specific link list per bpf_mem_alloc (not per bpf_mem_cache which is per-cpu)
and does call_rcu_tasks_trace
- Let's call this list ma->free_by_rcu_tasks_trace
(only one list for bpf_mem_alloc with known size or NUM_CACHES such lists when size == 0 at init)
- any cpu alloc_bulk() can steal from size specific ma->free_by_rcu_tasks_trace list that
is protected by ma->spin_lock (1 or NUM_CACHES such locks)
- ma->waiting_for_gp_tasks_trace will be freeing elements into slab
What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
(while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
Probably usage of bpf_mem_alloc in local storage can be simplified as well.
Martin wdyt?
I think this approach adds minimal complexity to bpf_mem_alloc while solving all existing pain points
including needs of qp-trie.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-03 18:48 ` Alexei Starovoitov
@ 2023-05-03 21:57 ` Martin KaFai Lau
2023-05-03 23:06 ` Alexei Starovoitov
2023-05-04 1:35 ` Hou Tao
1 sibling, 1 reply; 20+ messages in thread
From: Martin KaFai Lau @ 2023-05-03 21:57 UTC (permalink / raw)
To: Alexei Starovoitov, Hou Tao
Cc: bpf, Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
> What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
>
> After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> Martin wdyt?
If the bpf prog always does a bpf_rcu_read_lock() before accessing the (e.g.)
task local storage, it can remove the reuse_now conditions in the
bpf_local_storage and directly call the bpf_mem_cache_free().
The only corner use case is when the bpf_prog or syscall does
bpf_task_storage_delete() instead of having the task storage stays with the
whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be a change of
this uaf guarantee to the sleepable program but it is still safe because it is
freed after tasks_trace gp. We could take this chance to align this behavior of
the local storage map to the other bpf maps.
For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local storage knows
it can be freed without waiting tasks_trace gp. However, only task/cgroup
storages are in bpf ma and I don't believe this optimization matter much for
them. I would rather focus on the REUSE_AFTER_RCU_GP first.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-03 21:57 ` Martin KaFai Lau
@ 2023-05-03 23:06 ` Alexei Starovoitov
2023-05-03 23:39 ` Martin KaFai Lau
0 siblings, 1 reply; 20+ messages in thread
From: Alexei Starovoitov @ 2023-05-03 23:06 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Hou Tao, bpf, Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
On Wed, May 03, 2023 at 02:57:03PM -0700, Martin KaFai Lau wrote:
> On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
> > What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> > Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> > (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
> >
> > After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> > Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> > Martin wdyt?
>
> If the bpf prog always does a bpf_rcu_read_lock() before accessing the
> (e.g.) task local storage, it can remove the reuse_now conditions in the
> bpf_local_storage and directly call the bpf_mem_cache_free().
>
> The only corner use case is when the bpf_prog or syscall does
> bpf_task_storage_delete() instead of having the task storage stays with the
> whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be a change
> of this uaf guarantee to the sleepable program but it is still safe because
> it is freed after tasks_trace gp. We could take this chance to align this
> behavior of the local storage map to the other bpf maps.
>
> For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local storage
> knows it can be freed without waiting tasks_trace gp. However, only
> task/cgroup storages are in bpf ma and I don't believe this optimization
> matter much for them. I would rather focus on the REUSE_AFTER_RCU_GP first.
I'm confused which REUSE_AFTER_RCU_GP you meant.
What I proposed above is REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
Hou's proposals: 1. BPF_MA_REUSE_AFTER_two_RCUs_GP 2. BPF_MA_FREE_AFTER_single_RCU_GP
If I'm reading bpf_local_storage correctly it can remove reuse_now logic
in all conditions with REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
What am I missing?
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-03 23:06 ` Alexei Starovoitov
@ 2023-05-03 23:39 ` Martin KaFai Lau
2023-05-04 1:42 ` Alexei Starovoitov
2023-05-04 2:08 ` Hou Tao
0 siblings, 2 replies; 20+ messages in thread
From: Martin KaFai Lau @ 2023-05-03 23:39 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Hou Tao, bpf, Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
On 5/3/23 4:06 PM, Alexei Starovoitov wrote:
> On Wed, May 03, 2023 at 02:57:03PM -0700, Martin KaFai Lau wrote:
>> On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
>>> What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
>>> Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
>>> (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
>>>
>>> After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
>>> Probably usage of bpf_mem_alloc in local storage can be simplified as well.
>>> Martin wdyt?
>>
>> If the bpf prog always does a bpf_rcu_read_lock() before accessing the
>> (e.g.) task local storage, it can remove the reuse_now conditions in the
>> bpf_local_storage and directly call the bpf_mem_cache_free().
>>
>> The only corner use case is when the bpf_prog or syscall does
>> bpf_task_storage_delete() instead of having the task storage stays with the
>> whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be a change
>> of this uaf guarantee to the sleepable program but it is still safe because
>> it is freed after tasks_trace gp. We could take this chance to align this
>> behavior of the local storage map to the other bpf maps.
>>
>> For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local storage
>> knows it can be freed without waiting tasks_trace gp. However, only
>> task/cgroup storages are in bpf ma and I don't believe this optimization
>> matter much for them. I would rather focus on the REUSE_AFTER_RCU_GP first.
>
> I'm confused which REUSE_AFTER_RCU_GP you meant.
> What I proposed above is REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
Regarding REUSE_AFTER_RCU_GP, I meant
REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
>
> Hou's proposals: 1. BPF_MA_REUSE_AFTER_two_RCUs_GP 2. BPF_MA_FREE_AFTER_single_RCU_GP
It probably is where the confusion is. I thought Hou's BPF_MA_REUSE_AFTER_RCU_GP
is already REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace. From the commit
message:
" ... So introduce BPF_MA_REUSE_AFTER_RCU_GP to solve these problems. For
BPF_MA_REUSE_AFTER_GP, the freed objects are reused only after one RCU
grace period and may be returned back to slab system after another
RCU-tasks-trace grace period. ..."
[I assumed BPF_MA_REUSE_AFTER_GP is just a typo of BPF_MA_REUSE_AFTER_"RCU"_GP]
>
> If I'm reading bpf_local_storage correctly it can remove reuse_now logic
> in all conditions with REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
Right, for smap->bpf_ma == true (cgroup and task storage), all reuse_now logic
can be gone and directly use the bpf_mem_cache_free(). Potentially the sk/inode
can also move to bpf_ma after running some benchmark. This will simplify things
a lot. For sk storage, the reuse_now was there to avoid the unnecessary
tasks_trace gp because performance impact was reported on sk storage where
connections can be open-and-close very frequently.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-03 23:39 ` Martin KaFai Lau
@ 2023-05-04 1:42 ` Alexei Starovoitov
2023-05-04 2:08 ` Hou Tao
1 sibling, 0 replies; 20+ messages in thread
From: Alexei Starovoitov @ 2023-05-04 1:42 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Hou Tao, bpf, Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
On Wed, May 03, 2023 at 04:39:01PM -0700, Martin KaFai Lau wrote:
> On 5/3/23 4:06 PM, Alexei Starovoitov wrote:
> > On Wed, May 03, 2023 at 02:57:03PM -0700, Martin KaFai Lau wrote:
> > > On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
> > > > What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> > > > Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> > > > (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
> > > >
> > > > After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> > > > Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> > > > Martin wdyt?
> > >
> > > If the bpf prog always does a bpf_rcu_read_lock() before accessing the
> > > (e.g.) task local storage, it can remove the reuse_now conditions in the
> > > bpf_local_storage and directly call the bpf_mem_cache_free().
> > >
> > > The only corner use case is when the bpf_prog or syscall does
> > > bpf_task_storage_delete() instead of having the task storage stays with the
> > > whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be a change
> > > of this uaf guarantee to the sleepable program but it is still safe because
> > > it is freed after tasks_trace gp. We could take this chance to align this
> > > behavior of the local storage map to the other bpf maps.
> > >
> > > For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local storage
> > > knows it can be freed without waiting tasks_trace gp. However, only
> > > task/cgroup storages are in bpf ma and I don't believe this optimization
> > > matter much for them. I would rather focus on the REUSE_AFTER_RCU_GP first.
> >
> > I'm confused which REUSE_AFTER_RCU_GP you meant.
> > What I proposed above is REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
>
> Regarding REUSE_AFTER_RCU_GP, I meant
> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
>
> >
> > Hou's proposals: 1. BPF_MA_REUSE_AFTER_two_RCUs_GP 2. BPF_MA_FREE_AFTER_single_RCU_GP
>
> It probably is where the confusion is. I thought Hou's
> BPF_MA_REUSE_AFTER_RCU_GP is already
> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace. From the commit message:
Sorry. My bad. You're correct.
The difference between my and Hou's #1 is whether rcu_tasks_trace is global or per-cpu.
>
> " ... So introduce BPF_MA_REUSE_AFTER_RCU_GP to solve these problems. For
> BPF_MA_REUSE_AFTER_GP, the freed objects are reused only after one RCU
> grace period and may be returned back to slab system after another
> RCU-tasks-trace grace period. ..."
>
> [I assumed BPF_MA_REUSE_AFTER_GP is just a typo of BPF_MA_REUSE_AFTER_"RCU"_GP]
>
> >
> > If I'm reading bpf_local_storage correctly it can remove reuse_now logic
> > in all conditions with REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
>
> Right, for smap->bpf_ma == true (cgroup and task storage), all reuse_now
> logic can be gone and directly use the bpf_mem_cache_free(). Potentially the
> sk/inode can also move to bpf_ma after running some benchmark. This will
> simplify things a lot. For sk storage, the reuse_now was there to avoid the
> unnecessary tasks_trace gp because performance impact was reported on sk
> storage where connections can be open-and-close very frequently.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-03 23:39 ` Martin KaFai Lau
2023-05-04 1:42 ` Alexei Starovoitov
@ 2023-05-04 2:08 ` Hou Tao
1 sibling, 0 replies; 20+ messages in thread
From: Hou Tao @ 2023-05-04 2:08 UTC (permalink / raw)
To: Martin KaFai Lau, Alexei Starovoitov
Cc: bpf, Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
Hi,
On 5/4/2023 7:39 AM, Martin KaFai Lau wrote:
> On 5/3/23 4:06 PM, Alexei Starovoitov wrote:
>> On Wed, May 03, 2023 at 02:57:03PM -0700, Martin KaFai Lau wrote:
>>> On 5/3/23 11:48 AM, Alexei Starovoitov wrote:
SNIP
>>>
>>> If the bpf prog always does a bpf_rcu_read_lock() before accessing the
>>> (e.g.) task local storage, it can remove the reuse_now conditions in
>>> the
>>> bpf_local_storage and directly call the bpf_mem_cache_free().
>>>
>>> The only corner use case is when the bpf_prog or syscall does
>>> bpf_task_storage_delete() instead of having the task storage stays
>>> with the
>>> whole lifetime of the task_struct. Using REUSE_AFTER_RCU_GP will be
>>> a change
>>> of this uaf guarantee to the sleepable program but it is still safe
>>> because
>>> it is freed after tasks_trace gp. We could take this chance to align
>>> this
>>> behavior of the local storage map to the other bpf maps.
>>>
>>> For BPF_MA_FREE_AFTER_RCU_GP, there are cases that the bpf local
>>> storage
>>> knows it can be freed without waiting tasks_trace gp. However, only
>>> task/cgroup storages are in bpf ma and I don't believe this
>>> optimization
>>> matter much for them. I would rather focus on the REUSE_AFTER_RCU_GP
>>> first.
OK.
>>
>> I'm confused which REUSE_AFTER_RCU_GP you meant.
>> What I proposed above is
>> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
>
> Regarding REUSE_AFTER_RCU_GP, I meant
> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
>
>>
>> Hou's proposals: 1. BPF_MA_REUSE_AFTER_two_RCUs_GP 2.
>> BPF_MA_FREE_AFTER_single_RCU_GP
>
> It probably is where the confusion is. I thought Hou's
> BPF_MA_REUSE_AFTER_RCU_GP is already
> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace. From the commit
> message:
>
> " ... So introduce BPF_MA_REUSE_AFTER_RCU_GP to solve these problems. For
> BPF_MA_REUSE_AFTER_GP, the freed objects are reused only after one RCU
> grace period and may be returned back to slab system after another
> RCU-tasks-trace grace period. ..."
>
> [I assumed BPF_MA_REUSE_AFTER_GP is just a typo of
> BPF_MA_REUSE_AFTER_"RCU"_GP]
Yes. The current implementation of BPF_MA_REUSE_AFTER_RCU_GP is already
REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace. It moves the freed
objects to the reuse_ready_head list after one RCU GP, splices the
elements in reuse_ready_head onto wait_for_free when reuse_ready_head is
not empty, and frees the elements in wait_for_free by
call_rcu_tasks_trace().
>
>>
>> If I'm reading bpf_local_storage correctly it can remove reuse_now logic
>> in all conditions with
>> REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace.
>
> Right, for smap->bpf_ma == true (cgroup and task storage), all
> reuse_now logic can be gone and directly use the bpf_mem_cache_free().
> Potentially the sk/inode can also move to bpf_ma after running some
> benchmark. This will simplify things a lot. For sk storage, the
> reuse_now was there to avoid the unnecessary tasks_trace gp because
> performance impact was reported on sk storage where connections can be
> open-and-close very frequently.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-03 18:48 ` Alexei Starovoitov
2023-05-03 21:57 ` Martin KaFai Lau
@ 2023-05-04 1:35 ` Hou Tao
2023-05-04 2:00 ` Alexei Starovoitov
1 sibling, 1 reply; 20+ messages in thread
From: Hou Tao @ 2023-05-04 1:35 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Paul E . McKenney, rcu, houtao1
Hi,
On 5/4/2023 2:48 AM, Alexei Starovoitov wrote:
> On Sat, Apr 29, 2023 at 06:12:12PM +0800, Hou Tao wrote:
>> +
>> +static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
>> +{
>> + unsigned long flags;
>> +
>> + local_irq_save(flags);
>> + /* In case a NMI-context bpf program is also freeing object. */
>> + if (local_inc_return(&c->active) == 1) {
>> + bool try_queue_work = false;
>> +
>> + /* kworker may remove elements from prepare_reuse_head */
>> + raw_spin_lock(&c->reuse_lock);
>> + if (llist_empty(&c->prepare_reuse_head))
>> + c->prepare_reuse_tail = llnode;
>> + __llist_add(llnode, &c->prepare_reuse_head);
>> + if (++c->prepare_reuse_cnt > c->high_watermark) {
>> + /* Zero out prepare_reuse_cnt early to prevent
>> + * unnecessary queue_work().
>> + */
>> + c->prepare_reuse_cnt = 0;
>> + try_queue_work = true;
>> + }
>> + raw_spin_unlock(&c->reuse_lock);
>> +
>> + if (try_queue_work && !work_pending(&c->reuse_work)) {
>> + /* Use reuse_cb_in_progress to indicate there is
>> + * inflight reuse kworker or reuse RCU callback.
>> + */
>> + atomic_inc(&c->reuse_cb_in_progress);
>> + /* Already queued */
>> + if (!queue_work(bpf_ma_wq, &c->reuse_work))
> As Martin pointed out queue_work() is not safe here.
> The raw_spin_lock(&c->reuse_lock); earlier is not safe either.
I see. Didn't recognize these problems.
> For the next version please drop workers and spin_lock from unit_free/alloc paths.
> If lock has to be taken it should be done from irq_work.
> Under no circumstances we can use alloc_workqueue(). No new kthreads.
Is there any reason to prohibit the use of a new kthread in irq_work?
>
> We can avoid adding new flag to bpf_mem_alloc to reduce the complexity
> and do roughly equivalent of REUSE_AFTER_RCU_GP unconditionally in the following way:
>
> - alloc_bulk() won't be trying to steal from c->free_by_rcu.
>
> - do_call_rcu() does call_rcu(&c->rcu, __free_rcu) instead of task-trace version.
Not sure whether or not one inflight RCU callback is enough. Will check.
If one is not enough, I may use kmalloc(__GFP_NOWAIT) in irq work to
allocate multiple RCU callbacks.
> - rcu_trace_implies_rcu_gp() is never used.
>
> - after RCU_GP __free_rcu() moves all waiting_for_gp elements into
> a size specific link list per bpf_mem_alloc (not per bpf_mem_cache which is per-cpu)
> and does call_rcu_tasks_trace
>
> - Let's call this list ma->free_by_rcu_tasks_trace
> (only one list for bpf_mem_alloc with known size or NUM_CACHES such lists when size == 0 at init)
>
> - any cpu alloc_bulk() can steal from size specific ma->free_by_rcu_tasks_trace list that
> is protected by ma->spin_lock (1 or NUM_CACHES such locks)
To reduce the lock contention, alloc_bulk() can steal from the global
list in batches. I had tried the global list before, but without the
concurrent freeing; I think it could reduce the risk of OOM for
add_del_on_diff_cpu.
>
> - ma->waiting_for_gp_tasks_trace will be freeing elements into slab
>
> What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
>
> After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> Martin wdyt?
>
> I think this approach adds minimal complexity to bpf_mem_alloc while solving all existing pain points
> including needs of qp-trie.
Thanks for these great suggestions. Will try to do it in v4.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-04 1:35 ` Hou Tao
@ 2023-05-04 2:00 ` Alexei Starovoitov
2023-05-04 2:30 ` Hou Tao
0 siblings, 1 reply; 20+ messages in thread
From: Alexei Starovoitov @ 2023-05-04 2:00 UTC (permalink / raw)
To: Hou Tao
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Paul E . McKenney, rcu, houtao1
On Thu, May 04, 2023 at 09:35:17AM +0800, Hou Tao wrote:
> Hi,
>
> On 5/4/2023 2:48 AM, Alexei Starovoitov wrote:
> > On Sat, Apr 29, 2023 at 06:12:12PM +0800, Hou Tao wrote:
> >> +
> >> +static void notrace wait_gp_reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
> >> +{
> >> + unsigned long flags;
> >> +
> >> + local_irq_save(flags);
> >> + /* In case a NMI-context bpf program is also freeing object. */
> >> + if (local_inc_return(&c->active) == 1) {
> >> + bool try_queue_work = false;
> >> +
> >> + /* kworker may remove elements from prepare_reuse_head */
> >> + raw_spin_lock(&c->reuse_lock);
> >> + if (llist_empty(&c->prepare_reuse_head))
> >> + c->prepare_reuse_tail = llnode;
> >> + __llist_add(llnode, &c->prepare_reuse_head);
> >> + if (++c->prepare_reuse_cnt > c->high_watermark) {
> >> + /* Zero out prepare_reuse_cnt early to prevent
> >> + * unnecessary queue_work().
> >> + */
> >> + c->prepare_reuse_cnt = 0;
> >> + try_queue_work = true;
> >> + }
> >> + raw_spin_unlock(&c->reuse_lock);
> >> +
> >> + if (try_queue_work && !work_pending(&c->reuse_work)) {
> >> + /* Use reuse_cb_in_progress to indicate there is
> >> + * inflight reuse kworker or reuse RCU callback.
> >> + */
> >> + atomic_inc(&c->reuse_cb_in_progress);
> >> + /* Already queued */
> >> + if (!queue_work(bpf_ma_wq, &c->reuse_work))
> > As Martin pointed out queue_work() is not safe here.
> > The raw_spin_lock(&c->reuse_lock); earlier is not safe either.
> I see. Didn't recognize these problems.
> > For the next version please drop workers and spin_lock from unit_free/alloc paths.
> > If lock has to be taken it should be done from irq_work.
> > Under no circumstances we can use alloc_workqueue(). No new kthreads.
> Is there any reason to prohibit the use of new kthread in irq_work ?
Because:
1. there is a workable solution without kthreads.
2. if there was no solution we would have to come up with one.
kthread is not an answer. It's hard to reason about a setup when kthreads
are in critical path due to scheduler. Assume the system is 100% cpu loaded.
kthreads delays and behavior is unpredictable. We cannot subject memory alloc/free to it.
> >
> > We can avoid adding new flag to bpf_mem_alloc to reduce the complexity
> > and do roughly equivalent of REUSE_AFTER_RCU_GP unconditionally in the following way:
> >
> > - alloc_bulk() won't be trying to steal from c->free_by_rcu.
> >
> > - do_call_rcu() does call_rcu(&c->rcu, __free_rcu) instead of task-trace version.
> No sure whether or not one inflight RCU callback is enough. Will check.
> If one is not enough, I may use kmalloc(__GFP_NOWAIT) in irq work to
> allocate multiple RCU callbacks.
Pls dont. Just assume it will work, implement the proposal (if you agree),
come back with the numbers and then we will discuss again.
We cannot keep arguing about merits of complicated patch set that was done on partial data.
Just like the whole thing with kthreads.
I requested early on: "pls no kthreads" and weeks later we're still arguing.
> > - rcu_trace_implies_rcu_gp() is never used.
> >
> > - after RCU_GP __free_rcu() moves all waiting_for_gp elements into
> > a size specific link list per bpf_mem_alloc (not per bpf_mem_cache which is per-cpu)
> > and does call_rcu_tasks_trace
> >
> > - Let's call this list ma->free_by_rcu_tasks_trace
> > (only one list for bpf_mem_alloc with known size or NUM_CACHES such lists when size == 0 at init)
> >
> > - any cpu alloc_bulk() can steal from size specific ma->free_by_rcu_tasks_trace list that
> > is protected by ma->spin_lock (1 or NUM_CACHES such locks)
> To reduce the lock contention, alloc_bulk() can steal from the global
> list in batch.
Pls no special batches. The simplest implementation possible.
alloc_bulk() has 'int cnt' argument. It will try to steal 'cnt' from ma->free_by_rcu_tasks_trace.
> Had tried the global list before but I didn't do the
> concurrent freeing, I think it could reduce the risk of OOM for
> add_del_on_diff_cpu.
Maybe you've tried, but we didn't see the patches and we cannot take for granted
anyone saying: "I've tried *foo*. It didn't work. That's why I'm doing *bar* here".
Everything mm is tricky. Little details matter a lot.
It's also questionable whether we should make any design decisions based on this benchmark
and in particular based on add_del_on_diff_cpu part of it.
I'm not saying we shouldn't consider it, but all numbers have a "decision weight"
associated with them.
For example: there is existing samples/bpf/map_perf_test benchmark.
So far we haven't seen the numbers from it.
Is it more important than your new bench? Yes and no. All numbers matter.
> >
> > - ma->waiting_for_gp_tasks_trace will be freeing elements into slab
> >
> > What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
> > Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
> > (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
> >
> > After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
> > Probably usage of bpf_mem_alloc in local storage can be simplified as well.
> > Martin wdyt?
> >
> > I think this approach adds minimal complexity to bpf_mem_alloc while solving all existing pain points
> > including needs of qp-trie.
> Thanks for these great suggestions. Will try to do it in v4.
Thanks.
Also for benchmark, pls don't hack htab and benchmark as 'non-landable patches' (as in this series).
Construct the patch series as:
- prep patches
- benchmark
- unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
with numbers from bench(s) before and after this patch.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-04 2:00 ` Alexei Starovoitov
@ 2023-05-04 2:30 ` Hou Tao
2023-06-01 17:36 ` Alexei Starovoitov
0 siblings, 1 reply; 20+ messages in thread
From: Hou Tao @ 2023-05-04 2:30 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Paul E . McKenney, rcu, houtao1
Hi,
On 5/4/2023 10:00 AM, Alexei Starovoitov wrote:
> On Thu, May 04, 2023 at 09:35:17AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 5/4/2023 2:48 AM, Alexei Starovoitov wrote:
>>> On Sat, Apr 29, 2023 at 06:12:12PM +0800, Hou Tao wrote:
SNIP
>>> + /* Already queued */
>>> + if (!queue_work(bpf_ma_wq, &c->reuse_work))
>>> As Martin pointed out queue_work() is not safe here.
>>> The raw_spin_lock(&c->reuse_lock); earlier is not safe either.
>> I see. Didn't recognize these problems.
>>> For the next version please drop workers and spin_lock from unit_free/alloc paths.
>>> If lock has to be taken it should be done from irq_work.
>>> Under no circumstances we can use alloc_workqueue(). No new kthreads.
>> Is there any reason to prohibit the use of new kthread in irq_work ?
> Because:
> 1. there is a workable solution without kthreads.
> 2. if there was no solution we would have to come up with one.
> kthread is not an answer. It's hard to reason about a setup when kthreads
> are in critical path due to scheduler. Assume the system is 100% cpu loaded.
> kthreads delays and behavior is unpredictable. We cannot subject memory alloc/free to it.
I see. Thanks for the explanation.
>
>>> We can avoid adding new flag to bpf_mem_alloc to reduce the complexity
>>> and do roughly equivalent of REUSE_AFTER_RCU_GP unconditionally in the following way:
>>>
>>> - alloc_bulk() won't be trying to steal from c->free_by_rcu.
>>>
>>> - do_call_rcu() does call_rcu(&c->rcu, __free_rcu) instead of task-trace version.
>> Not sure whether or not one inflight RCU callback is enough. Will check.
>> If one is not enough, I may use kmalloc(__GFP_NOWAIT) in irq work to
>> allocate multiple RCU callbacks.
> Pls dont. Just assume it will work, implement the proposal (if you agree),
> come back with the numbers and then we will discuss again.
> We cannot keep arguing about merits of complicated patch set that was done on partial data.
OK. Will do.
> Just like the whole thing with kthreads.
> I requested early on: "pls no kthreads" and weeks later we're still arguing.
Sorry about missing that part.
>
>>> - rcu_trace_implies_rcu_gp() is never used.
>>>
>>> - after RCU_GP __free_rcu() moves all waiting_for_gp elements into
>>> a size specific link list per bpf_mem_alloc (not per bpf_mem_cache which is per-cpu)
>>> and does call_rcu_tasks_trace
>>>
>>> - Let's call this list ma->free_by_rcu_tasks_trace
>>> (only one list for bpf_mem_alloc with known size or NUM_CACHES such lists when size == 0 at init)
>>>
>>> - any cpu alloc_bulk() can steal from size specific ma->free_by_rcu_tasks_trace list that
>>> is protected by ma->spin_lock (1 or NUM_CACHES such locks)
>> To reduce the lock contention, alloc_bulk() can steal from the global
>> list in batch.
> Pls no special batches. The simplest implementation possible.
> alloc_bulk() has 'int cnt' argument. It will try to steal 'cnt' from ma->free_by_rcu_tasks_trace.
I see. Will do.
>
>> Had tried the global list before but I didn't do the
>> concurrent freeing, I think it could reduce the risk of OOM for
>> add_del_on_diff_cpu.
> Maybe you've tried, but we didn't see the patches and we cannot take for granted
> anyone saying: "I've tried *foo*. It didn't work. That's why I'm doing *bar* here".
> Everything mm is tricky. Little details matter a lot.
OK. I think it will work. The reason I didn't post it is that I was
obsessed with lock-less bpf ma at that moment.
> It's also questionable whether we should make any design decisions based on this benchmark
> and in particular based on add_del_on_diff_cpu part of it.
> I'm not saying we shouldn't consider it, but all numbers have a "decision weight"
> associated with them.
I see. The reason for add_del_on_diff_cpu is just to complement the
possible use cases of bpf memory allocator.
> For example: there is existing samples/bpf/map_perf_test benchmark.
> So far we haven't seen the numbers from it.
> Is it more important than your new bench? Yes and no. All numbers matter.
Will post the benchmark result for map_perf_test in v4. Had planned to
migrate map_perf_test to selftests/bpf/benchs, but couldn't find enough
time to do that.
>
>>> - ma->waiting_for_gp_tasks_trace will be freeing elements into slab
>>>
>>> What it means that sleepable progs using hashmap will be able to avoid uaf with bpf_rcu_read_lock().
>>> Without explicit bpf_rcu_read_lock() it's still safe and equivalent to existing behavior of bpf_mem_alloc.
>>> (while your proposed BPF_MA_FREE_AFTER_RCU_GP flavor is not safe to use in hashtab with sleepable progs)
>>>
>>> After that we can unconditionally remove rcu_head/call_rcu from bpf_cpumask and improve usability of bpf_obj_drop.
>>> Probably usage of bpf_mem_alloc in local storage can be simplified as well.
>>> Martin wdyt?
>>>
>>> I think this approach adds minimal complexity to bpf_mem_alloc while solving all existing pain points
>>> including needs of qp-trie.
>> Thanks for these great suggestions. Will try to do it in v4.
> Thanks.
> Also for benchmark, pls don't hack htab and benchmark as 'non-landable patches' (as in this series).
> Construct the patch series as:
> - prep patches
> - benchmark
> - unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
> with numbers from bench(s) before and after this patch.
Thanks again for the suggestion. Will do in v4.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-05-04 2:30 ` Hou Tao
@ 2023-06-01 17:36 ` Alexei Starovoitov
2023-06-02 2:39 ` Hou Tao
0 siblings, 1 reply; 20+ messages in thread
From: Alexei Starovoitov @ 2023-06-01 17:36 UTC (permalink / raw)
To: Hou Tao
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Paul E . McKenney, rcu, Hou Tao
On Wed, May 3, 2023 at 7:30 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> > Construct the patch series as:
> > - prep patches
> > - benchmark
> > - unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
> > with numbers from bench(s) before and after this patch.
> Thanks again for the suggestion. Will do in v4.
It's been a month. Any update?
Should we take over this work if you're busy?
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-06-01 17:36 ` Alexei Starovoitov
@ 2023-06-02 2:39 ` Hou Tao
2023-06-02 16:25 ` Alexei Starovoitov
0 siblings, 1 reply; 20+ messages in thread
From: Hou Tao @ 2023-06-02 2:39 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Paul E . McKenney, rcu, Hou Tao
Hi,
On 6/2/2023 1:36 AM, Alexei Starovoitov wrote:
> On Wed, May 3, 2023 at 7:30 PM Hou Tao <houtao@huaweicloud.com> wrote:
>>> Construct the patch series as:
>>> - prep patches
>>> - benchmark
>>> - unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
>>> with numbers from bench(s) before and after this patch.
>> Thanks again for the suggestion. Will do in v4.
>
> It's been a month. Any update?
>
> Should we take over this work if you're busy?
Sorry for the delay. I should post some progress information about the
patch set early. The patch set is simpler compared with v3, I had
implemented v4 about two weeks ago. The problem is v4 don't work as
expected: its memory usage is huge compared with v3. The following is
the output from htab-mem benchmark:
overwrite:
Summary: loop 11.07 ± 1.25k/s, memory usage 995.08 ± 680.87MiB,
peak memory usage 2183.38MiB
batch_add_batch_del:
Summary: loop 11.48 ± 1.24k/s, memory usage 1393.36 ± 780.41MiB,
peak memory usage 2836.68MiB
add_del_on_diff_cpu:
Summary: loop 6.07 ± 0.69k/s, memory usage 14.44 ± 2.34MiB,
peak memory usage 20.30MiB
The direct reason for the huge memory usage is slower RCU grace period.
The RCU grace period used for reuse is much longer compared with v3 and
it is about 100ms or more (e.g., 2.6s). I am still trying to find out the
root cause of the slow RCU grace period. The first guess is the running
time of bpf program attached to getpgid() is longer, so the context
switch in bench is slowed down. The hist-diagram of getpgid() latency in
v4 indeed manifests a lot of abnormal tail latencies compared with v3 as
shown below.
v3 getpgid() latency during overwrite benchmark:
@hist_ms:
[0] 193451
|@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1] 767
| |
[2, 4) 75
| |
[4, 8) 1
| |
v4 getpgid() latency during overwrite benchmark:
@hist_ms:
[0] 86270
|@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1] 31252
|@@@@@@@@@@@@@@@@@@ |
[2, 4) 1
| |
[4, 8) 0
| |
[8, 16) 0
| |
[16, 32) 0
| |
[32, 64) 0
| |
[64, 128) 0
| |
[128, 256) 3
| |
[256, 512) 2
| |
[512, 1K) 1
| |
[1K, 2K) 2
| |
[2K, 4K) 1
| |
I think the newly-added global spin-lock in memory allocator and
irq-work running under the context of free procedure may lead to
abnormal tail latency and I am trying to demonstrate that by using
fine-grain locks and kworker (just temporarily). But on the other side,
considering the number of abnormal tail latency is much smaller compared
with the total number of getpgid() syscall, so I think maybe there is
still other causes for the slow RCU GP.
Because the progress of v4 is delayed, so how about I post v4 as soon as
possible for discussion (maybe I did it wrong) and at the same time I
continue to investigate the slow RCU grace period problem (I will try to
get some help from RCU community) ?
Regards,
Tao
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP
2023-06-02 2:39 ` Hou Tao
@ 2023-06-02 16:25 ` Alexei Starovoitov
0 siblings, 0 replies; 20+ messages in thread
From: Alexei Starovoitov @ 2023-06-02 16:25 UTC (permalink / raw)
To: Hou Tao
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Paul E . McKenney, rcu, Hou Tao
On Thu, Jun 1, 2023 at 7:40 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 6/2/2023 1:36 AM, Alexei Starovoitov wrote:
> > On Wed, May 3, 2023 at 7:30 PM Hou Tao <houtao@huaweicloud.com> wrote:
> >>> Construct the patch series as:
> >>> - prep patches
> >>> - benchmark
> >>> - unconditional convert of bpf_ma to REUSE_AFTER_rcu_GP_and_free_after_rcu_tasks_trace
> >>> with numbers from bench(s) before and after this patch.
> >> Thanks again for the suggestion. Will do in v4.
> >
> > It's been a month. Any update?
> >
> > Should we take over this work if you're busy?
> Sorry for the delay. I should post some progress information about the
> patch set early. The patch set is simpler compared with v3, I had
> implemented v4 about two weeks ago. The problem is v4 don't work as
> expected: its memory usage is huge compared with v3. The following is
> the output from htab-mem benchmark:
>
> overwrite:
> Summary: loop 11.07 ± 1.25k/s, memory usage 995.08 ± 680.87MiB,
> peak memory usage 2183.38MiB
> batch_add_batch_del:
> Summary: loop 11.48 ± 1.24k/s, memory usage 1393.36 ± 780.41MiB,
> peak memory usage 2836.68MiB
> add_del_on_diff_cpu:
> Summary: loop 6.07 ± 0.69k/s, memory usage 14.44 ± 2.34MiB,
> peak memory usage 20.30MiB
>
> The direct reason for the huge memory usage is slower RCU grace period.
> The RCU grace period used for reuse is much longer compared with v3 and
> it is about 100ms or more (e.g, 2.6s). I am still trying to find out the
> root cause of the slow RCU grace period. The first guest is the running
> time of bpf program attached to getpgid() is longer, so the context
> switch in bench is slowed down. The hist-diagram of getpgid() latency in
> v4 indeed manifests a lot of abnormal tail latencies compared with v3 as
> shown below.
>
> v3 getpid() latency during overwrite benchmark:
> @hist_ms:
> [0] 193451
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [1] 767
> | |
> [2, 4) 75
> | |
> [4, 8) 1
> | |
>
> v4 getpid() latency during overwrite benchmark:
> @hist_ms:
> [0] 86270
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [1] 31252
> |@@@@@@@@@@@@@@@@@@ |
> [2, 4) 1
> | |
> [4, 8) 0
> | |
> [8, 16) 0
> | |
> [16, 32) 0
> | |
> [32, 64) 0
> | |
> [64, 128) 0
> | |
> [128, 256) 3
> | |
> [256, 512) 2
> | |
> [512, 1K) 1
> | |
> [1K, 2K) 2
> | |
> [2K, 4K) 1
> | |
>
> I think the newly-added global spin-lock in memory allocator and
> irq-work running under the context of free procedure may lead to
> abnormal tail latency and I am trying to demonstrate that by using
> fine-grain locks and kworker (just temporarily). But on the other side,
> considering the number of abnormal tail latency is much smaller compared
> with the total number of getpgid() syscall, so I think maybe there is
> still other causes for the slow RCU GP.
>
> Because the progress of v4 is delayed, so how about I post v4 as soon as
> possible for discussion (maybe I did it wrong) and at the same time I
> continue to investigate the slow RCU grace period problem (I will try to
> get some help from RCU community) ?
Yes. Please send v4. Let's investigate huge memory consumption together.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [RFC bpf-next v3 4/6] bpf: Introduce BPF_MA_FREE_AFTER_RCU_GP
2023-04-29 10:12 [RFC bpf-next v3 0/6] Handle immediate reuse in bpf memory allocator Hou Tao
` (2 preceding siblings ...)
2023-04-29 10:12 ` [RFC bpf-next v3 3/6] bpf: Introduce BPF_MA_REUSE_AFTER_RCU_GP Hou Tao
@ 2023-04-29 10:12 ` Hou Tao
2023-04-29 10:12 ` [RFC bpf-next v3 5/6] bpf: Add two module parameters in htab for memory benchmark Hou Tao
2023-04-29 10:12 ` [RFC bpf-next v3 6/6] selftests/bpf: Add benchmark for bpf memory allocator Hou Tao
5 siblings, 0 replies; 20+ messages in thread
From: Hou Tao @ 2023-04-29 10:12 UTC (permalink / raw)
To: bpf, Martin KaFai Lau, Alexei Starovoitov
Cc: Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
From: Hou Tao <houtao1@huawei.com>
Beside REUSE_AFTER_RCU_GP, also introduce FREE_AFTER_RCU_GP to solve
the immediate reuse problem as well. Compared with REUSE_AFTER_RCU_GP,
the implementation of FREE_AFTER_RCU_GP is much simpler. It doesn't try
to reuse these freed elements after one RCU GP is passed, instead it
just directly frees these elements back to slab subsystem after one RCU
GP. The shortcoming of FREE_AFTER_RCU_GP is that sleep-able program must
access these elements by using bpf_rcu_read_{lock,unlock}, otherwise
there will be use-after-free problem.
To simplify the implementation, FREE_AFTER_RCU_GP uses a global per-cpu
free list to temporarily keep these freed elements and uses a per-cpu
kworker to dynamically allocate RCU callback to free these freed
elements when the number of freed elements is above the threshold.
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
include/linux/bpf_mem_alloc.h | 1 +
kernel/bpf/memalloc.c | 139 ++++++++++++++++++++++++++++++++++
2 files changed, 140 insertions(+)
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index e7f68432713b..61e8556208a2 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -19,6 +19,7 @@ struct bpf_mem_alloc {
enum {
BPF_MA_PERCPU = 1U << 0,
BPF_MA_REUSE_AFTER_RCU_GP = 1U << 1,
+ BPF_MA_FREE_AFTER_RCU_GP = 1U << 2,
};
/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 262100f89610..5f6a4f2cfd37 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -63,7 +63,26 @@ static u8 size_index[24] __ro_after_init = {
2 /* 192 */
};
+#define BPF_MA_FREE_TYPE_NR 2
+
+struct bpf_ma_free_ctx {
+ raw_spinlock_t lock;
+ int cpu;
+ local_t active;
+ /* For both no per-cpu and per-cpu */
+ struct llist_head to_free[BPF_MA_FREE_TYPE_NR];
+ unsigned int to_free_cnt[BPF_MA_FREE_TYPE_NR];
+ struct llist_head to_free_extra[BPF_MA_FREE_TYPE_NR];
+ struct delayed_work dwork;
+};
+
+struct bpf_free_batch {
+ struct rcu_head rcu;
+ struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+};
+
static struct workqueue_struct *bpf_ma_wq;
+static DEFINE_PER_CPU(struct bpf_ma_free_ctx, percpu_free_ctx);
static void bpf_ma_prepare_reuse_work(struct work_struct *work);
@@ -910,6 +929,112 @@ static void notrace immediate_reuse_free(struct bpf_mem_cache *c, struct llist_n
irq_work_raise(c);
}
+static void bpf_ma_batch_free_cb(struct rcu_head *rcu)
+{
+ struct bpf_free_batch *batch = container_of(rcu, struct bpf_free_batch, rcu);
+
+ free_all(batch->to_free[0], false);
+ free_all(batch->to_free[1], true);
+ kfree(batch);
+}
+
+static void bpf_ma_schedule_free_dwork(struct bpf_ma_free_ctx *ctx)
+{
+ long delay, left;
+ u64 to_free_cnt;
+
+ /* TODO: More reasonable threshold ? */
+ to_free_cnt = ctx->to_free_cnt[0] + ctx->to_free_cnt[1];
+ delay = to_free_cnt >= 256 ? 0 : HZ;
+ if (delayed_work_pending(&ctx->dwork)) {
+ left = ctx->dwork.timer.expires - jiffies;
+ if (delay < left)
+ mod_delayed_work(bpf_ma_wq, &ctx->dwork, delay);
+ return;
+ }
+ queue_delayed_work(bpf_ma_wq, &ctx->dwork, delay);
+}
+
+static void splice_llist(struct llist_head *llist, struct llist_node **head)
+{
+ struct llist_node *first, *last;
+
+ first = llist_del_all(llist);
+ if (!first)
+ return;
+
+ last = first;
+ while (last->next)
+ last = last->next;
+ last->next = *head;
+ *head = first;
+}
+
+static void bpf_ma_splice_to_free_list(struct bpf_ma_free_ctx *ctx, struct llist_node **to_free)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /* Might be interrupted by a NMI which invokes unit_free() */
+ if (ctx->cpu == smp_processor_id())
+ WARN_ON_ONCE(local_inc_return(&ctx->active) != 1);
+ raw_spin_lock(&ctx->lock);
+ to_free[0] = __llist_del_all(&ctx->to_free[0]);
+ to_free[1] = __llist_del_all(&ctx->to_free[1]);
+ ctx->to_free_cnt[0] = 0;
+ ctx->to_free_cnt[1] = 0;
+ raw_spin_unlock(&ctx->lock);
+ if (ctx->cpu == smp_processor_id())
+ local_dec(&ctx->active);
+ local_irq_restore(flags);
+
+ splice_llist(&ctx->to_free_extra[0], &to_free[0]);
+ splice_llist(&ctx->to_free_extra[1], &to_free[1]);
+}
+
+static void bpf_ma_free_dwork(struct work_struct *work)
+{
+ struct bpf_ma_free_ctx *ctx = container_of(to_delayed_work(work),
+ struct bpf_ma_free_ctx, dwork);
+ struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+ struct bpf_free_batch *batch;
+
+ bpf_ma_splice_to_free_list(ctx, to_free);
+
+ batch = kmalloc(sizeof(*batch), GFP_KERNEL);
+ if (!batch) {
+ synchronize_rcu_expedited();
+ free_all(to_free[0], false);
+ free_all(to_free[1], true);
+ return;
+ }
+
+ batch->to_free[0] = to_free[0];
+ batch->to_free[1] = to_free[1];
+ call_rcu(&batch->rcu, bpf_ma_batch_free_cb);
+}
+
+static void notrace wait_gp_direct_free(struct bpf_mem_cache *c, struct llist_node *llnode)
+{
+ bool percpu = !!c->percpu_size;
+ struct bpf_ma_free_ctx *ctx;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ctx = this_cpu_ptr(&percpu_free_ctx);
+ if (local_inc_return(&ctx->active) == 1) {
+ raw_spin_lock(&ctx->lock);
+ __llist_add(llnode, &ctx->to_free[percpu]);
+ ctx->to_free_cnt[percpu] += 1;
+ bpf_ma_schedule_free_dwork(ctx);
+ raw_spin_unlock(&ctx->lock);
+ } else {
+ llist_add(llnode, &ctx->to_free_extra[percpu]);
+ }
+ local_dec(&ctx->active);
+ local_irq_restore(flags);
+}
+
static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
struct llist_node *llnode = ptr - LLIST_NODE_SZ;
@@ -918,6 +1043,8 @@ static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP)
wait_gp_reuse_free(c, llnode);
+ else if (c->flags & BPF_MA_FREE_AFTER_RCU_GP)
+ wait_gp_direct_free(c, llnode);
else
immediate_reuse_free(c, llnode);
}
@@ -1016,8 +1143,20 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
static int __init bpf_ma_init(void)
{
+ int cpu;
+
bpf_ma_wq = alloc_workqueue("bpf_ma", WQ_MEM_RECLAIM, 0);
BUG_ON(!bpf_ma_wq);
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_ma_free_ctx *ctx;
+
+ ctx = per_cpu_ptr(&percpu_free_ctx, cpu);
+ raw_spin_lock_init(&ctx->lock);
+ ctx->cpu = cpu;
+ INIT_DELAYED_WORK(&ctx->dwork, bpf_ma_free_dwork);
+ }
+
return 0;
}
late_initcall(bpf_ma_init);
--
2.29.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC bpf-next v3 5/6] bpf: Add two module parameters in htab for memory benchmark
2023-04-29 10:12 [RFC bpf-next v3 0/6] Handle immediate reuse in bpf memory allocator Hou Tao
` (3 preceding siblings ...)
2023-04-29 10:12 ` [RFC bpf-next v3 4/6] bpf: Introduce BPF_MA_FREE_AFTER_RCU_GP Hou Tao
@ 2023-04-29 10:12 ` Hou Tao
2023-04-29 10:12 ` [RFC bpf-next v3 6/6] selftests/bpf: Add benchmark for bpf memory allocator Hou Tao
5 siblings, 0 replies; 20+ messages in thread
From: Hou Tao @ 2023-04-29 10:12 UTC (permalink / raw)
To: bpf, Martin KaFai Lau, Alexei Starovoitov
Cc: Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
From: Hou Tao <houtao1@huawei.com>
Add two module parameters in htab:
* reuse_flag: possible values are 0, 2 (REUSE_AFTER_RCU_GP) or
4 (FREE_AFTER_RCU_GP). The default value is 0 and this creates
a hash map which does immediate reuse.
* delayed_free: possible values are 0, 1. The default value is 0 and
the hash map will call bpf_mem_cache_free() directly. If the value
is 1, the hash map will call bpf_mem_cache_free() after one RCU GP
which mimics the free of bpf_cpumask.
These two module parameters are used for benchmarking purpose only and
are not intended for merging.
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
kernel/bpf/hashtab.c | 40 +++++++++++++++++++++++++++++++++-------
1 file changed, 33 insertions(+), 7 deletions(-)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 93009b94ac9b..8502957b8bcc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -100,6 +100,7 @@ struct bpf_htab {
struct percpu_counter pcount;
atomic_t count;
bool use_percpu_counter;
+ bool delayed_free;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
@@ -120,14 +121,24 @@ struct htab_elem {
};
};
union {
- /* pointer to per-cpu pointer */
- void *ptr_to_pptr;
+ struct {
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
+ struct bpf_mem_alloc *ma;
+ struct rcu_head rcu;
+ };
struct bpf_lru_node lru_node;
};
u32 hash;
char key[] __aligned(8);
};
+static int reuse_flag;
+module_param(reuse_flag, int, 0644);
+
+static bool delayed_free;
+module_param(delayed_free, bool, 0644);
+
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
@@ -539,6 +550,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab_init_buckets(htab);
+ htab->delayed_free = delayed_free;
/* compute_batch_value() computes batch value as num_online_cpus() * 2
* and __percpu_counter_compare() needs
* htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
@@ -576,7 +588,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_prealloc;
}
} else {
- err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, 0);
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, reuse_flag);
if (err)
goto free_map_locked;
if (percpu) {
@@ -878,12 +890,24 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return -ENOENT;
}
-static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
+static void htab_elem_free_rcu(struct rcu_head *rcu)
+{
+ struct htab_elem *l = container_of(rcu, struct htab_elem, rcu);
+
+ bpf_mem_cache_free(l->ma, l);
+}
+
+static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l, bool destroy)
{
check_and_free_fields(htab, l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
- bpf_mem_cache_free(&htab->ma, l);
+ if (destroy || !htab->delayed_free) {
+ bpf_mem_cache_free(&htab->ma, l);
+ return;
+ }
+ l->ma = &htab->ma;
+ call_rcu(&l->rcu, htab_elem_free_rcu);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -931,7 +955,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
dec_elem_count(htab);
- htab_elem_free(htab, l);
+ htab_elem_free(htab, l, false);
}
}
@@ -1468,7 +1492,7 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
hlist_nulls_del_rcu(&l->hash_node);
- htab_elem_free(htab, l);
+ htab_elem_free(htab, l, true);
}
}
migrate_enable();
@@ -1522,6 +1546,8 @@ static void htab_map_free(struct bpf_map *map)
* during bpf_mem_alloc_destroy().
*/
if (!htab_is_prealloc(htab)) {
+ if (htab->delayed_free)
+ rcu_barrier();
delete_all_elements(htab);
} else {
htab_free_prealloced_fields(htab);
--
2.29.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [RFC bpf-next v3 6/6] selftests/bpf: Add benchmark for bpf memory allocator
2023-04-29 10:12 [RFC bpf-next v3 0/6] Handle immediate reuse in bpf memory allocator Hou Tao
` (4 preceding siblings ...)
2023-04-29 10:12 ` [RFC bpf-next v3 5/6] bpf: Add two module parameters in htab for memory benchmark Hou Tao
@ 2023-04-29 10:12 ` Hou Tao
5 siblings, 0 replies; 20+ messages in thread
From: Hou Tao @ 2023-04-29 10:12 UTC (permalink / raw)
To: bpf, Martin KaFai Lau, Alexei Starovoitov
Cc: Andrii Nakryiko, Song Liu, Hao Luo, Yonghong Song,
Daniel Borkmann, KP Singh, Stanislav Fomichev, Jiri Olsa,
John Fastabend, Paul E . McKenney, rcu, houtao1
From: Hou Tao <houtao1@huawei.com>
The benchmark could be used to compare the performance of hash map
operations and the memory usage between different flavors of bpf memory
allocator (e.g., no bpf ma vs bpf ma vs reuse-after-gp bpf ma). It also
could be used to check the performance improvement or the memory saving
provided by optimization.
The benchmark creates a non-preallocated hash map which uses bpf memory
allocator and shows the operation performance and the memory usage of
the hash map under different use cases:
(1) no_op
Only create the hash map and there is no operations on hash map. It is
used as the baseline. When each CPU completes the iteration of
nr_entries / nr_threads elements in hash map, the loop count is
increased.
(2) overwrite
Each CPU overwrites nonoverlapping part of hash map. When each CPU
completes overwriting of nr_entries / nr_threads elements in hash map,
the loop count is increased.
(3) batch_add_batch_del
Each CPU adds then deletes nonoverlapping part of hash map in batch.
When each CPU adds and deletes nr_entries / nr_threads elements in hash
map, the loop count is increased.
(4) add_del_on_diff_cpu
Each two CPUs add and delete nonoverlapping parts of the hash map cooperatively.
When each CPU adds and deletes nr_entries / nr_threads * 2 elements in
hash map, the loop count is increased twice.
The following are the benchmark results when comparing between different
flavors of bpf memory allocator. These tests are conducted on a KVM guest
with 8 CPUs and 16 GB memory. The command line below is used to do all
the following benchmarks:
./bench htab-mem --use-case $name --max-entries 16384 ${OPTS} \
--full 50 -d 9 --producers=8 --prod-affinity=0-7
These results show:
* preallocated case has both better performance and better memory
efficiency.
* normal bpf memory allocator doesn't handle add_del_on_diff_cpu very well. The larger
memory is due to the slow tasks trace RCU grace period.
* free-after-rcu-gp has fewer memory usage compared with
reuse-after-rcu-gp, but its performance is worse than
reuse-after-rcu-gp. Both free-after-rcu-gp and reuse-after-rcu-gp have
larger memory usage than normal bpf memory allocator due to the delay
of reuse.
* for extra call_rcu + bpf_memory_allocator, its memory usage is much
bigger than free-after-rcu-gp and reuse-after-rcu-gp.
(1) non-preallocated + no bpf memory allocator (v6.0.19)
use kmalloc() + call_rcu
| name | loop (k/s) | average memory (MiB) | peak memory (MiB) |
| -- | -- | -- | -- |
| no_op | 1214.42 | 0.92 | 0.92 |
| overwrite | 3.21 | 40.47 | 67.98 |
| batch_add_batch_del | 2.32 | 24.31 | 49.33 |
| add_del_on_diff_cpu | 2.92 | 4.03 | 6.00 |
(2) preallocated
OPTS=--preallocated
| name | loop (k/s) | average memory (MiB) | peak memory (MiB) |
| -- | -- | -- | -- |
| no_op | 1156.59 | 1.88 | 1.88 |
| overwrite | 36.19 | 1.88 | 1.88 |
| batch_add_batch_del | 22.27 | 1.88 | 1.88 |
| add_del_on_diff_cpu | 4.68 | 1.95 | 2.05 |
(3) normal bpf memory allocator
echo 0 > /sys/module/hashtab/parameters/reuse_flag
echo 0 > /sys/module/hashtab/parameters/delayed_free
| name | loop (k/s) | average memory (MiB) | peak memory (MiB) |
| -- | -- | -- | -- |
| no_op | 1273.55 | 0.98 | 0.98 |
| overwrite | 26.57 | 2.59 | 2.74 |
| batch_add_batch_del | 11.13 | 2.59 | 2.99 |
| add_del_on_diff_cpu | 3.72 | 15.15 | 26.04 |
(4) reuse-after-rcu-gp bpf memory allocator
echo 2 > /sys/module/hashtab/parameters/reuse_flag
echo 0 > /sys/module/hashtab/parameters/delayed_free
| name | loop (k/s) | average memory (MiB) | peak memory (MiB) |
| -- | -- | -- | -- |
| no_op | 1199.16 | 0.97 | 0.99 |
| overwrite | 16.37 | 24.01 | 31.76 |
| batch_add_batch_del | 9.61 | 16.71 | 19.95 |
| add_del_on_diff_cpu | 3.62 | 22.93 | 37.02 |
(5) free-after-rcu-gp bpf memory allocator
echo 4 > /sys/module/hashtab/parameters/reuse_flag
echo 0 > /sys/module/hashtab/parameters/delayed_free
| name | loop (k/s) | average memory (MiB) | peak memory (MiB) |
| -- | -- | -- | -- |
| no_op | 1274.59 | 0.99 | 0.99 |
| overwrite | 11.02 | 13.48 | 21.85 |
| batch_add_batch_del | 7.43 | 10.58 | 16.14 |
| add_del_on_diff_cpu | 3.15 | 6.36 | 9.65 |
(6) extra call_rcu + bpf memory allocator
echo 0 > /sys/module/hashtab/parameters/reuse_flag
echo 1 > /sys/module/hashtab/parameters/delayed_free
| name | loop (k/s) | average memory (MiB) | peak memory (MiB) |
| -- | -- | -- | -- |
| no_op | 1276.85 | 0.99 | 0.99 |
| overwrite | 12.57 | 372.01 | 676.56 |
| batch_add_batch_del | 9.31 | 276.14 | 431.04 |
| add_del_on_diff_cpu | 3.29 | 18.73 | 35.13 |
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
tools/testing/selftests/bpf/Makefile | 3 +
tools/testing/selftests/bpf/bench.c | 4 +
.../selftests/bpf/benchs/bench_htab_mem.c | 352 ++++++++++++++++++
.../bpf/benchs/run_bench_htab_mem.sh | 64 ++++
.../selftests/bpf/progs/htab_mem_bench.c | 135 +++++++
5 files changed, 558 insertions(+)
create mode 100644 tools/testing/selftests/bpf/benchs/bench_htab_mem.c
create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_htab_mem.sh
create mode 100644 tools/testing/selftests/bpf/progs/htab_mem_bench.c
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7a196a87152c..b5baa23dfb98 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -647,11 +647,13 @@ $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h
$(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h
$(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.skel.h
$(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h
+$(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
$(TESTING_HELPERS) \
$(TRACE_HELPERS) \
+ $(CGROUP_HELPERS) \
$(OUTPUT)/bench_count.o \
$(OUTPUT)/bench_rename.o \
$(OUTPUT)/bench_trigger.o \
@@ -664,6 +666,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
$(OUTPUT)/bench_local_storage_rcu_tasks_trace.o \
$(OUTPUT)/bench_bpf_hashmap_lookup.o \
$(OUTPUT)/bench_local_storage_create.o \
+ $(OUTPUT)/bench_htab_mem.o \
#
$(call msg,BINARY,,$@)
$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index d9c080ac1796..d3d9ae321b74 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -279,6 +279,7 @@ extern struct argp bench_local_storage_rcu_tasks_trace_argp;
extern struct argp bench_strncmp_argp;
extern struct argp bench_hashmap_lookup_argp;
extern struct argp bench_local_storage_create_argp;
+extern struct argp bench_htab_mem_argp;
static const struct argp_child bench_parsers[] = {
{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -290,6 +291,7 @@ static const struct argp_child bench_parsers[] = {
"local_storage RCU Tasks Trace slowdown benchmark", 0 },
{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
+ { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
{},
};
@@ -518,6 +520,7 @@ extern const struct bench bench_local_storage_cache_hashmap_control;
extern const struct bench bench_local_storage_tasks_trace;
extern const struct bench bench_bpf_hashmap_lookup;
extern const struct bench bench_local_storage_create;
+extern const struct bench bench_htab_mem;
static const struct bench *benchs[] = {
&bench_count_global,
@@ -559,6 +562,7 @@ static const struct bench *benchs[] = {
&bench_local_storage_tasks_trace,
&bench_bpf_hashmap_lookup,
&bench_local_storage_create,
+ &bench_htab_mem,
};
static void find_benchmark(void)
diff --git a/tools/testing/selftests/bpf/benchs/bench_htab_mem.c b/tools/testing/selftests/bpf/benchs/bench_htab_mem.c
new file mode 100644
index 000000000000..f0c2505c0868
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_htab_mem.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <argp.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "bench.h"
+#include "cgroup_helpers.h"
+#include "htab_mem_bench.skel.h"
+
+static struct htab_mem_ctx {
+ struct htab_mem_bench *skel;
+ pthread_barrier_t *notify;
+ int fd;
+ bool do_notify_wait;
+} ctx;
+
+static struct htab_mem_args {
+ u32 max_entries;
+ u32 value_size;
+ u32 full;
+ const char *use_case;
+ bool preallocated;
+} args = {
+ .max_entries = 16384,
+ .full = 50,
+ .value_size = 8,
+ .use_case = "overwrite",
+ .preallocated = false,
+};
+
+enum {
+ ARG_MAX_ENTRIES = 10000,
+ ARG_FULL_PERCENT = 10001,
+ ARG_VALUE_SIZE = 10002,
+ ARG_USE_CASE = 10003,
+ ARG_PREALLOCATED = 10004,
+};
+
+static const struct argp_option opts[] = {
+ { "max-entries", ARG_MAX_ENTRIES, "MAX_ENTRIES", 0,
+ "Set the max entries of hash map (default 16384)" },
+ { "full", ARG_FULL_PERCENT, "FULL", 0,
+ "Set the full percent of hash map (default 50)" },
+ { "value-size", ARG_VALUE_SIZE, "VALUE_SIZE", 0,
+ "Set the value size of hash map (default 8)" },
+ { "use-case", ARG_USE_CASE, "USE_CASE", 0,
+ "Set the use case of hash map: no_op|overwrite|batch_add_batch_del|add_del_on_diff_cpu" },
+ { "preallocated", ARG_PREALLOCATED, NULL, 0, "use preallocated hash map" },
+ {},
+};
+
+static error_t htab_mem_parse_arg(int key, char *arg, struct argp_state *state)
+{
+ switch (key) {
+ case ARG_MAX_ENTRIES:
+ args.max_entries = strtoul(arg, NULL, 10);
+ break;
+ case ARG_FULL_PERCENT:
+ args.full = strtoul(arg, NULL, 10);
+ if (!args.full || args.full > 100) {
+ fprintf(stderr, "invalid full percent %u\n", args.full);
+ argp_usage(state);
+ }
+ break;
+ case ARG_VALUE_SIZE:
+ args.value_size = strtoul(arg, NULL, 10);
+ if (args.value_size > 4096) {
+ fprintf(stderr, "too big value size %u\n", args.value_size);
+ argp_usage(state);
+ }
+ break;
+ case ARG_USE_CASE:
+ args.use_case = strdup(arg);
+ break;
+ case ARG_PREALLOCATED:
+ args.preallocated = true;
+ break;
+ default:
+ return ARGP_ERR_UNKNOWN;
+ }
+
+ return 0;
+}
+
+const struct argp bench_htab_mem_argp = {
+ .options = opts,
+ .parser = htab_mem_parse_arg,
+};
+
+static void htab_mem_validate(void)
+{
+ if (env.consumer_cnt != 1) {
+ fprintf(stderr, "htab mem benchmark doesn't support multi-consumer!\n");
+ exit(1);
+ }
+}
+
+static int setup_and_join_cgroup(const char *path)
+{
+ int err, fd;
+
+ err = setup_cgroup_environment();
+ if (err) {
+ fprintf(stderr, "setup cgroup env failed\n");
+ return -1;
+ }
+
+ err = create_and_get_cgroup(path);
+ if (err < 0) {
+ fprintf(stderr, "create cgroup %s failed\n", path);
+ goto out;
+ }
+ fd = err;
+
+ err = join_cgroup(path);
+ if (err) {
+ fprintf(stderr, "join cgroup %s failed\n", path);
+ close(fd);
+ goto out;
+ }
+
+ return fd;
+out:
+ cleanup_cgroup_environment();
+ return -1;
+}
+
+static int htab_mem_bench_init_barriers(void)
+{
+ unsigned int i, nr = (env.producer_cnt + 1) / 2;
+ pthread_barrier_t *barriers;
+
+ barriers = calloc(nr, sizeof(*barriers));
+ if (!barriers)
+ return -1;
+
+ /* Used for synchronization between two threads */
+ for (i = 0; i < nr; i++)
+ pthread_barrier_init(&barriers[i], NULL, 2);
+
+ ctx.notify = barriers;
+ return 0;
+}
+
+static void htab_mem_bench_exit_barriers(void)
+{
+ unsigned int i, nr;
+
+ if (!ctx.notify)
+ return;
+
+ nr = (env.producer_cnt + 1) / 2;
+ for (i = 0; i < nr; i++)
+ pthread_barrier_destroy(&ctx.notify[i]);
+ free(ctx.notify);
+}
+
+static void htab_mem_setup(void)
+{
+ struct bpf_program *prog;
+ struct bpf_map *map;
+ int err;
+
+ setup_libbpf();
+
+ err = setup_and_join_cgroup("/htab_mem");
+ if (err < 0)
+ exit(1);
+ ctx.fd = err;
+
+ ctx.skel = htab_mem_bench__open();
+ if (!ctx.skel) {
+ fprintf(stderr, "failed to open skeleton\n");
+ goto cleanup;
+ }
+
+ err = htab_mem_bench_init_barriers();
+ if (err) {
+ fprintf(stderr, "failed to init barrier\n");
+ goto cleanup;
+ }
+
+ map = ctx.skel->maps.htab;
+ bpf_map__set_max_entries(map, args.max_entries);
+ bpf_map__set_value_size(map, args.value_size);
+ if (args.preallocated)
+ bpf_map__set_map_flags(map, bpf_map__map_flags(map) & ~BPF_F_NO_PREALLOC);
+
+ /* Do synchronization between addition thread and deletion thread */
+ if (!strcmp("add_del_on_diff_cpu", args.use_case))
+ ctx.do_notify_wait = true;
+
+ prog = bpf_object__find_program_by_name(ctx.skel->obj, args.use_case);
+ if (!prog) {
+ fprintf(stderr, "no such use-case: %s\n", args.use_case);
+ fprintf(stderr, "available use case:");
+ bpf_object__for_each_program(prog, ctx.skel->obj)
+ fprintf(stderr, " %s", bpf_program__name(prog));
+ fprintf(stderr, "\n");
+ goto cleanup;
+ }
+ bpf_program__set_autoload(prog, true);
+
+ ctx.skel->bss->nr_thread = env.producer_cnt;
+ ctx.skel->bss->nr_entries = (uint64_t)args.max_entries * args.full / 100;
+
+ err = htab_mem_bench__load(ctx.skel);
+ if (err) {
+ fprintf(stderr, "failed to load skeleton\n");
+ goto cleanup;
+ }
+ err = htab_mem_bench__attach(ctx.skel);
+ if (err) {
+ fprintf(stderr, "failed to attach skeleton\n");
+ goto cleanup;
+ }
+ return;
+cleanup:
+ close(ctx.fd);
+ cleanup_cgroup_environment();
+ htab_mem_bench_exit_barriers();
+ htab_mem_bench__destroy(ctx.skel);
+ exit(1);
+}
+
+static void htab_mem_notify_wait_producer(pthread_barrier_t *notify)
+{
+ while (true) {
+ (void)syscall(__NR_getpgid);
+ /* Notify for start */
+ pthread_barrier_wait(notify);
+ /* Wait for completion */
+ pthread_barrier_wait(notify);
+ }
+}
+
+static void htab_mem_wait_notify_producer(pthread_barrier_t *notify)
+{
+ while (true) {
+ /* Wait for start */
+ pthread_barrier_wait(notify);
+ (void)syscall(__NR_getpgid);
+ /* Notify for completion */
+ pthread_barrier_wait(notify);
+ }
+}
+
+static void *htab_mem_producer(void *arg)
+{
+ pthread_barrier_t *notify;
+ int seq;
+
+ if (!ctx.do_notify_wait) {
+ while (true)
+ (void)syscall(__NR_getpgid);
+ return NULL;
+ }
+
+ seq = (long)arg;
+ notify = &ctx.notify[seq / 2];
+ if (seq & 1)
+ htab_mem_notify_wait_producer(notify);
+ else
+ htab_mem_wait_notify_producer(notify);
+ return NULL;
+}
+
+static void *htab_mem_consumer(void *arg)
+{
+ return NULL;
+}
+
+static void htab_mem_read_mem_cgrp_file(const char *name, unsigned long *value)
+{
+	char buf[32];
+	int fd, len;
+
+	fd = openat(ctx.fd, name, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "no %s\n", name);
+		*value = 0;
+		return;
+	}
+
+	len = read(fd, buf, sizeof(buf) - 1);
+	buf[len > 0 ? len : 0] = 0;
+	*value = strtoull(buf, NULL, 0);
+
+ close(fd);
+}
+
+static void htab_mem_measure(struct bench_res *res)
+{
+ res->hits = atomic_swap(&ctx.skel->bss->loop_cnt, 0);
+ htab_mem_read_mem_cgrp_file("memory.current", &res->gp_ct);
+}
+
+static void htab_mem_report_progress(int iter, struct bench_res *res, long delta_ns)
+{
+ double loop, mem;
+
+ loop = res->hits / 1000.0 / (delta_ns / 1000000000.0);
+ mem = res->gp_ct / 1048576.0;
+ printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0);
+ printf("loop %7.2lfk/s, memory usage %7.2lfMiB\n", loop, mem);
+}
+
+static void htab_mem_report_final(struct bench_res res[], int res_cnt)
+{
+ double mem_mean = 0.0, mem_stddev = 0.0;
+ double loop_mean = 0.0, loop_stddev = 0.0;
+ unsigned long peak_mem;
+ int i;
+
+ for (i = 0; i < res_cnt; i++) {
+ loop_mean += res[i].hits / 1000.0 / (0.0 + res_cnt);
+ mem_mean += res[i].gp_ct / 1048576.0 / (0.0 + res_cnt);
+ }
+ if (res_cnt > 1) {
+ for (i = 0; i < res_cnt; i++) {
+ loop_stddev += (loop_mean - res[i].hits / 1000.0) *
+ (loop_mean - res[i].hits / 1000.0) /
+ (res_cnt - 1.0);
+ mem_stddev += (mem_mean - res[i].gp_ct / 1048576.0) *
+ (mem_mean - res[i].gp_ct / 1048576.0) /
+ (res_cnt - 1.0);
+ }
+ loop_stddev = sqrt(loop_stddev);
+ mem_stddev = sqrt(mem_stddev);
+ }
+
+ htab_mem_read_mem_cgrp_file("memory.peak", &peak_mem);
+ printf("Summary: loop %7.2lf \u00B1 %7.2lfk/s, memory usage %7.2lf \u00B1 %7.2lfMiB, "
+ "peak memory usage %7.2lfMiB\n",
+ loop_mean, loop_stddev, mem_mean, mem_stddev, peak_mem / 1048576.0);
+}
+
+const struct bench bench_htab_mem = {
+ .name = "htab-mem",
+ .argp = &bench_htab_mem_argp,
+ .validate = htab_mem_validate,
+ .setup = htab_mem_setup,
+ .producer_thread = htab_mem_producer,
+ .consumer_thread = htab_mem_consumer,
+ .measure = htab_mem_measure,
+ .report_progress = htab_mem_report_progress,
+ .report_final = htab_mem_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_htab_mem.sh b/tools/testing/selftests/bpf/benchs/run_bench_htab_mem.sh
new file mode 100755
index 000000000000..8f9458889a17
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_htab_mem.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./benchs/run_common.sh
+
+set -eufo pipefail
+
+DELAYED=/sys/module/hashtab/parameters/delayed_free
+REUSE=/sys/module/hashtab/parameters/reuse_flag
+
+htab_mem()
+{
+ echo -n "loop : "
+ echo -n "$*" | sed -E "s/.* loop\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+k\/s).*/\1/"
+ echo -n -e ", avg mem: "
+ echo -n "$*" | sed -E "s/.* memory usage\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+MiB).*/\1/"
+ echo -n ", peak mem: "
+ echo "$*" | sed -E "s/.* peak memory usage\s+([0-9]+\.[0-9]+MiB).*/\1/"
+}
+
+summarize_htab_mem()
+{
+ local bench="$1"
+	local summary=$(echo "$2" | tail -n1)
+
+ printf "%-20s %s\n" "$bench" "$(htab_mem $summary)"
+}
+
+htab_mem_bench()
+{
+ local name
+
+ for name in no_op overwrite batch_add_batch_del add_del_on_diff_cpu
+ do
+ summarize_htab_mem "$name" "$(sudo ./bench htab-mem --use-case $name \
+ --max-entries 16384 --full 50 -d 1 \
+ --producers=8 --prod-affinity=0-7 "$@")"
+ done
+}
+
+header "preallocated"
+echo 0 > $DELAYED
+echo 0 > $REUSE
+htab_mem_bench "--preallocated"
+
+header "normal bpf ma"
+echo 0 > $DELAYED
+echo 0 > $REUSE
+htab_mem_bench
+
+header "reuse-after-rcu-gp bpf ma"
+echo 0 > $DELAYED
+echo 2 > $REUSE
+htab_mem_bench
+
+header "free-after-rcu-gp bpf ma"
+echo 0 > $DELAYED
+echo 4 > $REUSE
+htab_mem_bench
+
+header "extra call_rcu + normal bpf ma"
+echo 1 > $DELAYED
+echo 0 > $REUSE
+htab_mem_bench
diff --git a/tools/testing/selftests/bpf/progs/htab_mem_bench.c b/tools/testing/selftests/bpf/progs/htab_mem_bench.c
new file mode 100644
index 000000000000..fe5c0edb262e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/htab_mem_bench.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <stdbool.h>
+#include <errno.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct update_ctx {
+ unsigned int from;
+ unsigned int step;
+ unsigned int max;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, 4);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+} htab SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+unsigned char zeroed_value[4096];
+unsigned int nr_entries = 0;
+unsigned int nr_thread = 0;
+long loop_cnt = 0;
+
+static int noop_htab(unsigned int i, struct update_ctx *ctx)
+{
+ if (ctx->from >= ctx->max)
+ return 1;
+
+ ctx->from += ctx->step;
+ return 0;
+}
+
+static int write_htab(unsigned int i, struct update_ctx *ctx, unsigned int flags)
+{
+ if (ctx->from >= ctx->max)
+ return 1;
+
+ bpf_map_update_elem(&htab, &ctx->from, zeroed_value, flags);
+ ctx->from += ctx->step;
+
+ return 0;
+}
+
+static int overwrite_htab(unsigned int i, struct update_ctx *ctx)
+{
+ return write_htab(i, ctx, 0);
+}
+
+static int newwrite_htab(unsigned int i, struct update_ctx *ctx)
+{
+ return write_htab(i, ctx, BPF_NOEXIST);
+}
+
+static int del_htab(unsigned int i, struct update_ctx *ctx)
+{
+ if (ctx->from >= ctx->max)
+ return 1;
+
+ bpf_map_delete_elem(&htab, &ctx->from);
+ ctx->from += ctx->step;
+
+ return 0;
+}
+
+SEC("?tp/syscalls/sys_enter_getpgid")
+int no_op(void *ctx)
+{
+ struct update_ctx update;
+
+ update.from = bpf_get_smp_processor_id();
+ update.step = nr_thread;
+ update.max = nr_entries;
+ bpf_loop(update.max, noop_htab, &update, 0);
+ __sync_fetch_and_add(&loop_cnt, 1);
+
+ return 0;
+}
+
+SEC("?tp/syscalls/sys_enter_getpgid")
+int overwrite(void *ctx)
+{
+ struct update_ctx update;
+
+ update.from = bpf_get_smp_processor_id();
+ update.step = nr_thread;
+ update.max = nr_entries;
+ bpf_loop(update.max, overwrite_htab, &update, 0);
+ /* Increase when nr_entries / nr_thread elements are deleted and then added */
+ __sync_fetch_and_add(&loop_cnt, 1);
+ return 0;
+}
+
+SEC("?tp/syscalls/sys_enter_getpgid")
+int batch_add_batch_del(void *ctx)
+{
+ struct update_ctx update;
+
+ update.from = bpf_get_smp_processor_id();
+ update.step = nr_thread;
+ update.max = nr_entries;
+ bpf_loop(update.max, overwrite_htab, &update, 0);
+
+ update.from = bpf_get_smp_processor_id();
+ bpf_loop(update.max, del_htab, &update, 0);
+
+ /* Increase when nr_entries / nr_thread elements are added and then deleted */
+ __sync_fetch_and_add(&loop_cnt, 1);
+ return 0;
+}
+
+SEC("?tp/syscalls/sys_enter_getpgid")
+int add_del_on_diff_cpu(void *ctx)
+{
+ struct update_ctx update;
+ unsigned int from;
+
+ from = bpf_get_smp_processor_id();
+ update.from = from / 2;
+ update.step = nr_thread / 2;
+ update.max = nr_entries;
+
+ if (from & 1)
+ bpf_loop(update.max, newwrite_htab, &update, 0);
+ else
+ bpf_loop(update.max, del_htab, &update, 0);
+
+ /* Increase when nr_entries / nr_thread * 2 elements are added or deleted */
+ __sync_fetch_and_add(&loop_cnt, 1);
+ return 0;
+}
--
2.29.2
^ permalink raw reply related [flat|nested] 20+ messages in thread