BPF Archive on lore.kernel.org
 help / color / Atom feed
* [RFC Patch bpf-next] bpf: introduce bpf timer
@ 2021-04-01  4:26 Cong Wang
  2021-04-01  6:38 ` Song Liu
  2021-04-02 19:28 ` Alexei Starovoitov
  0 siblings, 2 replies; 25+ messages in thread
From: Cong Wang @ 2021-04-01  4:26 UTC (permalink / raw)
  To: netdev
  Cc: bpf, duanxiongchun, wangdongdong.6, songmuchun, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song

From: Cong Wang <cong.wang@bytedance.com>

(This patch is still in early stage and obviously incomplete. I am sending
it out to get some high-level feedbacks. Please kindly ignore any coding
details for now and focus on the design.)

This patch introduces a bpf timer map and a syscall to create bpf timer
from user-space.

The reason why we have to use a map is because the lifetime of a timer,
without a map, we have to delete the timer before exiting the eBPF program,
this would significately limit its use cases. With a map, the timer can
stay as long as the map itself and can be actually updated via map update
API's too, where the key is the timer ID and the value is the timer expire
timer.

Timer creation is not easy either. In order to prevent users creating a
timer but not adding it to a map, we have to enforce this in the API which
takes a map parameter and adds the new timer into the map in one shot.

And because timer is asynchronous, we can not just use its callback like
bpf_for_each_map_elem(). More importantly, we have to properly reference
count its struct bpf_prog too. It seems impossible to do this either in
verifier or in JIT, so we have to make its callback code a separate eBPF
program and pass a program fd from user-space. Fortunately, timer callback
can still live in the same object file with the rest eBPF code and share
data too.

Here is a quick demo of the timer callback code:

static __u64
check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
                  int *data)
{
  u64 expires = *val;

  if (expires < bpf_jiffies64()) {
    bpf_map_delete_elem(map, key);
    *data++;
  }
  return 0;
}

SEC("timer")
u32 timer_callback(void)
{
  int count = 0;

  bpf_for_each_map_elem(&map, check_expired_elem, &count, 0);
  if (count)
     return 0; // not re-arm this timer
  else
     return 10; // reschedule this timer after 10 jiffies
}

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/bpf.h       |   2 +
 include/linux/bpf_types.h |   1 +
 include/uapi/linux/bpf.h  |  15 +++
 kernel/bpf/Makefile       |   2 +-
 kernel/bpf/syscall.c      |  16 +++
 kernel/bpf/timer.c        | 238 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c     |   6 +
 7 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/timer.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9fdd839b418c..196e8f2f8c12 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2078,4 +2078,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 
+int bpf_timer_create(union bpf_attr *attr);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index f883f01a5061..9e3afd2dbfc6 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -133,3 +133,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
 #ifdef CONFIG_NET
 BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_TIMER, timer_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 598716742593..627c0fbf9dac 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -841,6 +841,7 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TIMER_CREATE,
 };
 
 enum bpf_map_type {
@@ -874,6 +875,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_RINGBUF,
 	BPF_MAP_TYPE_INODE_STORAGE,
 	BPF_MAP_TYPE_TASK_STORAGE,
+	BPF_MAP_TYPE_TIMER,
 };
 
 /* Note that tracing related programs such as
@@ -916,6 +918,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_EXT,
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
+	BPF_PROG_TYPE_TIMER,
 };
 
 enum bpf_attach_type {
@@ -1436,6 +1439,12 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TIMER_CREATE command */
+		__u32		map_fd;
+		__u32		prog_fd;
+		__u32		flags;		/* timer flags */
+	} timer_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -6013,4 +6022,10 @@ enum {
 	BTF_F_ZERO	=	(1ULL << 3),
 };
 
+/* bpf timer flags */
+enum {
+	BTF_TIMER_F_DEFERRABLE	= (1ULL << 0),
+	BTF_TIMER_F_PINNED	= (1ULL << 1),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7f33098ca63f..0215bfd1bcea 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,7 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o timer.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9603de81811a..f423f0688bd5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4350,6 +4350,19 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_TIMER_CREATE_LAST_FIELD timer_create.flags
+
+static int bpf_create_timer(union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_TIMER_CREATE))
+		return -EINVAL;
+
+	if (!bpf_capable())
+		return -EPERM;
+
+	return bpf_timer_create(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -4486,6 +4499,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
+	case BPF_TIMER_CREATE:
+		err = bpf_create_timer(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/timer.c b/kernel/bpf/timer.c
new file mode 100644
index 000000000000..0d7b5655e60a
--- /dev/null
+++ b/kernel/bpf/timer.c
@@ -0,0 +1,238 @@
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/filter.h>
+#include <uapi/linux/btf.h>
+
+struct bpf_timer_list {
+	struct timer_list timer;
+	struct bpf_prog *prog;
+	u64 expires;
+	s32 id;
+	struct rcu_head rcu;
+};
+
+struct bpf_timer_map {
+	struct bpf_map map;
+	struct idr timer_idr;
+	spinlock_t idr_lock;
+};
+
+static int timer_map_alloc_check(union bpf_attr *attr)
+{
+	if (attr->max_entries == 0 || attr->max_entries > INT_MAX ||
+	    attr->key_size != 4 || attr->value_size != 8)
+		return -EINVAL;
+
+	if (attr->map_flags & BPF_F_MMAPABLE)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct bpf_map *timer_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_timer_map *tmap;
+
+	tmap = kzalloc(sizeof(*tmap), GFP_USER | __GFP_ACCOUNT);
+	if (!tmap)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&tmap->map, attr);
+	spin_lock_init(&tmap->idr_lock);
+	idr_init(&tmap->timer_idr);
+	return &tmap->map;
+}
+
+static int bpf_timer_delete(int id, void *ptr, void *data)
+{
+	struct bpf_timer_list *t = ptr;
+
+	del_timer_sync(&t->timer);
+	kfree_rcu(t, rcu);
+	return 0;
+}
+
+static void timer_map_free(struct bpf_map *map)
+{
+	struct bpf_timer_map *tmap;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	idr_for_each(&tmap->timer_idr, bpf_timer_delete, NULL);
+
+	rcu_barrier();
+	idr_destroy(&tmap->timer_idr);
+}
+
+static void *timer_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_timer_map *tmap;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_list *t;
+	void *ret = NULL;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+
+	rcu_read_lock();
+	t = idr_find(&tmap->timer_idr, timer_id);
+	if (t) {
+		t->expires = t->timer.expires;
+		ret = &t->expires;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static int timer_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 flags)
+{
+	u64 expires = *(u64 *)value;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_map *tmap;
+	struct bpf_timer_list *t;
+	int ret = 0;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+
+	rcu_read_lock();
+	t = idr_find(&tmap->timer_idr, timer_id);
+	if (!t)
+		ret = -ENOENT;
+	else
+		mod_timer(&t->timer, (unsigned long)expires);
+	rcu_read_unlock();
+	return ret;
+}
+
+static int timer_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_timer_map *tmap;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_list *t;
+	unsigned long flags;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	spin_lock_irqsave(&tmap->idr_lock, flags);
+	t = idr_remove(&tmap->timer_idr, timer_id);
+	spin_unlock_irqrestore(&tmap->idr_lock, flags);
+	if (!t)
+		return -ENOENT;
+	del_timer_sync(&t->timer);
+	bpf_prog_put(t->prog);
+	kfree_rcu(t, rcu);
+	return 0;
+}
+
+static int timer_map_get_next_key(struct bpf_map *map, void *key,
+				    void *next_key)
+{
+	struct bpf_timer_map *tmap;
+	s32 next_id = *(s32 *)key;
+	int ret = 0;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	rcu_read_lock();
+	if (!idr_get_next(&tmap->timer_idr, &next_id))
+		ret = -ENOENT;
+	rcu_read_unlock();
+	*(s32 *)next_key = next_id;
+	return ret;
+}
+
+static int timer_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	return -ENOTSUPP;
+}
+
+static int timer_map_btf_id;
+const struct bpf_map_ops timer_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc_check = timer_map_alloc_check,
+	.map_alloc = timer_map_alloc,
+	.map_free = timer_map_free,
+	.map_mmap = timer_map_mmap,
+	.map_lookup_elem = timer_map_lookup_elem,
+	.map_update_elem = timer_map_update_elem,
+	.map_delete_elem = timer_map_delete_elem,
+	.map_get_next_key = timer_map_get_next_key,
+	.map_btf_name = "bpf_timer_map",
+	.map_btf_id = &timer_map_btf_id,
+};
+
+static void bpf_timer_callback(struct timer_list *t)
+{
+	struct bpf_timer_list *bt = from_timer(bt, t, timer);
+	u32 ret;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(bt->prog, NULL);
+	rcu_read_unlock();
+
+	if (ret)
+		mod_timer(&bt->timer, bt->timer.expires + ret);
+}
+
+int bpf_timer_create(union bpf_attr *attr)
+{
+	unsigned int flags, timer_flags = 0;
+	struct bpf_timer_map *tmap;
+	struct bpf_timer_list *t;
+	unsigned long irq_flags;
+	struct bpf_prog *prog;
+	struct bpf_map *map;
+	int ret = 0;
+
+	flags = attr->timer_create.flags;
+	if (flags & ~(BTF_TIMER_F_DEFERRABLE | BTF_TIMER_F_PINNED))
+		return -EINVAL;
+
+	prog = bpf_prog_get(attr->timer_create.prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+	if (prog->type != BPF_PROG_TYPE_TIMER) {
+		ret = -EINVAL;
+		goto out_prog_put;
+	}
+
+	map = bpf_map_get(attr->timer_create.map_fd);
+	if (IS_ERR(map)) {
+		ret = PTR_ERR(map);
+		goto out_prog_put;
+	}
+	if (map->map_type != BPF_MAP_TYPE_TIMER) {
+		ret = -EINVAL;
+		goto out_map_put;
+	}
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t) {
+		ret = -ENOMEM;
+		goto out_map_put;
+	}
+
+	if (flags & BTF_TIMER_F_DEFERRABLE)
+		timer_flags |= TIMER_DEFERRABLE;
+	if (flags & BTF_TIMER_F_PINNED)
+		timer_flags |= TIMER_PINNED;
+	timer_setup(&t->timer, bpf_timer_callback, timer_flags);
+	t->prog = prog;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	spin_lock_irqsave(&tmap->idr_lock, irq_flags);
+	ret = idr_alloc_cyclic(&tmap->timer_idr, t, 0, INT_MAX, GFP_ATOMIC);
+	spin_unlock_irqrestore(&tmap->idr_lock, irq_flags);
+	if (ret < 0)
+		kfree(t);
+	else
+		t->id = ret;
+
+out_map_put:
+	bpf_map_put(map);
+out_prog_put:
+	if (ret)
+		bpf_prog_put(prog);
+	return ret;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 852541a435ef..ed0cbce8dc4f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5991,6 +5991,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_map_delete_elem &&
+	    env->prog->type == BPF_PROG_TYPE_TIMER) {
+		verbose(env, "bpf_map_delete_elem() can't be called in a timer program\n");
+		return -EINVAL;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01  4:26 [RFC Patch bpf-next] bpf: introduce bpf timer Cong Wang
@ 2021-04-01  6:38 ` Song Liu
  2021-04-01 17:28   ` Cong Wang
  2021-04-02 19:28 ` Alexei Starovoitov
  1 sibling, 1 reply; 25+ messages in thread
From: Song Liu @ 2021-04-01  6:38 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> From: Cong Wang <cong.wang@bytedance.com>
> 
> (This patch is still in early stage and obviously incomplete. I am sending
> it out to get some high-level feedbacks. Please kindly ignore any coding
> details for now and focus on the design.)

Could you please explain the use case of the timer? Is it the same as 
earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH? 

Assuming that is the case, I guess the use case is to assign an expire 
time for each element in a hash map; and periodically remove expired 
element from the map. 

If this is still correct, my next question is: how does this compare
against a user space timer? Will the user space timer be too slow?

> 
> This patch introduces a bpf timer map and a syscall to create bpf timer
> from user-space.
> 
> The reason why we have to use a map is because the lifetime of a timer,
> without a map, we have to delete the timer before exiting the eBPF program,
> this would significately limit its use cases. With a map, the timer can
> stay as long as the map itself and can be actually updated via map update
> API's too, where the key is the timer ID and the value is the timer expire
> timer.
> 
> Timer creation is not easy either. In order to prevent users creating a
> timer but not adding it to a map, we have to enforce this in the API which
> takes a map parameter and adds the new timer into the map in one shot.

I think we don't have to address "creating a timer but not adding it to a map" 
problem in the kernel. If the user forgot it, the user should debug it. 

Thanks,
Song

[...]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01  6:38 ` Song Liu
@ 2021-04-01 17:28   ` Cong Wang
  2021-04-01 20:17     ` Song Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-01 17:28 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > (This patch is still in early stage and obviously incomplete. I am sending
> > it out to get some high-level feedbacks. Please kindly ignore any coding
> > details for now and focus on the design.)
>
> Could you please explain the use case of the timer? Is it the same as
> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>
> Assuming that is the case, I guess the use case is to assign an expire
> time for each element in a hash map; and periodically remove expired
> element from the map.
>
> If this is still correct, my next question is: how does this compare
> against a user space timer? Will the user space timer be too slow?

Yes, as I explained in timeout hashmap patchset, doing it in user-space
would require a lot of syscalls (without batching) or copying (with batching).
I will add the explanation here, in case people miss why we need a timer.

>
> >
> > This patch introduces a bpf timer map and a syscall to create bpf timer
> > from user-space.
> >
> > The reason why we have to use a map is because the lifetime of a timer,
> > without a map, we have to delete the timer before exiting the eBPF program,
> > this would significately limit its use cases. With a map, the timer can
> > stay as long as the map itself and can be actually updated via map update
> > API's too, where the key is the timer ID and the value is the timer expire
> > timer.
> >
> > Timer creation is not easy either. In order to prevent users creating a
> > timer but not adding it to a map, we have to enforce this in the API which
> > takes a map parameter and adds the new timer into the map in one shot.
>
> I think we don't have to address "creating a timer but not adding it to a map"
> problem in the kernel. If the user forgot it, the user should debug it.

Good point. Initially the timer is created in kernel-space, now it is in user
space, so it is probably fine to create it without a map. But we would have
to provide more syscalls for users to manage the timer, so using a map
still has an advantage of not adding more syscalls.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01 17:28   ` Cong Wang
@ 2021-04-01 20:17     ` Song Liu
  2021-04-02 17:34       ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Song Liu @ 2021-04-01 20:17 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> From: Cong Wang <cong.wang@bytedance.com>
>>> 
>>> (This patch is still in early stage and obviously incomplete. I am sending
>>> it out to get some high-level feedbacks. Please kindly ignore any coding
>>> details for now and focus on the design.)
>> 
>> Could you please explain the use case of the timer? Is it the same as
>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>> 
>> Assuming that is the case, I guess the use case is to assign an expire
>> time for each element in a hash map; and periodically remove expired
>> element from the map.
>> 
>> If this is still correct, my next question is: how does this compare
>> against a user space timer? Will the user space timer be too slow?
> 
> Yes, as I explained in timeout hashmap patchset, doing it in user-space
> would require a lot of syscalls (without batching) or copying (with batching).
> I will add the explanation here, in case people miss why we need a timer.

How about we use a user space timer to trigger a BPF program (e.g. use 
BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can 
use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the 
map? With this approach, we only need one syscall per period. 

Thanks,
Song


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01 20:17     ` Song Liu
@ 2021-04-02 17:34       ` Cong Wang
  2021-04-02 17:57         ` Song Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-02 17:34 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> From: Cong Wang <cong.wang@bytedance.com>
> >>>
> >>> (This patch is still in early stage and obviously incomplete. I am sending
> >>> it out to get some high-level feedbacks. Please kindly ignore any coding
> >>> details for now and focus on the design.)
> >>
> >> Could you please explain the use case of the timer? Is it the same as
> >> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
> >>
> >> Assuming that is the case, I guess the use case is to assign an expire
> >> time for each element in a hash map; and periodically remove expired
> >> element from the map.
> >>
> >> If this is still correct, my next question is: how does this compare
> >> against a user space timer? Will the user space timer be too slow?
> >
> > Yes, as I explained in timeout hashmap patchset, doing it in user-space
> > would require a lot of syscalls (without batching) or copying (with batching).
> > I will add the explanation here, in case people miss why we need a timer.
>
> How about we use a user space timer to trigger a BPF program (e.g. use
> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
> map? With this approach, we only need one syscall per period.

Interesting, I didn't know we can explicitly trigger a BPF program running
from user-space. Is it for testing purposes only?

But we also want the timer code itself to change the expire time too, it is
common to adjust the expire time based on the size of the workset, for
example, the number of elements in a hashmap.

With the current design, both kernel and user-space can modify the
expire time with map update API's.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 17:34       ` Cong Wang
@ 2021-04-02 17:57         ` Song Liu
  2021-04-02 19:08           ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Song Liu @ 2021-04-02 17:57 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> From: Cong Wang <cong.wang@bytedance.com>
>>>>> 
>>>>> (This patch is still in early stage and obviously incomplete. I am sending
>>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
>>>>> details for now and focus on the design.)
>>>> 
>>>> Could you please explain the use case of the timer? Is it the same as
>>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>>>> 
>>>> Assuming that is the case, I guess the use case is to assign an expire
>>>> time for each element in a hash map; and periodically remove expired
>>>> element from the map.
>>>> 
>>>> If this is still correct, my next question is: how does this compare
>>>> against a user space timer? Will the user space timer be too slow?
>>> 
>>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
>>> would require a lot of syscalls (without batching) or copying (with batching).
>>> I will add the explanation here, in case people miss why we need a timer.
>> 
>> How about we use a user space timer to trigger a BPF program (e.g. use
>> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
>> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
>> map? With this approach, we only need one syscall per period.
> 
> Interesting, I didn't know we can explicitly trigger a BPF program running
> from user-space. Is it for testing purposes only?

This is not only for testing. We will use this in perf (starting in 5.13).

/* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */

/* trigger the leader program on a cpu */
static int bperf_trigger_reading(int prog_fd, int cpu)
{
        DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
                            .ctx_in = NULL,
                            .ctx_size_in = 0,
                            .flags = BPF_F_TEST_RUN_ON_CPU,
                            .cpu = cpu,
                            .retval = 0,
                );

        return bpf_prog_test_run_opts(prog_fd, &opts);
}

test_run also passes return value (retval) back to user space, so we and 
adjust the timer interval based on retval.

Also, test_run can trigger the program on a specific cpu. This might be 
useful with percpu map (BPF_MAP_TYPE_PERCPU_HASH, etc.). 

Thanks,
Song


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 17:57         ` Song Liu
@ 2021-04-02 19:08           ` Cong Wang
  2021-04-02 19:43             ` Song Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-02 19:08 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Fri, Apr 2, 2021 at 10:57 AM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>>
> >>>>> From: Cong Wang <cong.wang@bytedance.com>
> >>>>>
> >>>>> (This patch is still in early stage and obviously incomplete. I am sending
> >>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
> >>>>> details for now and focus on the design.)
> >>>>
> >>>> Could you please explain the use case of the timer? Is it the same as
> >>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
> >>>>
> >>>> Assuming that is the case, I guess the use case is to assign an expire
> >>>> time for each element in a hash map; and periodically remove expired
> >>>> element from the map.
> >>>>
> >>>> If this is still correct, my next question is: how does this compare
> >>>> against a user space timer? Will the user space timer be too slow?
> >>>
> >>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
> >>> would require a lot of syscalls (without batching) or copying (with batching).
> >>> I will add the explanation here, in case people miss why we need a timer.
> >>
> >> How about we use a user space timer to trigger a BPF program (e.g. use
> >> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
> >> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
> >> map? With this approach, we only need one syscall per period.
> >
> > Interesting, I didn't know we can explicitly trigger a BPF program running
> > from user-space. Is it for testing purposes only?
>
> This is not only for testing. We will use this in perf (starting in 5.13).
>
> /* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */
>
> /* trigger the leader program on a cpu */
> static int bperf_trigger_reading(int prog_fd, int cpu)
> {
>         DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
>                             .ctx_in = NULL,
>                             .ctx_size_in = 0,
>                             .flags = BPF_F_TEST_RUN_ON_CPU,
>                             .cpu = cpu,
>                             .retval = 0,
>                 );
>
>         return bpf_prog_test_run_opts(prog_fd, &opts);
> }
>
> test_run also passes return value (retval) back to user space, so we and
> adjust the timer interval based on retval.

This is really odd, every name here contains a "test" but it is not for testing
purposes. You probably need to rename/alias it. ;)

So, with this we have to get a user-space daemon running just to keep
this "timer" alive. If I want to run it every 1ms, it means I have to issue
a syscall BPF_PROG_TEST_RUN every 1ms. Even with a timer fd, we
still need poll() and timerfd_settime(). This is a considerable overhead
for just a single timer.

With current design, user-space can just exit after installing the timer,
either it can adjust itself or other eBPF code can adjust it, so the per
timer overhead is the same as a kernel timer.

The visibility to other BPF code is important for the conntrack case,
because each time we get an expired item during a lookup, we may
want to schedule the GC timer to run sooner. At least this would give
users more freedom to decide when to reschedule the timer.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01  4:26 [RFC Patch bpf-next] bpf: introduce bpf timer Cong Wang
  2021-04-01  6:38 ` Song Liu
@ 2021-04-02 19:28 ` Alexei Starovoitov
  2021-04-02 21:24   ` Cong Wang
  1 sibling, 1 reply; 25+ messages in thread
From: Alexei Starovoitov @ 2021-04-02 19:28 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, bpf, duanxiongchun, wangdongdong.6, songmuchun,
	Cong Wang, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song

On Wed, Mar 31, 2021 at 09:26:35PM -0700, Cong Wang wrote:

> This patch introduces a bpf timer map and a syscall to create bpf timer
> from user-space.

That will severely limit timer api usability.
I agree with Song here. If user space has to create it there is no reason
to introduce new sys_bpf command. Just do all timers in user space
and trigger bpf prog via bpf_prog_test_run cmd.

> 
> The reason why we have to use a map is because the lifetime of a timer,
> without a map, we have to delete the timer before exiting the eBPF program,
> this would significately limit its use cases. With a map, the timer can
> stay as long as the map itself and can be actually updated via map update
> API's too,

this part is correct.

> where the key is the timer ID and the value is the timer expire
> timer.

The timer ID is unnecessary. We cannot introduce new IDR for every new
bpf object. It doesn't scale.

> Timer creation is not easy either. In order to prevent users creating a
> timer but not adding it to a map, we have to enforce this in the API which
> takes a map parameter and adds the new timer into the map in one shot.

Not quite true. The timer memory should be a part of the map otherwise
the timer life time is hard to track. But arming the timer and initializing
it with a callback doesn't need to be tied with allocation of timer memory.

> And because timer is asynchronous, we can not just use its callback like
> bpf_for_each_map_elem().

Not quite. We can do it the same way as bpf_for_each_map_elem() despite
being async.

> More importantly, we have to properly reference
> count its struct bpf_prog too. 

It's true that callback prog or subprog has to stay alive while timer
is alive.
Traditional maps can live past the time of the progs that use them.
Like bpf prog can load with a pointer to already created hash map.
Then prog can unload and hashmap will stay around just fine.
All maps are like this with the exception of prog_array.
The progs get deleted from the prog_array map when appropriate.
The same thing can work for maps with embedded timers.
For example the subprog/prog can to be deleted from the timer if
that prog is going away. Similar to ref/uref distinction we have for prog_array.

> It seems impossible to do this either in
> verifier or in JIT, so we have to make its callback code a separate eBPF
> program and pass a program fd from user-space. Fortunately, timer callback
> can still live in the same object file with the rest eBPF code and share
> data too.
> 
> Here is a quick demo of the timer callback code:
> 
> static __u64
> check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
>                   int *data)
> {
>   u64 expires = *val;
> 
>   if (expires < bpf_jiffies64()) {
>     bpf_map_delete_elem(map, key);
>     *data++;
>   }
>   return 0;
> }
> 
> SEC("timer")
> u32 timer_callback(void)
> {
>   int count = 0;
> 
>   bpf_for_each_map_elem(&map, check_expired_elem, &count, 0);
>   if (count)
>      return 0; // not re-arm this timer
>   else
>      return 10; // reschedule this timeGr after 10 jiffies
> }

As Song pointed out the exact same thing can be done with timers in user space
and user space triggering prog exec with bpf_prog_test_run.

Here is how more general timers might look like:
https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/

include/uapi/linux/bpf.h:
struct bpf_timer {
  u64 opaque;
};
The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.

The prog would do:
struct map_elem {
    int stuff;
    struct bpf_timer timer;
};

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1);
    __type(key, int);
    __type(value, struct map_elem);
} hmap SEC(".maps");

static int timer_cb(struct map_elem *elem)
{
    if (whatever && elem->stuff)
        bpf_timer_mod(&elem->timer, new_expire);
}

int bpf_timer_test(...)
{
    struct map_elem *val;

    val = bpf_map_lookup_elem(&hmap, &key);
    if (val) {
        bpf_timer_init(&val->timer, timer_cb, flags);
        val->stuff = 123;
        bpf_timer_mod(&val->timer, expires);
    }
}

bpf_map_update_elem() either from bpf prog or from user space
allocates map element and zeros 8 byte space for the timer pointer.
bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
The validation of timer_cb() is done by the verifier.
bpf_map_delete_elem() either from bpf prog or from user space
does del_timer() if elem->opaque != 0.
If prog refers such hmap as above during prog free the kernel does
for_each_map_elem {if (elem->opaque) del_timer().}
I think that is the simplest way of prevent timers firing past the prog life time.
There could be other ways to solve it (like prog_array and ref/uref).

Pseudo code:
int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
{
  if (timer->opaque)
    return -EBUSY;
  t = alloc timer_list
  t->cb = timer_cb;
  t->..
  timer->opaque = (long)t;
}

int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
{
  if (!time->opaque)
    return -EINVAL;
  t = (struct timer_list *)timer->opaque;
  mod_timer(t,..);
}

int bpf_timer_del(struct bpf_timer *timer)
{
  if (!time->opaque)
    return -EINVAL;
  t = (struct timer_list *)timer->opaque;
  del_timer(t);
}

The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
via load/store by the program. The same way it does it for bpf_spin_lock.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 19:08           ` Cong Wang
@ 2021-04-02 19:43             ` Song Liu
  2021-04-02 20:57               ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Song Liu @ 2021-04-02 19:43 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 2, 2021, at 12:08 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Fri, Apr 2, 2021 at 10:57 AM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>> 
>>>>>>> From: Cong Wang <cong.wang@bytedance.com>
>>>>>>> 
>>>>>>> (This patch is still in early stage and obviously incomplete. I am sending
>>>>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
>>>>>>> details for now and focus on the design.)
>>>>>> 
>>>>>> Could you please explain the use case of the timer? Is it the same as
>>>>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>>>>>> 
>>>>>> Assuming that is the case, I guess the use case is to assign an expire
>>>>>> time for each element in a hash map; and periodically remove expired
>>>>>> element from the map.
>>>>>> 
>>>>>> If this is still correct, my next question is: how does this compare
>>>>>> against a user space timer? Will the user space timer be too slow?
>>>>> 
>>>>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
>>>>> would require a lot of syscalls (without batching) or copying (with batching).
>>>>> I will add the explanation here, in case people miss why we need a timer.
>>>> 
>>>> How about we use a user space timer to trigger a BPF program (e.g. use
>>>> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
>>>> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
>>>> map? With this approach, we only need one syscall per period.
>>> 
>>> Interesting, I didn't know we can explicitly trigger a BPF program running
>>> from user-space. Is it for testing purposes only?
>> 
>> This is not only for testing. We will use this in perf (starting in 5.13).
>> 
>> /* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */
>> 
>> /* trigger the leader program on a cpu */
>> static int bperf_trigger_reading(int prog_fd, int cpu)
>> {
>>        DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
>>                            .ctx_in = NULL,
>>                            .ctx_size_in = 0,
>>                            .flags = BPF_F_TEST_RUN_ON_CPU,
>>                            .cpu = cpu,
>>                            .retval = 0,
>>                );
>> 
>>        return bpf_prog_test_run_opts(prog_fd, &opts);
>> }
>> 
>> test_run also passes return value (retval) back to user space, so we and
>> adjust the timer interval based on retval.
> 
> This is really odd, every name here contains a "test" but it is not for testing
> purposes. You probably need to rename/alias it. ;)
> 
> So, with this we have to get a user-space daemon running just to keep
> this "timer" alive. If I want to run it every 1ms, it means I have to issue
> a syscall BPF_PROG_TEST_RUN every 1ms. Even with a timer fd, we
> still need poll() and timerfd_settime(). This is a considerable overhead
> for just a single timer.

sys_bpf() takes about 0.5us. I would expect poll() and timerfd_settime() to 
be slightly faster. So the overhead is less than 0.2% of a single core 
(0.5us x 3 / 1ms). Do we need many counters for conntrack?

> 
> With current design, user-space can just exit after installing the timer,
> either it can adjust itself or other eBPF code can adjust it, so the per
> timer overhead is the same as a kernel timer.

I guess we still need to hold a fd to the prog/map? Alternatively, we can 
pin the prog/map, but then the user need to clean it up. 

> 
> The visibility to other BPF code is important for the conntrack case,
> because each time we get an expired item during a lookup, we may
> want to schedule the GC timer to run sooner. At least this would give
> users more freedom to decide when to reschedule the timer.

Do we plan to share the timer program among multiple processes (which can 
start and terminate in arbitrary orders)? If that is the case, I can imagine
a timer program is better than a user space timer. 

Thanks,
Song 


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 19:43             ` Song Liu
@ 2021-04-02 20:57               ` Cong Wang
  2021-04-02 23:31                 ` Song Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-02 20:57 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Fri, Apr 2, 2021 at 12:45 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 2, 2021, at 12:08 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Fri, Apr 2, 2021 at 10:57 AM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>>
> >>>>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>>>>
> >>>>>>> From: Cong Wang <cong.wang@bytedance.com>
> >>>>>>>
> >>>>>>> (This patch is still in early stage and obviously incomplete. I am sending
> >>>>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
> >>>>>>> details for now and focus on the design.)
> >>>>>>
> >>>>>> Could you please explain the use case of the timer? Is it the same as
> >>>>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
> >>>>>>
> >>>>>> Assuming that is the case, I guess the use case is to assign an expire
> >>>>>> time for each element in a hash map; and periodically remove expired
> >>>>>> element from the map.
> >>>>>>
> >>>>>> If this is still correct, my next question is: how does this compare
> >>>>>> against a user space timer? Will the user space timer be too slow?
> >>>>>
> >>>>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
> >>>>> would require a lot of syscalls (without batching) or copying (with batching).
> >>>>> I will add the explanation here, in case people miss why we need a timer.
> >>>>
> >>>> How about we use a user space timer to trigger a BPF program (e.g. use
> >>>> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
> >>>> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
> >>>> map? With this approach, we only need one syscall per period.
> >>>
> >>> Interesting, I didn't know we can explicitly trigger a BPF program running
> >>> from user-space. Is it for testing purposes only?
> >>
> >> This is not only for testing. We will use this in perf (starting in 5.13).
> >>
> >> /* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */
> >>
> >> /* trigger the leader program on a cpu */
> >> static int bperf_trigger_reading(int prog_fd, int cpu)
> >> {
> >>        DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
> >>                            .ctx_in = NULL,
> >>                            .ctx_size_in = 0,
> >>                            .flags = BPF_F_TEST_RUN_ON_CPU,
> >>                            .cpu = cpu,
> >>                            .retval = 0,
> >>                );
> >>
> >>        return bpf_prog_test_run_opts(prog_fd, &opts);
> >> }
> >>
> >> test_run also passes return value (retval) back to user space, so we and
> >> adjust the timer interval based on retval.
> >
> > This is really odd, every name here contains a "test" but it is not for testing
> > purposes. You probably need to rename/alias it. ;)
> >
> > So, with this we have to get a user-space daemon running just to keep
> > this "timer" alive. If I want to run it every 1ms, it means I have to issue
> > a syscall BPF_PROG_TEST_RUN every 1ms. Even with a timer fd, we
> > still need poll() and timerfd_settime(). This is a considerable overhead
> > for just a single timer.
>
> sys_bpf() takes about 0.5us. I would expect poll() and timerfd_settime() to
> be slightly faster. So the overhead is less than 0.2% of a single core
> (0.5us x 3 / 1ms). Do we need many counters for conntrack?

This is just for one timer. The whole system may end up with many timers
when we have more and more eBPF programs. So managing the timers
in the use-space would be a problem too someday, clearly one daemon
per-timer does not scale.

>
> >
> > With current design, user-space can just exit after installing the timer,
> > either it can adjust itself or other eBPF code can adjust it, so the per
> > timer overhead is the same as a kernel timer.
>
> I guess we still need to hold a fd to the prog/map? Alternatively, we can
> pin the prog/map, but then the user need to clean it up.

Yes, but I don't see how holding a fd could bring any overhead after
initial setup.

>
> >
> > The visibility to other BPF code is important for the conntrack case,
> > because each time we get an expired item during a lookup, we may
> > want to schedule the GC timer to run sooner. At least this would give
> > users more freedom to decide when to reschedule the timer.
>
> Do we plan to share the timer program among multiple processes (which can
> start and terminate in arbitrary orders)? If that is the case, I can imagine
> a timer program is better than a user space timer.

I mean I want other eBPF program to manage the timers in kernel-space,
as conntrack is mostly in kernel-space. If the timer is only manageable
in user-space, it would seriously limit its use case.

Ideally I even prefer to create timers in kernel-space too, but as I already
explained, this seems impossible to me.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 19:28 ` Alexei Starovoitov
@ 2021-04-02 21:24   ` Cong Wang
  2021-04-02 23:45     ` Alexei Starovoitov
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-02 21:24 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Fri, Apr 2, 2021 at 12:28 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Mar 31, 2021 at 09:26:35PM -0700, Cong Wang wrote:
>
> > This patch introduces a bpf timer map and a syscall to create bpf timer
> > from user-space.
>
> That will severely limit timer api usability.
> I agree with Song here. If user space has to create it there is no reason
> to introduce new sys_bpf command. Just do all timers in user space
> and trigger bpf prog via bpf_prog_test_run cmd.

I think you misunderstand my point, the reason why the creation is done
in user-space is not I like it, it is because it looks impossible to
create timers
in kernel-space, hence I have to move it to user-space.

>
> >
> > The reason why we have to use a map is because the lifetime of a timer,
> > without a map, we have to delete the timer before exiting the eBPF program,
> > this would significately limit its use cases. With a map, the timer can
> > stay as long as the map itself and can be actually updated via map update
> > API's too,
>
> this part is correct.
>
> > where the key is the timer ID and the value is the timer expire
> > timer.
>
> The timer ID is unnecessary. We cannot introduce new IDR for every new
> bpf object. It doesn't scale.

The IDR is per map, not per timer.

>
> > Timer creation is not easy either. In order to prevent users creating a
> > timer but not adding it to a map, we have to enforce this in the API which
> > takes a map parameter and adds the new timer into the map in one shot.
>
> Not quite true. The timer memory should be a part of the map otherwise
> the timer life time is hard to track. But arming the timer and initializing
> it with a callback doesn't need to be tied with allocation of timer memory.
>
> > And because timer is asynchronous, we can not just use its callback like
> > bpf_for_each_map_elem().
>
> Not quite. We can do it the same way as bpf_for_each_map_elem() despite
> being async.

Well, at least bpf_for_each_map_elem() can use stack variables etc.,
but timers can't. They are very different to me.

>
> > More importantly, we have to properly reference
> > count its struct bpf_prog too.
>
> It's true that callback prog or subprog has to stay alive while timer
> is alive.
> Traditional maps can live past the time of the progs that use them.
> Like bpf prog can load with a pointer to already created hash map.
> Then prog can unload and hashmap will stay around just fine.
> All maps are like this with the exception of prog_array.
> The progs get deleted from the prog_array map when appropriate.
> The same thing can work for maps with embedded timers.
> For example the subprog/prog can to be deleted from the timer if
> that prog is going away. Similar to ref/uref distinction we have for prog_array.
>
> > It seems impossible to do this either in
> > verifier or in JIT, so we have to make its callback code a separate eBPF
> > program and pass a program fd from user-space. Fortunately, timer callback
> > can still live in the same object file with the rest eBPF code and share
> > data too.
> >
> > Here is a quick demo of the timer callback code:
> >
> > static __u64
> > check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
> >                   int *data)
> > {
> >   u64 expires = *val;
> >
> >   if (expires < bpf_jiffies64()) {
> >     bpf_map_delete_elem(map, key);
> >     *data++;
> >   }
> >   return 0;
> > }
> >
> > SEC("timer")
> > u32 timer_callback(void)
> > {
> >   int count = 0;
> >
> >   bpf_for_each_map_elem(&map, check_expired_elem, &count, 0);
> >   if (count)
> >      return 0; // not re-arm this timer
> >   else
> >      return 10; // reschedule this timeGr after 10 jiffies
> > }
>
> As Song pointed out the exact same thing can be done with timers in user space
> and user space triggering prog exec with bpf_prog_test_run.
>
> Here is how more general timers might look like:
> https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
>
> include/uapi/linux/bpf.h:
> struct bpf_timer {
>   u64 opaque;
> };
> The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.

This is my initial design as we already discussed, it does not work,
please see below.

>
> The prog would do:
> struct map_elem {
>     int stuff;
>     struct bpf_timer timer;
> };
>
> struct {
>     __uint(type, BPF_MAP_TYPE_HASH);
>     __uint(max_entries, 1);
>     __type(key, int);
>     __type(value, struct map_elem);
> } hmap SEC(".maps");
>
> static int timer_cb(struct map_elem *elem)
> {
>     if (whatever && elem->stuff)
>         bpf_timer_mod(&elem->timer, new_expire);
> }
>
> int bpf_timer_test(...)
> {
>     struct map_elem *val;
>
>     val = bpf_map_lookup_elem(&hmap, &key);
>     if (val) {
>         bpf_timer_init(&val->timer, timer_cb, flags);
>         val->stuff = 123;
>         bpf_timer_mod(&val->timer, expires);
>     }
> }
>
> bpf_map_update_elem() either from bpf prog or from user space
> allocates map element and zeros 8 byte space for the timer pointer.
> bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> The validation of timer_cb() is done by the verifier.
> bpf_map_delete_elem() either from bpf prog or from user space
> does del_timer() if elem->opaque != 0.
> If prog refers such hmap as above during prog free the kernel does
> for_each_map_elem {if (elem->opaque) del_timer().}
> I think that is the simplest way of prevent timers firing past the prog life time.
> There could be other ways to solve it (like prog_array and ref/uref).
>
> Pseudo code:
> int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> {
>   if (timer->opaque)
>     return -EBUSY;
>   t = alloc timer_list
>   t->cb = timer_cb;
>   t->..
>   timer->opaque = (long)t;
> }
>
> int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> {
>   if (!time->opaque)
>     return -EINVAL;
>   t = (struct timer_list *)timer->opaque;
>   mod_timer(t,..);
> }
>
> int bpf_timer_del(struct bpf_timer *timer)
> {
>   if (!time->opaque)
>     return -EINVAL;
>   t = (struct timer_list *)timer->opaque;
>   del_timer(t);
> }
>
> The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> via load/store by the program. The same way it does it for bpf_spin_lock.

This does not work, because bpf_timer_del() has to be matched
with bpf_timer_init(), otherwise we would leak timer resources.
For example:

SEC("foo")
bad_ebpf_code()
{
  struct bpf_timer t;
  bpf_timer_init(&t, ...); // allocate a timer
  bpf_timer_mod(&t, ..);
  // end of BPF program
  // now the timer is leaked, no one will delete it
}

We can not enforce the matching in the verifier, because users would
have to call bpf_timer_del() before exiting, which is not what we want
either.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 20:57               ` Cong Wang
@ 2021-04-02 23:31                 ` Song Liu
  2021-04-05 23:49                   ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Song Liu @ 2021-04-02 23:31 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Fri, Apr 2, 2021 at 12:45 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 2, 2021, at 12:08 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Fri, Apr 2, 2021 at 10:57 AM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>> 
>>>>>>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>>>> 
>>>>>>>> 
>>>>>>>> 
>>>>>>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>>>> 
>>>>>>>>> From: Cong Wang <cong.wang@bytedance.com>
>>>>>>>>> 
>>>>>>>>> (This patch is still in early stage and obviously incomplete. I am sending
>>>>>>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
>>>>>>>>> details for now and focus on the design.)
>>>>>>>> 
>>>>>>>> Could you please explain the use case of the timer? Is it the same as
>>>>>>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>>>>>>>> 
>>>>>>>> Assuming that is the case, I guess the use case is to assign an expire
>>>>>>>> time for each element in a hash map; and periodically remove expired
>>>>>>>> element from the map.
>>>>>>>> 
>>>>>>>> If this is still correct, my next question is: how does this compare
>>>>>>>> against a user space timer? Will the user space timer be too slow?
>>>>>>> 
>>>>>>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
>>>>>>> would require a lot of syscalls (without batching) or copying (with batching).
>>>>>>> I will add the explanation here, in case people miss why we need a timer.
>>>>>> 
>>>>>> How about we use a user space timer to trigger a BPF program (e.g. use
>>>>>> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
>>>>>> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
>>>>>> map? With this approach, we only need one syscall per period.
>>>>> 
>>>>> Interesting, I didn't know we can explicitly trigger a BPF program running
>>>>> from user-space. Is it for testing purposes only?
>>>> 
>>>> This is not only for testing. We will use this in perf (starting in 5.13).
>>>> 
>>>> /* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */
>>>> 
>>>> /* trigger the leader program on a cpu */
>>>> static int bperf_trigger_reading(int prog_fd, int cpu)
>>>> {
>>>>       DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
>>>>                           .ctx_in = NULL,
>>>>                           .ctx_size_in = 0,
>>>>                           .flags = BPF_F_TEST_RUN_ON_CPU,
>>>>                           .cpu = cpu,
>>>>                           .retval = 0,
>>>>               );
>>>> 
>>>>       return bpf_prog_test_run_opts(prog_fd, &opts);
>>>> }
>>>> 
>>>> test_run also passes return value (retval) back to user space, so we and
>>>> adjust the timer interval based on retval.
>>> 
>>> This is really odd, every name here contains a "test" but it is not for testing
>>> purposes. You probably need to rename/alias it. ;)
>>> 
>>> So, with this we have to get a user-space daemon running just to keep
>>> this "timer" alive. If I want to run it every 1ms, it means I have to issue
>>> a syscall BPF_PROG_TEST_RUN every 1ms. Even with a timer fd, we
>>> still need poll() and timerfd_settime(). This is a considerable overhead
>>> for just a single timer.
>> 
>> sys_bpf() takes about 0.5us. I would expect poll() and timerfd_settime() to
>> be slightly faster. So the overhead is less than 0.2% of a single core
>> (0.5us x 3 / 1ms). Do we need many counters for conntrack?
> 
> This is just for one timer. The whole system may end up with many timers
> when we have more and more eBPF programs. So managing the timers
> in the use-space would be a problem too someday, clearly one daemon
> per-timer does not scale.

If we do need many timers, I agree that it doesn't make sense to create 
a thread for each of them. 

A less-flexible alternative is to create a perf_event of "cpu-clock" and 
attach BPF program to it. It is not easy to adjust the interval, I guess.

> 
>> 
>>> 
>>> With current design, user-space can just exit after installing the timer,
>>> either it can adjust itself or other eBPF code can adjust it, so the per
>>> timer overhead is the same as a kernel timer.
>> 
>> I guess we still need to hold a fd to the prog/map? Alternatively, we can
>> pin the prog/map, but then the user need to clean it up.
> 
> Yes, but I don't see how holding a fd could bring any overhead after
> initial setup.
>> 
>>> 
>>> The visibility to other BPF code is important for the conntrack case,
>>> because each time we get an expired item during a lookup, we may
>>> want to schedule the GC timer to run sooner. At least this would give
>>> users more freedom to decide when to reschedule the timer.
>> 
>> Do we plan to share the timer program among multiple processes (which can
>> start and terminate in arbitrary orders)? If that is the case, I can imagine
>> a timer program is better than a user space timer.
> 
> I mean I want other eBPF program to manage the timers in kernel-space,
> as conntrack is mostly in kernel-space. If the timer is only manageable
> in user-space, it would seriously limit its use case.
> 
> Ideally I even prefer to create timers in kernel-space too, but as I already
> explained, this seems impossible to me.

Would hrtimer (include/linux/hrtimer.h) work? 

Thanks,
Song






^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 21:24   ` Cong Wang
@ 2021-04-02 23:45     ` Alexei Starovoitov
  2021-04-06  0:36       ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Alexei Starovoitov @ 2021-04-02 23:45 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > where the key is the timer ID and the value is the timer expire
> > > timer.
> >
> > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > bpf object. It doesn't scale.
> 
> The IDR is per map, not per timer.

Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
We have 3 IDRs now: for progs, for maps, and for links.
No other objects need IDRs.

> > Here is how more general timers might look like:
> > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> >
> > include/uapi/linux/bpf.h:
> > struct bpf_timer {
> >   u64 opaque;
> > };
> > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> 
> This is my initial design as we already discussed, it does not work,
> please see below.

It does work. The perceived "issue" you referred to is a misunderstanding. See below.

> >
> > The prog would do:
> > struct map_elem {
> >     int stuff;
> >     struct bpf_timer timer;
> > };
> >
> > struct {
> >     __uint(type, BPF_MAP_TYPE_HASH);
> >     __uint(max_entries, 1);
> >     __type(key, int);
> >     __type(value, struct map_elem);
> > } hmap SEC(".maps");
> >
> > static int timer_cb(struct map_elem *elem)
> > {
> >     if (whatever && elem->stuff)
> >         bpf_timer_mod(&elem->timer, new_expire);
> > }
> >
> > int bpf_timer_test(...)
> > {
> >     struct map_elem *val;
> >
> >     val = bpf_map_lookup_elem(&hmap, &key);
> >     if (val) {
> >         bpf_timer_init(&val->timer, timer_cb, flags);
> >         val->stuff = 123;
> >         bpf_timer_mod(&val->timer, expires);
> >     }
> > }
> >
> > bpf_map_update_elem() either from bpf prog or from user space
> > allocates map element and zeros 8 byte space for the timer pointer.
> > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > The validation of timer_cb() is done by the verifier.
> > bpf_map_delete_elem() either from bpf prog or from user space
> > does del_timer() if elem->opaque != 0.
> > If prog refers such hmap as above during prog free the kernel does
> > for_each_map_elem {if (elem->opaque) del_timer().}
> > I think that is the simplest way of prevent timers firing past the prog life time.
> > There could be other ways to solve it (like prog_array and ref/uref).
> >
> > Pseudo code:
> > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > {
> >   if (timer->opaque)
> >     return -EBUSY;
> >   t = alloc timer_list
> >   t->cb = timer_cb;
> >   t->..
> >   timer->opaque = (long)t;
> > }
> >
> > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > {
> >   if (!time->opaque)
> >     return -EINVAL;
> >   t = (struct timer_list *)timer->opaque;
> >   mod_timer(t,..);
> > }
> >
> > int bpf_timer_del(struct bpf_timer *timer)
> > {
> >   if (!time->opaque)
> >     return -EINVAL;
> >   t = (struct timer_list *)timer->opaque;
> >   del_timer(t);
> > }
> >
> > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > via load/store by the program. The same way it does it for bpf_spin_lock.
> 
> This does not work, because bpf_timer_del() has to be matched
> with bpf_timer_init(), otherwise we would leak timer resources.
> For example:
> 
> SEC("foo")
> bad_ebpf_code()
> {
>   struct bpf_timer t;
>   bpf_timer_init(&t, ...); // allocate a timer
>   bpf_timer_mod(&t, ..);
>   // end of BPF program
>   // now the timer is leaked, no one will delete it
> }
> 
> We can not enforce the matching in the verifier, because users would
> have to call bpf_timer_del() before exiting, which is not what we want
> either.

```
bad_ebpf_code()
{
  struct bpf_timer t;
```
is not at all what was proposed. This kind of code will be rejected by the verifier.

'struct bpf_timer' has to be part of the map element and the verifier will enforce that
just like it does so for bpf_spin_lock.
Try writing the following program:
```
bad_ebpf_code()
{
  struct bpf_spin_lock t;
  bpf_spin_lock(&t);
}
``
and then follow the code to see why the verifier rejects it.

The implementation of what I'm proposing is straightforward.
I certainly understand that it might look intimidating and "impossible",
but it's really quite simple.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 23:31                 ` Song Liu
@ 2021-04-05 23:49                   ` Cong Wang
  2021-04-06  1:07                     ` Song Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-05 23:49 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > Ideally I even prefer to create timers in kernel-space too, but as I already
> > explained, this seems impossible to me.
>
> Would hrtimer (include/linux/hrtimer.h) work?

By impossible, I meant it is impossible (to me) to take a refcnt to the callback
prog if we create the timer in kernel-space. So, hrtimer is the same in this
perspective.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 23:45     ` Alexei Starovoitov
@ 2021-04-06  0:36       ` Cong Wang
  2021-04-12 23:01         ` Alexei Starovoitov
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-06  0:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Fri, Apr 2, 2021 at 4:45 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > > where the key is the timer ID and the value is the timer expire
> > > > timer.
> > >
> > > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > > bpf object. It doesn't scale.
> >
> > The IDR is per map, not per timer.
>
> Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
> We have 3 IDRs now: for progs, for maps, and for links.
> No other objects need IDRs.
>
> > > Here is how more general timers might look like:
> > > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> > >
> > > include/uapi/linux/bpf.h:
> > > struct bpf_timer {
> > >   u64 opaque;
> > > };
> > > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> >
> > This is my initial design as we already discussed, it does not work,
> > please see below.
>
> It does work. The perceived "issue" you referred to is a misunderstanding. See below.
>
> > >
> > > The prog would do:
> > > struct map_elem {
> > >     int stuff;
> > >     struct bpf_timer timer;
> > > };
> > >
> > > struct {
> > >     __uint(type, BPF_MAP_TYPE_HASH);
> > >     __uint(max_entries, 1);
> > >     __type(key, int);
> > >     __type(value, struct map_elem);
> > > } hmap SEC(".maps");
> > >
> > > static int timer_cb(struct map_elem *elem)
> > > {
> > >     if (whatever && elem->stuff)
> > >         bpf_timer_mod(&elem->timer, new_expire);
> > > }
> > >
> > > int bpf_timer_test(...)
> > > {
> > >     struct map_elem *val;
> > >
> > >     val = bpf_map_lookup_elem(&hmap, &key);
> > >     if (val) {
> > >         bpf_timer_init(&val->timer, timer_cb, flags);
> > >         val->stuff = 123;
> > >         bpf_timer_mod(&val->timer, expires);
> > >     }
> > > }
> > >
> > > bpf_map_update_elem() either from bpf prog or from user space
> > > allocates map element and zeros 8 byte space for the timer pointer.
> > > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > > The validation of timer_cb() is done by the verifier.
> > > bpf_map_delete_elem() either from bpf prog or from user space
> > > does del_timer() if elem->opaque != 0.
> > > If prog refers such hmap as above during prog free the kernel does
> > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > I think that is the simplest way of prevent timers firing past the prog life time.
> > > There could be other ways to solve it (like prog_array and ref/uref).
> > >
> > > Pseudo code:
> > > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > > {
> > >   if (timer->opaque)
> > >     return -EBUSY;
> > >   t = alloc timer_list
> > >   t->cb = timer_cb;
> > >   t->..
> > >   timer->opaque = (long)t;
> > > }
> > >
> > > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > > {
> > >   if (!time->opaque)
> > >     return -EINVAL;
> > >   t = (struct timer_list *)timer->opaque;
> > >   mod_timer(t,..);
> > > }
> > >
> > > int bpf_timer_del(struct bpf_timer *timer)
> > > {
> > >   if (!time->opaque)
> > >     return -EINVAL;
> > >   t = (struct timer_list *)timer->opaque;
> > >   del_timer(t);
> > > }
> > >
> > > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > > via load/store by the program. The same way it does it for bpf_spin_lock.
> >
> > This does not work, because bpf_timer_del() has to be matched
> > with bpf_timer_init(), otherwise we would leak timer resources.
> > For example:
> >
> > SEC("foo")
> > bad_ebpf_code()
> > {
> >   struct bpf_timer t;
> >   bpf_timer_init(&t, ...); // allocate a timer
> >   bpf_timer_mod(&t, ..);
> >   // end of BPF program
> >   // now the timer is leaked, no one will delete it
> > }
> >
> > We can not enforce the matching in the verifier, because users would
> > have to call bpf_timer_del() before exiting, which is not what we want
> > either.
>
> ```
> bad_ebpf_code()
> {
>   struct bpf_timer t;
> ```
> is not at all what was proposed. This kind of code will be rejected by the verifier.
>
> 'struct bpf_timer' has to be part of the map element and the verifier will enforce that
> just like it does so for bpf_spin_lock.
> Try writing the following program:
> ```
> bad_ebpf_code()
> {
>   struct bpf_spin_lock t;
>   bpf_spin_lock(&t);
> }
> ``
> and then follow the code to see why the verifier rejects it.

Well, embedding a spinlock makes sense as it is used to protect
the value it is associated with, but for a timer, no, it has no value
to associate. Even if it does, updating it requires a lock as the
callback can run concurrently with value update. So, they are very
different hence should be treated differently rather than similarly.

>
> The implementation of what I'm proposing is straightforward.
> I certainly understand that it might look intimidating and "impossible",
> but it's really quite simple.

How do you refcnt the struct bpf_prog with your approach? Or with
actually any attempt to create timers in kernel-space. I am not intimidated
but quite happy to hear. If you do it in the verifier, we do not know which
code path is actually executed when running it. If you do it with JIT, I do
not see how JIT can even get the right struct bpf_prog pointer in context.

This is how I concluded it looks impossible, which has nothing to do
with whether we have a map or not. Map issue is much easier to solve,
whether using what you mentioned or what I showed.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-05 23:49                   ` Cong Wang
@ 2021-04-06  1:07                     ` Song Liu
  2021-04-06  1:24                       ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Song Liu @ 2021-04-06  1:07 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> Ideally I even prefer to create timers in kernel-space too, but as I already
>>> explained, this seems impossible to me.
>> 
>> Would hrtimer (include/linux/hrtimer.h) work?
> 
> By impossible, I meant it is impossible (to me) to take a refcnt to the callback
> prog if we create the timer in kernel-space. So, hrtimer is the same in this
> perspective.
> 
> Thanks.

I guess I am not following 100%. Here is what I would propose:

We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type. 
The new program will trigger based on a timer, and the program can somehow 
control the period of the timer (for example, via return value).

With this approach, the user simply can create multiple timer programs and 
hold the fd for them. And these programs trigger up to timer expiration. 

Does this make sense?

Thanks,
Song

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06  1:07                     ` Song Liu
@ 2021-04-06  1:24                       ` Cong Wang
  2021-04-06  6:17                         ` Song Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-06  1:24 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Mon, Apr 5, 2021 at 6:08 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> Ideally I even prefer to create timers in kernel-space too, but as I already
> >>> explained, this seems impossible to me.
> >>
> >> Would hrtimer (include/linux/hrtimer.h) work?
> >
> > By impossible, I meant it is impossible (to me) to take a refcnt to the callback
> > prog if we create the timer in kernel-space. So, hrtimer is the same in this
> > perspective.
> >
> > Thanks.
>
> I guess I am not following 100%. Here is what I would propose:
>
> We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type.
> The new program will trigger based on a timer, and the program can somehow
> control the period of the timer (for example, via return value).

Like we already discussed, with this approach the "timer" itself is not
visible to kernel, that is, only manageable in user-space. Or do you disagree?

>
> With this approach, the user simply can create multiple timer programs and
> hold the fd for them. And these programs trigger up to timer expiration.

Sure, this is precisely why I moved timer creation to user-space to solve
the refcnt issue. ;)

>
> Does this make sense?

Yes, except kernel-space code can't see it. If you look at the timeout map
I had, you will see something like this:

val = lookup(map, key);
if (val && val->expires < now)
   rearm_timer(&timer); // the timer periodically scans the hashmap

For conntrack, this is obviously in kernel-space. The point of the code is to
flush all expired items as soon as possible without doing explicit deletions
which are obviously expensive for the fast path.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06  1:24                       ` Cong Wang
@ 2021-04-06  6:17                         ` Song Liu
  2021-04-06 16:48                           ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Song Liu @ 2021-04-06  6:17 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 5, 2021, at 6:24 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Mon, Apr 5, 2021 at 6:08 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> Ideally I even prefer to create timers in kernel-space too, but as I already
>>>>> explained, this seems impossible to me.
>>>> 
>>>> Would hrtimer (include/linux/hrtimer.h) work?
>>> 
>>> By impossible, I meant it is impossible (to me) to take a refcnt to the callback
>>> prog if we create the timer in kernel-space. So, hrtimer is the same in this
>>> perspective.
>>> 
>>> Thanks.
>> 
>> I guess I am not following 100%. Here is what I would propose:
>> 
>> We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type.
>> The new program will trigger based on a timer, and the program can somehow
>> control the period of the timer (for example, via return value).
> 
> Like we already discussed, with this approach the "timer" itself is not
> visible to kernel, that is, only manageable in user-space. Or do you disagree?

Do you mean we need mechanisms to control the timer, like stop the timer, 
trigger the timer immediately, etc.? And we need these mechanisms in kernel?
And by "in kernel-space" I assume you mean from BPF programs. 

If these are correct, how about something like:

1. A new program BPF_PROG_TYPE_TIMER, which by default will trigger on a timer. 
   Note that, the timer here is embedded in the program. So all the operations
   are on the program. 
2. Allow adding such BPF_PROG_TYPE_TIMER programs to a map of type 
   BPF_MAP_TYPE_PROG_ARRAY. 
3. Some new helpers that access the program via the BPF_MAP_TYPE_PROG_ARRAY map. 
   Actually, maybe we can reuse existing bpf_tail_call(). 

The BPF program and map will look like:

==================== 8< ==========================
struct data_elem {
	__u64 expiration;
	/* other data */
}; 

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 256);
	__type(key, __u32);
	__type(value, struct data_elem);
} data_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 256);
	__type(key, __u32);
	__type(value, __u64);
} timer_prog_map SEC(".maps");

static __u64
check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
                 int *data)
{
	u64 expires = *val;

	if (expires < bpf_jiffies64()) {
		bpf_map_delete_elem(map, key);
		*data++;
	}
 return 0;
}

SEC("timer")
int clean_up_timer(void)
{
	int count;

	bpf_for_each_map_elem(&data_map, check_expired_elem, &count, 0);
	if (count)
 		return 0; // not re-arm this timer
 	else
 		return 10; // reschedule this timer after 10 jiffies
}

SEC("tp_btf/XXX")
int another_trigger(void)
{
	if (some_condition)
		bpf_tail_call(NULL, &timer_prog_map, idx);
	return 0;
}

==================== 8< ==========================

Would something like this work for contract?

Thanks,
Song

> 
>> 
>> With this approach, the user simply can create multiple timer programs and
>> hold the fd for them. And these programs trigger up to timer expiration.
> 
> Sure, this is precisely why I moved timer creation to user-space to solve
> the refcnt issue. ;)
> 
>> 
>> Does this make sense?
> 
> Yes, except kernel-space code can't see it. If you look at the timeout map
> I had, you will see something like this:
> 
> val = lookup(map, key);
> if (val && val->expires < now)
>   rearm_timer(&timer); // the timer periodically scans the hashmap
> 
> For conntrack, this is obviously in kernel-space. The point of the code is to
> flush all expired items as soon as possible without doing explicit deletions
> which are obviously expensive for the fast path.
> 
> Thanks.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06  6:17                         ` Song Liu
@ 2021-04-06 16:48                           ` Cong Wang
  2021-04-06 23:36                             ` Song Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-06 16:48 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Mon, Apr 5, 2021 at 11:18 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 5, 2021, at 6:24 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Mon, Apr 5, 2021 at 6:08 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>>
> >>>>> Ideally I even prefer to create timers in kernel-space too, but as I already
> >>>>> explained, this seems impossible to me.
> >>>>
> >>>> Would hrtimer (include/linux/hrtimer.h) work?
> >>>
> >>> By impossible, I meant it is impossible (to me) to take a refcnt to the callback
> >>> prog if we create the timer in kernel-space. So, hrtimer is the same in this
> >>> perspective.
> >>>
> >>> Thanks.
> >>
> >> I guess I am not following 100%. Here is what I would propose:
> >>
> >> We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type.
> >> The new program will trigger based on a timer, and the program can somehow
> >> control the period of the timer (for example, via return value).
> >
> > Like we already discussed, with this approach the "timer" itself is not
> > visible to kernel, that is, only manageable in user-space. Or do you disagree?
>
> Do you mean we need mechanisms to control the timer, like stop the timer,
> trigger the timer immediately, etc.? And we need these mechanisms in kernel?
> And by "in kernel-space" I assume you mean from BPF programs.

Yes, of course. In the context of our discussion, kernel-space only means
eBPF code running in kernel-space. And like I showed in pseudo code,
we want to manage the timer in eBPF code too, that is, updating timer
expiration time and even deleting a timer. The point is we want to give
users as much flexibility as possible, so that they can use it in whatever
scenarios they want. We do not decide how they use them, they do.

>
> If these are correct, how about something like:
>
> 1. A new program BPF_PROG_TYPE_TIMER, which by default will trigger on a timer.
>    Note that, the timer here is embedded in the program. So all the operations
>    are on the program.
> 2. Allow adding such BPF_PROG_TYPE_TIMER programs to a map of type
>    BPF_MAP_TYPE_PROG_ARRAY.
> 3. Some new helpers that access the program via the BPF_MAP_TYPE_PROG_ARRAY map.
>    Actually, maybe we can reuse existing bpf_tail_call().

Reusing bpf_tail_call() just for timer sounds even crazier than
my current approach. So... what's the advantage of your approach
compared to mine?


>
> The BPF program and map will look like:
>
> ==================== 8< ==========================
> struct data_elem {
>         __u64 expiration;
>         /* other data */
> };

So, expiration is separated from "timer" itself. Naturally, expiration
belongs to the timer itself.

>
> struct {
>         __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>         __uint(max_entries, 256);
>         __type(key, __u32);
>         __type(value, struct data_elem);
> } data_map SEC(".maps");
>
> struct {
>         __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>         __uint(max_entries, 256);
>         __type(key, __u32);
>         __type(value, __u64);
> } timer_prog_map SEC(".maps");

So, users have to use two maps just for a timer. Looks unnecessarily
complex to me.

>
> static __u64
> check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
>                  int *data)
> {
>         u64 expires = *val;
>
>         if (expires < bpf_jiffies64()) {

Value is a 64-bit 'expiration', so it is not atomic to read/write it on 32bit
CPU. And user-space could update it in parallel to this timer callback.
So basically we have to use a bpf spinlock here.


>                 bpf_map_delete_elem(map, key);
>                 *data++;
>         }
>  return 0;
> }
>
> SEC("timer")
> int clean_up_timer(void)
> {
>         int count;
>
>         bpf_for_each_map_elem(&data_map, check_expired_elem, &count, 0);
>         if (count)
>                 return 0; // not re-arm this timer
>         else
>                 return 10; // reschedule this timer after 10 jiffies
> }
>
> SEC("tp_btf/XXX")
> int another_trigger(void)
> {
>         if (some_condition)
>                 bpf_tail_call(NULL, &timer_prog_map, idx);

Are you sure you can use bpf_tail_call() to call a prog asynchronously?


>         return 0;
> }
>
> ==================== 8< ==========================
>
> Would something like this work for contract?

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06 16:48                           ` Cong Wang
@ 2021-04-06 23:36                             ` Song Liu
  2021-04-08 22:45                               ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Song Liu @ 2021-04-06 23:36 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 6, 2021, at 9:48 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Mon, Apr 5, 2021 at 11:18 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 5, 2021, at 6:24 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Mon, Apr 5, 2021 at 6:08 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>> 
>>>>>>> Ideally I even prefer to create timers in kernel-space too, but as I already
>>>>>>> explained, this seems impossible to me.
>>>>>> 
>>>>>> Would hrtimer (include/linux/hrtimer.h) work?
>>>>> 
>>>>> By impossible, I meant it is impossible (to me) to take a refcnt to the callback
>>>>> prog if we create the timer in kernel-space. So, hrtimer is the same in this
>>>>> perspective.
>>>>> 
>>>>> Thanks.
>>>> 
>>>> I guess I am not following 100%. Here is what I would propose:
>>>> 
>>>> We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type.
>>>> The new program will trigger based on a timer, and the program can somehow
>>>> control the period of the timer (for example, via return value).
>>> 
>>> Like we already discussed, with this approach the "timer" itself is not
>>> visible to kernel, that is, only manageable in user-space. Or do you disagree?
>> 
>> Do you mean we need mechanisms to control the timer, like stop the timer,
>> trigger the timer immediately, etc.? And we need these mechanisms in kernel?
>> And by "in kernel-space" I assume you mean from BPF programs.
> 
> Yes, of course. In the context of our discussion, kernel-space only means
> eBPF code running in kernel-space. And like I showed in pseudo code,
> we want to manage the timer in eBPF code too, that is, updating timer
> expiration time and even deleting a timer. The point is we want to give
> users as much flexibility as possible, so that they can use it in whatever
> scenarios they want. We do not decide how they use them, they do.
> 
>> 
>> If these are correct, how about something like:
>> 
>> 1. A new program BPF_PROG_TYPE_TIMER, which by default will trigger on a timer.
>>   Note that, the timer here is embedded in the program. So all the operations
>>   are on the program.
>> 2. Allow adding such BPF_PROG_TYPE_TIMER programs to a map of type
>>   BPF_MAP_TYPE_PROG_ARRAY.
>> 3. Some new helpers that access the program via the BPF_MAP_TYPE_PROG_ARRAY map.
>>   Actually, maybe we can reuse existing bpf_tail_call().
> 
> Reusing bpf_tail_call() just for timer sounds even crazier than
> my current approach. So... what's the advantage of your approach
> compared to mine?

Since I don't know much about conntrack, I don't know which is better. 
The follow is just my thoughts based on the example you showed. It is 
likely that I misunderstand something. 

IIUC, the problem with user space timer is that we need a dedicated task 
for each wait-trigger loop. So I am proposing a BPF program that would
trigger up on timer expiration. 

The advantage (I think) is to not introduce a separate timer entity. 
The timer is bundled with the program.  

> 
> 
>> 
>> The BPF program and map will look like:
>> 
>> ==================== 8< ==========================
>> struct data_elem {
>>        __u64 expiration;
>>        /* other data */
>> };
> 
> So, expiration is separated from "timer" itself. Naturally, expiration
> belongs to the timer itself.

In this example, expiration is not related to the timer. It is just part
of the data element. It is possible that we won't need the expiration for 
some use cases. 

> 
>> 
>> struct {
>>        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>>        __uint(max_entries, 256);
>>        __type(key, __u32);
>>        __type(value, struct data_elem);
>> } data_map SEC(".maps");
>> 
>> struct {
>>        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>>        __uint(max_entries, 256);
>>        __type(key, __u32);
>>        __type(value, __u64);
>> } timer_prog_map SEC(".maps");
> 
> So, users have to use two maps just for a timer. Looks unnecessarily
> complex to me.

The data_map is not for the timer program, it is for the actual data. 
timer_prog_map is also optional here. We only need it when we want to 
trigger the program sooner than the scheduled time. If we can wait a
little longer, timer_prog_map can also be removed.

> 
>> 
>> static __u64
>> check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
>>                 int *data)
>> {
>>        u64 expires = *val;
>> 
>>        if (expires < bpf_jiffies64()) {
> 
> Value is a 64-bit 'expiration', so it is not atomic to read/write it on 32bit
> CPU. And user-space could update it in parallel to this timer callback.
> So basically we have to use a bpf spinlock here.
> 
> 
>>                bpf_map_delete_elem(map, key);
>>                *data++;
>>        }
>> return 0;
>> }
>> 
>> SEC("timer")
>> int clean_up_timer(void)
>> {
>>        int count;
>> 
>>        bpf_for_each_map_elem(&data_map, check_expired_elem, &count, 0);
>>        if (count)
>>                return 0; // not re-arm this timer
>>        else
>>                return 10; // reschedule this timer after 10 jiffies
>> }
>> 
>> SEC("tp_btf/XXX")
>> int another_trigger(void)
>> {
>>        if (some_condition)
>>                bpf_tail_call(NULL, &timer_prog_map, idx);
> 
> Are you sure you can use bpf_tail_call() to call a prog asynchronously?

I am not sure that we gonna use bpf_tail_call() here. If necessary, we 
can introduce a new helper. 


I am not sure whether this makes sense. I feel there is still some 
misunderstanding. It will be helpful if you can share more information 
about the overall design. 

BTW: this could be a good topic for the BPF office hour. See more details
here:

https://docs.google.com/spreadsheets/d/1LfrDXZ9-fdhvPEp_LHkxAMYyxxpwBXjywWa0AejEveU/edit#gid=0

Thanks,
Song

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06 23:36                             ` Song Liu
@ 2021-04-08 22:45                               ` Cong Wang
  0 siblings, 0 replies; 25+ messages in thread
From: Cong Wang @ 2021-04-08 22:45 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Tue, Apr 6, 2021 at 4:36 PM Song Liu <songliubraving@fb.com> wrote:
> I am not sure whether this makes sense. I feel there is still some
> misunderstanding. It will be helpful if you can share more information
> about the overall design.
>
> BTW: this could be a good topic for the BPF office hour. See more details
> here:
>
> https://docs.google.com/spreadsheets/d/1LfrDXZ9-fdhvPEp_LHkxAMYyxxpwBXjywWa0AejEveU/edit#gid=0
>

This is a good idea. I have requested for a slot next Thursday,
I am looking forward to discussing bpf timer at that time.

Thanks!

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06  0:36       ` Cong Wang
@ 2021-04-12 23:01         ` Alexei Starovoitov
  2021-04-15  4:02           ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Alexei Starovoitov @ 2021-04-12 23:01 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Mon, Apr 05, 2021 at 05:36:27PM -0700, Cong Wang wrote:
> On Fri, Apr 2, 2021 at 4:45 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > > > where the key is the timer ID and the value is the timer expire
> > > > > timer.
> > > >
> > > > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > > > bpf object. It doesn't scale.
> > >
> > > The IDR is per map, not per timer.
> >
> > Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
> > We have 3 IDRs now: for progs, for maps, and for links.
> > No other objects need IDRs.
> >
> > > > Here is how more general timers might look like:
> > > > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> > > >
> > > > include/uapi/linux/bpf.h:
> > > > struct bpf_timer {
> > > >   u64 opaque;
> > > > };
> > > > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> > >
> > > This is my initial design as we already discussed, it does not work,
> > > please see below.
> >
> > It does work. The perceived "issue" you referred to is a misunderstanding. See below.
> >
> > > >
> > > > The prog would do:
> > > > struct map_elem {
> > > >     int stuff;
> > > >     struct bpf_timer timer;
> > > > };
> > > >
> > > > struct {
> > > >     __uint(type, BPF_MAP_TYPE_HASH);
> > > >     __uint(max_entries, 1);
> > > >     __type(key, int);
> > > >     __type(value, struct map_elem);
> > > > } hmap SEC(".maps");
> > > >
> > > > static int timer_cb(struct map_elem *elem)
> > > > {
> > > >     if (whatever && elem->stuff)
> > > >         bpf_timer_mod(&elem->timer, new_expire);
> > > > }
> > > >
> > > > int bpf_timer_test(...)
> > > > {
> > > >     struct map_elem *val;
> > > >
> > > >     val = bpf_map_lookup_elem(&hmap, &key);
> > > >     if (val) {
> > > >         bpf_timer_init(&val->timer, timer_cb, flags);
> > > >         val->stuff = 123;
> > > >         bpf_timer_mod(&val->timer, expires);
> > > >     }
> > > > }
> > > >
> > > > bpf_map_update_elem() either from bpf prog or from user space
> > > > allocates map element and zeros 8 byte space for the timer pointer.
> > > > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > > > The validation of timer_cb() is done by the verifier.
> > > > bpf_map_delete_elem() either from bpf prog or from user space
> > > > does del_timer() if elem->opaque != 0.
> > > > If prog refers such hmap as above during prog free the kernel does
> > > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > > I think that is the simplest way of prevent timers firing past the prog life time.
> > > > There could be other ways to solve it (like prog_array and ref/uref).
> > > >
> > > > Pseudo code:
> > > > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > > > {
> > > >   if (timer->opaque)
> > > >     return -EBUSY;
> > > >   t = alloc timer_list
> > > >   t->cb = timer_cb;
> > > >   t->..
> > > >   timer->opaque = (long)t;
> > > > }
> > > >
> > > > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > > > {
> > > >   if (!time->opaque)
> > > >     return -EINVAL;
> > > >   t = (struct timer_list *)timer->opaque;
> > > >   mod_timer(t,..);
> > > > }
> > > >
> > > > int bpf_timer_del(struct bpf_timer *timer)
> > > > {
> > > >   if (!time->opaque)
> > > >     return -EINVAL;
> > > >   t = (struct timer_list *)timer->opaque;
> > > >   del_timer(t);
> > > > }
> > > >
> > > > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > > > via load/store by the program. The same way it does it for bpf_spin_lock.
> > >
> > > This does not work, because bpf_timer_del() has to be matched
> > > with bpf_timer_init(), otherwise we would leak timer resources.
> > > For example:
> > >
> > > SEC("foo")
> > > bad_ebpf_code()
> > > {
> > >   struct bpf_timer t;
> > >   bpf_timer_init(&t, ...); // allocate a timer
> > >   bpf_timer_mod(&t, ..);
> > >   // end of BPF program
> > >   // now the timer is leaked, no one will delete it
> > > }
> > >
> > > We can not enforce the matching in the verifier, because users would
> > > have to call bpf_timer_del() before exiting, which is not what we want
> > > either.
> >
> > ```
> > bad_ebpf_code()
> > {
> >   struct bpf_timer t;
> > ```
> > is not at all what was proposed. This kind of code will be rejected by the verifier.
> >
> > 'struct bpf_timer' has to be part of the map element and the verifier will enforce that
> > just like it does so for bpf_spin_lock.
> > Try writing the following program:
> > ```
> > bad_ebpf_code()
> > {
> >   struct bpf_spin_lock t;
> >   bpf_spin_lock(&t);
> > }
> > ``
> > and then follow the code to see why the verifier rejects it.
> 
> Well, embedding a spinlock makes sense as it is used to protect
> the value it is associated with, but for a timer, no, it has no value
> to associate. 

The way kernel code is using timers is alwasy by embedding timer_list
into another data structure and then using container_of() in a callback.
So all existing use cases of timers disagree with your point.

> Even if it does, updating it requires a lock as the
> callback can run concurrently with value update. 

No lock is necessary.
map_value_update_elem can either return EBUSY if timer_list != NULL
or it can del_timer() before updating the whole value.
Both choices can be expressed with flags.

> So, they are very
> different hence should be treated differently rather than similarly.
> 
> >
> > The implementation of what I'm proposing is straightforward.
> > I certainly understand that it might look intimidating and "impossible",
> > but it's really quite simple.
> 
> How do you refcnt the struct bpf_prog with your approach? Or with

you don't. More so prog must not be refcnted otherwise it's a circular
dependency between progs and maps.
We did that in the past with prog_array and the api became unpleasant
and not user friendly. Not going to repeat the same mistake again.

> actually any attempt to create timers in kernel-space. I am not intimidated
> but quite happy to hear. If you do it in the verifier, we do not know which
> code path is actually executed when running it. If you do it with JIT, I do
> not see how JIT can even get the right struct bpf_prog pointer in context.

Neither. See pseudo code for bpf_timer_init/bpf_timer_mod in the earlier email.

> This is how I concluded it looks impossible.

Please explain what 'impossible' or buggy you see in the pseudo code.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-12 23:01         ` Alexei Starovoitov
@ 2021-04-15  4:02           ` Cong Wang
  2021-04-15  4:25             ` Alexei Starovoitov
  0 siblings, 1 reply; 25+ messages in thread
From: Cong Wang @ 2021-04-15  4:02 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Mon, Apr 12, 2021 at 4:01 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Apr 05, 2021 at 05:36:27PM -0700, Cong Wang wrote:
> > On Fri, Apr 2, 2021 at 4:45 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > > > > where the key is the timer ID and the value is the timer expire
> > > > > > timer.
> > > > >
> > > > > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > > > > bpf object. It doesn't scale.
> > > >
> > > > The IDR is per map, not per timer.
> > >
> > > Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
> > > We have 3 IDRs now: for progs, for maps, and for links.
> > > No other objects need IDRs.
> > >
> > > > > Here is how more general timers might look like:
> > > > > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> > > > >
> > > > > include/uapi/linux/bpf.h:
> > > > > struct bpf_timer {
> > > > >   u64 opaque;
> > > > > };
> > > > > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> > > >
> > > > This is my initial design as we already discussed, it does not work,
> > > > please see below.
> > >
> > > It does work. The perceived "issue" you referred to is a misunderstanding. See below.
> > >
> > > > >
> > > > > The prog would do:
> > > > > struct map_elem {
> > > > >     int stuff;
> > > > >     struct bpf_timer timer;
> > > > > };
> > > > >
> > > > > struct {
> > > > >     __uint(type, BPF_MAP_TYPE_HASH);
> > > > >     __uint(max_entries, 1);
> > > > >     __type(key, int);
> > > > >     __type(value, struct map_elem);
> > > > > } hmap SEC(".maps");
> > > > >
> > > > > static int timer_cb(struct map_elem *elem)
> > > > > {
> > > > >     if (whatever && elem->stuff)
> > > > >         bpf_timer_mod(&elem->timer, new_expire);
> > > > > }
> > > > >
> > > > > int bpf_timer_test(...)
> > > > > {
> > > > >     struct map_elem *val;
> > > > >
> > > > >     val = bpf_map_lookup_elem(&hmap, &key);
> > > > >     if (val) {
> > > > >         bpf_timer_init(&val->timer, timer_cb, flags);
> > > > >         val->stuff = 123;
> > > > >         bpf_timer_mod(&val->timer, expires);
> > > > >     }
> > > > > }
> > > > >
> > > > > bpf_map_update_elem() either from bpf prog or from user space
> > > > > allocates map element and zeros 8 byte space for the timer pointer.
> > > > > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > > > > The validation of timer_cb() is done by the verifier.
> > > > > bpf_map_delete_elem() either from bpf prog or from user space
> > > > > does del_timer() if elem->opaque != 0.
> > > > > If prog refers such hmap as above during prog free the kernel does
> > > > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > > > I think that is the simplest way of prevent timers firing past the prog life time.
> > > > > There could be other ways to solve it (like prog_array and ref/uref).
> > > > >
> > > > > Pseudo code:
> > > > > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > > > > {
> > > > >   if (timer->opaque)
> > > > >     return -EBUSY;
> > > > >   t = alloc timer_list
> > > > >   t->cb = timer_cb;
> > > > >   t->..
> > > > >   timer->opaque = (long)t;
> > > > > }
> > > > >
> > > > > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > > > > {
> > > > >   if (!time->opaque)
> > > > >     return -EINVAL;
> > > > >   t = (struct timer_list *)timer->opaque;
> > > > >   mod_timer(t,..);
> > > > > }
> > > > >
> > > > > int bpf_timer_del(struct bpf_timer *timer)
> > > > > {
> > > > >   if (!time->opaque)
> > > > >     return -EINVAL;
> > > > >   t = (struct timer_list *)timer->opaque;
> > > > >   del_timer(t);
> > > > > }
> > > > >
> > > > > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > > > > via load/store by the program. The same way it does it for bpf_spin_lock.
> > > >
> > > > This does not work, because bpf_timer_del() has to be matched
> > > > with bpf_timer_init(), otherwise we would leak timer resources.
> > > > For example:
> > > >
> > > > SEC("foo")
> > > > bad_ebpf_code()
> > > > {
> > > >   struct bpf_timer t;
> > > >   bpf_timer_init(&t, ...); // allocate a timer
> > > >   bpf_timer_mod(&t, ..);
> > > >   // end of BPF program
> > > >   // now the timer is leaked, no one will delete it
> > > > }
> > > >
> > > > We can not enforce the matching in the verifier, because users would
> > > > have to call bpf_timer_del() before exiting, which is not what we want
> > > > either.
> > >
> > > ```
> > > bad_ebpf_code()
> > > {
> > >   struct bpf_timer t;
> > > ```
> > > is not at all what was proposed. This kind of code will be rejected by the verifier.
> > >
> > > 'struct bpf_timer' has to be part of the map element and the verifier will enforce that
> > > just like it does so for bpf_spin_lock.
> > > Try writing the following program:
> > > ```
> > > bad_ebpf_code()
> > > {
> > >   struct bpf_spin_lock t;
> > >   bpf_spin_lock(&t);
> > > }
> > > ``
> > > and then follow the code to see why the verifier rejects it.
> >
> > Well, embedding a spinlock makes sense as it is used to protect
> > the value it is associated with, but for a timer, no, it has no value
> > to associate.
>
> The way kernel code is using timers is alwasy by embedding timer_list
> into another data structure and then using container_of() in a callback.
> So all existing use cases of timers disagree with your point.

Not always. Data can be passed as a global data structure visible to
timer callback.

>
> > Even if it does, updating it requires a lock as the
> > callback can run concurrently with value update.
>
> No lock is necessary.
> map_value_update_elem can either return EBUSY if timer_list != NULL
> or it can del_timer() before updating the whole value.
> Both choices can be expressed with flags.

This sounds problematic, because the hash map is visible to
users but not the timers associated, hence in user-space users
just unexpectedly get EBUSY.

>
> > So, they are very
> > different hence should be treated differently rather than similarly.
> >
> > >
> > > The implementation of what I'm proposing is straightforward.
> > > I certainly understand that it might look intimidating and "impossible",
> > > but it's really quite simple.
> >
> > How do you refcnt the struct bpf_prog with your approach? Or with
>
> you don't. More so prog must not be refcnted otherwise it's a circular
> dependency between progs and maps.
> We did that in the past with prog_array and the api became unpleasant
> and not user friendly. Not going to repeat the same mistake again.

Then how do you prevent prog being unloaded when the timer callback
is still active?


>
> > actually any attempt to create timers in kernel-space. I am not intimidated
> > but quite happy to hear. If you do it in the verifier, we do not know which
> > code path is actually executed when running it. If you do it with JIT, I do
> > not see how JIT can even get the right struct bpf_prog pointer in context.
>
> Neither. See pseudo code for bpf_timer_init/bpf_timer_mod in the earlier email.
>
> > This is how I concluded it looks impossible.
>
> Please explain what 'impossible' or buggy you see in the pseudo code.

Your pseudo code never shows how to refcnt the struct bpf_prog, which
is critical to the bpf timer design.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-15  4:02           ` Cong Wang
@ 2021-04-15  4:25             ` Alexei Starovoitov
  2021-04-15 15:51               ` Cong Wang
  0 siblings, 1 reply; 25+ messages in thread
From: Alexei Starovoitov @ 2021-04-15  4:25 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Wed, Apr 14, 2021 at 9:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Mon, Apr 12, 2021 at 4:01 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Apr 05, 2021 at 05:36:27PM -0700, Cong Wang wrote:
> > > On Fri, Apr 2, 2021 at 4:45 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > > > > > where the key is the timer ID and the value is the timer expire
> > > > > > > timer.
> > > > > >
> > > > > > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > > > > > bpf object. It doesn't scale.
> > > > >
> > > > > The IDR is per map, not per timer.
> > > >
> > > > Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
> > > > We have 3 IDRs now: for progs, for maps, and for links.
> > > > No other objects need IDRs.
> > > >
> > > > > > Here is how more general timers might look like:
> > > > > > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> > > > > >
> > > > > > include/uapi/linux/bpf.h:
> > > > > > struct bpf_timer {
> > > > > >   u64 opaque;
> > > > > > };
> > > > > > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> > > > >
> > > > > This is my initial design as we already discussed, it does not work,
> > > > > please see below.
> > > >
> > > > It does work. The perceived "issue" you referred to is a misunderstanding. See below.
> > > >
> > > > > >
> > > > > > The prog would do:
> > > > > > struct map_elem {
> > > > > >     int stuff;
> > > > > >     struct bpf_timer timer;
> > > > > > };
> > > > > >
> > > > > > struct {
> > > > > >     __uint(type, BPF_MAP_TYPE_HASH);
> > > > > >     __uint(max_entries, 1);
> > > > > >     __type(key, int);
> > > > > >     __type(value, struct map_elem);
> > > > > > } hmap SEC(".maps");
> > > > > >
> > > > > > static int timer_cb(struct map_elem *elem)
> > > > > > {
> > > > > >     if (whatever && elem->stuff)
> > > > > >         bpf_timer_mod(&elem->timer, new_expire);
> > > > > > }
> > > > > >
> > > > > > int bpf_timer_test(...)
> > > > > > {
> > > > > >     struct map_elem *val;
> > > > > >
> > > > > >     val = bpf_map_lookup_elem(&hmap, &key);
> > > > > >     if (val) {
> > > > > >         bpf_timer_init(&val->timer, timer_cb, flags);
> > > > > >         val->stuff = 123;
> > > > > >         bpf_timer_mod(&val->timer, expires);
> > > > > >     }
> > > > > > }
> > > > > >
> > > > > > bpf_map_update_elem() either from bpf prog or from user space
> > > > > > allocates map element and zeros 8 byte space for the timer pointer.
> > > > > > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > > > > > The validation of timer_cb() is done by the verifier.
> > > > > > bpf_map_delete_elem() either from bpf prog or from user space
> > > > > > does del_timer() if elem->opaque != 0.
> > > > > > If prog refers such hmap as above during prog free the kernel does
> > > > > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > > > > I think that is the simplest way of prevent timers firing past the prog life time.
> > > > > > There could be other ways to solve it (like prog_array and ref/uref).
> > > > > >
> > > > > > Pseudo code:
> > > > > > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > > > > > {
> > > > > >   if (timer->opaque)
> > > > > >     return -EBUSY;
> > > > > >   t = alloc timer_list
> > > > > >   t->cb = timer_cb;
> > > > > >   t->..
> > > > > >   timer->opaque = (long)t;
> > > > > > }
> > > > > >
> > > > > > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > > > > > {
> > > > > >   if (!time->opaque)
> > > > > >     return -EINVAL;
> > > > > >   t = (struct timer_list *)timer->opaque;
> > > > > >   mod_timer(t,..);
> > > > > > }
> > > > > >
> > > > > > int bpf_timer_del(struct bpf_timer *timer)
> > > > > > {
> > > > > >   if (!time->opaque)
> > > > > >     return -EINVAL;
> > > > > >   t = (struct timer_list *)timer->opaque;
> > > > > >   del_timer(t);
> > > > > > }
> > > > > >
> > > > > > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > > > > > via load/store by the program. The same way it does it for bpf_spin_lock.
> > > > >
> > > > > This does not work, because bpf_timer_del() has to be matched
> > > > > with bpf_timer_init(), otherwise we would leak timer resources.
> > > > > For example:
> > > > >
> > > > > SEC("foo")
> > > > > bad_ebpf_code()
> > > > > {
> > > > >   struct bpf_timer t;
> > > > >   bpf_timer_init(&t, ...); // allocate a timer
> > > > >   bpf_timer_mod(&t, ..);
> > > > >   // end of BPF program
> > > > >   // now the timer is leaked, no one will delete it
> > > > > }
> > > > >
> > > > > We can not enforce the matching in the verifier, because users would
> > > > > have to call bpf_timer_del() before exiting, which is not what we want
> > > > > either.
> > > >
> > > > ```
> > > > bad_ebpf_code()
> > > > {
> > > >   struct bpf_timer t;
> > > > ```
> > > > is not at all what was proposed. This kind of code will be rejected by the verifier.
> > > >
> > > > 'struct bpf_timer' has to be part of the map element and the verifier will enforce that
> > > > just like it does so for bpf_spin_lock.
> > > > Try writing the following program:
> > > > ```
> > > > bad_ebpf_code()
> > > > {
> > > >   struct bpf_spin_lock t;
> > > >   bpf_spin_lock(&t);
> > > > }
> > > > ``
> > > > and then follow the code to see why the verifier rejects it.
> > >
> > > Well, embedding a spinlock makes sense as it is used to protect
> > > the value it is associated with, but for a timer, no, it has no value
> > > to associate.
> >
> > The way kernel code is using timers is alwasy by embedding timer_list
> > into another data structure and then using container_of() in a callback.
> > So all existing use cases of timers disagree with your point.
>
> Not always. Data can be passed as a global data structure visible to
> timer callback.

global data is racy. That's not an option at all.

> >
> > > Even if it does, updating it requires a lock as the
> > > callback can run concurrently with value update.
> >
> > No lock is necessary.
> > map_value_update_elem can either return EBUSY if timer_list != NULL
> > or it can del_timer() before updating the whole value.
> > Both choices can be expressed with flags.
>
> This sounds problematic, because the hash map is visible to
> users but not the timers associated, hence in user-space users
> just unexpectedly get EBUSY.

As I said earlier:
"
bpf_map_update_elem() either from bpf prog or from user space
allocates map element and zeros 8 byte space for the timer pointer.
"
and also said that EBUSY could be default or non default behavior
expressed with flags passed into update.

> >
> > > So, they are very
> > > different hence should be treated differently rather than similarly.
> > >
> > > >
> > > > The implementation of what I'm proposing is straightforward.
> > > > I certainly understand that it might look intimidating and "impossible",
> > > > but it's really quite simple.
> > >
> > > How do you refcnt the struct bpf_prog with your approach? Or with
> >
> > you don't. More so prog must not be refcnted otherwise it's a circular
> > dependency between progs and maps.
> > We did that in the past with prog_array and the api became unpleasant
> > and not user friendly. Not going to repeat the same mistake again.
>
> Then how do you prevent prog being unloaded when the timer callback
> is still active?

As I said earlier:
"
If prog refers such hmap as above during prog free the kernel does
for_each_map_elem {if (elem->opaque) del_timer().}
"

>
> >
> > > actually any attempt to create timers in kernel-space. I am not intimidated
> > > but quite happy to hear. If you do it in the verifier, we do not know which
> > > code path is actually executed when running it. If you do it with JIT, I do
> > > not see how JIT can even get the right struct bpf_prog pointer in context.
> >
> > Neither. See pseudo code for bpf_timer_init/bpf_timer_mod in the earlier email.
> >
> > > This is how I concluded it looks impossible.
> >
> > Please explain what 'impossible' or buggy you see in the pseudo code.
>
> Your pseudo code never shows how to refcnt the struct bpf_prog, which
> is critical to the bpf timer design.

As I said earlier: nack to refcnt progs.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-15  4:25             ` Alexei Starovoitov
@ 2021-04-15 15:51               ` Cong Wang
  0 siblings, 0 replies; 25+ messages in thread
From: Cong Wang @ 2021-04-15 15:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Wed, Apr 14, 2021 at 9:25 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> As I said earlier:
> "
> If prog refers such hmap as above during prog free the kernel does
> for_each_map_elem {if (elem->opaque) del_timer().}
> "

This goes back to our previous discussion. Forcing timer deletions on
prog exit is not what we want. The whole point of using a map is to
extend the lifetime of a timer, that is, as long as the map exists, the
timers within it could be still running too.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, back to index

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-01  4:26 [RFC Patch bpf-next] bpf: introduce bpf timer Cong Wang
2021-04-01  6:38 ` Song Liu
2021-04-01 17:28   ` Cong Wang
2021-04-01 20:17     ` Song Liu
2021-04-02 17:34       ` Cong Wang
2021-04-02 17:57         ` Song Liu
2021-04-02 19:08           ` Cong Wang
2021-04-02 19:43             ` Song Liu
2021-04-02 20:57               ` Cong Wang
2021-04-02 23:31                 ` Song Liu
2021-04-05 23:49                   ` Cong Wang
2021-04-06  1:07                     ` Song Liu
2021-04-06  1:24                       ` Cong Wang
2021-04-06  6:17                         ` Song Liu
2021-04-06 16:48                           ` Cong Wang
2021-04-06 23:36                             ` Song Liu
2021-04-08 22:45                               ` Cong Wang
2021-04-02 19:28 ` Alexei Starovoitov
2021-04-02 21:24   ` Cong Wang
2021-04-02 23:45     ` Alexei Starovoitov
2021-04-06  0:36       ` Cong Wang
2021-04-12 23:01         ` Alexei Starovoitov
2021-04-15  4:02           ` Cong Wang
2021-04-15  4:25             ` Alexei Starovoitov
2021-04-15 15:51               ` Cong Wang

BPF Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/bpf/0 bpf/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 bpf bpf/ https://lore.kernel.org/bpf \
		bpf@vger.kernel.org
	public-inbox-index bpf

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.bpf


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git