All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC Patch bpf-next] bpf: introduce bpf timer
@ 2021-04-01  4:26 Cong Wang
  2021-04-01  6:38 ` Song Liu
  2021-04-02 19:28 ` Alexei Starovoitov
  0 siblings, 2 replies; 79+ messages in thread
From: Cong Wang @ 2021-04-01  4:26 UTC (permalink / raw)
  To: netdev
  Cc: bpf, duanxiongchun, wangdongdong.6, songmuchun, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song

From: Cong Wang <cong.wang@bytedance.com>

(This patch is still in early stage and obviously incomplete. I am sending
it out to get some high-level feedback. Please kindly ignore any coding
details for now and focus on the design.)

This patch introduces a bpf timer map and a syscall to create bpf timer
from user-space.

The reason why we have to use a map is the lifetime of a timer: without a
map, we would have to delete the timer before exiting the eBPF program,
which would significantly limit its use cases. With a map, the timer can
stay as long as the map itself and can actually be updated via map update
API's too, where the key is the timer ID and the value is the timer expire
time.

Timer creation is not easy either. In order to prevent users from creating a
timer but not adding it to a map, we have to enforce this in the API, which
takes a map parameter and adds the new timer into the map in one shot.

And because the timer is asynchronous, we cannot just use its callback like
bpf_for_each_map_elem(). More importantly, we have to properly reference
count its struct bpf_prog too. It seems impossible to do this either in the
verifier or in the JIT, so we have to make its callback code a separate eBPF
program and pass a program fd from user-space. Fortunately, the timer
callback can still live in the same object file with the rest of the eBPF
code and share data too.

Here is a quick demo of the timer callback code:

/*
 * Per-element callback for bpf_for_each_map_elem(): delete any element
 * whose stored expire time (the map value) is already in the past, and
 * count how many were deleted via *data.
 *
 * @map:  the timer map being scanned
 * @key:  key of the current element (timer ID)
 * @val:  value of the current element (expire time in jiffies)
 * @data: caller-supplied counter of deleted (expired) elements
 *
 * Always returns 0 so the iteration visits every element.
 */
static __u64
check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
                  int *data)
{
  __u64 expires = *val;  /* use __u64 consistently with the signature */

  if (expires < bpf_jiffies64()) {
    bpf_map_delete_elem(map, key);
    (*data)++;  /* BUG FIX: *data++ only advanced the pointer */
  }
  return 0;
}

/*
 * Timer program entry point: sweep the map once, deleting expired
 * elements.  The return value is the re-arm delay in jiffies; returning
 * 0 leaves the timer disarmed.
 */
SEC("timer")
u32 timer_callback(void)
{
  int expired = 0;

  bpf_for_each_map_elem(&map, check_expired_elem, &expired, 0);
  /* Stop if anything expired this pass; otherwise retry in 10 jiffies. */
  return expired ? 0 : 10;
}

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/bpf.h       |   2 +
 include/linux/bpf_types.h |   1 +
 include/uapi/linux/bpf.h  |  15 +++
 kernel/bpf/Makefile       |   2 +-
 kernel/bpf/syscall.c      |  16 +++
 kernel/bpf/timer.c        | 238 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c     |   6 +
 7 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/timer.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9fdd839b418c..196e8f2f8c12 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2078,4 +2078,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 
+int bpf_timer_create(union bpf_attr *attr);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index f883f01a5061..9e3afd2dbfc6 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -133,3 +133,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
 #ifdef CONFIG_NET
 BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_TIMER, timer_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 598716742593..627c0fbf9dac 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -841,6 +841,7 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TIMER_CREATE,
 };
 
 enum bpf_map_type {
@@ -874,6 +875,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_RINGBUF,
 	BPF_MAP_TYPE_INODE_STORAGE,
 	BPF_MAP_TYPE_TASK_STORAGE,
+	BPF_MAP_TYPE_TIMER,
 };
 
 /* Note that tracing related programs such as
@@ -916,6 +918,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_EXT,
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
+	BPF_PROG_TYPE_TIMER,
 };
 
 enum bpf_attach_type {
@@ -1436,6 +1439,12 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TIMER_CREATE command */
+		__u32		map_fd;
+		__u32		prog_fd;
+		__u32		flags;		/* timer flags */
+	} timer_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -6013,4 +6022,10 @@ enum {
 	BTF_F_ZERO	=	(1ULL << 3),
 };
 
+/* bpf timer flags */
+enum {
+	BTF_TIMER_F_DEFERRABLE	= (1ULL << 0),
+	BTF_TIMER_F_PINNED	= (1ULL << 1),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7f33098ca63f..0215bfd1bcea 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,7 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o timer.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9603de81811a..f423f0688bd5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4350,6 +4350,19 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_TIMER_CREATE_LAST_FIELD timer_create.flags
+
+static int bpf_create_timer(union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_TIMER_CREATE))
+		return -EINVAL;
+
+	if (!bpf_capable())
+		return -EPERM;
+
+	return bpf_timer_create(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -4486,6 +4499,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
+	case BPF_TIMER_CREATE:
+		err = bpf_create_timer(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/timer.c b/kernel/bpf/timer.c
new file mode 100644
index 000000000000..0d7b5655e60a
--- /dev/null
+++ b/kernel/bpf/timer.c
@@ -0,0 +1,238 @@
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/filter.h>
+#include <uapi/linux/btf.h>
+
+struct bpf_timer_list {
+	struct timer_list timer;
+	struct bpf_prog *prog;
+	u64 expires;
+	s32 id;
+	struct rcu_head rcu;
+};
+
+struct bpf_timer_map {
+	struct bpf_map map;
+	struct idr timer_idr;
+	spinlock_t idr_lock;
+};
+
+static int timer_map_alloc_check(union bpf_attr *attr)
+{
+	if (attr->max_entries == 0 || attr->max_entries > INT_MAX ||
+	    attr->key_size != 4 || attr->value_size != 8)
+		return -EINVAL;
+
+	if (attr->map_flags & BPF_F_MMAPABLE)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct bpf_map *timer_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_timer_map *tmap;
+
+	tmap = kzalloc(sizeof(*tmap), GFP_USER | __GFP_ACCOUNT);
+	if (!tmap)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&tmap->map, attr);
+	spin_lock_init(&tmap->idr_lock);
+	idr_init(&tmap->timer_idr);
+	return &tmap->map;
+}
+
+static int bpf_timer_delete(int id, void *ptr, void *data)
+{
+	struct bpf_timer_list *t = ptr;
+
+	del_timer_sync(&t->timer);
+	kfree_rcu(t, rcu);
+	return 0;
+}
+
+static void timer_map_free(struct bpf_map *map)
+{
+	struct bpf_timer_map *tmap;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	idr_for_each(&tmap->timer_idr, bpf_timer_delete, NULL);
+
+	rcu_barrier();
+	idr_destroy(&tmap->timer_idr);
+}
+
+static void *timer_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_timer_map *tmap;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_list *t;
+	void *ret = NULL;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+
+	rcu_read_lock();
+	t = idr_find(&tmap->timer_idr, timer_id);
+	if (t) {
+		t->expires = t->timer.expires;
+		ret = &t->expires;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static int timer_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 flags)
+{
+	u64 expires = *(u64 *)value;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_map *tmap;
+	struct bpf_timer_list *t;
+	int ret = 0;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+
+	rcu_read_lock();
+	t = idr_find(&tmap->timer_idr, timer_id);
+	if (!t)
+		ret = -ENOENT;
+	else
+		mod_timer(&t->timer, (unsigned long)expires);
+	rcu_read_unlock();
+	return ret;
+}
+
+static int timer_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_timer_map *tmap;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_list *t;
+	unsigned long flags;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	spin_lock_irqsave(&tmap->idr_lock, flags);
+	t = idr_remove(&tmap->timer_idr, timer_id);
+	spin_unlock_irqrestore(&tmap->idr_lock, flags);
+	if (!t)
+		return -ENOENT;
+	del_timer_sync(&t->timer);
+	bpf_prog_put(t->prog);
+	kfree_rcu(t, rcu);
+	return 0;
+}
+
+static int timer_map_get_next_key(struct bpf_map *map, void *key,
+				    void *next_key)
+{
+	struct bpf_timer_map *tmap;
+	s32 next_id = *(s32 *)key;
+	int ret = 0;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	rcu_read_lock();
+	if (!idr_get_next(&tmap->timer_idr, &next_id))
+		ret = -ENOENT;
+	rcu_read_unlock();
+	*(s32 *)next_key = next_id;
+	return ret;
+}
+
+static int timer_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	return -ENOTSUPP;
+}
+
+static int timer_map_btf_id;
+const struct bpf_map_ops timer_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc_check = timer_map_alloc_check,
+	.map_alloc = timer_map_alloc,
+	.map_free = timer_map_free,
+	.map_mmap = timer_map_mmap,
+	.map_lookup_elem = timer_map_lookup_elem,
+	.map_update_elem = timer_map_update_elem,
+	.map_delete_elem = timer_map_delete_elem,
+	.map_get_next_key = timer_map_get_next_key,
+	.map_btf_name = "bpf_timer_map",
+	.map_btf_id = &timer_map_btf_id,
+};
+
+static void bpf_timer_callback(struct timer_list *t)
+{
+	struct bpf_timer_list *bt = from_timer(bt, t, timer);
+	u32 ret;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(bt->prog, NULL);
+	rcu_read_unlock();
+
+	if (ret)
+		mod_timer(&bt->timer, bt->timer.expires + ret);
+}
+
+int bpf_timer_create(union bpf_attr *attr)
+{
+	unsigned int flags, timer_flags = 0;
+	struct bpf_timer_map *tmap;
+	struct bpf_timer_list *t;
+	unsigned long irq_flags;
+	struct bpf_prog *prog;
+	struct bpf_map *map;
+	int ret = 0;
+
+	flags = attr->timer_create.flags;
+	if (flags & ~(BTF_TIMER_F_DEFERRABLE | BTF_TIMER_F_PINNED))
+		return -EINVAL;
+
+	prog = bpf_prog_get(attr->timer_create.prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+	if (prog->type != BPF_PROG_TYPE_TIMER) {
+		ret = -EINVAL;
+		goto out_prog_put;
+	}
+
+	map = bpf_map_get(attr->timer_create.map_fd);
+	if (IS_ERR(map)) {
+		ret = PTR_ERR(map);
+		goto out_prog_put;
+	}
+	if (map->map_type != BPF_MAP_TYPE_TIMER) {
+		ret = -EINVAL;
+		goto out_map_put;
+	}
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t) {
+		ret = -ENOMEM;
+		goto out_map_put;
+	}
+
+	if (flags & BTF_TIMER_F_DEFERRABLE)
+		timer_flags |= TIMER_DEFERRABLE;
+	if (flags & BTF_TIMER_F_PINNED)
+		timer_flags |= TIMER_PINNED;
+	timer_setup(&t->timer, bpf_timer_callback, timer_flags);
+	t->prog = prog;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	spin_lock_irqsave(&tmap->idr_lock, irq_flags);
+	ret = idr_alloc_cyclic(&tmap->timer_idr, t, 0, INT_MAX, GFP_ATOMIC);
+	spin_unlock_irqrestore(&tmap->idr_lock, irq_flags);
+	if (ret < 0)
+		kfree(t);
+	else
+		t->id = ret;
+
+out_map_put:
+	bpf_map_put(map);
+out_prog_put:
+	if (ret)
+		bpf_prog_put(prog);
+	return ret;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 852541a435ef..ed0cbce8dc4f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5991,6 +5991,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_map_delete_elem &&
+	    env->prog->type == BPF_PROG_TYPE_TIMER) {
+		verbose(env, "bpf_map_delete_elem() can't be called in a timer program\n");
+		return -EINVAL;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01  4:26 [RFC Patch bpf-next] bpf: introduce bpf timer Cong Wang
@ 2021-04-01  6:38 ` Song Liu
  2021-04-01 17:28   ` Cong Wang
  2021-04-02 19:28 ` Alexei Starovoitov
  1 sibling, 1 reply; 79+ messages in thread
From: Song Liu @ 2021-04-01  6:38 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> From: Cong Wang <cong.wang@bytedance.com>
> 
> (This patch is still in early stage and obviously incomplete. I am sending
> it out to get some high-level feedbacks. Please kindly ignore any coding
> details for now and focus on the design.)

Could you please explain the use case of the timer? Is it the same as 
earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH? 

Assuming that is the case, I guess the use case is to assign an expire 
time for each element in a hash map; and periodically remove expired 
element from the map. 

If this is still correct, my next question is: how does this compare
against a user space timer? Will the user space timer be too slow?

> 
> This patch introduces a bpf timer map and a syscall to create bpf timer
> from user-space.
> 
> The reason why we have to use a map is because the lifetime of a timer,
> without a map, we have to delete the timer before exiting the eBPF program,
> this would significately limit its use cases. With a map, the timer can
> stay as long as the map itself and can be actually updated via map update
> API's too, where the key is the timer ID and the value is the timer expire
> timer.
> 
> Timer creation is not easy either. In order to prevent users creating a
> timer but not adding it to a map, we have to enforce this in the API which
> takes a map parameter and adds the new timer into the map in one shot.

I think we don't have to address "creating a timer but not adding it to a map" 
problem in the kernel. If the user forgot it, the user should debug it. 

Thanks,
Song

[...]

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01  6:38 ` Song Liu
@ 2021-04-01 17:28   ` Cong Wang
  2021-04-01 20:17     ` Song Liu
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-01 17:28 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > (This patch is still in early stage and obviously incomplete. I am sending
> > it out to get some high-level feedbacks. Please kindly ignore any coding
> > details for now and focus on the design.)
>
> Could you please explain the use case of the timer? Is it the same as
> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>
> Assuming that is the case, I guess the use case is to assign an expire
> time for each element in a hash map; and periodically remove expired
> element from the map.
>
> If this is still correct, my next question is: how does this compare
> against a user space timer? Will the user space timer be too slow?

Yes, as I explained in timeout hashmap patchset, doing it in user-space
would require a lot of syscalls (without batching) or copying (with batching).
I will add the explanation here, in case people miss why we need a timer.

>
> >
> > This patch introduces a bpf timer map and a syscall to create bpf timer
> > from user-space.
> >
> > The reason why we have to use a map is because the lifetime of a timer,
> > without a map, we have to delete the timer before exiting the eBPF program,
> > this would significately limit its use cases. With a map, the timer can
> > stay as long as the map itself and can be actually updated via map update
> > API's too, where the key is the timer ID and the value is the timer expire
> > timer.
> >
> > Timer creation is not easy either. In order to prevent users creating a
> > timer but not adding it to a map, we have to enforce this in the API which
> > takes a map parameter and adds the new timer into the map in one shot.
>
> I think we don't have to address "creating a timer but not adding it to a map"
> problem in the kernel. If the user forgot it, the user should debug it.

Good point. Initially the timer is created in kernel-space, now it is in user
space, so it is probably fine to create it without a map. But we would have
to provide more syscalls for users to manage the timer, so using a map
still has an advantage of not adding more syscalls.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01 17:28   ` Cong Wang
@ 2021-04-01 20:17     ` Song Liu
  2021-04-02 17:34       ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Song Liu @ 2021-04-01 20:17 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> From: Cong Wang <cong.wang@bytedance.com>
>>> 
>>> (This patch is still in early stage and obviously incomplete. I am sending
>>> it out to get some high-level feedbacks. Please kindly ignore any coding
>>> details for now and focus on the design.)
>> 
>> Could you please explain the use case of the timer? Is it the same as
>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>> 
>> Assuming that is the case, I guess the use case is to assign an expire
>> time for each element in a hash map; and periodically remove expired
>> element from the map.
>> 
>> If this is still correct, my next question is: how does this compare
>> against a user space timer? Will the user space timer be too slow?
> 
> Yes, as I explained in timeout hashmap patchset, doing it in user-space
> would require a lot of syscalls (without batching) or copying (with batching).
> I will add the explanation here, in case people miss why we need a timer.

How about we use a user space timer to trigger a BPF program (e.g. use 
BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can 
use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the 
map? With this approach, we only need one syscall per period. 

Thanks,
Song


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01 20:17     ` Song Liu
@ 2021-04-02 17:34       ` Cong Wang
  2021-04-02 17:57         ` Song Liu
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-02 17:34 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> From: Cong Wang <cong.wang@bytedance.com>
> >>>
> >>> (This patch is still in early stage and obviously incomplete. I am sending
> >>> it out to get some high-level feedbacks. Please kindly ignore any coding
> >>> details for now and focus on the design.)
> >>
> >> Could you please explain the use case of the timer? Is it the same as
> >> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
> >>
> >> Assuming that is the case, I guess the use case is to assign an expire
> >> time for each element in a hash map; and periodically remove expired
> >> element from the map.
> >>
> >> If this is still correct, my next question is: how does this compare
> >> against a user space timer? Will the user space timer be too slow?
> >
> > Yes, as I explained in timeout hashmap patchset, doing it in user-space
> > would require a lot of syscalls (without batching) or copying (with batching).
> > I will add the explanation here, in case people miss why we need a timer.
>
> How about we use a user space timer to trigger a BPF program (e.g. use
> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
> map? With this approach, we only need one syscall per period.

Interesting, I didn't know we can explicitly trigger a BPF program running
from user-space. Is it for testing purposes only?

But we also want the timer code itself to change the expire time too, it is
common to adjust the expire time based on the size of the workset, for
example, the number of elements in a hashmap.

With the current design, both kernel and user-space can modify the
expire time with map update API's.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 17:34       ` Cong Wang
@ 2021-04-02 17:57         ` Song Liu
  2021-04-02 19:08           ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Song Liu @ 2021-04-02 17:57 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> From: Cong Wang <cong.wang@bytedance.com>
>>>>> 
>>>>> (This patch is still in early stage and obviously incomplete. I am sending
>>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
>>>>> details for now and focus on the design.)
>>>> 
>>>> Could you please explain the use case of the timer? Is it the same as
>>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>>>> 
>>>> Assuming that is the case, I guess the use case is to assign an expire
>>>> time for each element in a hash map; and periodically remove expired
>>>> element from the map.
>>>> 
>>>> If this is still correct, my next question is: how does this compare
>>>> against a user space timer? Will the user space timer be too slow?
>>> 
>>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
>>> would require a lot of syscalls (without batching) or copying (with batching).
>>> I will add the explanation here, in case people miss why we need a timer.
>> 
>> How about we use a user space timer to trigger a BPF program (e.g. use
>> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
>> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
>> map? With this approach, we only need one syscall per period.
> 
> Interesting, I didn't know we can explicitly trigger a BPF program running
> from user-space. Is it for testing purposes only?

This is not only for testing. We will use this in perf (starting in 5.13).

/* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */

/* trigger the leader program on a cpu */
static int bperf_trigger_reading(int prog_fd, int cpu)
{
        DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
                            .ctx_in = NULL,
                            .ctx_size_in = 0,
                            .flags = BPF_F_TEST_RUN_ON_CPU,
                            .cpu = cpu,
                            .retval = 0,
                );

        return bpf_prog_test_run_opts(prog_fd, &opts);
}

test_run also passes return value (retval) back to user space, so we can 
adjust the timer interval based on retval.

Also, test_run can trigger the program on a specific cpu. This might be 
useful with percpu map (BPF_MAP_TYPE_PERCPU_HASH, etc.). 

Thanks,
Song


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 17:57         ` Song Liu
@ 2021-04-02 19:08           ` Cong Wang
  2021-04-02 19:43             ` Song Liu
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-02 19:08 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Fri, Apr 2, 2021 at 10:57 AM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>>
> >>>>> From: Cong Wang <cong.wang@bytedance.com>
> >>>>>
> >>>>> (This patch is still in early stage and obviously incomplete. I am sending
> >>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
> >>>>> details for now and focus on the design.)
> >>>>
> >>>> Could you please explain the use case of the timer? Is it the same as
> >>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
> >>>>
> >>>> Assuming that is the case, I guess the use case is to assign an expire
> >>>> time for each element in a hash map; and periodically remove expired
> >>>> element from the map.
> >>>>
> >>>> If this is still correct, my next question is: how does this compare
> >>>> against a user space timer? Will the user space timer be too slow?
> >>>
> >>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
> >>> would require a lot of syscalls (without batching) or copying (with batching).
> >>> I will add the explanation here, in case people miss why we need a timer.
> >>
> >> How about we use a user space timer to trigger a BPF program (e.g. use
> >> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
> >> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
> >> map? With this approach, we only need one syscall per period.
> >
> > Interesting, I didn't know we can explicitly trigger a BPF program running
> > from user-space. Is it for testing purposes only?
>
> This is not only for testing. We will use this in perf (starting in 5.13).
>
> /* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */
>
> /* trigger the leader program on a cpu */
> static int bperf_trigger_reading(int prog_fd, int cpu)
> {
>         DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
>                             .ctx_in = NULL,
>                             .ctx_size_in = 0,
>                             .flags = BPF_F_TEST_RUN_ON_CPU,
>                             .cpu = cpu,
>                             .retval = 0,
>                 );
>
>         return bpf_prog_test_run_opts(prog_fd, &opts);
> }
>
> test_run also passes return value (retval) back to user space, so we and
> adjust the timer interval based on retval.

This is really odd, every name here contains a "test" but it is not for testing
purposes. You probably need to rename/alias it. ;)

So, with this we have to get a user-space daemon running just to keep
this "timer" alive. If I want to run it every 1ms, it means I have to issue
a syscall BPF_PROG_TEST_RUN every 1ms. Even with a timer fd, we
still need poll() and timerfd_settime(). This is a considerable overhead
for just a single timer.

With current design, user-space can just exit after installing the timer,
either it can adjust itself or other eBPF code can adjust it, so the per
timer overhead is the same as a kernel timer.

The visibility to other BPF code is important for the conntrack case,
because each time we get an expired item during a lookup, we may
want to schedule the GC timer to run sooner. At least this would give
users more freedom to decide when to reschedule the timer.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-01  4:26 [RFC Patch bpf-next] bpf: introduce bpf timer Cong Wang
  2021-04-01  6:38 ` Song Liu
@ 2021-04-02 19:28 ` Alexei Starovoitov
  2021-04-02 21:24   ` Cong Wang
  1 sibling, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-04-02 19:28 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, bpf, duanxiongchun, wangdongdong.6, songmuchun,
	Cong Wang, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song

On Wed, Mar 31, 2021 at 09:26:35PM -0700, Cong Wang wrote:

> This patch introduces a bpf timer map and a syscall to create bpf timer
> from user-space.

That will severely limit timer api usability.
I agree with Song here. If user space has to create it there is no reason
to introduce new sys_bpf command. Just do all timers in user space
and trigger bpf prog via bpf_prog_test_run cmd.

> 
> The reason why we have to use a map is because the lifetime of a timer,
> without a map, we have to delete the timer before exiting the eBPF program,
> this would significantly limit its use cases. With a map, the timer can
> stay as long as the map itself and can be actually updated via map update
> API's too,

this part is correct.

> where the key is the timer ID and the value is the timer expire
> time.

The timer ID is unnecessary. We cannot introduce new IDR for every new
bpf object. It doesn't scale.

> Timer creation is not easy either. In order to prevent users creating a
> timer but not adding it to a map, we have to enforce this in the API which
> takes a map parameter and adds the new timer into the map in one shot.

Not quite true. The timer memory should be a part of the map otherwise
the timer life time is hard to track. But arming the timer and initializing
it with a callback doesn't need to be tied with allocation of timer memory.

> And because timer is asynchronous, we can not just use its callback like
> bpf_for_each_map_elem().

Not quite. We can do it the same way as bpf_for_each_map_elem() despite
being async.

> More importantly, we have to properly reference
> count its struct bpf_prog too. 

It's true that callback prog or subprog has to stay alive while timer
is alive.
Traditional maps can live past the time of the progs that use them.
Like bpf prog can load with a pointer to already created hash map.
Then prog can unload and hashmap will stay around just fine.
All maps are like this with the exception of prog_array.
The progs get deleted from the prog_array map when appropriate.
The same thing can work for maps with embedded timers.
For example the subprog/prog can be deleted from the timer if
that prog is going away. Similar to ref/uref distinction we have for prog_array.

> It seems impossible to do this either in
> verifier or in JIT, so we have to make its callback code a separate eBPF
> program and pass a program fd from user-space. Fortunately, timer callback
> can still live in the same object file with the rest eBPF code and share
> data too.
> 
> Here is a quick demo of the timer callback code:
> 
> static __u64
> check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
>                   int *data)
> {
>   u64 expires = *val;
> 
>   if (expires < bpf_jiffies64()) {
>     bpf_map_delete_elem(map, key);
>     *data++;
>   }
>   return 0;
> }
> 
> SEC("timer")
> u32 timer_callback(void)
> {
>   int count = 0;
> 
>   bpf_for_each_map_elem(&map, check_expired_elem, &count, 0);
>   if (count)
>      return 0; // not re-arm this timer
>   else
> >      return 10; // reschedule this timer after 10 jiffies
> }

As Song pointed out the exact same thing can be done with timers in user space
and user space triggering prog exec with bpf_prog_test_run.

Here is how more general timers might look like:
https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/

include/uapi/linux/bpf.h:
struct bpf_timer {
  u64 opaque;
};
The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.

The prog would do:
struct map_elem {
    int stuff;
    struct bpf_timer timer;
};

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1);
    __type(key, int);
    __type(value, struct map_elem);
} hmap SEC(".maps");

static int timer_cb(struct map_elem *elem)
{
    if (whatever && elem->stuff)
        bpf_timer_mod(&elem->timer, new_expire);
}

int bpf_timer_test(...)
{
    struct map_elem *val;

    val = bpf_map_lookup_elem(&hmap, &key);
    if (val) {
        bpf_timer_init(&val->timer, timer_cb, flags);
        val->stuff = 123;
        bpf_timer_mod(&val->timer, expires);
    }
}

bpf_map_update_elem() either from bpf prog or from user space
allocates map element and zeros 8 byte space for the timer pointer.
bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
The validation of timer_cb() is done by the verifier.
bpf_map_delete_elem() either from bpf prog or from user space
does del_timer() if elem->opaque != 0.
If prog refers such hmap as above during prog free the kernel does
for_each_map_elem {if (elem->opaque) del_timer().}
I think that is the simplest way of prevent timers firing past the prog life time.
There could be other ways to solve it (like prog_array and ref/uref).

Pseudo code:
int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
{
  if (timer->opaque)
    return -EBUSY;
  t = alloc timer_list
  t->cb = timer_cb;
  t->..
  timer->opaque = (long)t;
}

int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
{
  if (!timer->opaque)
    return -EINVAL;
  t = (struct timer_list *)timer->opaque;
  mod_timer(t,..);
}

int bpf_timer_del(struct bpf_timer *timer)
{
  if (!timer->opaque)
    return -EINVAL;
  t = (struct timer_list *)timer->opaque;
  del_timer(t);
}

The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
via load/store by the program. The same way it does it for bpf_spin_lock.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 19:08           ` Cong Wang
@ 2021-04-02 19:43             ` Song Liu
  2021-04-02 20:57               ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Song Liu @ 2021-04-02 19:43 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 2, 2021, at 12:08 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Fri, Apr 2, 2021 at 10:57 AM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>> 
>>>>>>> From: Cong Wang <cong.wang@bytedance.com>
>>>>>>> 
>>>>>>> (This patch is still in early stage and obviously incomplete. I am sending
>>>>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
>>>>>>> details for now and focus on the design.)
>>>>>> 
>>>>>> Could you please explain the use case of the timer? Is it the same as
>>>>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>>>>>> 
>>>>>> Assuming that is the case, I guess the use case is to assign an expire
>>>>>> time for each element in a hash map; and periodically remove expired
>>>>>> element from the map.
>>>>>> 
>>>>>> If this is still correct, my next question is: how does this compare
>>>>>> against a user space timer? Will the user space timer be too slow?
>>>>> 
>>>>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
>>>>> would require a lot of syscalls (without batching) or copying (with batching).
>>>>> I will add the explanation here, in case people miss why we need a timer.
>>>> 
>>>> How about we use a user space timer to trigger a BPF program (e.g. use
>>>> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
>>>> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
>>>> map? With this approach, we only need one syscall per period.
>>> 
>>> Interesting, I didn't know we can explicitly trigger a BPF program running
>>> from user-space. Is it for testing purposes only?
>> 
>> This is not only for testing. We will use this in perf (starting in 5.13).
>> 
>> /* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */
>> 
>> /* trigger the leader program on a cpu */
>> static int bperf_trigger_reading(int prog_fd, int cpu)
>> {
>>        DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
>>                            .ctx_in = NULL,
>>                            .ctx_size_in = 0,
>>                            .flags = BPF_F_TEST_RUN_ON_CPU,
>>                            .cpu = cpu,
>>                            .retval = 0,
>>                );
>> 
>>        return bpf_prog_test_run_opts(prog_fd, &opts);
>> }
>> 
>> test_run also passes return value (retval) back to user space, so we can
>> adjust the timer interval based on retval.
> 
> This is really odd, every name here contains a "test" but it is not for testing
> purposes. You probably need to rename/alias it. ;)
> 
> So, with this we have to get a user-space daemon running just to keep
> this "timer" alive. If I want to run it every 1ms, it means I have to issue
> a syscall BPF_PROG_TEST_RUN every 1ms. Even with a timer fd, we
> still need poll() and timerfd_settime(). This is a considerable overhead
> for just a single timer.

sys_bpf() takes about 0.5us. I would expect poll() and timerfd_settime() to 
be slightly faster. So the overhead is less than 0.2% of a single core 
(0.5us x 3 / 1ms). Do we need many counters for conntrack?

> 
> With current design, user-space can just exit after installing the timer,
> either it can adjust itself or other eBPF code can adjust it, so the per
> timer overhead is the same as a kernel timer.

I guess we still need to hold a fd to the prog/map? Alternatively, we can 
pin the prog/map, but then the user need to clean it up. 

> 
> The visibility to other BPF code is important for the conntrack case,
> because each time we get an expired item during a lookup, we may
> want to schedule the GC timer to run sooner. At least this would give
> users more freedom to decide when to reschedule the timer.

Do we plan to share the timer program among multiple processes (which can 
start and terminate in arbitrary orders)? If that is the case, I can imagine
a timer program is better than a user space timer. 

Thanks,
Song 


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 19:43             ` Song Liu
@ 2021-04-02 20:57               ` Cong Wang
  2021-04-02 23:31                 ` Song Liu
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-02 20:57 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Fri, Apr 2, 2021 at 12:45 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 2, 2021, at 12:08 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Fri, Apr 2, 2021 at 10:57 AM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>>
> >>>>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>>>>
> >>>>>>> From: Cong Wang <cong.wang@bytedance.com>
> >>>>>>>
> >>>>>>> (This patch is still in early stage and obviously incomplete. I am sending
> >>>>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
> >>>>>>> details for now and focus on the design.)
> >>>>>>
> >>>>>> Could you please explain the use case of the timer? Is it the same as
> >>>>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
> >>>>>>
> >>>>>> Assuming that is the case, I guess the use case is to assign an expire
> >>>>>> time for each element in a hash map; and periodically remove expired
> >>>>>> element from the map.
> >>>>>>
> >>>>>> If this is still correct, my next question is: how does this compare
> >>>>>> against a user space timer? Will the user space timer be too slow?
> >>>>>
> >>>>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
> >>>>> would require a lot of syscalls (without batching) or copying (with batching).
> >>>>> I will add the explanation here, in case people miss why we need a timer.
> >>>>
> >>>> How about we use a user space timer to trigger a BPF program (e.g. use
> >>>> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
> >>>> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
> >>>> map? With this approach, we only need one syscall per period.
> >>>
> >>> Interesting, I didn't know we can explicitly trigger a BPF program running
> >>> from user-space. Is it for testing purposes only?
> >>
> >> This is not only for testing. We will use this in perf (starting in 5.13).
> >>
> >> /* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */
> >>
> >> /* trigger the leader program on a cpu */
> >> static int bperf_trigger_reading(int prog_fd, int cpu)
> >> {
> >>        DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
> >>                            .ctx_in = NULL,
> >>                            .ctx_size_in = 0,
> >>                            .flags = BPF_F_TEST_RUN_ON_CPU,
> >>                            .cpu = cpu,
> >>                            .retval = 0,
> >>                );
> >>
> >>        return bpf_prog_test_run_opts(prog_fd, &opts);
> >> }
> >>
> >> test_run also passes return value (retval) back to user space, so we can
> >> adjust the timer interval based on retval.
> >
> > This is really odd, every name here contains a "test" but it is not for testing
> > purposes. You probably need to rename/alias it. ;)
> >
> > So, with this we have to get a user-space daemon running just to keep
> > this "timer" alive. If I want to run it every 1ms, it means I have to issue
> > a syscall BPF_PROG_TEST_RUN every 1ms. Even with a timer fd, we
> > still need poll() and timerfd_settime(). This is a considerable overhead
> > for just a single timer.
>
> sys_bpf() takes about 0.5us. I would expect poll() and timerfd_settime() to
> be slightly faster. So the overhead is less than 0.2% of a single core
> (0.5us x 3 / 1ms). Do we need many counters for conntrack?

This is just for one timer. The whole system may end up with many timers
when we have more and more eBPF programs. So managing the timers
in the user-space would be a problem too someday, clearly one daemon
per-timer does not scale.

>
> >
> > With current design, user-space can just exit after installing the timer,
> > either it can adjust itself or other eBPF code can adjust it, so the per
> > timer overhead is the same as a kernel timer.
>
> I guess we still need to hold a fd to the prog/map? Alternatively, we can
> pin the prog/map, but then the user need to clean it up.

Yes, but I don't see how holding a fd could bring any overhead after
initial setup.

>
> >
> > The visibility to other BPF code is important for the conntrack case,
> > because each time we get an expired item during a lookup, we may
> > want to schedule the GC timer to run sooner. At least this would give
> > users more freedom to decide when to reschedule the timer.
>
> Do we plan to share the timer program among multiple processes (which can
> start and terminate in arbitrary orders)? If that is the case, I can imagine
> a timer program is better than a user space timer.

I mean I want other eBPF program to manage the timers in kernel-space,
as conntrack is mostly in kernel-space. If the timer is only manageable
in user-space, it would seriously limit its use case.

Ideally I even prefer to create timers in kernel-space too, but as I already
explained, this seems impossible to me.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 19:28 ` Alexei Starovoitov
@ 2021-04-02 21:24   ` Cong Wang
  2021-04-02 23:45     ` Alexei Starovoitov
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-02 21:24 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Fri, Apr 2, 2021 at 12:28 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Mar 31, 2021 at 09:26:35PM -0700, Cong Wang wrote:
>
> > This patch introduces a bpf timer map and a syscall to create bpf timer
> > from user-space.
>
> That will severely limit timer api usability.
> I agree with Song here. If user space has to create it there is no reason
> to introduce new sys_bpf command. Just do all timers in user space
> and trigger bpf prog via bpf_prog_test_run cmd.

I think you misunderstand my point, the reason why the creation is done
in user-space is not I like it, it is because it looks impossible to
create timers
in kernel-space, hence I have to move it to user-space.

>
> >
> > The reason why we have to use a map is because the lifetime of a timer,
> > without a map, we have to delete the timer before exiting the eBPF program,
> > this would significantly limit its use cases. With a map, the timer can
> > stay as long as the map itself and can be actually updated via map update
> > API's too,
>
> this part is correct.
>
> > where the key is the timer ID and the value is the timer expire
> > time.
>
> The timer ID is unnecessary. We cannot introduce new IDR for every new
> bpf object. It doesn't scale.

The IDR is per map, not per timer.

>
> > Timer creation is not easy either. In order to prevent users creating a
> > timer but not adding it to a map, we have to enforce this in the API which
> > takes a map parameter and adds the new timer into the map in one shot.
>
> Not quite true. The timer memory should be a part of the map otherwise
> the timer life time is hard to track. But arming the timer and initializing
> it with a callback doesn't need to be tied with allocation of timer memory.
>
> > And because timer is asynchronous, we can not just use its callback like
> > bpf_for_each_map_elem().
>
> Not quite. We can do it the same way as bpf_for_each_map_elem() despite
> being async.

Well, at least bpf_for_each_map_elem() can use stack variables etc.,
but timers can't. They are very different to me.

>
> > More importantly, we have to properly reference
> > count its struct bpf_prog too.
>
> It's true that callback prog or subprog has to stay alive while timer
> is alive.
> Traditional maps can live past the time of the progs that use them.
> Like bpf prog can load with a pointer to already created hash map.
> Then prog can unload and hashmap will stay around just fine.
> All maps are like this with the exception of prog_array.
> The progs get deleted from the prog_array map when appropriate.
> The same thing can work for maps with embedded timers.
> For example the subprog/prog can be deleted from the timer if
> that prog is going away. Similar to ref/uref distinction we have for prog_array.
>
> > It seems impossible to do this either in
> > verifier or in JIT, so we have to make its callback code a separate eBPF
> > program and pass a program fd from user-space. Fortunately, timer callback
> > can still live in the same object file with the rest eBPF code and share
> > data too.
> >
> > Here is a quick demo of the timer callback code:
> >
> > static __u64
> > check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
> >                   int *data)
> > {
> >   u64 expires = *val;
> >
> >   if (expires < bpf_jiffies64()) {
> >     bpf_map_delete_elem(map, key);
> >     *data++;
> >   }
> >   return 0;
> > }
> >
> > SEC("timer")
> > u32 timer_callback(void)
> > {
> >   int count = 0;
> >
> >   bpf_for_each_map_elem(&map, check_expired_elem, &count, 0);
> >   if (count)
> >      return 0; // not re-arm this timer
> >   else
> >      return 10; // reschedule this timer after 10 jiffies
> > }
>
> As Song pointed out the exact same thing can be done with timers in user space
> and user space triggering prog exec with bpf_prog_test_run.
>
> Here is how more general timers might look like:
> https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
>
> include/uapi/linux/bpf.h:
> struct bpf_timer {
>   u64 opaque;
> };
> The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.

This is my initial design as we already discussed, it does not work,
please see below.

>
> The prog would do:
> struct map_elem {
>     int stuff;
>     struct bpf_timer timer;
> };
>
> struct {
>     __uint(type, BPF_MAP_TYPE_HASH);
>     __uint(max_entries, 1);
>     __type(key, int);
>     __type(value, struct map_elem);
> } hmap SEC(".maps");
>
> static int timer_cb(struct map_elem *elem)
> {
>     if (whatever && elem->stuff)
>         bpf_timer_mod(&elem->timer, new_expire);
> }
>
> int bpf_timer_test(...)
> {
>     struct map_elem *val;
>
>     val = bpf_map_lookup_elem(&hmap, &key);
>     if (val) {
>         bpf_timer_init(&val->timer, timer_cb, flags);
>         val->stuff = 123;
>         bpf_timer_mod(&val->timer, expires);
>     }
> }
>
> bpf_map_update_elem() either from bpf prog or from user space
> allocates map element and zeros 8 byte space for the timer pointer.
> bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> The validation of timer_cb() is done by the verifier.
> bpf_map_delete_elem() either from bpf prog or from user space
> does del_timer() if elem->opaque != 0.
> If prog refers such hmap as above during prog free the kernel does
> for_each_map_elem {if (elem->opaque) del_timer().}
> I think that is the simplest way of prevent timers firing past the prog life time.
> There could be other ways to solve it (like prog_array and ref/uref).
>
> Pseudo code:
> int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> {
>   if (timer->opaque)
>     return -EBUSY;
>   t = alloc timer_list
>   t->cb = timer_cb;
>   t->..
>   timer->opaque = (long)t;
> }
>
> int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> {
>   if (!timer->opaque)
>     return -EINVAL;
>   t = (struct timer_list *)timer->opaque;
>   mod_timer(t,..);
> }
>
> int bpf_timer_del(struct bpf_timer *timer)
> {
>   if (!timer->opaque)
>     return -EINVAL;
>   t = (struct timer_list *)timer->opaque;
>   del_timer(t);
> }
>
> The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> via load/store by the program. The same way it does it for bpf_spin_lock.

This does not work, because bpf_timer_del() has to be matched
with bpf_timer_init(), otherwise we would leak timer resources.
For example:

SEC("foo")
bad_ebpf_code()
{
  struct bpf_timer t;
  bpf_timer_init(&t, ...); // allocate a timer
  bpf_timer_mod(&t, ..);
  // end of BPF program
  // now the timer is leaked, no one will delete it
}

We can not enforce the matching in the verifier, because users would
have to call bpf_timer_del() before exiting, which is not what we want
either.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 20:57               ` Cong Wang
@ 2021-04-02 23:31                 ` Song Liu
  2021-04-05 23:49                   ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Song Liu @ 2021-04-02 23:31 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Fri, Apr 2, 2021 at 12:45 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 2, 2021, at 12:08 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Fri, Apr 2, 2021 at 10:57 AM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Apr 2, 2021, at 10:34 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> On Thu, Apr 1, 2021 at 1:17 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> On Apr 1, 2021, at 10:28 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>> 
>>>>>>> On Wed, Mar 31, 2021 at 11:38 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>>>> 
>>>>>>>> 
>>>>>>>> 
>>>>>>>>> On Mar 31, 2021, at 9:26 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>>>> 
>>>>>>>>> From: Cong Wang <cong.wang@bytedance.com>
>>>>>>>>> 
>>>>>>>>> (This patch is still in early stage and obviously incomplete. I am sending
>>>>>>>>> it out to get some high-level feedbacks. Please kindly ignore any coding
>>>>>>>>> details for now and focus on the design.)
>>>>>>>> 
>>>>>>>> Could you please explain the use case of the timer? Is it the same as
>>>>>>>> earlier proposal of BPF_MAP_TYPE_TIMEOUT_HASH?
>>>>>>>> 
>>>>>>>> Assuming that is the case, I guess the use case is to assign an expire
>>>>>>>> time for each element in a hash map; and periodically remove expired
>>>>>>>> element from the map.
>>>>>>>> 
>>>>>>>> If this is still correct, my next question is: how does this compare
>>>>>>>> against a user space timer? Will the user space timer be too slow?
>>>>>>> 
>>>>>>> Yes, as I explained in timeout hashmap patchset, doing it in user-space
>>>>>>> would require a lot of syscalls (without batching) or copying (with batching).
>>>>>>> I will add the explanation here, in case people miss why we need a timer.
>>>>>> 
>>>>>> How about we use a user space timer to trigger a BPF program (e.g. use
>>>>>> BPF_PROG_TEST_RUN on a raw_tp program); then, in the BPF program, we can
>>>>>> use bpf_for_each_map_elem and bpf_map_delete_elem to scan and update the
>>>>>> map? With this approach, we only need one syscall per period.
>>>>> 
>>>>> Interesting, I didn't know we can explicitly trigger a BPF program running
>>>>> from user-space. Is it for testing purposes only?
>>>> 
>>>> This is not only for testing. We will use this in perf (starting in 5.13).
>>>> 
>>>> /* currently in Arnaldo's tree, tools/perf/util/bpf_counter.c: */
>>>> 
>>>> /* trigger the leader program on a cpu */
>>>> static int bperf_trigger_reading(int prog_fd, int cpu)
>>>> {
>>>>       DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
>>>>                           .ctx_in = NULL,
>>>>                           .ctx_size_in = 0,
>>>>                           .flags = BPF_F_TEST_RUN_ON_CPU,
>>>>                           .cpu = cpu,
>>>>                           .retval = 0,
>>>>               );
>>>> 
>>>>       return bpf_prog_test_run_opts(prog_fd, &opts);
>>>> }
>>>> 
>>>> test_run also passes return value (retval) back to user space, so we can
>>>> adjust the timer interval based on retval.
>>> 
>>> This is really odd, every name here contains a "test" but it is not for testing
>>> purposes. You probably need to rename/alias it. ;)
>>> 
>>> So, with this we have to get a user-space daemon running just to keep
>>> this "timer" alive. If I want to run it every 1ms, it means I have to issue
>>> a syscall BPF_PROG_TEST_RUN every 1ms. Even with a timer fd, we
>>> still need poll() and timerfd_settime(). This is a considerable overhead
>>> for just a single timer.
>> 
>> sys_bpf() takes about 0.5us. I would expect poll() and timerfd_settime() to
>> be slightly faster. So the overhead is less than 0.2% of a single core
>> (0.5us x 3 / 1ms). Do we need many counters for conntrack?
> 
> This is just for one timer. The whole system may end up with many timers
> when we have more and more eBPF programs. So managing the timers
> in the user-space would be a problem too someday, clearly one daemon
> per-timer does not scale.

If we do need many timers, I agree that it doesn't make sense to create 
a thread for each of them. 

A less-flexible alternative is to create a perf_event of "cpu-clock" and 
attach BPF program to it. It is not easy to adjust the interval, I guess.

> 
>> 
>>> 
>>> With current design, user-space can just exit after installing the timer,
>>> either it can adjust itself or other eBPF code can adjust it, so the per
>>> timer overhead is the same as a kernel timer.
>> 
>> I guess we still need to hold a fd to the prog/map? Alternatively, we can
>> pin the prog/map, but then the user need to clean it up.
> 
> Yes, but I don't see how holding a fd could bring any overhead after
> initial setup.
>> 
>>> 
>>> The visibility to other BPF code is important for the conntrack case,
>>> because each time we get an expired item during a lookup, we may
>>> want to schedule the GC timer to run sooner. At least this would give
>>> users more freedom to decide when to reschedule the timer.
>> 
>> Do we plan to share the timer program among multiple processes (which can
>> start and terminate in arbitrary orders)? If that is the case, I can imagine
>> a timer program is better than a user space timer.
> 
> I mean I want other eBPF program to manage the timers in kernel-space,
> as conntrack is mostly in kernel-space. If the timer is only manageable
> in user-space, it would seriously limit its use case.
> 
> Ideally I even prefer to create timers in kernel-space too, but as I already
> explained, this seems impossible to me.

Would hrtimer (include/linux/hrtimer.h) work? 

Thanks,
Song






^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 21:24   ` Cong Wang
@ 2021-04-02 23:45     ` Alexei Starovoitov
  2021-04-06  0:36       ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-04-02 23:45 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > where the key is the timer ID and the value is the timer expire
> > > timer.
> >
> > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > bpf object. It doesn't scale.
> 
> The IDR is per map, not per timer.

Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
We have 3 IDRs now: for progs, for maps, and for links.
No other objects need IDRs.

> > Here is how more general timers might look like:
> > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> >
> > include/uapi/linux/bpf.h:
> > struct bpf_timer {
> >   u64 opaque;
> > };
> > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> 
> This is my initial design as we already discussed, it does not work,
> please see below.

It does work. The perceived "issue" you referred to is a misunderstanding. See below.

> >
> > The prog would do:
> > struct map_elem {
> >     int stuff;
> >     struct bpf_timer timer;
> > };
> >
> > struct {
> >     __uint(type, BPF_MAP_TYPE_HASH);
> >     __uint(max_entries, 1);
> >     __type(key, int);
> >     __type(value, struct map_elem);
> > } hmap SEC(".maps");
> >
> > static int timer_cb(struct map_elem *elem)
> > {
> >     if (whatever && elem->stuff)
> >         bpf_timer_mod(&elem->timer, new_expire);
> > }
> >
> > int bpf_timer_test(...)
> > {
> >     struct map_elem *val;
> >
> >     val = bpf_map_lookup_elem(&hmap, &key);
> >     if (val) {
> >         bpf_timer_init(&val->timer, timer_cb, flags);
> >         val->stuff = 123;
> >         bpf_timer_mod(&val->timer, expires);
> >     }
> > }
> >
> > bpf_map_update_elem() either from bpf prog or from user space
> > allocates map element and zeros 8 byte space for the timer pointer.
> > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > The validation of timer_cb() is done by the verifier.
> > bpf_map_delete_elem() either from bpf prog or from user space
> > does del_timer() if elem->opaque != 0.
> > If prog refers such hmap as above during prog free the kernel does
> > for_each_map_elem {if (elem->opaque) del_timer().}
> > I think that is the simplest way of prevent timers firing past the prog life time.
> > There could be other ways to solve it (like prog_array and ref/uref).
> >
> > Pseudo code:
> > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > {
> >   if (timer->opaque)
> >     return -EBUSY;
> >   t = alloc timer_list
> >   t->cb = timer_cb;
> >   t->..
> >   timer->opaque = (long)t;
> > }
> >
> > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > {
> >   if (!time->opaque)
> >     return -EINVAL;
> >   t = (struct timer_list *)timer->opaque;
> >   mod_timer(t,..);
> > }
> >
> > int bpf_timer_del(struct bpf_timer *timer)
> > {
> >   if (!time->opaque)
> >     return -EINVAL;
> >   t = (struct timer_list *)timer->opaque;
> >   del_timer(t);
> > }
> >
> > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > via load/store by the program. The same way it does it for bpf_spin_lock.
> 
> This does not work, because bpf_timer_del() has to be matched
> with bpf_timer_init(), otherwise we would leak timer resources.
> For example:
> 
> SEC("foo")
> bad_ebpf_code()
> {
>   struct bpf_timer t;
>   bpf_timer_init(&t, ...); // allocate a timer
>   bpf_timer_mod(&t, ..);
>   // end of BPF program
>   // now the timer is leaked, no one will delete it
> }
> 
> We can not enforce the matching in the verifier, because users would
> have to call bpf_timer_del() before exiting, which is not what we want
> either.

```
bad_ebpf_code()
{
  struct bpf_timer t;
```
is not at all what was proposed. This kind of code will be rejected by the verifier.

'struct bpf_timer' has to be part of the map element and the verifier will enforce that
just like it does so for bpf_spin_lock.
Try writing the following program:
```
bad_ebpf_code()
{
  struct bpf_spin_lock t;
  bpf_spin_lock(&t);
}
```
and then follow the code to see why the verifier rejects it.

The implementation of what I'm proposing is straightforward.
I certainly understand that it might look intimidating and "impossible",
but it's really quite simple.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 23:31                 ` Song Liu
@ 2021-04-05 23:49                   ` Cong Wang
  2021-04-06  1:07                     ` Song Liu
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-05 23:49 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > Ideally I even prefer to create timers in kernel-space too, but as I already
> > explained, this seems impossible to me.
>
> Would hrtimer (include/linux/hrtimer.h) work?

By impossible, I meant it is impossible (to me) to take a refcnt to the callback
prog if we create the timer in kernel-space. So, hrtimer is the same in this
perspective.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-02 23:45     ` Alexei Starovoitov
@ 2021-04-06  0:36       ` Cong Wang
  2021-04-12 23:01         ` Alexei Starovoitov
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-06  0:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Fri, Apr 2, 2021 at 4:45 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > > where the key is the timer ID and the value is the timer expire
> > > > timer.
> > >
> > > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > > bpf object. It doesn't scale.
> >
> > The IDR is per map, not per timer.
>
> Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
> We have 3 IDRs now: for progs, for maps, and for links.
> No other objects need IDRs.
>
> > > Here is how more general timers might look like:
> > > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> > >
> > > include/uapi/linux/bpf.h:
> > > struct bpf_timer {
> > >   u64 opaque;
> > > };
> > > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> >
> > This is my initial design as we already discussed, it does not work,
> > please see below.
>
> It does work. The perceived "issue" you referred to is a misunderstanding. See below.
>
> > >
> > > The prog would do:
> > > struct map_elem {
> > >     int stuff;
> > >     struct bpf_timer timer;
> > > };
> > >
> > > struct {
> > >     __uint(type, BPF_MAP_TYPE_HASH);
> > >     __uint(max_entries, 1);
> > >     __type(key, int);
> > >     __type(value, struct map_elem);
> > > } hmap SEC(".maps");
> > >
> > > static int timer_cb(struct map_elem *elem)
> > > {
> > >     if (whatever && elem->stuff)
> > >         bpf_timer_mod(&elem->timer, new_expire);
> > > }
> > >
> > > int bpf_timer_test(...)
> > > {
> > >     struct map_elem *val;
> > >
> > >     val = bpf_map_lookup_elem(&hmap, &key);
> > >     if (val) {
> > >         bpf_timer_init(&val->timer, timer_cb, flags);
> > >         val->stuff = 123;
> > >         bpf_timer_mod(&val->timer, expires);
> > >     }
> > > }
> > >
> > > bpf_map_update_elem() either from bpf prog or from user space
> > > allocates map element and zeros 8 byte space for the timer pointer.
> > > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > > The validation of timer_cb() is done by the verifier.
> > > bpf_map_delete_elem() either from bpf prog or from user space
> > > does del_timer() if elem->opaque != 0.
> > > If prog refers such hmap as above during prog free the kernel does
> > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > I think that is the simplest way of prevent timers firing past the prog life time.
> > > There could be other ways to solve it (like prog_array and ref/uref).
> > >
> > > Pseudo code:
> > > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > > {
> > >   if (timer->opaque)
> > >     return -EBUSY;
> > >   t = alloc timer_list
> > >   t->cb = timer_cb;
> > >   t->..
> > >   timer->opaque = (long)t;
> > > }
> > >
> > > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > > {
> > >   if (!time->opaque)
> > >     return -EINVAL;
> > >   t = (struct timer_list *)timer->opaque;
> > >   mod_timer(t,..);
> > > }
> > >
> > > int bpf_timer_del(struct bpf_timer *timer)
> > > {
> > >   if (!time->opaque)
> > >     return -EINVAL;
> > >   t = (struct timer_list *)timer->opaque;
> > >   del_timer(t);
> > > }
> > >
> > > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > > via load/store by the program. The same way it does it for bpf_spin_lock.
> >
> > This does not work, because bpf_timer_del() has to be matched
> > with bpf_timer_init(), otherwise we would leak timer resources.
> > For example:
> >
> > SEC("foo")
> > bad_ebpf_code()
> > {
> >   struct bpf_timer t;
> >   bpf_timer_init(&t, ...); // allocate a timer
> >   bpf_timer_mod(&t, ..);
> >   // end of BPF program
> >   // now the timer is leaked, no one will delete it
> > }
> >
> > We can not enforce the matching in the verifier, because users would
> > have to call bpf_timer_del() before exiting, which is not what we want
> > either.
>
> ```
> bad_ebpf_code()
> {
>   struct bpf_timer t;
> ```
> is not at all what was proposed. This kind of code will be rejected by the verifier.
>
> 'struct bpf_timer' has to be part of the map element and the verifier will enforce that
> just like it does so for bpf_spin_lock.
> Try writing the following program:
> ```
> bad_ebpf_code()
> {
>   struct bpf_spin_lock t;
>   bpf_spin_lock(&t);
> }
> ``
> and then follow the code to see why the verifier rejects it.

Well, embedding a spinlock makes sense as it is used to protect
the value it is associated with, but for a timer, no, it has no value
to associate. Even if it does, updating it requires a lock as the
callback can run concurrently with value update. So, they are very
different hence should be treated differently rather than similarly.

>
> The implementation of what I'm proposing is straightforward.
> I certainly understand that it might look intimidating and "impossible",
> but it's really quite simple.

How do you refcnt the struct bpf_prog with your approach? Or with
actually any attempt to create timers in kernel-space. I am not intimidated
but quite happy to hear. If you do it in the verifier, we do not know which
code path is actually executed when running it. If you do it with JIT, I do
not see how JIT can even get the right struct bpf_prog pointer in context.

This is how I concluded it looks impossible, which has nothing to do
with whether we have a map or not. Map issue is much easier to solve,
whether using what you mentioned or what I showed.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-05 23:49                   ` Cong Wang
@ 2021-04-06  1:07                     ` Song Liu
  2021-04-06  1:24                       ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Song Liu @ 2021-04-06  1:07 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> Ideally I even prefer to create timers in kernel-space too, but as I already
>>> explained, this seems impossible to me.
>> 
>> Would hrtimer (include/linux/hrtimer.h) work?
> 
> By impossible, I meant it is impossible (to me) to take a refcnt to the callback
> prog if we create the timer in kernel-space. So, hrtimer is the same in this
> perspective.
> 
> Thanks.

I guess I am not following 100%. Here is what I would propose:

We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type. 
The new program will trigger based on a timer, and the program can somehow 
control the period of the timer (for example, via return value).

With this approach, the user simply can create multiple timer programs and 
hold the fd for them. And these programs trigger upon timer expiration. 

Does this make sense?

Thanks,
Song

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06  1:07                     ` Song Liu
@ 2021-04-06  1:24                       ` Cong Wang
  2021-04-06  6:17                         ` Song Liu
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-06  1:24 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Mon, Apr 5, 2021 at 6:08 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> Ideally I even prefer to create timers in kernel-space too, but as I already
> >>> explained, this seems impossible to me.
> >>
> >> Would hrtimer (include/linux/hrtimer.h) work?
> >
> > By impossible, I meant it is impossible (to me) to take a refcnt to the callback
> > prog if we create the timer in kernel-space. So, hrtimer is the same in this
> > perspective.
> >
> > Thanks.
>
> I guess I am not following 100%. Here is what I would propose:
>
> We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type.
> The new program will trigger based on a timer, and the program can somehow
> control the period of the timer (for example, via return value).

Like we already discussed, with this approach the "timer" itself is not
visible to kernel, that is, only manageable in user-space. Or do you disagree?

>
> With this approach, the user simply can create multiple timer programs and
> hold the fd for them. And these programs trigger up to timer expiration.

Sure, this is precisely why I moved timer creation to user-space to solve
the refcnt issue. ;)

>
> Does this make sense?

Yes, except kernel-space code can't see it. If you look at the timeout map
I had, you will see something like this:

val = lookup(map, key);
if (val && val->expires < now)
   rearm_timer(&timer); // the timer periodically scans the hashmap

For conntrack, this is obviously in kernel-space. The point of the code is to
flush all expired items as soon as possible without doing explicit deletions
which are obviously expensive for the fast path.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06  1:24                       ` Cong Wang
@ 2021-04-06  6:17                         ` Song Liu
  2021-04-06 16:48                           ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Song Liu @ 2021-04-06  6:17 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 5, 2021, at 6:24 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Mon, Apr 5, 2021 at 6:08 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> Ideally I even prefer to create timers in kernel-space too, but as I already
>>>>> explained, this seems impossible to me.
>>>> 
>>>> Would hrtimer (include/linux/hrtimer.h) work?
>>> 
>>> By impossible, I meant it is impossible (to me) to take a refcnt to the callback
>>> prog if we create the timer in kernel-space. So, hrtimer is the same in this
>>> perspective.
>>> 
>>> Thanks.
>> 
>> I guess I am not following 100%. Here is what I would propose:
>> 
>> We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type.
>> The new program will trigger based on a timer, and the program can somehow
>> control the period of the timer (for example, via return value).
> 
> Like we already discussed, with this approach the "timer" itself is not
> visible to kernel, that is, only manageable in user-space. Or do you disagree?

Do you mean we need mechanisms to control the timer, like stop the timer, 
trigger the timer immediately, etc.? And we need these mechanisms in kernel?
And by "in kernel-space" I assume you mean from BPF programs. 

If these are correct, how about something like:

1. A new program BPF_PROG_TYPE_TIMER, which by default will trigger on a timer. 
   Note that, the timer here is embedded in the program. So all the operations
   are on the program. 
2. Allow adding such BPF_PROG_TYPE_TIMER programs to a map of type 
   BPF_MAP_TYPE_PROG_ARRAY. 
3. Some new helpers that access the program via the BPF_MAP_TYPE_PROG_ARRAY map. 
   Actually, maybe we can reuse existing bpf_tail_call(). 

The BPF program and map will look like:

==================== 8< ==========================
struct data_elem {
	__u64 expiration;
	/* other data */
}; 

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 256);
	__type(key, __u32);
	__type(value, struct data_elem);
} data_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 256);
	__type(key, __u32);
	__type(value, __u64);
} timer_prog_map SEC(".maps");

static __u64
check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
                 int *data)
{
	u64 expires = *val;

	if (expires < bpf_jiffies64()) {
		bpf_map_delete_elem(map, key);
		*data++;
	}
 return 0;
}

SEC("timer")
int clean_up_timer(void)
{
	int count;

	bpf_for_each_map_elem(&data_map, check_expired_elem, &count, 0);
	if (count)
 		return 0; // not re-arm this timer
 	else
 		return 10; // reschedule this timer after 10 jiffies
}

SEC("tp_btf/XXX")
int another_trigger(void)
{
	if (some_condition)
		bpf_tail_call(NULL, &timer_prog_map, idx);
	return 0;
}

==================== 8< ==========================

Would something like this work for conntrack?

Thanks,
Song

> 
>> 
>> With this approach, the user simply can create multiple timer programs and
>> hold the fd for them. And these programs trigger up to timer expiration.
> 
> Sure, this is precisely why I moved timer creation to user-space to solve
> the refcnt issue. ;)
> 
>> 
>> Does this make sense?
> 
> Yes, except kernel-space code can't see it. If you look at the timeout map
> I had, you will see something like this:
> 
> val = lookup(map, key);
> if (val && val->expires < now)
>   rearm_timer(&timer); // the timer periodically scans the hashmap
> 
> For conntrack, this is obviously in kernel-space. The point of the code is to
> flush all expired items as soon as possible without doing explicit deletions
> which are obviously expensive for the fast path.
> 
> Thanks.


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06  6:17                         ` Song Liu
@ 2021-04-06 16:48                           ` Cong Wang
  2021-04-06 23:36                             ` Song Liu
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-06 16:48 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Mon, Apr 5, 2021 at 11:18 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Apr 5, 2021, at 6:24 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Mon, Apr 5, 2021 at 6:08 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>
> >>> On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>>
> >>>>> Ideally I even prefer to create timers in kernel-space too, but as I already
> >>>>> explained, this seems impossible to me.
> >>>>
> >>>> Would hrtimer (include/linux/hrtimer.h) work?
> >>>
> >>> By impossible, I meant it is impossible (to me) to take a refcnt to the callback
> >>> prog if we create the timer in kernel-space. So, hrtimer is the same in this
> >>> perspective.
> >>>
> >>> Thanks.
> >>
> >> I guess I am not following 100%. Here is what I would propose:
> >>
> >> We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type.
> >> The new program will trigger based on a timer, and the program can somehow
> >> control the period of the timer (for example, via return value).
> >
> > Like we already discussed, with this approach the "timer" itself is not
> > visible to kernel, that is, only manageable in user-space. Or do you disagree?
>
> Do you mean we need mechanisms to control the timer, like stop the timer,
> trigger the timer immediately, etc.? And we need these mechanisms in kernel?
> And by "in kernel-space" I assume you mean from BPF programs.

Yes, of course. In the context of our discussion, kernel-space only means
eBPF code running in kernel-space. And like I showed in pseudo code,
we want to manage the timer in eBPF code too, that is, updating timer
expiration time and even deleting a timer. The point is we want to give
users as much flexibility as possible, so that they can use it in whatever
scenarios they want. We do not decide how they use them, they do.

>
> If these are correct, how about something like:
>
> 1. A new program BPF_PROG_TYPE_TIMER, which by default will trigger on a timer.
>    Note that, the timer here is embedded in the program. So all the operations
>    are on the program.
> 2. Allow adding such BPF_PROG_TYPE_TIMER programs to a map of type
>    BPF_MAP_TYPE_PROG_ARRAY.
> 3. Some new helpers that access the program via the BPF_MAP_TYPE_PROG_ARRAY map.
>    Actually, maybe we can reuse existing bpf_tail_call().

Reusing bpf_tail_call() just for timer sounds even crazier than
my current approach. So... what's the advantage of your approach
compared to mine?


>
> The BPF program and map will look like:
>
> ==================== 8< ==========================
> struct data_elem {
>         __u64 expiration;
>         /* other data */
> };

So, expiration is separated from "timer" itself. Naturally, expiration
belongs to the timer itself.

>
> struct {
>         __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>         __uint(max_entries, 256);
>         __type(key, __u32);
>         __type(value, struct data_elem);
> } data_map SEC(".maps");
>
> struct {
>         __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>         __uint(max_entries, 256);
>         __type(key, __u32);
>         __type(value, __u64);
> } timer_prog_map SEC(".maps");

So, users have to use two maps just for a timer. Looks unnecessarily
complex to me.

>
> static __u64
> check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
>                  int *data)
> {
>         u64 expires = *val;
>
>         if (expires < bpf_jiffies64()) {

Value is a 64-bit 'expiration', so it is not atomic to read/write it on 32bit
CPU. And user-space could update it in parallel to this timer callback.
So basically we have to use a bpf spinlock here.


>                 bpf_map_delete_elem(map, key);
>                 *data++;
>         }
>  return 0;
> }
>
> SEC("timer")
> int clean_up_timer(void)
> {
>         int count;
>
>         bpf_for_each_map_elem(&data_map, check_expired_elem, &count, 0);
>         if (count)
>                 return 0; // not re-arm this timer
>         else
>                 return 10; // reschedule this timer after 10 jiffies
> }
>
> SEC("tp_btf/XXX")
> int another_trigger(void)
> {
>         if (some_condition)
>                 bpf_tail_call(NULL, &timer_prog_map, idx);

Are you sure you can use bpf_tail_call() to call a prog asynchronously?


>         return 0;
> }
>
> ==================== 8< ==========================
>
> Would something like this work for contract?

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06 16:48                           ` Cong Wang
@ 2021-04-06 23:36                             ` Song Liu
  2021-04-08 22:45                               ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Song Liu @ 2021-04-06 23:36 UTC (permalink / raw)
  To: Cong Wang
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song



> On Apr 6, 2021, at 9:48 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> 
> On Mon, Apr 5, 2021 at 11:18 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Apr 5, 2021, at 6:24 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> 
>>> On Mon, Apr 5, 2021 at 6:08 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Apr 5, 2021, at 4:49 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> 
>>>>> On Fri, Apr 2, 2021 at 4:31 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> On Apr 2, 2021, at 1:57 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>> 
>>>>>>> Ideally I even prefer to create timers in kernel-space too, but as I already
>>>>>>> explained, this seems impossible to me.
>>>>>> 
>>>>>> Would hrtimer (include/linux/hrtimer.h) work?
>>>>> 
>>>>> By impossible, I meant it is impossible (to me) to take a refcnt to the callback
>>>>> prog if we create the timer in kernel-space. So, hrtimer is the same in this
>>>>> perspective.
>>>>> 
>>>>> Thanks.
>>>> 
>>>> I guess I am not following 100%. Here is what I would propose:
>>>> 
>>>> We only introduce a new program type BPF_PROG_TYPE_TIMER. No new map type.
>>>> The new program will trigger based on a timer, and the program can somehow
>>>> control the period of the timer (for example, via return value).
>>> 
>>> Like we already discussed, with this approach the "timer" itself is not
>>> visible to kernel, that is, only manageable in user-space. Or do you disagree?
>> 
>> Do you mean we need mechanisms to control the timer, like stop the timer,
>> trigger the timer immediately, etc.? And we need these mechanisms in kernel?
>> And by "in kernel-space" I assume you mean from BPF programs.
> 
> Yes, of course. In the context of our discussion, kernel-space only means
> eBPF code running in kernel-space. And like I showed in pseudo code,
> we want to manage the timer in eBPF code too, that is, updating timer
> expiration time and even deleting a timer. The point is we want to give
> users as much flexibility as possible, so that they can use it in whatever
> scenarios they want. We do not decide how they use them, they do.
> 
>> 
>> If these are correct, how about something like:
>> 
>> 1. A new program BPF_PROG_TYPE_TIMER, which by default will trigger on a timer.
>>   Note that, the timer here is embedded in the program. So all the operations
>>   are on the program.
>> 2. Allow adding such BPF_PROG_TYPE_TIMER programs to a map of type
>>   BPF_MAP_TYPE_PROG_ARRAY.
>> 3. Some new helpers that access the program via the BPF_MAP_TYPE_PROG_ARRAY map.
>>   Actually, maybe we can reuse existing bpf_tail_call().
> 
> Reusing bpf_tail_call() just for timer sounds even crazier than
> my current approach. So... what's the advantage of your approach
> compared to mine?

Since I don't know much about conntrack, I don't know which is better. 
The following is just my thoughts based on the example you showed. It is 
likely that I misunderstand something. 

IIUC, the problem with user space timer is that we need a dedicated task 
for each wait-trigger loop. So I am proposing a BPF program that would
trigger upon timer expiration. 

The advantage (I think) is to not introduce a separate timer entity. 
The timer is bundled with the program.  

> 
> 
>> 
>> The BPF program and map will look like:
>> 
>> ==================== 8< ==========================
>> struct data_elem {
>>        __u64 expiration;
>>        /* other data */
>> };
> 
> So, expiration is separated from "timer" itself. Naturally, expiration
> belongs to the timer itself.

In this example, expiration is not related to the timer. It is just part
of the data element. It is possible that we won't need the expiration for 
some use cases. 

> 
>> 
>> struct {
>>        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>>        __uint(max_entries, 256);
>>        __type(key, __u32);
>>        __type(value, struct data_elem);
>> } data_map SEC(".maps");
>> 
>> struct {
>>        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>>        __uint(max_entries, 256);
>>        __type(key, __u32);
>>        __type(value, __u64);
>> } timer_prog_map SEC(".maps");
> 
> So, users have to use two maps just for a timer. Looks unnecessarily
> complex to me.

The data_map is not for the timer program, it is for the actual data. 
timer_prog_map is also optional here. We only need it when we want to 
trigger the program sooner than the scheduled time. If we can wait a
little longer, timer_prog_map can also be removed.

> 
>> 
>> static __u64
>> check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
>>                 int *data)
>> {
>>        u64 expires = *val;
>> 
>>        if (expires < bpf_jiffies64()) {
> 
> Value is a 64-bit 'expiration', so it is not atomic to read/write it on 32bit
> CPU. And user-space could update it in parallel to this timer callback.
> So basically we have to use a bpf spinlock here.
> 
> 
>>                bpf_map_delete_elem(map, key);
>>                *data++;
>>        }
>> return 0;
>> }
>> 
>> SEC("timer")
>> int clean_up_timer(void)
>> {
>>        int count;
>> 
>>        bpf_for_each_map_elem(&data_map, check_expired_elem, &count, 0);
>>        if (count)
>>                return 0; // not re-arm this timer
>>        else
>>                return 10; // reschedule this timer after 10 jiffies
>> }
>> 
>> SEC("tp_btf/XXX")
>> int another_trigger(void)
>> {
>>        if (some_condition)
>>                bpf_tail_call(NULL, &timer_prog_map, idx);
> 
> Are you sure you can use bpf_tail_call() to call a prog asynchronously?

I am not sure that we gonna use bpf_tail_call() here. If necessary, we 
can introduce a new helper. 


I am not sure whether this makes sense. I feel there is still some 
misunderstanding. It will be helpful if you can share more information 
about the overall design. 

BTW: this could be a good topic for the BPF office hour. See more details
here:

https://docs.google.com/spreadsheets/d/1LfrDXZ9-fdhvPEp_LHkxAMYyxxpwBXjywWa0AejEveU/edit#gid=0

Thanks,
Song

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06 23:36                             ` Song Liu
@ 2021-04-08 22:45                               ` Cong Wang
  0 siblings, 0 replies; 79+ messages in thread
From: Cong Wang @ 2021-04-08 22:45 UTC (permalink / raw)
  To: Song Liu
  Cc: open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools),
	duanxiongchun, wangdongdong.6, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Yonghong Song

On Tue, Apr 6, 2021 at 4:36 PM Song Liu <songliubraving@fb.com> wrote:
> I am not sure whether this makes sense. I feel there is still some
> misunderstanding. It will be helpful if you can share more information
> about the overall design.
>
> BTW: this could be a good topic for the BPF office hour. See more details
> here:
>
> https://docs.google.com/spreadsheets/d/1LfrDXZ9-fdhvPEp_LHkxAMYyxxpwBXjywWa0AejEveU/edit#gid=0
>

This is a good idea. I have requested for a slot next Thursday,
I am looking forward to discussing bpf timer at that time.

Thanks!

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-06  0:36       ` Cong Wang
@ 2021-04-12 23:01         ` Alexei Starovoitov
  2021-04-15  4:02           ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-04-12 23:01 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Mon, Apr 05, 2021 at 05:36:27PM -0700, Cong Wang wrote:
> On Fri, Apr 2, 2021 at 4:45 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > > > where the key is the timer ID and the value is the timer expire
> > > > > timer.
> > > >
> > > > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > > > bpf object. It doesn't scale.
> > >
> > > The IDR is per map, not per timer.
> >
> > Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
> > We have 3 IDRs now: for progs, for maps, and for links.
> > No other objects need IDRs.
> >
> > > > Here is how more general timers might look like:
> > > > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> > > >
> > > > include/uapi/linux/bpf.h:
> > > > struct bpf_timer {
> > > >   u64 opaque;
> > > > };
> > > > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> > >
> > > This is my initial design as we already discussed, it does not work,
> > > please see below.
> >
> > It does work. The perceived "issue" you referred to is a misunderstanding. See below.
> >
> > > >
> > > > The prog would do:
> > > > struct map_elem {
> > > >     int stuff;
> > > >     struct bpf_timer timer;
> > > > };
> > > >
> > > > struct {
> > > >     __uint(type, BPF_MAP_TYPE_HASH);
> > > >     __uint(max_entries, 1);
> > > >     __type(key, int);
> > > >     __type(value, struct map_elem);
> > > > } hmap SEC(".maps");
> > > >
> > > > static int timer_cb(struct map_elem *elem)
> > > > {
> > > >     if (whatever && elem->stuff)
> > > >         bpf_timer_mod(&elem->timer, new_expire);
> > > > }
> > > >
> > > > int bpf_timer_test(...)
> > > > {
> > > >     struct map_elem *val;
> > > >
> > > >     val = bpf_map_lookup_elem(&hmap, &key);
> > > >     if (val) {
> > > >         bpf_timer_init(&val->timer, timer_cb, flags);
> > > >         val->stuff = 123;
> > > >         bpf_timer_mod(&val->timer, expires);
> > > >     }
> > > > }
> > > >
> > > > bpf_map_update_elem() either from bpf prog or from user space
> > > > allocates map element and zeros 8 byte space for the timer pointer.
> > > > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > > > The validation of timer_cb() is done by the verifier.
> > > > bpf_map_delete_elem() either from bpf prog or from user space
> > > > does del_timer() if elem->opaque != 0.
> > > > If prog refers such hmap as above during prog free the kernel does
> > > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > > I think that is the simplest way of preventing timers from firing past the prog lifetime.
> > > > There could be other ways to solve it (like prog_array and ref/uref).
> > > >
> > > > Pseudo code:
> > > > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > > > {
> > > >   if (timer->opaque)
> > > >     return -EBUSY;
> > > >   t = alloc timer_list
> > > >   t->cb = timer_cb;
> > > >   t->..
> > > >   timer->opaque = (long)t;
> > > > }
> > > >
> > > > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > > > {
> > > >   if (!timer->opaque)
> > > >     return -EINVAL;
> > > >   t = (struct timer_list *)timer->opaque;
> > > >   mod_timer(t,..);
> > > > }
> > > >
> > > > int bpf_timer_del(struct bpf_timer *timer)
> > > > {
> > > >   if (!timer->opaque)
> > > >     return -EINVAL;
> > > >   t = (struct timer_list *)timer->opaque;
> > > >   del_timer(t);
> > > > }
> > > >
> > > > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > > > via load/store by the program. The same way it does it for bpf_spin_lock.
> > >
> > > This does not work, because bpf_timer_del() has to be matched
> > > with bpf_timer_init(), otherwise we would leak timer resources.
> > > For example:
> > >
> > > SEC("foo")
> > > bad_ebpf_code()
> > > {
> > >   struct bpf_timer t;
> > >   bpf_timer_init(&t, ...); // allocate a timer
> > >   bpf_timer_mod(&t, ..);
> > >   // end of BPF program
> > >   // now the timer is leaked, no one will delete it
> > > }
> > >
> > > We can not enforce the matching in the verifier, because users would
> > > have to call bpf_timer_del() before exiting, which is not what we want
> > > either.
> >
> > ```
> > bad_ebpf_code()
> > {
> >   struct bpf_timer t;
> > ```
> > is not at all what was proposed. This kind of code will be rejected by the verifier.
> >
> > 'struct bpf_timer' has to be part of the map element and the verifier will enforce that
> > just like it does so for bpf_spin_lock.
> > Try writing the following program:
> > ```
> > bad_ebpf_code()
> > {
> >   struct bpf_spin_lock t;
> >   bpf_spin_lock(&t);
> > }
> > ``
> > and then follow the code to see why the verifier rejects it.
> 
> Well, embedding a spinlock makes sense as it is used to protect
> the value it is associated with, but for a timer, no, it has no value
> to associate. 

The way kernel code is using timers is always by embedding timer_list
into another data structure and then using container_of() in a callback.
So all existing use cases of timers disagree with your point.

> Even if it does, updating it requires a lock as the
> callback can run concurrently with value update. 

No lock is necessary.
map_value_update_elem can either return EBUSY if timer_list != NULL
or it can del_timer() before updating the whole value.
Both choices can be expressed with flags.

> So, they are very
> different hence should be treated differently rather than similarly.
> 
> >
> > The implementation of what I'm proposing is straightforward.
> > I certainly understand that it might look intimidating and "impossible",
> > but it's really quite simple.
> 
> How do you refcnt the struct bpf_prog with your approach? Or with

you don't. More so prog must not be refcnted otherwise it's a circular
dependency between progs and maps.
We did that in the past with prog_array and the api became unpleasant
and not user friendly. Not going to repeat the same mistake again.

> actually any attempt to create timers in kernel-space. I am not intimidated
> but quite happy to hear. If you do it in the verifier, we do not know which
> code path is actually executed when running it. If you do it with JIT, I do
> not see how JIT can even get the right struct bpf_prog pointer in context.

Neither. See pseudo code for bpf_timer_init/bpf_timer_mod in the earlier email.

> This is how I concluded it looks impossible.

Please explain what 'impossible' or buggy you see in the pseudo code.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-12 23:01         ` Alexei Starovoitov
@ 2021-04-15  4:02           ` Cong Wang
  2021-04-15  4:25             ` Alexei Starovoitov
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-15  4:02 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Mon, Apr 12, 2021 at 4:01 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Apr 05, 2021 at 05:36:27PM -0700, Cong Wang wrote:
> > On Fri, Apr 2, 2021 at 4:45 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > > > > where the key is the timer ID and the value is the timer expire
> > > > > > timer.
> > > > >
> > > > > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > > > > bpf object. It doesn't scale.
> > > >
> > > > The IDR is per map, not per timer.
> > >
> > > Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
> > > We have 3 IDRs now: for progs, for maps, and for links.
> > > No other objects need IDRs.
> > >
> > > > > Here is how more general timers might look like:
> > > > > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> > > > >
> > > > > include/uapi/linux/bpf.h:
> > > > > struct bpf_timer {
> > > > >   u64 opaque;
> > > > > };
> > > > > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> > > >
> > > > This is my initial design as we already discussed, it does not work,
> > > > please see below.
> > >
> > > It does work. The perceived "issue" you referred to is a misunderstanding. See below.
> > >
> > > > >
> > > > > The prog would do:
> > > > > struct map_elem {
> > > > >     int stuff;
> > > > >     struct bpf_timer timer;
> > > > > };
> > > > >
> > > > > struct {
> > > > >     __uint(type, BPF_MAP_TYPE_HASH);
> > > > >     __uint(max_entries, 1);
> > > > >     __type(key, int);
> > > > >     __type(value, struct map_elem);
> > > > > } hmap SEC(".maps");
> > > > >
> > > > > static int timer_cb(struct map_elem *elem)
> > > > > {
> > > > >     if (whatever && elem->stuff)
> > > > >         bpf_timer_mod(&elem->timer, new_expire);
> > > > > }
> > > > >
> > > > > int bpf_timer_test(...)
> > > > > {
> > > > >     struct map_elem *val;
> > > > >
> > > > >     val = bpf_map_lookup_elem(&hmap, &key);
> > > > >     if (val) {
> > > > >         bpf_timer_init(&val->timer, timer_cb, flags);
> > > > >         val->stuff = 123;
> > > > >         bpf_timer_mod(&val->timer, expires);
> > > > >     }
> > > > > }
> > > > >
> > > > > bpf_map_update_elem() either from bpf prog or from user space
> > > > > allocates map element and zeros 8 byte space for the timer pointer.
> > > > > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > > > > The validation of timer_cb() is done by the verifier.
> > > > > bpf_map_delete_elem() either from bpf prog or from user space
> > > > > does del_timer() if elem->opaque != 0.
> > > > > If prog refers such hmap as above during prog free the kernel does
> > > > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > > > I think that is the simplest way of preventing timers from firing past the prog lifetime.
> > > > > There could be other ways to solve it (like prog_array and ref/uref).
> > > > >
> > > > > Pseudo code:
> > > > > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > > > > {
> > > > >   if (timer->opaque)
> > > > >     return -EBUSY;
> > > > >   t = alloc timer_list
> > > > >   t->cb = timer_cb;
> > > > >   t->..
> > > > >   timer->opaque = (long)t;
> > > > > }
> > > > >
> > > > > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > > > > {
> > > > >   if (!timer->opaque)
> > > > >     return -EINVAL;
> > > > >   t = (struct timer_list *)timer->opaque;
> > > > >   mod_timer(t,..);
> > > > > }
> > > > >
> > > > > int bpf_timer_del(struct bpf_timer *timer)
> > > > > {
> > > > >   if (!timer->opaque)
> > > > >     return -EINVAL;
> > > > >   t = (struct timer_list *)timer->opaque;
> > > > >   del_timer(t);
> > > > > }
> > > > >
> > > > > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > > > > via load/store by the program. The same way it does it for bpf_spin_lock.
> > > >
> > > > This does not work, because bpf_timer_del() has to be matched
> > > > with bpf_timer_init(), otherwise we would leak timer resources.
> > > > For example:
> > > >
> > > > SEC("foo")
> > > > bad_ebpf_code()
> > > > {
> > > >   struct bpf_timer t;
> > > >   bpf_timer_init(&t, ...); // allocate a timer
> > > >   bpf_timer_mod(&t, ..);
> > > >   // end of BPF program
> > > >   // now the timer is leaked, no one will delete it
> > > > }
> > > >
> > > > We can not enforce the matching in the verifier, because users would
> > > > have to call bpf_timer_del() before exiting, which is not what we want
> > > > either.
> > >
> > > ```
> > > bad_ebpf_code()
> > > {
> > >   struct bpf_timer t;
> > > ```
> > > is not at all what was proposed. This kind of code will be rejected by the verifier.
> > >
> > > 'struct bpf_timer' has to be part of the map element and the verifier will enforce that
> > > just like it does so for bpf_spin_lock.
> > > Try writing the following program:
> > > ```
> > > bad_ebpf_code()
> > > {
> > >   struct bpf_spin_lock t;
> > >   bpf_spin_lock(&t);
> > > }
> > > ``
> > > and then follow the code to see why the verifier rejects it.
> >
> > Well, embedding a spinlock makes sense as it is used to protect
> > the value it is associated with, but for a timer, no, it has no value
> > to associate.
>
> The way kernel code is using timers is always by embedding timer_list
> into another data structure and then using container_of() in a callback.
> So all existing use cases of timers disagree with your point.

Not always. Data can be passed as a global data structure visible to
timer callback.

>
> > Even if it does, updating it requires a lock as the
> > callback can run concurrently with value update.
>
> No lock is necessary.
> map_value_update_elem can either return EBUSY if timer_list != NULL
> or it can del_timer() before updating the whole value.
> Both choices can be expressed with flags.

This sounds problematic, because the hash map is visible to
users but not the timers associated, hence in user-space users
just unexpectedly get EBUSY.

>
> > So, they are very
> > different hence should be treated differently rather than similarly.
> >
> > >
> > > The implementation of what I'm proposing is straightforward.
> > > I certainly understand that it might look intimidating and "impossible",
> > > but it's really quite simple.
> >
> > How do you refcnt the struct bpf_prog with your approach? Or with
>
> you don't. More so prog must not be refcnted otherwise it's a circular
> dependency between progs and maps.
> We did that in the past with prog_array and the api became unpleasant
> and not user friendly. Not going to repeat the same mistake again.

Then how do you prevent prog being unloaded when the timer callback
is still active?


>
> > actually any attempt to create timers in kernel-space. I am not intimidated
> > but quite happy to hear. If you do it in the verifier, we do not know which
> > code path is actually executed when running it. If you do it with JIT, I do
> > not see how JIT can even get the right struct bpf_prog pointer in context.
>
> Neither. See pseudo code for bpf_timer_init/bpf_timer_mod in the earlier email.
>
> > This is how I concluded it looks impossible.
>
> Please explain what 'impossible' or buggy you see in the pseudo code.

Your pseudo code never shows how to refcnt the struct bpf_prog, which
is critical to the bpf timer design.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-15  4:02           ` Cong Wang
@ 2021-04-15  4:25             ` Alexei Starovoitov
  2021-04-15 15:51               ` Cong Wang
  2021-04-26 23:00               ` Cong Wang
  0 siblings, 2 replies; 79+ messages in thread
From: Alexei Starovoitov @ 2021-04-15  4:25 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Wed, Apr 14, 2021 at 9:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Mon, Apr 12, 2021 at 4:01 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Apr 05, 2021 at 05:36:27PM -0700, Cong Wang wrote:
> > > On Fri, Apr 2, 2021 at 4:45 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Fri, Apr 02, 2021 at 02:24:51PM -0700, Cong Wang wrote:
> > > > > > > where the key is the timer ID and the value is the timer expire
> > > > > > > timer.
> > > > > >
> > > > > > The timer ID is unnecessary. We cannot introduce new IDR for every new
> > > > > > bpf object. It doesn't scale.
> > > > >
> > > > > The IDR is per map, not per timer.
> > > >
> > > > Per-map is not acceptable. One IDR for all maps with timers is not acceptable either.
> > > > We have 3 IDRs now: for progs, for maps, and for links.
> > > > No other objects need IDRs.
> > > >
> > > > > > Here is how more general timers might look like:
> > > > > > https://lore.kernel.org/bpf/20210310011905.ozz4xahpkqbfkkvd@ast-mbp.dhcp.thefacebook.com/
> > > > > >
> > > > > > include/uapi/linux/bpf.h:
> > > > > > struct bpf_timer {
> > > > > >   u64 opaque;
> > > > > > };
> > > > > > The 'opaque' field contains a pointer to dynamically allocated struct timer_list and other data.
> > > > >
> > > > > This is my initial design as we already discussed, it does not work,
> > > > > please see below.
> > > >
> > > > It does work. The perceived "issue" you referred to is a misunderstanding. See below.
> > > >
> > > > > >
> > > > > > The prog would do:
> > > > > > struct map_elem {
> > > > > >     int stuff;
> > > > > >     struct bpf_timer timer;
> > > > > > };
> > > > > >
> > > > > > struct {
> > > > > >     __uint(type, BPF_MAP_TYPE_HASH);
> > > > > >     __uint(max_entries, 1);
> > > > > >     __type(key, int);
> > > > > >     __type(value, struct map_elem);
> > > > > > } hmap SEC(".maps");
> > > > > >
> > > > > > static int timer_cb(struct map_elem *elem)
> > > > > > {
> > > > > >     if (whatever && elem->stuff)
> > > > > >         bpf_timer_mod(&elem->timer, new_expire);
> > > > > > }
> > > > > >
> > > > > > int bpf_timer_test(...)
> > > > > > {
> > > > > >     struct map_elem *val;
> > > > > >
> > > > > >     val = bpf_map_lookup_elem(&hmap, &key);
> > > > > >     if (val) {
> > > > > >         bpf_timer_init(&val->timer, timer_cb, flags);
> > > > > >         val->stuff = 123;
> > > > > >         bpf_timer_mod(&val->timer, expires);
> > > > > >     }
> > > > > > }
> > > > > >
> > > > > > bpf_map_update_elem() either from bpf prog or from user space
> > > > > > allocates map element and zeros 8 byte space for the timer pointer.
> > > > > > bpf_timer_init() allocates timer_list and stores it into opaque if opaque == 0.
> > > > > > The validation of timer_cb() is done by the verifier.
> > > > > > bpf_map_delete_elem() either from bpf prog or from user space
> > > > > > does del_timer() if elem->opaque != 0.
> > > > > > If prog refers such hmap as above during prog free the kernel does
> > > > > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > > > > I think that is the simplest way of preventing timers from firing past the prog lifetime.
> > > > > > There could be other ways to solve it (like prog_array and ref/uref).
> > > > > >
> > > > > > Pseudo code:
> > > > > > int bpf_timer_init(struct bpf_timer *timer, void *timer_cb, int flags)
> > > > > > {
> > > > > >   if (timer->opaque)
> > > > > >     return -EBUSY;
> > > > > >   t = alloc timer_list
> > > > > >   t->cb = timer_cb;
> > > > > >   t->..
> > > > > >   timer->opaque = (long)t;
> > > > > > }
> > > > > >
> > > > > > int bpf_timer_mod(struct bpf_timer *timer, u64 expires)
> > > > > > {
> > > > > >   if (!timer->opaque)
> > > > > >     return -EINVAL;
> > > > > >   t = (struct timer_list *)timer->opaque;
> > > > > >   mod_timer(t,..);
> > > > > > }
> > > > > >
> > > > > > int bpf_timer_del(struct bpf_timer *timer)
> > > > > > {
> > > > > >   if (!timer->opaque)
> > > > > >     return -EINVAL;
> > > > > >   t = (struct timer_list *)timer->opaque;
> > > > > >   del_timer(t);
> > > > > > }
> > > > > >
> > > > > > The verifier would need to check that 8 bytes occupied by bpf_timer and not accessed
> > > > > > via load/store by the program. The same way it does it for bpf_spin_lock.
> > > > >
> > > > > This does not work, because bpf_timer_del() has to be matched
> > > > > with bpf_timer_init(), otherwise we would leak timer resources.
> > > > > For example:
> > > > >
> > > > > SEC("foo")
> > > > > bad_ebpf_code()
> > > > > {
> > > > >   struct bpf_timer t;
> > > > >   bpf_timer_init(&t, ...); // allocate a timer
> > > > >   bpf_timer_mod(&t, ..);
> > > > >   // end of BPF program
> > > > >   // now the timer is leaked, no one will delete it
> > > > > }
> > > > >
> > > > > We can not enforce the matching in the verifier, because users would
> > > > > have to call bpf_timer_del() before exiting, which is not what we want
> > > > > either.
> > > >
> > > > ```
> > > > bad_ebpf_code()
> > > > {
> > > >   struct bpf_timer t;
> > > > ```
> > > > is not at all what was proposed. This kind of code will be rejected by the verifier.
> > > >
> > > > 'struct bpf_timer' has to be part of the map element and the verifier will enforce that
> > > > just like it does so for bpf_spin_lock.
> > > > Try writing the following program:
> > > > ```
> > > > bad_ebpf_code()
> > > > {
> > > >   struct bpf_spin_lock t;
> > > >   bpf_spin_lock(&t);
> > > > }
> > > > ``
> > > > and then follow the code to see why the verifier rejects it.
> > >
> > > Well, embedding a spinlock makes sense as it is used to protect
> > > the value it is associated with, but for a timer, no, it has no value
> > > to associate.
> >
> > The way kernel code is using timers is always by embedding timer_list
> > into another data structure and then using container_of() in a callback.
> > So all existing use cases of timers disagree with your point.
>
> Not always. Data can be passed as a global data structure visible to
> timer callback.

global data is racy. That's not an option at all.

> >
> > > Even if it does, updating it requires a lock as the
> > > callback can run concurrently with value update.
> >
> > No lock is necessary.
> > map_value_update_elem can either return EBUSY if timer_list != NULL
> > or it can del_timer() before updating the whole value.
> > Both choices can be expressed with flags.
>
> This sounds problematic, because the hash map is visible to
> users but not the timers associated, hence in user-space users
> just unexpectedly get EBUSY.

As I said earlier:
"
bpf_map_update_elem() either from bpf prog or from user space
allocates map element and zeros 8 byte space for the timer pointer.
"
and also said that EBUSY could be default or non default behavior
expressed with flags passed into update.

> >
> > > So, they are very
> > > different hence should be treated differently rather than similarly.
> > >
> > > >
> > > > The implementation of what I'm proposing is straightforward.
> > > > I certainly understand that it might look intimidating and "impossible",
> > > > but it's really quite simple.
> > >
> > > How do you refcnt the struct bpf_prog with your approach? Or with
> >
> > you don't. More so prog must not be refcnted otherwise it's a circular
> > dependency between progs and maps.
> > We did that in the past with prog_array and the api became unpleasant
> > and not user friendly. Not going to repeat the same mistake again.
>
> Then how do you prevent prog being unloaded when the timer callback
> is still active?

As I said earlier:
"
If prog refers such hmap as above during prog free the kernel does
for_each_map_elem {if (elem->opaque) del_timer().}
"

>
> >
> > > actually any attempt to create timers in kernel-space. I am not intimidated
> > > but quite happy to hear. If you do it in the verifier, we do not know which
> > > code path is actually executed when running it. If you do it with JIT, I do
> > > not see how JIT can even get the right struct bpf_prog pointer in context.
> >
> > Neither. See pseudo code for bpf_timer_init/bpf_timer_mod in the earlier email.
> >
> > > This is how I concluded it looks impossible.
> >
> > Please explain what 'impossible' or buggy you see in the pseudo code.
>
> Your pseudo code never shows how to refcnt the struct bpf_prog, which
> is critical to the bpf timer design.

As I said earlier: nack to refcnt progs.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-15  4:25             ` Alexei Starovoitov
@ 2021-04-15 15:51               ` Cong Wang
  2021-04-26 23:00               ` Cong Wang
  1 sibling, 0 replies; 79+ messages in thread
From: Cong Wang @ 2021-04-15 15:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, duanxiongchun,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song

On Wed, Apr 14, 2021 at 9:25 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> As I said earlier:
> "
> If prog refers such hmap as above during prog free the kernel does
> for_each_map_elem {if (elem->opaque) del_timer().}
> "

This goes back to our previous discussion. Forcing timer deletions on
prog exit is not what we want. The whole point of using a map is to
extend the lifetime of a timer, that is, as long as the map exists, the
timers within it could be still running too.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-15  4:25             ` Alexei Starovoitov
  2021-04-15 15:51               ` Cong Wang
@ 2021-04-26 23:00               ` Cong Wang
  2021-04-26 23:05                 ` Alexei Starovoitov
  1 sibling, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-26 23:00 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela, Jamal Hadi Salim

Hi, Alexei

On Wed, Apr 14, 2021 at 9:25 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Apr 14, 2021 at 9:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > Then how do you prevent prog being unloaded when the timer callback
> > is still active?
>
> As I said earlier:
> "
> If prog refers such hmap as above during prog free the kernel does
> for_each_map_elem {if (elem->opaque) del_timer().}
> "

I have discussed this with my colleagues, sharing timers among different
eBPF programs is a must-have feature for conntrack.

For conntrack, we need to attach two eBPF programs, one on egress and
one on ingress. They share a conntrack table (an eBPF map), and no matter
we use a per-map or per-entry timer, updating the timer(s) could happen
on both sides, hence timers must be shared for both.

So, your proposal we discussed does not work well for this scenario. The
proposal in my RFC should still work. Please let me know if you have any
better ideas.

Thanks!

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-26 23:00               ` Cong Wang
@ 2021-04-26 23:05                 ` Alexei Starovoitov
  2021-04-26 23:37                   ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-04-26 23:05 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela, Jamal Hadi Salim

On Mon, Apr 26, 2021 at 4:00 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> Hi, Alexei
>
> On Wed, Apr 14, 2021 at 9:25 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Apr 14, 2021 at 9:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > Then how do you prevent prog being unloaded when the timer callback
> > > is still active?
> >
> > As I said earlier:
> > "
> > If prog refers such hmap as above during prog free the kernel does
> > for_each_map_elem {if (elem->opaque) del_timer().}
> > "
>
> I have discussed this with my colleagues, sharing timers among different
> eBPF programs is a must-have feature for conntrack.
>
> For conntrack, we need to attach two eBPF programs, one on egress and
> one on ingress. They share a conntrack table (an eBPF map), and no matter
> we use a per-map or per-entry timer, updating the timer(s) could happen
> on both sides, hence timers must be shared for both.
>
> So, your proposal we discussed does not work well for this scenario.

why? The timer inside the map element will be shared just fine.
Just like different progs can see the same map value.

Also if your colleagues have something to share they should be
posting to the mailing list. Right now you're acting as a broken phone
passing info back and forth and the knowledge gets lost.
Please ask your colleagues to participate online.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-26 23:05                 ` Alexei Starovoitov
@ 2021-04-26 23:37                   ` Cong Wang
  2021-04-27  2:01                     ` Alexei Starovoitov
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-26 23:37 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela, Jamal Hadi Salim

On Mon, Apr 26, 2021 at 4:05 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Apr 26, 2021 at 4:00 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > Hi, Alexei
> >
> > On Wed, Apr 14, 2021 at 9:25 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Wed, Apr 14, 2021 at 9:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > Then how do you prevent prog being unloaded when the timer callback
> > > > is still active?
> > >
> > > As I said earlier:
> > > "
> > > If prog refers such hmap as above during prog free the kernel does
> > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > "
> >
> > I have discussed this with my colleagues, sharing timers among different
> > eBPF programs is a must-have feature for conntrack.
> >
> > For conntrack, we need to attach two eBPF programs, one on egress and
> > one on ingress. They share a conntrack table (an eBPF map), and no matter
> > we use a per-map or per-entry timer, updating the timer(s) could happen
> > on both sides, hence timers must be shared for both.
> >
> > So, your proposal we discussed does not work well for this scenario.
>
> why? The timer inside the map element will be shared just fine.
> Just like different progs can see the same map value.

Hmm? In the above quotes from you, you suggested removing all the
timers installed by one eBPF program when it is freed, but they could
still be running independently of which program installed them.

In other words, timers are independent of other eBPF programs, so
they should not have an owner. With your proposal, the owner of a timer
is the program which contains the subprog (or callback) of the timer.
With my proposal, the timer callback is a standalone program hence has
no owner.

>
> Also if your colleagues have something to share they should be
> posting to the mailing list. Right now you're acting as a broken phone
> passing info back and forth and the knowledge gets lost.
> Please ask your colleagues to participate online.

They are already in CC from the very beginning. And our use case is
public, it is Cilium conntrack:
https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack.h

The entries of the code are:
https://github.com/cilium/cilium/blob/master/bpf/bpf_lxc.c

The maps for conntrack are:
https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack_map.h

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-26 23:37                   ` Cong Wang
@ 2021-04-27  2:01                     ` Alexei Starovoitov
  2021-04-27 11:52                       ` Jamal Hadi Salim
  2021-04-27 16:36                       ` Cong Wang
  0 siblings, 2 replies; 79+ messages in thread
From: Alexei Starovoitov @ 2021-04-27  2:01 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela, Jamal Hadi Salim

On Mon, Apr 26, 2021 at 04:37:19PM -0700, Cong Wang wrote:
> On Mon, Apr 26, 2021 at 4:05 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Apr 26, 2021 at 4:00 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > Hi, Alexei
> > >
> > > On Wed, Apr 14, 2021 at 9:25 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Wed, Apr 14, 2021 at 9:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >
> > > > > Then how do you prevent prog being unloaded when the timer callback
> > > > > is still active?
> > > >
> > > > As I said earlier:
> > > > "
> > > > If prog refers such hmap as above during prog free the kernel does
> > > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > > "
> > >
> > > I have discussed this with my colleagues, sharing timers among different
> > > eBPF programs is a must-have feature for conntrack.
> > >
> > > For conntrack, we need to attach two eBPF programs, one on egress and
> > > one on ingress. They share a conntrack table (an eBPF map), and no matter
> > > we use a per-map or per-entry timer, updating the timer(s) could happen
> > > on both sides, hence timers must be shared for both.
> > >
> > > So, your proposal we discussed does not work well for this scenario.
> >
> > why? The timer inside the map element will be shared just fine.
> > Just like different progs can see the same map value.
> 
> Hmm? In the above quotes from you, you suggested removing all the
> timers installed by one eBPF program when it is freed, but they could be
> still running independent of which program installs them.

Right. That was before the office hours chat where we discussed an approach
to remove timers installed by this particular prog only.
The timers armed by other progs in the same map would be preserved.

> In other words, timers are independent of other eBPF programs, so
> they should not have an owner. With your proposal, the owner of a timer
> is the program which contains the subprog (or callback) of the timer.

right. so?
How is this anything to do with "sharing timers among different eBPF programs"?

> >
> > Also if your colleagues have something to share they should be
> > posting to the mailing list. Right now you're acting as a broken phone
> > passing info back and forth and the knowledge gets lost.
> > Please ask your colleagues to participate online.
> 
> They are already in CC from the very beginning. And our use case is
> public, it is Cilium conntrack:
> https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack.h
> 
> The entries of the code are:
> https://github.com/cilium/cilium/blob/master/bpf/bpf_lxc.c
> 
> The maps for conntrack are:
> https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack_map.h

If that's the only goal then kernel timers are not needed.
cilium conntrack works well as-is.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-27  2:01                     ` Alexei Starovoitov
@ 2021-04-27 11:52                       ` Jamal Hadi Salim
  2021-04-27 16:36                       ` Cong Wang
  1 sibling, 0 replies; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-04-27 11:52 UTC (permalink / raw)
  To: Alexei Starovoitov, Cong Wang
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela

On 2021-04-26 10:01 p.m., Alexei Starovoitov wrote:

[..]
>>
>> They are already in CC from the very beginning. And our use case is
>> public, it is Cilium conntrack:
>> https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack.h
>>
>> The entries of the code are:
>> https://github.com/cilium/cilium/blob/master/bpf/bpf_lxc.c
>>
>> The maps for conntrack are:
>> https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack_map.h
> 
> If that's the only goal then kernel timers are not needed.
> cilium conntrack works well as-is.

IIRC, the original patch from Cong was driven by the need to scale said
conntracking in the presence of a large number of flows.
The argument I heard from Cong is that LRU doesn't scale in such a setup.

I would argue timers generally are useful for a variety of house
keeping purposes and they are currently missing from ebpf. This
despite Cong's use case.
Currently things in the datapath are triggered by either packets
showing up or from a control plane perspective by user space polling.

Our use case (honestly, not that it matters to justify why we need
timers) is that we want, if some condition is met in the
kernel, to periodically send unsolicited housekeeping events to user space.

Hope that helps.

cheers,
jamal


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-27  2:01                     ` Alexei Starovoitov
  2021-04-27 11:52                       ` Jamal Hadi Salim
@ 2021-04-27 16:36                       ` Cong Wang
  2021-04-27 18:33                         ` Alexei Starovoitov
  1 sibling, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-04-27 16:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela, Jamal Hadi Salim

On Mon, Apr 26, 2021 at 7:02 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Apr 26, 2021 at 04:37:19PM -0700, Cong Wang wrote:
> > On Mon, Apr 26, 2021 at 4:05 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Apr 26, 2021 at 4:00 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > Hi, Alexei
> > > >
> > > > On Wed, Apr 14, 2021 at 9:25 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Wed, Apr 14, 2021 at 9:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > >
> > > > > > Then how do you prevent prog being unloaded when the timer callback
> > > > > > is still active?
> > > > >
> > > > > As I said earlier:
> > > > > "
> > > > > If prog refers such hmap as above during prog free the kernel does
> > > > > for_each_map_elem {if (elem->opaque) del_timer().}
> > > > > "
> > > >
> > > > I have discussed this with my colleagues, sharing timers among different
> > > > eBPF programs is a must-have feature for conntrack.
> > > >
> > > > For conntrack, we need to attach two eBPF programs, one on egress and
> > > > one on ingress. They share a conntrack table (an eBPF map), and no matter
> > > > we use a per-map or per-entry timer, updating the timer(s) could happen
> > > > on both sides, hence timers must be shared for both.
> > > >
> > > > So, your proposal we discussed does not work well for this scenario.
> > >
> > > why? The timer inside the map element will be shared just fine.
> > > Just like different progs can see the same map value.
> >
> > Hmm? In the above quotes from you, you suggested removing all the
> > timers installed by one eBPF program when it is freed, but they could be
> > still running independent of which program installs them.
>
> Right. That was before the office hours chat where we discussed an approach
> to remove timers installed by this particular prog only.
> The timers armed by other progs in the same map would be preserved.
>
> > In other words, timers are independent of other eBPF programs, so
> > they should not have an owner. With your proposal, the owner of a timer
> > is the program which contains the subprog (or callback) of the timer.
>
> right. so?
> How is this anything to do with "sharing timers among different eBPF programs"?

It matters a lot which program installs and hence removes these timers,
because conceptually each connection inside a conntrack table does not
belong to any program, and neither do the timers associated with these
connections.

If we enforce this ownership, in case of conntrack the owner would be
the program which sees the connection first, which is pretty much
unpredictable. For example, if the ingress program sees a connection
first, it installs a timer for this connection, but the traffic is
bidirectional,
hence egress program needs this connection and its timer too, we
should not remove this timer when the ingress program is freed.

From another point of view: maps and programs are both first-class
resources in eBPF, a timer is stored in a map and associated with a
program, so it is naturally a first-class resource too.

>
> > >
> > > Also if your colleagues have something to share they should be
> > > posting to the mailing list. Right now you're acting as a broken phone
> > > passing info back and forth and the knowledge gets lost.
> > > Please ask your colleagues to participate online.
> >
> > They are already in CC from the very beginning. And our use case is
> > public, it is Cilium conntrack:
> > https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack.h
> >
> > The entries of the code are:
> > https://github.com/cilium/cilium/blob/master/bpf/bpf_lxc.c
> >
> > The maps for conntrack are:
> > https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack_map.h
>
> If that's the only goal then kernel timers are not needed.
> cilium conntrack works well as-is.

We don't go back to why user-space cleanup is inefficient again,
do we? ;)

More importantly, although conntrack is our use case, we don't
design timers just for our case, obviously. Timers must be as flexible
to use as possible, to allow other future use cases.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-27 16:36                       ` Cong Wang
@ 2021-04-27 18:33                         ` Alexei Starovoitov
  2021-05-09  5:37                           ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-04-27 18:33 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela, Jamal Hadi Salim

On Tue, Apr 27, 2021 at 9:36 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> If we enforce this ownership, in case of conntrack the owner would be
> the program which sees the connection first, which is pretty much
> unpredictable. For example, if the ingress program sees a connection
> first, it installs a timer for this connection, but the traffic is
> bidirectional,
> hence egress program needs this connection and its timer too, we
> should not remove this timer when the ingress program is freed.

Sure. That's trivially achieved with pinning.
One can have an ingress prog that tailcalls into another prog
that arms the timer with one of its subprogs.
Egress prog can tailcall into the same prog as well.
The ingress and egress progs can be replaced one by one
or removed both together and middle prog can stay alive
if it's pinned in bpffs or held alive by FD.

> From another point of view: maps and programs are both first-class
> resources in eBPF, a timer is stored in a map and associated with a
> program, so it is naturally a first-class resource too.

Not really. The timer abstraction is about data. It invokes the callback.
That callback is a part of the program. The lifetime of the timer object
and lifetime of the callback can be different.
Obviously the timer logic needs to make sure that the callback text is alive
when the timer is armed.
Combining timer and callback concepts creates a messy abstraction.
In the normal kernel code one can have a timer in any kernel data
structure and callback in the kernel text or in the kernel module.
The code needs to make sure that the module won't go away while
the timer is armed. Same thing with bpf progs. The progs are safe
kernel modules. The timers are independent objects.

> >
> > > >
> > > > Also if your colleagues have something to share they should be
> > > > posting to the mailing list. Right now you're acting as a broken phone
> > > > passing info back and forth and the knowledge gets lost.
> > > > Please ask your colleagues to participate online.
> > >
> > > They are already in CC from the very beginning. And our use case is
> > > public, it is Cilium conntrack:
> > > https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack.h
> > >
> > > The entries of the code are:
> > > https://github.com/cilium/cilium/blob/master/bpf/bpf_lxc.c
> > >
> > > The maps for conntrack are:
> > > https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack_map.h
> >
> > If that's the only goal then kernel timers are not needed.
> > cilium conntrack works well as-is.
>
> We don't go back to why user-space cleanup is inefficient again,
> do we? ;)

I remain unconvinced that cilium conntrack _needs_ timer apis.
It works fine in production and I don't hear any complaints
from cilium users. So 'user space cleanup inefficiencies' is
very subjective and cannot be the reason to add timer apis.

> More importantly, although conntrack is our use case, we don't
> design timers just for our case, obviously. Timers must be as flexible
> to use as possible, to allow other future use cases.

Right. That's why I'm asking for an explanation of a specific use case.
"we want to do cilium conntrack but differently" is not a reason.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-04-27 18:33                         ` Alexei Starovoitov
@ 2021-05-09  5:37                           ` Cong Wang
  2021-05-10 20:55                             ` Jamal Hadi Salim
  2021-05-11  5:05                             ` Joe Stringer
  0 siblings, 2 replies; 79+ messages in thread
From: Cong Wang @ 2021-05-09  5:37 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela, Jamal Hadi Salim, Joe Stringer

On Tue, Apr 27, 2021 at 11:34 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Apr 27, 2021 at 9:36 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > If we enforce this ownership, in case of conntrack the owner would be
> > the program which sees the connection first, which is pretty much
> > unpredictable. For example, if the ingress program sees a connection
> > first, it installs a timer for this connection, but the traffic is
> > bidirectional,
> > hence egress program needs this connection and its timer too, we
> > should not remove this timer when the ingress program is freed.
>
> Sure. That's trivially achieved with pinning.

If users forget to do so, their ebpf program would crash the kernel,
right? But ebpf programs should never crash the kernel, right?

> One can have an ingress prog that tailcalls into another prog
> that arms the timer with one of its subprogs.
> Egress prog can tailcall into the same prog as well.
> The ingress and egress progs can be replaced one by one
> or removed both together and middle prog can stay alive
> if it's pinned in bpffs or held alive by FD.

This looks unnecessarily complex. Look at the overhead of using
a timer properly here:

1. pin timer callback program
2. a program to install timer
3. a program array contains the above program
4. a tail call into the above program array

Why not design a simpler solution?

>
> > From another point of view: maps and programs are both first-class
> > resources in eBPF, a timer is stored in a map and associated with a
> > program, so it is naturally a first-class resource too.
>
> Not really. The timer abstraction is about data. It invokes the callback.
> That callback is a part of the program. The lifetime of the timer object
> and lifetime of the callback can be different.
> Obviously the timer logic need to make sure that callback text is alive
> when the timer is armed.

Only if the callback could reference struct bpf_prog... And even if it
could, what if users forget to do so? The eBPF verifier has to reject
such cases.

> Combining timer and callback concepts creates a messy abstraction.
> In the normal kernel code one can have a timer in any kernel data
> structure and callback in the kernel text or in the kernel module.
> The code needs to make sure that the module won't go away while
> the timer is armed. Same thing with bpf progs. The progs are safe
> kernel modules. The timers are independent objects.

Kernel modules can take a reference count on themselves very
easily, plus there is no verifier for kernel modules. I don't understand
why you want to make eBPF programs as close to kernel modules as
possible in this case.

>
> > >
> > > > >
> > > > > Also if your colleagues have something to share they should be
> > > > > posting to the mailing list. Right now you're acting as a broken phone
> > > > > passing info back and forth and the knowledge gets lost.
> > > > > Please ask your colleagues to participate online.
> > > >
> > > > They are already in CC from the very beginning. And our use case is
> > > > public, it is Cilium conntrack:
> > > > https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack.h
> > > >
> > > > The entries of the code are:
> > > > https://github.com/cilium/cilium/blob/master/bpf/bpf_lxc.c
> > > >
> > > > The maps for conntrack are:
> > > > https://github.com/cilium/cilium/blob/master/bpf/lib/conntrack_map.h
> > >
> > > If that's the only goal then kernel timers are not needed.
> > > cilium conntrack works well as-is.
> >
> > We don't go back to why user-space cleanup is inefficient again,
> > do we? ;)
>
> I remain unconvinced that cilium conntrack _needs_ timer apis.
> It works fine in production and I don't hear any complaints
> from cilium users. So 'user space cleanup inefficiencies' is
> very subjective and cannot be the reason to add timer apis.

I am pretty sure I showed the original report to you when I sent
timeout hashmap patch, in case you forgot here it is again:
https://github.com/cilium/cilium/issues/5048

and let me quote the original report here:

"The current implementation (as of v1.2) for managing the contents of
the datapath connection tracking map leaves something to be desired:
Once per minute, the userspace cilium-agent makes a series of calls to
the bpf() syscall to fetch all of the entries in the map to determine
whether they should be deleted. For each entry in the map, 2-3 calls
must be made: One to fetch the next key, one to fetch the value, and
perhaps one to delete the entry. The maximum size of the map is 1
million entries, and if the current count approaches this size then
the garbage collection goroutine may spend a significant number of CPU
cycles iterating and deleting elements from the conntrack map."

(Adding Joe in Cc too.)

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-09  5:37                           ` Cong Wang
@ 2021-05-10 20:55                             ` Jamal Hadi Salim
  2021-05-11 21:29                               ` Cong Wang
  2021-05-11  5:05                             ` Joe Stringer
  1 sibling, 1 reply; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-05-10 20:55 UTC (permalink / raw)
  To: Cong Wang, Alexei Starovoitov
  Cc: Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela, Joe Stringer

On 2021-05-09 1:37 a.m., Cong Wang wrote:
> On Tue, Apr 27, 2021 at 11:34 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:


[..]
> I am pretty sure I showed the original report to you when I sent
> timeout hashmap patch, in case you forgot here it is again:
> https://github.com/cilium/cilium/issues/5048
> 
> and let me quote the original report here:
> 
> "The current implementation (as of v1.2) for managing the contents of
> the datapath connection tracking map leaves something to be desired:
> Once per minute, the userspace cilium-agent makes a series of calls to
> the bpf() syscall to fetch all of the entries in the map to determine
> whether they should be deleted. For each entry in the map, 2-3 calls
> must be made: One to fetch the next key, one to fetch the value, and
> perhaps one to delete the entry. The maximum size of the map is 1
> million entries, and if the current count approaches this size then
> the garbage collection goroutine may spend a significant number of CPU
> cycles iterating and deleting elements from the conntrack map."
> 

That cilium PR was a good read of the general issues.
Our use case involves anywhere between 4-16M cached entries.

Like I mentioned earlier:
we want, if some condition is met in the kernel on a map entry,
to periodically clean up, update, or send unsolicited
housekeeping events to user space.
Polling in order to achieve this for that many entries is expensive.

I would argue, again, timers generally are useful for a variety
of house keeping purposes and they are currently missing from ebpf.
Again, this despite Cong's use case.
Currently things in the ebpf datapath are triggered by either packets
showing up or from a control plane perspective by user space polling.
We need the timers for completion.

cheers,
jamal

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-09  5:37                           ` Cong Wang
  2021-05-10 20:55                             ` Jamal Hadi Salim
@ 2021-05-11  5:05                             ` Joe Stringer
  2021-05-11 21:08                               ` Cong Wang
  2021-05-12 22:43                               ` Jamal Hadi Salim
  1 sibling, 2 replies; 79+ messages in thread
From: Joe Stringer @ 2021-05-11  5:05 UTC (permalink / raw)
  To: Cong Wang
  Cc: Alexei Starovoitov, Linux Kernel Network Developers, bpf,
	Xiongchun Duan, Dongdong Wang, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, Pedro Tammela,
	Jamal Hadi Salim

Hi Cong,

On Sat, May 8, 2021 at 10:39 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Tue, Apr 27, 2021 at 11:34 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Apr 27, 2021 at 9:36 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > We don't go back to why user-space cleanup is inefficient again,
> > > do we? ;)
> >
> > I remain unconvinced that cilium conntrack _needs_ timer apis.
> > It works fine in production and I don't hear any complaints
> > from cilium users. So 'user space cleanup inefficiencies' is
> > very subjective and cannot be the reason to add timer apis.
>
> I am pretty sure I showed the original report to you when I sent
> timeout hashmap patch, in case you forgot here it is again:
> https://github.com/cilium/cilium/issues/5048
>
> and let me quote the original report here:
>
> "The current implementation (as of v1.2) for managing the contents of
> the datapath connection tracking map leaves something to be desired:
> Once per minute, the userspace cilium-agent makes a series of calls to
> the bpf() syscall to fetch all of the entries in the map to determine
> whether they should be deleted. For each entry in the map, 2-3 calls
> must be made: One to fetch the next key, one to fetch the value, and
> perhaps one to delete the entry. The maximum size of the map is 1
> million entries, and if the current count approaches this size then
> the garbage collection goroutine may spend a significant number of CPU
> cycles iterating and deleting elements from the conntrack map."

I'm also curious to hear more details as I haven't seen any recent
discussion in the common Cilium community channels (GitHub / Slack)
around deficiencies in the conntrack garbage collection since we
addressed the LRU issues upstream and switched back to LRU maps.
There's an update to the report quoted from the same link above:

"In recent releases, we've moved back to LRU for management of the CT
maps so the core problem is not as bad; furthermore we have
implemented a backoff for GC depending on the size and number of
entries in the conntrack table, so that in active environments the
userspace GC is frequent enough to prevent issues but in relatively
passive environments the userspace GC is only rarely run (to minimize
CPU impact)."

By "core problem is not as bad", I would have been referring to the
way that failing to garbage collect hashtables in a timely manner can
lead to rejecting new connections due to lack of available map space.
Switching back to LRU mitigated this concern. With a reduced frequency
of running the garbage collection logic, the CPU impact is lower as
well. I don't think we've explored batched map ops for this use case
yet either, which would already serve to improve the CPU usage
situation without extending the kernel.

The main outstanding issue I'm aware of is that we will often have a
1:1 mapping of entries in the CT map and the NAT map, and ideally we'd
like them to have tied fates but currently we have no mechanism to do
this with LRU. When LRU eviction occurs, the entries can get out of
sync until the next GC. I could imagine timers helping with this if we
were to switch back to hash maps since we could handle this problem in
custom eviction logic, but that would reintroduce the entry management
problem above. So then we'd still need more work to figure out how to
address that with a timers approach. If I were to guess right now, the
right solution for this particular problem is probably associating
programs with map entry lifecycle events (like LRU eviction) rather
than adding timers to trigger the logic we want, but that's a whole
different discussion.

Cheers,
Joe

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-11  5:05                             ` Joe Stringer
@ 2021-05-11 21:08                               ` Cong Wang
  2021-05-12 22:43                               ` Jamal Hadi Salim
  1 sibling, 0 replies; 79+ messages in thread
From: Cong Wang @ 2021-05-11 21:08 UTC (permalink / raw)
  To: Joe Stringer
  Cc: Alexei Starovoitov, Linux Kernel Network Developers, bpf,
	Xiongchun Duan, Dongdong Wang, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, Pedro Tammela,
	Jamal Hadi Salim

On Mon, May 10, 2021 at 10:06 PM Joe Stringer <joe@cilium.io> wrote:
>
> Hi Cong,
>
> On Sat, May 8, 2021 at 10:39 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Tue, Apr 27, 2021 at 11:34 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Apr 27, 2021 at 9:36 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > We don't go back to why user-space cleanup is inefficient again,
> > > > do we? ;)
> > >
> > > I remain unconvinced that cilium conntrack _needs_ timer apis.
> > > It works fine in production and I don't hear any complaints
> > > from cilium users. So 'user space cleanup inefficiencies' is
> > > very subjective and cannot be the reason to add timer apis.
> >
> > I am pretty sure I showed the original report to you when I sent
> > timeout hashmap patch, in case you forgot here it is again:
> > https://github.com/cilium/cilium/issues/5048
> >
> > and let me quote the original report here:
> >
> > "The current implementation (as of v1.2) for managing the contents of
> > the datapath connection tracking map leaves something to be desired:
> > Once per minute, the userspace cilium-agent makes a series of calls to
> > the bpf() syscall to fetch all of the entries in the map to determine
> > whether they should be deleted. For each entry in the map, 2-3 calls
> > must be made: One to fetch the next key, one to fetch the value, and
> > perhaps one to delete the entry. The maximum size of the map is 1
> > million entries, and if the current count approaches this size then
> > the garbage collection goroutine may spend a significant number of CPU
> > cycles iterating and deleting elements from the conntrack map."
>
> I'm also curious to hear more details as I haven't seen any recent
> discussion in the common Cilium community channels (GitHub / Slack)
> around deficiencies in the conntrack garbage collection since we
> addressed the LRU issues upstream and switched back to LRU maps.
> There's an update to the report quoted from the same link above:
>
> "In recent releases, we've moved back to LRU for management of the CT
> maps so the core problem is not as bad; furthermore we have
> implemented a backoff for GC depending on the size and number of
> entries in the conntrack table, so that in active environments the
> userspace GC is frequent enough to prevent issues but in relatively
> passive environments the userspace GC is only rarely run (to minimize
> CPU impact)."

Thanks for sharing the update. I am sure Jamal/Pedro measured LRU
and percpu LRU as well, hope they can share the numbers here.

>
> By "core problem is not as bad", I would have been referring to the
> way that failing to garbage collect hashtables in a timely manner can
> lead to rejecting new connections due to lack of available map space.
> Switching back to LRU mitigated this concern. With a reduced frequency

LRU eviction only kicks in when the space is full, which is already too
late. More importantly, with LRU, when an entry becomes "expired"
is nondeterministic, which contradicts the definition of conntrack,
which is time based.

> of running the garbage collection logic, the CPU impact is lower as
> well. I don't think we've explored batched map ops for this use case
> yet either, which would already serve to improve the CPU usage
> situation without extending the kernel.

Sure, if we could let GC run once every year, the amortized CPU
overhead would become literally zero. ;) I am sure this is not what
you really want to suggest.

>
> The main outstanding issue I'm aware of is that we will often have a
> 1:1 mapping of entries in the CT map and the NAT map, and ideally we'd
> like them to have tied fates but currently we have no mechanism to do
> this with LRU. When LRU eviction occurs, the entries can get out of
> sync until the next GC. I could imagine timers helping with this if we
> were to switch back to hash maps since we could handle this problem in
> custom eviction logic, but that would reintroduce the entry management
> problem above. So then we'd still need more work to figure out how to
> address that with a timers approach. If I were to guess right now, the
> right solution for this particular problem is probably associating
> programs with map entry lifecycle events (like LRU eviction) rather
> than adding timers to trigger the logic we want, but that's a whole
> different discussion.

I proposed a timeout hashmap before this ebpf timer, it is Alexei
who suggested abstracting it as a timer, which makes sense to me.
So, I am not sure what you are suggesting here, at least we are not
going back to timeout hashmap or anything similarly tied closely
with a map.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-10 20:55                             ` Jamal Hadi Salim
@ 2021-05-11 21:29                               ` Cong Wang
  2021-05-12 22:56                                 ` Jamal Hadi Salim
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-05-11 21:29 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Alexei Starovoitov, Linux Kernel Network Developers, bpf,
	Xiongchun Duan, Dongdong Wang, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, Pedro Tammela,
	Joe Stringer

On Mon, May 10, 2021 at 1:55 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On 2021-05-09 1:37 a.m., Cong Wang wrote:
> > On Tue, Apr 27, 2021 at 11:34 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
>
>
> [..]
> > I am pretty sure I showed the original report to you when I sent
> > timeout hashmap patch, in case you forgot here it is again:
> > https://github.com/cilium/cilium/issues/5048
> >
> > and let me quote the original report here:
> >
> > "The current implementation (as of v1.2) for managing the contents of
> > the datapath connection tracking map leaves something to be desired:
> > Once per minute, the userspace cilium-agent makes a series of calls to
> > the bpf() syscall to fetch all of the entries in the map to determine
> > whether they should be deleted. For each entry in the map, 2-3 calls
> > must be made: One to fetch the next key, one to fetch the value, and
> > perhaps one to delete the entry. The maximum size of the map is 1
> > million entries, and if the current count approaches this size then
> > the garbage collection goroutine may spend a significant number of CPU
> > cycles iterating and deleting elements from the conntrack map."
> >
>
> That cilium PR was a good read of the general issues.
> Our use case involves anywhere between 4-16M cached entries.
>
> Like i mentioned earlier:
> we want to periodically, if some condition is met in the
> kernel on a map entry, to cleanup, update or send unsolicited
> housekeeping events to user space.
> Polling in order to achieve this for that many entries is expensive.

Thanks for sharing your use case. As we discussed privately, please
also share the performance numbers you have.

I talked to my colleagues at Bytedance yesterday; we actually have
similar code which periodically collects map entry stats too. Currently
we use an iterator from user-space, which definitely has the same CPU
overhead.


>
> I would argue, again, timers generally are useful for a variety
> of house keeping purposes and they are currently missing from ebpf.
> Again, this despite Cong's use case.
> Currently things in the ebpf datapath are triggered by either packets
> showing up or from a control plane perspective by user space polling.
> We need the timers for completion.
>

Thanks!

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-11  5:05                             ` Joe Stringer
  2021-05-11 21:08                               ` Cong Wang
@ 2021-05-12 22:43                               ` Jamal Hadi Salim
  2021-05-13 18:45                                 ` Jamal Hadi Salim
  1 sibling, 1 reply; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-05-12 22:43 UTC (permalink / raw)
  To: Joe Stringer, Cong Wang
  Cc: Alexei Starovoitov, Linux Kernel Network Developers, bpf,
	Xiongchun Duan, Dongdong Wang, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, Pedro Tammela

On 2021-05-11 1:05 a.m., Joe Stringer wrote:
> Hi Cong,
> 

>> and let me quote the original report here:
>>
>> "The current implementation (as of v1.2) for managing the contents of
>> the datapath connection tracking map leaves something to be desired:
>> Once per minute, the userspace cilium-agent makes a series of calls to
>> the bpf() syscall to fetch all of the entries in the map to determine
>> whether they should be deleted. For each entry in the map, 2-3 calls
>> must be made: One to fetch the next key, one to fetch the value, and
>> perhaps one to delete the entry. The maximum size of the map is 1
>> million entries, and if the current count approaches this size then
>> the garbage collection goroutine may spend a significant number of CPU
>> cycles iterating and deleting elements from the conntrack map."
> 
> I'm also curious to hear more details as I haven't seen any recent
> discussion in the common Cilium community channels (GitHub / Slack)
> around deficiencies in the conntrack garbage collection since we
> addressed the LRU issues upstream and switched back to LRU maps.

For our use case we can't use LRU. We need to account for every entry, i.e.
we don't want it to be GC'd without our consent; we want to control
the GC. Your PR was pointing to LRU deleting some flow entries for TCP
which were just idling, for example.


> There's an update to the report quoted from the same link above:
> 
> "In recent releases, we've moved back to LRU for management of the CT
> maps so the core problem is not as bad; furthermore we have
> implemented a backoff for GC depending on the size and number of
> entries in the conntrack table, so that in active environments the
> userspace GC is frequent enough to prevent issues but in relatively
> passive environments the userspace GC is only rarely run (to minimize
> CPU impact)."
> 
> By "core problem is not as bad", I would have been referring to the
> way that failing to garbage collect hashtables in a timely manner can
> lead to rejecting new connections due to lack of available map space.
> Switching back to LRU mitigated this concern. With a reduced frequency
> of running the garbage collection logic, the CPU impact is lower as
> well. I don't think we've explored batched map ops for this use case
> yet either, which would already serve to improve the CPU usage
> situation without extending the kernel.
> 

Will run some tests tomorrow to see the effect of batching vs nobatch
and capture cost of syscalls and cpu.

Note: even then, it is not a good general solution. Our entries can
go as high as 16M.
Our workflow is: 1) every 1-5 seconds you dump, 2) process for
what needs to be deleted etc., then do updates (another 1-3 seconds
worth of time). There is a point, depending on the number of entries,
where your time cost of processing exceeds your polling period.
The likelihood of entry state loss is high for even 1/2 sec loss
of sync.

> The main outstanding issue I'm aware of is that we will often have a
> 1:1 mapping of entries in the CT map and the NAT map, and ideally we'd
> like them to have tied fates but currently we have no mechanism to do
> this with LRU. When LRU eviction occurs, the entries can get out of
> sync until the next GC.

Yes, this ties as well to our use case (not NAT for us, but semantically
similar challenge). It goes the other way too, if userspace decides
to adjust your NAT table you need to purge related entries from the
cache.



> I could imagine timers helping with this if we

Yes, timers would solve this.

I am not even arguing that we need timers to solve these issues. I am
just saying it seems timers are just fundamental infra that is needed
even outside the scope of this.

cheers,
jamal

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-11 21:29                               ` Cong Wang
@ 2021-05-12 22:56                                 ` Jamal Hadi Salim
  0 siblings, 0 replies; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-05-12 22:56 UTC (permalink / raw)
  To: Cong Wang
  Cc: Alexei Starovoitov, Linux Kernel Network Developers, bpf,
	Xiongchun Duan, Dongdong Wang, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, Pedro Tammela,
	Joe Stringer

On 2021-05-11 5:29 p.m., Cong Wang wrote:
> On Mon, May 10, 2021 at 1:55 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:

>>
>> That cilium PR was a good read of the general issues.
>> Our use case involves anywhere between 4-16M cached entries.
>>
>> Like i mentioned earlier:
>> we want to periodically, if some condition is met in the
>> kernel on a map entry, to cleanup, update or send unsolicited
>> housekeeping events to user space.
>> Polling in order to achieve this for that many entries is expensive.
> 
> Thanks for sharing your use case. As we discussed privately, please
> also share the performance numbers you have.
> 

The earlier tests i mentioned to you were in regards to LRU.
I can share those as well - but seems for what we are discussing
here testing cost of batch vs nobatch is more important.
Our LRU tests indicate that it is better to use global as opposed
to per-CPU LRU. We didn't dig deeper, but it seemed gc/alloc - which was
happening under some lock - gets very expensive when you are
sending a sufficient number of flows/sec (1M flows/sec in our
case).
We cannot use LRU (for reasons stated earlier). It has to be a hash
table with aging under our jurisdiction. I will post numbers for
sending the entries to user space for GC.

cheers,
jamal


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-12 22:43                               ` Jamal Hadi Salim
@ 2021-05-13 18:45                                 ` Jamal Hadi Salim
  2021-05-14  2:53                                   ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-05-13 18:45 UTC (permalink / raw)
  To: Joe Stringer, Cong Wang
  Cc: Alexei Starovoitov, Linux Kernel Network Developers, bpf,
	Xiongchun Duan, Dongdong Wang, Muchun Song, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, Pedro Tammela

On 2021-05-12 6:43 p.m., Jamal Hadi Salim wrote:

> 
> Will run some tests tomorrow to see the effect of batching vs nobatch
> and capture cost of syscalls and cpu.
> 

So here are some numbers:
Processor: Intel(R) Xeon(R) Gold 6230R CPU @ 2.10GHz
This machine is very similar to where a real deployment
would happen.

Hyperthreading turned off so we can dedicate the core to the
dumping process and Performance mode on, so no frequency scaling
meddling.
Tests were run about 3 times each. Results were eye-balled to make
sure deviation was reasonable.
100% of the one core was used just for dumping during each run.

bpftool does linear retrieval whereas our tool does batch dumping.
bpftool does print the dumped results, for our tool we just count
the number of entries retrieved (cost would have been higher if
we actually printed). In any case in the real setup there is
a processing cost which is much higher.

Summary is: the dumping is problematic costwise as the number of
entries increases. While batching does improve things, it doesn't
solve our problem (like I said, we have up to 16M entries and most
of the time we are dumping useless things).

1M entries
----------

root@SUT:/home/jhs/git-trees/ftables/src# time ./ftables show system 
cache dev enp179s0f1 > /dev/null
real    0m0.320s
user    0m0.004s
sys     0m0.316s

root@SUT:/home/jhs/git-trees/ftables/src# time 
/home/jhs/git-trees/foobar/XDP/bpftool map dump  id 3353 > /dev/null
real    0m5.419s
user    0m4.347s
sys     0m1.072s

4M entries
-----------
root@SUT:/home/jhs/git-trees/ftables/src# time ./ftables show system cache
  dev enp179s0f1 > /dev/null
real    0m1.331s
user    0m0.004s
sys     0m1.325s

root@SUT:/home/jhs/git-trees/ftables/src# time 
/home/jhs/git-trees/foobar/XDP/bpftool map dump id 1178 > /dev/null
real    0m21.677s
user    0m17.269s
sys     0m4.408s
8M Entries
------------

root@SUT:/home/jhs/git-trees/ftables/src# time ./ftables show system 
cache dev enp179s0f1 > /dev/null
real    0m2.678s
user    0m0.004s
sys     0m2.672s

t@SUT:/home/jhs/git-trees/ftables/src# time 
/home/jhs/git-trees/foobar/XDP/bpftool map dump id 2636 > /dev/null
real    0m43.267s
user    0m34.450s
sys     0m8.817s

16M entries
------------
root@SUT:/home/jhs/git-trees/ftables/src# time ./ftables show system cache
  dev enp179s0f1 > /dev/null
real    0m5.396s
user    0m0.004s
sys     0m5.389s

root@SUT:/home/jhs/git-trees/ftables/src# time 
/home/jhs/git-trees/foobar/XDP/bpftool map dump id 1919 > /dev/null
real    1m27.039s
user    1m8.371s
sys     0m18.668s



cheers,
jamal

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-13 18:45                                 ` Jamal Hadi Salim
@ 2021-05-14  2:53                                   ` Cong Wang
  2021-08-11 21:03                                     ` Joe Stringer
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-05-14  2:53 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Joe Stringer, Alexei Starovoitov,
	Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela

On Thu, May 13, 2021 at 11:46 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On 2021-05-12 6:43 p.m., Jamal Hadi Salim wrote:
>
> >
> > Will run some tests tomorrow to see the effect of batching vs nobatch
> > and capture cost of syscalls and cpu.
> >
>
> So here are some numbers:
> Processor: Intel(R) Xeon(R) Gold 6230R CPU @ 2.10GHz
> This machine is very similar to where a real deployment
> would happen.
>
> Hyperthreading turned off so we can dedicate the core to the
> dumping process and Performance mode on, so no frequency scaling
> meddling.
> Tests were ran about 3 times each. Results eye-balled to make
> sure deviation was reasonable.
> 100% of the one core was used just for dumping during each run.

I checked with Cilium users here at Bytedance, they actually observed
100% CPU usage too.

>
> bpftool does linear retrieval whereas our tool does batch dumping.
> bpftool does print the dumped results, for our tool we just count
> the number of entries retrieved (cost would have been higher if
> we actually printed). In any case in the real setup there is
> a processing cost which is much higher.
>
> Summary is: the dumping is problematic costwise as the number of
> entries increase. While batching does improve things it doesnt
> solve our problem (Like i said we have upto 16M entries and most
> of the time we are dumping useless things)

Thank you for sharing these numbers! Hopefully they could convince
people here to accept the bpf timer. I will include your use case and
performance number in my next update.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC Patch bpf-next] bpf: introduce bpf timer
  2021-05-14  2:53                                   ` Cong Wang
@ 2021-08-11 21:03                                     ` Joe Stringer
  0 siblings, 0 replies; 79+ messages in thread
From: Joe Stringer @ 2021-08-11 21:03 UTC (permalink / raw)
  To: Cong Wang
  Cc: Jamal Hadi Salim, Joe Stringer, Alexei Starovoitov,
	Linux Kernel Network Developers, bpf, Xiongchun Duan,
	Dongdong Wang, Muchun Song, Cong Wang, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, Pedro Tammela

Hi folks, apparently I never clicked 'send' on this email, but if you
wanted to continue the discussion I had some questions and thoughts.

This is also an interesting enough topic that it may be worth
considering to submit for the upcoming LPC Networking & BPF track
(submission deadline is this Friday August 13, Conference dates 20-24
September).

On Thu, May 13, 2021 at 7:53 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Thu, May 13, 2021 at 11:46 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> > On 2021-05-12 6:43 p.m., Jamal Hadi Salim wrote:
> >
> > >
> > > Will run some tests tomorrow to see the effect of batching vs nobatch
> > > and capture cost of syscalls and cpu.
> > >
> >
> > So here are some numbers:
> > Processor: Intel(R) Xeon(R) Gold 6230R CPU @ 2.10GHz
> > This machine is very similar to where a real deployment
> > would happen.
> >
> > Hyperthreading turned off so we can dedicate the core to the
> > dumping process and Performance mode on, so no frequency scaling
> > meddling.
> > Tests were ran about 3 times each. Results eye-balled to make
> > sure deviation was reasonable.
> > 100% of the one core was used just for dumping during each run.
>
> I checked with Cilium users here at Bytedance, they actually observed
> 100% CPU usage too.

Thanks for the feedback. Can you provide further details? For instance,

* Which version of Cilium?
* How long do you observe this 100% CPU usage?
* What size CT map is in use?
* How frequently do you intend for CT GC to run? (Do you use the
default settings or are they mismatched with your requirements for
some reason? If so can we learn more about the requirements/why?)
* Do you have a threshold in mind that would be sufficient?

If necessary we can take these discussions off-list if the details are
sensitive but I'd prefer to continue the discussion here to have some
public examples we can discuss & use to motivate future discussions.
We can alternatively move the discussion to a Cilium GitHub issue if
the tradeoffs are more about the userspace implementation rather than
the kernel specifics, though I suspect some of the folks here would
also like to follow along so I don't want to exclude the list from the
discussion.

FWIW I'm not inherently against a timer, in fact I've wondered for a
while what kind of interesting things we could build with such
support. At the same time, connection tracking entry management is a
nuanced topic and it's easy to fix an issue in one area only to
introduce a problem in another area.

> >
> > bpftool does linear retrieval whereas our tool does batch dumping.
> > bpftool does print the dumped results, for our tool we just count
> > the number of entries retrieved (cost would have been higher if
> > we actually printed). In any case in the real setup there is
> > a processing cost which is much higher.
> >
> > Summary is: the dumping is problematic costwise as the number of
> > entries increase. While batching does improve things it doesnt
> > solve our problem (Like i said we have upto 16M entries and most
> > of the time we are dumping useless things)
>
> Thank you for sharing these numbers! Hopefully they could convince
> people here to accept the bpf timer. I will include your use case and
> performance number in my next update.

Yes, Thanks Jamal for the numbers. It's very interesting, clearly
batch dumping is far more efficient and we should enhance bpftool to
take advantage of it where applicable.

> Like i said we have upto 16M entries and most
> of the time we are dumping useless things)

I'm curious if there's a more intelligent way to figure out this
'dumping useless things' aspect? I can see how timers would eliminate
the cycles spent on the syscall aspect of this entirely (in favor of
the timer handling logic which I'd guess is cheaper), but at some
point if you're running certain logic on every entry in a map then of
course it will scale linearly.

The use case is different for the CT problem we discussed above, but
if I look at the same question for the CT case, this is why I find LRU
useful - rather than firing off a number of timers linear on the size
of the map, the eviction logic is limited to the map insert rate,
which itself can be governed and ratelimited by logic running in eBPF.
The scan of the map then becomes less critical, so it can be run less
frequently and alleviate the CPU usage question that way.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-06-02 18:13                   ` Kumar Kartikeya Dwivedi
  2021-06-02 18:26                     ` Alexei Starovoitov
@ 2021-06-02 18:46                     ` John Fastabend
  1 sibling, 0 replies; 79+ messages in thread
From: John Fastabend @ 2021-06-02 18:46 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, Martin KaFai Lau
  Cc: Toke Høiland-Jørgensen, Alexei Starovoitov, Cong Wang,
	David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

Kumar Kartikeya Dwivedi wrote:
> On Wed, Jun 02, 2021 at 11:24:36PM IST, Martin KaFai Lau wrote:
> > On Wed, Jun 02, 2021 at 10:48:02AM +0200, Toke Høiland-Jørgensen wrote:
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> > >
> > > >> > In general the garbage collection in any form doesn't scale.
> > > >> > The conntrack logic doesn't need it. The cillium conntrack is a great
> > > >> > example of how to implement a conntrack without GC.
> > > >>
> > > >> That is simply not a conntrack. We expire connections based on
> > > >> its time, not based on the size of the map where it residents.
> > > >
> > > > Sounds like your goal is to replicate existing kernel conntrack
> > > > as bpf program by doing exactly the same algorithm and repeating
> > > > the same mistakes. Then add kernel conntrack functions to allow list
> > > > of kfuncs (unstable helpers) and call them from your bpf progs.
> > >
> > > FYI, we're working on exactly this (exposing kernel conntrack to BPF).
> > > Hoping to have something to show for our efforts before too long, but
> > > it's still in a bit of an early stage...
> > Just curious, what conntrack functions will be made callable to BPF?
> 
> Initially we're planning to expose the equivalent of nf_conntrack_in and
> nf_conntrack_confirm to XDP and TC programs (so XDP one works without an skb,
> and TC one works with an skb), to map these to higher level lookup/insert.
> 
> --
> Kartikeya

I think this is a missed opportunity. I can't see any advantage to
tying a XDP datapath into nft. For local connections use a socket lookup
no need for tables at all. For middle boxes you need some tables, but
again really don't see why you want nft here. An entirely XDP based
connection tracker is going to be faster, easier to debug, and
more easy to tune to do what you want as your use cases changes.

Other than architecture disagreements, the implementation of this
gets ugly. You will need to export a set of nft hooks, teach nft
about xdp_buffs and then on every packet poke nft. Just looking
at nf_conntrack_in() tells me you likely need some serious surgery
there to make this work and now you've forked a bunch of code that
could be done generically in BPF into some C hard coded stuff you
will have to maintain. Or you do an ugly hack to convert xdp into
skb on every packet, but I'll NAK that because its really defeats
the point of XDP. Maybe TC side is easier because you have skb,
but then you miss the real win in XDP side. Sorry I don't see any
upsides here and just more work to review, maintain code that is
dubious to start with.

Anyways original timers code above LGTM.

.John

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-06-02 18:26                     ` Alexei Starovoitov
@ 2021-06-02 18:30                       ` Kumar Kartikeya Dwivedi
  0 siblings, 0 replies; 79+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2021-06-02 18:30 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Martin KaFai Lau, Toke Høiland-Jørgensen, Cong Wang,
	David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Wed, Jun 02, 2021 at 11:56:40PM IST, Alexei Starovoitov wrote:
> On Wed, Jun 2, 2021 at 11:14 AM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > On Wed, Jun 02, 2021 at 11:24:36PM IST, Martin KaFai Lau wrote:
> > > On Wed, Jun 02, 2021 at 10:48:02AM +0200, Toke Høiland-Jørgensen wrote:
> > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> > > >
> > > > >> > In general the garbage collection in any form doesn't scale.
> > > > >> > The conntrack logic doesn't need it. The cillium conntrack is a great
> > > > >> > example of how to implement a conntrack without GC.
> > > > >>
> > > > >> That is simply not a conntrack. We expire connections based on
> > > > >> its time, not based on the size of the map where it residents.
> > > > >
> > > > > Sounds like your goal is to replicate existing kernel conntrack
> > > > > as bpf program by doing exactly the same algorithm and repeating
> > > > > the same mistakes. Then add kernel conntrack functions to allow list
> > > > > of kfuncs (unstable helpers) and call them from your bpf progs.
> > > >
> > > > FYI, we're working on exactly this (exposing kernel conntrack to BPF).
> > > > Hoping to have something to show for our efforts before too long, but
> > > > it's still in a bit of an early stage...
> > > Just curious, what conntrack functions will be made callable to BPF?
> >
> > Initially we're planning to expose the equivalent of nf_conntrack_in and
> > nf_conntrack_confirm to XDP and TC programs (so XDP one works without an skb,
> > and TC one works with an skb), to map these to higher level lookup/insert.
>
> To make sure we're on the same page...
> I still strongly prefer to avoid exposing conntrack via stable helpers.
> Pls use kfunc and unstable interface.

Correct, that is the idea.

--
Kartikeya

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-06-02 18:13                   ` Kumar Kartikeya Dwivedi
@ 2021-06-02 18:26                     ` Alexei Starovoitov
  2021-06-02 18:30                       ` Kumar Kartikeya Dwivedi
  2021-06-02 18:46                     ` John Fastabend
  1 sibling, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-06-02 18:26 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Martin KaFai Lau, Toke Høiland-Jørgensen, Cong Wang,
	David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Wed, Jun 2, 2021 at 11:14 AM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Wed, Jun 02, 2021 at 11:24:36PM IST, Martin KaFai Lau wrote:
> > On Wed, Jun 02, 2021 at 10:48:02AM +0200, Toke Høiland-Jørgensen wrote:
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> > >
> > > >> > In general the garbage collection in any form doesn't scale.
> > > >> > The conntrack logic doesn't need it. The cillium conntrack is a great
> > > >> > example of how to implement a conntrack without GC.
> > > >>
> > > >> That is simply not a conntrack. We expire connections based on
> > > >> its time, not based on the size of the map where it residents.
> > > >
> > > > Sounds like your goal is to replicate existing kernel conntrack
> > > > as bpf program by doing exactly the same algorithm and repeating
> > > > the same mistakes. Then add kernel conntrack functions to allow list
> > > > of kfuncs (unstable helpers) and call them from your bpf progs.
> > >
> > > FYI, we're working on exactly this (exposing kernel conntrack to BPF).
> > > Hoping to have something to show for our efforts before too long, but
> > > it's still in a bit of an early stage...
> > Just curious, what conntrack functions will be made callable to BPF?
>
> Initially we're planning to expose the equivalent of nf_conntrack_in and
> nf_conntrack_confirm to XDP and TC programs (so XDP one works without an skb,
> and TC one works with an skb), to map these to higher level lookup/insert.

To make sure we're on the same page...
I still strongly prefer to avoid exposing conntrack via stable helpers.
Pls use kfunc and unstable interface.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-06-02 17:54                 ` Martin KaFai Lau
@ 2021-06-02 18:13                   ` Kumar Kartikeya Dwivedi
  2021-06-02 18:26                     ` Alexei Starovoitov
  2021-06-02 18:46                     ` John Fastabend
  0 siblings, 2 replies; 79+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2021-06-02 18:13 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Toke Høiland-Jørgensen, Alexei Starovoitov, Cong Wang,
	David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Wed, Jun 02, 2021 at 11:24:36PM IST, Martin KaFai Lau wrote:
> On Wed, Jun 02, 2021 at 10:48:02AM +0200, Toke Høiland-Jørgensen wrote:
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >
> > >> > In general the garbage collection in any form doesn't scale.
> > >> > The conntrack logic doesn't need it. The cillium conntrack is a great
> > >> > example of how to implement a conntrack without GC.
> > >>
> > >> That is simply not a conntrack. We expire connections based on
> > >> its time, not based on the size of the map where it residents.
> > >
> > > Sounds like your goal is to replicate existing kernel conntrack
> > > as bpf program by doing exactly the same algorithm and repeating
> > > the same mistakes. Then add kernel conntrack functions to allow list
> > > of kfuncs (unstable helpers) and call them from your bpf progs.
> >
> > FYI, we're working on exactly this (exposing kernel conntrack to BPF).
> > Hoping to have something to show for our efforts before too long, but
> > it's still in a bit of an early stage...
> Just curious, what conntrack functions will be made callable to BPF?

Initially we're planning to expose the equivalent of nf_conntrack_in and
nf_conntrack_confirm to XDP and TC programs (so XDP one works without an skb,
and TC one works with an skb), to map these to higher level lookup/insert.

--
Kartikeya

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-06-02  8:48               ` Toke Høiland-Jørgensen
@ 2021-06-02 17:54                 ` Martin KaFai Lau
  2021-06-02 18:13                   ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 79+ messages in thread
From: Martin KaFai Lau @ 2021-06-02 17:54 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Alexei Starovoitov, Cong Wang, David Miller, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Lorenz Bauer,
	Linux Kernel Network Developers, bpf, kernel-team

On Wed, Jun 02, 2021 at 10:48:02AM +0200, Toke Høiland-Jørgensen wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> >> > In general the garbage collection in any form doesn't scale.
> >> > The conntrack logic doesn't need it. The cillium conntrack is a great
> >> > example of how to implement a conntrack without GC.
> >> 
> >> That is simply not a conntrack. We expire connections based on
> >> its time, not based on the size of the map where it residents.
> >
> > Sounds like your goal is to replicate existing kernel conntrack
> > as bpf program by doing exactly the same algorithm and repeating
> > the same mistakes. Then add kernel conntrack functions to allow list
> > of kfuncs (unstable helpers) and call them from your bpf progs.
> 
> FYI, we're working on exactly this (exposing kernel conntrack to BPF).
> Hoping to have something to show for our efforts before too long, but
> it's still in a bit of an early stage...
Just curious, what conntrack functions will be made callable to BPF?

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-06-02  2:00             ` Alexei Starovoitov
@ 2021-06-02  8:48               ` Toke Høiland-Jørgensen
  2021-06-02 17:54                 ` Martin KaFai Lau
  0 siblings, 1 reply; 79+ messages in thread
From: Toke Høiland-Jørgensen @ 2021-06-02  8:48 UTC (permalink / raw)
  To: Alexei Starovoitov, Cong Wang
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

>> > In general the garbage collection in any form doesn't scale.
> >> > The conntrack logic doesn't need it. The Cilium conntrack is a great
>> > example of how to implement a conntrack without GC.
>> 
>> That is simply not a conntrack. We expire connections based on
>> its time, not based on the size of the map where it resides.
>
> Sounds like your goal is to replicate existing kernel conntrack
> as bpf program by doing exactly the same algorithm and repeating
> the same mistakes. Then add kernel conntrack functions to allow list
> of kfuncs (unstable helpers) and call them from your bpf progs.

FYI, we're working on exactly this (exposing kernel conntrack to BPF).
Hoping to have something to show for our efforts before too long, but
it's still in a bit of an early stage...

-Toke


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-30  6:36           ` Cong Wang
@ 2021-06-02  2:00             ` Alexei Starovoitov
  2021-06-02  8:48               ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-06-02  2:00 UTC (permalink / raw)
  To: Cong Wang
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Sat, May 29, 2021 at 11:36:08PM -0700, Cong Wang wrote:
> On Tue, May 25, 2021 at 11:21 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, May 24, 2021 at 9:59 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Mon, May 24, 2021 at 8:16 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > On Sun, May 23, 2021 at 9:01 AM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Fri, May 21, 2021 at 2:37 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > >
> > > > > > Hi, Alexei
> > > > > >
> > > > > > On Thu, May 20, 2021 at 11:52 PM Alexei Starovoitov
> > > > > > <alexei.starovoitov@gmail.com> wrote:
> > > > > > >
> > > > > > > Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> > > > > > > and helpers to operate on it:
> > > > > > > long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> > > > > > > long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > > > > > > long bpf_timer_del(struct bpf_timer *timer)
> > > > > >
> > > > > > Like we discussed, this approach would make the timer harder
> > > > > > to be independent of other eBPF programs, which is a must-have
> > > > > > for both of our use cases (mine and Jamal's). Like you explained,
> > > > > > this requires at least another program array, a tail call, a mandatory
> > > > > > prog pinning to work.
> > > > >
> > > > > That is simply not true.
> > > >
> > > > Which part is not true? The above is what I got from your explanation.
> > >
> > > I tried to write some code sketches to use your timer to implement
> > > our conntrack logic, below shows how difficult it is to use,
> >
> > Was it difficult because you've used tail_call and over complicated
> > the progs for no good reason?
> 
> Using tail call is what I got from you, here is the quote:
> 
> "Sure. That's trivially achieved with pinning.
> One can have an ingress prog that tailcalls into another prog
> that arms the timer with one of its subprogs.
> Egress prog can tailcall into the same prog as well.
> The ingress and egress progs can be replaced one by one
> or removed both together and middle prog can stay alive
> if it's pinned in bpffs or held alive by FD."

That was in the context of doing auto-cancel of timers.
There is only one choice to make. Either auto-cancel or not.
That quote was during the time when auto-cancel felt as it would fit
the FD model better.
We auto-detach on close(link_fd) and auto-unload on close(prog_fd).
The armed timer would prevent that and that promise felt
necessary to keep. But disappearing timer is a bigger surprise
to users than not auto-unloading progs.
Hence this patch is doing prog_refcnt++ in bpf_timer_start.
Please see other emails threads in v1 patch set.

> >
> > tail_calls are unnecessary. Just call the funcs directly.
> > All lookups and maps are unnecessary as well.
> > Looks like a single global timer will be enough for this use case.
> 
> Hmm? With your design, a timer has to be embedded into a map
> value, you said this is to mimic bpf spinlock.

The global data is a map.
When spin_lock was introduced there was no global data concept.

> >
> > In general the garbage collection in any form doesn't scale.
> > The conntrack logic doesn't need it. The Cilium conntrack is a great
> > example of how to implement a conntrack without GC.
> 
> That is simply not a conntrack. We expire connections based on
> its time, not based on the size of the map where it resides.

Sounds like your goal is to replicate existing kernel conntrack
as bpf program by doing exactly the same algorithm and repeating
the same mistakes. Then add kernel conntrack functions to allow list
of kfuncs (unstable helpers) and call them from your bpf progs.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25 18:21         ` Alexei Starovoitov
  2021-05-25 19:35           ` Jamal Hadi Salim
@ 2021-05-30  6:36           ` Cong Wang
  2021-06-02  2:00             ` Alexei Starovoitov
  1 sibling, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-05-30  6:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Tue, May 25, 2021 at 11:21 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, May 24, 2021 at 9:59 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Mon, May 24, 2021 at 8:16 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Sun, May 23, 2021 at 9:01 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Fri, May 21, 2021 at 2:37 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >
> > > > > Hi, Alexei
> > > > >
> > > > > On Thu, May 20, 2021 at 11:52 PM Alexei Starovoitov
> > > > > <alexei.starovoitov@gmail.com> wrote:
> > > > > >
> > > > > > Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> > > > > > and helpers to operate on it:
> > > > > > long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> > > > > > long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > > > > > long bpf_timer_del(struct bpf_timer *timer)
> > > > >
> > > > > Like we discussed, this approach would make the timer harder
> > > > > to be independent of other eBPF programs, which is a must-have
> > > > > for both of our use cases (mine and Jamal's). Like you explained,
> > > > > this requires at least another program array, a tail call, a mandatory
> > > > > prog pinning to work.
> > > >
> > > > That is simply not true.
> > >
> > > Which part is not true? The above is what I got from your explanation.
> >
> > I tried to write some code sketches to use your timer to implement
> > our conntrack logic, below shows how difficult it is to use,
>
> Was it difficult because you've used tail_call and over complicated
> the progs for no good reason?

Using tail call is what I got from you, here is the quote:

"Sure. That's trivially achieved with pinning.
One can have an ingress prog that tailcalls into another prog
that arms the timer with one of its subprogs.
Egress prog can tailcall into the same prog as well.
The ingress and egress progs can be replaced one by one
or removed both together and middle prog can stay alive
if it's pinned in bpffs or held alive by FD."

Here is the link:
https://lore.kernel.org/bpf/CAADnVQK9BgguVorziWgpMktLHuPCgEaKa4fz-KCfhcZtT46teQ@mail.gmail.com/


>
> > SEC("ingress")
> > void ingress(struct __sk_buff *skb)
> > {
> >         struct tuple tuple;
> >         // extract tuple from skb
> >
> >         if (bpf_map_lookup_elem(&timers, &key) == NULL)
> >                 bpf_tail_call(NULL, &jmp_table, 0);
> >                 // here is not reachable unless failure
> >         val = bpf_map_lookup_elem(&conntrack, &tuple);
> >         if (val && val->expires < now) {
> >                 bpf_tail_call(NULL, &jmp_table, 1);
> >                 // here is not reachable unless failure
> >         }
> > }
> >
> > SEC("egress")
> > void egress(struct __sk_buff *skb)
> > {
> >         struct tuple tuple;
> >         // extract tuple from skb
> >
> >         if (bpf_map_lookup_elem(&timers, &key) == NULL)
> >                 bpf_tail_call(NULL, &jmp_table, 0);
> >                 // here is not reachable unless failure
> >         val = bpf_map_lookup_elem(&conntrack, &tuple);
> >         if (val && val->expires < now) {
> >                 bpf_tail_call(NULL, &jmp_table, 1);
> >                 // here is not reachable unless failure
>
> tail_calls are unnecessary. Just call the funcs directly.
> All lookups and maps are unnecessary as well.
> Looks like a single global timer will be enough for this use case.

Hmm? With your design, a timer has to be embedded into a map
value, you said this is to mimic bpf spinlock.

>
> In general the garbage collection in any form doesn't scale.
> > The conntrack logic doesn't need it. The Cilium conntrack is a great
> example of how to implement a conntrack without GC.

That is simply not a conntrack. We expire connections based on
its time, not based on the size of the map where it resides.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-26 16:58                     ` Alexei Starovoitov
@ 2021-05-26 18:25                       ` Jamal Hadi Salim
  0 siblings, 0 replies; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-05-26 18:25 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Cong Wang, David Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Linux Kernel Network Developers,
	bpf, kernel-team, Pedro Tammela

On 2021-05-26 12:58 p.m., Alexei Starovoitov wrote:
> On Wed, May 26, 2021 at 11:34:04AM -0400, Jamal Hadi Salim wrote:
>> On 2021-05-25 6:08 p.m., Alexei Starovoitov wrote:
>>> On Tue, May 25, 2021 at 2:09 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:


>>
>> Didnt follow why this wouldnt work in the same way for Array?
> 
> array doesn't have delete.

Ok. But even for arrays if userspace for example does update
of an existing entry we should be able to invoke callback, no?

>> One interesting concept i see come out of this is emulating
>> netlink-like event generation towards user space i.e a user
>> space app listening to changes to a map.
> 
> Folks do it already via ringbuf events. No need for update/delete
> callback to implement such notifications.
> 

Please bear with me:
I know it is trivial to do if you are in control of the kernel
side if your prog creates/updates/deletes map entries. Ive done
it many times with perf event arrays (before ringbuf existed).
But:
What i was referring to is if another entity altogether
(possibly not under your control) was to make that change
from the kernel side then you dont get to know. Same with a
user space program doing a write to the map entry.

If you say this can be done then please do me a kindness and point
me to someone already doing this or some sample code.


>> would like to hear what the proposed ideas are.
>> I see this as a tricky problem to solve - you can make LRU
>> programmable to allow the variety of LRU replacement algos out
>> there but not all encompassing for custom or other types of algos.
>> The problem remains that LRU is very specific to evicting
>> entries that are least used. I can imagine that if i wanted to
>> do a LIFO aging for example then it can be done with some acrobatics
>> as an overlay on top of LRU with all sorts of tweaking.
>> It is sort of fitting a square peg into a round hole - you can do
>> it, but why the torture when you have a flexible architecture.
> 
> Using GC to solve 'hash table is running out of memory' problem is
> exactly the square peg.
> Timers is absolutely wrong way to address memory pressure.
> 
>> We need to provide the mechanisms (I dont see a disagreement on
>> need for timers at least).
> 
> It's an explicit non-goal for timer api to be used as GC for conntrack.

Agreed.

> You'll be able to use it as such, but when it fails to scale
> (as it's going to happen with any timer implementation) don't blame
> infrastructure for that.

Agreed again. Timers are a necessary part of the toolset.
I hope I wasn't read as claiming that just firing random
timers equates to GC or that it on its own will scale.


>> A reasonable approach is to let the policy be defined
>> from user space. I may want the timer to keep polling
>> a map that is not being updated until the next program
>> restarts and starts updating it.
>> I thought Cong's approach with timerids/maps was a good
>> way to achieve control.
> 
> No, it's not a policy, and no, it doesn't belong to user space,
> and no, Cong's approach has nothing to do with this design choice.

You listed 3 possibilities of what could happen in the use case
i described. One person's meat is another person's poison.
i.e it is about design choice. What i meant by policy is
whether intentionally or not, Cong's approach had the user able to
control what happens to the timer.

cheers,
jamal

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-26 15:34                   ` Jamal Hadi Salim
@ 2021-05-26 16:58                     ` Alexei Starovoitov
  2021-05-26 18:25                       ` Jamal Hadi Salim
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-26 16:58 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Cong Wang, David Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Linux Kernel Network Developers,
	bpf, kernel-team, Pedro Tammela

On Wed, May 26, 2021 at 11:34:04AM -0400, Jamal Hadi Salim wrote:
> On 2021-05-25 6:08 p.m., Alexei Starovoitov wrote:
> > On Tue, May 25, 2021 at 2:09 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > 
> 
> > > This is certainly a useful feature (for other reasons as well).
> > > Does this include create/update/delete issued from user space?
> > 
> > Right. Any kind of update/delete and create is a subset of update.
> > The lookup is not included (yet or may be ever) since it doesn't
> > have deterministic start/end points.
> > The prog can do a lookup and update values in place while
> > holding on the element until prog execution ends.
> > 
> > While update/delete have precise points in hash/lru/lpm maps.
> > Array is a different story.
> > 
> 
> Didnt follow why this wouldnt work in the same way for Array?

array doesn't have delete.

> One interesting concept i see come out of this is emulating
> netlink-like event generation towards user space i.e a user
> space app listening to changes to a map.

Folks do it already via ringbuf events. No need for update/delete
callback to implement such notifications.

> > > 
> > > The challenge we have in this case is LRU makes the decision
> > > which entry to victimize. We do have some entries we want to
> > > keep longer - even if they are not seeing a lot of activity.
> > 
> > Right. That's certainly an argument to make LRU eviction
> > logic programmable.
> > John/Joe/Daniel proposed it as a concept long ago.
> > Design ideas are in demand to make further progress here :)
> > 
> 
> would like to hear what the proposed ideas are.
> I see this as a tricky problem to solve - you can make LRU
> programmable to allow the variety of LRU replacement algos out
> there but not all encompassing for custom or other types of algos.
> The problem remains that LRU is very specific to evicting
> entries that are least used. I can imagine that if i wanted to
> do a LIFO aging for example then it can be done with some acrobatics
> as an overlay on top of LRU with all sorts of tweaking.
> It is sort of fitting a square peg into a round hole - you can do
> it, but why the torture when you have a flexible architecture.

Using GC to solve 'hash table is running out of memory' problem is
exactly the square peg.
Timers is absolutely wrong way to address memory pressure.

> We need to provide the mechanisms (I dont see a disagreement on
> need for timers at least).

It's an explicit non-goal for timer api to be used as GC for conntrack.
You'll be able to use it as such, but when it fails to scale
(as it's going to happen with any timer implementation) don't blame
infrastructure for that.

> > > 
> > > What happens when both ingress and egress are ejected?
> > 
> > What is 'ejected'? Like a CD? ;)
> 
> I was going to use other verbs to describe this; but
> may have sounded obscene ;->

Please use standard terminology. The topic is difficult enough
to understand without inventing new words.

> > The kernel can choose to do different things with the timer here.
> > One option is to cancel the outstanding timers and unload
> > .text where the timer callback lives
> >
> > Another option is to let the timer stay armed and auto unload
> > .text of bpf function when it finishes executing.
> >
> > If timer callback decides to re-arm itself it can continue
> > executing indefinitely.
> > This patch is doing the latter.
> > There could be a combination of both options.
> > All options have their pros/cons.
> 
> A reasonable approach is to let the policy be defined
> from user space. I may want the timer to keep polling
> a map that is not being updated until the next program
> restarts and starts updating it.
> I thought Cong's approach with timerids/maps was a good
> way to achieve control.

No, it's not a policy, and no, it doesn't belong to user space,
and no, Cong's approach has nothing to do with this design choice.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25 22:08                 ` Alexei Starovoitov
@ 2021-05-26 15:34                   ` Jamal Hadi Salim
  2021-05-26 16:58                     ` Alexei Starovoitov
  0 siblings, 1 reply; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-05-26 15:34 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Cong Wang, David Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Linux Kernel Network Developers,
	bpf, kernel-team, Pedro Tammela

On 2021-05-25 6:08 p.m., Alexei Starovoitov wrote:
> On Tue, May 25, 2021 at 2:09 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>>

>> This is certainly a useful feature (for other reasons as well).
>> Does this include create/update/delete issued from user space?
> 
> Right. Any kind of update/delete and create is a subset of update.
> The lookup is not included (yet or may be ever) since it doesn't
> have deterministic start/end points.
> The prog can do a lookup and update values in place while
> holding on the element until prog execution ends.
> 
> While update/delete have precise points in hash/lru/lpm maps.
> Array is a different story.
> 

Didnt follow why this wouldnt work in the same way for Array?

One interesting concept i see come out of this is emulating
netlink-like event generation towards user space i.e a user
space app listening to changes to a map.

>>
>> The challenge we have in this case is LRU makes the decision
>> which entry to victimize. We do have some entries we want to
>> keep longer - even if they are not seeing a lot of activity.
> 
> Right. That's certainly an argument to make LRU eviction
> logic programmable.
> John/Joe/Daniel proposed it as a concept long ago.
> Design ideas are in demand to make further progress here :)
> 

would like to hear what the proposed ideas are.
I see this as a tricky problem to solve - you can make LRU
programmable to allow the variety of LRU replacement algos out
there but not all encompassing for custom or other types of algos.
The problem remains that LRU is very specific to evicting
entries that are least used. I can imagine that if i wanted to
do a LIFO aging for example then it can be done with some acrobatics
as an overlay on top of LRU with all sorts of tweaking.
It is sort of fitting a square peg into a round hole - you can do
it, but why the torture when you have a flexible architecture.

We need to provide the mechanisms (I dont see a disagreement on
need for timers at least).

>> You could just notify user space to re-add the entry but then
>> you have sync challenges.
>> The timers do provide us a way to implement custom GC.
> 
> My point is that time is always going to be a heuristic that will
> break under certain traffic conditions.
> I recommend to focus development effort on creating
> building blocks that are truly great instead of reimplementing
> old ideas in bpf with all of their shortcomings.
> 

There are some basic mechanisms i dont think that we can avoid.
Agreed on the general sentiment of what you are saying.

>> So a question (which may have already been discussed),
>> assuming the following setup:
>> - 2 programs a) Ingress b) egress
>> - sharing a conntrack map, and said map is pinned.
>> - a timer prog (with a map with just timers;
>>      even a single timer would be enough in some cases).
>>
>> ingress and egress do std stuff like create/update
>> timer prog does the deletes. For simplicity sake assume
>> we just have one timer that does a foreach and iterates
>> all entries.
>>
>> What happens when both ingress and egress are ejected?
> 
> What is 'ejected'? Like a CD? ;)

I was going to use other verbs to describe this; but
may have sounded obscene ;->

> I think you mean 'detached' ?

Yes.

> and then, I assume, the user space doesn't hold to prog FD?

Right. The pinning may still exist on the maps (therefore a ref
count). Note, this may be design intent.

> The kernel can choose to do different things with the timer here.
> One option is to cancel the outstanding timers and unload
> .text where the timer callback lives
 >
> Another option is to let the timer stay armed and auto unload
> .text of bpf function when it finishes executing.
 >
> If timer callback decides to re-arm itself it can continue
> executing indefinitely.
> This patch is doing the latter.
> There could be a combination of both options.
> All options have their pros/cons.

A reasonable approach is to let the policy be defined
from user space. I may want the timer to keep polling
a map that is not being updated until the next program
restarts and starts updating it.
I thought Cong's approach with timerids/maps was a good
way to achieve control.

cheers,
jamal


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25 21:09               ` Jamal Hadi Salim
@ 2021-05-25 22:08                 ` Alexei Starovoitov
  2021-05-26 15:34                   ` Jamal Hadi Salim
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-25 22:08 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Cong Wang, David Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Linux Kernel Network Developers,
	bpf, kernel-team, Pedro Tammela

On Tue, May 25, 2021 at 2:09 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On 2021-05-25 3:57 p.m., Alexei Starovoitov wrote:
> > On Tue, May 25, 2021 at 12:35 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> [..]
> > The outcome of the last bpf office hours was a general agreement
> > that we need new hooks in map update/delete operations
> > (including auto-delete by LRU) that will trigger a bpf subprog.
>
> This is certainly a useful feature (for other reasons as well).
> Does this include create/update/delete issued from user space?

Right. Any kind of update/delete and create is a subset of update.
The lookup is not included (yet or may be ever) since it doesn't
have deterministic start/end points.
The prog can do a lookup and update values in place while
holding on the element until prog execution ends.

While update/delete have precise points in hash/lru/lpm maps.
Array is a different story.

> > It might look very similar to the timer callback that is part of this patch,
> > but instead of being called by the timer the LRU logic will call it.
> > This way the subprog can transfer the data stored in the
> > about-to-be-deleted map element into some other map or pass
> > to user space via ringbuf or do any other logic.
> >
>
> The challenge we have in this case is LRU makes the decision
> which entry to victimize. We do have some entries we want to
> keep longer - even if they are not seeing a lot of activity.

Right. That's certainly an argument to make LRU eviction
logic programmable.
John/Joe/Daniel proposed it as a concept long ago.
Design ideas are in demand to make further progress here :)

> You could just notify user space to re-add the entry but then
> you have sync challenges.
> The timers do provide us a way to implement custom GC.

My point is that time is always going to be a heuristic that will
break under certain traffic conditions.
I recommend to focus development effort on creating
building blocks that are truly great instead of reimplementing
old ideas in bpf with all of their shortcomings.

> So a question (which may have already been discussed),
> assuming the following setup:
> - 2 programs a) Ingress b) egress
> - sharing a conntrack map, and said map is pinned.
> - a timer prog (with a map with just timers;
>     even a single timer would be enough in some cases).
>
> ingress and egress do std stuff like create/update
> timer prog does the deletes. For simplicity sake assume
> we just have one timer that does a foreach and iterates
> all entries.
>
> What happens when both ingress and egress are ejected?

What is 'ejected'? Like a CD? ;)
I think you mean 'detached' ?
and then, I assume, the user space doesn't hold to prog FD?
The kernel can choose to do different things with the timer here.
One option is to cancel the outstanding timers and unload
.text where the timer callback lives.
Another option is to let the timer stay armed and auto unload
.text of bpf function when it finishes executing.
If timer callback decides to re-arm itself it can continue
executing indefinitely.
This patch is doing the latter.
There could be a combination of both options.
All options have their pros/cons.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25 19:57             ` Alexei Starovoitov
@ 2021-05-25 21:09               ` Jamal Hadi Salim
  2021-05-25 22:08                 ` Alexei Starovoitov
  0 siblings, 1 reply; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-05-25 21:09 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Cong Wang, David Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Linux Kernel Network Developers,
	bpf, kernel-team, Pedro Tammela

On 2021-05-25 3:57 p.m., Alexei Starovoitov wrote:
> On Tue, May 25, 2021 at 12:35 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:

[..]
> The outcome of the last bpf office hours was a general agreement
> that we need new hooks in map update/delete operations
> (including auto-delete by LRU) that will trigger a bpf subprog.

This is certainly a useful feature (for other reasons as well).
Does this include create/update/delete issued from user space?

> It might look very similar to the timer callback that is part of this patch,
> but instead of being called by the timer the LRU logic will call it.
> This way the subprog can transfer the data stored in the
> about-to-be-deleted map element into some other map or pass
> to user space via ringbuf or do any other logic.
> 

The challenge we have in this case is LRU makes the decision
which entry to victimize. We do have some entries we want to
keep longer - even if they are not seeing a lot of activity.
You could just notify user space to re-add the entry but then
you have sync challenges.
The timers do provide us a way to implement custom GC.

So a question (which may have already been discussed),
assuming the following setup:
- 2 programs a) Ingress b) egress
- sharing a conntrack map, and said map is pinned.
- a timer prog (with a map with just timers;
    even a single timer would be enough in some cases).

ingress and egress do std stuff like create/update
timer prog does the deletes. For simplicity sake assume
we just have one timer that does a foreach and iterates
all entries.

What happens when both ingress and egress are ejected?

cheers,
jamal

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25 19:35           ` Jamal Hadi Salim
@ 2021-05-25 19:57             ` Alexei Starovoitov
  2021-05-25 21:09               ` Jamal Hadi Salim
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-25 19:57 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Cong Wang, David Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Linux Kernel Network Developers,
	bpf, kernel-team

On Tue, May 25, 2021 at 12:35 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On 2021-05-25 2:21 p.m., Alexei Starovoitov wrote:
> > On Mon, May 24, 2021 at 9:59 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
>
> [..]
> > In general the garbage collection in any form doesn't scale.
> > The conntrack logic doesn't need it. The cillium conntrack is a great
> > example of how to implement a conntrack without GC.
>
> For our use case, we need to collect info on all the flows
> for various reasons (one of which is accounting of every byte and
> packet).
> So as a consequence - built-in GC (such as imposed by LRU)
> cant interfere without our consent.

The outcome of the last bpf office hours was a general agreement
that we need new hooks in map update/delete operations
(including auto-delete by LRU) that will trigger a bpf subprog.
It might look very similar to the timer callback that is part of this patch,
but instead of being called by the timer the LRU logic will call it.
This way the subprog can transfer the data stored in the
about-to-be-deleted map element into some other map or pass
to user space via ringbuf or do any other logic.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25  5:22       ` Cong Wang
@ 2021-05-25 19:47         ` Andrii Nakryiko
  0 siblings, 0 replies; 79+ messages in thread
From: Andrii Nakryiko @ 2021-05-25 19:47 UTC (permalink / raw)
  To: Cong Wang
  Cc: Alexei Starovoitov, Lorenz Bauer, David S . Miller,
	Daniel Borkmann, Andrii Nakryiko, John Fastabend, Networking,
	bpf, Kernel Team

On Mon, May 24, 2021 at 10:22 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Mon, May 24, 2021 at 12:13 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > I second the use of BPF_PROG_TEST_RUN (a.k.a. BPF_PROG_RUN now) to
> > "mirror" such APIs to user-space. We have so much BPF-side
>
> Except the expiration time is stored in user-space too if you just
> use user-space timers to trigger BPF_PROG_TEST_RUN.
> Modifying expiration based on its current value in timer callbacks
> is very common. For example in conntrack use case, we want the
> GC timer to run sooner in the next run if we get a certain amount of
> expired items in the current run.

I'm not entirely sure what all this means, sorry. My general point is
that instead of doing bpf() syscall with a new custom command (e.g.,
BPF_TIMER_UPDATE), you can just fire your custom BPF program with
BPF_TEST_RUN. You can pass custom timeouts or any other
user-space-provided settings either through global variables, custom
maps, or directly as a context. So you have full control over what
should be set when and why, we just avoid adding tons of custom bpf()
syscall commands for every single feature.

>
>
> > functionality and APIs that reflecting all of that with special
> > user-space-facing BPF commands is becoming quite impractical. E.g., a
> > long time ago there was a proposal to add commands to push data to BPF
> > ringbuf from user-space for all kinds of testing scenarios. We never
> > did that because no one bothered enough, but now I'd advocate that a
> > small custom BPF program that is single-shot through BPF_PROG_RUN is a
> > better way to do this. Similarly for timers and whatever other
> > functionality. By doing everything from BPF program we also side-step
> > potential subtle differences in semantics between BPF-side and
> > user-space-side.
>
> I am confused about what you are saying, because we can already
> trigger BPF_PROG_RUN with a user-space timer for a single shot,
> with the current kernel, without any modification. So this sounds like
> you are against adding any timer on the eBPF side, but on the other
> hand, you are seconding Alexei's patch... I am completely lost.

I'm arguing against adding more custom commands to bpf() syscall. And
I was talking about triggering BPF program directly from user-space
with BPF_PROG_TEST_RUN/BPF_PROG_RUN command, not through some timers.

>
> Very clearly, whatever you described as "single shot" is not what we
> want from any perspective.

I'm not sure we are even talking about the same things, so I doubt
"clearly" in this case.

>
> Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25 18:21         ` Alexei Starovoitov
@ 2021-05-25 19:35           ` Jamal Hadi Salim
  2021-05-25 19:57             ` Alexei Starovoitov
  2021-05-30  6:36           ` Cong Wang
  1 sibling, 1 reply; 79+ messages in thread
From: Jamal Hadi Salim @ 2021-05-25 19:35 UTC (permalink / raw)
  To: Alexei Starovoitov, Cong Wang
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On 2021-05-25 2:21 p.m., Alexei Starovoitov wrote:
> On Mon, May 24, 2021 at 9:59 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:


[..]
> In general the garbage collection in any form doesn't scale.
> The conntrack logic doesn't need it. The Cilium conntrack is a great
> example of how to implement a conntrack without GC.

For our use case, we need to collect info on all the flows
for various reasons (one of which is accounting of every byte and
packet).
So as a consequence - built-in GC (such as imposed by LRU)
can't interfere without our consent.

cheers,
jamal

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25  4:59       ` Cong Wang
@ 2021-05-25 18:21         ` Alexei Starovoitov
  2021-05-25 19:35           ` Jamal Hadi Salim
  2021-05-30  6:36           ` Cong Wang
  0 siblings, 2 replies; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-25 18:21 UTC (permalink / raw)
  To: Cong Wang
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Mon, May 24, 2021 at 9:59 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Mon, May 24, 2021 at 8:16 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Sun, May 23, 2021 at 9:01 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Fri, May 21, 2021 at 2:37 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > Hi, Alexei
> > > >
> > > > On Thu, May 20, 2021 at 11:52 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> > > > > and helpers to operate on it:
> > > > > long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> > > > > long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > > > > long bpf_timer_del(struct bpf_timer *timer)
> > > >
> > > > Like we discussed, this approach would make the timer harder
> > > > to be independent of other eBPF programs, which is a must-have
> > > > for both of our use cases (mine and Jamal's). Like you explained,
> > > > this requires at least another program array, a tail call, a mandatory
> > > > prog pinning to work.
> > >
> > > That is simply not true.
> >
> > Which part is not true? The above is what I got from your explanation.
>
> I tried to write some code sketches to use your timer to implement
> our conntrack logic, below shows how difficult it is to use,

Was it difficult because you've used tail_call and over complicated
the progs for no good reason?

> SEC("ingress")
> void ingress(struct __sk_buff *skb)
> {
>         struct tuple tuple;
>         // extract tuple from skb
>
>         if (bpf_map_lookup_elem(&timers, &key) == NULL)
>                 bpf_tail_call(NULL, &jmp_table, 0);
>                 // here is not reachable unless failure
>         val = bpf_map_lookup_elem(&conntrack, &tuple);
>         if (val && val->expires < now) {
>                 bpf_tail_call(NULL, &jmp_table, 1);
>                 // here is not reachable unless failure
>         }
> }
>
> SEC("egress")
> void egress(struct __sk_buff *skb)
> {
>         struct tuple tuple;
>         // extract tuple from skb
>
>         if (bpf_map_lookup_elem(&timers, &key) == NULL)
>                 bpf_tail_call(NULL, &jmp_table, 0);
>                 // here is not reachable unless failure
>         val = bpf_map_lookup_elem(&conntrack, &tuple);
>         if (val && val->expires < now) {
>                 bpf_tail_call(NULL, &jmp_table, 1);
>                 // here is not reachable unless failure

tail_calls are unnecessary. Just call the funcs directly.
All lookups and maps are unnecessary as well.
Looks like a single global timer will be enough for this use case.

In general the garbage collection in any form doesn't scale.
The conntrack logic doesn't need it. The Cilium conntrack is a great
example of how to implement a conntrack without GC.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-24 19:13     ` Andrii Nakryiko
@ 2021-05-25  5:22       ` Cong Wang
  2021-05-25 19:47         ` Andrii Nakryiko
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-05-25  5:22 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Alexei Starovoitov, Lorenz Bauer, David S . Miller,
	Daniel Borkmann, Andrii Nakryiko, John Fastabend, Networking,
	bpf, Kernel Team

On Mon, May 24, 2021 at 12:13 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> I second the use of BPF_PROG_TEST_RUN (a.k.a. BPF_PROG_RUN now) to
> "mirror" such APIs to user-space. We have so much BPF-side

Except the expiration time is stored in user-space too if you just
use user-space timers to trigger BPF_PROG_TEST_RUN.
Modifying expiration based on its current value in timer callbacks
is very common. For example in conntrack use case, we want the
GC timer to run sooner in the next run if we get a certain amount of
expired items in the current run.


> functionality and APIs that reflecting all of that with special
> user-space-facing BPF commands is becoming quite impractical. E.g., a
> long time ago there was a proposal to add commands to push data to BPF
> ringbuf from user-space for all kinds of testing scenarios. We never
> did that because no one bothered enough, but now I'd advocate that a
> small custom BPF program that is single-shot through BPF_PROG_RUN is a
> better way to do this. Similarly for timers and whatever other
> functionality. By doing everything from BPF program we also side-step
> potential subtle differences in semantics between BPF-side and
> user-space-side.

I am confused about what you are saying, because we can already
trigger BPF_PROG_RUN with a user-space timer for a single shot,
with the current kernel, without any modification. So this sounds like
you are against adding any timer on the eBPF side, but on the other
hand, you are seconding Alexei's patch... I am completely lost.

Very clearly, whatever you described as "single shot" is not what we
want from any perspective.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-25  3:16     ` Cong Wang
@ 2021-05-25  4:59       ` Cong Wang
  2021-05-25 18:21         ` Alexei Starovoitov
  0 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-05-25  4:59 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Mon, May 24, 2021 at 8:16 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sun, May 23, 2021 at 9:01 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, May 21, 2021 at 2:37 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > Hi, Alexei
> > >
> > > On Thu, May 20, 2021 at 11:52 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> > > > and helpers to operate on it:
> > > > long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> > > > long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > > > long bpf_timer_del(struct bpf_timer *timer)
> > >
> > > Like we discussed, this approach would make the timer harder
> > > to be independent of other eBPF programs, which is a must-have
> > > for both of our use cases (mine and Jamal's). Like you explained,
> > > this requires at least another program array, a tail call, a mandatory
> > > prog pinning to work.
> >
> > That is simply not true.
>
> Which part is not true? The above is what I got from your explanation.

I tried to write some code sketches to use your timer to implement
our conntrack logic, below shows how difficult it is to use, it does not
even include the user-space part where eBPF programs are put
into the program array.


struct {
       __uint(type, BPF_MAP_TYPE_HASH);
       __uint(max_entries, 1000);
       __type(key, struct tuple);
       __type(value, struct foo);
} conntrack SEC(".maps");

struct map_elem {
       struct bpf_timer timer;
       struct bpf_map *target;
       u32 expires;
};

struct {
       __uint(type, BPF_MAP_TYPE_HASH);
       __uint(max_entries, 1000);
       __type(key, int);
       __type(value, struct map_elem);
} timers SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(key_size, sizeof(u32));
        __uint(value_size, sizeof(u32));
        __uint(max_entries, 8);
} jmp_table SEC(".maps");

static __u64
cleanup_conntrack(struct bpf_map *map, struct tuple *key, struct foo *val,
                 struct callback_ctx *data)
{
        if (val->expires < now)
                bpf_map_delete_elem(conntrack, key);
}

static int timer_cb(struct bpf_map *map, int *key, struct map_elem *val)
{
       bpf_for_each_map_elem(val->target, cleanup_conntrack, ....);
       /* re-arm the timer again to execute after 1 msec */
       bpf_timer_mod(&val->timer, 1);
       return 0;
}

SEC("prog/0")
int install_timer(void)
{
       struct map_elem *val;
       int key = 0;

       val = bpf_map_lookup_elem(&timers, &key);
       if (val) {
               bpf_timer_init(&val->timer, timer_cb, 0);
               bpf_timer_mod(&val->timer, val->expires);
       }
}

SEC("prog/1")
int mod_timer(void)
{
       struct map_elem *val;
       int key = 0;

       val = bpf_map_lookup_elem(&timers, &key);
       if (val) {
               // XXX: how do we know if a timer has been installed?
               bpf_timer_mod(&val->timer, val->expires);
       }
}

SEC("ingress")
void ingress(struct __sk_buff *skb)
{
        struct tuple tuple;
        // extract tuple from skb

        if (bpf_map_lookup_elem(&timers, &key) == NULL)
                bpf_tail_call(NULL, &jmp_table, 0);
                // here is not reachable unless failure
        val = bpf_map_lookup_elem(&conntrack, &tuple);
        if (val && val->expires < now) {
                bpf_tail_call(NULL, &jmp_table, 1);
                // here is not reachable unless failure
        }
}

SEC("egress")
void egress(struct __sk_buff *skb)
{
        struct tuple tuple;
        // extract tuple from skb

        if (bpf_map_lookup_elem(&timers, &key) == NULL)
                bpf_tail_call(NULL, &jmp_table, 0);
                // here is not reachable unless failure
        val = bpf_map_lookup_elem(&conntrack, &tuple);
        if (val && val->expires < now) {
                bpf_tail_call(NULL, &jmp_table, 1);
                // here is not reachable unless failure
        }
}

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-23 16:01   ` Alexei Starovoitov
  2021-05-24  8:45     ` Lorenz Bauer
@ 2021-05-25  3:16     ` Cong Wang
  2021-05-25  4:59       ` Cong Wang
  1 sibling, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-05-25  3:16 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Sun, May 23, 2021 at 9:01 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, May 21, 2021 at 2:37 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > Hi, Alexei
> >
> > On Thu, May 20, 2021 at 11:52 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Why do you intentionally keep people in the original discussion
> > out of your CC? Remember you are the one who objected to the
> > idea by questioning its usefulness no matter how hard I tried
> > to explain? I am glad you changed your mind, but it does not
> > mean you should forget to credit other people.
>
> I didn't change my mind and I still object to your stated
> _reasons_ for timers.

What is _your reason_ to introduce timers? Clearly you provide
absolutely nothing here. ;)


>
> > >
> > > Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> > > and helpers to operate on it:
> > > long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> > > long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > > long bpf_timer_del(struct bpf_timer *timer)
> >
> > Like we discussed, this approach would make the timer harder
> > to be independent of other eBPF programs, which is a must-have
> > for both of our use cases (mine and Jamal's). Like you explained,
> > this requires at least another program array, a tail call, a mandatory
> > prog pinning to work.
>
> That is simply not true.

Which part is not true? The above is what I got from your explanation.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-24 14:56   ` Alexei Starovoitov
@ 2021-05-24 19:13     ` Andrii Nakryiko
  2021-05-25  5:22       ` Cong Wang
  0 siblings, 1 reply; 79+ messages in thread
From: Andrii Nakryiko @ 2021-05-24 19:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Lorenz Bauer, David S . Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Networking, bpf, Kernel Team

On Mon, May 24, 2021 at 7:56 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, May 24, 2021 at 4:50 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
> >
> > On Thu, 20 May 2021 at 19:55, Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > From: Alexei Starovoitov <ast@kernel.org>
> > >
> > > Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> > > and helpers to operate on it:
> > > long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> > > long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > > long bpf_timer_del(struct bpf_timer *timer)
> >
> > I like invoking the callback with a pointer to the map element it was
> > defined in, since it solves lifetime of the context and user space
> > introspection of the same. I'm not so sure about being able to put it
> > into all different kinds of maps, is that really going to be used?
>
> Certainly. At least in array and hash maps.
> The global data is an array.
> A single global timer is a simple and easy to use pattern.
>
> >
> > It would be useful if Cong Wang could describe their use case, it's
> > kind of hard to tell what the end goal is. Should user space be able
> > to create and arm timers? Or just BPF? In the other thread it seems
> > like a primitive for waiting on a timer is proposed. Why? It also begs
> > the question how we would wait on multiple timers.
>
> In the proposed api the same callback can be invoked for multiple timers.
> The user space can create/destroy timers via prog_run cmd.
> It will also destroy timers by map_delete_elem cmd.
>
> > > + *
> > > + * long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> >
> > In your selftest the callback has a type (int)(*callback)(struct
> > bpf_map *map, int *key, struct map_elem *val).
>
> Correct. I'll update the comment.
>
> > > + *     Description
> > > + *             Initialize the timer to call given static function.
> > > + *     Return
> > > + *             zero
> > > + *
> > > + * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > > + *     Description
> > > + *             Set the timer expiration N msecs from the current time.
> > > + *     Return
> > > + *             zero
> > > + *
> > > + * long bpf_timer_del(struct bpf_timer *timer)
> > > + *     Description
> > > + *             Deactivate the timer.
> > > + *     Return
> > > + *             zero
> > >   */
> > >  #define __BPF_FUNC_MAPPER(FN)          \
> > >         FN(unspec),                     \
> > > @@ -4932,6 +4950,9 @@ union bpf_attr {
> > >         FN(sys_bpf),                    \
> > >         FN(btf_find_by_name_kind),      \
> > >         FN(sys_close),                  \
> > > +       FN(timer_init),                 \
> > > +       FN(timer_mod),                  \
> > > +       FN(timer_del),                  \
> > >         /* */
> >
> > How can user space force stopping of timers (required IMO)?
>
> We can add new commands, of course, but I don't think it's
> necessary, since test_run can be used to achieve the same
> and map_delete_elem will stop them too.

I second the use of BPF_PROG_TEST_RUN (a.k.a. BPF_PROG_RUN now) to
"mirror" such APIs to user-space. We have so much BPF-side
functionality and APIs that reflecting all of that with special
user-space-facing BPF commands is becoming quite impractical. E.g., a
long time ago there was a proposal to add commands to push data to BPF
ringbuf from user-space for all kinds of testing scenarios. We never
did that because no one bothered enough, but now I'd advocate that a
small custom BPF program that is single-shot through BPF_PROG_RUN is a
better way to do this. Similarly for timers and whatever other
functionality. By doing everything from BPF program we also side-step
potential subtle differences in semantics between BPF-side and
user-space-side.

We just need to remember to enable all such functionality to
BPF_PROG_TYPE_SYSCALL as it's sleepable and always runs from user
context, so is most powerful in terms of what's safe to do through
such program type. And, of course, ideally for other types of programs
where it makes sense.


>
> > >
> > >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > > @@ -6038,6 +6059,10 @@ struct bpf_spin_lock {
> > >         __u32   val;
> > >  };
> > >
> > > +struct bpf_timer {
> > > +       __u64 opaque;
> > > +};
> > > +
> >
> > This might be clear already, but we won't be able to modify the size
> > of bpf_timer later since it would break uapi, right?
>
> Correct. The internal implementation can change. The 'opaque'
> is just the pointer to the internal struct.
> When do you think we'd need to change this uapi struct?

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-24 17:33     ` Alexei Starovoitov
@ 2021-05-24 18:39       ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 79+ messages in thread
From: Toke Høiland-Jørgensen @ 2021-05-24 18:39 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S. Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Network Development, bpf,
	Kernel Team

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Sun, May 23, 2021 at 8:58 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>>
>> On Sun, May 23, 2021 at 4:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>> >
>> > Still wrapping my head around this, but one thing immediately sprang to
>> > mind:
>> >
>> > > + * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
>> > > + *   Description
>> > > + *           Set the timer expiration N msecs from the current time.
>> > > + *   Return
>> > > + *           zero
>> >
>> > Could we make this use nanoseconds (and wire it up to hrtimers) instead?
>> > I would like to eventually be able to use this for pacing out network
>> > packets, and msec precision is way too coarse for that...
>>
>> msecs are used to avoid exposing jiffies to bpf prog, since msec_to_jiffies
>> isn't trivial to do in the bpf prog unlike the kernel.
>> hrtimer would be great to support as well.
>> It could be implemented via flags (which are currently zero only)
>> but probably not as a full replacement for jiffies based timers.
>> Like array vs hash. bpf_timer can support both.
>
> After reading the hrtimer code I might take the above statement back...
> hrtimer looks strictly better than timerwheel and jiffies.
> It scales well and there are no concerns with overload,
> since sys_nanosleep and tcp are heavy users.
> So I'm thinking to drop jiffies approach and do hrtimer only.
> wdyt?

Oops, sorry, crossed streams, didn't see this before sending my other
reply. Yeah, hrtimers only SGTM :)

-Toke


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-23 15:58   ` Alexei Starovoitov
  2021-05-24  8:42     ` Lorenz Bauer
  2021-05-24 17:33     ` Alexei Starovoitov
@ 2021-05-24 18:38     ` Toke Høiland-Jørgensen
  2 siblings, 0 replies; 79+ messages in thread
From: Toke Høiland-Jørgensen @ 2021-05-24 18:38 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S. Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Network Development, bpf,
	Kernel Team

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Sun, May 23, 2021 at 4:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Still wrapping my head around this, but one thing immediately sprang to
>> mind:
>>
>> > + * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
>> > + *   Description
>> > + *           Set the timer expiration N msecs from the current time.
>> > + *   Return
>> > + *           zero
>>
>> Could we make this use nanoseconds (and wire it up to hrtimers) instead?
>> I would like to eventually be able to use this for pacing out network
>> packets, and msec precision is way too coarse for that...
>
> msecs are used to avoid exposing jiffies to bpf prog, since msec_to_jiffies
> isn't trivial to do in the bpf prog unlike the kernel.
> hrtimer would be great to support as well.
> It could be implemented via flags (which are currently zero only)
> but probably not as a full replacement for jiffies based timers.
> Like array vs hash. bpf_timer can support both.

Okay, so this is really:

long bpf_timer_mod(struct bpf_timer *timer, u64 interval)

where 'interval' will be expressed in either milliseconds or nanoseconds
depending on which flags are passed to bpf_timer_init()? That's fine by
me, then; I just wanted to make sure that that 'msecs' was not an
indication that this was the only granularity these timers would
support... :)

-Toke


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-23 15:58   ` Alexei Starovoitov
  2021-05-24  8:42     ` Lorenz Bauer
@ 2021-05-24 17:33     ` Alexei Starovoitov
  2021-05-24 18:39       ` Toke Høiland-Jørgensen
  2021-05-24 18:38     ` Toke Høiland-Jørgensen
  2 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-24 17:33 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David S. Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Network Development, bpf,
	Kernel Team

On Sun, May 23, 2021 at 8:58 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Sun, May 23, 2021 at 4:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >
> > Still wrapping my head around this, but one thing immediately sprang to
> > mind:
> >
> > > + * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > > + *   Description
> > > + *           Set the timer expiration N msecs from the current time.
> > > + *   Return
> > > + *           zero
> >
> > Could we make this use nanoseconds (and wire it up to hrtimers) instead?
> > I would like to eventually be able to use this for pacing out network
> > packets, and msec precision is way too coarse for that...
>
> msecs are used to avoid exposing jiffies to bpf prog, since msec_to_jiffies
> isn't trivial to do in the bpf prog unlike the kernel.
> hrtimer would be great to support as well.
> It could be implemented via flags (which are currently zero only)
> but probably not as a full replacement for jiffies based timers.
> Like array vs hash. bpf_timer can support both.

After reading the hrtimer code I might take the above statement back...
hrtimer looks strictly better than timerwheel and jiffies.
It scales well and there are no concerns with overload,
since sys_nanosleep and tcp are heavy users.
So I'm thinking to drop jiffies approach and do hrtimer only.
wdyt?

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-24 11:49 ` Lorenz Bauer
@ 2021-05-24 14:56   ` Alexei Starovoitov
  2021-05-24 19:13     ` Andrii Nakryiko
  0 siblings, 1 reply; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-24 14:56 UTC (permalink / raw)
  To: Lorenz Bauer
  Cc: David S . Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Networking, bpf, Kernel Team

On Mon, May 24, 2021 at 4:50 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
>
> On Thu, 20 May 2021 at 19:55, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> > and helpers to operate on it:
> > long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> > long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > long bpf_timer_del(struct bpf_timer *timer)
>
> I like invoking the callback with a pointer to the map element it was
> defined in, since it solves lifetime of the context and user space
> introspection of the same. I'm not so sure about being able to put it
> into all different kinds of maps, is that really going to be used?

Certainly. At least in array and hash maps.
The global data is an array.
A single global timer is a simple and easy to use pattern.

>
> It would be useful if Cong Wang could describe their use case, it's
> kind of hard to tell what the end goal is. Should user space be able
> to create and arm timers? Or just BPF? In the other thread it seems
> like a primitive for waiting on a timer is proposed. Why? It also begs
> the question how we would wait on multiple timers.

In the proposed api the same callback can be invoked for multiple timers.
The user space can create/destroy timers via prog_run cmd.
It will also destroy timers by map_delete_elem cmd.

> > + *
> > + * long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
>
> In your selftest the callback has a type (int)(*callback)(struct
> bpf_map *map, int *key, struct map_elem *val).

Correct. I'll update the comment.

> > + *     Description
> > + *             Initialize the timer to call given static function.
> > + *     Return
> > + *             zero
> > + *
> > + * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > + *     Description
> > + *             Set the timer expiration N msecs from the current time.
> > + *     Return
> > + *             zero
> > + *
> > + * long bpf_timer_del(struct bpf_timer *timer)
> > + *     Description
> > + *             Deactivate the timer.
> > + *     Return
> > + *             zero
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)          \
> >         FN(unspec),                     \
> > @@ -4932,6 +4950,9 @@ union bpf_attr {
> >         FN(sys_bpf),                    \
> >         FN(btf_find_by_name_kind),      \
> >         FN(sys_close),                  \
> > +       FN(timer_init),                 \
> > +       FN(timer_mod),                  \
> > +       FN(timer_del),                  \
> >         /* */
>
> How can user space force stopping of timers (required IMO)?

We can add new commands, of course, but I don't think it's
necessary, since test_run can be used to achieve the same
and map_delete_elem will stop them too.

> >
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > @@ -6038,6 +6059,10 @@ struct bpf_spin_lock {
> >         __u32   val;
> >  };
> >
> > +struct bpf_timer {
> > +       __u64 opaque;
> > +};
> > +
>
> This might be clear already, but we won't be able to modify the size
> of bpf_timer later since it would break uapi, right?

Correct. The internal implementation can change. The 'opaque'
is just the pointer to the internal struct.
When do you think we'd need to change this uapi struct?

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-24  8:42     ` Lorenz Bauer
@ 2021-05-24 14:48       ` Alexei Starovoitov
  0 siblings, 0 replies; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-24 14:48 UTC (permalink / raw)
  To: Lorenz Bauer
  Cc: Toke Høiland-Jørgensen, David S. Miller,
	Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Network Development, bpf, Kernel Team

On Mon, May 24, 2021 at 1:42 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
>
> On Sun, 23 May 2021 at 16:58, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>
> ...
>
> >
> > msecs are used to avoid exposing jiffies to bpf prog, since msec_to_jiffies
> > isn't trivial to do in the bpf prog unlike the kernel.
>
> Isn't that already the case with bpf_jiffies64?

It's reading jiffies. To convert to time the prog needs HZ value.
The HZ is also accessible via kconfig special map type and libbpf magic,
but supplying jiffies as an end-time is an implementation detail.
Are you arguing that api should be exactly one-to-one to kernel
and force all progs to do bpf_jiffies64() + end_time/HZ ?

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-20 18:55 [RFC PATCH bpf-next] bpf: Introduce bpf_timer Alexei Starovoitov
                   ` (4 preceding siblings ...)
  2021-05-23 11:48 ` Toke Høiland-Jørgensen
@ 2021-05-24 11:49 ` Lorenz Bauer
  2021-05-24 14:56   ` Alexei Starovoitov
  5 siblings, 1 reply; 79+ messages in thread
From: Lorenz Bauer @ 2021-05-24 11:49 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S . Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Networking, bpf, Kernel Team

On Thu, 20 May 2021 at 19:55, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> and helpers to operate on it:
> long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> long bpf_timer_del(struct bpf_timer *timer)

I like invoking the callback with a pointer to the map element it was
defined in, since it solves lifetime of the context and user space
introspection of the same. I'm not so sure about being able to put it
into all different kinds of maps, is that really going to be used?

It would be useful if Cong Wang could describe their use case, it's
kind of hard to tell what the end goal is. Should user space be able
to create and arm timers? Or just BPF? In the other thread it seems
like a primitive for waiting on a timer is proposed. Why? It also begs
the question how we would wait on multiple timers.

>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
> This is work in progress, but gives an idea on how API will look.
> ---
>  include/linux/bpf.h                           |   1 +
>  include/uapi/linux/bpf.h                      |  25 ++++
>  kernel/bpf/helpers.c                          | 106 +++++++++++++++++
>  kernel/bpf/verifier.c                         | 110 ++++++++++++++++++
>  kernel/trace/bpf_trace.c                      |   2 +-
>  scripts/bpf_doc.py                            |   2 +
>  tools/include/uapi/linux/bpf.h                |  25 ++++
>  .../testing/selftests/bpf/prog_tests/timer.c  |  42 +++++++
>  tools/testing/selftests/bpf/progs/timer.c     |  53 +++++++++
>  9 files changed, 365 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/timer.c
>  create mode 100644 tools/testing/selftests/bpf/progs/timer.c
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 9dc44ba97584..18e09cc0c410 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -312,6 +312,7 @@ enum bpf_arg_type {
>         ARG_PTR_TO_FUNC,        /* pointer to a bpf program function */
>         ARG_PTR_TO_STACK_OR_NULL,       /* pointer to stack or NULL */
>         ARG_PTR_TO_CONST_STR,   /* pointer to a null terminated read-only string */
> +       ARG_PTR_TO_TIMER,       /* pointer to bpf_timer */
>         __BPF_ARG_TYPE_MAX,
>  };
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 418b9b813d65..c95d7854d9fb 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -4761,6 +4761,24 @@ union bpf_attr {
>   *             Execute close syscall for given FD.
>   *     Return
>   *             A syscall result.
> + *
> + * long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)

In your selftest the callback has a type (int)(*callback)(struct
bpf_map *map, int *key, struct map_elem *val).

> + *     Description
> + *             Initialize the timer to call given static function.
> + *     Return
> + *             zero
> + *
> + * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> + *     Description
> + *             Set the timer expiration N msecs from the current time.
> + *     Return
> + *             zero
> + *
> + * long bpf_timer_del(struct bpf_timer *timer)
> + *     Description
> + *             Deactivate the timer.
> + *     Return
> + *             zero
>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -4932,6 +4950,9 @@ union bpf_attr {
>         FN(sys_bpf),                    \
>         FN(btf_find_by_name_kind),      \
>         FN(sys_close),                  \
> +       FN(timer_init),                 \
> +       FN(timer_mod),                  \
> +       FN(timer_del),                  \
>         /* */

How can user space force stopping of timers (required IMO)?

>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> @@ -6038,6 +6059,10 @@ struct bpf_spin_lock {
>         __u32   val;
>  };
>
> +struct bpf_timer {
> +       __u64 opaque;
> +};
> +

This might be clear already, but we won't be able to modify the size
of bpf_timer later since it would break uapi, right?

-- 
Lorenz Bauer  |  Systems Engineer
6th Floor, County Hall/The Riverside Building, SE1 7PB, UK

www.cloudflare.com

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-23 16:01   ` Alexei Starovoitov
@ 2021-05-24  8:45     ` Lorenz Bauer
  2021-05-25  3:16     ` Cong Wang
  1 sibling, 0 replies; 79+ messages in thread
From: Lorenz Bauer @ 2021-05-24  8:45 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Cong Wang, David Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Linux Kernel Network Developers, bpf,
	kernel-team

On Sun, 23 May 2021 at 17:01, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, May 21, 2021 at 2:37 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > Hi, Alexei
> >
> > Why do you intentionally keep people in the original discussion
> > out of your CC? Remember you are the one who objected the
> > idea by questioning its usefulness no matter how I hard I tried
> > to explain? I am glad you changed your mind, but it does not
> > mean you should forget to credit other people.
>
> I didn't change my mind and I still object to your stated
> _reasons_ for timers.

For others reading along, here is the original thread
https://lore.kernel.org/bpf/CAM_iQpXJ4MWUhk-j+mC4ScsX12afcuUHT-64CpVj97QdQaNZZg@mail.gmail.com/

-- 
Lorenz Bauer  |  Systems Engineer
6th Floor, County Hall/The Riverside Building, SE1 7PB, UK

www.cloudflare.com

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-23 15:58   ` Alexei Starovoitov
@ 2021-05-24  8:42     ` Lorenz Bauer
  2021-05-24 14:48       ` Alexei Starovoitov
  2021-05-24 17:33     ` Alexei Starovoitov
  2021-05-24 18:38     ` Toke Høiland-Jørgensen
  2 siblings, 1 reply; 79+ messages in thread
From: Lorenz Bauer @ 2021-05-24  8:42 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Toke Høiland-Jørgensen, David S. Miller,
	Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Network Development, bpf, Kernel Team

On Sun, 23 May 2021 at 16:58, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:

...

>
> msecs are used to avoid exposing jiffies to bpf prog, since msec_to_jiffies
> isn't trivial to do in the bpf prog unlike the kernel.

Isn't that already the case with bpf_jiffies64?

-- 
Lorenz Bauer  |  Systems Engineer
6th Floor, County Hall/The Riverside Building, SE1 7PB, UK

www.cloudflare.com

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-21 21:37 ` Cong Wang
@ 2021-05-23 16:01   ` Alexei Starovoitov
  2021-05-24  8:45     ` Lorenz Bauer
  2021-05-25  3:16     ` Cong Wang
  0 siblings, 2 replies; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-23 16:01 UTC (permalink / raw)
  To: Cong Wang
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

On Fri, May 21, 2021 at 2:37 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> Hi, Alexei
>
> On Thu, May 20, 2021 at 11:52 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > From: Alexei Starovoitov <ast@kernel.org>
>
> Why do you intentionally keep people in the original discussion
> out of your CC? Remember you are the one who objected to the
> idea by questioning its usefulness no matter how hard I tried
> to explain? I am glad you changed your mind, but it does not
> mean you should forget to credit other people.

I didn't change my mind and I still object to your stated
_reasons_ for timers.

> >
> > Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> > and helpers to operate on it:
> > long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> > long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > long bpf_timer_del(struct bpf_timer *timer)
>
> Like we discussed, this approach would make the timer harder
> to be independent of other eBPF programs, which is a must-have
> for both of our use cases (mine and Jamal's). Like you explained,
> this requires at least another program array, a tail call, a mandatory
> prog pinning to work.

That is simply not true.

> So, why do you prefer to make it harder to use?
>
> BTW, I have a V2 to send out soon and will keep you in CC, which
> still creates timers from user-space.

Don't bother.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-23 11:48 ` Toke Høiland-Jørgensen
@ 2021-05-23 15:58   ` Alexei Starovoitov
  2021-05-24  8:42     ` Lorenz Bauer
                       ` (2 more replies)
  0 siblings, 3 replies; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-23 15:58 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David S. Miller, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Lorenz Bauer, Network Development, bpf,
	Kernel Team

On Sun, May 23, 2021 at 4:48 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Still wrapping my head around this, but one thing immediately sprang to
> mind:
>
> > + * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> > + *   Description
> > + *           Set the timer expiration N msecs from the current time.
> > + *   Return
> > + *           zero
>
> Could we make this use nanoseconds (and wire it up to hrtimers) instead?
> I would like to eventually be able to use this for pacing out network
> packets, and msec precision is way too coarse for that...

msecs are used to avoid exposing jiffies to bpf prog, since msec_to_jiffies
isn't trivial to do in the bpf prog unlike the kernel.
hrtimer would be great to support as well.
It could be implemented via flags (which are currently zero only)
but probably not as a full replacement for jiffies based timers.
Like array vs hash. bpf_timer can support both.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-20 18:55 [RFC PATCH bpf-next] bpf: Introduce bpf_timer Alexei Starovoitov
                   ` (3 preceding siblings ...)
  2021-05-22 16:41 ` kernel test robot
@ 2021-05-23 11:48 ` Toke Høiland-Jørgensen
  2021-05-23 15:58   ` Alexei Starovoitov
  2021-05-24 11:49 ` Lorenz Bauer
  5 siblings, 1 reply; 79+ messages in thread
From: Toke Høiland-Jørgensen @ 2021-05-23 11:48 UTC (permalink / raw)
  To: Alexei Starovoitov, davem
  Cc: daniel, andrii, john.fastabend, lmb, netdev, bpf, kernel-team

Still wrapping my head around this, but one thing immediately sprang to
mind:

> + * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> + *	Description
> + *		Set the timer expiration N msecs from the current time.
> + *	Return
> + *		zero

Could we make this use nanoseconds (and wire it up to hrtimers) instead?
I would like to eventually be able to use this for pacing out network
packets, and msec precision is way too coarse for that...

-Toke


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-20 18:55 [RFC PATCH bpf-next] bpf: Introduce bpf_timer Alexei Starovoitov
                   ` (2 preceding siblings ...)
  2021-05-22 16:06 ` kernel test robot
@ 2021-05-22 16:41 ` kernel test robot
  2021-05-23 11:48 ` Toke Høiland-Jørgensen
  2021-05-24 11:49 ` Lorenz Bauer
  5 siblings, 0 replies; 79+ messages in thread
From: kernel test robot @ 2021-05-22 16:41 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 1664 bytes --]

Hi Alexei,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Alexei-Starovoitov/bpf-Introduce-bpf_timer/20210522-204413
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-randconfig-s001-20210522 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.3-341-g8af24329-dirty
        # https://github.com/0day-ci/linux/commit/9380db20bb780c2f5147a795ac7f5fc133f66d55
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Alexei-Starovoitov/bpf-Introduce-bpf_timer/20210522-204413
        git checkout 9380db20bb780c2f5147a795ac7f5fc133f66d55
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)
>> kernel/bpf/helpers.c:1030:29: sparse: sparse: symbol 'bpf_timer_init_proto' was not declared. Should it be static?
>> kernel/bpf/helpers.c:1056:29: sparse: sparse: symbol 'bpf_timer_mod_proto' was not declared. Should it be static?
>> kernel/bpf/helpers.c:1081:29: sparse: sparse: symbol 'bpf_timer_del_proto' was not declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 41049 bytes --]

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-20 18:55 [RFC PATCH bpf-next] bpf: Introduce bpf_timer Alexei Starovoitov
  2021-05-21 14:38 ` Alexei Starovoitov
  2021-05-21 21:37 ` Cong Wang
@ 2021-05-22 16:06 ` kernel test robot
  2021-05-22 16:41 ` kernel test robot
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 79+ messages in thread
From: kernel test robot @ 2021-05-22 16:06 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2626 bytes --]

Hi Alexei,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Alexei-Starovoitov/bpf-Introduce-bpf_timer/20210522-204413
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: m68k-randconfig-r012-20210522 (attached as .config)
compiler: m68k-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/9380db20bb780c2f5147a795ac7f5fc133f66d55
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Alexei-Starovoitov/bpf-Introduce-bpf_timer/20210522-204413
        git checkout 9380db20bb780c2f5147a795ac7f5fc133f66d55
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=m68k 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   kernel/bpf/helpers.c: In function 'timer_cb':
   kernel/bpf/helpers.c:1000:18: warning: unused variable 'map' [-Wunused-variable]
    1000 |  struct bpf_map *map;
         |                  ^~~
   kernel/bpf/helpers.c: In function '____bpf_timer_mod':
>> kernel/bpf/helpers.c:1043:7: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
    1043 |  tl = (struct bpf_timer_list *)timer->opaque;
         |       ^
   kernel/bpf/helpers.c: In function '____bpf_timer_del':
   kernel/bpf/helpers.c:1068:7: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
    1068 |  tl = (struct bpf_timer_list *)timer->opaque;
         |       ^


vim +1043 kernel/bpf/helpers.c

  1038	
  1039	BPF_CALL_2(bpf_timer_mod, struct bpf_timer *, timer, u64, msecs)
  1040	{
  1041		struct bpf_timer_list *tl;
  1042	
> 1043		tl = (struct bpf_timer_list *)timer->opaque;
  1044		if (!tl)
  1045			return -EINVAL;
  1046		/* keep the prog alive until callback is invoked */
  1047		if (!mod_timer(&tl->tl, jiffies + msecs_to_jiffies(msecs))) {
  1048			/* The timer was inactive.
  1049			 * Keep the prog alive until callback is invoked
  1050			 */
  1051			bpf_prog_inc(tl->prog);
  1052		}
  1053		return 0;
  1054	}
  1055	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 26152 bytes --]

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-20 18:55 [RFC PATCH bpf-next] bpf: Introduce bpf_timer Alexei Starovoitov
  2021-05-21 14:38 ` Alexei Starovoitov
@ 2021-05-21 21:37 ` Cong Wang
  2021-05-23 16:01   ` Alexei Starovoitov
  2021-05-22 16:06 ` kernel test robot
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 79+ messages in thread
From: Cong Wang @ 2021-05-21 21:37 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Lorenz Bauer, Linux Kernel Network Developers, bpf, kernel-team

Hi, Alexei

On Thu, May 20, 2021 at 11:52 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>

Why do you intentionally keep people in the original discussion
out of your CC? Remember you are the one who objected to the
idea by questioning its usefulness no matter how hard I tried
to explain? I am glad you changed your mind, but it does not
mean you should forget to credit other people.

>
> Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> and helpers to operate on it:
> long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> long bpf_timer_del(struct bpf_timer *timer)

Like we discussed, this approach would make the timer harder
to be independent of other eBPF programs, which is a must-have
for both of our use cases (mine and Jamal's). Like you explained,
this requires at least another program array, a tail call, a mandatory
prog pinning to work.

So, why do you prefer to make it harder to use?

BTW, I have a V2 to send out soon and will keep you in CC, which
still creates timers from user-space.

Thanks.

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [RFC PATCH bpf-next] bpf: Introduce bpf_timer
  2021-05-20 18:55 [RFC PATCH bpf-next] bpf: Introduce bpf_timer Alexei Starovoitov
@ 2021-05-21 14:38 ` Alexei Starovoitov
  2021-05-21 21:37 ` Cong Wang
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-21 14:38 UTC (permalink / raw)
  To: David S. Miller
  Cc: Daniel Borkmann, Andrii Nakryiko, John Fastabend, Lorenz Bauer,
	Network Development, bpf, Kernel Team

On Thu, May 20, 2021 at 11:55 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce 'struct bpf_timer' that can be embedded in most BPF map types
> and helpers to operate on it:
> long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
> long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
> long bpf_timer_del(struct bpf_timer *timer)
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
> This is work in progress, but gives an idea on how API will look.

Forgot to mention the todo list:
- restrict to cap_bpf
- kfree bpf_timer_list
- verifier btf checks
- restrict to array, hash, lru, lpm. per-cpu maps cannot be supported.
- safe interaction with lookup/update/delete operations and iterator
- relax the 'first field only' requirement to allow bpf_timer in global data.
  kinda without a map.
- check prog_rdonly, frozen, mmaped flags
- decide on a return value from the timer callback
- more tests

^ permalink raw reply	[flat|nested] 79+ messages in thread

* [RFC PATCH bpf-next] bpf: Introduce bpf_timer
@ 2021-05-20 18:55 Alexei Starovoitov
  2021-05-21 14:38 ` Alexei Starovoitov
                   ` (5 more replies)
  0 siblings, 6 replies; 79+ messages in thread
From: Alexei Starovoitov @ 2021-05-20 18:55 UTC (permalink / raw)
  To: davem; +Cc: daniel, andrii, john.fastabend, lmb, netdev, bpf, kernel-team

From: Alexei Starovoitov <ast@kernel.org>

Introduce 'struct bpf_timer' that can be embedded in most BPF map types
and helpers to operate on it:
long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
long bpf_timer_del(struct bpf_timer *timer)

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
This is work in progress, but gives an idea on how API will look.
---
 include/linux/bpf.h                           |   1 +
 include/uapi/linux/bpf.h                      |  25 ++++
 kernel/bpf/helpers.c                          | 106 +++++++++++++++++
 kernel/bpf/verifier.c                         | 110 ++++++++++++++++++
 kernel/trace/bpf_trace.c                      |   2 +-
 scripts/bpf_doc.py                            |   2 +
 tools/include/uapi/linux/bpf.h                |  25 ++++
 .../testing/selftests/bpf/prog_tests/timer.c  |  42 +++++++
 tools/testing/selftests/bpf/progs/timer.c     |  53 +++++++++
 9 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/timer.c
 create mode 100644 tools/testing/selftests/bpf/progs/timer.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9dc44ba97584..18e09cc0c410 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -312,6 +312,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_FUNC,	/* pointer to a bpf program function */
 	ARG_PTR_TO_STACK_OR_NULL,	/* pointer to stack or NULL */
 	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
+	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
 	__BPF_ARG_TYPE_MAX,
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 418b9b813d65..c95d7854d9fb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4761,6 +4761,24 @@ union bpf_attr {
  * 		Execute close syscall for given FD.
  * 	Return
  * 		A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
+ *	Description
+ *		Initialize the timer to call given static function.
+ *	Return
+ *		zero
+ *
+ * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
+ *	Description
+ *		Set the timer expiration N msecs from the current time.
+ *	Return
+ *		zero
+ *
+ * long bpf_timer_del(struct bpf_timer *timer)
+ *	Description
+ *		Deactivate the timer.
+ *	Return
+ *		zero
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4932,6 +4950,9 @@ union bpf_attr {
 	FN(sys_bpf),			\
 	FN(btf_find_by_name_kind),	\
 	FN(sys_close),			\
+	FN(timer_init),			\
+	FN(timer_mod),			\
+	FN(timer_del),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6038,6 +6059,10 @@ struct bpf_spin_lock {
 	__u32	val;
 };
 
+struct bpf_timer {
+	__u64 opaque;
+};
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 544773970dbc..8ef0ad23c991 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -985,6 +985,106 @@ const struct bpf_func_proto bpf_snprintf_proto = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+struct bpf_timer_list {
+	struct timer_list tl;
+	struct bpf_map *map;
+	struct bpf_prog *prog;
+	void *callback_fn;
+	void *key;
+	void *value;
+};
+
+static void timer_cb(struct timer_list *timer)
+{
+	struct bpf_timer_list *tl = from_timer(tl, timer, tl);
+	struct bpf_map *map;
+	int ret;
+
+	ret = BPF_CAST_CALL(tl->callback_fn)((u64)(long)tl->map,
+					     (u64)(long)tl->key,
+					     (u64)(long)tl->value, 0, 0);
+	WARN_ON(ret != 0); /* todo: define 0 vs 1 or disallow 1 in the verifier */
+	bpf_prog_put(tl->prog);
+}
+
+BPF_CALL_5(bpf_timer_init, struct bpf_timer *, timer, void *, cb, int, flags,
+	   struct bpf_map *, map, struct bpf_prog *, prog)
+{
+	struct bpf_timer_list *tl;
+
+	if (timer->opaque)
+		return -EBUSY;
+	tl = kcalloc(1, sizeof(*tl), GFP_ATOMIC);
+	if (!tl)
+		return -ENOMEM;
+	tl->callback_fn = cb;
+	tl->value = (void *)timer /* - offset of bpf_timer inside elem */;
+	tl->key = tl->value - round_up(map->key_size, 8);
+	tl->map = map;
+	tl->prog = prog;
+	timer_setup(&tl->tl, timer_cb, 0);
+	timer->opaque = (long)tl;
+	return 0;
+}
+
+const struct bpf_func_proto bpf_timer_init_proto = {
+	.func		= bpf_timer_init,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_timer_mod, struct bpf_timer *, timer, u64, msecs)
+{
+	struct bpf_timer_list *tl;
+
+	tl = (struct bpf_timer_list *)timer->opaque;
+	if (!tl)
+		return -EINVAL;
+	/* keep the prog alive until callback is invoked */
+	if (!mod_timer(&tl->tl, jiffies + msecs_to_jiffies(msecs))) {
+		/* The timer was inactive.
+		 * Keep the prog alive until callback is invoked
+		 */
+		bpf_prog_inc(tl->prog);
+	}
+	return 0;
+}
+
+const struct bpf_func_proto bpf_timer_mod_proto = {
+	.func		= bpf_timer_mod,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_timer_del, struct bpf_timer *, timer)
+{
+	struct bpf_timer_list *tl;
+
+	tl = (struct bpf_timer_list *)timer->opaque;
+	if (!tl)
+		return -EINVAL;
+	if (del_timer(&tl->tl)) {
+		/* The timer was active,
+		 * drop the prog refcnt, since callback
+		 * will not be invoked.
+		 */
+		bpf_prog_put(tl->prog);
+	}
+	return 0;
+}
+
+const struct bpf_func_proto bpf_timer_del_proto = {
+	.func		= bpf_timer_del,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1033,6 +1133,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ringbuf_query_proto;
 	case BPF_FUNC_for_each_map_elem:
 		return &bpf_for_each_map_elem_proto;
+	case BPF_FUNC_timer_init:
+		return &bpf_timer_init_proto;
+	case BPF_FUNC_timer_mod:
+		return &bpf_timer_mod_proto;
+	case BPF_FUNC_timer_del:
+		return &bpf_timer_del_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9189eecb26dd..606c713be60a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4656,6 +4656,35 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+			      struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (val) {
+		/* todo: relax this requirement */
+		verbose(env, "bpf_timer field can only be first in the map value element\n");
+		return -EINVAL;
+	}
+	WARN_ON(meta->map_ptr);
+	meta->map_ptr = map;
+	return 0;
+}
+
 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
 {
 	return type == ARG_PTR_TO_MEM ||
@@ -4788,6 +4817,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
 static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
 static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -4819,6 +4849,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_FUNC]		= &func_ptr_types,
 	[ARG_PTR_TO_STACK_OR_NULL]	= &stack_ptr_types,
 	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
+	[ARG_PTR_TO_TIMER]		= &timer_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -5000,6 +5031,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			verbose(env, "verifier internal error\n");
 			return -EFAULT;
 		}
+	} else if (arg_type == ARG_PTR_TO_TIMER) {
+		if (process_timer_func(env, regno, meta))
+			return -EACCES;
 	} else if (arg_type == ARG_PTR_TO_FUNC) {
 		meta->subprogno = reg->subprogno;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
@@ -5742,6 +5776,43 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_timer_init_callback_state(struct bpf_verifier_env *env,
+					 struct bpf_func_state *caller,
+					 struct bpf_func_state *callee,
+					 int insn_idx)
+{
+	struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+	struct bpf_map *map_ptr;
+
+	if (bpf_map_ptr_poisoned(insn_aux)) {
+		verbose(env, "bpf_timer_init abusing map_ptr\n");
+		return -EINVAL;
+	}
+
+	map_ptr = BPF_MAP_PTR(insn_aux->map_ptr_state);
+
+	/* bpf_timer_init(struct bpf_timer *timer, void *callback_fn, u64 flags);
+	 * callback_fn(struct bpf_map *map, void *key, void *value);
+	 */
+	callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+	callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+	callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+	callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+	callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+	callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -5837,6 +5908,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	    func_id != BPF_FUNC_map_pop_elem &&
 	    func_id != BPF_FUNC_map_peek_elem &&
 	    func_id != BPF_FUNC_for_each_map_elem &&
+	    func_id != BPF_FUNC_timer_init &&
 	    func_id != BPF_FUNC_redirect_map)
 		return 0;
 
@@ -6069,6 +6141,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_timer_init) {
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_timer_init_callback_state);
+		if (err < 0)
+			return -EINVAL;
+	}
+
 	if (func_id == BPF_FUNC_snprintf) {
 		err = check_bpf_snprintf_call(env, regs);
 		if (err < 0)
@@ -12526,6 +12605,37 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			insn      = new_prog->insnsi + i + delta;
 			continue;
 		}
+		if (insn->imm == BPF_FUNC_timer_init) {
+
+			aux = &env->insn_aux_data[i + delta];
+			if (bpf_map_ptr_poisoned(aux)) {
+				verbose(env, "bpf_timer_init abusing map_ptr\n");
+				return -EINVAL;
+			}
+			map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+			{
+				struct bpf_insn ld_addrs[4] = {
+					BPF_LD_IMM64(BPF_REG_4, (long)map_ptr),
+					BPF_LD_IMM64(BPF_REG_5, (long)prog),
+				};
+
+				insn_buf[0] = ld_addrs[0];
+				insn_buf[1] = ld_addrs[1];
+				insn_buf[2] = ld_addrs[2];
+				insn_buf[3] = ld_addrs[3];
+			}
+			insn_buf[4] = *insn;
+			cnt = 5;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
 
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d2d7cf6cfe83..453a46c2d732 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1065,7 +1065,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_snprintf:
 		return &bpf_snprintf_proto;
 	default:
-		return NULL;
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 2d94025b38e9..00ac7b79cddb 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -547,6 +547,7 @@ COMMANDS
             'struct inode',
             'struct socket',
             'struct file',
+            'struct bpf_timer',
     ]
     known_types = {
             '...',
@@ -594,6 +595,7 @@ COMMANDS
             'struct inode',
             'struct socket',
             'struct file',
+            'struct bpf_timer',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 418b9b813d65..c95d7854d9fb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4761,6 +4761,24 @@ union bpf_attr {
  * 		Execute close syscall for given FD.
  * 	Return
  * 		A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, void *callback_fn, u64 flags)
+ *	Description
+ *		Initialize the timer to call given static function.
+ *	Return
+ *		zero
+ *
+ * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
+ *	Description
+ *		Set the timer expiration N msecs from the current time.
+ *	Return
+ *		zero
+ *
+ * long bpf_timer_del(struct bpf_timer *timer)
+ *	Description
+ *		Deactivate the timer.
+ *	Return
+ *		zero
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4932,6 +4950,9 @@ union bpf_attr {
 	FN(sys_bpf),			\
 	FN(btf_find_by_name_kind),	\
 	FN(sys_close),			\
+	FN(timer_init),			\
+	FN(timer_mod),			\
+	FN(timer_del),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6038,6 +6059,10 @@ struct bpf_spin_lock {
 	__u32	val;
 };
 
+/* Embedded in a map value to hold per-element timer state.  The field
+ * is named "opaque", so BPF programs presumably must not interpret or
+ * modify it directly — only pass &timer to the bpf_timer_* helpers.
+ */
+struct bpf_timer {
+	__u64 opaque;
+};
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
new file mode 100644
index 000000000000..6b7a16a54e70
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include "timer.skel.h"
+
+/* Attach the skeleton, run the test1 program once, and verify that the
+ * timer callback decremented callback_check the expected number of times.
+ */
+static int timer(struct timer *timer_skel)
+{
+	__u32 retval, duration = 0;
+	int ret, fd;
+
+	ret = timer__attach(timer_skel);
+	if (!ASSERT_OK(ret, "timer_attach"))
+		return ret;
+
+	fd = bpf_program__fd(timer_skel->progs.test1);
+	ret = bpf_prog_test_run(fd, 1, NULL, 0, NULL, NULL,
+				&retval, &duration);
+	ASSERT_OK(ret, "test_run");
+	ASSERT_EQ(retval, 0, "test_run");
+
+	/* Right after the run the callback has not fired yet. */
+	ASSERT_EQ(timer_skel->data->callback_check, 52, "callback_check1");
+	usleep(50 * 1000); /* 10 msecs should be enough, but give it extra */
+	ASSERT_EQ(timer_skel->data->callback_check, 42, "callback_check2");
+
+	timer__detach(timer_skel);
+	return 0;
+}
+
+/* Selftest entry point: load the skeleton, run the timer scenario and
+ * destroy the skeleton on every exit path.
+ */
+void test_timer(void)
+{
+	struct timer *skel;
+	int err;
+
+	skel = timer__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "timer_skel_load"))
+		goto cleanup;
+
+	err = timer(skel);
+	ASSERT_OK(err, "timer");
+cleanup:
+	timer__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
new file mode 100644
index 000000000000..2cf0634f10c9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+/* Map value: a timer embedded next to the data it operates on. */
+struct map_elem {
+	struct bpf_timer timer;
+	int counter;	/* remaining number of timer_cb invocations */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 1000);
+	__type(key, int);
+	__type(value, struct map_elem);
+} hmap SEC(".maps");
+
+/* Decremented once per callback run; read from user space by the test. */
+__u64 callback_check = 52;
+
+/* Timer callback: decrement the global check counter and keep re-arming
+ * the timer until val->counter drops to zero.
+ */
+static int timer_cb(struct bpf_map *map, int *key, struct map_elem *val)
+{
+	callback_check -= 1;
+	val->counter -= 1;
+	if (val->counter != 0) {
+		/* re-arm the timer again to execute after 1 msec */
+		bpf_timer_mod(&val->timer, 1);
+	}
+	return 0;
+}
+
+/* Look up the single element at key 0 and arm its timer to fire in 1 msec. */
+int bpf_timer_test(void)
+{
+	int key = 0;
+	struct map_elem *val = bpf_map_lookup_elem(&hmap, &key);
+
+	if (!val)
+		return 0;
+
+	bpf_timer_init(&val->timer, timer_cb, 0);
+	bpf_timer_mod(&val->timer, 1);
+	return 0;
+}
+
+/* Attached to bpf_fentry_test1: seed the map with an element whose
+ * counter tells timer_cb how many times to fire, then arm the timer.
+ */
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test1, int a)
+{
+	struct map_elem val = {};
+	int key = 0;
+
+	/* ';' not ',': the original stray comma operator silently fused
+	 * this assignment with the map-update call into one expression.
+	 */
+	val.counter = 10; /* number of times to trigger timer_cb */
+	bpf_map_update_elem(&hmap, &key, &val, 0);
+	return bpf_timer_test();
+}
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 79+ messages in thread

end of thread, other threads:[~2021-08-11 21:03 UTC | newest]

Thread overview: 79+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-01  4:26 [RFC Patch bpf-next] bpf: introduce bpf timer Cong Wang
2021-04-01  6:38 ` Song Liu
2021-04-01 17:28   ` Cong Wang
2021-04-01 20:17     ` Song Liu
2021-04-02 17:34       ` Cong Wang
2021-04-02 17:57         ` Song Liu
2021-04-02 19:08           ` Cong Wang
2021-04-02 19:43             ` Song Liu
2021-04-02 20:57               ` Cong Wang
2021-04-02 23:31                 ` Song Liu
2021-04-05 23:49                   ` Cong Wang
2021-04-06  1:07                     ` Song Liu
2021-04-06  1:24                       ` Cong Wang
2021-04-06  6:17                         ` Song Liu
2021-04-06 16:48                           ` Cong Wang
2021-04-06 23:36                             ` Song Liu
2021-04-08 22:45                               ` Cong Wang
2021-04-02 19:28 ` Alexei Starovoitov
2021-04-02 21:24   ` Cong Wang
2021-04-02 23:45     ` Alexei Starovoitov
2021-04-06  0:36       ` Cong Wang
2021-04-12 23:01         ` Alexei Starovoitov
2021-04-15  4:02           ` Cong Wang
2021-04-15  4:25             ` Alexei Starovoitov
2021-04-15 15:51               ` Cong Wang
2021-04-26 23:00               ` Cong Wang
2021-04-26 23:05                 ` Alexei Starovoitov
2021-04-26 23:37                   ` Cong Wang
2021-04-27  2:01                     ` Alexei Starovoitov
2021-04-27 11:52                       ` Jamal Hadi Salim
2021-04-27 16:36                       ` Cong Wang
2021-04-27 18:33                         ` Alexei Starovoitov
2021-05-09  5:37                           ` Cong Wang
2021-05-10 20:55                             ` Jamal Hadi Salim
2021-05-11 21:29                               ` Cong Wang
2021-05-12 22:56                                 ` Jamal Hadi Salim
2021-05-11  5:05                             ` Joe Stringer
2021-05-11 21:08                               ` Cong Wang
2021-05-12 22:43                               ` Jamal Hadi Salim
2021-05-13 18:45                                 ` Jamal Hadi Salim
2021-05-14  2:53                                   ` Cong Wang
2021-08-11 21:03                                     ` Joe Stringer
2021-05-20 18:55 [RFC PATCH bpf-next] bpf: Introduce bpf_timer Alexei Starovoitov
2021-05-21 14:38 ` Alexei Starovoitov
2021-05-21 21:37 ` Cong Wang
2021-05-23 16:01   ` Alexei Starovoitov
2021-05-24  8:45     ` Lorenz Bauer
2021-05-25  3:16     ` Cong Wang
2021-05-25  4:59       ` Cong Wang
2021-05-25 18:21         ` Alexei Starovoitov
2021-05-25 19:35           ` Jamal Hadi Salim
2021-05-25 19:57             ` Alexei Starovoitov
2021-05-25 21:09               ` Jamal Hadi Salim
2021-05-25 22:08                 ` Alexei Starovoitov
2021-05-26 15:34                   ` Jamal Hadi Salim
2021-05-26 16:58                     ` Alexei Starovoitov
2021-05-26 18:25                       ` Jamal Hadi Salim
2021-05-30  6:36           ` Cong Wang
2021-06-02  2:00             ` Alexei Starovoitov
2021-06-02  8:48               ` Toke Høiland-Jørgensen
2021-06-02 17:54                 ` Martin KaFai Lau
2021-06-02 18:13                   ` Kumar Kartikeya Dwivedi
2021-06-02 18:26                     ` Alexei Starovoitov
2021-06-02 18:30                       ` Kumar Kartikeya Dwivedi
2021-06-02 18:46                     ` John Fastabend
2021-05-22 16:06 ` kernel test robot
2021-05-22 16:41 ` kernel test robot
2021-05-23 11:48 ` Toke Høiland-Jørgensen
2021-05-23 15:58   ` Alexei Starovoitov
2021-05-24  8:42     ` Lorenz Bauer
2021-05-24 14:48       ` Alexei Starovoitov
2021-05-24 17:33     ` Alexei Starovoitov
2021-05-24 18:39       ` Toke Høiland-Jørgensen
2021-05-24 18:38     ` Toke Høiland-Jørgensen
2021-05-24 11:49 ` Lorenz Bauer
2021-05-24 14:56   ` Alexei Starovoitov
2021-05-24 19:13     ` Andrii Nakryiko
2021-05-25  5:22       ` Cong Wang
2021-05-25 19:47         ` Andrii Nakryiko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.