Re: [PATCH v2 bpf-next 1/3] bpf: Introduce bpf_timer

From: Yonghong Song <yhs@fb.com>
To: Alexei Starovoitov <alexei.starovoitov@gmail.com>, <davem@davemloft.net>
Cc: <daniel@iogearbox.net>, <andrii@kernel.org>,
	<netdev@vger.kernel.org>, <bpf@vger.kernel.org>,
	<kernel-team@fb.com>
Subject: Re: [PATCH v2 bpf-next 1/3] bpf: Introduce bpf_timer
Date: Fri, 11 Jun 2021 15:12:05 -0700	[thread overview]
Message-ID: <f36d19e7-cc6f-730b-cf13-d77e1ce88d2f@fb.com> (raw)
In-Reply-To: <20210611042442.65444-2-alexei.starovoitov@gmail.com>

On 6/10/21 9:24 PM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded
> in hash/array/lru maps as regular field and helpers to operate on it:
> 
> // Initialize the timer to call 'callback_fn' static function
> // First 4 bits of 'flags' specify clockid.
> // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
> long bpf_timer_init(struct bpf_timer *timer, void *callback_fn, int flags);
> 
> // Start the timer and set its expiration 'nsec' nanoseconds from the current time.
> long bpf_timer_start(struct bpf_timer *timer, u64 nsec);
> 
> // Cancel the timer and wait for callback_fn to finish if it was running.
> long bpf_timer_cancel(struct bpf_timer *timer);
> 
> Here is how BPF program might look like:
> struct map_elem {
>      int counter;
>      struct bpf_timer timer;
> };
> 
> struct {
>      __uint(type, BPF_MAP_TYPE_HASH);
>      __uint(max_entries, 1000);
>      __type(key, int);
>      __type(value, struct map_elem);
> } hmap SEC(".maps");
> 
> static int timer_cb(void *map, int *key, struct map_elem *val);
> /* val points to particular map element that contains bpf_timer. */
> 
> SEC("fentry/bpf_fentry_test1")
> int BPF_PROG(test1, int a)
> {
>      struct map_elem *val;
>      int key = 0;
> 
>      val = bpf_map_lookup_elem(&hmap, &key);
>      if (val) {
>          bpf_timer_init(&val->timer, timer_cb, CLOCK_REALTIME);
>          bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */);
>      }
> }
> 
> This patch adds helper implementations that rely on hrtimers
> to call bpf functions as timers expire.
> The following patch adds necessary safety checks.
> 
> Only programs with CAP_BPF are allowed to use bpf_timer.
> 
> The amount of timers used by the program is constrained by
> the memcg recorded at map creation time.
> 
> The bpf_timer_init() helper is receiving hidden 'map' and 'prog' arguments
> supplied by the verifier. The prog pointer is needed to do refcnting of bpf
> program to make sure that program doesn't get freed while timer is armed.
> 
> The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel
> and free the timer if given map element had it allocated.
> "bpftool map update" command can be used to cancel timers.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>   include/linux/bpf.h            |   2 +
>   include/uapi/linux/bpf.h       |  40 ++++++
>   kernel/bpf/helpers.c           | 227 +++++++++++++++++++++++++++++++++
>   kernel/bpf/verifier.c          | 109 ++++++++++++++++
>   kernel/trace/bpf_trace.c       |   2 +-
>   scripts/bpf_doc.py             |   2 +
>   tools/include/uapi/linux/bpf.h |  40 ++++++
>   7 files changed, 421 insertions(+), 1 deletion(-)
> 
[...]
>   
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 2c1ba70abbf1..d25bbcdad8e6 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -4778,6 +4778,38 @@ union bpf_attr {
>    * 		Execute close syscall for given FD.
>    * 	Return
>    * 		A syscall result.
> + *
> + * long bpf_timer_init(struct bpf_timer *timer, void *callback_fn, int flags)
> + *	Description
> + *		Initialize the timer to call *callback_fn* static function.
> + *		First 4 bits of *flags* specify clockid. Only CLOCK_MONOTONIC,
> + *		CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
> + *		All other bits of *flags* are reserved.
> + *	Return
> + *		0 on success.
> + *		**-EBUSY** if *timer* is already initialized.
> + *		**-EINVAL** if invalid *flags* are passed.
> + *
> + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs)
> + *	Description
> + *		Start the timer and set its expiration N nanoseconds from the
> + *		current time. The timer callback_fn will be invoked in soft irq
> + *		context on some cpu and will not repeat unless another
> + *		bpf_timer_start() is made. In such case the next invocation can
> + *		migrate to a different cpu.
> + *	Return
> + *		0 on success.
> + *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
> + *
> + * long bpf_timer_cancel(struct bpf_timer *timer)
> + *	Description
> + *		Cancel the timer and wait for callback_fn to finish if it was running.
> + *	Return
> + *		0 if the timer was not active.
> + *		1 if the timer was active.
> + *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
> + *		**-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its own timer
> + *		which would have led to a deadlock otherwise.
>    */
>   #define __BPF_FUNC_MAPPER(FN)		\
>   	FN(unspec),			\
> @@ -4949,6 +4981,9 @@ union bpf_attr {
>   	FN(sys_bpf),			\
>   	FN(btf_find_by_name_kind),	\
>   	FN(sys_close),			\
> +	FN(timer_init),			\
> +	FN(timer_start),		\
> +	FN(timer_cancel),		\
>   	/* */
>   
>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> @@ -6061,6 +6096,11 @@ struct bpf_spin_lock {
>   	__u32	val;
>   };
>   
> +struct bpf_timer {
> +	__u64 :64;
> +	__u64 :64;
> +};
> +
>   struct bpf_sysctl {
>   	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
>   				 * Allows 1,2,4-byte read, but no write.
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 544773970dbc..3a693d451ca3 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -985,6 +985,227 @@ const struct bpf_func_proto bpf_snprintf_proto = {
>   	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
>   };
>   
> +struct bpf_hrtimer {
> +	struct hrtimer timer;
> +	struct bpf_map *map;
> +	struct bpf_prog *prog;
> +	void *callback_fn;
> +	void *value;
> +};
> +
> +/* the actual struct hidden inside uapi struct bpf_timer */
> +struct bpf_timer_kern {
> +	struct bpf_hrtimer *timer;
> +	struct bpf_spin_lock lock;
> +};

Looks like in 32bit system, sizeof(struct bpf_timer_kern) is 64
and sizeof(struct bpf_timer) is 128.

struct bpf_spin_lock {
         __u32   val;
};

struct bpf_timer {
	__u64 :64;
	__u64 :64;
};

Checking the code, we may not have issues as structure
"bpf_timer" is only used to reserve spaces and
map copy value routine handles that properly.

Maybe we can still make it consistent with
two fields in bpf_timer_kern mapping to
two fields in bpf_timer?

struct bpf_timer_kern {
	__bpf_md_ptr(struct bpf_hrtimer *, timer);
	struct bpf_spin_lock lock;
};

> +
> +static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
> +
> +static enum hrtimer_restart bpf_timer_cb(struct hrtimer *timer)
> +{
> +	struct bpf_hrtimer *t = container_of(timer, struct bpf_hrtimer, timer);
> +	struct bpf_prog *prog = t->prog;
> +	struct bpf_map *map = t->map;
> +	void *key;
> +	u32 idx;
> +	int ret;
> +
> +	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
> +	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
> +	 * Remember the timer this callback is servicing to prevent
> +	 * deadlock if callback_fn() calls bpf_timer_cancel() on the same timer.
> +	 */
> +	this_cpu_write(hrtimer_running, t);
> +	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
> +		struct bpf_array *array = container_of(map, struct bpf_array, map);
> +
> +		/* compute the key */
> +		idx = ((char *)t->value - array->value) / array->elem_size;
> +		key = &idx;
> +	} else { /* hash or lru */
> +		key = t->value - round_up(map->key_size, 8);
> +	}
> +
> +	ret = BPF_CAST_CALL(t->callback_fn)((u64)(long)map,
> +					    (u64)(long)key,
> +					    (u64)(long)t->value, 0, 0);
> +	WARN_ON(ret != 0); /* Next patch disallows 1 in the verifier */
> +
> +	/* The bpf function finished executed. Drop the prog refcnt.
> +	 * It could reach zero here and trigger free of bpf_prog
> +	 * and subsequent free of the maps that were holding timers.
> +	 * If callback_fn called bpf_timer_start on this timer
> +	 * the prog refcnt will be > 0.
> +	 *
> +	 * If callback_fn deleted map element the 't' could have been freed,
> +	 * hence t->prog deref is done earlier.
> +	 */
> +	bpf_prog_put(prog);
> +	this_cpu_write(hrtimer_running, NULL);
> +	return HRTIMER_NORESTART;
> +}
> +
> +BPF_CALL_5(bpf_timer_init, struct bpf_timer_kern *, timer, void *, cb, int, flags,
> +	   struct bpf_map *, map, struct bpf_prog *, prog)
> +{
[...]