Re: [PATCH v2 bpf-next 1/3] bpf: Introduce bpf_timer

From: Yonghong Song <yhs@fb.com>
To: Alexei Starovoitov <alexei.starovoitov@gmail.com>, <davem@davemloft.net>
Cc: <daniel@iogearbox.net>, <andrii@kernel.org>,
	<netdev@vger.kernel.org>, <bpf@vger.kernel.org>,
	<kernel-team@fb.com>
Subject: Re: [PATCH v2 bpf-next 1/3] bpf: Introduce bpf_timer
Date: Mon, 14 Jun 2021 09:51:37 -0700	[thread overview]
Message-ID: <9b23b2c6-28b2-3ab3-4e8b-1fa0c926c4d2@fb.com> (raw)
In-Reply-To: <20210611042442.65444-2-alexei.starovoitov@gmail.com>

On 6/10/21 9:24 PM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded
> in hash/array/lru maps as regular field and helpers to operate on it:
> 
> // Initialize the timer to call 'callback_fn' static function
> // First 4 bits of 'flags' specify clockid.
> // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
> long bpf_timer_init(struct bpf_timer *timer, void *callback_fn, int flags);
> 
> // Start the timer and set its expiration 'nsec' nanoseconds from the current time.
> long bpf_timer_start(struct bpf_timer *timer, u64 nsec);
> 
> // Cancel the timer and wait for callback_fn to finish if it was running.
> long bpf_timer_cancel(struct bpf_timer *timer);
> 
> Here is how BPF program might look like:
> struct map_elem {
>      int counter;
>      struct bpf_timer timer;
> };
> 
> struct {
>      __uint(type, BPF_MAP_TYPE_HASH);
>      __uint(max_entries, 1000);
>      __type(key, int);
>      __type(value, struct map_elem);
> } hmap SEC(".maps");
> 
> static int timer_cb(void *map, int *key, struct map_elem *val);
> /* val points to particular map element that contains bpf_timer. */
> 
> SEC("fentry/bpf_fentry_test1")
> int BPF_PROG(test1, int a)
> {
>      struct map_elem *val;
>      int key = 0;
> 
>      val = bpf_map_lookup_elem(&hmap, &key);
>      if (val) {
>          bpf_timer_init(&val->timer, timer_cb, CLOCK_REALTIME);
>          bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */);
>      }
> }
> 
> This patch adds helper implementations that rely on hrtimers
> to call bpf functions as timers expire.
> The following patch adds necessary safety checks.
> 
> Only programs with CAP_BPF are allowed to use bpf_timer.
> 
> The amount of timers used by the program is constrained by
> the memcg recorded at map creation time.
> 
> The bpf_timer_init() helper is receiving hidden 'map' and 'prog' arguments
> supplied by the verifier. The prog pointer is needed to do refcnting of bpf
> program to make sure that program doesn't get freed while timer is armed.
> 
> The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel
> and free the timer if given map element had it allocated.
> "bpftool map update" command can be used to cancel timers.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>   include/linux/bpf.h            |   2 +
>   include/uapi/linux/bpf.h       |  40 ++++++
>   kernel/bpf/helpers.c           | 227 +++++++++++++++++++++++++++++++++
>   kernel/bpf/verifier.c          | 109 ++++++++++++++++
>   kernel/trace/bpf_trace.c       |   2 +-
>   scripts/bpf_doc.py             |   2 +
>   tools/include/uapi/linux/bpf.h |  40 ++++++
>   7 files changed, 421 insertions(+), 1 deletion(-)
> 
[...]
> +
> +static enum hrtimer_restart bpf_timer_cb(struct hrtimer *timer)
> +{
> +	struct bpf_hrtimer *t = container_of(timer, struct bpf_hrtimer, timer);
> +	struct bpf_prog *prog = t->prog;
> +	struct bpf_map *map = t->map;
> +	void *key;
> +	u32 idx;
> +	int ret;
> +
> +	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
> +	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
> +	 * Remember the timer this callback is servicing to prevent
> +	 * deadlock if callback_fn() calls bpf_timer_cancel() on the same timer.
> +	 */
> +	this_cpu_write(hrtimer_running, t);
> +	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
> +		struct bpf_array *array = container_of(map, struct bpf_array, map);
> +
> +		/* compute the key */
> +		idx = ((char *)t->value - array->value) / array->elem_size;
> +		key = &idx;
> +	} else { /* hash or lru */
> +		key = t->value - round_up(map->key_size, 8);
> +	}
> +
> +	ret = BPF_CAST_CALL(t->callback_fn)((u64)(long)map,
> +					    (u64)(long)key,
> +					    (u64)(long)t->value, 0, 0);
> +	WARN_ON(ret != 0); /* Next patch disallows 1 in the verifier */

I didn't find that next patch disallows callback return value 1 in the 
verifier. If we indeed disallows return value 1 in the verifier. We
don't need WARN_ON here. Did I miss anything?

> +
> +	/* The bpf function finished executed. Drop the prog refcnt.
> +	 * It could reach zero here and trigger free of bpf_prog
> +	 * and subsequent free of the maps that were holding timers.
> +	 * If callback_fn called bpf_timer_start on this timer
> +	 * the prog refcnt will be > 0.
> +	 *
> +	 * If callback_fn deleted map element the 't' could have been freed,
> +	 * hence t->prog deref is done earlier.
> +	 */
> +	bpf_prog_put(prog);
> +	this_cpu_write(hrtimer_running, NULL);
> +	return HRTIMER_NORESTART;
> +}
> +
> +BPF_CALL_5(bpf_timer_init, struct bpf_timer_kern *, timer, void *, cb, int, flags,
> +	   struct bpf_map *, map, struct bpf_prog *, prog)
> +{
> +	clockid_t clockid = flags & (MAX_CLOCKS - 1);
> +	struct bpf_hrtimer *t;
> +	int ret = 0;
> +
> +	BUILD_BUG_ON(MAX_CLOCKS != 16);
> +	if (flags >= MAX_CLOCKS ||
> +	    /* similar to timerfd except _ALARM variants are not supported */
> +            (clockid != CLOCK_MONOTONIC &&
> +             clockid != CLOCK_REALTIME &&
> +             clockid != CLOCK_BOOTTIME))
> +		return -EINVAL;
> +	____bpf_spin_lock(&timer->lock);
> +	t = timer->timer;
> +	if (t) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +	/* allocate hrtimer via map_kmalloc to use memcg accounting */
> +	t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, NUMA_NO_NODE);
> +	if (!t) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	t->callback_fn = cb;
> +	t->value = (void *)timer /* - offset of bpf_timer inside elem */;
> +	t->map = map;
> +	t->prog = prog;
> +	hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
> +	t->timer.function = bpf_timer_cb;
> +	timer->timer = t;
> +out:
> +	____bpf_spin_unlock(&timer->lock);
> +	return ret;
> +}
> +
> +static const struct bpf_func_proto bpf_timer_init_proto = {
> +	.func		= bpf_timer_init,
> +	.gpl_only	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_TIMER,
> +	.arg2_type	= ARG_PTR_TO_FUNC,
> +	.arg3_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_2(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs)
> +{
> +	struct bpf_hrtimer *t;
> +	int ret = 0;
> +
> +	____bpf_spin_lock(&timer->lock);
> +	t = timer->timer;
> +	if (!t) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	if (!hrtimer_active(&t->timer) || hrtimer_callback_running(&t->timer))
> +		/* If the timer wasn't active or callback already executing
> +		 * bump the prog refcnt to keep it alive until
> +		 * callback is invoked (again).
> +		 */
> +		bpf_prog_inc(t->prog);

I am not 100% sure. But could we have race condition here?
    cpu 1: running bpf_timer_start() helper call
    cpu 2: doing hrtimer work (calling callback etc.)

Is it possible that
   !hrtimer_active(&t->timer) || hrtimer_callback_running(&t->timer)
may be true and then right before bpf_prog_inc(t->prog), it becomes 
true? If hrtimer_callback_running() is called, it is possible that
callback function could have dropped the reference count for t->prog,
so we could already go into the body of the function
__bpf_prog_put()?

static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
         if (atomic64_dec_and_test(&prog->aux->refcnt)) {
                 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
                 bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
                 /* bpf_prog_free_id() must be called first */
                 bpf_prog_free_id(prog, do_idr_lock);
                 __bpf_prog_put_noref(prog, true);
         }
}

> +	hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
> +out:
> +	____bpf_spin_unlock(&timer->lock);
> +	return ret;
> +}
> +
> +static const struct bpf_func_proto bpf_timer_start_proto = {
> +	.func		= bpf_timer_start,
> +	.gpl_only	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_TIMER,
> +	.arg2_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
> +{
> +	struct bpf_hrtimer *t;
> +	int ret = 0;
> +
> +	____bpf_spin_lock(&timer->lock);
> +	t = timer->timer;
> +	if (!t) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	if (this_cpu_read(hrtimer_running) == t) {
> +		/* If bpf callback_fn is trying to bpf_timer_cancel()
> +		 * its own timer the hrtimer_cancel() will deadlock
> +		 * since it waits for callback_fn to finish
> +		 */
> +		ret = -EDEADLK;
> +		goto out;
> +	}
> +	/* Cancel the timer and wait for associated callback to finish
> +	 * if it was running.
> +	 */
> +	if (hrtimer_cancel(&t->timer) == 1) {

Again, could we have race here between bpf program and hrtimer_cancel()?

> +		/* If the timer was active then drop the prog refcnt,
> +		 * since callback will not be invoked.
> +		 */
> +		bpf_prog_put(t->prog);
> +		ret = 1;
> +	}
> +out:
> +	____bpf_spin_unlock(&timer->lock);
> +	return ret;
> +}
> +
> +static const struct bpf_func_proto bpf_timer_cancel_proto = {
> +	.func		= bpf_timer_cancel,
> +	.gpl_only	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_TIMER,
> +};
> +
> +/* This function is called by delete_element in htab and lru maps
> + * and by map_free for array, lru, htab maps.
> + */
> +void bpf_timer_cancel_and_free(void *val)
> +{
> +	struct bpf_timer_kern *timer = val;
> +	struct bpf_hrtimer *t;
> +
> +	____bpf_spin_lock(&timer->lock);
> +	t = timer->timer;
> +	if (!t)
> +		goto out;
> +	/* Cancel the timer and wait for callback to complete if it was
> +	 * running. Only individual delete_element in htab or lru maps can
> +	 * return 1 from hrtimer_cancel.
> +	 * The whole map is destroyed when its refcnt reaches zero.
> +	 * That happens after bpf prog refcnt reaches zero.
> +	 * bpf prog refcnt will not reach zero until all timers are executed.
> +	 * So when maps are destroyed hrtimer_cancel will surely return 0.
> +	 * In such case t->prog is a pointer to freed memory.
> +	 *
> +	 * When htab or lru is deleting individual element check that
> +	 * bpf_map_delete_elem() isn't trying to delete elem with running timer.
> +	 * In such case don't call hrtimer_cancel() (since it will deadlock)
> +	 * and don't call hrtimer_try_to_cancel() (since it will just return -1).
> +	 * Instead free the timer and set timer->timer = NULL.
> +	 * The subsequent bpf_timer_start/cancel() helpers won't be able to use it.
> +	 * In preallocated maps it's safe to do timer->timer = NULL.
> +	 * The memory could be reused for another element while current timer
> +	 * callback can still do bpf_timer_init() on it.
> +	 * In non-preallocated maps timer->timer = NULL will happen after
> +	 * callback completes, since prog execution is an RCU critical section.
> +	 */
> +	if (this_cpu_read(hrtimer_running) != t &&
> +	    hrtimer_cancel(&t->timer) == 1)
> +		bpf_prog_put(t->prog);
> +	kfree(t);
> +	timer->timer = NULL;
> +out:
> +	____bpf_spin_unlock(&timer->lock);
> +}
> +
>   const struct bpf_func_proto bpf_get_current_task_proto __weak;
>   const struct bpf_func_proto bpf_probe_read_user_proto __weak;
>   const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
> @@ -1051,6 +1272,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
>   		return &bpf_per_cpu_ptr_proto;
>   	case BPF_FUNC_this_cpu_ptr:
>   		return &bpf_this_cpu_ptr_proto;
> +	case BPF_FUNC_timer_init:
> +		return &bpf_timer_init_proto;
> +	case BPF_FUNC_timer_start:
> +		return &bpf_timer_start_proto;
> +	case BPF_FUNC_timer_cancel:
> +		return &bpf_timer_cancel_proto;
>   	default:
>   		break;
>   	}
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1de4b8c6ee42..44ec9760b562 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -4656,6 +4656,35 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
>   	return 0;
>   }
>   
> +static int process_timer_func(struct bpf_verifier_env *env, int regno,
> +			      struct bpf_call_arg_meta *meta)
> +{
> +	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
> +	bool is_const = tnum_is_const(reg->var_off);
> +	struct bpf_map *map = reg->map_ptr;
> +	u64 val = reg->var_off.value;
> +
> +	if (!is_const) {
> +		verbose(env,
> +			"R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
> +			regno);
> +		return -EINVAL;
> +	}
> +	if (!map->btf) {
> +		verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
> +			map->name);
> +		return -EINVAL;
> +	}
> +	if (val) {
> +		/* This restriction will be removed in the next patch */
> +		verbose(env, "bpf_timer field can only be first in the map value element\n");
> +		return -EINVAL;
> +	}
> +	WARN_ON(meta->map_ptr);

Could you explain when this could happen?

> +	meta->map_ptr = map;
> +	return 0;
> +}
> +
>   static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
>   {
>   	return type == ARG_PTR_TO_MEM ||
> @@ -4788,6 +4817,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
>   static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
>   static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
>   static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
> +static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
>   
[...]