* [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
@ 2022-04-14 16:12 Stanislav Fomichev
  2022-04-14 20:23 ` Martin KaFai Lau
  2022-04-16  1:28 ` Alexei Starovoitov
  0 siblings, 2 replies; 12+ messages in thread
From: Stanislav Fomichev @ 2022-04-14 16:12 UTC (permalink / raw)
  To: netdev, bpf; +Cc: ast, daniel, andrii, Stanislav Fomichev, Martin KaFai Lau

Commit 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros
into functions") switched a bunch of BPF_PROG_RUN macros to inline
routines. This changed the semantics a bit. Due to macro argument
expansion, it used to be:

	rcu_read_lock();
	array = rcu_dereference(cgrp->bpf.effective[atype]);
	...

Now, with inline routines, we have:
	array_rcu = rcu_dereference(cgrp->bpf.effective[atype]);
	/* array_rcu can be kfree'd here */
	rcu_read_lock();
	array = rcu_dereference(array_rcu);

I'm assuming that in practice the rcu subsystem isn't fast enough to
trigger this, but let's use the rcu API properly.
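
With this patch, the cgroup helpers keep the rcu lock inside: they take a
struct cgroup_bpf pointer and do the rcu_dereference() themselves. The
remaining bpf_prog_run_array() callers take the read lock around both the
dereference and the array walk, e.g. (the pattern used in trace_call_bpf()
below):

	rcu_read_lock();
	ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
				 ctx, bpf_prog_run);
	rcu_read_unlock();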

Also, rename to lowercase to avoid confusion with the macros.
Additionally, drop and expand BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY.

See [1] for more context.

  [1] https://lore.kernel.org/bpf/CAKH8qBs60fOinFdxiiQikK_q0EcVxGvNTQoWvHLEUGbgcj1UYg@mail.gmail.com/T/#u

v2
- keep rcu locks inside by passing cgroup_bpf

Cc: Martin KaFai Lau <kafai@fb.com>
Fixes: 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 drivers/media/rc/bpf-lirc.c |   8 ++-
 include/linux/bpf.h         | 115 +++-------------------------------
 kernel/bpf/cgroup.c         | 121 +++++++++++++++++++++++++++++++-----
 kernel/trace/bpf_trace.c    |   5 +-
 4 files changed, 121 insertions(+), 128 deletions(-)

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 3eff08d7b8e5..fe17c7f98e81 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -216,8 +216,12 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)
 
 	raw->bpf_sample = sample;
 
-	if (raw->progs)
-		BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, bpf_prog_run);
+	if (raw->progs) {
+		rcu_read_lock();
+		bpf_prog_run_array(rcu_dereference(raw->progs),
+				   &raw->bpf_sample, bpf_prog_run);
+		rcu_read_unlock();
+	}
 }
 
 /*
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bdb5298735ce..7bf441563ffc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1221,7 +1221,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 /* an array of programs to be executed under rcu_lock.
  *
  * Typical usage:
- * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run);
+ * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run);
  *
  * the structure returned by bpf_prog_array_alloc() should be populated
  * with program pointers and the last pointer must be NULL.
@@ -1315,83 +1315,22 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
 
 typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx);
 
-static __always_inline int
-BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu,
-			    const void *ctx, bpf_prog_run_fn run_prog,
-			    int retval, u32 *ret_flags)
-{
-	const struct bpf_prog_array_item *item;
-	const struct bpf_prog *prog;
-	const struct bpf_prog_array *array;
-	struct bpf_run_ctx *old_run_ctx;
-	struct bpf_cg_run_ctx run_ctx;
-	u32 func_ret;
-
-	run_ctx.retval = retval;
-	migrate_disable();
-	rcu_read_lock();
-	array = rcu_dereference(array_rcu);
-	item = &array->items[0];
-	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
-	while ((prog = READ_ONCE(item->prog))) {
-		run_ctx.prog_item = item;
-		func_ret = run_prog(prog, ctx);
-		if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval))
-			run_ctx.retval = -EPERM;
-		*(ret_flags) |= (func_ret >> 1);
-		item++;
-	}
-	bpf_reset_run_ctx(old_run_ctx);
-	rcu_read_unlock();
-	migrate_enable();
-	return run_ctx.retval;
-}
-
-static __always_inline int
-BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu,
-		      const void *ctx, bpf_prog_run_fn run_prog,
-		      int retval)
-{
-	const struct bpf_prog_array_item *item;
-	const struct bpf_prog *prog;
-	const struct bpf_prog_array *array;
-	struct bpf_run_ctx *old_run_ctx;
-	struct bpf_cg_run_ctx run_ctx;
-
-	run_ctx.retval = retval;
-	migrate_disable();
-	rcu_read_lock();
-	array = rcu_dereference(array_rcu);
-	item = &array->items[0];
-	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
-	while ((prog = READ_ONCE(item->prog))) {
-		run_ctx.prog_item = item;
-		if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval))
-			run_ctx.retval = -EPERM;
-		item++;
-	}
-	bpf_reset_run_ctx(old_run_ctx);
-	rcu_read_unlock();
-	migrate_enable();
-	return run_ctx.retval;
-}
-
 static __always_inline u32
-BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu,
+bpf_prog_run_array(const struct bpf_prog_array *array,
 		   const void *ctx, bpf_prog_run_fn run_prog)
 {
 	const struct bpf_prog_array_item *item;
 	const struct bpf_prog *prog;
-	const struct bpf_prog_array *array;
 	struct bpf_run_ctx *old_run_ctx;
 	struct bpf_trace_run_ctx run_ctx;
 	u32 ret = 1;
 
-	migrate_disable();
-	rcu_read_lock();
-	array = rcu_dereference(array_rcu);
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
+
 	if (unlikely(!array))
-		goto out;
+		return ret;
+
+	migrate_disable();
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	item = &array->items[0];
 	while ((prog = READ_ONCE(item->prog))) {
@@ -1400,50 +1339,10 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu,
 		item++;
 	}
 	bpf_reset_run_ctx(old_run_ctx);
-out:
-	rcu_read_unlock();
 	migrate_enable();
 	return ret;
 }
 
-/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
- * so BPF programs can request cwr for TCP packets.
- *
- * Current cgroup skb programs can only return 0 or 1 (0 to drop the
- * packet. This macro changes the behavior so the low order bit
- * indicates whether the packet should be dropped (0) or not (1)
- * and the next bit is a congestion notification bit. This could be
- * used by TCP to call tcp_enter_cwr()
- *
- * Hence, new allowed return values of CGROUP EGRESS BPF programs are:
- *   0: drop packet
- *   1: keep packet
- *   2: drop packet and cn
- *   3: keep packet and cn
- *
- * This macro then converts it to one of the NET_XMIT or an error
- * code that is then interpreted as drop packet (and no cn):
- *   0: NET_XMIT_SUCCESS  skb should be transmitted
- *   1: NET_XMIT_DROP     skb should be dropped and cn
- *   2: NET_XMIT_CN       skb should be transmitted and cn
- *   3: -err              skb should be dropped
- */
-#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func)		\
-	({						\
-		u32 _flags = 0;				\
-		bool _cn;				\
-		u32 _ret;				\
-		_ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \
-		_cn = _flags & BPF_RET_SET_CN;		\
-		if (_ret && !IS_ERR_VALUE((long)_ret))	\
-			_ret = -EFAULT;			\
-		if (!_ret)				\
-			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
-		else					\
-			_ret = (_cn ? NET_XMIT_DROP : _ret);		\
-		_ret;					\
-	})
-
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 128028efda64..9e95c03f8f7f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -22,6 +22,69 @@
 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
+static int
+bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
+			    enum cgroup_bpf_attach_type atype,
+			    const void *ctx, bpf_prog_run_fn run_prog,
+			    int retval, u32 *ret_flags)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_cg_run_ctx run_ctx;
+	u32 func_ret;
+
+	run_ctx.retval = retval;
+	migrate_disable();
+	rcu_read_lock();
+	array = rcu_dereference(cgrp->effective[atype]);
+	item = &array->items[0];
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	while ((prog = READ_ONCE(item->prog))) {
+		run_ctx.prog_item = item;
+		func_ret = run_prog(prog, ctx);
+		if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval))
+			run_ctx.retval = -EPERM;
+		*(ret_flags) |= (func_ret >> 1);
+		item++;
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+	rcu_read_unlock();
+	migrate_enable();
+	return run_ctx.retval;
+}
+
+static int
+bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
+		      enum cgroup_bpf_attach_type atype,
+		      const void *ctx, bpf_prog_run_fn run_prog,
+		      int retval)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_cg_run_ctx run_ctx;
+
+	run_ctx.retval = retval;
+	migrate_disable();
+	rcu_read_lock();
+	array = rcu_dereference(cgrp->effective[atype]);
+	item = &array->items[0];
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	while ((prog = READ_ONCE(item->prog))) {
+		run_ctx.prog_item = item;
+		if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval))
+			run_ctx.retval = -EPERM;
+		item++;
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+	rcu_read_unlock();
+	migrate_enable();
+	return run_ctx.retval;
+}
+
 void cgroup_bpf_offline(struct cgroup *cgrp)
 {
 	cgroup_get(cgrp);
@@ -1075,11 +1138,38 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	bpf_compute_and_save_data_end(skb, &saved_data_end);
 
 	if (atype == CGROUP_INET_EGRESS) {
-		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
-			cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
+		u32 flags = 0;
+		bool cn;
+
+		ret = bpf_prog_run_array_cg_flags(
+			&cgrp->bpf, atype,
+			skb, __bpf_prog_run_save_cb, 0, &flags);
+
+		/* Return values of CGROUP EGRESS BPF programs are:
+		 *   0: drop packet
+		 *   1: keep packet
+		 *   2: drop packet and cn
+		 *   3: keep packet and cn
+		 *
+		 * The returned value is then converted to one of the NET_XMIT
+		 * or an error code that is then interpreted as drop packet
+		 * (and no cn):
+		 *   0: NET_XMIT_SUCCESS  skb should be transmitted
+		 *   1: NET_XMIT_DROP     skb should be dropped and cn
+		 *   2: NET_XMIT_CN       skb should be transmitted and cn
+		 *   3: -err              skb should be dropped
+		 */
+
+		cn = flags & BPF_RET_SET_CN;
+		if (ret && !IS_ERR_VALUE((long)ret))
+			ret = -EFAULT;
+		if (!ret)
+			ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
+		else
+			ret = (cn ? NET_XMIT_DROP : ret);
 	} else {
-		ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
-					    __bpf_prog_run_save_cb, 0);
+		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
+					    skb, __bpf_prog_run_save_cb, 0);
 		if (ret && !IS_ERR_VALUE((long)ret))
 			ret = -EFAULT;
 	}
@@ -1109,8 +1199,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 
-	return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
-				     bpf_prog_run, 0);
+	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -1155,8 +1244,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
-					   bpf_prog_run, 0, flags);
+	return bpf_prog_run_array_cg_flags(&cgrp->bpf, atype,
+					   &ctx, bpf_prog_run, 0, flags);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
@@ -1182,8 +1271,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 
-	return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
-				     bpf_prog_run, 0);
+	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
+				     0);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 
@@ -1200,8 +1289,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
-				    bpf_prog_run, 0);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
 	rcu_read_unlock();
 
 	return ret;
@@ -1366,8 +1454,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
-				    bpf_prog_run, 0);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
 	rcu_read_unlock();
 
 	kfree(ctx.cur_val);
@@ -1459,7 +1546,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	}
 
 	lock_sock(sk);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
 				    &ctx, bpf_prog_run, 0);
 	release_sock(sk);
 
@@ -1559,7 +1646,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	}
 
 	lock_sock(sk);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
 				    &ctx, bpf_prog_run, retval);
 	release_sock(sk);
 
@@ -1608,7 +1695,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 	 * be called if that data shouldn't be "exported".
 	 */
 
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
 				    &ctx, bpf_prog_run, retval);
 	if (ret < 0)
 		return ret;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b26f3da943de..f15b826f9899 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -129,7 +129,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	 * out of events when it was updated in between this and the
 	 * rcu_dereference() which is accepted risk.
 	 */
-	ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
+	rcu_read_lock();
+	ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
+				 ctx, bpf_prog_run);
+	rcu_read_unlock();
 
  out:
 	__this_cpu_dec(bpf_prog_active);
-- 
2.36.0.rc0.470.gd361397f0d-goog


* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-14 16:12 [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines Stanislav Fomichev
@ 2022-04-14 20:23 ` Martin KaFai Lau
  2022-04-16  1:28 ` Alexei Starovoitov
  1 sibling, 0 replies; 12+ messages in thread
From: Martin KaFai Lau @ 2022-04-14 20:23 UTC (permalink / raw)
  To: Stanislav Fomichev; +Cc: netdev, bpf, ast, daniel, andrii

On Thu, Apr 14, 2022 at 09:12:33AM -0700, Stanislav Fomichev wrote:
> Commit 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros
> into functions") switched a bunch of BPF_PROG_RUN macros to inline
> routines. This changed the semantic a bit. Due to arguments expansion
> of macros, it used to be:
> 
> 	rcu_read_lock();
> 	array = rcu_dereference(cgrp->bpf.effective[atype]);
> 	...
> 
> Now, with with inline routines, we have:
> 	array_rcu = rcu_dereference(cgrp->bpf.effective[atype]);
> 	/* array_rcu can be kfree'd here */
> 	rcu_read_lock();
> 	array = rcu_dereference(array_rcu);
> 
> I'm assuming in practice rcu subsystem isn't fast enough to trigger
> this but let's use rcu API properly.
> 
> Also, rename to lower caps to not confuse with macros. Additionally,
> drop and expand BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY.
> 
> See [1] for more context.
> 
>   [1] https://lore.kernel.org/bpf/CAKH8qBs60fOinFdxiiQikK_q0EcVxGvNTQoWvHLEUGbgcj1UYg@mail.gmail.com/T/#u
> 
> v2
> - keep rcu locks inside by passing cgroup_bpf
Acked-by: Martin KaFai Lau <kafai@fb.com>

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-14 16:12 [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines Stanislav Fomichev
  2022-04-14 20:23 ` Martin KaFai Lau
@ 2022-04-16  1:28 ` Alexei Starovoitov
  2022-04-18 16:50   ` sdf
  1 sibling, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2022-04-16  1:28 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com> wrote:
> +static int
> +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> +                           enum cgroup_bpf_attach_type atype,
> +                           const void *ctx, bpf_prog_run_fn run_prog,
> +                           int retval, u32 *ret_flags)
> +{
> +       const struct bpf_prog_array_item *item;
> +       const struct bpf_prog *prog;
> +       const struct bpf_prog_array *array;
> +       struct bpf_run_ctx *old_run_ctx;
> +       struct bpf_cg_run_ctx run_ctx;
> +       u32 func_ret;
> +
> +       run_ctx.retval = retval;
> +       migrate_disable();
> +       rcu_read_lock();
> +       array = rcu_dereference(cgrp->effective[atype]);
> +       item = &array->items[0];
> +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> +       while ((prog = READ_ONCE(item->prog))) {
> +               run_ctx.prog_item = item;
> +               func_ret = run_prog(prog, ctx);
...
> +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
>                                     &ctx, bpf_prog_run, retval);

Did you check the asm that bpf_prog_run gets inlined
after being passed as a pointer to a function?
Crossing fingers... I suspect not every compiler can do that :(
De-virtualization optimization used to be tricky.

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-16  1:28 ` Alexei Starovoitov
@ 2022-04-18 16:50   ` sdf
  2022-04-19  5:18     ` Alexei Starovoitov
  0 siblings, 1 reply; 12+ messages in thread
From: sdf @ 2022-04-18 16:50 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On 04/16, Alexei Starovoitov wrote:
> On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com> wrote:
> > +static int
> > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > +                           enum cgroup_bpf_attach_type atype,
> > +                           const void *ctx, bpf_prog_run_fn run_prog,
> > +                           int retval, u32 *ret_flags)
> > +{
> > +       const struct bpf_prog_array_item *item;
> > +       const struct bpf_prog *prog;
> > +       const struct bpf_prog_array *array;
> > +       struct bpf_run_ctx *old_run_ctx;
> > +       struct bpf_cg_run_ctx run_ctx;
> > +       u32 func_ret;
> > +
> > +       run_ctx.retval = retval;
> > +       migrate_disable();
> > +       rcu_read_lock();
> > +       array = rcu_dereference(cgrp->effective[atype]);
> > +       item = &array->items[0];
> > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > +       while ((prog = READ_ONCE(item->prog))) {
> > +               run_ctx.prog_item = item;
> > +               func_ret = run_prog(prog, ctx);
> ...
> > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> >                                     &ctx, bpf_prog_run, retval);

> Did you check the asm that bpf_prog_run gets inlined
> after being passed as a pointer to a function?
> Crossing fingers... I suspect not every compiler can do that :(
> De-virtualization optimization used to be tricky.

No, I didn't, but looking at it right now, both gcc and clang
seem to be doing inlining all the way up to bpf_dispatcher_nop_func.

clang:

   0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
   __cgroup_bpf_run_filter_sock_addr():
   ./kernel/bpf/cgroup.c:1226
   int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
   				      struct sockaddr *uaddr,
   				      enum cgroup_bpf_attach_type atype,
   				      void *t_ctx,
   				      u32 *flags)
   {

   ...

   ./include/linux/filter.h:628
   		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
       1980:	49 8d 75 48          	lea    0x48(%r13),%rsi
   bpf_dispatcher_nop_func():
   ./include/linux/bpf.h:804
   	return bpf_func(ctx, insnsi);
       1984:	4c 89 f7             	mov    %r14,%rdi
       1987:	41 ff 55 30          	call   *0x30(%r13)
       198b:	89 c3                	mov    %eax,%ebx

gcc (w/retpoline):

   0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
   __cgroup_bpf_run_filter_sock_addr():
   kernel/bpf/cgroup.c:1226
   {

   ...

   ./include/linux/filter.h:628
   		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
       11c5:	49 8d 75 48          	lea    0x48(%r13),%rsi
   bpf_dispatcher_nop_func():
   ./include/linux/bpf.h:804
       11c9:	48 8d 7c 24 10       	lea    0x10(%rsp),%rdi
       11ce:	e8 00 00 00 00       	call   11d3  
<__cgroup_bpf_run_filter_sock_addr+0xc3>
   			11cf: R_X86_64_PLT32	__x86_indirect_thunk_rax-0x4
       11d3:	89 c3                	mov    %eax,%ebx

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-18 16:50   ` sdf
@ 2022-04-19  5:18     ` Alexei Starovoitov
  2022-04-19 15:42       ` sdf
  0 siblings, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2022-04-19  5:18 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On Mon, Apr 18, 2022 at 9:50 AM <sdf@google.com> wrote:
>
> On 04/16, Alexei Starovoitov wrote:
> > On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com> wrote:
> > > +static int
> > > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > +                           enum cgroup_bpf_attach_type atype,
> > > +                           const void *ctx, bpf_prog_run_fn run_prog,
> > > +                           int retval, u32 *ret_flags)
> > > +{
> > > +       const struct bpf_prog_array_item *item;
> > > +       const struct bpf_prog *prog;
> > > +       const struct bpf_prog_array *array;
> > > +       struct bpf_run_ctx *old_run_ctx;
> > > +       struct bpf_cg_run_ctx run_ctx;
> > > +       u32 func_ret;
> > > +
> > > +       run_ctx.retval = retval;
> > > +       migrate_disable();
> > > +       rcu_read_lock();
> > > +       array = rcu_dereference(cgrp->effective[atype]);
> > > +       item = &array->items[0];
> > > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > > +       while ((prog = READ_ONCE(item->prog))) {
> > > +               run_ctx.prog_item = item;
> > > +               func_ret = run_prog(prog, ctx);
> > ...
> > > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> > >                                     &ctx, bpf_prog_run, retval);
>
> > Did you check the asm that bpf_prog_run gets inlined
> > after being passed as a pointer to a function?
> > Crossing fingers... I suspect not every compiler can do that :(
> > De-virtualization optimization used to be tricky.
>
> No, I didn't, but looking at it right now, both gcc and clang
> seem to be doing inlining all way up to bpf_dispatcher_nop_func.
>
> clang:
>
>    0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
>    __cgroup_bpf_run_filter_sock_addr():
>    ./kernel/bpf/cgroup.c:1226
>    int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
>                                       struct sockaddr *uaddr,
>                                       enum cgroup_bpf_attach_type atype,
>                                       void *t_ctx,
>                                       u32 *flags)
>    {
>
>    ...
>
>    ./include/linux/filter.h:628
>                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
>        1980:    49 8d 75 48             lea    0x48(%r13),%rsi
>    bpf_dispatcher_nop_func():
>    ./include/linux/bpf.h:804
>         return bpf_func(ctx, insnsi);
>        1984:    4c 89 f7                mov    %r14,%rdi
>        1987:    41 ff 55 30             call   *0x30(%r13)
>        198b:    89 c3                   mov    %eax,%ebx
>
> gcc (w/retpoline):
>
>    0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
>    __cgroup_bpf_run_filter_sock_addr():
>    kernel/bpf/cgroup.c:1226
>    {
>
>    ...
>
>    ./include/linux/filter.h:628
>                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
>        11c5:    49 8d 75 48             lea    0x48(%r13),%rsi
>    bpf_dispatcher_nop_func():
>    ./include/linux/bpf.h:804
>        11c9:    48 8d 7c 24 10          lea    0x10(%rsp),%rdi
>        11ce:    e8 00 00 00 00          call   11d3
> <__cgroup_bpf_run_filter_sock_addr+0xc3>
>                         11cf: R_X86_64_PLT32    __x86_indirect_thunk_rax-0x4
>        11d3:    89 c3                   mov    %eax,%ebx

Hmm. I'm not sure how you've got this asm.
Here is what I see with gcc 8 and gcc 10:
bpf_prog_run_array_cg:
...
        movq    %rcx, %r12      # run_prog, run_prog
...
# ../kernel/bpf/cgroup.c:77:            run_ctx.prog_item = item;
        movq    %rbx, (%rsp)    # item, run_ctx.prog_item
# ../kernel/bpf/cgroup.c:78:            if (!run_prog(prog, ctx) &&
!IS_ERR_VALUE((long)run_ctx.retval))
        movq    %rbp, %rsi      # ctx,
        call    *%r12   # run_prog

__cgroup_bpf_run_filter_sk:
        movq    $bpf_prog_run, %rcx     #,
# ../kernel/bpf/cgroup.c:1202:  return
bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
        leaq    1520(%rax), %rdi        #, tmp92
# ../kernel/bpf/cgroup.c:1202:  return
bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
        jmp     bpf_prog_run_array_cg   #

This is without kasan, lockdep and all debug configs are off.

So the generated code is pretty bad as I predicted :(

So I'm afraid this approach is no go.

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-19  5:18     ` Alexei Starovoitov
@ 2022-04-19 15:42       ` sdf
  2022-04-19 16:20         ` Alexei Starovoitov
  0 siblings, 1 reply; 12+ messages in thread
From: sdf @ 2022-04-19 15:42 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On 04/18, Alexei Starovoitov wrote:
> On Mon, Apr 18, 2022 at 9:50 AM <sdf@google.com> wrote:
> >
> > On 04/16, Alexei Starovoitov wrote:
> > > On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com>  
> wrote:
> > > > +static int
> > > > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > > +                           enum cgroup_bpf_attach_type atype,
> > > > +                           const void *ctx, bpf_prog_run_fn  
> run_prog,
> > > > +                           int retval, u32 *ret_flags)
> > > > +{
> > > > +       const struct bpf_prog_array_item *item;
> > > > +       const struct bpf_prog *prog;
> > > > +       const struct bpf_prog_array *array;
> > > > +       struct bpf_run_ctx *old_run_ctx;
> > > > +       struct bpf_cg_run_ctx run_ctx;
> > > > +       u32 func_ret;
> > > > +
> > > > +       run_ctx.retval = retval;
> > > > +       migrate_disable();
> > > > +       rcu_read_lock();
> > > > +       array = rcu_dereference(cgrp->effective[atype]);
> > > > +       item = &array->items[0];
> > > > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > > > +       while ((prog = READ_ONCE(item->prog))) {
> > > > +               run_ctx.prog_item = item;
> > > > +               func_ret = run_prog(prog, ctx);
> > > ...
> > > > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> > > >                                     &ctx, bpf_prog_run, retval);
> >
> > > Did you check the asm that bpf_prog_run gets inlined
> > > after being passed as a pointer to a function?
> > > Crossing fingers... I suspect not every compiler can do that :(
> > > De-virtualization optimization used to be tricky.
> >
> > No, I didn't, but looking at it right now, both gcc and clang
> > seem to be doing inlining all way up to bpf_dispatcher_nop_func.
> >
> > clang:
> >
> >    0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
> >    __cgroup_bpf_run_filter_sock_addr():
> >    ./kernel/bpf/cgroup.c:1226
> >    int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> >                                       struct sockaddr *uaddr,
> >                                       enum cgroup_bpf_attach_type atype,
> >                                       void *t_ctx,
> >                                       u32 *flags)
> >    {
> >
> >    ...
> >
> >    ./include/linux/filter.h:628
> >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> >        1980:    49 8d 75 48             lea    0x48(%r13),%rsi
> >    bpf_dispatcher_nop_func():
> >    ./include/linux/bpf.h:804
> >         return bpf_func(ctx, insnsi);
> >        1984:    4c 89 f7                mov    %r14,%rdi
> >        1987:    41 ff 55 30             call   *0x30(%r13)
> >        198b:    89 c3                   mov    %eax,%ebx
> >
> > gcc (w/retpoline):
> >
> >    0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
> >    __cgroup_bpf_run_filter_sock_addr():
> >    kernel/bpf/cgroup.c:1226
> >    {
> >
> >    ...
> >
> >    ./include/linux/filter.h:628
> >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> >        11c5:    49 8d 75 48             lea    0x48(%r13),%rsi
> >    bpf_dispatcher_nop_func():
> >    ./include/linux/bpf.h:804
> >        11c9:    48 8d 7c 24 10          lea    0x10(%rsp),%rdi
> >        11ce:    e8 00 00 00 00          call   11d3
> > <__cgroup_bpf_run_filter_sock_addr+0xc3>
> >                         11cf: R_X86_64_PLT32     
> __x86_indirect_thunk_rax-0x4
> >        11d3:    89 c3                   mov    %eax,%ebx

> Hmm. I'm not sure how you've got this asm.
> Here is what I see with gcc 8 and gcc 10:
> bpf_prog_run_array_cg:
> ...
>          movq    %rcx, %r12      # run_prog, run_prog
> ...
> # ../kernel/bpf/cgroup.c:77:            run_ctx.prog_item = item;
>          movq    %rbx, (%rsp)    # item, run_ctx.prog_item
> # ../kernel/bpf/cgroup.c:78:            if (!run_prog(prog, ctx) &&
> !IS_ERR_VALUE((long)run_ctx.retval))
>          movq    %rbp, %rsi      # ctx,
>          call    *%r12   # run_prog

> __cgroup_bpf_run_filter_sk:
>          movq    $bpf_prog_run, %rcx     #,
> # ../kernel/bpf/cgroup.c:1202:  return
> bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
>          leaq    1520(%rax), %rdi        #, tmp92
> # ../kernel/bpf/cgroup.c:1202:  return
> bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
>          jmp     bpf_prog_run_array_cg   #

> This is without kasan, lockdep and all debug configs are off.

> So the generated code is pretty bad as I predicted :(

> So I'm afraid this approach is no go.

I've retested again and it still unrolls it for me on gcc 11 :-/
Anyway, I guess we have two options:

1. Go back to defines.
2. Don't pass a ptr to func, but pass an enum which indicates whether
    to use bpf_prog_run or __bpf_prog_run_save_cb. Seems like in this
    case the compiler shouldn't have any trouble unwrapping it?

I'll prototype and send (2). If it won't work out we can always get back
to (1).
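
As a rough illustration of (2) -- use_save_cb below is a made-up argument,
not something from the patch -- instead of taking a bpf_prog_run_fn
pointer, the helper would take a flag and the loop would do:

	run_ctx.prog_item = item;
	if (use_save_cb)	/* hypothetical flag selecting the run helper */
		func_ret = __bpf_prog_run_save_cb(prog, ctx);
	else
		func_ret = bpf_prog_run(prog, ctx);

so the compiler sees direct calls it can inline instead of an indirect
call it has to de-virtualize.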

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-19 15:42       ` sdf
@ 2022-04-19 16:20         ` Alexei Starovoitov
  2022-04-19 16:32           ` Alexei Starovoitov
  0 siblings, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2022-04-19 16:20 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On Tue, Apr 19, 2022 at 8:42 AM <sdf@google.com> wrote:
>
> On 04/18, Alexei Starovoitov wrote:
> > On Mon, Apr 18, 2022 at 9:50 AM <sdf@google.com> wrote:
> > >
> > > On 04/16, Alexei Starovoitov wrote:
> > > > On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com>
> > wrote:
> > > > > +static int
> > > > > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > > > +                           enum cgroup_bpf_attach_type atype,
> > > > > +                           const void *ctx, bpf_prog_run_fn
> > run_prog,
> > > > > +                           int retval, u32 *ret_flags)
> > > > > +{
> > > > > +       const struct bpf_prog_array_item *item;
> > > > > +       const struct bpf_prog *prog;
> > > > > +       const struct bpf_prog_array *array;
> > > > > +       struct bpf_run_ctx *old_run_ctx;
> > > > > +       struct bpf_cg_run_ctx run_ctx;
> > > > > +       u32 func_ret;
> > > > > +
> > > > > +       run_ctx.retval = retval;
> > > > > +       migrate_disable();
> > > > > +       rcu_read_lock();
> > > > > +       array = rcu_dereference(cgrp->effective[atype]);
> > > > > +       item = &array->items[0];
> > > > > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > > > > +       while ((prog = READ_ONCE(item->prog))) {
> > > > > +               run_ctx.prog_item = item;
> > > > > +               func_ret = run_prog(prog, ctx);
> > > > ...
> > > > > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> > > > >                                     &ctx, bpf_prog_run, retval);
> > >
> > > > Did you check the asm that bpf_prog_run gets inlined
> > > > after being passed as a pointer to a function?
> > > > Crossing fingers... I suspect not every compiler can do that :(
> > > > De-virtualization optimization used to be tricky.
> > >
> > > No, I didn't, but looking at it right now, both gcc and clang
> > > seem to be doing inlining all way up to bpf_dispatcher_nop_func.
> > >
> > > clang:
> > >
> > >    0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
> > >    __cgroup_bpf_run_filter_sock_addr():
> > >    ./kernel/bpf/cgroup.c:1226
> > >    int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> > >                                       struct sockaddr *uaddr,
> > >                                       enum cgroup_bpf_attach_type atype,
> > >                                       void *t_ctx,
> > >                                       u32 *flags)
> > >    {
> > >
> > >    ...
> > >
> > >    ./include/linux/filter.h:628
> > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > >        1980:    49 8d 75 48             lea    0x48(%r13),%rsi
> > >    bpf_dispatcher_nop_func():
> > >    ./include/linux/bpf.h:804
> > >         return bpf_func(ctx, insnsi);
> > >        1984:    4c 89 f7                mov    %r14,%rdi
> > >        1987:    41 ff 55 30             call   *0x30(%r13)
> > >        198b:    89 c3                   mov    %eax,%ebx
> > >
> > > gcc (w/retpoline):
> > >
> > >    0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
> > >    __cgroup_bpf_run_filter_sock_addr():
> > >    kernel/bpf/cgroup.c:1226
> > >    {
> > >
> > >    ...
> > >
> > >    ./include/linux/filter.h:628
> > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > >        11c5:    49 8d 75 48             lea    0x48(%r13),%rsi
> > >    bpf_dispatcher_nop_func():
> > >    ./include/linux/bpf.h:804
> > >        11c9:    48 8d 7c 24 10          lea    0x10(%rsp),%rdi
> > >        11ce:    e8 00 00 00 00          call   11d3
> > > <__cgroup_bpf_run_filter_sock_addr+0xc3>
> > >                         11cf: R_X86_64_PLT32
> > __x86_indirect_thunk_rax-0x4
> > >        11d3:    89 c3                   mov    %eax,%ebx
>
> > Hmm. I'm not sure how you've got this asm.
> > Here is what I see with gcc 8 and gcc 10:
> > bpf_prog_run_array_cg:
> > ...
> >          movq    %rcx, %r12      # run_prog, run_prog
> > ...
> > # ../kernel/bpf/cgroup.c:77:            run_ctx.prog_item = item;
> >          movq    %rbx, (%rsp)    # item, run_ctx.prog_item
> > # ../kernel/bpf/cgroup.c:78:            if (!run_prog(prog, ctx) &&
> > !IS_ERR_VALUE((long)run_ctx.retval))
> >          movq    %rbp, %rsi      # ctx,
> >          call    *%r12   # run_prog
>
> > __cgroup_bpf_run_filter_sk:
> >          movq    $bpf_prog_run, %rcx     #,
> > # ../kernel/bpf/cgroup.c:1202:  return
> > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> >          leaq    1520(%rax), %rdi        #, tmp92
> > # ../kernel/bpf/cgroup.c:1202:  return
> > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> >          jmp     bpf_prog_run_array_cg   #
>
> > This is without kasan, lockdep and all debug configs are off.
>
> > So the generated code is pretty bad as I predicted :(
>
> > So I'm afraid this approach is no go.
>
> I've retested again and it still unrolls it for me on gcc 11 :-/
> Anyway, I guess we have two options:
>
> 1. Go back to defines.
> 2. Don't pass a ptr to func, but pass an enum which indicates whether
>     to use bpf_prog_run or __bpf_prog_run_save_cb. Seems like in this
>     case the compiler shouldn't have any trouble unwrapping it?
>
> I'll prototype and send (2). If it won't work out we can always get back
> to (1).

Going back to defines is probably not necessary.
Could you try moving bpf_prog_run_array_cg*() back to .h
and use static __always_inline ?

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-19 16:20         ` Alexei Starovoitov
@ 2022-04-19 16:32           ` Alexei Starovoitov
  2022-04-19 16:35             ` Stanislav Fomichev
  0 siblings, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2022-04-19 16:32 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On Tue, Apr 19, 2022 at 9:20 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Apr 19, 2022 at 8:42 AM <sdf@google.com> wrote:
> >
> > On 04/18, Alexei Starovoitov wrote:
> > > On Mon, Apr 18, 2022 at 9:50 AM <sdf@google.com> wrote:
> > > >
> > > > On 04/16, Alexei Starovoitov wrote:
> > > > > On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com>
> > > wrote:
> > > > > > +static int
> > > > > > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > > > > +                           enum cgroup_bpf_attach_type atype,
> > > > > > +                           const void *ctx, bpf_prog_run_fn
> > > run_prog,
> > > > > > +                           int retval, u32 *ret_flags)
> > > > > > +{
> > > > > > +       const struct bpf_prog_array_item *item;
> > > > > > +       const struct bpf_prog *prog;
> > > > > > +       const struct bpf_prog_array *array;
> > > > > > +       struct bpf_run_ctx *old_run_ctx;
> > > > > > +       struct bpf_cg_run_ctx run_ctx;
> > > > > > +       u32 func_ret;
> > > > > > +
> > > > > > +       run_ctx.retval = retval;
> > > > > > +       migrate_disable();
> > > > > > +       rcu_read_lock();
> > > > > > +       array = rcu_dereference(cgrp->effective[atype]);
> > > > > > +       item = &array->items[0];
> > > > > > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > > > > > +       while ((prog = READ_ONCE(item->prog))) {
> > > > > > +               run_ctx.prog_item = item;
> > > > > > +               func_ret = run_prog(prog, ctx);
> > > > > ...
> > > > > > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> > > > > >                                     &ctx, bpf_prog_run, retval);
> > > >
> > > > > Did you check the asm that bpf_prog_run gets inlined
> > > > > after being passed as a pointer to a function?
> > > > > Crossing fingers... I suspect not every compiler can do that :(
> > > > > De-virtualization optimization used to be tricky.
> > > >
> > > > No, I didn't, but looking at it right now, both gcc and clang
> > > > seem to be doing inlining all way up to bpf_dispatcher_nop_func.
> > > >
> > > > clang:
> > > >
> > > >    0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
> > > >    __cgroup_bpf_run_filter_sock_addr():
> > > >    ./kernel/bpf/cgroup.c:1226
> > > >    int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> > > >                                       struct sockaddr *uaddr,
> > > >                                       enum cgroup_bpf_attach_type atype,
> > > >                                       void *t_ctx,
> > > >                                       u32 *flags)
> > > >    {
> > > >
> > > >    ...
> > > >
> > > >    ./include/linux/filter.h:628
> > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > >        1980:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > >    bpf_dispatcher_nop_func():
> > > >    ./include/linux/bpf.h:804
> > > >         return bpf_func(ctx, insnsi);
> > > >        1984:    4c 89 f7                mov    %r14,%rdi
> > > >        1987:    41 ff 55 30             call   *0x30(%r13)
> > > >        198b:    89 c3                   mov    %eax,%ebx
> > > >
> > > > gcc (w/retpoline):
> > > >
> > > >    0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
> > > >    __cgroup_bpf_run_filter_sock_addr():
> > > >    kernel/bpf/cgroup.c:1226
> > > >    {
> > > >
> > > >    ...
> > > >
> > > >    ./include/linux/filter.h:628
> > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > >        11c5:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > >    bpf_dispatcher_nop_func():
> > > >    ./include/linux/bpf.h:804
> > > >        11c9:    48 8d 7c 24 10          lea    0x10(%rsp),%rdi
> > > >        11ce:    e8 00 00 00 00          call   11d3
> > > > <__cgroup_bpf_run_filter_sock_addr+0xc3>
> > > >                         11cf: R_X86_64_PLT32
> > > __x86_indirect_thunk_rax-0x4
> > > >        11d3:    89 c3                   mov    %eax,%ebx
> >
> > > Hmm. I'm not sure how you've got this asm.
> > > Here is what I see with gcc 8 and gcc 10:
> > > bpf_prog_run_array_cg:
> > > ...
> > >          movq    %rcx, %r12      # run_prog, run_prog
> > > ...
> > > # ../kernel/bpf/cgroup.c:77:            run_ctx.prog_item = item;
> > >          movq    %rbx, (%rsp)    # item, run_ctx.prog_item
> > > # ../kernel/bpf/cgroup.c:78:            if (!run_prog(prog, ctx) &&
> > > !IS_ERR_VALUE((long)run_ctx.retval))
> > >          movq    %rbp, %rsi      # ctx,
> > >          call    *%r12   # run_prog
> >
> > > __cgroup_bpf_run_filter_sk:
> > >          movq    $bpf_prog_run, %rcx     #,
> > > # ../kernel/bpf/cgroup.c:1202:  return
> > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > >          leaq    1520(%rax), %rdi        #, tmp92
> > > # ../kernel/bpf/cgroup.c:1202:  return
> > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > >          jmp     bpf_prog_run_array_cg   #
> >
> > > This is without kasan, lockdep and all debug configs are off.
> >
> > > So the generated code is pretty bad as I predicted :(
> >
> > > So I'm afraid this approach is no go.
> >
> > I've retested again and it still unrolls it for me on gcc 11 :-/
> > Anyway, I guess we have two options:
> >
> > 1. Go back to defines.
> > 2. Don't pass a ptr to func, but pass an enum which indicates whether
> >     to use bpf_prog_run or __bpf_prog_run_save_cb. Seems like in this
> >     case the compiler shouldn't have any trouble unwrapping it?
> >
> > I'll prototype and send (2). If it won't work out we can always get back
> > to (1).
>
> Going back to defines is probably not necessary.
> Could you try moving bpf_prog_run_array_cg*() back to .h
> and use static __always_inline ?

Actually below was enough for gcc 8 and 10:
-static int
+static __always_inline int
 bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
                            enum cgroup_bpf_attach_type atype,
                            const void *ctx, bpf_prog_run_fn run_prog,
@@ -55,7 +55,7 @@ bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
        return run_ctx.retval;
 }

-static int
+static __always_inline int
 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,

we can keep them in .c and generated code looks good.

I can apply it with the above change.
wdyt?

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-19 16:32           ` Alexei Starovoitov
@ 2022-04-19 16:35             ` Stanislav Fomichev
  2022-04-19 16:48               ` Alexei Starovoitov
  0 siblings, 1 reply; 12+ messages in thread
From: Stanislav Fomichev @ 2022-04-19 16:35 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On Tue, Apr 19, 2022 at 9:32 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Apr 19, 2022 at 9:20 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Apr 19, 2022 at 8:42 AM <sdf@google.com> wrote:
> > >
> > > On 04/18, Alexei Starovoitov wrote:
> > > > On Mon, Apr 18, 2022 at 9:50 AM <sdf@google.com> wrote:
> > > > >
> > > > > On 04/16, Alexei Starovoitov wrote:
> > > > > > On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com>
> > > > wrote:
> > > > > > > +static int
> > > > > > > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > > > > > +                           enum cgroup_bpf_attach_type atype,
> > > > > > > +                           const void *ctx, bpf_prog_run_fn
> > > > run_prog,
> > > > > > > +                           int retval, u32 *ret_flags)
> > > > > > > +{
> > > > > > > +       const struct bpf_prog_array_item *item;
> > > > > > > +       const struct bpf_prog *prog;
> > > > > > > +       const struct bpf_prog_array *array;
> > > > > > > +       struct bpf_run_ctx *old_run_ctx;
> > > > > > > +       struct bpf_cg_run_ctx run_ctx;
> > > > > > > +       u32 func_ret;
> > > > > > > +
> > > > > > > +       run_ctx.retval = retval;
> > > > > > > +       migrate_disable();
> > > > > > > +       rcu_read_lock();
> > > > > > > +       array = rcu_dereference(cgrp->effective[atype]);
> > > > > > > +       item = &array->items[0];
> > > > > > > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > > > > > > +       while ((prog = READ_ONCE(item->prog))) {
> > > > > > > +               run_ctx.prog_item = item;
> > > > > > > +               func_ret = run_prog(prog, ctx);
> > > > > > ...
> > > > > > > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> > > > > > >                                     &ctx, bpf_prog_run, retval);
> > > > >
> > > > > > Did you check the asm that bpf_prog_run gets inlined
> > > > > > after being passed as a pointer to a function?
> > > > > > Crossing fingers... I suspect not every compiler can do that :(
> > > > > > De-virtualization optimization used to be tricky.
> > > > >
> > > > > No, I didn't, but looking at it right now, both gcc and clang
> > > > > seem to be doing inlining all way up to bpf_dispatcher_nop_func.
> > > > >
> > > > > clang:
> > > > >
> > > > >    0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
> > > > >    __cgroup_bpf_run_filter_sock_addr():
> > > > >    ./kernel/bpf/cgroup.c:1226
> > > > >    int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> > > > >                                       struct sockaddr *uaddr,
> > > > >                                       enum cgroup_bpf_attach_type atype,
> > > > >                                       void *t_ctx,
> > > > >                                       u32 *flags)
> > > > >    {
> > > > >
> > > > >    ...
> > > > >
> > > > >    ./include/linux/filter.h:628
> > > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > > >        1980:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > > >    bpf_dispatcher_nop_func():
> > > > >    ./include/linux/bpf.h:804
> > > > >         return bpf_func(ctx, insnsi);
> > > > >        1984:    4c 89 f7                mov    %r14,%rdi
> > > > >        1987:    41 ff 55 30             call   *0x30(%r13)
> > > > >        198b:    89 c3                   mov    %eax,%ebx
> > > > >
> > > > > gcc (w/retpoline):
> > > > >
> > > > >    0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
> > > > >    __cgroup_bpf_run_filter_sock_addr():
> > > > >    kernel/bpf/cgroup.c:1226
> > > > >    {
> > > > >
> > > > >    ...
> > > > >
> > > > >    ./include/linux/filter.h:628
> > > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > > >        11c5:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > > >    bpf_dispatcher_nop_func():
> > > > >    ./include/linux/bpf.h:804
> > > > >        11c9:    48 8d 7c 24 10          lea    0x10(%rsp),%rdi
> > > > >        11ce:    e8 00 00 00 00          call   11d3
> > > > > <__cgroup_bpf_run_filter_sock_addr+0xc3>
> > > > >                         11cf: R_X86_64_PLT32
> > > > __x86_indirect_thunk_rax-0x4
> > > > >        11d3:    89 c3                   mov    %eax,%ebx
> > >
> > > > Hmm. I'm not sure how you've got this asm.
> > > > Here is what I see with gcc 8 and gcc 10:
> > > > bpf_prog_run_array_cg:
> > > > ...
> > > >          movq    %rcx, %r12      # run_prog, run_prog
> > > > ...
> > > > # ../kernel/bpf/cgroup.c:77:            run_ctx.prog_item = item;
> > > >          movq    %rbx, (%rsp)    # item, run_ctx.prog_item
> > > > # ../kernel/bpf/cgroup.c:78:            if (!run_prog(prog, ctx) &&
> > > > !IS_ERR_VALUE((long)run_ctx.retval))
> > > >          movq    %rbp, %rsi      # ctx,
> > > >          call    *%r12   # run_prog
> > >
> > > > __cgroup_bpf_run_filter_sk:
> > > >          movq    $bpf_prog_run, %rcx     #,
> > > > # ../kernel/bpf/cgroup.c:1202:  return
> > > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > > >          leaq    1520(%rax), %rdi        #, tmp92
> > > > # ../kernel/bpf/cgroup.c:1202:  return
> > > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > > >          jmp     bpf_prog_run_array_cg   #
> > >
> > > > This is without kasan, lockdep and all debug configs are off.
> > >
> > > > So the generated code is pretty bad as I predicted :(
> > >
> > > > So I'm afraid this approach is no go.
> > >
> > > I've retested again and it still unrolls it for me on gcc 11 :-/
> > > Anyway, I guess we have two options:
> > >
> > > 1. Go back to defines.
> > > 2. Don't pass a ptr to func, but pass an enum which indicates whether
> > >     to use bpf_prog_run or __bpf_prog_run_save_cb. Seems like in this
> > >     case the compiler shouldn't have any trouble unwrapping it?
> > >
> > > I'll prototype and send (2). If it won't work out we can always get back
> > > to (1).
> >
> > Going back to defines is probably not necessary.
> > Could you try moving bpf_prog_run_array_cg*() back to .h
> > and use static __always_inline ?
>
> Actually below was enough for gcc 8 and 10:
> -static int
> +static __always_inline int
>  bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
>                             enum cgroup_bpf_attach_type atype,
>                             const void *ctx, bpf_prog_run_fn run_prog,
> @@ -55,7 +55,7 @@ bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
>         return run_ctx.retval;
>  }
>
> -static int
> +static __always_inline int
>  bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>
> we can keep them in .c and generated code looks good.
>
> I can apply it with the above change.
> wdyt?

Sure, let's go with that if it works! On my side, I managed to get the
same bad results on gcc-8; moving them to bpf-cgroup.h with
__always_inline seems to fix it. But if we can keep them in .c, that
looks even better.

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-19 16:35             ` Stanislav Fomichev
@ 2022-04-19 16:48               ` Alexei Starovoitov
  2022-04-19 17:01                 ` Stanislav Fomichev
  0 siblings, 1 reply; 12+ messages in thread
From: Alexei Starovoitov @ 2022-04-19 16:48 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On Tue, Apr 19, 2022 at 9:35 AM Stanislav Fomichev <sdf@google.com> wrote:
>
> On Tue, Apr 19, 2022 at 9:32 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Apr 19, 2022 at 9:20 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Apr 19, 2022 at 8:42 AM <sdf@google.com> wrote:
> > > >
> > > > On 04/18, Alexei Starovoitov wrote:
> > > > > On Mon, Apr 18, 2022 at 9:50 AM <sdf@google.com> wrote:
> > > > > >
> > > > > > On 04/16, Alexei Starovoitov wrote:
> > > > > > > On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com>
> > > > > wrote:
> > > > > > > > +static int
> > > > > > > > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > > > > > > +                           enum cgroup_bpf_attach_type atype,
> > > > > > > > +                           const void *ctx, bpf_prog_run_fn
> > > > > run_prog,
> > > > > > > > +                           int retval, u32 *ret_flags)
> > > > > > > > +{
> > > > > > > > +       const struct bpf_prog_array_item *item;
> > > > > > > > +       const struct bpf_prog *prog;
> > > > > > > > +       const struct bpf_prog_array *array;
> > > > > > > > +       struct bpf_run_ctx *old_run_ctx;
> > > > > > > > +       struct bpf_cg_run_ctx run_ctx;
> > > > > > > > +       u32 func_ret;
> > > > > > > > +
> > > > > > > > +       run_ctx.retval = retval;
> > > > > > > > +       migrate_disable();
> > > > > > > > +       rcu_read_lock();
> > > > > > > > +       array = rcu_dereference(cgrp->effective[atype]);
> > > > > > > > +       item = &array->items[0];
> > > > > > > > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > > > > > > > +       while ((prog = READ_ONCE(item->prog))) {
> > > > > > > > +               run_ctx.prog_item = item;
> > > > > > > > +               func_ret = run_prog(prog, ctx);
> > > > > > > ...
> > > > > > > > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> > > > > > > >                                     &ctx, bpf_prog_run, retval);
> > > > > >
> > > > > > > Did you check the asm that bpf_prog_run gets inlined
> > > > > > > after being passed as a pointer to a function?
> > > > > > > Crossing fingers... I suspect not every compiler can do that :(
> > > > > > > De-virtualization optimization used to be tricky.
> > > > > >
> > > > > > No, I didn't, but looking at it right now, both gcc and clang
> > > > > > seem to be doing inlining all way up to bpf_dispatcher_nop_func.
> > > > > >
> > > > > > clang:
> > > > > >
> > > > > >    0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
> > > > > >    __cgroup_bpf_run_filter_sock_addr():
> > > > > >    ./kernel/bpf/cgroup.c:1226
> > > > > >    int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> > > > > >                                       struct sockaddr *uaddr,
> > > > > >                                       enum cgroup_bpf_attach_type atype,
> > > > > >                                       void *t_ctx,
> > > > > >                                       u32 *flags)
> > > > > >    {
> > > > > >
> > > > > >    ...
> > > > > >
> > > > > >    ./include/linux/filter.h:628
> > > > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > > > >        1980:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > > > >    bpf_dispatcher_nop_func():
> > > > > >    ./include/linux/bpf.h:804
> > > > > >         return bpf_func(ctx, insnsi);
> > > > > >        1984:    4c 89 f7                mov    %r14,%rdi
> > > > > >        1987:    41 ff 55 30             call   *0x30(%r13)
> > > > > >        198b:    89 c3                   mov    %eax,%ebx
> > > > > >
> > > > > > gcc (w/retpoline):
> > > > > >
> > > > > >    0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
> > > > > >    __cgroup_bpf_run_filter_sock_addr():
> > > > > >    kernel/bpf/cgroup.c:1226
> > > > > >    {
> > > > > >
> > > > > >    ...
> > > > > >
> > > > > >    ./include/linux/filter.h:628
> > > > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > > > >        11c5:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > > > >    bpf_dispatcher_nop_func():
> > > > > >    ./include/linux/bpf.h:804
> > > > > >        11c9:    48 8d 7c 24 10          lea    0x10(%rsp),%rdi
> > > > > >        11ce:    e8 00 00 00 00          call   11d3
> > > > > > <__cgroup_bpf_run_filter_sock_addr+0xc3>
> > > > > >                         11cf: R_X86_64_PLT32
> > > > > __x86_indirect_thunk_rax-0x4
> > > > > >        11d3:    89 c3                   mov    %eax,%ebx
> > > >
> > > > > Hmm. I'm not sure how you've got this asm.
> > > > > Here is what I see with gcc 8 and gcc 10:
> > > > > bpf_prog_run_array_cg:
> > > > > ...
> > > > >          movq    %rcx, %r12      # run_prog, run_prog
> > > > > ...
> > > > > # ../kernel/bpf/cgroup.c:77:            run_ctx.prog_item = item;
> > > > >          movq    %rbx, (%rsp)    # item, run_ctx.prog_item
> > > > > # ../kernel/bpf/cgroup.c:78:            if (!run_prog(prog, ctx) &&
> > > > > !IS_ERR_VALUE((long)run_ctx.retval))
> > > > >          movq    %rbp, %rsi      # ctx,
> > > > >          call    *%r12   # run_prog
> > > >
> > > > > __cgroup_bpf_run_filter_sk:
> > > > >          movq    $bpf_prog_run, %rcx     #,
> > > > > # ../kernel/bpf/cgroup.c:1202:  return
> > > > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > > > >          leaq    1520(%rax), %rdi        #, tmp92
> > > > > # ../kernel/bpf/cgroup.c:1202:  return
> > > > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > > > >          jmp     bpf_prog_run_array_cg   #
> > > >
> > > > > This is without kasan, lockdep and all debug configs are off.
> > > >
> > > > > So the generated code is pretty bad as I predicted :(
> > > >
> > > > > So I'm afraid this approach is no go.
> > > >
> > > > I've retested again and it still unrolls it for me on gcc 11 :-/
> > > > Anyway, I guess we have two options:
> > > >
> > > > 1. Go back to defines.
> > > > 2. Don't pass a ptr to func, but pass an enum which indicates whether
> > > >     to use bpf_prog_run or __bpf_prog_run_save_cb. Seems like in this
> > > >     case the compiler shouldn't have any trouble unwrapping it?
> > > >
> > > > I'll prototype and send (2). If it won't work out we can always get back
> > > > to (1).
> > >
> > > Going back to defines is probably not necessary.
> > > Could you try moving bpf_prog_run_array_cg*() back to .h
> > > and use static __always_inline ?
> >
> > Actually below was enough for gcc 8 and 10:
> > -static int
> > +static __always_inline int
> >  bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> >                             enum cgroup_bpf_attach_type atype,
> >                             const void *ctx, bpf_prog_run_fn run_prog,
> > @@ -55,7 +55,7 @@ bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> >         return run_ctx.retval;
> >  }
> >
> > -static int
> > +static __always_inline int
> >  bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> >
> > we can keep them in .c and generated code looks good.
> >
> > I can apply it with the above change.
> > wdyt?
>
> Sure, let's go with that if it works! On my side, I managed to get the
> same bad results on gcc-8; moving them to bpf-cgroup.h with
> __always_inline seems to fix it. But if we can keep them in .c, that
> looks even better.

Ok. Applied.

As the next step... can we combine bpf_prog_run_array_cg*()
into one function?
The only difference is:
func_ret = run_prog(prog, ctx);
if (!(func_ret & 1))
  vs
if (!run_prog(prog, ctx))

afaik we don't have a check on the verifier side for possible
return values of cgroup progs,
so it might break some progs if we just do the former
in both cases?
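
For illustration only, here is a rough sketch (not the applied patch) of
what a single combined helper could look like if ret_flags is made an
optional, NULL-checked argument; the loop is the same one quoted above,
only the flags handling differs:

static __always_inline int
bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
		      enum cgroup_bpf_attach_type atype,
		      const void *ctx, bpf_prog_run_fn run_prog,
		      int retval, u32 *ret_flags)
{
	const struct bpf_prog_array_item *item;
	const struct bpf_prog *prog;
	const struct bpf_prog_array *array;
	struct bpf_run_ctx *old_run_ctx;
	struct bpf_cg_run_ctx run_ctx;
	u32 func_ret;

	run_ctx.retval = retval;
	migrate_disable();
	rcu_read_lock();
	array = rcu_dereference(cgrp->effective[atype]);
	item = &array->items[0];
	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
	while ((prog = READ_ONCE(item->prog))) {
		run_ctx.prog_item = item;
		func_ret = run_prog(prog, ctx);
		if (ret_flags) {
			/* callers that care split the return value:
			 * bit 0 is the verdict, the upper bits are flags
			 */
			*ret_flags |= (func_ret >> 1);
			func_ret &= 1;
		}
		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
			run_ctx.retval = -EPERM;
		item++;
	}
	bpf_reset_run_ctx(old_run_ctx);
	rcu_read_unlock();
	migrate_enable();
	return run_ctx.retval;
}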

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-19 16:48               ` Alexei Starovoitov
@ 2022-04-19 17:01                 ` Stanislav Fomichev
  2022-04-19 17:05                   ` Alexei Starovoitov
  0 siblings, 1 reply; 12+ messages in thread
From: Stanislav Fomichev @ 2022-04-19 17:01 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On Tue, Apr 19, 2022 at 9:49 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Apr 19, 2022 at 9:35 AM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > On Tue, Apr 19, 2022 at 9:32 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Apr 19, 2022 at 9:20 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Tue, Apr 19, 2022 at 8:42 AM <sdf@google.com> wrote:
> > > > >
> > > > > On 04/18, Alexei Starovoitov wrote:
> > > > > > On Mon, Apr 18, 2022 at 9:50 AM <sdf@google.com> wrote:
> > > > > > >
> > > > > > > On 04/16, Alexei Starovoitov wrote:
> > > > > > > > On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com>
> > > > > > wrote:
> > > > > > > > > +static int
> > > > > > > > > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > > > > > > > +                           enum cgroup_bpf_attach_type atype,
> > > > > > > > > +                           const void *ctx, bpf_prog_run_fn
> > > > > > run_prog,
> > > > > > > > > +                           int retval, u32 *ret_flags)
> > > > > > > > > +{
> > > > > > > > > +       const struct bpf_prog_array_item *item;
> > > > > > > > > +       const struct bpf_prog *prog;
> > > > > > > > > +       const struct bpf_prog_array *array;
> > > > > > > > > +       struct bpf_run_ctx *old_run_ctx;
> > > > > > > > > +       struct bpf_cg_run_ctx run_ctx;
> > > > > > > > > +       u32 func_ret;
> > > > > > > > > +
> > > > > > > > > +       run_ctx.retval = retval;
> > > > > > > > > +       migrate_disable();
> > > > > > > > > +       rcu_read_lock();
> > > > > > > > > +       array = rcu_dereference(cgrp->effective[atype]);
> > > > > > > > > +       item = &array->items[0];
> > > > > > > > > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > > > > > > > > +       while ((prog = READ_ONCE(item->prog))) {
> > > > > > > > > +               run_ctx.prog_item = item;
> > > > > > > > > +               func_ret = run_prog(prog, ctx);
> > > > > > > > ...
> > > > > > > > > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> > > > > > > > >                                     &ctx, bpf_prog_run, retval);
> > > > > > >
> > > > > > > > Did you check the asm that bpf_prog_run gets inlined
> > > > > > > > after being passed as a pointer to a function?
> > > > > > > > Crossing fingers... I suspect not every compiler can do that :(
> > > > > > > > De-virtualization optimization used to be tricky.
> > > > > > >
> > > > > > > No, I didn't, but looking at it right now, both gcc and clang
> > > > > > > seem to be doing inlining all way up to bpf_dispatcher_nop_func.
> > > > > > >
> > > > > > > clang:
> > > > > > >
> > > > > > >    0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
> > > > > > >    __cgroup_bpf_run_filter_sock_addr():
> > > > > > >    ./kernel/bpf/cgroup.c:1226
> > > > > > >    int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> > > > > > >                                       struct sockaddr *uaddr,
> > > > > > >                                       enum cgroup_bpf_attach_type atype,
> > > > > > >                                       void *t_ctx,
> > > > > > >                                       u32 *flags)
> > > > > > >    {
> > > > > > >
> > > > > > >    ...
> > > > > > >
> > > > > > >    ./include/linux/filter.h:628
> > > > > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > > > > >        1980:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > > > > >    bpf_dispatcher_nop_func():
> > > > > > >    ./include/linux/bpf.h:804
> > > > > > >         return bpf_func(ctx, insnsi);
> > > > > > >        1984:    4c 89 f7                mov    %r14,%rdi
> > > > > > >        1987:    41 ff 55 30             call   *0x30(%r13)
> > > > > > >        198b:    89 c3                   mov    %eax,%ebx
> > > > > > >
> > > > > > > gcc (w/retpoline):
> > > > > > >
> > > > > > >    0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
> > > > > > >    __cgroup_bpf_run_filter_sock_addr():
> > > > > > >    kernel/bpf/cgroup.c:1226
> > > > > > >    {
> > > > > > >
> > > > > > >    ...
> > > > > > >
> > > > > > >    ./include/linux/filter.h:628
> > > > > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > > > > >        11c5:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > > > > >    bpf_dispatcher_nop_func():
> > > > > > >    ./include/linux/bpf.h:804
> > > > > > >        11c9:    48 8d 7c 24 10          lea    0x10(%rsp),%rdi
> > > > > > >        11ce:    e8 00 00 00 00          call   11d3
> > > > > > > <__cgroup_bpf_run_filter_sock_addr+0xc3>
> > > > > > >                         11cf: R_X86_64_PLT32
> > > > > > __x86_indirect_thunk_rax-0x4
> > > > > > >        11d3:    89 c3                   mov    %eax,%ebx
> > > > >
> > > > > > Hmm. I'm not sure how you've got this asm.
> > > > > > Here is what I see with gcc 8 and gcc 10:
> > > > > > bpf_prog_run_array_cg:
> > > > > > ...
> > > > > >          movq    %rcx, %r12      # run_prog, run_prog
> > > > > > ...
> > > > > > # ../kernel/bpf/cgroup.c:77:            run_ctx.prog_item = item;
> > > > > >          movq    %rbx, (%rsp)    # item, run_ctx.prog_item
> > > > > > # ../kernel/bpf/cgroup.c:78:            if (!run_prog(prog, ctx) &&
> > > > > > !IS_ERR_VALUE((long)run_ctx.retval))
> > > > > >          movq    %rbp, %rsi      # ctx,
> > > > > >          call    *%r12   # run_prog
> > > > >
> > > > > > __cgroup_bpf_run_filter_sk:
> > > > > >          movq    $bpf_prog_run, %rcx     #,
> > > > > > # ../kernel/bpf/cgroup.c:1202:  return
> > > > > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > > > > >          leaq    1520(%rax), %rdi        #, tmp92
> > > > > > # ../kernel/bpf/cgroup.c:1202:  return
> > > > > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > > > > >          jmp     bpf_prog_run_array_cg   #
> > > > >
> > > > > > This is without kasan, lockdep and all debug configs are off.
> > > > >
> > > > > > So the generated code is pretty bad as I predicted :(
> > > > >
> > > > > > So I'm afraid this approach is no go.
> > > > >
> > > > > I've retested again and it still unrolls it for me on gcc 11 :-/
> > > > > Anyway, I guess we have two options:
> > > > >
> > > > > 1. Go back to defines.
> > > > > 2. Don't pass a ptr to func, but pass an enum which indicates whether
> > > > >     to use bpf_prog_run or __bpf_prog_run_save_cb. Seems like in this
> > > > >     case the compiler shouldn't have any trouble unwrapping it?
> > > > >
> > > > > I'll prototype and send (2). If it won't work out we can always get back
> > > > > to (1).
> > > >
> > > > Going back to defines is probably not necessary.
> > > > Could you try moving bpf_prog_run_array_cg*() back to .h
> > > > and use static __always_inline ?
> > >
> > > Actually below was enough for gcc 8 and 10:
> > > -static int
> > > +static __always_inline int
> > >  bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > >                             enum cgroup_bpf_attach_type atype,
> > >                             const void *ctx, bpf_prog_run_fn run_prog,
> > > @@ -55,7 +55,7 @@ bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > >         return run_ctx.retval;
> > >  }
> > >
> > > -static int
> > > +static __always_inline int
> > >  bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> > >
> > > we can keep them in .c and generated code looks good.
> > >
> > > I can apply it with the above change.
> > > wdyt?
> >
> > Sure, let's go with that if it works! On my side, I managed to get the
> > same bad results on gcc-8; moving them to bpf-cgroup.h with
> > __always_inline seems to fix it. But if we can keep them in .c, that
> > looks even better.
>
> Ok. Applied.
>
> As the next step... can we combine bpf_prog_run_array_cg*()
> into one function?
> The only difference is:
> func_ret = run_prog(prog, ctx);
> if (!(func_ret & 1)
>   vs
> if (!run_prog(prog, ctx)
>
> afaik we don't have a check on the verifier side for possible
> return values of cgroup progs,
> so it might break some progs if we just do the former
> in both cases?

Seems like we do have return range checking for cgroup progs (I'm
looking at check_return_code). Using bpf_prog_run_array_cg_flags
everywhere seems possible; I can try it and post some patches if it
works.
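
To sketch the idea (purely illustrative, not a posted patch; it assumes
the combined helper grows an optional ret_flags argument), a caller that
doesn't care about flags would then simply pass NULL:

int __cgroup_bpf_run_filter_sk(struct sock *sk,
			       enum cgroup_bpf_attach_type atype)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

	/* no flags wanted: pass NULL so the always-inlined helper can
	 * drop the flags handling entirely
	 */
	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
				     NULL);
}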

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines
  2022-04-19 17:01                 ` Stanislav Fomichev
@ 2022-04-19 17:05                   ` Alexei Starovoitov
  0 siblings, 0 replies; 12+ messages in thread
From: Alexei Starovoitov @ 2022-04-19 17:05 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Network Development, bpf, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau

On Tue, Apr 19, 2022 at 10:01 AM Stanislav Fomichev <sdf@google.com> wrote:
>
> On Tue, Apr 19, 2022 at 9:49 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Apr 19, 2022 at 9:35 AM Stanislav Fomichev <sdf@google.com> wrote:
> > >
> > > On Tue, Apr 19, 2022 at 9:32 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Tue, Apr 19, 2022 at 9:20 AM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Tue, Apr 19, 2022 at 8:42 AM <sdf@google.com> wrote:
> > > > > >
> > > > > > On 04/18, Alexei Starovoitov wrote:
> > > > > > > On Mon, Apr 18, 2022 at 9:50 AM <sdf@google.com> wrote:
> > > > > > > >
> > > > > > > > On 04/16, Alexei Starovoitov wrote:
> > > > > > > > > On Thu, Apr 14, 2022 at 9:12 AM Stanislav Fomichev <sdf@google.com>
> > > > > > > wrote:
> > > > > > > > > > +static int
> > > > > > > > > > +bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > > > > > > > > +                           enum cgroup_bpf_attach_type atype,
> > > > > > > > > > +                           const void *ctx, bpf_prog_run_fn
> > > > > > > run_prog,
> > > > > > > > > > +                           int retval, u32 *ret_flags)
> > > > > > > > > > +{
> > > > > > > > > > +       const struct bpf_prog_array_item *item;
> > > > > > > > > > +       const struct bpf_prog *prog;
> > > > > > > > > > +       const struct bpf_prog_array *array;
> > > > > > > > > > +       struct bpf_run_ctx *old_run_ctx;
> > > > > > > > > > +       struct bpf_cg_run_ctx run_ctx;
> > > > > > > > > > +       u32 func_ret;
> > > > > > > > > > +
> > > > > > > > > > +       run_ctx.retval = retval;
> > > > > > > > > > +       migrate_disable();
> > > > > > > > > > +       rcu_read_lock();
> > > > > > > > > > +       array = rcu_dereference(cgrp->effective[atype]);
> > > > > > > > > > +       item = &array->items[0];
> > > > > > > > > > +       old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> > > > > > > > > > +       while ((prog = READ_ONCE(item->prog))) {
> > > > > > > > > > +               run_ctx.prog_item = item;
> > > > > > > > > > +               func_ret = run_prog(prog, ctx);
> > > > > > > > > ...
> > > > > > > > > > +       ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> > > > > > > > > >                                     &ctx, bpf_prog_run, retval);
> > > > > > > >
> > > > > > > > > Did you check the asm that bpf_prog_run gets inlined
> > > > > > > > > after being passed as a pointer to a function?
> > > > > > > > > Crossing fingers... I suspect not every compiler can do that :(
> > > > > > > > > De-virtualization optimization used to be tricky.
> > > > > > > >
> > > > > > > > No, I didn't, but looking at it right now, both gcc and clang
> > > > > > > > seem to be doing inlining all way up to bpf_dispatcher_nop_func.
> > > > > > > >
> > > > > > > > clang:
> > > > > > > >
> > > > > > > >    0000000000001750 <__cgroup_bpf_run_filter_sock_addr>:
> > > > > > > >    __cgroup_bpf_run_filter_sock_addr():
> > > > > > > >    ./kernel/bpf/cgroup.c:1226
> > > > > > > >    int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
> > > > > > > >                                       struct sockaddr *uaddr,
> > > > > > > >                                       enum cgroup_bpf_attach_type atype,
> > > > > > > >                                       void *t_ctx,
> > > > > > > >                                       u32 *flags)
> > > > > > > >    {
> > > > > > > >
> > > > > > > >    ...
> > > > > > > >
> > > > > > > >    ./include/linux/filter.h:628
> > > > > > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > > > > > >        1980:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > > > > > >    bpf_dispatcher_nop_func():
> > > > > > > >    ./include/linux/bpf.h:804
> > > > > > > >         return bpf_func(ctx, insnsi);
> > > > > > > >        1984:    4c 89 f7                mov    %r14,%rdi
> > > > > > > >        1987:    41 ff 55 30             call   *0x30(%r13)
> > > > > > > >        198b:    89 c3                   mov    %eax,%ebx
> > > > > > > >
> > > > > > > > gcc (w/retpoline):
> > > > > > > >
> > > > > > > >    0000000000001110 <__cgroup_bpf_run_filter_sock_addr>:
> > > > > > > >    __cgroup_bpf_run_filter_sock_addr():
> > > > > > > >    kernel/bpf/cgroup.c:1226
> > > > > > > >    {
> > > > > > > >
> > > > > > > >    ...
> > > > > > > >
> > > > > > > >    ./include/linux/filter.h:628
> > > > > > > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > > > > > >        11c5:    49 8d 75 48             lea    0x48(%r13),%rsi
> > > > > > > >    bpf_dispatcher_nop_func():
> > > > > > > >    ./include/linux/bpf.h:804
> > > > > > > >        11c9:    48 8d 7c 24 10          lea    0x10(%rsp),%rdi
> > > > > > > >        11ce:    e8 00 00 00 00          call   11d3
> > > > > > > > <__cgroup_bpf_run_filter_sock_addr+0xc3>
> > > > > > > >                         11cf: R_X86_64_PLT32
> > > > > > > __x86_indirect_thunk_rax-0x4
> > > > > > > >        11d3:    89 c3                   mov    %eax,%ebx
> > > > > >
> > > > > > > Hmm. I'm not sure how you've got this asm.
> > > > > > > Here is what I see with gcc 8 and gcc 10:
> > > > > > > bpf_prog_run_array_cg:
> > > > > > > ...
> > > > > > >          movq    %rcx, %r12      # run_prog, run_prog
> > > > > > > ...
> > > > > > > # ../kernel/bpf/cgroup.c:77:            run_ctx.prog_item = item;
> > > > > > >          movq    %rbx, (%rsp)    # item, run_ctx.prog_item
> > > > > > > # ../kernel/bpf/cgroup.c:78:            if (!run_prog(prog, ctx) &&
> > > > > > > !IS_ERR_VALUE((long)run_ctx.retval))
> > > > > > >          movq    %rbp, %rsi      # ctx,
> > > > > > >          call    *%r12   # run_prog
> > > > > >
> > > > > > > __cgroup_bpf_run_filter_sk:
> > > > > > >          movq    $bpf_prog_run, %rcx     #,
> > > > > > > # ../kernel/bpf/cgroup.c:1202:  return
> > > > > > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > > > > > >          leaq    1520(%rax), %rdi        #, tmp92
> > > > > > > # ../kernel/bpf/cgroup.c:1202:  return
> > > > > > > bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
> > > > > > >          jmp     bpf_prog_run_array_cg   #
> > > > > >
> > > > > > > This is without kasan, lockdep and all debug configs are off.
> > > > > >
> > > > > > > So the generated code is pretty bad as I predicted :(
> > > > > >
> > > > > > > So I'm afraid this approach is no go.
> > > > > >
> > > > > > I've retested again and it still unrolls it for me on gcc 11 :-/
> > > > > > Anyway, I guess we have two options:
> > > > > >
> > > > > > 1. Go back to defines.
> > > > > > 2. Don't pass a ptr to func, but pass an enum which indicates whether
> > > > > >     to use bpf_prog_run or __bpf_prog_run_save_cb. Seems like in this
> > > > > >     case the compiler shouldn't have any trouble unwrapping it?
> > > > > >
> > > > > > I'll prototype and send (2). If it won't work out we can always get back
> > > > > > to (1).
> > > > >
> > > > > Going back to defines is probably not necessary.
> > > > > Could you try moving bpf_prog_run_array_cg*() back to .h
> > > > > and use static __always_inline ?
> > > >
> > > > Actually below was enough for gcc 8 and 10:
> > > > -static int
> > > > +static __always_inline int
> > > >  bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > >                             enum cgroup_bpf_attach_type atype,
> > > >                             const void *ctx, bpf_prog_run_fn run_prog,
> > > > @@ -55,7 +55,7 @@ bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
> > > >         return run_ctx.retval;
> > > >  }
> > > >
> > > > -static int
> > > > +static __always_inline int
> > > >  bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> > > >
> > > > we can keep them in .c and generated code looks good.
> > > >
> > > > I can apply it with the above change.
> > > > wdyt?
> > >
> > > Sure, let's go with that if it works! On my side, I managed to get the
> > > same bad results on gcc-8; moving them to bpf-cgroup.h with
> > > __always_inline seems to fix it. But if we can keep them in .c, that
> > > looks even better.
> >
> > Ok. Applied.
> >
> > As the next step... can we combine bpf_prog_run_array_cg*()
> > into one function?
> > The only difference is:
> > func_ret = run_prog(prog, ctx);
> > if (!(func_ret & 1)
> >   vs
> > if (!run_prog(prog, ctx)
> >
> > afaik we don't have a check on the verifier side for possible
> > return values of cgroup progs,
> > so it might break some progs if we just do the former
> > in both cases?
>
> Seems like we do have return ranges checking for cgroup progs (I'm
> looking at check_return_code). Using bpf_prog_run_array_cg_flags
> everywhere seems possible, I can try and post some patches if it
> works.

Thanks!
Since it's always_inline, the extra 'if (ret_flags) *ret_flags = ...'
in the critical path is fine; it will be optimized out by the compiler
when ret_flags is NULL.
Hopefully the compiler can also hoist the check out of the while loop
when it's not NULL, but that's no big deal:
that array typically has only one prog in it.
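
As a toy illustration of that point (generic C, not kernel code): with
the helper forced inline, a compile-time NULL argument makes the whole
flags branch dead code that the compiler can remove.

static inline __attribute__((always_inline))
unsigned int run_one(unsigned int v, unsigned int *ret_flags)
{
	if (ret_flags)			/* folded away when the caller passes NULL */
		*ret_flags |= v >> 1;	/* upper bits are "flags" */
	return v & 1;			/* bit 0 is the verdict */
}

unsigned int caller_without_flags(unsigned int v)
{
	return run_one(v, NULL);	/* reduces to just "v & 1" */
}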

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2022-04-19 17:05 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-14 16:12 [PATCH bpf-next v2] bpf: move rcu lock management out of BPF_PROG_RUN routines Stanislav Fomichev
2022-04-14 20:23 ` Martin KaFai Lau
2022-04-16  1:28 ` Alexei Starovoitov
2022-04-18 16:50   ` sdf
2022-04-19  5:18     ` Alexei Starovoitov
2022-04-19 15:42       ` sdf
2022-04-19 16:20         ` Alexei Starovoitov
2022-04-19 16:32           ` Alexei Starovoitov
2022-04-19 16:35             ` Stanislav Fomichev
2022-04-19 16:48               ` Alexei Starovoitov
2022-04-19 17:01                 ` Stanislav Fomichev
2022-04-19 17:05                   ` Alexei Starovoitov
