* [PATCH bpf-next 0/4] bpf: per program stats
@ 2019-02-22 23:36 Alexei Starovoitov
  2019-02-22 23:36 ` [PATCH bpf-next 1/4] bpf: enable " Alexei Starovoitov
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-22 23:36 UTC (permalink / raw)
  To: davem; +Cc: daniel, netdev, bpf, kernel-team

Introduce per program stats to monitor the usage of BPF.

Alexei Starovoitov (4):
  bpf: enable program stats
  bpf: expose program stats via bpf_prog_info
  tools/bpf: sync bpf.h into tools
  tools/bpftool: recognize bpf_prog_info runtime and runcnt

 include/linux/bpf.h                           |  7 ++++
 include/linux/filter.h                        | 16 ++++++++-
 include/uapi/linux/bpf.h                      |  2 ++
 kernel/bpf/core.c                             | 13 +++++++
 kernel/bpf/syscall.c                          | 29 ++++++++++++++--
 kernel/bpf/verifier.c                         |  5 +++
 kernel/sysctl.c                               | 34 +++++++++++++++++++
 .../bpftool/Documentation/bpftool-prog.rst    |  4 ++-
 tools/bpf/bpftool/prog.c                      |  7 ++++
 tools/include/uapi/linux/bpf.h                |  2 ++
 10 files changed, 115 insertions(+), 4 deletions(-)

-- 
2.20.0



* [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-22 23:36 [PATCH bpf-next 0/4] bpf: per program stats Alexei Starovoitov
@ 2019-02-22 23:36 ` Alexei Starovoitov
  2019-02-23  0:05   ` Eric Dumazet
  2019-02-23  0:34   ` Roman Gushchin
  2019-02-22 23:36 ` [PATCH bpf-next 2/4] bpf: expose program stats via bpf_prog_info Alexei Starovoitov
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-22 23:36 UTC (permalink / raw)
  To: davem; +Cc: daniel, netdev, bpf, kernel-team

JITed BPF programs are indistinguishable from kernel functions, but unlike
kernel code, BPF code can be changed often.
The typical approach of "perf record" + "perf report" profiling and tuning of
kernel code works just as well for BPF programs, but kernel code doesn't
need to be monitored whereas BPF programs do.
Users load and run a large number of BPF programs.
These BPF stats allow tools to monitor the usage of BPF on the server.
The monitoring tools will turn the sysctl kernel.bpf_stats_enabled
on and off for a few seconds to sample the average cost of the programs.
Aggregated data over hours and days will provide an insight into the cost
of BPF, and alarms can trigger in case a given program suddenly gets more
expensive.

The cost of two sched_clock() calls per program invocation adds ~20 nsec.
Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
from ~40 nsec to ~60 nsec.
A static_key minimizes the cost of the stats collection.
There is no measurable difference before/after this patch
with kernel.bpf_stats_enabled=0.
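
As a rough illustration of the sampling workflow above (not part of this
patch), a monitoring tool could flip the sysctl around a short window and
read the new fdinfo fields back. In this sketch prog_fd is assumed to be
a BPF program fd the tool already holds, and the 5-second window is
arbitrary:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void set_bpf_stats(int on)
{
	FILE *f = fopen("/proc/sys/kernel/bpf_stats_enabled", "w");

	if (!f)
		return;
	fprintf(f, "%d\n", on);
	fclose(f);
}

static void sample_prog_cost(int prog_fd)
{
	char path[64], line[128];
	FILE *f;

	set_bpf_stats(1);	/* start collecting */
	sleep(5);		/* sampling window */
	set_bpf_stats(0);	/* stop, so steady state pays no cost */

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", prog_fd);
	f = fopen(path, "r");
	if (!f)
		return;
	/* the patch adds "runtime:" (nsec) and "runcnt:" lines to fdinfo */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "runtime:", 8) || !strncmp(line, "runcnt:", 7))
			fputs(line, stdout);
	fclose(f);
}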

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    |  7 +++++++
 include/linux/filter.h | 16 +++++++++++++++-
 kernel/bpf/core.c      | 13 +++++++++++++
 kernel/bpf/syscall.c   | 24 ++++++++++++++++++++++--
 kernel/bpf/verifier.c  |  5 +++++
 kernel/sysctl.c        | 34 ++++++++++++++++++++++++++++++++++
 6 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index de18227b3d95..14260674bc57 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -340,6 +340,11 @@ enum bpf_cgroup_storage_type {
 
 #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
 
+struct bpf_prog_stats {
+	u64 cnt;
+	u64 nsecs;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -389,6 +394,7 @@ struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct bpf_prog_stats __percpu *stats;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -559,6 +565,7 @@ void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
 extern int sysctl_unprivileged_bpf_disabled;
+extern int sysctl_bpf_stats_enabled;
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f32b3eca5a04..7b14235b6f7d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -533,7 +533,21 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
+DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+
+#define BPF_PROG_RUN(prog, ctx)	({				\
+	u32 ret;						\
+	cant_sleep();						\
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
+		u64 start = sched_clock();			\
+		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
+		this_cpu_inc(prog->aux->stats->cnt);		\
+		this_cpu_add(prog->aux->stats->nsecs,		\
+			     sched_clock() - start);		\
+	} else {						\
+		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
+	}							\
+	ret; })
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ef88b167959d..574747f98d44 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -95,6 +95,13 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 		return NULL;
 	}
 
+	aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+	if (!aux->stats) {
+		kfree(aux);
+		vfree(fp);
+		return NULL;
+	}
+
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
 	fp->aux->prog = fp;
@@ -231,6 +238,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 
 void __bpf_prog_free(struct bpf_prog *fp)
 {
+	if (fp->aux && !fp->is_func)
+		free_percpu(fp->aux->stats);
 	kfree(fp->aux);
 	vfree(fp);
 }
@@ -2069,6 +2078,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 	return -EFAULT;
 }
 
+DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+EXPORT_SYMBOL(bpf_stats_enabled_key);
+int sysctl_bpf_stats_enabled __read_mostly;
+
 /* All definitions of tracepoints related to BPF. */
 #define CREATE_TRACE_POINTS
 #include <linux/bpf_trace.h>
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ec7c552af76b..079e7fa14f39 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1283,24 +1283,44 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static void bpf_prog_get_stats(const struct bpf_prog *prog,
+			       struct bpf_prog_stats *stats)
+{
+	u64 nsecs = 0, cnt = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		nsecs += per_cpu(prog->aux->stats->nsecs, cpu);
+		cnt += per_cpu(prog->aux->stats->cnt, cpu);
+	}
+	stats->nsecs = nsecs;
+	stats->cnt = cnt;
+}
+
 #ifdef CONFIG_PROC_FS
 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 {
 	const struct bpf_prog *prog = filp->private_data;
 	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+	struct bpf_prog_stats stats;
 
+	bpf_prog_get_stats(prog, &stats);
 	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
 	seq_printf(m,
 		   "prog_type:\t%u\n"
 		   "prog_jited:\t%u\n"
 		   "prog_tag:\t%s\n"
 		   "memlock:\t%llu\n"
-		   "prog_id:\t%u\n",
+		   "prog_id:\t%u\n"
+		   "runtime:\t%llu\n"
+		   "runcnt:\t%llu\n",
 		   prog->type,
 		   prog->jited,
 		   prog_tag,
 		   prog->pages * 1ULL << PAGE_SHIFT,
-		   prog->aux->id);
+		   prog->aux->id,
+		   stats.nsecs,
+		   stats.cnt);
 }
 #endif
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1b9496c41383..dff61920a316 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7330,6 +7330,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		if (bpf_prog_calc_tag(func[i]))
 			goto out_free;
 		func[i]->is_func = 1;
+		/* BPF_PROG_RUN doesn't call subprogs directly,
+		 * hence stats are reused from main prog
+		 */
+		free_percpu(func[i]->aux->stats);
+		func[i]->aux->stats = prog->aux->stats;
 		func[i]->aux->func_idx = i;
 		/* the btf and func_info will be freed only at prog->aux */
 		func[i]->aux->btf = prog->aux->btf;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba4d9e85feb8..86e0771352f2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -224,6 +224,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
 #endif
 static int proc_dopipe_max_size(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+					  void __user *buffer, size_t *lenp,
+					  loff_t *ppos);
 
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses its own private copy */
@@ -1230,6 +1233,15 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &one,
 	},
 #endif
+	{
+		.procname	= "bpf_stats_enabled",
+		.data		= &sysctl_bpf_stats_enabled,
+		.maxlen		= sizeof(sysctl_bpf_stats_enabled),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax_bpf_stats,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
 	{
 		.procname	= "panic_on_rcu_stall",
@@ -3260,6 +3272,28 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 
 #endif /* CONFIG_PROC_SYSCTL */
 
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+					  void __user *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	int ret, bpf_stats = *(int *)table->data;
+	struct ctl_table tmp = *table;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	tmp.data = &bpf_stats;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret) {
+		*(int *)table->data = bpf_stats;
+		if (bpf_stats)
+			static_branch_enable(&bpf_stats_enabled_key);
+		else
+			static_branch_disable(&bpf_stats_enabled_key);
+	}
+	return ret;
+}
+
 /*
  * No sense putting this after each symbol definition, twice,
  * exception granted :-)
-- 
2.20.0



* [PATCH bpf-next 2/4] bpf: expose program stats via bpf_prog_info
  2019-02-22 23:36 [PATCH bpf-next 0/4] bpf: per program stats Alexei Starovoitov
  2019-02-22 23:36 ` [PATCH bpf-next 1/4] bpf: enable " Alexei Starovoitov
@ 2019-02-22 23:36 ` Alexei Starovoitov
  2019-02-22 23:36 ` [PATCH bpf-next 3/4] tools/bpf: sync bpf.h into tools Alexei Starovoitov
  2019-02-22 23:36 ` [PATCH bpf-next 4/4] tools/bpftool: recognize bpf_prog_info runtime and runcnt Alexei Starovoitov
  3 siblings, 0 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-22 23:36 UTC (permalink / raw)
  To: davem; +Cc: daniel, netdev, bpf, kernel-team

Return bpf program runtime and runcnt via bpf_prog_info
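
For reference, a minimal user-space sketch of reading the new fields; it
assumes libbpf's bpf_obj_get_info_by_fd() and uapi headers with this
series applied, and prog_fd is a program fd the caller already holds:

#include <stdio.h>
#include <bpf/bpf.h>	/* bpf_obj_get_info_by_fd() */

static int print_prog_stats(int prog_fd)
{
	struct bpf_prog_info info = {};
	__u32 len = sizeof(info);
	int err;

	err = bpf_obj_get_info_by_fd(prog_fd, &info, &len);
	if (err)
		return err;

	/* runtime is in nanoseconds, runcnt is the number of invocations */
	printf("runtime %llu ns runcnt %llu\n",
	       (unsigned long long)info.runtime,
	       (unsigned long long)info.runcnt);
	return 0;
}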

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 2 ++
 kernel/bpf/syscall.c     | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bcdd2474eee7..d2cb85d85b39 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2813,6 +2813,8 @@ struct bpf_prog_info {
 	__u32 jited_line_info_rec_size;
 	__u32 nr_prog_tags;
 	__aligned_u64 prog_tags;
+	__u64 runtime;
+	__u64 runcnt;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 079e7fa14f39..6c55aeb2e1b7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2142,6 +2142,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
 	struct bpf_prog_info info = {};
 	u32 info_len = attr->info.info_len;
+	struct bpf_prog_stats stats;
 	char __user *uinsns;
 	u32 ulen;
 	int err;
@@ -2181,6 +2182,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (err)
 		return err;
 
+	bpf_prog_get_stats(prog, &stats);
+	info.runtime = stats.nsecs;
+	info.runcnt = stats.cnt;
+
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
-- 
2.20.0



* [PATCH bpf-next 3/4] tools/bpf: sync bpf.h into tools
  2019-02-22 23:36 [PATCH bpf-next 0/4] bpf: per program stats Alexei Starovoitov
  2019-02-22 23:36 ` [PATCH bpf-next 1/4] bpf: enable " Alexei Starovoitov
  2019-02-22 23:36 ` [PATCH bpf-next 2/4] bpf: expose program stats via bpf_prog_info Alexei Starovoitov
@ 2019-02-22 23:36 ` Alexei Starovoitov
  2019-02-22 23:36 ` [PATCH bpf-next 4/4] tools/bpftool: recognize bpf_prog_info runtime and runcnt Alexei Starovoitov
  3 siblings, 0 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-22 23:36 UTC (permalink / raw)
  To: davem; +Cc: daniel, netdev, bpf, kernel-team

sync bpf.h into tools directory

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index bcdd2474eee7..d2cb85d85b39 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2813,6 +2813,8 @@ struct bpf_prog_info {
 	__u32 jited_line_info_rec_size;
 	__u32 nr_prog_tags;
 	__aligned_u64 prog_tags;
+	__u64 runtime;
+	__u64 runcnt;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
2.20.0



* [PATCH bpf-next 4/4] tools/bpftool: recognize bpf_prog_info runtime and runcnt
  2019-02-22 23:36 [PATCH bpf-next 0/4] bpf: per program stats Alexei Starovoitov
                   ` (2 preceding siblings ...)
  2019-02-22 23:36 ` [PATCH bpf-next 3/4] tools/bpf: sync bpf.h into tools Alexei Starovoitov
@ 2019-02-22 23:36 ` Alexei Starovoitov
  2019-02-23  4:17   ` Andrii Nakryiko
  3 siblings, 1 reply; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-22 23:36 UTC (permalink / raw)
  To: davem; +Cc: daniel, netdev, bpf, kernel-team

$ bpftool p s
1: kprobe  tag a56587d488d216c9  gpl runtime 79786 runcnt 8
	loaded_at 2019-02-22T12:22:51-0800  uid 0
	xlated 352B  not jited  memlock 4096B

$ bpftool --json --pretty p s
[{
        "id": 1,
        "type": "kprobe",
        "tag": "a56587d488d216c9",
        "gpl_compatible": true,
        "runtime": 79786,
        "runcnt": 8,
        "loaded_at": 1550866971,
        "uid": 0,
        "bytes_xlated": 352,
        "jited": false,
        "bytes_memlock": 4096
    }
]

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/bpftool/Documentation/bpftool-prog.rst | 4 +++-
 tools/bpf/bpftool/prog.c                         | 7 +++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 12bc1e2d4b46..102b180d3add 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -171,7 +171,7 @@ EXAMPLES
 
 ::
 
-    10: xdp  name some_prog  tag 005a3d2123620c8b  gpl
+    10: xdp  name some_prog  tag 005a3d2123620c8b  gpl runtime 81632 runcnt 10
             loaded_at 2017-09-29T20:11:00+0000  uid 0
             xlated 528B  jited 370B  memlock 4096B  map_ids 10
 
@@ -184,6 +184,8 @@ EXAMPLES
             "type": "xdp",
             "tag": "005a3d2123620c8b",
             "gpl_compatible": true,
+            "runtime": 81632,
+            "runcnt": 10,
             "loaded_at": 1506715860,
             "uid": 0,
             "bytes_xlated": 528,
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index db978c8d76a8..8065ba11b9d5 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -214,6 +214,10 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
 		     info->tag[4], info->tag[5], info->tag[6], info->tag[7]);
 
 	jsonw_bool_field(json_wtr, "gpl_compatible", info->gpl_compatible);
+	if (info->runtime) {
+		jsonw_uint_field(json_wtr, "runtime", info->runtime);
+		jsonw_uint_field(json_wtr, "runcnt", info->runcnt);
+	}
 
 	print_dev_json(info->ifindex, info->netns_dev, info->netns_ino);
 
@@ -277,6 +281,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd)
 	fprint_hex(stdout, info->tag, BPF_TAG_SIZE, "");
 	print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino);
 	printf("%s", info->gpl_compatible ? "  gpl" : "");
+	if (info->runtime)
+		printf(" runtime %lld runcnt %lld",
+		       info->runtime, info->runcnt);
 	printf("\n");
 
 	if (info->load_time) {
-- 
2.20.0



* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-22 23:36 ` [PATCH bpf-next 1/4] bpf: enable " Alexei Starovoitov
@ 2019-02-23  0:05   ` Eric Dumazet
  2019-02-23  2:29     ` Alexei Starovoitov
  2019-02-23  0:34   ` Roman Gushchin
  1 sibling, 1 reply; 16+ messages in thread
From: Eric Dumazet @ 2019-02-23  0:05 UTC (permalink / raw)
  To: Alexei Starovoitov, davem; +Cc: daniel, netdev, bpf, kernel-team



On 02/22/2019 03:36 PM, Alexei Starovoitov wrote:

>  
> +static void bpf_prog_get_stats(const struct bpf_prog *prog,
> +			       struct bpf_prog_stats *stats)
> +{
> +	u64 nsecs = 0, cnt = 0;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		nsecs += per_cpu(prog->aux->stats->nsecs, cpu);
> +		cnt += per_cpu(prog->aux->stats->cnt, cpu);

This does not work reliably for 32 bit kernels.

> +	}
> +	stats->nsecs = nsecs;
> +	stats->cnt = cnt;
> +}
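
The problem being pointed out: on 32-bit kernels a u64 per-cpu counter is
updated as two 32-bit halves, so a concurrent reader can observe a torn
value. A rough sketch of the usual kernel answer, a per-cpu u64_stats_sync
seqcount, purely to illustrate one possible fix (not the code that was
posted):

#include <linux/percpu.h>
#include <linux/u64_stats_sync.h>

struct bpf_prog_stats {
	u64 cnt;
	u64 nsecs;
	struct u64_stats_sync syncp;
};

/* update side, e.g. from BPF_PROG_RUN after the program returns */
static inline void bpf_prog_stats_update(struct bpf_prog_stats __percpu *pstats,
					 u64 delta_ns)
{
	struct bpf_prog_stats *stats = this_cpu_ptr(pstats);

	u64_stats_update_begin(&stats->syncp);
	stats->cnt++;
	stats->nsecs += delta_ns;
	u64_stats_update_end(&stats->syncp);
}

/* read side: retry if an update raced with the fetch */
static void bpf_prog_stats_read(const struct bpf_prog_stats *stats,
				u64 *nsecs, u64 *cnt)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin(&stats->syncp);
		*nsecs = stats->nsecs;
		*cnt = stats->cnt;
	} while (u64_stats_fetch_retry(&stats->syncp, start));
}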



* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-22 23:36 ` [PATCH bpf-next 1/4] bpf: enable " Alexei Starovoitov
  2019-02-23  0:05   ` Eric Dumazet
@ 2019-02-23  0:34   ` Roman Gushchin
  2019-02-23  1:06     ` Daniel Borkmann
  2019-02-23  2:31     ` Alexei Starovoitov
  1 sibling, 2 replies; 16+ messages in thread
From: Roman Gushchin @ 2019-02-23  0:34 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: davem, daniel, netdev, bpf, Kernel Team

On Fri, Feb 22, 2019 at 03:36:41PM -0800, Alexei Starovoitov wrote:
> JITed BPF programs are indistinguishable from kernel functions, but unlike
> kernel code, BPF code can be changed often.
> The typical approach of "perf record" + "perf report" profiling and tuning of
> kernel code works just as well for BPF programs, but kernel code doesn't
> need to be monitored whereas BPF programs do.
> Users load and run a large number of BPF programs.
> These BPF stats allow tools to monitor the usage of BPF on the server.
> The monitoring tools will turn the sysctl kernel.bpf_stats_enabled
> on and off for a few seconds to sample the average cost of the programs.
> Aggregated data over hours and days will provide an insight into the cost
> of BPF, and alarms can trigger in case a given program suddenly gets more
> expensive.
> 
> The cost of two sched_clock() calls per program invocation adds ~20 nsec.
> Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
> from ~40 nsec to ~60 nsec.
> A static_key minimizes the cost of the stats collection.
> There is no measurable difference before/after this patch
> with kernel.bpf_stats_enabled=0.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf.h    |  7 +++++++
>  include/linux/filter.h | 16 +++++++++++++++-
>  kernel/bpf/core.c      | 13 +++++++++++++
>  kernel/bpf/syscall.c   | 24 ++++++++++++++++++++++--
>  kernel/bpf/verifier.c  |  5 +++++
>  kernel/sysctl.c        | 34 ++++++++++++++++++++++++++++++++++
>  6 files changed, 96 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index de18227b3d95..14260674bc57 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -340,6 +340,11 @@ enum bpf_cgroup_storage_type {
>  
>  #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
>  
> +struct bpf_prog_stats {
> +	u64 cnt;
> +	u64 nsecs;
> +};
> +
>  struct bpf_prog_aux {
>  	atomic_t refcnt;
>  	u32 used_map_cnt;
> @@ -389,6 +394,7 @@ struct bpf_prog_aux {
>  	 * main prog always has linfo_idx == 0
>  	 */
>  	u32 linfo_idx;
> +	struct bpf_prog_stats __percpu *stats;
>  	union {
>  		struct work_struct work;
>  		struct rcu_head	rcu;
> @@ -559,6 +565,7 @@ void bpf_map_area_free(void *base);
>  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
>  
>  extern int sysctl_unprivileged_bpf_disabled;
> +extern int sysctl_bpf_stats_enabled;
>  
>  int bpf_map_new_fd(struct bpf_map *map, int flags);
>  int bpf_prog_new_fd(struct bpf_prog *prog);
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index f32b3eca5a04..7b14235b6f7d 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -533,7 +533,21 @@ struct sk_filter {
>  	struct bpf_prog	*prog;
>  };
>  
> -#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
> +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> +
> +#define BPF_PROG_RUN(prog, ctx)	({				\
> +	u32 ret;						\
> +	cant_sleep();						\
> +	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
> +		u64 start = sched_clock();			\
> +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> +		this_cpu_inc(prog->aux->stats->cnt);		\
> +		this_cpu_add(prog->aux->stats->nsecs,		\
> +			     sched_clock() - start);		\
> +	} else {						\
> +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> +	}							\
> +	ret; })

Can we bump "cnt" unconditionally and gate only the "nsecs" calculation?

Why might it be useful?
If the performance cost is too high to keep the statistics gathering
always enabled, it will be possible to measure the average cost
periodically and extrapolate.

Thanks!
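
A minimal sketch of the variant suggested above, reusing the names from
the posted macro: the count is bumped unconditionally and only the
sched_clock() timing stays behind the static key. (Alexei's reply further
down explains why this was not done: even the unconditional this_cpu_inc
and the prog->aux->stats loads are considered too costly for XDP.)

#define BPF_PROG_RUN(prog, ctx)	({				\
	u32 ret;						\
	cant_sleep();						\
	this_cpu_inc(prog->aux->stats->cnt);			\
	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
		u64 start = sched_clock();			\
		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
		this_cpu_add(prog->aux->stats->nsecs,		\
			     sched_clock() - start);		\
	} else {						\
		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
	}							\
	ret; })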


* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-23  0:34   ` Roman Gushchin
@ 2019-02-23  1:06     ` Daniel Borkmann
  2019-02-23  2:38       ` Alexei Starovoitov
  2019-02-23  2:31     ` Alexei Starovoitov
  1 sibling, 1 reply; 16+ messages in thread
From: Daniel Borkmann @ 2019-02-23  1:06 UTC (permalink / raw)
  To: Roman Gushchin, Alexei Starovoitov; +Cc: davem, netdev, bpf, Kernel Team

On 02/23/2019 01:34 AM, Roman Gushchin wrote:
> On Fri, Feb 22, 2019 at 03:36:41PM -0800, Alexei Starovoitov wrote:
>> JITed BPF programs are indistinguishable from kernel functions, but unlike
>> kernel code, BPF code can be changed often.
>> The typical approach of "perf record" + "perf report" profiling and tuning of
>> kernel code works just as well for BPF programs, but kernel code doesn't
>> need to be monitored whereas BPF programs do.
>> Users load and run a large number of BPF programs.
>> These BPF stats allow tools to monitor the usage of BPF on the server.
>> The monitoring tools will turn the sysctl kernel.bpf_stats_enabled
>> on and off for a few seconds to sample the average cost of the programs.
>> Aggregated data over hours and days will provide an insight into the cost
>> of BPF, and alarms can trigger in case a given program suddenly gets more
>> expensive.
>>
>> The cost of two sched_clock() calls per program invocation adds ~20 nsec.
>> Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
>> from ~40 nsec to ~60 nsec.
>> A static_key minimizes the cost of the stats collection.
>> There is no measurable difference before/after this patch
>> with kernel.bpf_stats_enabled=0.
>>
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> ---
>>  include/linux/bpf.h    |  7 +++++++
>>  include/linux/filter.h | 16 +++++++++++++++-
>>  kernel/bpf/core.c      | 13 +++++++++++++
>>  kernel/bpf/syscall.c   | 24 ++++++++++++++++++++++--
>>  kernel/bpf/verifier.c  |  5 +++++
>>  kernel/sysctl.c        | 34 ++++++++++++++++++++++++++++++++++
>>  6 files changed, 96 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index de18227b3d95..14260674bc57 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -340,6 +340,11 @@ enum bpf_cgroup_storage_type {
>>  
>>  #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
>>  
>> +struct bpf_prog_stats {
>> +	u64 cnt;
>> +	u64 nsecs;
>> +};
>> +
>>  struct bpf_prog_aux {
>>  	atomic_t refcnt;
>>  	u32 used_map_cnt;
>> @@ -389,6 +394,7 @@ struct bpf_prog_aux {
>>  	 * main prog always has linfo_idx == 0
>>  	 */
>>  	u32 linfo_idx;
>> +	struct bpf_prog_stats __percpu *stats;
>>  	union {
>>  		struct work_struct work;
>>  		struct rcu_head	rcu;
>> @@ -559,6 +565,7 @@ void bpf_map_area_free(void *base);
>>  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
>>  
>>  extern int sysctl_unprivileged_bpf_disabled;
>> +extern int sysctl_bpf_stats_enabled;
>>  
>>  int bpf_map_new_fd(struct bpf_map *map, int flags);
>>  int bpf_prog_new_fd(struct bpf_prog *prog);
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index f32b3eca5a04..7b14235b6f7d 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -533,7 +533,21 @@ struct sk_filter {
>>  	struct bpf_prog	*prog;
>>  };
>>  
>> -#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
>> +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>> +
>> +#define BPF_PROG_RUN(prog, ctx)	({				\
>> +	u32 ret;						\
>> +	cant_sleep();						\
>> +	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
>> +		u64 start = sched_clock();			\
>> +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
>> +		this_cpu_inc(prog->aux->stats->cnt);		\
>> +		this_cpu_add(prog->aux->stats->nsecs,		\
>> +			     sched_clock() - start);		\
>> +	} else {						\
>> +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
>> +	}							\
>> +	ret; })
> 
> Can we bump "cnt" unconditionally and gate only the "nsecs" calculation?
> 
> Why might it be useful?
> If the performance cost is too high to keep the statistics gathering
> always enabled, it will be possible to measure the average cost
> periodically and extrapolate.

In general, having some stats and timing info would be useful, but I
guess people might want to customize it in future even more specifically
beyond number of runs + time it takes. One thing that would be super
useful is to have some notion of __attribute__((constructor)) and
__attribute__((destructor)) support in BPF which gets then inlined into
prologue / epilogue of program. E.g. such infrastructure would allow to
mark an skb and measure time it takes through the BPF prog till it hits
an exit point somewhere (without having to explicitly code this into the
program everywhere). Other examples may be histograms or p99 latencies
that might probably be useful. Feels like for monitoring more ways to
program would be nice and to move it into the BPF insns sequence (e.g.
enforced globally or by choice of prog as another option)? Thoughts?


* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-23  0:05   ` Eric Dumazet
@ 2019-02-23  2:29     ` Alexei Starovoitov
  0 siblings, 0 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-23  2:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Alexei Starovoitov, davem, daniel, netdev, bpf, kernel-team

On Fri, Feb 22, 2019 at 04:05:55PM -0800, Eric Dumazet wrote:
> 
> 
> On 02/22/2019 03:36 PM, Alexei Starovoitov wrote:
> 
> >  
> > +static void bpf_prog_get_stats(const struct bpf_prog *prog,
> > +			       struct bpf_prog_stats *stats)
> > +{
> > +	u64 nsecs = 0, cnt = 0;
> > +	int cpu;
> > +
> > +	for_each_possible_cpu(cpu) {
> > +		nsecs += per_cpu(prog->aux->stats->nsecs, cpu);
> > +		cnt += per_cpu(prog->aux->stats->cnt, cpu);
> 
> This does not work reliably for 32 bit kernels.

Right, my mind was in the probe_read thread. Will fix.



* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-23  0:34   ` Roman Gushchin
  2019-02-23  1:06     ` Daniel Borkmann
@ 2019-02-23  2:31     ` Alexei Starovoitov
  1 sibling, 0 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-23  2:31 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Alexei Starovoitov, davem, daniel, netdev, bpf, Kernel Team

On Sat, Feb 23, 2019 at 12:34:38AM +0000, Roman Gushchin wrote:
> On Fri, Feb 22, 2019 at 03:36:41PM -0800, Alexei Starovoitov wrote:
> > JITed BPF programs are indistinguishable from kernel functions, but unlike
> > kernel code, BPF code can be changed often.
> > The typical approach of "perf record" + "perf report" profiling and tuning of
> > kernel code works just as well for BPF programs, but kernel code doesn't
> > need to be monitored whereas BPF programs do.
> > Users load and run a large number of BPF programs.
> > These BPF stats allow tools to monitor the usage of BPF on the server.
> > The monitoring tools will turn the sysctl kernel.bpf_stats_enabled
> > on and off for a few seconds to sample the average cost of the programs.
> > Aggregated data over hours and days will provide an insight into the cost
> > of BPF, and alarms can trigger in case a given program suddenly gets more
> > expensive.
> > 
> > The cost of two sched_clock() calls per program invocation adds ~20 nsec.
> > Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
> > from ~40 nsec to ~60 nsec.
> > A static_key minimizes the cost of the stats collection.
> > There is no measurable difference before/after this patch
> > with kernel.bpf_stats_enabled=0.
> > 
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/bpf.h    |  7 +++++++
> >  include/linux/filter.h | 16 +++++++++++++++-
> >  kernel/bpf/core.c      | 13 +++++++++++++
> >  kernel/bpf/syscall.c   | 24 ++++++++++++++++++++++--
> >  kernel/bpf/verifier.c  |  5 +++++
> >  kernel/sysctl.c        | 34 ++++++++++++++++++++++++++++++++++
> >  6 files changed, 96 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index de18227b3d95..14260674bc57 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -340,6 +340,11 @@ enum bpf_cgroup_storage_type {
> >  
> >  #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
> >  
> > +struct bpf_prog_stats {
> > +	u64 cnt;
> > +	u64 nsecs;
> > +};
> > +
> >  struct bpf_prog_aux {
> >  	atomic_t refcnt;
> >  	u32 used_map_cnt;
> > @@ -389,6 +394,7 @@ struct bpf_prog_aux {
> >  	 * main prog always has linfo_idx == 0
> >  	 */
> >  	u32 linfo_idx;
> > +	struct bpf_prog_stats __percpu *stats;
> >  	union {
> >  		struct work_struct work;
> >  		struct rcu_head	rcu;
> > @@ -559,6 +565,7 @@ void bpf_map_area_free(void *base);
> >  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
> >  
> >  extern int sysctl_unprivileged_bpf_disabled;
> > +extern int sysctl_bpf_stats_enabled;
> >  
> >  int bpf_map_new_fd(struct bpf_map *map, int flags);
> >  int bpf_prog_new_fd(struct bpf_prog *prog);
> > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > index f32b3eca5a04..7b14235b6f7d 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -533,7 +533,21 @@ struct sk_filter {
> >  	struct bpf_prog	*prog;
> >  };
> >  
> > -#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
> > +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> > +
> > +#define BPF_PROG_RUN(prog, ctx)	({				\
> > +	u32 ret;						\
> > +	cant_sleep();						\
> > +	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
> > +		u64 start = sched_clock();			\
> > +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> > +		this_cpu_inc(prog->aux->stats->cnt);		\
> > +		this_cpu_add(prog->aux->stats->nsecs,		\
> > +			     sched_clock() - start);		\
> > +	} else {						\
> > +		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
> > +	}							\
> > +	ret; })
> 
> Can we bump "cnt" unconditionally and gate only the "nsecs" calculation?

XDP cannot afford to pay this penalty unconditionally.
It's not only the this_cpu_inc; it's also the prog->aux->stats loads.



* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-23  1:06     ` Daniel Borkmann
@ 2019-02-23  2:38       ` Alexei Starovoitov
  2019-02-25 10:36         ` Daniel Borkmann
  0 siblings, 1 reply; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-23  2:38 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Roman Gushchin, Alexei Starovoitov, davem, netdev, bpf, Kernel Team

On Sat, Feb 23, 2019 at 02:06:56AM +0100, Daniel Borkmann wrote:
> 
> In general, having some stats and timing info would be useful, but I
> guess people might want to customize it in future even more specifically
> beyond number of runs + time it takes. One thing that would be super
> useful is to have some notion of __attribute__((constructor)) and
> __attribute__((destructor)) support in BPF which gets then inlined into
> prologue / epilogue of program. E.g. such infrastructure would allow to
> mark an skb and measure time it takes through the BPF prog till it hits
> an exit point somewhere (without having to explicitly code this into the
> program everywhere). Other examples may be histograms or p99 latencies
> that might probably be useful. Feels like for monitoring more ways to
> program would be nice and to move it into the BPF insns sequence (e.g.
> enforced globally or by choice of prog as another option)? Thoughts?

the constructor/destructor you mean to capture the full sequence of tail_calls?
Or full path of skb through the stack with all hook points?
That is likely very useful without any bpf, but I think hw timestamping
already serves that purpose.

I've been thinking about doing these stats per program
(instead of static_key for all).
Like replacing a bpf_prog->bpf_func with a wrapper function
that does stats, but that is more costly in retpoline world
due to extra indirect call.
Another alternative is to patch the stats in via JITs, but
it's much more complex and error prone.
So went with the simplest approach.
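
For illustration, the wrapper alternative described above could look
roughly like the sketch below. It is hypothetical: 'orig_bpf_func' is an
assumed extra field in bpf_prog_aux that would keep the real entry point
while prog->bpf_func points at the wrapper, and the call through it is
the extra indirect (retpoline) call being referred to:

static unsigned int bpf_prog_run_stats_wrapper(const void *ctx,
					       const struct bpf_insn *insn)
{
	/* recover the prog: insn is always prog->insnsi at the call site */
	struct bpf_prog *prog = (struct bpf_prog *)
		((char *)insn - offsetof(struct bpf_prog, insnsi));
	u64 start = sched_clock();
	unsigned int ret;

	ret = prog->aux->orig_bpf_func(ctx, insn);	/* extra indirect call */
	this_cpu_inc(prog->aux->stats->cnt);
	this_cpu_add(prog->aux->stats->nsecs, sched_clock() - start);
	return ret;
}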

With all stats ideas we need to be careful not to reintroduce what
perf is already great at.
These stats are _not_ for performance analysis. That's what perf does.
These stats are for 24/7 monitoring to catch things that are not supposed to happen.



* Re: [PATCH bpf-next 4/4] tools/bpftool: recognize bpf_prog_info runtime and runcnt
  2019-02-22 23:36 ` [PATCH bpf-next 4/4] tools/bpftool: recognize bpf_prog_info runtime and runcnt Alexei Starovoitov
@ 2019-02-23  4:17   ` Andrii Nakryiko
  0 siblings, 0 replies; 16+ messages in thread
From: Andrii Nakryiko @ 2019-02-23  4:17 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: davem, Daniel Borkmann, netdev, bpf, Kernel Team

On Fri, Feb 22, 2019 at 3:37 PM Alexei Starovoitov <ast@kernel.org> wrote:
>
> $ bpftool p s
> 1: kprobe  tag a56587d488d216c9  gpl runtime 79786 runcnt 8
>         loaded_at 2019-02-22T12:22:51-0800  uid 0
>         xlated 352B  not jited  memlock 4096B
>
> $ bpftool --json --pretty p s
> [{
>         "id": 1,
>         "type": "kprobe",
>         "tag": "a56587d488d216c9",
>         "gpl_compatible": true,
>         "runtime": 79786,
>         "runcnt": 8,
>         "loaded_at": 1550866971,
>         "uid": 0,
>         "bytes_xlated": 352,
>         "jited": false,
>         "bytes_memlock": 4096
>     }
> ]
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  tools/bpf/bpftool/Documentation/bpftool-prog.rst | 4 +++-
>  tools/bpf/bpftool/prog.c                         | 7 +++++++
>  2 files changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> index 12bc1e2d4b46..102b180d3add 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> @@ -171,7 +171,7 @@ EXAMPLES
>
>  ::
>
> -    10: xdp  name some_prog  tag 005a3d2123620c8b  gpl
> +    10: xdp  name some_prog  tag 005a3d2123620c8b  gpl runtime 81632 runcnt 10
>              loaded_at 2017-09-29T20:11:00+0000  uid 0
>              xlated 528B  jited 370B  memlock 4096B  map_ids 10
>
> @@ -184,6 +184,8 @@ EXAMPLES
>              "type": "xdp",
>              "tag": "005a3d2123620c8b",
>              "gpl_compatible": true,
> +            "runtime": 81632,
> +            "runcnt": 10,

Should these fields be called "run_time" and "run_cnt" for consistency
with other fields? I'd even suggest "run_time_ns" so that there is no
question about time units.

>              "loaded_at": 1506715860,
>              "uid": 0,
>              "bytes_xlated": 528,
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index db978c8d76a8..8065ba11b9d5 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -214,6 +214,10 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
>                      info->tag[4], info->tag[5], info->tag[6], info->tag[7]);
>
>         jsonw_bool_field(json_wtr, "gpl_compatible", info->gpl_compatible);
> +       if (info->runtime) {
> +               jsonw_uint_field(json_wtr, "runtime", info->runtime);
> +               jsonw_uint_field(json_wtr, "runcnt", info->runcnt);
> +       }
>
>         print_dev_json(info->ifindex, info->netns_dev, info->netns_ino);
>
> @@ -277,6 +281,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd)
>         fprint_hex(stdout, info->tag, BPF_TAG_SIZE, "");
>         print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino);
>         printf("%s", info->gpl_compatible ? "  gpl" : "");
> +       if (info->runtime)
> +               printf(" runtime %lld runcnt %lld",
> +                      info->runtime, info->runcnt);
>         printf("\n");
>
>         if (info->load_time) {
> --
> 2.20.0
>


* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-23  2:38       ` Alexei Starovoitov
@ 2019-02-25 10:36         ` Daniel Borkmann
  2019-02-26  4:27           ` Alexei Starovoitov
  0 siblings, 1 reply; 16+ messages in thread
From: Daniel Borkmann @ 2019-02-25 10:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Roman Gushchin, Alexei Starovoitov, davem, netdev, bpf, Kernel Team

On 02/23/2019 03:38 AM, Alexei Starovoitov wrote:
> On Sat, Feb 23, 2019 at 02:06:56AM +0100, Daniel Borkmann wrote:
>>
>> In general, having some stats and timing info would be useful, but I
>> guess people might want to customize it in future even more specifically
>> beyond number of runs + time it takes. One thing that would be super
>> useful is to have some notion of __attribute__((constructor)) and
>> __attribute__((destructor)) support in BPF which gets then inlined into
>> prologue / epilogue of program. E.g. such infrastructure would allow to
>> mark an skb and measure time it takes through the BPF prog till it hits
>> an exit point somewhere (without having to explicitly code this into the
>> program everywhere). Other examples may be histograms or p99 latencies
>> that might probably be useful. Feels like for monitoring more ways to
>> program would be nice and to move it into the BPF insns sequence (e.g.
>> enforced globally or by choice of prog as another option)? Thoughts?
> 
> the constructor/destructor you mean to capture the full sequence of tail_calls?
> Or full path of skb through the stack with all hook points?
> That is likely very useful without any bpf, but I think hw timestamping
> already serves that purpose.

Not through the stack, but was more thinking something like low-overhead
kprobes-style extension for a BPF prog where such sequence would be added
'inline' at beginning / exit of BPF prog invocation with normal ctx access
and helpers as the native program (e.g. time-stamping plus read/write into
mark as one example which kprobes couldn't do). But might go much beyond
context of this stats collection.

> I've been thinking about doing these stats per program
> (instead of static_key for all).
> Like replacing a bpf_prog->bpf_func with a wrapper function
> that does stats, but that is more costly in retpoline world
> due to extra indirect call.

That's actually an interesting thought, given the original prog->bpf_func
is a known address at that time, this could be templated where an inner
dummy bpf_func call to the normal BPF prog gets later replaced with the
actual prog->bpf_func address to have no indirect call. This would also
allow to keep the actual prog->bpf_func entry call-sites small and simple.

> Another alternative is to patch the stats in via JITs, but
> it's much more complex and error prone.
> So went with the simplest approach.

Pure BPF insns rewrite via JIT would indeed be unclear wrt per-cpu data
structures.

> With all stats ideas we need to be careful not to reintroduce what
> perf is already great at.

Yeah agree.

> These stats are _not_ for performance analysis. That's what perf does.
> These stats are for 24/7 monitoring to catch things that are not supposed to happen.
Thanks,
Daniel


* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-25 10:36         ` Daniel Borkmann
@ 2019-02-26  4:27           ` Alexei Starovoitov
  2019-02-26 15:29             ` Daniel Borkmann
  0 siblings, 1 reply; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-26  4:27 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov
  Cc: Roman Gushchin, Alexei Starovoitov, davem, netdev, bpf, Kernel Team

On 2/25/19 2:36 AM, Daniel Borkmann wrote:
> 
> Not through the stack, but was more thinking something like low-overhead
> kprobes-style extension for a BPF prog where such sequence would be added
> 'inline' at beginning / exit of BPF prog invocation with normal ctx access
> and helpers as the native program (e.g. time-stamping plus read/write into
> mark as one example which kprobes couldn't do). But might go much beyond
> context of this stats collection.

see below.

> That's actually an interesting thought, given the original prog->bpf_func
> is a known address at that time, this could be templated where an inner
> dummy bpf_func call to the normal BPF prog gets later replaced with the
> actual prog->bpf_func address to have no indirect call. This would also
> allow to keep the actual prog->bpf_func entry call-sites small and simple.

My first thought was that we indeed can have a template of stats
collecting wrapper in .text,
then at 'switch stats on' event vmalloc exec region, copy wrapper code
into it and manually patch 'call foo' x86 insn with direct jump.
That is still quite involved from coding side.
It's similar to what kprobe/ftrace is doing, but probably
an overkill for stats due to potential security
concerns with new executable code.
But it won't work due to tail_calls.
I've looked into single wrapper approach with indirect call
(because it's much easer than patching x86) and realized it
won't work for the same reason.
Inline kprobe+kretprobe insertion won't work either.
All because we have tail_calls.
We cannot have an executable epilogue in progs.
tail_call will jmp into next prog and second part of stats collection
won't run. Essentially no epilogue allowed unless we disable tail_call.
Or we somehow reimplement tail_calls so that prog returns,
an epilogue is executed and then jumps into the next prog.
tail_call turned out to be the worst corner case feature to deal with.

With static_key approach the main prog accounts the time
of all progs called via tail_call.
We can argue whether it's ideal semantics, but looks like
it's the only thing we can do.
I don't see a way how tail_called progs can count their own time.
Stats would somehow need to be computed right before jumping
into next prog. Which means heavy changes in all JITs
plus extra performance cost at runtime, since such if (stats_on)
check in tail_call handling would have to be done at runtime,
since JITed code is read-only and regenerating progs due to stats
is non-starter.
Or tail-called next prog would need to update stats for previous
prog, but there is no pointer to 'aux->stats' there.
So imo that is dead end.
static_key approach is the simplest by far.


* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-26  4:27           ` Alexei Starovoitov
@ 2019-02-26 15:29             ` Daniel Borkmann
  2019-02-27  4:05               ` Alexei Starovoitov
  0 siblings, 1 reply; 16+ messages in thread
From: Daniel Borkmann @ 2019-02-26 15:29 UTC (permalink / raw)
  To: Alexei Starovoitov, Alexei Starovoitov
  Cc: Roman Gushchin, Alexei Starovoitov, davem, netdev, bpf, Kernel Team

On 02/26/2019 05:27 AM, Alexei Starovoitov wrote:
> On 2/25/19 2:36 AM, Daniel Borkmann wrote:
>>
>> Not through the stack, but was more thinking something like low-overhead
>> kprobes-style extension for a BPF prog where such sequence would be added
>> 'inline' at beginning / exit of BPF prog invocation with normal ctx access
>> and helpers as the native program (e.g. time-stamping plus read/write into
>> mark as one example which kprobes couldn't do). But might go much beyond
>> context of this stats collection.
> 
> see below.
> 
>> That's actually an interesting thought, given the original prog->bpf_func
>> is a known address at that time, this could be templated where an inner
>> dummy bpf_func call to the normal BPF prog gets later replaced with the
>> actual prog->bpf_func address to have no indirect call. This would also
>> allow to keep the actual prog->bpf_func entry call-sites small and simple.
> 
> My first thought was that we indeed can have a template of stats
> collecting wrapper in .text,
> then at 'switch stats on' event vmalloc exec region, copy wrapper code
> into it and manually patch 'call foo' x86 insn with direct jump.
> That is still quite involved from coding side.
> It's similar to what kprobe/ftrace is doing, but probably
> an overkill for stats due to potential security
> concerns with new executable code.
> But it won't work due to tail_calls.

You mean for the scenario to measure the _individual_ progs that
are part of the given tail call processing such that each have
their individual inc in count and time-spent attribution. If so,
yeah, agree. Or for the case you have right now where everything
gets attributed to the "entry" program that triggers the tail
call processing? Latter would be similar as we have now in this
patch, imho.

> I've looked into single wrapper approach with indirect call
> (because it's much easier than patching x86) and realized it
> won't work for the same reason.
> Inline kprobe+kretprobe insertion won't work either.
> All because we have tail_calls.
> We cannot have an executable epilogue in progs.
> tail_call will jmp into next prog and second part of stats collection
> won't run. Essentially no epilogue allowed unless we disable tail_call.
> Or we somehow reimplement tail_calls so that prog returns,
> an epilogue is executed and then jumps into the next prog.
> tail_call turned out to be the worst corner case feature to deal with.

Agree, it's a pain as in most cases. Was more thinking that
'special' epilogue would inc the count, start first ktime/cycle
snapshot and then do a direct function call to the normally
JITed BPF prog (where it can do tail calls, etc, so nothing
changes there), and upon return from this call take the second
measurement, and update the per CPU buffer, similar as we have
now just with static keys approach. So, was thinking similar as
we JIT BPF-to-BPF calls on an individual prog basis.

> With static_key approach the main prog accounts the time
> of all progs called via tail_call.
> We can argue whether it's ideal semantics, but looks like
> it's the only thing we can do.

Yeah true, it's more about measuring how long the prog 'chain'
in the whole hook is taking to complete, which may be interesting
in itself, just perhaps a bit confusing when looking at individual
progs' share.

> I don't see a way how tail_called progs can count their own time.

Me neither, if so I guess it would be of similar fashion as you
have right now where it's accounted to the first prog.

> Stats would somehow need to be computed right before jumping
> into next prog. Which means heavy changes in all JITs
> plus extra performance cost at runtime, since such if (stats_on)
> check in tail_call handling would have to be done at runtime,
> since JITed code is read-only and regenerating progs due to stats
> is non-starter.
> Or tail-called next prog would need to update stats for previous
> prog, but there is no pointer to 'aux->stats' there.
> So imo that is dead end.
> static_key approach is the simplest by far.

It's simplest, yep. I guess if we get to the point of removing
indirect call for BPF_PROG_RUN itself, we might need to revisit
then and move it into bpf_func at that point. Kind of implementation
detail given we only expose count and time spent data to uapi,
though the direct switch at runtime may be a bit problematic in
future. Anyway, lets see how it goes when we get there.

Thanks,
Daniel


* Re: [PATCH bpf-next 1/4] bpf: enable program stats
  2019-02-26 15:29             ` Daniel Borkmann
@ 2019-02-27  4:05               ` Alexei Starovoitov
  0 siblings, 0 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-02-27  4:05 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, Roman Gushchin, Alexei Starovoitov, davem,
	netdev, bpf, Kernel Team

On Tue, Feb 26, 2019 at 04:29:45PM +0100, Daniel Borkmann wrote:
> On 02/26/2019 05:27 AM, Alexei Starovoitov wrote:
> > On 2/25/19 2:36 AM, Daniel Borkmann wrote:
> >>
> >> Not through the stack, but was more thinking something like low-overhead
> >> kprobes-style extension for a BPF prog where such sequence would be added
> >> 'inline' at beginning / exit of BPF prog invocation with normal ctx access
> >> and helpers as the native program (e.g. time-stamping plus read/write into
> >> mark as one example which kprobes couldn't do). But might go much beyond
> >> context of this stats collection.
> > 
> > see below.
> > 
> >> That's actually an interesting thought, given the original prog->bpf_func
> >> is a known address at that time, this could be templated where an inner
> >> dummy bpf_func call to the normal BPF prog gets later replaced with the
> >> actual prog->bpf_func address to have no indirect call. This would also
> >> allow to keep the actual prog->bpf_func entry call-sites small and simple.
> > 
> > My first thought was that we indeed can have a template of stats
> > collecting wrapper in .text,
> > then at 'switch stats on' event vmalloc exec region, copy wrapper code
> > into it and manually patch 'call foo' x86 insn with direct jump.
> > That is still quite involved from coding side.
> > It's similar to what kprobe/ftrace is doing, but probably
> > an overkill for stats due to potential security
> > concerns with new executable code.
> > But it won't work due to tail_calls.
> 
> You mean for the scenario to measure the _individual_ progs that
> are part of the given tail call processing such that each have
> their individual inc in count and time-spent attribution. If so,
> yeah, agree. Or for the case you have right now where everything
> gets attributed to the "entry" program that triggers the tail
> call processing? Latter would be similar as we have now in this
> patch, imho.

There are two other issues with 'epilogue' approach.
1.
We call progs as (*(prog)->bpf_func)(ctx, (prog)->insnsi)
and from R2 we can get back to prog and from there go to prog->aux->stats,
but there is no room in the stack to save this R2.
So we'd need extra 16-bytes for start_ns and saved R2
that epilogue can use to update stats in the 'entry' prog.
With static_key approach gcc was smart enough to use callee-saved
registers and no extra stack was used in the caller
(at least for one callsite of BPF_PROG_RUN where I've looked at asm)

2.
when stats are flipped on and off we need to make sure that all bpf progs
in prog_array are with stats on or off. Cannot mix and match.
Otherwise this 16-byte stack mismatch will cause issues.
Which is impossible to do atomically for the whole prog_array.

> > With static_key approach the main prog accounts the time
> > of all progs called via tail_call.
> > We can argue whether it's ideal semantics, but looks like
> > it's the only thing we can do.
> 
> Yeah true, it's more about measuring how long the prog 'chain'
> in the whole hook is taking to complete, which may be interesting
> in itself, just perhaps a bit confusing when looking at individual
> progs' share.

yep. would be good to come up with a way to measure individual progs,
but I couldn't figure it out yet.

> It's simplest, yep. I guess if we get to the point of removing
> indirect call for BPF_PROG_RUN itself, we might need to revisit
> then and move it into bpf_func at that point. Kind of implementation
> detail given we only expose count and time spent data to uapi,
> though the direct switch at runtime may be a bit problematic in
> future. Anyway, lets see how it goes when we get there.

yep


