* [PATCH v2 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
2015-01-28 4:06 [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
@ 2015-01-28 4:06 ` Alexei Starovoitov
2015-01-29 0:46 ` Namhyung Kim
2015-01-28 4:06 ` [PATCH v2 linux-trace 2/8] tracing: allow eBPF programs to call ktime_get_ns() Alexei Starovoitov
` (6 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
User interface:
fd = open("/sys/kernel/debug/tracing/__event__/filter")
write(fd, "bpf_123")
where 123 is process local FD associated with eBPF program previously loaded.
__event__ is static tracepoint event or syscall.
(kprobe support is in next patch)
Once program is successfully attached to tracepoint event, the tracepoint
will be auto-enabled
close(fd)
auto-disables tracepoint event and detaches eBPF program from it
eBPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- memcmp
- fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
so that eBPF program can walk any kernel data structures
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
include/linux/ftrace_event.h | 4 ++
include/trace/bpf_trace.h | 25 +++++++
include/trace/ftrace.h | 29 ++++++++
include/uapi/linux/bpf.h | 7 ++
kernel/trace/Kconfig | 1 +
kernel/trace/Makefile | 1 +
kernel/trace/bpf_trace.c | 129 ++++++++++++++++++++++++++++++++++++
kernel/trace/trace.h | 3 +
kernel/trace/trace_events.c | 33 ++++++++-
kernel/trace/trace_events_filter.c | 79 +++++++++++++++++++++-
kernel/trace/trace_syscalls.c | 31 +++++++++
11 files changed, 340 insertions(+), 2 deletions(-)
create mode 100644 include/trace/bpf_trace.h
create mode 100644 kernel/trace/bpf_trace.c
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..79de230b7df3 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -248,6 +248,7 @@ enum {
TRACE_EVENT_FL_WAS_ENABLED_BIT,
TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
TRACE_EVENT_FL_TRACEPOINT_BIT,
+ TRACE_EVENT_FL_BPF_BIT,
};
/*
@@ -270,6 +271,7 @@ enum {
TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+ TRACE_EVENT_FL_BPF = (1 << TRACE_EVENT_FL_BPF_BIT),
};
struct ftrace_event_call {
@@ -544,6 +546,8 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
event_triggers_post_call(file, tt);
}
+unsigned int trace_filter_call_bpf(struct event_filter *filter, void *ctx);
+
enum {
FILTER_OTHER = 0,
FILTER_STATIC_STRING,
diff --git a/include/trace/bpf_trace.h b/include/trace/bpf_trace.h
new file mode 100644
index 000000000000..4e64f61f484d
--- /dev/null
+++ b/include/trace/bpf_trace.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_KERNEL_BPF_TRACE_H
+#define _LINUX_KERNEL_BPF_TRACE_H
+
+/* For tracepoint filters argN fields match one to one to arguments
+ * passed to tracepoint events
+ *
+ * For syscall entry filters argN fields match syscall arguments
+ * For syscall exit filters arg1 is a return value
+ */
+struct bpf_context {
+ u64 arg1;
+ u64 arg2;
+ u64 arg3;
+ u64 arg4;
+ u64 arg5;
+ u64 arg6;
+};
+
+#endif /* _LINUX_KERNEL_BPF_TRACE_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 139b5067345b..07b68332f149 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -17,6 +17,7 @@
*/
#include <linux/ftrace_event.h>
+#include <trace/bpf_trace.h>
/*
* DECLARE_EVENT_CLASS can be used to add a generic function
@@ -617,6 +618,24 @@ static inline notrace int ftrace_get_offsets_##call( \
#undef __perf_task
#define __perf_task(t) (t)
+/* zero extend integer, pointer or aggregate type to u64 without warnings */
+#define __CAST_TO_U64(expr) ({ \
+ u64 ret = 0; \
+ switch (sizeof(expr)) { \
+ case 8: ret = *(u64 *) &expr; break; \
+ case 4: ret = *(u32 *) &expr; break; \
+ case 2: ret = *(u16 *) &expr; break; \
+ case 1: ret = *(u8 *) &expr; break; \
+ } \
+ ret; })
+
+#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
+#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
+#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
+#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
+#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
+#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
+
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
\
@@ -632,6 +651,16 @@ ftrace_raw_event_##call(void *__data, proto) \
if (ftrace_trigger_soft_disabled(ftrace_file)) \
return; \
\
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF) { \
+ __maybe_unused const u64 z = 0; \
+ struct bpf_context __ctx = ((struct bpf_context) { \
+ __BPF_CAST6(args, z, z, z, z, z) \
+ }); \
+ \
+ if (!trace_filter_call_bpf(ftrace_file->filter, &__ctx))\
+ return; \
+ } \
+ \
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
\
entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file, \
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..3bf42875287c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET_FILTER,
+ BPF_PROG_TYPE_TRACING_FILTER,
};
/* flags for BPF_MAP_UPDATE_ELEM command */
@@ -162,6 +163,12 @@ enum bpf_func_id {
BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+ BPF_FUNC_fetch_ptr, /* void *bpf_fetch_ptr(void *unsafe_ptr) */
+ BPF_FUNC_fetch_u64, /* u64 bpf_fetch_u64(void *unsafe_ptr) */
+ BPF_FUNC_fetch_u32, /* u32 bpf_fetch_u32(void *unsafe_ptr) */
+ BPF_FUNC_fetch_u16, /* u16 bpf_fetch_u16(void *unsafe_ptr) */
+ BPF_FUNC_fetch_u8, /* u8 bpf_fetch_u8(void *unsafe_ptr) */
+ BPF_FUNC_memcmp, /* int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size) */
__BPF_FUNC_MAX_ID,
};
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5da09c899dd..eb60b234b824 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -75,6 +75,7 @@ config FTRACE_NMI_ENTER
config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
+ select BPF_SYSCALL
bool
config CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..ef821d90f3f5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_EVENT_TRACING) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..4aabbe2626c5
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,129 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
+#include "trace.h"
+
+static u64 bpf_fetch_ptr(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *unsafe_ptr = (void *) (long) r1;
+ void *ptr = NULL;
+
+ probe_kernel_read(&ptr, unsafe_ptr, sizeof(ptr));
+ return (u64) (unsigned long) ptr;
+}
+
+#define FETCH(SIZE) \
+static u64 bpf_fetch_##SIZE(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) \
+{ \
+ void *unsafe_ptr = (void *) (long) r1; \
+ SIZE val = 0; \
+ \
+ probe_kernel_read(&val, unsafe_ptr, sizeof(val)); \
+ return (u64) (SIZE) val; \
+}
+FETCH(u64)
+FETCH(u32)
+FETCH(u16)
+FETCH(u8)
+#undef FETCH
+
+static u64 bpf_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *unsafe_ptr = (void *) (long) r1;
+ void *safe_ptr = (void *) (long) r2;
+ u32 size = (u32) r3;
+ char buf[64];
+ int err;
+
+ if (size < 64) {
+ err = probe_kernel_read(buf, unsafe_ptr, size);
+ if (err)
+ return err;
+ return memcmp(buf, safe_ptr, size);
+ }
+ return -1;
+}
+
+static struct bpf_func_proto tracing_filter_funcs[] = {
+#define FETCH(SIZE) \
+ [BPF_FUNC_fetch_##SIZE] = { \
+ .func = bpf_fetch_##SIZE, \
+ .gpl_only = true, \
+ .ret_type = RET_INTEGER, \
+ },
+ FETCH(ptr)
+ FETCH(u64)
+ FETCH(u32)
+ FETCH(u16)
+ FETCH(u8)
+#undef FETCH
+ [BPF_FUNC_memcmp] = {
+ .func = bpf_memcmp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+ },
+};
+
+static const struct bpf_func_proto *tracing_filter_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ default:
+ if (func_id < 0 || func_id >= ARRAY_SIZE(tracing_filter_funcs))
+ return NULL;
+ return &tracing_filter_funcs[func_id];
+ }
+}
+
+/* check access to argN fields of 'struct bpf_context' from program */
+static bool tracing_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct bpf_context))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+
+static struct bpf_verifier_ops tracing_filter_ops = {
+ .get_func_proto = tracing_filter_func_proto,
+ .is_valid_access = tracing_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+ .ops = &tracing_filter_ops,
+ .type = BPF_PROG_TYPE_TRACING_FILTER,
+};
+
+static int __init register_tracing_filter_ops(void)
+{
+ bpf_register_prog_type(&tl);
+ return 0;
+}
+late_initcall(register_tracing_filter_ops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8de48bac1ce2..d667547c6f0e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -977,12 +977,15 @@ struct ftrace_event_field {
int is_signed;
};
+struct bpf_prog;
+
struct event_filter {
int n_preds; /* Number assigned */
int a_preds; /* allocated */
struct filter_pred *preds;
struct filter_pred *root;
char *filter_string;
+ struct bpf_prog *prog;
};
struct event_subsystem {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b03a0ea77b99..70482817231a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1084,6 +1084,26 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
return r;
}
+static int event_filter_release(struct inode *inode, struct file *filp)
+{
+ struct ftrace_event_file *file;
+ char buf[2] = "0";
+
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (file) {
+ if (file->flags & TRACE_EVENT_FL_BPF) {
+ /* auto-disable the filter */
+ ftrace_event_enable_disable(file, 0);
+
+ /* if BPF filter was used, clear it on fd close */
+ apply_event_filter(file, buf);
+ }
+ }
+ mutex_unlock(&event_mutex);
+ return 0;
+}
+
static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -1107,8 +1127,18 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
file = event_file_data(filp);
- if (file)
+ if (file) {
+ /*
+ * note to user space tools:
+ * write() into debugfs/tracing/events/xxx/filter file
+ * must be done with the same privilege level as open()
+ */
err = apply_event_filter(file, buf);
+ if (!err && file->flags & TRACE_EVENT_FL_BPF)
+ /* once filter is applied, auto-enable it */
+ ftrace_event_enable_disable(file, 1);
+ }
+
mutex_unlock(&event_mutex);
free_page((unsigned long) buf);
@@ -1363,6 +1393,7 @@ static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_generic,
.read = event_filter_read,
.write = event_filter_write,
+ .release = event_filter_release,
.llseek = default_llseek,
};
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ced69da0ff55..e0303b3cc9fb 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -23,6 +23,9 @@
#include <linux/mutex.h>
#include <linux/perf_event.h>
#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include <linux/filter.h>
#include "trace.h"
#include "trace_output.h"
@@ -541,6 +544,21 @@ static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
return WALK_PRED_DEFAULT;
}
+unsigned int trace_filter_call_bpf(struct event_filter *filter, void *ctx)
+{
+ unsigned int ret;
+
+ if (in_nmi()) /* not supported yet */
+ return 0;
+
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(filter->prog, ctx);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_filter_call_bpf);
+
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
@@ -795,6 +813,8 @@ static void __free_filter(struct event_filter *filter)
if (!filter)
return;
+ if (filter->prog)
+ bpf_prog_put(filter->prog);
__free_preds(filter);
kfree(filter->filter_string);
kfree(filter);
@@ -1874,6 +1894,50 @@ static int create_filter_start(char *filter_str, bool set_str,
return err;
}
+static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
+{
+ struct event_filter *filter;
+ struct bpf_prog *prog;
+ long ufd;
+ int err = 0;
+
+ *filterp = NULL;
+
+ filter = __alloc_filter();
+ if (!filter)
+ return -ENOMEM;
+
+ err = replace_filter_string(filter, filter_str);
+ if (err)
+ goto free_filter;
+
+ err = kstrtol(filter_str + 4, 0, &ufd);
+ if (err)
+ goto free_filter;
+
+ prog = bpf_prog_get(ufd);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto free_filter;
+ }
+
+ filter->prog = prog;
+
+ if (prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER) {
+ /* valid fd, but invalid bpf program type */
+ err = -EINVAL;
+ goto free_filter;
+ }
+
+ *filterp = filter;
+
+ return 0;
+
+free_filter:
+ __free_filter(filter);
+ return err;
+}
+
static void create_filter_finish(struct filter_parse_state *ps)
{
if (ps) {
@@ -1971,6 +2035,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
filter_disable(file);
filter = event_filter(file);
+ file->flags &= ~TRACE_EVENT_FL_BPF;
if (!filter)
return 0;
@@ -1983,7 +2048,19 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
return 0;
}
- err = create_filter(call, filter_string, true, &filter);
+ /*
+ * 'bpf_123' string is a request to attach eBPF program with id == 123
+ * also accept 'bpf 123', 'bpf.123', 'bpf-123' variants
+ */
+ if (memcmp(filter_string, "bpf", 3) == 0 && filter_string[3] != 0 &&
+ filter_string[4] != 0) {
+ err = create_filter_bpf(filter_string, &filter);
+ if (!err)
+ file->flags |= TRACE_EVENT_FL_BPF;
+ } else {
+ err = create_filter(call, filter_string, true, &filter);
+ file->flags &= ~TRACE_EVENT_FL_BPF;
+ }
/*
* Always swap the call filter with the new filter
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..e1b25a834cc7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>
+#include <trace/bpf_trace.h>
#include "trace_output.h"
#include "trace.h"
@@ -290,6 +291,20 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
+static void populate_bpf_ctx(struct bpf_context *ctx, struct pt_regs *regs)
+{
+ struct task_struct *task = current;
+ unsigned long args[6];
+
+ syscall_get_arguments(task, regs, 0, 6, args);
+ ctx->arg1 = args[0];
+ ctx->arg2 = args[1];
+ ctx->arg3 = args[2];
+ ctx->arg4 = args[3];
+ ctx->arg5 = args[4];
+ ctx->arg6 = args[5];
+}
+
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
@@ -319,6 +334,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF) {
+ struct bpf_context ctx;
+
+ populate_bpf_ctx(&ctx, regs);
+ if (!trace_filter_call_bpf(ftrace_file->filter, &ctx))
+ return;
+ }
+
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
local_save_flags(irq_flags);
@@ -366,6 +389,14 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF) {
+ struct bpf_context ctx = {};
+
+ ctx.arg1 = syscall_get_return_value(current, regs);
+ if (!trace_filter_call_bpf(ftrace_file->filter, &ctx))
+ return;
+ }
+
local_save_flags(irq_flags);
pc = preempt_count();
--
1.7.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
2015-01-28 4:06 ` [PATCH v2 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls Alexei Starovoitov
@ 2015-01-29 0:46 ` Namhyung Kim
0 siblings, 0 replies; 15+ messages in thread
From: Namhyung Kim @ 2015-01-29 0:46 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
Hi Alexei,
On Tue, Jan 27, 2015 at 08:06:06PM -0800, Alexei Starovoitov wrote:
> User interface:
> fd = open("/sys/kernel/debug/tracing/__event__/filter")
>
> write(fd, "bpf_123")
>
> where 123 is process local FD associated with eBPF program previously loaded.
> __event__ is static tracepoint event or syscall.
> (kprobe support is in next patch)
> Once program is successfully attached to tracepoint event, the tracepoint
> will be auto-enabled
>
> close(fd)
> auto-disables tracepoint event and detaches eBPF program from it
>
> eBPF programs can call in-kernel helper functions to:
> - lookup/update/delete elements in maps
> - memcmp
> - fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
> so that eBPF program can walk any kernel data structures
>
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
> ---
[SNIP]
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index b03a0ea77b99..70482817231a 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -1084,6 +1084,26 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
> return r;
> }
>
> +static int event_filter_release(struct inode *inode, struct file *filp)
> +{
> + struct ftrace_event_file *file;
> + char buf[2] = "0";
> +
> + mutex_lock(&event_mutex);
> + file = event_file_data(filp);
> + if (file) {
> + if (file->flags & TRACE_EVENT_FL_BPF) {
> + /* auto-disable the filter */
> + ftrace_event_enable_disable(file, 0);
Hmm.. what if user already enabled an event, attached a bpf filter and
then detached the filter - I'm not sure we can always auto-disable
it..
> +
> + /* if BPF filter was used, clear it on fd close */
> + apply_event_filter(file, buf);
> + }
> + }
> + mutex_unlock(&event_mutex);
> + return 0;
> +}
> +
> static ssize_t
> event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
> loff_t *ppos)
> @@ -1107,8 +1127,18 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
>
> mutex_lock(&event_mutex);
> file = event_file_data(filp);
> - if (file)
> + if (file) {
> + /*
> + * note to user space tools:
> + * write() into debugfs/tracing/events/xxx/filter file
> + * must be done with the same privilege level as open()
> + */
> err = apply_event_filter(file, buf);
> + if (!err && file->flags & TRACE_EVENT_FL_BPF)
> + /* once filter is applied, auto-enable it */
> + ftrace_event_enable_disable(file, 1);
> + }
> +
> mutex_unlock(&event_mutex);
>
> free_page((unsigned long) buf);
> @@ -1363,6 +1393,7 @@ static const struct file_operations ftrace_event_filter_fops = {
> .open = tracing_open_generic,
> .read = event_filter_read,
> .write = event_filter_write,
> + .release = event_filter_release,
> .llseek = default_llseek,
> };
>
> diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
> index ced69da0ff55..e0303b3cc9fb 100644
> --- a/kernel/trace/trace_events_filter.c
> +++ b/kernel/trace/trace_events_filter.c
> @@ -23,6 +23,9 @@
> #include <linux/mutex.h>
> #include <linux/perf_event.h>
> #include <linux/slab.h>
> +#include <linux/bpf.h>
> +#include <trace/bpf_trace.h>
> +#include <linux/filter.h>
>
> #include "trace.h"
> #include "trace_output.h"
> @@ -541,6 +544,21 @@ static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
> return WALK_PRED_DEFAULT;
> }
>
> +unsigned int trace_filter_call_bpf(struct event_filter *filter, void *ctx)
> +{
> + unsigned int ret;
> +
> + if (in_nmi()) /* not supported yet */
> + return 0;
But doesn't this mean to auto-disable all attached events during NMI
as returning 0 will prevent the event going to ring buffer?
I think it'd be better to keep an attached event in a soft-disabled
state like event trigger and give control of enabling to users..
Thanks,
Namhyung
> +
> + rcu_read_lock();
> + ret = BPF_PROG_RUN(filter->prog, ctx);
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(trace_filter_call_bpf);
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 linux-trace 2/8] tracing: allow eBPF programs to call ktime_get_ns()
2015-01-28 4:06 [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls Alexei Starovoitov
@ 2015-01-28 4:06 ` Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler Alexei Starovoitov
` (5 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
bpf_ktime_get_ns() is used by programs to compute time delta between events
or as a timestamp
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
include/uapi/linux/bpf.h | 1 +
kernel/trace/bpf_trace.c | 10 ++++++++++
2 files changed, 11 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3bf42875287c..227a4e404726 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -169,6 +169,7 @@ enum bpf_func_id {
BPF_FUNC_fetch_u16, /* u16 bpf_fetch_u16(void *unsafe_ptr) */
BPF_FUNC_fetch_u8, /* u8 bpf_fetch_u8(void *unsafe_ptr) */
BPF_FUNC_memcmp, /* int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size) */
+ BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */
__BPF_FUNC_MAX_ID,
};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4aabbe2626c5..1c07f55702d6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -54,6 +54,11 @@ static u64 bpf_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
return -1;
}
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return ktime_get_ns();
+}
+
static struct bpf_func_proto tracing_filter_funcs[] = {
#define FETCH(SIZE) \
[BPF_FUNC_fetch_##SIZE] = { \
@@ -75,6 +80,11 @@ static struct bpf_func_proto tracing_filter_funcs[] = {
.arg2_type = ARG_PTR_TO_STACK,
.arg3_type = ARG_CONST_STACK_SIZE,
},
+ [BPF_FUNC_ktime_get_ns] = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ },
};
static const struct bpf_func_proto *tracing_filter_func_proto(enum bpf_func_id func_id)
--
1.7.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler
2015-01-28 4:06 [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 2/8] tracing: allow eBPF programs to call ktime_get_ns() Alexei Starovoitov
@ 2015-01-28 4:06 ` Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C Alexei Starovoitov
` (4 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
simple packet drop monitor:
- in-kernel eBPF program attaches to kfree_skb() event and records number
of packet drops at given location
- userspace iterates over the map every second and prints stats
Usage:
$ sudo dropmon
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2
location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2
location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2
$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 2 +
samples/bpf/dropmon.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 131 insertions(+)
create mode 100644 samples/bpf/dropmon.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..789691374562 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,7 +6,9 @@ hostprogs-y := test_verifier test_maps
hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
+hostprogs-y += dropmon
+dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
diff --git a/samples/bpf/dropmon.c b/samples/bpf/dropmon.c
new file mode 100644
index 000000000000..9a2cd3344d69
--- /dev/null
+++ b/samples/bpf/dropmon.c
@@ -0,0 +1,129 @@
+/* simple packet drop monitor:
+ * - in-kernel eBPF program attaches to kfree_skb() event and records number
+ * of packet drops at given location
+ * - userspace iterates over the map every second and prints stats
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <linux/filter.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include "libbpf.h"
+
+#define TRACEPOINT "/sys/kernel/debug/tracing/events/skb/kfree_skb/"
+
+static int write_to_file(const char *file, const char *str, bool keep_open)
+{
+ int fd, err;
+
+ fd = open(file, O_WRONLY);
+ err = write(fd, str, strlen(str));
+ (void) err;
+
+ if (keep_open) {
+ return fd;
+ } else {
+ close(fd);
+ return -1;
+ }
+}
+
+static int dropmon(void)
+{
+ long long key, next_key, value = 0;
+ int prog_fd, map_fd, i;
+ char fmt[32];
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ goto cleanup;
+ }
+
+ /* the following eBPF program is equivalent to C:
+ * int filter(struct bpf_context *ctx)
+ * {
+ * long loc = ctx->arg2;
+ * long init_val = 1;
+ * long *value;
+ *
+ * value = bpf_map_lookup_elem(MAP_ID, &loc);
+ * if (value) {
+ * __sync_fetch_and_add(value, 1);
+ * } else {
+ * bpf_map_update_elem(MAP_ID, &loc, &init_val, BPF_ANY);
+ * }
+ * return 0;
+ * }
+ */
+ struct bpf_insn prog[] = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), /* r2 = *(u64 *)(r1 + 8) */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* *(u64 *)(fp - 8) = r2 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 1), /* *(u64 *)(fp - 16) = 1 */
+ BPF_MOV64_IMM(BPF_REG_4, BPF_ANY),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -16), /* r3 = fp - 16 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ };
+
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING_FILTER, prog,
+ sizeof(prog), "GPL");
+ if (prog_fd < 0) {
+ printf("failed to load prog '%s'\n%s", strerror(errno), bpf_log_buf);
+ return -1;
+ }
+
+ sprintf(fmt, "bpf_%d", prog_fd);
+
+ write_to_file(TRACEPOINT "filter", fmt, true);
+
+ for (i = 0; i < 10; i++) {
+ key = 0;
+ while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
+ bpf_lookup_elem(map_fd, &next_key, &value);
+ printf("location 0x%llx count %lld\n", next_key, value);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ sleep(1);
+ }
+
+cleanup:
+ /* maps, programs, tracepoint filters will auto cleanup on process exit */
+
+ return 0;
+}
+
+int main(void)
+{
+ FILE *f;
+
+ /* start ping in the background to get some kfree_skb events */
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ dropmon();
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
2015-01-28 4:06 [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
` (2 preceding siblings ...)
2015-01-28 4:06 ` [PATCH v2 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler Alexei Starovoitov
@ 2015-01-28 4:06 ` Alexei Starovoitov
2015-01-28 16:24 ` Arnaldo Carvalho de Melo
2015-01-28 4:06 ` [PATCH v2 linux-trace 5/8] samples: bpf: counting example for kfree_skb tracepoint and write syscall Alexei Starovoitov
` (3 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
tracex1_kern.c - C program which will be compiled into eBPF
to filter netif_receive_skb events on skb->dev->name == "lo"
The program returns 1 to continue storing an event into the trace buffer
and returns 0 to discard an event.
tracex1_user.c - corresponding user space component that
forever reads /sys/.../trace_pipe
Usage:
$ sudo tracex1
should see:
writing bpf-4 -> /sys/kernel/debug/tracing/events/net/netif_receive_skb/filter
ping-364 [000] ..s2 8.089771: netif_receive_skb: dev=lo skbaddr=ffff88000dfcc100 len=84
ping-364 [000] ..s2 8.089889: netif_receive_skb: dev=lo skbaddr=ffff88000dfcc900 len=84
Ctrl-C at any time, kernel will auto cleanup
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_helpers.h | 14 +++++++++++
samples/bpf/bpf_load.c | 59 +++++++++++++++++++++++++++++++++++++++-----
samples/bpf/bpf_load.h | 3 +++
samples/bpf/tracex1_kern.c | 28 +++++++++++++++++++++
samples/bpf/tracex1_user.c | 24 ++++++++++++++++++
6 files changed, 126 insertions(+), 6 deletions(-)
create mode 100644 samples/bpf/tracex1_kern.c
create mode 100644 samples/bpf/tracex1_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 789691374562..da28e1b6d3a6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += dropmon
+hostprogs-y += tracex1
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -14,17 +15,20 @@ test_maps-objs := test_maps.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
+always += tracex1_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..9c385c2eacf8 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,20 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
(void *) BPF_FUNC_map_update_elem;
static int (*bpf_map_delete_elem)(void *map, void *key) =
(void *) BPF_FUNC_map_delete_elem;
+static void *(*bpf_fetch_ptr)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_ptr;
+static unsigned long long (*bpf_fetch_u64)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u64;
+static unsigned int (*bpf_fetch_u32)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u32;
+static unsigned short (*bpf_fetch_u16)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u16;
+static unsigned char (*bpf_fetch_u8)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u8;
+static int (*bpf_memcmp)(void *unsafe_ptr, void *safe_ptr, int size) =
+ (void *) BPF_FUNC_memcmp;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+ (void *) BPF_FUNC_ktime_get_ns;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..788ac51c1024 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -14,6 +14,8 @@
#include "bpf_helpers.h"
#include "bpf_load.h"
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
static char license[128];
static bool processed_sec[128];
int map_fd[MAX_MAPS];
@@ -22,15 +24,18 @@ int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
- int fd;
bool is_socket = strncmp(event, "socket", 6) == 0;
+ enum bpf_prog_type prog_type;
+ char path[256] = DEBUGFS;
+ char fmt[32];
+ int fd, event_fd, err;
- if (!is_socket)
- /* tracing events tbd */
- return -1;
+ if (is_socket)
+ prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ else
+ prog_type = BPF_PROG_TYPE_TRACING_FILTER;
- fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
- prog, size, license);
+ fd = bpf_prog_load(prog_type, prog, size, license);
if (fd < 0) {
printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +44,28 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd;
+ if (is_socket)
+ return 0;
+
+ snprintf(fmt, sizeof(fmt), "bpf-%d", fd);
+
+ strcat(path, event);
+ strcat(path, "/filter");
+
+ printf("writing %s -> %s\n", fmt, path);
+
+ event_fd = open(path, O_WRONLY, 0);
+ if (event_fd < 0) {
+ printf("failed to open event %s\n", event);
+ return -1;
+ }
+
+ err = write(event_fd, fmt, strlen(fmt));
+ if (err < 0) {
+ printf("write to '%s' failed '%s'\n", event, strerror(errno));
+ return -1;
+ }
+
return 0;
}
@@ -201,3 +228,23 @@ int load_bpf_file(char *path)
close(fd);
return 0;
}
+
+void read_trace_pipe(void)
+{
+ int trace_fd;
+
+ trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+ if (trace_fd < 0)
+ return;
+
+ while (1) {
+ static char buf[4096];
+ ssize_t sz;
+
+ sz = read(trace_fd, buf, sizeof(buf));
+ if (sz) {
+ buf[sz] = 0;
+ puts(buf);
+ }
+ }
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..d154fc2b0535 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -21,4 +21,7 @@ extern int prog_fd[MAX_PROGS];
*/
int load_bpf_file(char *path);
+/* forever reads /sys/.../trace_pipe */
+void read_trace_pipe(void);
+
#endif
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..7849ceb4bce6
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,28 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+SEC("events/net/netif_receive_skb")
+int bpf_prog1(struct bpf_context *ctx)
+{
+ /*
+ * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
+ * prints events for loopback device only
+ */
+ char devname[] = "lo";
+ struct net_device *dev;
+ struct sk_buff *skb = 0;
+
+ skb = (struct sk_buff *) ctx->arg1;
+ dev = bpf_fetch_ptr(&skb->dev);
+ if (bpf_memcmp(dev->name, devname, 2) == 0)
+ /* print event using default tracepoint format */
+ return 1;
+
+ /* drop event */
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..e85c1b483f57
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+ FILE *f;
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ read_trace_pipe();
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
2015-01-28 4:06 ` [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C Alexei Starovoitov
@ 2015-01-28 16:24 ` Arnaldo Carvalho de Melo
2015-01-28 16:25 ` Arnaldo Carvalho de Melo
0 siblings, 1 reply; 15+ messages in thread
From: Arnaldo Carvalho de Melo @ 2015-01-28 16:24 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Ingo Molnar, Namhyung Kim, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
Em Tue, Jan 27, 2015 at 08:06:09PM -0800, Alexei Starovoitov escreveu:
> diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
> new file mode 100644
> index 000000000000..7849ceb4bce6
> --- /dev/null
> +++ b/samples/bpf/tracex1_kern.c
> @@ -0,0 +1,28 @@
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <uapi/linux/bpf.h>
> +#include <trace/bpf_trace.h>
> +#include "bpf_helpers.h"
> +
> +SEC("events/net/netif_receive_skb")
> +int bpf_prog1(struct bpf_context *ctx)
> +{
> + /*
> + * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
> + * prints events for loobpack device only
> + */
> + char devname[] = "lo";
> + struct net_device *dev;
> + struct sk_buff *skb = 0;
> +
> + skb = (struct sk_buff *) ctx->arg1;
> + dev = bpf_fetch_ptr(&skb->dev);
> + if (bpf_memcmp(dev->name, devname, 2) == 0)
I'm only starting to look at all this, so bear with me... But why do we
need to have it as "bpf_memcmp"? Can't we simply use it as "memcmp" and
have it use the right function?
Less typing, perhaps we would need to have a:
#define memcmp bpf_memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
in bpf_helpers.h to have it work?
- Arnaldo
> + /* print event using default tracepoint format */
> + return 1;
> +
> + /* drop event */
> + return 0;
> +}
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
2015-01-28 16:24 ` Arnaldo Carvalho de Melo
@ 2015-01-28 16:25 ` Arnaldo Carvalho de Melo
2015-01-28 16:42 ` Alexei Starovoitov
0 siblings, 1 reply; 15+ messages in thread
From: Arnaldo Carvalho de Melo @ 2015-01-28 16:25 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Ingo Molnar, Namhyung Kim, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
Em Wed, Jan 28, 2015 at 01:24:15PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Tue, Jan 27, 2015 at 08:06:09PM -0800, Alexei Starovoitov escreveu:
> > diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
> > new file mode 100644
> > index 000000000000..7849ceb4bce6
> > --- /dev/null
> > +++ b/samples/bpf/tracex1_kern.c
> > @@ -0,0 +1,28 @@
> > +#include <linux/skbuff.h>
> > +#include <linux/netdevice.h>
> > +#include <uapi/linux/bpf.h>
> > +#include <trace/bpf_trace.h>
> > +#include "bpf_helpers.h"
> > +
> > +SEC("events/net/netif_receive_skb")
> > +int bpf_prog1(struct bpf_context *ctx)
> > +{
> > + /*
> > + * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
> > + * prints events for loobpack device only
> > + */
> > + char devname[] = "lo";
> > + struct net_device *dev;
> > + struct sk_buff *skb = 0;
> > +
> > + skb = (struct sk_buff *) ctx->arg1;
> > + dev = bpf_fetch_ptr(&skb->dev);
> > + if (bpf_memcmp(dev->name, devname, 2) == 0)
>
> I'm only starting to look at all this, so bear with me... But why do we
> need to have it as "bpf_memcmp"? Can't we simply use it as "memcmp" and
> have it use the right function?
>
> Less typing, perhaps we would need to have a:
>
> #define memcmp bpf_memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
Argh, like this:
#define memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
> in bpf_helpers.h to have it work?
>
> - Arnaldo
>
> > + /* print event using default tracepoint format */
> > + return 1;
> > +
> > + /* drop event */
> > + return 0;
> > +}
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
2015-01-28 16:25 ` Arnaldo Carvalho de Melo
@ 2015-01-28 16:42 ` Alexei Starovoitov
2015-01-28 20:44 ` Arnaldo Carvalho de Melo
0 siblings, 1 reply; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 16:42 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo
Cc: Steven Rostedt, Ingo Molnar, Namhyung Kim, Jiri Olsa,
Masami Hiramatsu, Linux API, Network Development, LKML
On Wed, Jan 28, 2015 at 8:25 AM, Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
> Em Wed, Jan 28, 2015 at 01:24:15PM -0300, Arnaldo Carvalho de Melo escreveu:
>> Em Tue, Jan 27, 2015 at 08:06:09PM -0800, Alexei Starovoitov escreveu:
>> > diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
>> > new file mode 100644
>> > index 000000000000..7849ceb4bce6
>> > --- /dev/null
>> > +++ b/samples/bpf/tracex1_kern.c
>> > @@ -0,0 +1,28 @@
>> > +#include <linux/skbuff.h>
>> > +#include <linux/netdevice.h>
>> > +#include <uapi/linux/bpf.h>
>> > +#include <trace/bpf_trace.h>
>> > +#include "bpf_helpers.h"
>> > +
>> > +SEC("events/net/netif_receive_skb")
>> > +int bpf_prog1(struct bpf_context *ctx)
>> > +{
>> > + /*
>> > + * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
>> > + * prints events for loobpack device only
>> > + */
>> > + char devname[] = "lo";
>> > + struct net_device *dev;
>> > + struct sk_buff *skb = 0;
>> > +
>> > + skb = (struct sk_buff *) ctx->arg1;
>> > + dev = bpf_fetch_ptr(&skb->dev);
>> > + if (bpf_memcmp(dev->name, devname, 2) == 0)
>>
>> I'm only starting to look at all this, so bear with me... But why do we
>> need to have it as "bpf_memcmp"? Can't we simply use it as "memcmp" and
>> have it use the right function?
>>
>> Less typing, perhaps we would need to have a:
>>
>> #define memcmp bpf_memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
>
> Argh, like this:
>
> #define memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
>
>> in bpf_helpers.h to have it work?
yes, that will work just fine.
Since it's an example I made it explicit that bpf_memcmp()
has memcmp() semantics, but is a little bit different:
int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size)
meaning that one of the pointers can point anywhere and
the function will be doing probe_kernel_read() underneath
similar to bpf_fetch_*() helpers.
If it was plain memcmp() it would give a wrong impression
that vanilla memcmp() can be used.
In general the programs cannot use any library functions
outside of helpers defined in uapi/linux/bpf.h
bpf_fetch_*() helpers are also explicit in examples.
If one needs to do a lot of pointer walking, then a macro like
#define D(P) ((typeof(P))bpf_fetch_ptr(&P))
would be easier to use: p = D(D(skb->dev)->ifalias)
multiple pointer derefs would look more natural...
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
2015-01-28 16:42 ` Alexei Starovoitov
@ 2015-01-28 20:44 ` Arnaldo Carvalho de Melo
0 siblings, 0 replies; 15+ messages in thread
From: Arnaldo Carvalho de Melo @ 2015-01-28 20:44 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Ingo Molnar, Namhyung Kim, Jiri Olsa,
Masami Hiramatsu, Linux API, Network Development, LKML
Em Wed, Jan 28, 2015 at 08:42:29AM -0800, Alexei Starovoitov escreveu:
> On Wed, Jan 28, 2015 at 8:25 AM, Arnaldo Carvalho de Melo
> <acme@kernel.org> wrote:
> > Em Wed, Jan 28, 2015 at 01:24:15PM -0300, Arnaldo Carvalho de Melo escreveu:
> >> Em Tue, Jan 27, 2015 at 08:06:09PM -0800, Alexei Starovoitov escreveu:
> >> > + if (bpf_memcmp(dev->name, devname, 2) == 0)
> >> I'm only starting to look at all this, so bear with me... But why do we
> >> need to have it as "bpf_memcmp"? Can't we simply use it as "memcmp" and
> >> have it use the right function?
> >> Less typing, perhaps we would need to have a:
> >> #define memcmp bpf_memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
> > Argh, like this:
> > #define memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
> >> in bpf_helpers.h to have it work?
> yes, that will work just fine.
> Since it's an example I made it explicit that bpf_memcmp()
> has memcmp() semantics, but little bit different:
> int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size)
Not knowing about the safe/unsafe pointers (at this point in my
conceptual eBPF learning process), I would think that it would be easier
to understand if it would reuse another well known idiom:
#define memcmp_from_user(kernel, user, n) bpf_memcmp(user, kernel, n)
That would be similar to:
copy_from_user(void *to, const void __user *from, unsigned long n)
But here, again bear with me, I'm just brainstorming, as from just
looking at:
bpf_memcmp(a, b, n)
I don't reuse anything I've learned before trying to understand eBPF,
nor do I see any well-known marker (__user) that would help me understand
that that pointer needs special treatment/belongs to a different "domain".
> meaning that one of the pointers can point anywhere and
> the function will be doing probe_kernel_read() underneath
> similar to bpf_fetch_*() helpers.
> If it was plain memcmp() it would give a wrong impression
> that vanilla memcmp() can be used.
Since that is not the case, I agree that the 'memcmp' semantic can't be
used, as the two pointers are not on the same "domain", so to say.
> In general the programs cannot use any library functions
> outside of helpers defined in uapi/linux/bpf.h
>
> bpf_fetch_*() helpers are also explicit in examples.
> If one need to do a lot of pointer walking, then macro like
> #define D(P) ((typeof(P))bpf_fetch_ptr(&P))
> would be easier to use: p = D(D(skb->dev)->ifalias)
> multiple pointer derefs would look more natural...
And if possible, i.e. if the eBPF compiler would take care of that
somehow, would indeed be preferred as it looks more natural :-)
- Arnaldo
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 linux-trace 5/8] samples: bpf: counting example for kfree_skb tracepoint and write syscall
2015-01-28 4:06 [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
` (3 preceding siblings ...)
2015-01-28 4:06 ` [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C Alexei Starovoitov
@ 2015-01-28 4:06 ` Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 6/8] samples: bpf: IO latency analysis (iosnoop/heatmap) Alexei Starovoitov
` (2 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
this example has two probes in one C file that attach to different tracepoints
and use two different maps.
1st probe is similar to dropmon.c. It attaches to the kfree_skb tracepoint and
count number of packet drops at different locations
2nd probe attaches to syscalls/sys_enter_write and computes a histogram of different
write sizes
Usage:
$ sudo tracex2
writing bpf-8 -> /sys/kernel/debug/tracing/events/skb/kfree_skb/filter
writing bpf-10 -> /sys/kernel/debug/tracing/events/syscalls/sys_enter_write/filter
location 0xffffffff816959a5 count 1
location 0xffffffff816959a5 count 2
557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
syscall write() stats
byte_size : count distribution
1 -> 1 : 3 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 2 | |
32 -> 63 : 3 | |
64 -> 127 : 1 | |
128 -> 255 : 1 | |
256 -> 511 : 0 | |
512 -> 1023 : 1118968 |************************************* |
Ctrl-C at any time. Kernel will auto cleanup maps and programs
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex2_kern.c | 71 +++++++++++++++++++++++++++++++++
samples/bpf/tracex2_user.c | 95 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 170 insertions(+)
create mode 100644 samples/bpf/tracex2_kern.c
create mode 100644 samples/bpf/tracex2_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da28e1b6d3a6..416af24b01fd 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += dropmon
hostprogs-y += tracex1
+hostprogs-y += tracex2
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -16,12 +17,14 @@ sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
+always += tracex2_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -29,6 +32,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..a789c456c1b4
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,71 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+SEC("events/skb/kfree_skb")
+int bpf_prog2(struct bpf_context *ctx)
+{
+ long loc = ctx->arg2;
+ long init_val = 1;
+ long *value;
+
+ value = bpf_map_lookup_elem(&my_map, &loc);
+ if (value)
+ *value += 1;
+ else
+ bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+ return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+ if (hi)
+ return log2(hi) + 32;
+ else
+ return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 64,
+};
+
+SEC("events/syscalls/sys_enter_write")
+int bpf_prog3(struct bpf_context *ctx)
+{
+ long write_size = ctx->arg3;
+ long init_val = 1;
+ long *value;
+ u32 index = log2l(write_size);
+
+ value = bpf_map_lookup_elem(&my_hist_map, &index);
+ if (value)
+ __sync_fetch_and_add(value, 1);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..016a76e97cd7
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ long value;
+ long data[MAX_INDEX] = {};
+ char starstr[MAX_STARS];
+ int i;
+ int max_ind = -1;
+ long max_value = 0;
+
+ for (key = 0; key < MAX_INDEX; key++) {
+ bpf_lookup_elem(fd, &key, &value);
+ data[key] = value;
+ if (value && key > max_ind)
+ max_ind = key;
+ if (value > max_value)
+ max_value = value;
+ }
+
+ printf(" syscall write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+static void int_exit(int sig)
+{
+ print_hist(map_fd[1]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ long key, next_key, value;
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ signal(SIGINT, int_exit);
+
+ /* start 'ping' in the background to have some kfree_skb events */
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 0; i < 5; i++) {
+ key = 0;
+ while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_lookup_elem(map_fd[0], &next_key, &value);
+ printf("location 0x%lx count %ld\n", next_key, value);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ sleep(1);
+ }
+ print_hist(map_fd[1]);
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 linux-trace 6/8] samples: bpf: IO latency analysis (iosnoop/heatmap)
2015-01-28 4:06 [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
` (4 preceding siblings ...)
2015-01-28 4:06 ` [PATCH v2 linux-trace 5/8] samples: bpf: counting example for kfree_skb tracepoint and write syscall Alexei Starovoitov
@ 2015-01-28 4:06 ` Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 7/8] tracing: attach eBPF programs to kprobe/kretprobe Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 8/8] samples: bpf: simple kprobe example Alexei Starovoitov
7 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
eBPF C program attaches to block_rq_issue/block_rq_complete events to calculate
IO latency. Then it waits for the first 100 events to compute average latency
and uses range [0 .. ave_lat * 2] to record histogram of events in this latency
range.
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.
If kernel sees too many events that fall out of histogram range, user space
adjusts the range up, so heatmap for next 2 seconds will be more accurate.
Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.
Similar experiments can be done for network transmit latencies, syscalls, etc
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex3_kern.c | 92 +++++++++++++++++++++++++++
samples/bpf/tracex3_user.c | 150 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 246 insertions(+)
create mode 100644 samples/bpf/tracex3_kern.c
create mode 100644 samples/bpf/tracex3_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 416af24b01fd..da0efd8032ab 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
hostprogs-y += dropmon
hostprogs-y += tracex1
hostprogs-y += tracex2
+hostprogs-y += tracex3
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -18,6 +19,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -25,6 +27,7 @@ always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
+always += tracex3_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -33,6 +36,7 @@ HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..c31f29aa6fc1
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,92 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(u64),
+ .max_entries = 4096,
+};
+
+SEC("events/block/block_rq_issue")
+int bpf_prog1(struct bpf_context *ctx)
+{
+ long rq = ctx->arg2;
+ u64 val = bpf_ktime_get_ns();
+
+ bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+ return 0;
+}
+
+struct globals {
+ u64 lat_ave;
+ u64 lat_sum;
+ u64 missed;
+ u64 max_lat;
+ int num_samples;
+};
+
+struct bpf_map_def SEC("maps") global_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct globals),
+ .max_entries = 1,
+};
+
+#define MAX_SLOT 32
+
+struct bpf_map_def SEC("maps") lat_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_SLOT,
+};
+
+SEC("events/block/block_rq_complete")
+int bpf_prog2(struct bpf_context *ctx)
+{
+ long rq = ctx->arg2;
+ void *value;
+
+ value = bpf_map_lookup_elem(&my_map, &rq);
+ if (!value)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = (cur_time - *(u64 *)value) / 1000;
+
+ bpf_map_delete_elem(&my_map, &rq);
+
+ int ind = 0;
+ struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
+
+ if (!g)
+ return 0;
+ if (g->lat_ave == 0) {
+ g->num_samples++;
+ g->lat_sum += delta;
+ if (g->num_samples >= 100)
+ g->lat_ave = g->lat_sum / g->num_samples;
+ } else {
+ u64 max_lat = g->lat_ave * 2;
+
+ if (delta > max_lat) {
+ g->missed++;
+ if (delta > g->max_lat)
+ g->max_lat = delta;
+ return 0;
+ }
+
+ ind = delta * MAX_SLOT / max_lat;
+ value = bpf_map_lookup_elem(&lat_map, &ind);
+ if (!value)
+ return 0;
+ (*(u64 *)value)++;
+ }
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..b7549adbd981
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,150 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+struct globals {
+ __u64 lat_ave;
+ __u64 lat_sum;
+ __u64 missed;
+ __u64 max_lat;
+ int num_samples;
+};
+
+static void clear_stats(int fd)
+{
+ int key;
+ __u64 value = 0;
+
+ for (key = 0; key < 32; key++)
+ bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+ "\033[48;5;255m",
+ "\033[48;5;252m",
+ "\033[48;5;250m",
+ "\033[48;5;248m",
+ "\033[48;5;246m",
+ "\033[48;5;244m",
+ "\033[48;5;242m",
+ "\033[48;5;240m",
+ "\033[48;5;238m",
+ "\033[48;5;236m",
+ "\033[48;5;234m",
+ "\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+static void print_banner(__u64 max_lat)
+{
+ printf("0 usec ... %lld usec\n", max_lat);
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ __u64 value;
+ __u64 cnt[32];
+ __u64 max_cnt = 0;
+ __u64 total_events = 0;
+ int max_bucket = 0;
+
+ for (key = 0; key < 32; key++) {
+ value = 0;
+ bpf_lookup_elem(fd, &key, &value);
+ if (value > 0)
+ max_bucket = key;
+ cnt[key] = value;
+ total_events += value;
+ if (value > max_cnt)
+ max_cnt = value;
+ }
+ clear_stats(fd);
+ for (key = 0; key < 32; key++) {
+ int c = num_colors * cnt[key] / (max_cnt + 1);
+
+ printf("%s %s", color[c], nocolor);
+ }
+ printf(" captured=%lld", total_events);
+
+ key = 0;
+ struct globals g = {};
+
+ bpf_lookup_elem(map_fd[1], &key, &g);
+
+ printf(" missed=%lld max_lat=%lld usec\n",
+ g.missed, g.max_lat);
+
+ if (g.missed > 10 && g.missed > total_events / 10) {
+ printf("adjusting range UP...\n");
+ g.lat_ave = g.max_lat / 2;
+ print_banner(g.lat_ave * 2);
+ } else if (max_bucket < 4 && total_events > 100) {
+ printf("adjusting range DOWN...\n");
+ g.lat_ave = g.lat_ave / 4;
+ print_banner(g.lat_ave * 2);
+ }
+ /* clear some globals */
+ g.missed = 0;
+ g.max_lat = 0;
+ bpf_update_elem(map_fd[1], &key, &g, BPF_ANY);
+}
+
+static void int_exit(int sig)
+{
+ print_hist(map_fd[2]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ clear_stats(map_fd[2]);
+
+ signal(SIGINT, int_exit);
+
+ if (fork() == 0) {
+ read_trace_pipe();
+ } else {
+ struct globals g;
+
+ printf("waiting for events to determine average latency...\n");
+ for (;;) {
+ int key = 0;
+
+ bpf_lookup_elem(map_fd[1], &key, &g);
+ if (g.lat_ave)
+ break;
+ sleep(1);
+ }
+
+ printf(" IO latency in usec\n"
+ " %s %s - many events with this latency\n"
+ " %s %s - few events\n",
+ color[num_colors - 1], nocolor,
+ color[0], nocolor);
+ print_banner(g.lat_ave * 2);
+ for (;;) {
+ print_hist(map_fd[2]);
+ sleep(2);
+ }
+ }
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 linux-trace 7/8] tracing: attach eBPF programs to kprobe/kretprobe
2015-01-28 4:06 [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
` (5 preceding siblings ...)
2015-01-28 4:06 ` [PATCH v2 linux-trace 6/8] samples: bpf: IO latency analysis (iosnoop/heatmap) Alexei Starovoitov
@ 2015-01-28 4:06 ` Alexei Starovoitov
2015-01-28 4:06 ` [PATCH v2 linux-trace 8/8] samples: bpf: simple kprobe example Alexei Starovoitov
7 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
introduce a new type of eBPF program, BPF_PROG_TYPE_KPROBE_FILTER.
Such programs are allowed to call the same helper functions
as tracing filters, but bpf_context is different:
For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
For kprobe filters bpf_context == pt_regs
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
include/linux/ftrace_event.h | 2 ++
include/uapi/linux/bpf.h | 1 +
kernel/trace/bpf_trace.c | 39 ++++++++++++++++++++++++++++++++++++
kernel/trace/trace_events_filter.c | 10 ++++++---
kernel/trace/trace_kprobe.c | 11 +++++++++-
5 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 79de230b7df3..b057ca0c5539 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -249,6 +249,7 @@ enum {
TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
TRACE_EVENT_FL_TRACEPOINT_BIT,
TRACE_EVENT_FL_BPF_BIT,
+ TRACE_EVENT_FL_KPROBE_BIT,
};
/*
@@ -272,6 +273,7 @@ enum {
TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
TRACE_EVENT_FL_BPF = (1 << TRACE_EVENT_FL_BPF_BIT),
+ TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT),
};
struct ftrace_event_call {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 227a4e404726..974932b8b5c6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -119,6 +119,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET_FILTER,
BPF_PROG_TYPE_TRACING_FILTER,
+ BPF_PROG_TYPE_KPROBE_FILTER,
};
/* flags for BPF_MAP_UPDATE_ELEM command */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1c07f55702d6..0fc50d3ecde1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -137,3 +137,42 @@ static int __init register_tracing_filter_ops(void)
return 0;
}
late_initcall(register_tracing_filter_ops);
+
+/* check access to fields of 'struct pt_regs' from BPF program */
+static bool kprobe_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct pt_regs))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+/* kprobe filter programs are allowed to call the same helper functions
+ * as tracing filters, but bpf_context is different:
+ * For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
+ * For kprobe filters bpf_context == pt_regs
+ */
+static struct bpf_verifier_ops kprobe_filter_ops = {
+ .get_func_proto = tracing_filter_func_proto,
+ .is_valid_access = kprobe_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+ .ops = &kprobe_filter_ops,
+ .type = BPF_PROG_TYPE_KPROBE_FILTER,
+};
+
+static int __init register_kprobe_filter_ops(void)
+{
+ bpf_register_prog_type(&kprobe_tl);
+ return 0;
+}
+late_initcall(register_kprobe_filter_ops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e0303b3cc9fb..e4a0268f2810 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1894,7 +1894,8 @@ static int create_filter_start(char *filter_str, bool set_str,
return err;
}
-static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
+static int create_filter_bpf(struct ftrace_event_call *call, char *filter_str,
+ struct event_filter **filterp)
{
struct event_filter *filter;
struct bpf_prog *prog;
@@ -1923,7 +1924,10 @@ static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
filter->prog = prog;
- if (prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER) {
+ if (((call->flags & TRACE_EVENT_FL_KPROBE) &&
+ prog->aux->prog_type != BPF_PROG_TYPE_KPROBE_FILTER) ||
+ (!(call->flags & TRACE_EVENT_FL_KPROBE) &&
+ prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER)) {
/* valid fd, but invalid bpf program type */
err = -EINVAL;
goto free_filter;
@@ -2054,7 +2058,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
*/
if (memcmp(filter_string, "bpf", 3) == 0 && filter_string[3] != 0 &&
filter_string[4] != 0) {
- err = create_filter_bpf(filter_string, &filter);
+ err = create_filter_bpf(call, filter_string, &filter);
if (!err)
file->flags |= TRACE_EVENT_FL_BPF;
} else {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..ec62dd8cb35f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
#include "trace_probe.h"
@@ -930,6 +931,10 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (ftrace_trigger_soft_disabled(ftrace_file))
return;
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF)
+ if (!trace_filter_call_bpf(ftrace_file->filter, regs))
+ return;
+
local_save_flags(irq_flags);
pc = preempt_count();
@@ -978,6 +983,10 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (ftrace_trigger_soft_disabled(ftrace_file))
return;
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF)
+ if (!trace_filter_call_bpf(ftrace_file->filter, regs))
+ return;
+
local_save_flags(irq_flags);
pc = preempt_count();
@@ -1286,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+ call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
call->data = tk;
ret = trace_add_event_call(call);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 linux-trace 8/8] samples: bpf: simple kprobe example
2015-01-28 4:06 [PATCH v2 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
` (6 preceding siblings ...)
2015-01-28 4:06 ` [PATCH v2 linux-trace 7/8] tracing: attach eBPF programs to kprobe/kretprobe Alexei Starovoitov
@ 2015-01-28 4:06 ` Alexei Starovoitov
7 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
the logic of the example is similar to tracex2, but the syscall 'write' statistics
are captured from a kprobe placed at the sys_write function instead of through
syscall instrumentation.
Also tracex4_kern.c has a different way of doing log2 in C.
Note, unlike tracepoint and syscall programs, kprobe programs receive
'struct pt_regs' as an input. It is the responsibility of the program author
or of a higher-level dynamic tracing tool to match registers to function arguments.
Since pt_regs is architecture dependent, such programs are also arch dependent,
unlike tracepoint/syscall programs, which are universal.
Usage:
$ sudo tracex4
writing bpf-6 -> /sys/kernel/debug/tracing/events/kprobes/sys_write/filter
2216443+0 records in
2216442+0 records out
1134818304 bytes (1.1 GB) copied, 2.00746 s, 565 MB/s
kprobe sys_write() stats
byte_size : count distribution
1 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 1 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 2214734 |************************************* |
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_load.c | 3 ++
samples/bpf/tracex4_kern.c | 36 +++++++++++++++++++
samples/bpf/tracex4_user.c | 83 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 126 insertions(+)
create mode 100644 samples/bpf/tracex4_kern.c
create mode 100644 samples/bpf/tracex4_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da0efd8032ab..22c7a38f3f95 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -10,6 +10,7 @@ hostprogs-y += dropmon
hostprogs-y += tracex1
hostprogs-y += tracex2
hostprogs-y += tracex3
+hostprogs-y += tracex4
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -20,6 +21,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -28,6 +30,7 @@ always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
always += tracex3_kern.o
+always += tracex4_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -37,6 +40,7 @@ HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 788ac51c1024..d8c5176f0564 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -25,6 +25,7 @@ int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
bool is_socket = strncmp(event, "socket", 6) == 0;
+ bool is_kprobe = strncmp(event, "events/kprobes/", 15) == 0;
enum bpf_prog_type prog_type;
char path[256] = DEBUGFS;
char fmt[32];
@@ -32,6 +33,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
if (is_socket)
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ else if (is_kprobe)
+ prog_type = BPF_PROG_TYPE_KPROBE_FILTER;
else
prog_type = BPF_PROG_TYPE_TRACING_FILTER;
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..9646f9e43417
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,36 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+ int i = -(n == 0);
+ S(32); S(16); S(8); S(4); S(2); S(1);
+ return i;
+#undef S
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 64,
+};
+
+SEC("events/kprobes/sys_write")
+int bpf_prog4(struct pt_regs *regs)
+{
+ long write_size = regs->dx; /* $rdx contains 3rd argument to a function */
+ long init_val = 1;
+ void *value;
+ u32 index = log2l(write_size);
+
+ value = bpf_map_lookup_elem(&my_hist_map, &index);
+ if (value)
+ __sync_fetch_and_add((long *)value, 1);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..47dde2791f9e
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,83 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ long value;
+ long data[MAX_INDEX] = {};
+ char starstr[MAX_STARS];
+ int i;
+ int max_ind = -1;
+ long max_value = 0;
+
+ for (key = 0; key < MAX_INDEX; key++) {
+ bpf_lookup_elem(fd, &key, &value);
+ data[key] = value;
+ if (value && key > max_ind)
+ max_ind = key;
+ if (value > max_value)
+ max_value = value;
+ }
+
+ printf("\n kprobe sys_write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+static void int_exit(int sig)
+{
+ print_hist(map_fd[0]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ signal(SIGINT, int_exit);
+
+ i = system("echo 'p:sys_write sys_write' > /sys/kernel/debug/tracing/kprobe_events");
+ (void) i;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ sleep(2);
+ kill(0, SIGINT); /* send Ctrl-C to self and to 'dd' */
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread