* [PATCH v3 linux-trace 2/8] tracing: allow eBPF programs to call ktime_get_ns()
2015-02-10 3:45 [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
@ 2015-02-10 3:45 ` Alexei Starovoitov
2015-02-10 3:45 ` [PATCH v3 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler Alexei Starovoitov
` (5 subsequent siblings)
6 siblings, 0 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2015-02-10 3:45 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
bpf_ktime_get_ns() is used by programs to compue time delta between events
or as a timestamp
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
include/uapi/linux/bpf.h | 1 +
kernel/trace/bpf_trace.c | 10 ++++++++++
2 files changed, 11 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d73d7d0abe6e..ecae21e58ba3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -169,6 +169,7 @@ enum bpf_func_id {
BPF_FUNC_fetch_u16, /* u16 bpf_fetch_u16(void *unsafe_ptr) */
BPF_FUNC_fetch_u8, /* u8 bpf_fetch_u8(void *unsafe_ptr) */
BPF_FUNC_probe_memcmp, /* int bpf_probe_memcmp(unsafe_ptr, safe_ptr, size) */
+ BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */
__BPF_FUNC_MAX_ID,
};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ec065e0a364e..e3196266b72f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -69,6 +69,11 @@ static u64 bpf_probe_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
return -1;
}
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return ktime_get_ns();
+}
+
static struct bpf_func_proto tp_prog_funcs[] = {
#define FETCH(SIZE) \
[BPF_FUNC_fetch_##SIZE] = { \
@@ -90,6 +95,11 @@ static struct bpf_func_proto tp_prog_funcs[] = {
.arg2_type = ARG_PTR_TO_STACK,
.arg3_type = ARG_CONST_STACK_SIZE,
},
+ [BPF_FUNC_ktime_get_ns] = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ },
};
static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
--
1.7.9.5
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH v3 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler
2015-02-10 3:45 [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
2015-02-10 3:45 ` [PATCH v3 linux-trace 2/8] tracing: allow eBPF programs to call ktime_get_ns() Alexei Starovoitov
@ 2015-02-10 3:45 ` Alexei Starovoitov
2015-02-10 3:45 ` [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C Alexei Starovoitov
` (4 subsequent siblings)
6 siblings, 0 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2015-02-10 3:45 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
simple packet drop monitor:
- in-kernel eBPF program attaches to skb:kfree_skb event and records number
of packet drops at given location
- userspace iterates over the map every second and prints stats
Usage:
$ sudo dropmon
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2
location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2
location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2
$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 2 +
samples/bpf/dropmon.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++
samples/bpf/libbpf.c | 7 +++
samples/bpf/libbpf.h | 4 ++
4 files changed, 156 insertions(+)
create mode 100644 samples/bpf/dropmon.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..789691374562 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,7 +6,9 @@ hostprogs-y := test_verifier test_maps
hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
+hostprogs-y += dropmon
+dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
diff --git a/samples/bpf/dropmon.c b/samples/bpf/dropmon.c
new file mode 100644
index 000000000000..515504f68506
--- /dev/null
+++ b/samples/bpf/dropmon.c
@@ -0,0 +1,143 @@
+/* simple packet drop monitor:
+ * - in-kernel eBPF program attaches to kfree_skb() event and records number
+ * of packet drops at given location
+ * - userspace iterates over the map every second and prints stats
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <linux/filter.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include "libbpf.h"
+
+#define TRACEPOINT "/sys/kernel/debug/tracing/events/skb/kfree_skb/id"
+
+static int dropmon(void)
+{
+ long long key, next_key, value = 0;
+ int prog_fd, map_fd, i, event_fd, efd, err;
+ char buf[32];
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ goto cleanup;
+ }
+
+ /* the following eBPF program is equivalent to C:
+ * int filter(struct bpf_context *ctx)
+ * {
+ * long loc = ctx->arg2;
+ * long init_val = 1;
+ * long *value;
+ *
+ * value = bpf_map_lookup_elem(MAP_ID, &loc);
+ * if (value) {
+ * __sync_fetch_and_add(value, 1);
+ * } else {
+ * bpf_map_update_elem(MAP_ID, &loc, &init_val, BPF_ANY);
+ * }
+ * return 0;
+ * }
+ */
+ struct bpf_insn prog[] = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), /* r2 = *(u64 *)(r1 + 8) */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* *(u64 *)(fp - 8) = r2 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 1), /* *(u64 *)(fp - 16) = 1 */
+ BPF_MOV64_IMM(BPF_REG_4, BPF_ANY),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -16), /* r3 = fp - 16 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ };
+
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, prog,
+ sizeof(prog), "GPL");
+ if (prog_fd < 0) {
+ printf("failed to load prog '%s'\n%s",
+ strerror(errno), bpf_log_buf);
+ return -1;
+ }
+
+
+ event_fd = open(TRACEPOINT, O_RDONLY, 0);
+ if (event_fd < 0) {
+ printf("failed to open event %s\n", TRACEPOINT);
+ return -1;
+ }
+
+ err = read(event_fd, buf, sizeof(buf));
+ if (err < 0 || err >= sizeof(buf)) {
+ printf("read from '%s' failed '%s'\n",
+ TRACEPOINT, strerror(errno));
+ return -1;
+ }
+
+ close(event_fd);
+
+ buf[err] = 0;
+
+ struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT};
+ attr.config = atoi(buf);
+
+ efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+ if (efd < 0) {
+ printf("event %lld fd %d err %s\n",
+ attr.config, efd, strerror(errno));
+ return -1;
+ }
+ ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+ ioctl(efd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+
+ for (i = 0; i < 10; i++) {
+ key = 0;
+ while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
+ bpf_lookup_elem(map_fd, &next_key, &value);
+ printf("location 0x%llx count %lld\n", next_key, value);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ sleep(1);
+ }
+
+cleanup:
+ /* maps, programs, tracepoint filters will auto cleanup on process exit */
+
+ return 0;
+}
+
+int main(void)
+{
+ FILE *f;
+
+ /* start ping in the background to get some kfree_skb events */
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ dropmon();
+ return 0;
+}
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 46d50b7ddf79..f4f428149a7d 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -121,3 +121,10 @@ int open_raw_sock(const char *name)
return sock;
}
+
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+ int group_fd, unsigned long flags)
+{
+ return syscall(__NR_perf_event_open, attr, pid, cpu,
+ group_fd, flags);
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 58c5fe1bdba1..92ff824eaed5 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -182,4 +182,8 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
/* create RAW socket and bind to interface 'name' */
int open_raw_sock(const char *name);
+struct perf_event_attr;
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+ int group_fd, unsigned long flags);
+
#endif
--
1.7.9.5
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
2015-02-10 3:45 [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
2015-02-10 3:45 ` [PATCH v3 linux-trace 2/8] tracing: allow eBPF programs to call ktime_get_ns() Alexei Starovoitov
2015-02-10 3:45 ` [PATCH v3 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler Alexei Starovoitov
@ 2015-02-10 3:45 ` Alexei Starovoitov
2015-02-10 4:08 ` Steven Rostedt
[not found] ` <1423539961-21792-5-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
[not found] ` <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
` (3 subsequent siblings)
6 siblings, 2 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2015-02-10 3:45 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
tracex1_kern.c - C program which will be compiled into eBPF
to filter netif_receive_skb events on skb->dev->name == "lo"
The programs returns 1 to store an event into ring_buffer
and returns 0 - to discard an event.
tracex1_user.c - corresponding user space component that:
- loads bpf program via bpf() syscall
- opens net:netif_receive_skb event via perf_event_open() syscall
- attaches the program to event via ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
- mmaps event_fd
- polls event_fd, walks ring_buffer and prints events
Usage:
$ sudo tracex1
pid 29241 skb 0xffff88045e58c500 len 84 dev lo
pid 29241 skb 0xffff88045e58cd00 len 84 dev lo
pid 29241 skb 0xffff880074c35000 len 84 dev lo
pid 29241 skb 0xffff880074c35200 len 84 dev lo
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/bpf_helpers.h | 14 +++++
samples/bpf/bpf_load.c | 129 +++++++++++++++++++++++++++++++++++++++++---
samples/bpf/bpf_load.h | 12 +++++
samples/bpf/tracex1_kern.c | 28 ++++++++++
samples/bpf/tracex1_user.c | 50 +++++++++++++++++
6 files changed, 231 insertions(+), 6 deletions(-)
create mode 100644 samples/bpf/tracex1_kern.c
create mode 100644 samples/bpf/tracex1_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 789691374562..da28e1b6d3a6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += dropmon
+hostprogs-y += tracex1
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -14,17 +15,20 @@ test_maps-objs := test_maps.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
+always += tracex1_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..406e9705d99e 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,20 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
(void *) BPF_FUNC_map_update_elem;
static int (*bpf_map_delete_elem)(void *map, void *key) =
(void *) BPF_FUNC_map_delete_elem;
+static void *(*bpf_fetch_ptr)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_ptr;
+static unsigned long long (*bpf_fetch_u64)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u64;
+static unsigned int (*bpf_fetch_u32)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u32;
+static unsigned short (*bpf_fetch_u16)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u16;
+static unsigned char (*bpf_fetch_u8)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u8;
+static int (*bpf_probe_memcmp)(void *unsafe_ptr, void *safe_ptr, int size) =
+ (void *) BPF_FUNC_probe_memcmp;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+ (void *) BPF_FUNC_ktime_get_ns;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..2aece65963e4 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -8,29 +8,47 @@
#include <unistd.h>
#include <string.h>
#include <stdbool.h>
+#include <stdlib.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
#include "libbpf.h"
#include "bpf_helpers.h"
#include "bpf_load.h"
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
static char license[128];
static bool processed_sec[128];
int map_fd[MAX_MAPS];
int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
- int fd;
bool is_socket = strncmp(event, "socket", 6) == 0;
+ enum bpf_prog_type prog_type;
+ char path[256] = DEBUGFS;
+ char buf[32];
+ int fd, efd, err, id;
+ struct perf_event_attr attr;
- if (!is_socket)
- /* tracing events tbd */
- return -1;
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.sample_type = PERF_SAMPLE_RAW;
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+
+ if (is_socket)
+ prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ else
+ prog_type = BPF_PROG_TYPE_TRACEPOINT;
- fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
- prog, size, license);
+ fd = bpf_prog_load(prog_type, prog, size, license);
if (fd < 0) {
printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +57,39 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd;
+ if (is_socket)
+ return 0;
+
+ strcat(path, event);
+ strcat(path, "/id");
+
+ efd = open(path, O_RDONLY, 0);
+ if (efd < 0) {
+ printf("failed to open event %s\n", event);
+ return -1;
+ }
+
+ err = read(efd, buf, sizeof(buf));
+ if (err < 0 || err >= sizeof(buf)) {
+ printf("read from '%s' failed '%s'\n", event, strerror(errno));
+ return -1;
+ }
+
+ close(efd);
+
+ buf[err] = 0;
+ id = atoi(buf);
+ attr.config = id;
+
+ efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+ if (efd < 0) {
+ printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+ return -1;
+ }
+ event_fd[prog_cnt - 1] = efd;
+ ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+ ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+
return 0;
}
@@ -201,3 +252,69 @@ int load_bpf_file(char *path)
close(fd);
return 0;
}
+
+int page_size;
+int page_cnt = 8;
+volatile struct perf_event_mmap_page *header;
+
+int perf_event_mmap(int fd)
+{
+ void *base;
+ int mmap_size;
+
+ page_size = getpagesize();
+ mmap_size = page_size * (page_cnt + 1);
+
+ base = mmap(NULL, mmap_size, PROT_READ, MAP_SHARED, fd, 0);
+ if (base == MAP_FAILED) {
+ printf("mmap err\n");
+ return -1;
+ }
+
+ header = base;
+ return 0;
+}
+
+int perf_event_poll(int fd)
+{
+ struct pollfd pfd = {.fd = fd, .events = POLLIN};
+ return poll(&pfd, 1, -1);
+}
+
+struct perf_event_sample {
+ struct perf_event_header header;
+ __u32 size;
+ char data[];
+};
+
+void perf_event_read(print_fn fn)
+{
+ static __u64 old_data_head = 0;
+ __u64 data_head = header->data_head;
+ __u64 buffer_size = page_cnt * page_size;
+ void *base, *begin, *end;
+
+ if (data_head == old_data_head)
+ return;
+
+ base = ((char *)header) + page_size;
+
+ begin = base + old_data_head % buffer_size;
+ end = base + data_head % buffer_size;
+
+ while (begin < end) {
+ struct perf_event_sample *e;
+
+ e = begin;
+
+ if (e->header.type != PERF_RECORD_SAMPLE) {
+ printf("event is not a sample type %d\n", e->header.type);
+ }
+
+ begin += sizeof(*e) + e->size;
+ fn(e->data, e->size);
+ }
+ /* else when end > begin - the events had wrapped. ignored for now */
+
+ old_data_head = data_head;
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..cf55663405da 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,7 @@
extern int map_fd[MAX_MAPS];
extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
/* parses elf file compiled by llvm .c->.o
* . parses 'maps' section and creates maps via BPF syscall
@@ -21,4 +22,15 @@ extern int prog_fd[MAX_PROGS];
*/
int load_bpf_file(char *path);
+int perf_event_mmap(int fd);
+int perf_event_poll(int fd);
+typedef void (*print_fn)(void *data, int size);
+void perf_event_read(print_fn fn);
+struct trace_entry {
+ unsigned short type;
+ unsigned char flags;
+ unsigned char preempt_count;
+ int pid;
+};
+
#endif
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..449db961c642
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,28 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+SEC("events/net/netif_receive_skb")
+int bpf_prog1(struct bpf_context *ctx)
+{
+ /*
+ * attaches to net:netif_receive_skb
+ * and filters events for loobpack device only
+ */
+ char devname[] = "lo";
+ struct net_device *dev;
+ struct sk_buff *skb = 0;
+
+ skb = (struct sk_buff *) ctx->arg1;
+ dev = bpf_fetch_ptr(&skb->dev);
+ if (bpf_probe_memcmp(dev->name, devname, 2) == 0)
+ /* pass event to userspace via perf ring_buffer */
+ return 1;
+
+ /* drop event */
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..a49de23eb30b
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static char *get_str(void *entry, __u32 arrloc)
+{
+ int off = arrloc & 0xffff;
+ return entry + off;
+}
+
+static void print_netif_receive_skb(void *data, int size)
+{
+ struct ftrace_raw_netif_receive_skb {
+ struct trace_entry t;
+ void *skb;
+ __u32 len;
+ __u32 name;
+ } *e = data;
+
+ printf("pid %d skb %p len %d dev %s\n",
+ e->t.pid, e->skb, e->len, get_str(e, e->name));
+}
+
+int main(int ac, char **argv)
+{
+ FILE *f;
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ if (perf_event_mmap(event_fd[0]) < 0)
+ return 1;
+
+ f = popen("taskset 1 ping -c5 localhost", "r");
+ (void) f;
+
+ for (;;) {
+ perf_event_poll(event_fd[0]);
+ perf_event_read(print_netif_receive_skb);
+ }
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
2015-02-10 3:45 ` [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C Alexei Starovoitov
@ 2015-02-10 4:08 ` Steven Rostedt
[not found] ` <20150209230836.7f913c60-2kNGR76GQU9OHLTnHDQRgA@public.gmane.org>
[not found] ` <1423539961-21792-5-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
1 sibling, 1 reply; 19+ messages in thread
From: Steven Rostedt @ 2015-02-10 4:08 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
On Mon, 9 Feb 2015 19:45:57 -0800
Alexei Starovoitov <ast@plumgrid.com> wrote:
> +int perf_event_mmap(int fd);
> +int perf_event_poll(int fd);
> +typedef void (*print_fn)(void *data, int size);
> +void perf_event_read(print_fn fn);
> +struct trace_entry {
> + unsigned short type;
> + unsigned char flags;
> + unsigned char preempt_count;
> + int pid;
> +};
> +
Please do not hard code any structures. This is not a stable ABI, and
it may not even match if you are running 32 bit userspace on top of a
64 bit kernel.
Please parse the format files. libtraceevent does this for you. If need
be, link to that. But if you look at the event format files you'll see
the offsets and sizes in the binary code:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
I don't want to get stuck with pinned kernel data structures again. We
had 4 blank bytes of data for every event, because latency top hard
coded the field. Luckily, the 64 bit / 32 bit interface caused latency
top to have to use the event_parse code to work, and we were able to
remove that field after it was converted.
-- Steve
^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <1423539961-21792-5-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>]
* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
[not found] ` <1423539961-21792-5-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
@ 2015-02-10 4:12 ` Steven Rostedt
0 siblings, 0 replies; 19+ messages in thread
From: Steven Rostedt @ 2015-02-10 4:12 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
On Mon, 9 Feb 2015 19:45:57 -0800
Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:
> +static void print_netif_receive_skb(void *data, int size)
> +{
> + struct ftrace_raw_netif_receive_skb {
> + struct trace_entry t;
> + void *skb;
> + __u32 len;
> + __u32 name;
> + } *e = data;
Same here, there's no guarantee that this structure will always match.
cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/format
name: netif_receive_skb
ID: 975
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
field:void * skbaddr; offset:8; size:8; signed:0;
field:unsigned int len; offset:16; size:4; signed:0;
field:__data_loc char[] name; offset:20; size:4; signed:1;
print fmt: "dev=%s skbaddr=%p len=%u", __get_str(name), REC->skbaddr, REC->len
-- Steve
^ permalink raw reply [flat|nested] 19+ messages in thread
[parent not found: <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>]
* [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
[not found] ` <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
@ 2015-02-10 3:45 ` Alexei Starovoitov
[not found] ` <1423539961-21792-2-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
2015-02-10 3:45 ` [PATCH v3 linux-trace 5/8] samples: bpf: counting example for kfree_skb tracepoint and write syscall Alexei Starovoitov
2015-02-10 14:55 ` [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Steven Rostedt
2 siblings, 1 reply; 19+ messages in thread
From: Alexei Starovoitov @ 2015-02-10 3:45 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
User interface:
struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = event_id, ...};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
prog_fd is a file descriptor associated with eBPF program previously loaded.
event_id is an ID of static tracepoint event or syscall.
(kprobe support is in next patch)
close(event_fd) - automatically detaches eBPF program from it
eBPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
so that eBPF program can walk any kernel data structures
- probe_memcmp - combination of probe_kernel_read() and memcmp()
Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
include/linux/bpf.h | 6 +-
include/linux/ftrace_event.h | 11 +++
include/trace/bpf_trace.h | 25 +++++++
include/trace/ftrace.h | 31 +++++++++
include/uapi/linux/bpf.h | 7 ++
include/uapi/linux/perf_event.h | 1 +
kernel/events/core.c | 55 +++++++++++++++
kernel/trace/Makefile | 1 +
kernel/trace/bpf_trace.c | 145 +++++++++++++++++++++++++++++++++++++++
kernel/trace/trace_syscalls.c | 35 ++++++++++
10 files changed, 316 insertions(+), 1 deletion(-)
create mode 100644 include/trace/bpf_trace.h
create mode 100644 kernel/trace/bpf_trace.c
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..a0f6f636ced0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -130,10 +130,14 @@ struct bpf_prog_aux {
#ifdef CONFIG_BPF_SYSCALL
void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
#else
static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ return ERR_PTR(-ENOENT);
+}
#endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
/* verify correctness of eBPF program */
int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..479d0a4a42b3 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
struct trace_buffer;
struct tracer;
struct dentry;
+struct bpf_prog;
struct trace_print_flags {
unsigned long mask;
@@ -299,6 +300,7 @@ struct ftrace_event_call {
#ifdef CONFIG_PERF_EVENTS
int perf_refcount;
struct hlist_head __percpu *perf_events;
+ struct bpf_prog *prog;
int (*perf_perm)(struct ftrace_event_call *,
struct perf_event *);
@@ -544,6 +546,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
event_triggers_post_call(file, tt);
}
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+ return 1;
+}
+#endif
+
enum {
FILTER_OTHER = 0,
FILTER_STATIC_STRING,
diff --git a/include/trace/bpf_trace.h b/include/trace/bpf_trace.h
new file mode 100644
index 000000000000..4e64f61f484d
--- /dev/null
+++ b/include/trace/bpf_trace.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_KERNEL_BPF_TRACE_H
+#define _LINUX_KERNEL_BPF_TRACE_H
+
+/* For tracepoint filters argN fields match one to one to arguments
+ * passed to tracepoint events
+ *
+ * For syscall entry filters argN fields match syscall arguments
+ * For syscall exit filters arg1 is a return value
+ */
+struct bpf_context {
+ u64 arg1;
+ u64 arg2;
+ u64 arg3;
+ u64 arg4;
+ u64 arg5;
+ u64 arg6;
+};
+
+#endif /* _LINUX_KERNEL_BPF_TRACE_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 139b5067345b..4c275ce2dcf0 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -17,6 +17,7 @@
*/
#include <linux/ftrace_event.h>
+#include <trace/bpf_trace.h>
/*
* DECLARE_EVENT_CLASS can be used to add a generic function
@@ -755,12 +756,32 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
#undef __perf_task
#define __perf_task(t) (__task = (t))
+/* zero extend integer, pointer or aggregate type to u64 without warnings */
+#define __CAST_TO_U64(EXPR) ({ \
+ u64 ret = 0; \
+ typeof(EXPR) expr = EXPR; \
+ switch (sizeof(expr)) { \
+ case 8: ret = *(u64 *) &expr; break; \
+ case 4: ret = *(u32 *) &expr; break; \
+ case 2: ret = *(u16 *) &expr; break; \
+ case 1: ret = *(u8 *) &expr; break; \
+ } \
+ ret; })
+
+#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
+#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
+#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
+#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
+#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
+#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
+
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
perf_trace_##call(void *__data, proto) \
{ \
struct ftrace_event_call *event_call = __data; \
+ struct bpf_prog *prog = event_call->prog; \
struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
struct ftrace_raw_##call *entry; \
struct pt_regs __regs; \
@@ -771,6 +792,16 @@ perf_trace_##call(void *__data, proto) \
int __data_size; \
int rctx; \
\
+ if (prog) { \
+ __maybe_unused const u64 z = 0; \
+ struct bpf_context __ctx = ((struct bpf_context) { \
+ __BPF_CAST6(args, z, z, z, z, z) \
+ }); \
+ \
+ if (!trace_call_bpf(prog, &__ctx)) \
+ return; \
+ } \
+ \
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
\
head = this_cpu_ptr(event_call->perf_events); \
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..d73d7d0abe6e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET_FILTER,
+ BPF_PROG_TYPE_TRACEPOINT,
};
/* flags for BPF_MAP_UPDATE_ELEM command */
@@ -162,6 +163,12 @@ enum bpf_func_id {
BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+ BPF_FUNC_fetch_ptr, /* void *bpf_fetch_ptr(void *unsafe_ptr) */
+ BPF_FUNC_fetch_u64, /* u64 bpf_fetch_u64(void *unsafe_ptr) */
+ BPF_FUNC_fetch_u32, /* u32 bpf_fetch_u32(void *unsafe_ptr) */
+ BPF_FUNC_fetch_u16, /* u16 bpf_fetch_u16(void *unsafe_ptr) */
+ BPF_FUNC_fetch_u8, /* u8 bpf_fetch_u8(void *unsafe_ptr) */
+ BPF_FUNC_probe_memcmp, /* int bpf_probe_memcmp(unsafe_ptr, safe_ptr, size) */
__BPF_FUNC_MAX_ID,
};
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9b79abbd1ab8..d7ba67234761 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -360,6 +360,7 @@ struct perf_event_attr {
#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 882f835a0d85..674a8ca17190 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
#include "internal.h"
@@ -3283,6 +3285,7 @@ errout:
}
static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
@@ -3292,6 +3295,7 @@ static void free_event_rcu(struct rcu_head *head)
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
+ perf_event_free_bpf_prog(event);
kfree(event);
}
@@ -3795,6 +3799,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -3849,6 +3854,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
+ case PERF_EVENT_IOC_SET_BPF:
+ return perf_event_set_bpf_prog(event, arg);
+
default:
return -ENOTTY;
}
@@ -6266,6 +6274,45 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ struct bpf_prog *prog;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ if (event->tp_event->prog)
+ return -EEXIST;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->aux->prog_type != BPF_PROG_TYPE_TRACEPOINT) {
+ /* valid fd, but invalid bpf program type */
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ event->tp_event->prog = prog;
+
+ return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog *prog;
+
+ if (!event->tp_event)
+ return;
+
+ prog = event->tp_event->prog;
+ if (prog) {
+ event->tp_event->prog = NULL;
+ bpf_prog_put(prog);
+ }
+}
+
#else
static inline void perf_tp_register(void)
@@ -6281,6 +6328,14 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
#endif /* CONFIG_EVENT_TRACING */
#ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..54ae225e5fc6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..ec065e0a364e
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,145 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
+#include "trace.h"
+
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+ unsigned int ret;
+
+ if (in_nmi()) /* not supported yet */
+ return 1;
+
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(prog, ctx);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_fetch_ptr(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *unsafe_ptr = (void *) (long) r1;
+ void *ptr = NULL;
+
+ probe_kernel_read(&ptr, unsafe_ptr, sizeof(ptr));
+ return (u64) (unsigned long) ptr;
+}
+
+#define FETCH(SIZE) \
+static u64 bpf_fetch_##SIZE(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) \
+{ \
+ void *unsafe_ptr = (void *) (long) r1; \
+ SIZE val = 0; \
+ \
+ probe_kernel_read(&val, unsafe_ptr, sizeof(val)); \
+ return (u64) (SIZE) val; \
+}
+FETCH(u64)
+FETCH(u32)
+FETCH(u16)
+FETCH(u8)
+#undef FETCH
+
+static u64 bpf_probe_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *unsafe_ptr = (void *) (long) r1;
+ void *safe_ptr = (void *) (long) r2;
+ u32 size = (u32) r3;
+ char buf[64];
+ int err;
+
+ if (size < 64) {
+ err = probe_kernel_read(buf, unsafe_ptr, size);
+ if (err)
+ return err;
+ return memcmp(buf, safe_ptr, size);
+ }
+ return -1;
+}
+
+static struct bpf_func_proto tp_prog_funcs[] = {
+#define FETCH(SIZE) \
+ [BPF_FUNC_fetch_##SIZE] = { \
+ .func = bpf_fetch_##SIZE, \
+ .gpl_only = true, \
+ .ret_type = RET_INTEGER, \
+ },
+ FETCH(ptr)
+ FETCH(u64)
+ FETCH(u32)
+ FETCH(u16)
+ FETCH(u8)
+#undef FETCH
+ [BPF_FUNC_probe_memcmp] = {
+ .func = bpf_probe_memcmp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+ },
+};
+
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ default:
+ if (func_id < 0 || func_id >= ARRAY_SIZE(tp_prog_funcs))
+ return NULL;
+ return &tp_prog_funcs[func_id];
+ }
+}
+
+/* check access to argN fields of 'struct bpf_context' from program */
+static bool tp_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct bpf_context))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+
+static struct bpf_verifier_ops tp_prog_ops = {
+ .get_func_proto = tp_prog_func_proto,
+ .is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+ .ops = &tp_prog_ops,
+ .type = BPF_PROG_TYPE_TRACEPOINT,
+};
+
+static int __init register_tp_prog_ops(void)
+{
+ bpf_register_prog_type(&tl);
+ return 0;
+}
+late_initcall(register_tp_prog_ops);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..3487c41f4c0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>
+#include <trace/bpf_trace.h>
#include "trace_output.h"
#include "trace.h"
@@ -545,11 +546,26 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
+static void populate_bpf_ctx(struct bpf_context *ctx, struct pt_regs *regs)
+{
+ struct task_struct *task = current;
+ unsigned long args[6];
+
+ syscall_get_arguments(task, regs, 0, 6, args);
+ ctx->arg1 = args[0];
+ ctx->arg2 = args[1];
+ ctx->arg3 = args[2];
+ ctx->arg4 = args[3];
+ ctx->arg5 = args[4];
+ ctx->arg6 = args[5];
+}
+
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -564,6 +580,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ prog = sys_data->enter_event->prog;
+ if (prog) {
+ struct bpf_context ctx;
+
+ populate_bpf_ctx(&ctx, regs);
+ if (!trace_call_bpf(prog, &ctx))
+ return;
+ }
+
head = this_cpu_ptr(sys_data->enter_event->perf_events);
if (hlist_empty(head))
return;
@@ -624,6 +649,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -638,6 +664,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ prog = sys_data->exit_event->prog;
+ if (prog) {
+ struct bpf_context ctx = {};
+
+ ctx.arg1 = syscall_get_return_value(current, regs);
+ if (!trace_call_bpf(prog, &ctx))
+ return;
+ }
+
head = this_cpu_ptr(sys_data->exit_event->perf_events);
if (hlist_empty(head))
return;
--
1.7.9.5
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH v3 linux-trace 5/8] samples: bpf: counting example for kfree_skb tracepoint and write syscall
[not found] ` <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
2015-02-10 3:45 ` [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls Alexei Starovoitov
@ 2015-02-10 3:45 ` Alexei Starovoitov
2015-02-10 14:55 ` [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Steven Rostedt
2 siblings, 0 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2015-02-10 3:45 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
this example has two probes in one C file that attach to different tracepoints
and use two different maps.
1st probe is the similar to dropmon.c. It attaches to kfree_skb tracepoint and
count number of packet drops at different locations
2nd probe attaches to syscalls/sys_enter_write and computes a histogram of different
write sizes
Usage:
$ sudo tracex2
location 0xffffffff816959a5 count 1
location 0xffffffff816959a5 count 2
557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
syscall write() stats
byte_size : count distribution
1 -> 1 : 3 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 2 | |
32 -> 63 : 3 | |
64 -> 127 : 1 | |
128 -> 255 : 1 | |
256 -> 511 : 0 | |
512 -> 1023 : 1118968 |************************************* |
Ctrl-C at any time. Kernel will auto cleanup maps and programs
Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex2_kern.c | 71 +++++++++++++++++++++++++++++++++
samples/bpf/tracex2_user.c | 95 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 170 insertions(+)
create mode 100644 samples/bpf/tracex2_kern.c
create mode 100644 samples/bpf/tracex2_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da28e1b6d3a6..416af24b01fd 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += dropmon
hostprogs-y += tracex1
+hostprogs-y += tracex2
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -16,12 +17,14 @@ sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
+always += tracex2_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -29,6 +32,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..a789c456c1b4
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,71 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+SEC("events/skb/kfree_skb")
+int bpf_prog2(struct bpf_context *ctx)
+{
+ long loc = ctx->arg2;
+ long init_val = 1;
+ long *value;
+
+ value = bpf_map_lookup_elem(&my_map, &loc);
+ if (value)
+ *value += 1;
+ else
+ bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+ return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+ if (hi)
+ return log2(hi) + 32;
+ else
+ return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 64,
+};
+
+SEC("events/syscalls/sys_enter_write")
+int bpf_prog3(struct bpf_context *ctx)
+{
+ long write_size = ctx->arg3;
+ long init_val = 1;
+ long *value;
+ u32 index = log2l(write_size);
+
+ value = bpf_map_lookup_elem(&my_hist_map, &index);
+ if (value)
+ __sync_fetch_and_add(value, 1);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..91b8d0896fbb
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ long value;
+ long data[MAX_INDEX] = {};
+ char starstr[MAX_STARS];
+ int i;
+ int max_ind = -1;
+ long max_value = 0;
+
+ for (key = 0; key < MAX_INDEX; key++) {
+ bpf_lookup_elem(fd, &key, &value);
+ data[key] = value;
+ if (value && key > max_ind)
+ max_ind = key;
+ if (value > max_value)
+ max_value = value;
+ }
+
+ printf(" syscall write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+static void int_exit(int sig)
+{
+ print_hist(map_fd[1]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ long key, next_key, value;
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ signal(SIGINT, int_exit);
+
+ /* start 'ping' in the background to have some kfree_skb events */
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 0; i < 5; i++) {
+ key = 0;
+ while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_lookup_elem(map_fd[0], &next_key, &value);
+ printf("location 0x%lx count %ld\n", next_key, value);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ sleep(1);
+ }
+ print_hist(map_fd[1]);
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
[not found] ` <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
2015-02-10 3:45 ` [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls Alexei Starovoitov
2015-02-10 3:45 ` [PATCH v3 linux-trace 5/8] samples: bpf: counting example for kfree_skb tracepoint and write syscall Alexei Starovoitov
@ 2015-02-10 14:55 ` Steven Rostedt
2 siblings, 0 replies; 19+ messages in thread
From: Steven Rostedt @ 2015-02-10 14:55 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
On Mon, 9 Feb 2015 19:45:53 -0800
Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:
> So the overhead of realistic bpf program is 5.05963/4.80074 = ~5%
> which is faster than perf_event filtering: 5.69732/4.80074 = ~18%
> or ftrace filtering: 6.50091/4.80074 = ~35%
Come to think of it, this is comparing apples to oranges, as you move
the filtering before the recording. It would be interesting to see the
ftrace speed up, if it were to use eBPF instead of its own filtering.
Maybe that 35% is the filter part, and not the discard part.
I just tried the dd test with count==1234 and count!=1234 and the one
that drops events is only slightly slower. In this case it does seem
that the most overhead is in the filter logic.
But by moving it before the recording, we can not use the fields
defined in the format files, as the parameters and the fields do not
match in most trace points. And to use the parameters, as I have
stated, there's no interface to know what those parameters are, then
filtering on them is a one shot deal. Might as well write a module and
hook directly to the tracepoint and do the filtering natively. That
would be faster than BPF too.
My point is, what's the use case? If you filter before recording, you
can not use the fields of the tracepoint. That limits you to filtering
only syscalls, and perhaps kprobes.
-- Steve
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH v3 linux-trace 6/8] samples: bpf: IO latency analysis (iosnoop/heatmap)
2015-02-10 3:45 [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
` (3 preceding siblings ...)
[not found] ` <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
@ 2015-02-10 3:45 ` Alexei Starovoitov
2015-02-10 3:46 ` [PATCH v3 linux-trace 7/8] tracing: attach eBPF programs to kprobe/kretprobe Alexei Starovoitov
2015-02-10 3:46 ` [PATCH v3 linux-trace 8/8] samples: bpf: simple kprobe example Alexei Starovoitov
6 siblings, 0 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2015-02-10 3:45 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
eBPF C program attaches to block_rq_issue/block_rq_complete events to calculate
IO latency. Then it waits for the first 100 events to compute average latency
and uses range [0 .. ave_lat * 2] to record histogram of events in this latency
range.
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.
If kernel sees too many events that fall out of histogram range, user space
adjusts the range up, so heatmap for next 2 seconds will be more accurate.
Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.
Similar experiments can be done for network transmit latencies, syscalls, etc
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex3_kern.c | 98 ++++++++++++++++++++++++++++
samples/bpf/tracex3_user.c | 152 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 254 insertions(+)
create mode 100644 samples/bpf/tracex3_kern.c
create mode 100644 samples/bpf/tracex3_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 416af24b01fd..da0efd8032ab 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
hostprogs-y += dropmon
hostprogs-y += tracex1
hostprogs-y += tracex2
+hostprogs-y += tracex3
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -18,6 +19,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -25,6 +27,7 @@ always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
+always += tracex3_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -33,6 +36,7 @@ HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..961f3f373270
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(u64),
+ .max_entries = 4096,
+};
+
+SEC("events/block/block_rq_issue")
+int bpf_prog1(struct bpf_context *ctx)
+{
+ long rq = ctx->arg2;
+ u64 val = bpf_ktime_get_ns();
+
+ bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+ return 0;
+}
+
+struct globals {
+ u64 lat_ave;
+ u64 lat_sum;
+ u64 missed;
+ u64 max_lat;
+ int num_samples;
+};
+
+struct bpf_map_def SEC("maps") global_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct globals),
+ .max_entries = 1,
+};
+
+#define MAX_SLOT 32
+
+struct bpf_map_def SEC("maps") lat_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_SLOT,
+};
+
+SEC("events/block/block_rq_complete")
+int bpf_prog2(struct bpf_context *ctx)
+{
+ long rq = ctx->arg2;
+ void *value;
+
+ value = bpf_map_lookup_elem(&my_map, &rq);
+ if (!value)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = (cur_time - *(u64 *)value) / 1000;
+
+ bpf_map_delete_elem(&my_map, &rq);
+
+ int ind = 0;
+ struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
+
+ if (!g)
+ return 0;
+ if (g->lat_ave == 0) {
+ g->num_samples++;
+ g->lat_sum += delta;
+ if (g->num_samples >= 100)
+ g->lat_ave = g->lat_sum / g->num_samples;
+ } else {
+ u64 max_lat = g->lat_ave * 2;
+
+ if (delta > max_lat) {
+ g->missed++;
+ if (delta > g->max_lat)
+ g->max_lat = delta;
+ return 0;
+ }
+
+ ind = delta * MAX_SLOT / max_lat;
+ value = bpf_map_lookup_elem(&lat_map, &ind);
+ if (!value)
+ return 0;
+ (*(u64 *)value)++;
+ }
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..c49f41f28cba
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,152 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+struct globals {
+ __u64 lat_ave;
+ __u64 lat_sum;
+ __u64 missed;
+ __u64 max_lat;
+ int num_samples;
+};
+
+static void clear_stats(int fd)
+{
+ int key;
+ __u64 value = 0;
+
+ for (key = 0; key < 32; key++)
+ bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+ "\033[48;5;255m",
+ "\033[48;5;252m",
+ "\033[48;5;250m",
+ "\033[48;5;248m",
+ "\033[48;5;246m",
+ "\033[48;5;244m",
+ "\033[48;5;242m",
+ "\033[48;5;240m",
+ "\033[48;5;238m",
+ "\033[48;5;236m",
+ "\033[48;5;234m",
+ "\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+static void print_banner(__u64 max_lat)
+{
+ printf("0 usec ... %lld usec\n", max_lat);
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ __u64 value;
+ __u64 cnt[32];
+ __u64 max_cnt = 0;
+ __u64 total_events = 0;
+ int max_bucket = 0;
+
+ for (key = 0; key < 32; key++) {
+ value = 0;
+ bpf_lookup_elem(fd, &key, &value);
+ if (value > 0)
+ max_bucket = key;
+ cnt[key] = value;
+ total_events += value;
+ if (value > max_cnt)
+ max_cnt = value;
+ }
+ clear_stats(fd);
+ for (key = 0; key < 32; key++) {
+ int c = num_colors * cnt[key] / (max_cnt + 1);
+
+ printf("%s %s", color[c], nocolor);
+ }
+ printf(" captured=%lld", total_events);
+
+ key = 0;
+ struct globals g = {};
+
+ bpf_lookup_elem(map_fd[1], &key, &g);
+
+ printf(" missed=%lld max_lat=%lld usec\n",
+ g.missed, g.max_lat);
+
+ if (g.missed > 10 && g.missed > total_events / 10) {
+ printf("adjusting range UP...\n");
+ g.lat_ave = g.max_lat / 2;
+ print_banner(g.lat_ave * 2);
+ } else if (max_bucket < 4 && total_events > 100) {
+ printf("adjusting range DOWN...\n");
+ g.lat_ave = g.lat_ave / 4;
+ print_banner(g.lat_ave * 2);
+ }
+ /* clear some globals */
+ g.missed = 0;
+ g.max_lat = 0;
+ bpf_update_elem(map_fd[1], &key, &g, BPF_ANY);
+}
+
+static void int_exit(int sig)
+{
+ print_hist(map_fd[2]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ clear_stats(map_fd[2]);
+
+ signal(SIGINT, int_exit);
+
+ struct globals g;
+
+ printf("waiting for events to determine average latency...\n");
+ for (;;) {
+ int key = 0;
+
+ bpf_lookup_elem(map_fd[1], &key, &g);
+ if (g.lat_ave)
+ break;
+ sleep(1);
+ }
+
+ printf(" IO latency in usec\n"
+ " %s %s - many events with this latency\n"
+ " %s %s - few events\n",
+ color[num_colors - 1], nocolor,
+ color[0], nocolor);
+ print_banner(g.lat_ave * 2);
+ for (;;) {
+ print_hist(map_fd[2]);
+ sleep(2);
+ }
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH v3 linux-trace 7/8] tracing: attach eBPF programs to kprobe/kretprobe
2015-02-10 3:45 [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
` (4 preceding siblings ...)
2015-02-10 3:45 ` [PATCH v3 linux-trace 6/8] samples: bpf: IO latency analysis (iosnoop/heatmap) Alexei Starovoitov
@ 2015-02-10 3:46 ` Alexei Starovoitov
2015-02-10 3:46 ` [PATCH v3 linux-trace 8/8] samples: bpf: simple kprobe example Alexei Starovoitov
6 siblings, 0 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2015-02-10 3:46 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
introduce new type of eBPF programs BPF_PROG_TYPE_KPROBE.
Such programs are allowed to call the same helper functions
as tracing filters, but bpf_context is different:
For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
For kprobe filters bpf_context == pt_regs
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
include/linux/ftrace_event.h | 3 +++
include/uapi/linux/bpf.h | 1 +
kernel/events/core.c | 5 ++++-
kernel/trace/bpf_trace.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/trace/trace_kprobe.c | 10 +++++++++-
5 files changed, 56 insertions(+), 2 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 479d0a4a42b3..cd6efd23bfae 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -249,6 +249,7 @@ enum {
TRACE_EVENT_FL_WAS_ENABLED_BIT,
TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
TRACE_EVENT_FL_TRACEPOINT_BIT,
+ TRACE_EVENT_FL_KPROBE_BIT,
};
/*
@@ -262,6 +263,7 @@ enum {
* it is best to clear the buffers that used it).
* USE_CALL_FILTER - For ftrace internal events, don't use file filter
* TRACEPOINT - Event is a tracepoint
+ * KPROBE - Event is a kprobe
*/
enum {
TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -271,6 +273,7 @@ enum {
TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+ TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT),
};
struct ftrace_event_call {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ecae21e58ba3..cf443900318d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -119,6 +119,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET_FILTER,
BPF_PROG_TYPE_TRACEPOINT,
+ BPF_PROG_TYPE_KPROBE,
};
/* flags for BPF_MAP_UPDATE_ELEM command */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 674a8ca17190..94de727a8c0c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6288,7 +6288,10 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (prog->aux->prog_type != BPF_PROG_TYPE_TRACEPOINT) {
+ if (((event->tp_event->flags & TRACE_EVENT_FL_KPROBE) &&
+ prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) ||
+ (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE) &&
+ prog->aux->prog_type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e3196266b72f..52dbd1e7dc28 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -153,3 +153,42 @@ static int __init register_tp_prog_ops(void)
return 0;
}
late_initcall(register_tp_prog_ops);
+
+/* check access to fields of 'struct pt_regs' from BPF program */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct pt_regs))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+/* kprobe filter programs are allowed to call the same helper functions
+ * as tracing filters, but bpf_context is different:
+ * For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
+ * For kprobe filters bpf_context == pt_regs
+ */
+static struct bpf_verifier_ops kprobe_prog_ops = {
+ .get_func_proto = tp_prog_func_proto,
+ .is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+ .ops = &kprobe_prog_ops,
+ .type = BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+ bpf_register_prog_type(&kprobe_tl);
+ return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..d503ac43b85b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1134,11 +1134,15 @@ static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
@@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
@@ -1286,7 +1294,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+ call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
call->data = tk;
ret = trace_add_event_call(call);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH v3 linux-trace 8/8] samples: bpf: simple kprobe example
2015-02-10 3:45 [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe Alexei Starovoitov
` (5 preceding siblings ...)
2015-02-10 3:46 ` [PATCH v3 linux-trace 7/8] tracing: attach eBPF programs to kprobe/kretprobe Alexei Starovoitov
@ 2015-02-10 3:46 ` Alexei Starovoitov
6 siblings, 0 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2015-02-10 3:46 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
the logic of the example is similar to tracex2, but syscall 'write' statistics
is capturead from kprobe placed at sys_write function instead of through
syscall instrumentation.
Also tracex4_kern.c has a different way of doing log2 in C.
Note, unlike tracepoint and syscall programs, kprobe programs receive
'struct pt_regs' as an input. It's responsibility of the program author
or higher level dynamic tracing tool to match registers to function arguments.
Since pt_regs are architecture dependent, programs are also arch dependent,
unlike tracepoint/syscalls programs which are universal.
Usage:
$ sudo tracex4
2216443+0 records in
2216442+0 records out
1134818304 bytes (1.1 GB) copied, 2.00746 s, 565 MB/s
kprobe sys_write() stats
byte_size : count distribution
1 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 1 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 2214734 |************************************* |
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_load.c | 3 ++
samples/bpf/tracex4_kern.c | 36 +++++++++++++++++++
samples/bpf/tracex4_user.c | 83 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 126 insertions(+)
create mode 100644 samples/bpf/tracex4_kern.c
create mode 100644 samples/bpf/tracex4_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da0efd8032ab..22c7a38f3f95 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -10,6 +10,7 @@ hostprogs-y += dropmon
hostprogs-y += tracex1
hostprogs-y += tracex2
hostprogs-y += tracex3
+hostprogs-y += tracex4
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -20,6 +21,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -28,6 +30,7 @@ always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
always += tracex3_kern.o
+always += tracex4_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -37,6 +40,7 @@ HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 2aece65963e4..2206b49df625 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -32,6 +32,7 @@ int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
bool is_socket = strncmp(event, "socket", 6) == 0;
+ bool is_kprobe = strncmp(event, "events/kprobes/", 15) == 0;
enum bpf_prog_type prog_type;
char path[256] = DEBUGFS;
char buf[32];
@@ -45,6 +46,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
if (is_socket)
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ else if (is_kprobe)
+ prog_type = BPF_PROG_TYPE_KPROBE;
else
prog_type = BPF_PROG_TYPE_TRACEPOINT;
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..9646f9e43417
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,36 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+ int i = -(n == 0);
+ S(32); S(16); S(8); S(4); S(2); S(1);
+ return i;
+#undef S
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 64,
+};
+
+SEC("events/kprobes/sys_write")
+int bpf_prog4(struct pt_regs *regs)
+{
+ long write_size = regs->dx; /* $rdx contains 3rd argument to a function */
+ long init_val = 1;
+ void *value;
+ u32 index = log2l(write_size);
+
+ value = bpf_map_lookup_elem(&my_hist_map, &index);
+ if (value)
+ __sync_fetch_and_add((long *)value, 1);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..741206127768
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,83 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ long value;
+ long data[MAX_INDEX] = {};
+ char starstr[MAX_STARS];
+ int i;
+ int max_ind = -1;
+ long max_value = 0;
+
+ for (key = 0; key < MAX_INDEX; key++) {
+ bpf_lookup_elem(fd, &key, &value);
+ data[key] = value;
+ if (value && key > max_ind)
+ max_ind = key;
+ if (value > max_value)
+ max_value = value;
+ }
+
+ printf("\n kprobe sys_write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+static void int_exit(int sig)
+{
+ print_hist(map_fd[0]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ signal(SIGINT, int_exit);
+
+ i = system("echo 'p:sys_write sys_write' > /sys/kernel/debug/tracing/kprobe_events");
+ (void) i;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ sleep(2);
+ kill(0, SIGINT); /* send Ctrl-C to self and to 'dd' */
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 19+ messages in thread