From mboxrd@z Thu Jan  1 00:00:00 1970
From: Sebastiano Miano
Subject: [bpf-next PATCH 3/3] bpf: add sample program to trace map events
Date: Wed, 18 Apr 2018 17:30:59 +0200
Message-ID: <152406545918.3465.14253635905960610284.stgit@localhost.localdomain>
References: <152406544226.3465.948692097697975172.stgit@localhost.localdomain>
Mime-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 7bit
Cc: mingo@redhat.com, rostedt@goodmis.org, brouer@redhat.com,
	fulvio.risso@polito.it
To: netdev@vger.kernel.org, ast@kernel.org, daniel@iogearbox.net
Return-path:
Received: from fm1nodo5.polito.it ([130.192.180.18]:43841 "EHLO
	fm1nodo5.polito.it" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1752632AbeDRP6i (ORCPT );
	Wed, 18 Apr 2018 11:58:38 -0400
In-Reply-To: <152406544226.3465.948692097697975172.stgit@localhost.localdomain>
Sender: netdev-owner@vger.kernel.org
List-ID:

This patch adds a sample program, called trace_map_events, that shows
how to capture map events and filter them based on the map ID.

The program accepts a list of map IDs via the -i command line option
and reports only the map events (i.e., map_create/update/lookup/
next_key) related to those IDs. If no IDs are specified, all map
events are listed and no filtering is performed.

Sample usage:

# trace_map_events -i <map_id1> -i <map_id2> -i <map_id3> ...

Signed-off-by: Sebastiano Miano
---
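Note for testers (illustrative, not part of the patch): every bpf(2)
map operation fires the tracepoints traced here, so events can be
generated on demand with the syscall wrappers from tools/lib/bpf
(bpf_create_map/bpf_map_update_elem/bpf_map_lookup_elem/
bpf_map_get_next_key); the header path below is an assumption about
how such a helper would be built against that library. A minimal
sketch:

/* map_event_gen.c - generate one map event of each kind, for testing.
 * Hypothetical helper, not part of this patch.
 */
#include <stdio.h>
#include <unistd.h>
#include <linux/bpf.h>
#include "bpf/bpf.h"	/* tools/lib/bpf syscall wrappers (assumed path) */

int main(void)
{
	int key = 0, value = 42, next_key;
	/* fires tracepoint/bpf/bpf_map_create */
	int fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key),
				sizeof(value), 1, 0);

	if (fd < 0) {
		perror("bpf_create_map");
		return 1;
	}
	/* fires bpf_map_update_elem, bpf_map_lookup_elem, bpf_map_next_key */
	bpf_map_update_elem(fd, &key, &value, BPF_ANY);
	bpf_map_lookup_elem(fd, &key, &value);
	bpf_map_get_next_key(fd, NULL, &next_key);
	close(fd);
	return 0;
}

Running this in a second terminal while trace_map_events is active
should print one line per operation.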
 samples/bpf/Makefile                |    4
 samples/bpf/trace_map_events_kern.c |  225 +++++++++++++++++++++++++
 samples/bpf/trace_map_events_user.c |  314 +++++++++++++++++++++++++++++++++++
 3 files changed, 543 insertions(+)
 create mode 100644 samples/bpf/trace_map_events_kern.c
 create mode 100644 samples/bpf/trace_map_events_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6ed..a7d52b6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -15,6 +15,7 @@ hostprogs-y += tracex6
 hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
+hostprogs-y += trace_map_events
 hostprogs-y += lathist
 hostprogs-y += offwaketime
 hostprogs-y += spintest
@@ -65,6 +66,7 @@ tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
+trace_map_events-objs := bpf_load.o $(LIBBPF) trace_map_events_user.o
 lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
 offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
 spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
@@ -111,6 +113,7 @@ always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
+always += trace_map_events_kern.o
 always += tcbpf1_kern.o
 always += tcbpf2_kern.o
 always += tc_l2_redirect_kern.o
@@ -171,6 +174,7 @@ HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
+HOSTLOADLIBES_trace_map_events += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
diff --git a/samples/bpf/trace_map_events_kern.c b/samples/bpf/trace_map_events_kern.c
new file mode 100644
index 0000000..f887b5b
--- /dev/null
+++ b/samples/bpf/trace_map_events_kern.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 Politecnico di Torino, Italy
+ *
+ * Author: Sebastiano Miano
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <linux/types.h>
+#include "bpf_helpers.h"
+
+enum map_event_type {
+	MAP_CREATE = 0,
+	MAP_UPDATE = 1,
+	MAP_LOOKUP = 2,
+	MAP_NEXT_KEY = 3
+};
+
+struct map_event_data {
+	u32 map_id;
+	enum map_event_type evnt_type;
+	u32 map_type;
+};
+
+struct bpf_map_def SEC("maps") map_event_trace = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = 64,
+};
+
+struct bpf_map_def SEC("maps") filtered_ids = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u32),
+	.max_entries = 64,
+};
+
+struct bpf_map_def SEC("maps") filter_events = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(bool),
+	.max_entries = 1,
+};
+
+/*
+ * Tracepoint format: /sys/kernel/debug/tracing/events/bpf/bpf_map_create/format
+ * Code in: kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_create_ctx {
+	u64 pad;		// First 8 bytes are not accessible by bpf code
+	u32 type;		// offset:8;  size:4; signed:0;
+	u32 size_key;		// offset:12; size:4; signed:0;
+	u32 size_value;		// offset:16; size:4; signed:0;
+	u32 max_entries;	// offset:20; size:4; signed:0;
+	u32 flags;		// offset:24; size:4; signed:0;
+	int ufd;		// offset:28; size:4; signed:1;
+	u32 id;			// offset:32; size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_create")
+int trace_bpf_map_create(struct bpf_map_create_ctx *ctx)
+{
+	struct map_event_data data;
+	int cpu = bpf_get_smp_processor_id();
+	bool *filter;
+	u32 key = 0, map_id = ctx->id;
+
+	filter = bpf_map_lookup_elem(&filter_events, &key);
+	if (!filter)
+		return 1;
+
+	if (!*filter)
+		goto send_event;
+
+	/*
+	 * If the map_id is not in the list of filtered
+	 * ids we immediately return
+	 */
+	if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+		return 0;
+
+send_event:
+	data.map_id = map_id;
+	data.evnt_type = MAP_CREATE;
+	data.map_type = ctx->type;
+
+	bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+	return 0;
+}
+
+/*
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_lookup_elem/format
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_update_elem/format
+ * Code in: kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_keyval_ctx {
+	u64 pad;		// First 8 bytes are not accessible by bpf code
+	u32 type;		// offset:8;  size:4; signed:0;
+	u32 key_len;		// offset:12; size:4; signed:0;
+	u32 key;		// offset:16; size:4; signed:0;
+	bool key_trunc;		// offset:20; size:1; signed:0;
+	u32 val_len;		// offset:24; size:4; signed:0;
+	u32 val;		// offset:28; size:4; signed:0;
+	bool val_trunc;		// offset:32; size:1; signed:0;
+	int ufd;		// offset:36; size:4; signed:1;
+	u32 id;			// offset:40; size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_lookup_elem")
+int trace_bpf_map_lookup_elem(struct bpf_map_keyval_ctx *ctx)
+{
+	struct map_event_data data;
+	int cpu = bpf_get_smp_processor_id();
+	bool *filter;
+	u32 key = 0, map_id = ctx->id;
+
+	filter = bpf_map_lookup_elem(&filter_events, &key);
+	if (!filter)
+		return 1;
+
+	if (!*filter)
+		goto send_event;
+
+	/*
+	 * If the map_id is not in the list of filtered
+	 * ids we immediately return
+	 */
+	if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+		return 0;
+
+send_event:
+	data.map_id = map_id;
+	data.evnt_type = MAP_LOOKUP;
+	data.map_type = ctx->type;
+
+	bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+	return 0;
+}
+
+SEC("tracepoint/bpf/bpf_map_update_elem")
+int trace_bpf_map_update_elem(struct bpf_map_keyval_ctx *ctx)
+{
+	struct map_event_data data;
+	int cpu = bpf_get_smp_processor_id();
+	bool *filter;
+	u32 key = 0, map_id = ctx->id;
+
+	filter = bpf_map_lookup_elem(&filter_events, &key);
+	if (!filter)
+		return 1;
+
+	if (!*filter)
+		goto send_event;
+
+	/*
+	 * If the map_id is not in the list of filtered
+	 * ids we immediately return
+	 */
+	if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+		return 0;
+
+send_event:
+	data.map_id = map_id;
+	data.evnt_type = MAP_UPDATE;
+	data.map_type = ctx->type;
+
+	bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+	return 0;
+}
+
+/*
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_next_key/format
+ * Code in: kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_next_key_ctx {
+	u64 pad;		// First 8 bytes are not accessible by bpf code
+	u32 type;		// offset:8;  size:4; signed:0;
+	u32 key_len;		// offset:12; size:4; signed:0;
+	u32 key;		// offset:16; size:4; signed:0;
+	u32 nxt;		// offset:20; size:4; signed:0;
+	bool key_trunc;		// offset:24; size:1; signed:0;
+	bool key_null;		// offset:25; size:1; signed:0;
+	int ufd;		// offset:28; size:4; signed:1;
+	u32 id;			// offset:32; size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_next_key")
+int trace_bpf_map_next_key(struct bpf_map_next_key_ctx *ctx)
+{
+	struct map_event_data data;
+	int cpu = bpf_get_smp_processor_id();
+	bool *filter;
+	u32 key = 0, map_id = ctx->id;
+
+	filter = bpf_map_lookup_elem(&filter_events, &key);
+	if (!filter)
+		return 1;
+
+	if (!*filter)
+		goto send_event;
+
+	/*
+	 * If the map_id is not in the list of filtered
+	 * ids we immediately return
+	 */
+	if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+		return 0;
+
+send_event:
+	data.map_id = map_id;
+	data.evnt_type = MAP_NEXT_KEY;
+	data.map_type = ctx->type;
+
+	bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
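A note on the hard-coded offsets above: they mirror the tracepoint
format files under /sys/kernel/debug/tracing/events/bpf/. A minimal
standalone sketch (a hypothetical helper, not part of this patch) to
print a format file and double-check the struct layouts by hand:

/* dump_format.c - print a bpf tracepoint format file so the ctx
 * struct offsets above can be verified by hand. Hypothetical helper;
 * requires debugfs to be mounted.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] :
		"/sys/kernel/debug/tracing/events/bpf/bpf_map_create/format";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}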
diff --git a/samples/bpf/trace_map_events_user.c b/samples/bpf/trace_map_events_user.c
new file mode 100644
index 0000000..bc7447e
--- /dev/null
+++ b/samples/bpf/trace_map_events_user.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 Politecnico di Torino, Italy
+ *
+ * Author: Sebastiano Miano
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+static const char *__desc__ =
+"Sample program to trace map related events\n"
+"The -i option selects the id(s) of the map(s) you are interested in.\n"
+"If no ID is specified, all map events are listed.\n";
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+
+#define MAX_FILTERED_IDS 64
+
+static int *perf_fd;
+
+int epoll_fd;
+int page_size;
+int page_cnt = 8;
+volatile struct perf_event_mmap_page **readers;
+
+typedef void (*event_cb)(void *data, int size);
+
+enum map_event_type {
+	MAP_CREATE = 0,
+	MAP_UPDATE = 1,
+	MAP_LOOKUP = 2,
+	MAP_NEXT_KEY = 3
+};
+
+static void usage(char *argv[])
+{
+	printf("\nDESCRIPTION:\n%s", __desc__);
+	printf("\n");
+	printf(" Usage: %s [-i map_id1] [-i map_id2] ...\n", argv[0]);
+	printf("\n");
+}
+
+static int perf_event_mmap(int fd, int cpu)
+{
+	void *base;
+	int mmap_size;
+
+	page_size = getpagesize();
+	mmap_size = page_size * (page_cnt + 1);
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		printf("mmap err\n");
+		return -1;
+	}
+
+	readers[cpu] = base;
+	return 0;
+}
+
+static void init_bpf_perf_event_on_cpu(int cpu)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+		.sample_period = 1,
+		.wakeup_events = 1,
+	};
+	int key = cpu;
+
+	perf_fd[cpu] = sys_perf_event_open(&attr, -1, cpu, -1, 0);
+
+	assert(perf_fd[cpu] >= 0);
+	assert(perf_event_mmap(perf_fd[cpu], cpu) >= 0);
+	assert(ioctl(perf_fd[cpu], PERF_EVENT_IOC_ENABLE, 0) >= 0);
+	assert(bpf_map_update_elem(map_fd[0], &key, &perf_fd[cpu], 0) == 0);
+
+	struct epoll_event e = { .events = EPOLLIN, .data.u32 = cpu };
+
+	assert(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, perf_fd[cpu], &e) == 0);
+}
+
+static int perf_event_poll(int fd, int num_cpus, struct epoll_event *events)
+{
+	return epoll_wait(fd, events, num_cpus, -1);
+}
+
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;
+	char data[];
+};
+
+static void perf_event_read(event_cb fn, __u32 index)
+{
+	__u64 data_tail = readers[index]->data_tail;
+	__u64 data_head = readers[index]->data_head;
+	__u64 buffer_size = page_cnt * page_size;
+	void *base, *begin, *end;
+	char buf[256];
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return;
+
+	base = ((char *)readers[index]) + page_size;
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			long len = base + buffer_size - begin;
+
+			assert(len < e->header.size);
+			memcpy(buf, begin, len);
+			memcpy(buf + len, base, e->header.size - len);
+			e = (void *) buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			fn(e->data, e->size);
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			struct {
+				struct perf_event_header header;
+				__u64 id;
+				__u64 lost;
+			} *lost = (void *) e;
+
+			printf("lost %lld events\n", lost->lost);
+		} else {
+			printf("unknown event type=%d size=%d\n",
+			       e->header.type, e->header.size);
+		}
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	readers[index]->data_tail = data_head;
+}
+
+static const char *get_event_type(enum map_event_type event)
+{
+	switch (event) {
+	case MAP_CREATE:
+		return "CREATE";
+	case MAP_LOOKUP:
+		return "LOOKUP";
+	case MAP_UPDATE:
+		return "UPDATE";
+	case MAP_NEXT_KEY:
+		return "NEXT_KEY";
+	}
+
+	return "UNKNOWN";
+}
+
+static void map_event_callback(void *data, int size)
+{
+	struct {
+		__u32 map_id;
+		enum map_event_type event_type;
+		__u32 map_type;
+	} *e = data;
+
+	printf("%s event for map id: %d and type: %d\n",
+	       get_event_type(e->event_type), e->map_id, e->map_type);
+}
+
+static bool init_filtered_ids_map(int num_ids, int *filtered_ids)
+{
+	int i, key, value;
+	bool filtering = false;
+
+	/*
+	 * Put the IDs in the map; only events related to those IDs
+	 * will be shown. The key is the ID of the map, while the
+	 * value is unused and set to 0.
+	 */
+	for (i = 0; i < num_ids; i++) {
+		key = filtered_ids[i];
+		value = 0;
+		if (bpf_map_update_elem(map_fd[1], &key, &value, 0) != 0) {
+			fprintf(stderr,
+				"ERR: bpf_map_update_elem failed key:0x%X\n", key);
+			return false;
+		}
+	}
+
+	if (num_ids > 0)
+		filtering = true;
+
+	key = 0;
+	assert(bpf_map_update_elem(map_fd[2], &key, &filtering, BPF_ANY) == 0);
+	return true;
+}
+
+static bool init_perf_buffer_data_structures(int nr_cpus)
+{
+	int i;
+
+	perf_fd = malloc(sizeof(int) * nr_cpus);
+	assert(perf_fd);
+	readers = malloc(sizeof(*readers) * nr_cpus);
+	assert(readers);
+
+	epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+
+	for (i = 0; i < nr_cpus; i++) {
+		printf("Init bpf_perf_event for cpu:%d\n", i);
+		init_bpf_perf_event_on_cpu(i);
+	}
+
+	return true;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	int i, cnt, opt, ret = EXIT_SUCCESS;
+	char bpf_obj_file[256];
+	int num_ids = 0, nr_cpus = bpf_num_possible_cpus();
+	int filtered_ids[MAX_FILTERED_IDS];
+
+	snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
+
+	/* Parse command line args */
+	while ((opt = getopt(argc, argv, "hi:")) != -1) {
+		switch (opt) {
+		case 'i':
+			if (num_ids == MAX_FILTERED_IDS) {
+				printf("Reached maximum number of IDs\n");
+				return EXIT_FAILURE;
+			}
+			i = atoi(optarg);
+			if (!i)
+				printf("ERROR - Invalid id %s\n", optarg);
+			else
+				filtered_ids[num_ids++] = i;
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return EXIT_FAILURE;
+		}
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return EXIT_FAILURE;
+	}
+
+	if (load_bpf_file(bpf_obj_file)) {
+		printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
+		return EXIT_FAILURE;
+	}
+
+	if (!prog_fd[0]) {
+		printf("ERROR - load_bpf_file: %s\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	init_filtered_ids_map(num_ids, filtered_ids);
+	init_perf_buffer_data_structures(nr_cpus);
+
+	struct epoll_event *events = calloc(nr_cpus, sizeof(*events));
+
+	while (true) {
+		printf("Waiting for map events...\n");
+		cnt = perf_event_poll(epoll_fd, nr_cpus, events);
+		for (i = 0; i < cnt; i++)
+			perf_event_read(map_event_callback, events[i].data.u32);
+	}
+
+	free(perf_fd);
+	free(readers);
+	free(events);
+
+	return ret;
+}
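Usage note (illustrative): the sample is built together with the other
BPF samples (typically "make samples/bpf/" from the kernel source
root), and candidate map IDs for the -i option can be listed with e.g.
"bpftool map show", assuming bpftool is available on the system.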