* [PATCH bpf-next 0/2] libbpf: add perf buffer API
@ 2019-06-25 23:25 Andrii Nakryiko
  2019-06-25 23:26 ` [PATCH bpf-next 1/2] libbpf: add perf buffer reading API Andrii Nakryiko
  2019-06-25 23:26 ` [PATCH bpf-next 2/2] selftests/bpf: test perf buffer API Andrii Nakryiko
  0 siblings, 2 replies; 7+ messages in thread
From: Andrii Nakryiko @ 2019-06-25 23:25 UTC (permalink / raw)
  To: andrii.nakryiko, ast, daniel, bpf, netdev, kernel-team; +Cc: Andrii Nakryiko

This patchset adds a high-level API for setting up and polling perf buffers
associated with a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. The API details are
described in the corresponding commit message.

Andrii Nakryiko (2):
  libbpf: add perf buffer reading API
  selftests/bpf: test perf buffer API

 tools/lib/bpf/libbpf.c                        | 282 ++++++++++++++++++
 tools/lib/bpf/libbpf.h                        |  12 +
 tools/lib/bpf/libbpf.map                      |   5 +-
 .../selftests/bpf/prog_tests/perf_buffer.c    |  86 ++++++
 .../selftests/bpf/progs/test_perf_buffer.c    |  31 ++
 5 files changed, 415 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/perf_buffer.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_perf_buffer.c

-- 
2.17.1



* [PATCH bpf-next 1/2] libbpf: add perf buffer reading API
  2019-06-25 23:25 [PATCH bpf-next 0/2] libbpf: add perf buffer API Andrii Nakryiko
@ 2019-06-25 23:26 ` Andrii Nakryiko
  2019-06-26  2:18   ` Song Liu
  2019-06-25 23:26 ` [PATCH bpf-next 2/2] selftests/bpf: test perf buffer API Andrii Nakryiko
  1 sibling, 1 reply; 7+ messages in thread
From: Andrii Nakryiko @ 2019-06-25 23:26 UTC (permalink / raw)
  To: andrii.nakryiko, ast, daniel, bpf, netdev, kernel-team; +Cc: Andrii Nakryiko

A BPF_MAP_TYPE_PERF_EVENT_ARRAY map is often used to send data from a BPF
program to user space for additional processing. libbpf already has a very
low-level API for reading a single CPU's perf buffer,
bpf_perf_event_read_simple(), but it's hard to use and requires a lot of
setup code. This patch adds a perf_buffer abstraction on top of it, hiding
the per-CPU setup and polling logic behind a simple and convenient API,
similar to what BCC provides.

perf_buffer__new() sets up per-CPU ring buffers and updates the
corresponding BPF map entries. It accepts two user-provided callbacks: one
for handling raw samples and one for getting notified of samples lost due
to buffer overflow.

perf_buffer__poll() fetches ring buffer data across all CPUs, using a
single epoll instance.

perf_buffer__free() performs the corresponding cleanup and removes the FDs
from the BPF map.

None of these APIs is thread-safe; users should provide their own
locking/coordination when using them in a multi-threaded setup.
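
A minimal usage sketch (for illustration only; obj, the "events" map name
and handle_sample() are made up, and error handling is abbreviated):

	static void handle_sample(void *ctx, void *data, __u32 size)
	{
		/* data points to size bytes of raw sample payload */
	}

	struct bpf_map *map = bpf_object__find_map_by_name(obj, "events");
	struct perf_buffer *pb;
	int err;

	/* 8 ring buffer pages per CPU; no lost-sample callback, no ctx */
	pb = perf_buffer__new(map, 8, handle_sample, NULL, NULL);
	if (IS_ERR(pb))
		return PTR_ERR(pb);
	/* handle_sample() is invoked from within perf_buffer__poll() */
	while ((err = perf_buffer__poll(pb, 100 /* timeout, ms */)) >= 0)
		;
	perf_buffer__free(pb);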

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/lib/bpf/libbpf.c   | 282 +++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.h   |  12 ++
 tools/lib/bpf/libbpf.map |   5 +-
 3 files changed, 298 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 9a4199b51300..c74cc535902a 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -32,7 +32,9 @@
 #include <linux/limits.h>
 #include <linux/perf_event.h>
 #include <linux/ring_buffer.h>
+#include <sys/epoll.h>
 #include <sys/ioctl.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
@@ -4322,6 +4324,286 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
 	return ret;
 }
 
+struct perf_cpu_buf {
+	int fd; /* perf event FD */
+	void *base; /* mmap()'ed memory */
+	void *buf; /* for reconstructing segmented data */
+	size_t buf_size;
+};
+
+struct perf_buffer {
+	perf_buffer_sample_fn sample_cb;
+	perf_buffer_lost_fn lost_cb;
+	void *ctx; /* passed into callbacks */
+
+	size_t page_size;
+	size_t mmap_size;
+	struct perf_cpu_buf **cpu_bufs;
+	struct epoll_event *events;
+	int cpu_cnt;
+	int epfd; /* epoll FD */
+	int mapfd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */
+};
+
+static void perf_buffer__free_cpu_buf(struct perf_buffer *pb,
+				      struct perf_cpu_buf *cpu_buf, int cpu)
+{
+	if (!cpu_buf)
+		return;
+	if (cpu_buf->base &&
+	    munmap(cpu_buf->base, pb->mmap_size + pb->page_size))
+		pr_warning("failed to munmap cpu_buf #%d\n", cpu);
+	if (cpu_buf->fd >= 0) {
+		ioctl(cpu_buf->fd, PERF_EVENT_IOC_DISABLE, 0);
+		close(cpu_buf->fd);
+	}
+	free(cpu_buf->buf);
+	free(cpu_buf);
+}
+
+void perf_buffer__free(struct perf_buffer *pb)
+{
+	int i;
+
+	if (!pb)
+		return;
+	if (pb->cpu_bufs) {
+		for (i = 0; i < pb->cpu_cnt && pb->cpu_bufs[i]; i++) {
+			struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i];
+
+			bpf_map_delete_elem(pb->mapfd, &i);
+			perf_buffer__free_cpu_buf(pb, cpu_buf, i);
+		}
+		free(pb->cpu_bufs);
+	}
+	if (pb->epfd >= 0)
+		close(pb->epfd);
+	free(pb->events);
+	free(pb);
+}
+
+static struct perf_cpu_buf *perf_buffer__open_cpu_buf(struct perf_buffer *pb,
+						      int cpu)
+{
+	struct perf_event_attr attr = {};
+	struct perf_cpu_buf *cpu_buf;
+	char msg[STRERR_BUFSIZE];
+	int err;
+
+	cpu_buf = calloc(1, sizeof(*cpu_buf));
+	if (!cpu_buf)
+		return ERR_PTR(-ENOMEM);
+
+	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	cpu_buf->fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu,
+			      -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
+	if (cpu_buf->fd < 0) {
+		err = -errno;
+		pr_warning("failed to open perf buffer event on cpu #%d: %s\n",
+			   cpu, libbpf_strerror_r(err, msg, sizeof(msg)));
+		goto error;
+	}
+
+	cpu_buf->base = mmap(NULL, pb->mmap_size + pb->page_size,
+			     PROT_READ | PROT_WRITE, MAP_SHARED,
+			     cpu_buf->fd, 0);
+	if (cpu_buf->base == MAP_FAILED) {
+		cpu_buf->base = NULL;
+		err = -errno;
+		pr_warning("failed to mmap perf buffer on cpu #%d: %s\n",
+			   cpu, libbpf_strerror_r(err, msg, sizeof(msg)));
+		goto error;
+	}
+
+	if (ioctl(cpu_buf->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
+		err = -errno;
+		pr_warning("failed to enable perf buffer event on cpu #%d: %s\n",
+			   cpu, libbpf_strerror_r(err, msg, sizeof(msg)));
+		goto error;
+	}
+
+	return cpu_buf;
+
+error:
+	perf_buffer__free_cpu_buf(pb, cpu_buf, cpu);
+	return (struct perf_cpu_buf *)ERR_PTR(err);
+}
+
+struct perf_buffer *perf_buffer__new(struct bpf_map *map, size_t page_cnt,
+				     perf_buffer_sample_fn sample_cb,
+				     perf_buffer_lost_fn lost_cb, void *ctx)
+{
+	char msg[STRERR_BUFSIZE];
+	struct perf_buffer *pb;
+	int err, cpu;
+
+	if (bpf_map__def(map)->type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+		pr_warning("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
+			   bpf_map__name(map));
+		return ERR_PTR(-EINVAL);
+	}
+	if (bpf_map__fd(map) < 0) {
+		pr_warning("map '%s' doesn't have associated FD\n",
+			   bpf_map__name(map));
+		return ERR_PTR(-EINVAL);
+	}
+	if (page_cnt == 0 || (page_cnt & (page_cnt - 1))) {
+		pr_warning("page count should be a non-zero power of two, but is %zu\n",
+			   page_cnt);
+		return ERR_PTR(-EINVAL);
+	}
+
+	pb = calloc(1, sizeof(*pb));
+	if (!pb)
+		return ERR_PTR(-ENOMEM);
+
+	pb->sample_cb = sample_cb;
+	pb->lost_cb = lost_cb;
+	pb->ctx = ctx;
+	pb->page_size = getpagesize();
+	pb->mmap_size = pb->page_size * page_cnt;
+	pb->mapfd = bpf_map__fd(map);
+
+	pb->epfd = epoll_create1(EPOLL_CLOEXEC);
+	if (pb->epfd < 0) {
+		err = -errno;
+		pr_warning("failed to create epoll instance: %s\n",
+			   libbpf_strerror_r(err, msg, sizeof(msg)));
+		goto error;
+	}
+
+	pb->cpu_cnt = libbpf_num_possible_cpus();
+	if (pb->cpu_cnt < 0) {
+		err = pb->cpu_cnt;
+		goto error;
+	}
+	pb->events = calloc(pb->cpu_cnt, sizeof(*pb->events));
+	if (!pb->events) {
+		err = -ENOMEM;
+		pr_warning("failed to allocate events: out of memory\n");
+		goto error;
+	}
+	pb->cpu_bufs = calloc(pb->cpu_cnt, sizeof(*pb->cpu_bufs));
+	if (!pb->cpu_bufs) {
+		err = -ENOMEM;
+		pr_warning("failed to allocate buffers: out of memory\n");
+		goto error;
+	}
+
+	for (cpu = 0; cpu < pb->cpu_cnt; cpu++) {
+		struct perf_cpu_buf *cpu_buf;
+
+		cpu_buf = perf_buffer__open_cpu_buf(pb, cpu);
+		if (IS_ERR(cpu_buf)) {
+			err = PTR_ERR(cpu_buf);
+			goto error;
+		}
+
+		pb->cpu_bufs[cpu] = cpu_buf;
+
+		err = bpf_map_update_elem(pb->mapfd, &cpu, &cpu_buf->fd, 0);
+		if (err) {
+			err = -errno;
+			pr_warning("failed to set cpu #%d perf FD %d: %s\n",
+				   cpu, cpu_buf->fd,
+				   libbpf_strerror_r(err, msg, sizeof(msg)));
+			goto error;
+		}
+
+		pb->events[cpu].events = EPOLLIN;
+		pb->events[cpu].data.ptr = cpu_buf;
+		if (epoll_ctl(pb->epfd, EPOLL_CTL_ADD, cpu_buf->fd,
+			      &pb->events[cpu]) < 0) {
+			err = -errno;
+			pr_warning("failed to epoll_ctl cpu #%d perf FD %d: %s\n",
+				   cpu, cpu_buf->fd,
+				   libbpf_strerror_r(err, msg, sizeof(msg)));
+			goto error;
+		}
+	}
+
+	return pb;
+
+error:
+	if (pb)
+		perf_buffer__free(pb);
+	return ERR_PTR(err);
+}
+
+struct perf_sample_raw {
+	struct perf_event_header header;
+	uint32_t size;
+	char data[0];
+};
+
+struct perf_sample_lost {
+	struct perf_event_header header;
+	uint64_t id;
+	uint64_t lost;
+	uint64_t sample_id;
+};
+
+static enum bpf_perf_event_ret
+perf_buffer__process_record(struct perf_event_header *e, void *ctx)
+{
+	struct perf_buffer *pb = ctx;
+	void *data = e;
+
+	switch (e->type) {
+	case PERF_RECORD_SAMPLE: {
+		struct perf_sample_raw *s = data;
+
+		pb->sample_cb(pb->ctx, s->data, s->size);
+		break;
+	}
+	case PERF_RECORD_LOST: {
+		struct perf_sample_lost *s = data;
+
+		if (pb->lost_cb)
+			pb->lost_cb(pb->ctx, s->lost);
+		break;
+	}
+	default:
+		pr_warning("unknown perf sample type %d\n", e->type);
+		return LIBBPF_PERF_EVENT_ERROR;
+	}
+	return LIBBPF_PERF_EVENT_CONT;
+}
+
+static int perf_buffer__process_records(struct perf_buffer *pb,
+					struct perf_cpu_buf *cpu_buf)
+{
+	enum bpf_perf_event_ret ret;
+
+	ret = bpf_perf_event_read_simple(cpu_buf->base, pb->mmap_size,
+					 pb->page_size, &cpu_buf->buf,
+					 &cpu_buf->buf_size,
+					 perf_buffer__process_record, pb);
+	if (ret != LIBBPF_PERF_EVENT_CONT)
+		return ret;
+	return 0;
+}
+
+int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms)
+{
+	int i, cnt, err;
+
+	cnt = epoll_wait(pb->epfd, pb->events, pb->cpu_cnt, timeout_ms);
+	for (i = 0; i < cnt; i++) {
+		struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr;
+
+		err = perf_buffer__process_records(pb, cpu_buf);
+		if (err) {
+			pr_warning("error while processing records: %d\n", err);
+			return err;
+		}
+	}
+	return cnt < 0 ? -errno : cnt;
+}
+
 struct bpf_prog_info_array_desc {
 	int	array_offset;	/* e.g. offset of jited_prog_insns */
 	int	count_offset;	/* e.g. offset of jited_prog_len */
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index bf7020a565c6..3bfde1a475ce 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -354,6 +354,18 @@ LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type,
 LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags);
 
+struct perf_buffer;
+typedef void (*perf_buffer_sample_fn)(void *ctx, void *data, __u32 size);
+typedef void (*perf_buffer_lost_fn)(void *ctx, __u64 cnt);
+
+LIBBPF_API struct perf_buffer *perf_buffer__new(struct bpf_map *map,
+						size_t page_cnt,
+						perf_buffer_sample_fn sample_cb,
+						perf_buffer_lost_fn lost_cb,
+						void *ctx);
+LIBBPF_API void perf_buffer__free(struct perf_buffer *pb);
+LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms);
+
 enum bpf_perf_event_ret {
 	LIBBPF_PERF_EVENT_DONE	= 0,
 	LIBBPF_PERF_EVENT_ERROR	= -1,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 2382fbda4cbb..10f48103110a 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -170,13 +170,16 @@ LIBBPF_0.0.4 {
 		btf_dump__dump_type;
 		btf_dump__free;
 		btf_dump__new;
-		btf__parse_elf;
 		bpf_object__load_xattr;
 		bpf_program__attach_kprobe;
 		bpf_program__attach_perf_event;
 		bpf_program__attach_raw_tracepoint;
 		bpf_program__attach_tracepoint;
 		bpf_program__attach_uprobe;
+		btf__parse_elf;
 		libbpf_num_possible_cpus;
 		libbpf_perf_event_disable_and_close;
+		perf_buffer__free;
+		perf_buffer__new;
+		perf_buffer__poll;
 } LIBBPF_0.0.3;
-- 
2.17.1



* [PATCH bpf-next 2/2] selftests/bpf: test perf buffer API
  2019-06-25 23:25 [PATCH bpf-next 0/2] libbpf: add perf buffer API Andrii Nakryiko
  2019-06-25 23:26 ` [PATCH bpf-next 1/2] libbpf: add perf buffer reading API Andrii Nakryiko
@ 2019-06-25 23:26 ` Andrii Nakryiko
  2019-06-26  2:21   ` Song Liu
  2019-06-26  5:11   ` Andrii Nakryiko
  1 sibling, 2 replies; 7+ messages in thread
From: Andrii Nakryiko @ 2019-06-25 23:26 UTC (permalink / raw)
  To: andrii.nakryiko, ast, daniel, bpf, netdev, kernel-team; +Cc: Andrii Nakryiko

Add test verifying perf buffer API functionality.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 .../selftests/bpf/prog_tests/perf_buffer.c    | 86 +++++++++++++++++++
 .../selftests/bpf/progs/test_perf_buffer.c    | 31 +++++++
 2 files changed, 117 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/perf_buffer.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_perf_buffer.c

diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
new file mode 100644
index 000000000000..3ba3e26141ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+#include <sys/socket.h>
+#include <test_progs.h>
+
+static void on_sample(void *ctx, void *data, __u32 size)
+{
+	cpu_set_t *cpu_seen = ctx;
+	int cpu = *(int *)data;
+
+	CPU_SET(cpu, cpu_seen);
+}
+
+void test_perf_buffer(void)
+{
+	int err, prog_fd, prog_pfd, nr_cpus, i, duration = 0;
+	const char *prog_name = "kprobe/sys_nanosleep";
+	const char *file = "./test_perf_buffer.o";
+	struct bpf_map *perf_buf_map;
+	cpu_set_t cpu_set, cpu_seen;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	struct perf_buffer *pb;
+
+	nr_cpus = libbpf_num_possible_cpus();
+	if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus))
+		return;
+
+	/* load program */
+	err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd);
+	if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno))
+		return;
+
+	prog = bpf_object__find_program_by_title(obj, prog_name);
+	if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
+		goto out_close;
+
+	/* load map */
+	perf_buf_map = bpf_object__find_map_by_name(obj, "perf_buf_map");
+	if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
+		goto out_close;
+
+	/* attach kprobe */
+	prog_pfd = bpf_program__attach_kprobe(prog, false /* retprobe */,
+					      "sys_nanosleep");
+	if (CHECK(prog_pfd < 0, "attach_kprobe", "err %d\n", prog_pfd))
+		goto out_close;
+
+	/* set up perf buffer */
+	pb = perf_buffer__new(perf_buf_map, 1, on_sample, NULL, &cpu_seen);
+	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+		goto out_detach;
+
+	/* trigger kprobe on every CPU */
+	CPU_ZERO(&cpu_seen);
+	for (i = 0; i < nr_cpus; i++) {
+		CPU_ZERO(&cpu_set);
+		CPU_SET(i, &cpu_set);
+
+		err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set),
+					     &cpu_set);
+		if (err && CHECK(err, "set_affinity", "cpu #%d, err %d\n",
+				 i, err))
+			goto out_detach;
+
+		usleep(1);
+	}
+
+	/* read perf buffer */
+	err = perf_buffer__poll(pb, 100);
+	if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
+		goto out_free_pb;
+
+	if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt",
+		  "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen)))
+		goto out_free_pb;
+
+out_free_pb:
+	perf_buffer__free(pb);
+out_detach:
+	libbpf_perf_event_disable_and_close(prog_pfd);
+out_close:
+	bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
new file mode 100644
index 000000000000..ba961f608fd5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct {
+	int type;
+	int key_size;
+	int value_size;
+	int max_entries;
+} perf_buf_map SEC(".maps") = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 56,
+};
+
+SEC("kprobe/sys_nanosleep")
+int handle_sys_nanosleep_entry(struct pt_regs *ctx)
+{
+	int cpu = bpf_get_smp_processor_id();
+
+	bpf_perf_event_output(ctx, &perf_buf_map, BPF_F_CURRENT_CPU,
+			      &cpu, sizeof(cpu));
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
-- 
2.17.1



* Re: [PATCH bpf-next 1/2] libbpf: add perf buffer reading API
  2019-06-25 23:26 ` [PATCH bpf-next 1/2] libbpf: add perf buffer reading API Andrii Nakryiko
@ 2019-06-26  2:18   ` Song Liu
  2019-06-26  4:44     ` Andrii Nakryiko
  0 siblings, 1 reply; 7+ messages in thread
From: Song Liu @ 2019-06-26  2:18 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, Alexei Starovoitov, Daniel Borkmann, bpf,
	Networking, Kernel Team

On Tue, Jun 25, 2019 at 4:28 PM Andrii Nakryiko <andriin@fb.com> wrote:
>
> A BPF_MAP_TYPE_PERF_EVENT_ARRAY map is often used to send data from a BPF
> program to user space for additional processing. libbpf already has a very
> low-level API for reading a single CPU's perf buffer,
> bpf_perf_event_read_simple(), but it's hard to use and requires a lot of
> setup code. This patch adds a perf_buffer abstraction on top of it, hiding
> the per-CPU setup and polling logic behind a simple and convenient API,
> similar to what BCC provides.
>
> perf_buffer__new() sets up per-CPU ring buffers and updates the
> corresponding BPF map entries. It accepts two user-provided callbacks: one
> for handling raw samples and one for getting notified of samples lost due
> to buffer overflow.
>
> perf_buffer__poll() fetches ring buffer data across all CPUs, using a
> single epoll instance.
>
> perf_buffer__free() performs the corresponding cleanup and removes the FDs
> from the BPF map.
>
> None of these APIs is thread-safe; users should provide their own
> locking/coordination when using them in a multi-threaded setup.
>
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>

Overall looks good. Some nits below.

> ---
>  tools/lib/bpf/libbpf.c   | 282 +++++++++++++++++++++++++++++++++++++++
>  tools/lib/bpf/libbpf.h   |  12 ++
>  tools/lib/bpf/libbpf.map |   5 +-
>  3 files changed, 298 insertions(+), 1 deletion(-)

[...]

> +struct perf_buffer *perf_buffer__new(struct bpf_map *map, size_t page_cnt,
> +                                    perf_buffer_sample_fn sample_cb,
> +                                    perf_buffer_lost_fn lost_cb, void *ctx)
> +{
> +       char msg[STRERR_BUFSIZE];
> +       struct perf_buffer *pb;
> +       int err, cpu;
> +
> +       if (bpf_map__def(map)->type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
> +               pr_warning("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
> +                          bpf_map__name(map));
> +               return ERR_PTR(-EINVAL);
> +       }
> +       if (bpf_map__fd(map) < 0) {
> +               pr_warning("map '%s' doesn't have associated FD\n",
> +                          bpf_map__name(map));
> +               return ERR_PTR(-EINVAL);
> +       }
> +       if (page_cnt == 0 || (page_cnt & (page_cnt - 1))) {
> +               pr_warning("page count should be a non-zero power of two, but is %zu\n",
> +                          page_cnt);
> +               return ERR_PTR(-EINVAL);
> +       }
> +
> +       pb = calloc(1, sizeof(*pb));
> +       if (!pb)
> +               return ERR_PTR(-ENOMEM);
> +
> +       pb->sample_cb = sample_cb;
> +       pb->lost_cb = lost_cb;

I think we need to check sample_cb != NULL && lost_cb != NULL.

> +       pb->ctx = ctx;
> +       pb->page_size = getpagesize();
> +       pb->mmap_size = pb->page_size * page_cnt;
> +       pb->mapfd = bpf_map__fd(map);
> +
> +       pb->epfd = epoll_create1(EPOLL_CLOEXEC);
[...]
> +perf_buffer__process_record(struct perf_event_header *e, void *ctx)
> +{
> +       struct perf_buffer *pb = ctx;
> +       void *data = e;
> +
> +       switch (e->type) {
> +       case PERF_RECORD_SAMPLE: {
> +               struct perf_sample_raw *s = data;
> +
> +               pb->sample_cb(pb->ctx, s->data, s->size);
> +               break;
> +       }
> +       case PERF_RECORD_LOST: {
> +               struct perf_sample_lost *s = data;
> +
> +               if (pb->lost_cb)
> +                       pb->lost_cb(pb->ctx, s->lost);

OK, we test lost_cb here, so not necessary at init time.

[...]
>                 bpf_program__attach_perf_event;
>                 bpf_program__attach_raw_tracepoint;
>                 bpf_program__attach_tracepoint;
>                 bpf_program__attach_uprobe;
> +               btf__parse_elf;

Why move btf__parse_elf?

Thanks,
Song


* Re: [PATCH bpf-next 2/2] selftests/bpf: test perf buffer API
  2019-06-25 23:26 ` [PATCH bpf-next 2/2] selftests/bpf: test perf buffer API Andrii Nakryiko
@ 2019-06-26  2:21   ` Song Liu
  2019-06-26  5:11   ` Andrii Nakryiko
  1 sibling, 0 replies; 7+ messages in thread
From: Song Liu @ 2019-06-26  2:21 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, Alexei Starovoitov, Daniel Borkmann, bpf,
	Networking, Kernel Team

On Tue, Jun 25, 2019 at 4:27 PM Andrii Nakryiko <andriin@fb.com> wrote:
>
> Add test verifying perf buffer API functionality.
>
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>

Acked-by: Song Liu <songliubraving@fb.com>

> ---
>  .../selftests/bpf/prog_tests/perf_buffer.c    | 86 +++++++++++++++++++
>  .../selftests/bpf/progs/test_perf_buffer.c    | 31 +++++++
>  2 files changed, 117 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/perf_buffer.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_perf_buffer.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
> new file mode 100644
> index 000000000000..3ba3e26141ac
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
> @@ -0,0 +1,86 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <pthread.h>
> +#include <sched.h>
> +#include <sys/socket.h>
> +#include <test_progs.h>
> +
> +static void on_sample(void *ctx, void *data, __u32 size)
> +{
> +       cpu_set_t *cpu_seen = ctx;
> +       int cpu = *(int *)data;
> +
> +       CPU_SET(cpu, cpu_seen);
> +}
> +
> +void test_perf_buffer(void)
> +{
> +       int err, prog_fd, prog_pfd, nr_cpus, i, duration = 0;
> +       const char *prog_name = "kprobe/sys_nanosleep";
> +       const char *file = "./test_perf_buffer.o";
> +       struct bpf_map *perf_buf_map;
> +       cpu_set_t cpu_set, cpu_seen;
> +       struct bpf_program *prog;
> +       struct bpf_object *obj;
> +       struct perf_buffer *pb;
> +
> +       nr_cpus = libbpf_num_possible_cpus();
> +       if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus))
> +               return;
> +
> +       /* load program */
> +       err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd);
> +       if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno))
> +               return;
> +
> +       prog = bpf_object__find_program_by_title(obj, prog_name);
> +       if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
> +               goto out_close;
> +
> +       /* load map */
> +       perf_buf_map = bpf_object__find_map_by_name(obj, "perf_buf_map");
> +       if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
> +               goto out_close;
> +
> +       /* attach kprobe */
> +       prog_pfd = bpf_program__attach_kprobe(prog, false /* retprobe */,
> +                                             "sys_nanosleep");
> +       if (CHECK(prog_pfd < 0, "attach_kprobe", "err %d\n", prog_pfd))
> +               goto out_close;
> +
> +       /* set up perf buffer */
> +       pb = perf_buffer__new(perf_buf_map, 1, on_sample, NULL, &cpu_seen);
> +       if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
> +               goto out_detach;
> +
> +       /* trigger kprobe on every CPU */
> +       CPU_ZERO(&cpu_seen);
> +       for (i = 0; i < nr_cpus; i++) {
> +               CPU_ZERO(&cpu_set);
> +               CPU_SET(i, &cpu_set);
> +
> +               err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set),
> +                                            &cpu_set);
> +               if (err && CHECK(err, "set_affinity", "cpu #%d, err %d\n",
> +                                i, err))
> +                       goto out_detach;
> +
> +               usleep(1);
> +       }
> +
> +       /* read perf buffer */
> +       err = perf_buffer__poll(pb, 100);
> +       if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
> +               goto out_free_pb;
> +
> +       if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt",
> +                 "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen)))
> +               goto out_free_pb;
> +
> +out_free_pb:
> +       perf_buffer__free(pb);
> +out_detach:
> +       libbpf_perf_event_disable_and_close(prog_pfd);
> +out_close:
> +       bpf_object__close(obj);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
> new file mode 100644
> index 000000000000..ba961f608fd5
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
> @@ -0,0 +1,31 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Facebook
> +
> +#include <linux/ptrace.h>
> +#include <linux/bpf.h>
> +#include "bpf_helpers.h"
> +
> +struct {
> +       int type;
> +       int key_size;
> +       int value_size;
> +       int max_entries;
> +} perf_buf_map SEC(".maps") = {
> +       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> +       .key_size = sizeof(int),
> +       .value_size = sizeof(int),
> +       .max_entries = 56,
> +};
> +
> +SEC("kprobe/sys_nanosleep")
> +int handle_sys_nanosleep_entry(struct pt_regs *ctx)
> +{
> +       int cpu = bpf_get_smp_processor_id();
> +
> +       bpf_perf_event_output(ctx, &perf_buf_map, BPF_F_CURRENT_CPU,
> +                             &cpu, sizeof(cpu));
> +       return 0;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> +__u32 _version SEC("version") = 1;
> --
> 2.17.1
>


* Re: [PATCH bpf-next 1/2] libbpf: add perf buffer reading API
  2019-06-26  2:18   ` Song Liu
@ 2019-06-26  4:44     ` Andrii Nakryiko
  0 siblings, 0 replies; 7+ messages in thread
From: Andrii Nakryiko @ 2019-06-26  4:44 UTC (permalink / raw)
  To: Song Liu
  Cc: Andrii Nakryiko, Alexei Starovoitov, Daniel Borkmann, bpf,
	Networking, Kernel Team

On Tue, Jun 25, 2019 at 7:19 PM Song Liu <liu.song.a23@gmail.com> wrote:
>
> On Tue, Jun 25, 2019 at 4:28 PM Andrii Nakryiko <andriin@fb.com> wrote:
> >
> > A BPF_MAP_TYPE_PERF_EVENT_ARRAY map is often used to send data from a BPF
> > program to user space for additional processing. libbpf already has a very
> > low-level API for reading a single CPU's perf buffer,
> > bpf_perf_event_read_simple(), but it's hard to use and requires a lot of
> > setup code. This patch adds a perf_buffer abstraction on top of it, hiding
> > the per-CPU setup and polling logic behind a simple and convenient API,
> > similar to what BCC provides.
> >
> > perf_buffer__new() sets up per-CPU ring buffers and updates the
> > corresponding BPF map entries. It accepts two user-provided callbacks: one
> > for handling raw samples and one for getting notified of samples lost due
> > to buffer overflow.
> >
> > perf_buffer__poll() fetches ring buffer data across all CPUs, using a
> > single epoll instance.
> >
> > perf_buffer__free() performs the corresponding cleanup and removes the FDs
> > from the BPF map.
> >
> > None of these APIs is thread-safe; users should provide their own
> > locking/coordination when using them in a multi-threaded setup.
> >
> > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
>
> Overall looks good. Some nits below.

Thanks for the review!

>
> > ---
> >  tools/lib/bpf/libbpf.c   | 282 +++++++++++++++++++++++++++++++++++++++
> >  tools/lib/bpf/libbpf.h   |  12 ++
> >  tools/lib/bpf/libbpf.map |   5 +-
> >  3 files changed, 298 insertions(+), 1 deletion(-)
>
> [...]
>
> > +struct perf_buffer *perf_buffer__new(struct bpf_map *map, size_t page_cnt,
> > +                                    perf_buffer_sample_fn sample_cb,
> > +                                    perf_buffer_lost_fn lost_cb, void *ctx)
> > +{
> > +       char msg[STRERR_BUFSIZE];
> > +       struct perf_buffer *pb;
> > +       int err, cpu;
> > +
> > +       if (bpf_map__def(map)->type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
> > +               pr_warning("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
> > +                          bpf_map__name(map));
> > +               return ERR_PTR(-EINVAL);
> > +       }
> > +       if (bpf_map__fd(map) < 0) {
> > +               pr_warning("map '%s' doesn't have associated FD\n",
> > +                          bpf_map__name(map));
> > +               return ERR_PTR(-EINVAL);
> > +       }
> > +       if (page_cnt == 0 || (page_cnt & (page_cnt - 1))) {
> > +               pr_warning("page count should be a non-zero power of two, but is %zu\n",
> > +                          page_cnt);
> > +               return ERR_PTR(-EINVAL);
> > +       }
> > +
> > +       pb = calloc(1, sizeof(*pb));
> > +       if (!pb)
> > +               return ERR_PTR(-ENOMEM);
> > +
> > +       pb->sample_cb = sample_cb;
> > +       pb->lost_cb = lost_cb;
>
> I think we need to check sample_cb != NULL && lost_cb != NULL.

I was thinking about making them all either optional or required, but
eventually decided on making sample_cb required and lost_cb optional: in
practice sample_cb will almost always be provided, while not every
application cares about handling lost samples (there is little you can do
about them, except bump some counter).

As for checking for NULL, I feel that's overkill. If someone provides NULL
for sample_cb, they will get a SIGSEGV with a stack trace immediately
showing that it's sample_cb being NULL. Unlike Java, C libraries tend not
to double-check every pointer for NULL. Checking things like whether the
map has an FD or is of the correct type is valuable, because if you don't
check those early, you'll just eventually get -EINVAL from the kernel and
start a guessing game of what's wrong. Checking that a callback is
non-NULL feels unnecessary, as the failure would be immediately obvious
(and it's quite unlikely to happen in practice).
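
For illustration, the call pattern I'd expect to be most common (made-up
call site, not from the patch):

	/* handle samples, ignore lost-sample notifications */
	pb = perf_buffer__new(map, 8, handle_sample, NULL /* lost_cb */, ctx);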

>
> > +       pb->ctx = ctx;
> > +       pb->page_size = getpagesize();
> > +       pb->mmap_size = pb->page_size * page_cnt;
> > +       pb->mapfd = bpf_map__fd(map);
> > +
> > +       pb->epfd = epoll_create1(EPOLL_CLOEXEC);
> [...]
> > +perf_buffer__process_record(struct perf_event_header *e, void *ctx)
> > +{
> > +       struct perf_buffer *pb = ctx;
> > +       void *data = e;
> > +
> > +       switch (e->type) {
> > +       case PERF_RECORD_SAMPLE: {
> > +               struct perf_sample_raw *s = data;
> > +
> > +               pb->sample_cb(pb->ctx, s->data, s->size);
> > +               break;
> > +       }
> > +       case PERF_RECORD_LOST: {
> > +               struct perf_sample_lost *s = data;
> > +
> > +               if (pb->lost_cb)
> > +                       pb->lost_cb(pb->ctx, s->lost);
>
> OK, we test lost_cb here, so not necessary at init time.
>
> [...]
> >                 bpf_program__attach_perf_event;
> >                 bpf_program__attach_raw_tracepoint;
> >                 bpf_program__attach_tracepoint;
> >                 bpf_program__attach_uprobe;
> > +               btf__parse_elf;
>
> Why move btf__parse_elf?

I realized I hadn't put it in correct alphabetical order originally, so I
decided to fix that here, since it's just a single-line change.

>
> Thanks,
> Song


* Re: [PATCH bpf-next 2/2] selftests/bpf: test perf buffer API
  2019-06-25 23:26 ` [PATCH bpf-next 2/2] selftests/bpf: test perf buffer API Andrii Nakryiko
  2019-06-26  2:21   ` Song Liu
@ 2019-06-26  5:11   ` Andrii Nakryiko
  1 sibling, 0 replies; 7+ messages in thread
From: Andrii Nakryiko @ 2019-06-26  5:11 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Alexei Starovoitov, Daniel Borkmann, bpf, Networking, Kernel Team

On Tue, Jun 25, 2019 at 4:26 PM Andrii Nakryiko <andriin@fb.com> wrote:
>
> Add test verifying perf buffer API functionality.
>
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> ---
>  .../selftests/bpf/prog_tests/perf_buffer.c    | 86 +++++++++++++++++++
>  .../selftests/bpf/progs/test_perf_buffer.c    | 31 +++++++
>  2 files changed, 117 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/perf_buffer.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_perf_buffer.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
> new file mode 100644
> index 000000000000..3ba3e26141ac
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
> @@ -0,0 +1,86 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <pthread.h>
> +#include <sched.h>
> +#include <sys/socket.h>
> +#include <test_progs.h>
> +
> +static void on_sample(void *ctx, void *data, __u32 size)
> +{
> +       cpu_set_t *cpu_seen = ctx;
> +       int cpu = *(int *)data;
> +
> +       CPU_SET(cpu, cpu_seen);
> +}
> +
> +void test_perf_buffer(void)
> +{
> +       int err, prog_fd, prog_pfd, nr_cpus, i, duration = 0;
> +       const char *prog_name = "kprobe/sys_nanosleep";
> +       const char *file = "./test_perf_buffer.o";
> +       struct bpf_map *perf_buf_map;
> +       cpu_set_t cpu_set, cpu_seen;
> +       struct bpf_program *prog;
> +       struct bpf_object *obj;
> +       struct perf_buffer *pb;
> +
> +       nr_cpus = libbpf_num_possible_cpus();
> +       if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus))
> +               return;
> +
> +       /* load program */
> +       err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd);
> +       if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno))
> +               return;
> +
> +       prog = bpf_object__find_program_by_title(obj, prog_name);
> +       if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
> +               goto out_close;
> +
> +       /* load map */
> +       perf_buf_map = bpf_object__find_map_by_name(obj, "perf_buf_map");
> +       if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
> +               goto out_close;
> +
> +       /* attach kprobe */
> +       prog_pfd = bpf_program__attach_kprobe(prog, false /* retprobe */,
> +                                             "sys_nanosleep");
> +       if (CHECK(prog_pfd < 0, "attach_kprobe", "err %d\n", prog_pfd))
> +               goto out_close;
> +
> +       /* set up perf buffer */
> +       pb = perf_buffer__new(perf_buf_map, 1, on_sample, NULL, &cpu_seen);
> +       if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
> +               goto out_detach;
> +
> +       /* trigger kprobe on every CPU */
> +       CPU_ZERO(&cpu_seen);
> +       for (i = 0; i < nr_cpus; i++) {
> +               CPU_ZERO(&cpu_set);
> +               CPU_SET(i, &cpu_set);
> +
> +               err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set),
> +                                            &cpu_set);
> +               if (err && CHECK(err, "set_affinity", "cpu #%d, err %d\n",
> +                                i, err))
> +                       goto out_detach;
> +
> +               usleep(1);
> +       }
> +
> +       /* read perf buffer */
> +       err = perf_buffer__poll(pb, 100);
> +       if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
> +               goto out_free_pb;
> +
> +       if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt",
> +                 "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen)))
> +               goto out_free_pb;
> +
> +out_free_pb:
> +       perf_buffer__free(pb);
> +out_detach:
> +       libbpf_perf_event_disable_and_close(prog_pfd);
> +out_close:
> +       bpf_object__close(obj);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
> new file mode 100644
> index 000000000000..ba961f608fd5
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
> @@ -0,0 +1,31 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Facebook
> +
> +#include <linux/ptrace.h>
> +#include <linux/bpf.h>
> +#include "bpf_helpers.h"
> +
> +struct {
> +       int type;
> +       int key_size;
> +       int value_size;
> +       int max_entries;
> +} perf_buf_map SEC(".maps") = {
> +       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> +       .key_size = sizeof(int),
> +       .value_size = sizeof(int),
> +       .max_entries = 56,

Oh, this is not right; it "works for me" only :). I've been meaning to add
another change that makes an unspecified max_entries (or, equivalently,
max_entries == 0) mean "all possible CPUs" for
BPF_MAP_TYPE_PERF_EVENT_ARRAY. Will produce a v2 with that change.
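
For illustration only, a sketch of what the map definition above could look
like with that change (the intended v2 behavior, not the final form):

	struct {
		int type;
		int key_size;
		int value_size;
	} perf_buf_map SEC(".maps") = {
		.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
		.key_size = sizeof(int),
		.value_size = sizeof(int),
		/* max_entries omitted: to be sized to all possible CPUs */
	};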

> +};
> +
> +SEC("kprobe/sys_nanosleep")
> +int handle_sys_nanosleep_entry(struct pt_regs *ctx)
> +{
> +       int cpu = bpf_get_smp_processor_id();
> +
> +       bpf_perf_event_output(ctx, &perf_buf_map, BPF_F_CURRENT_CPU,
> +                             &cpu, sizeof(cpu));
> +       return 0;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> +__u32 _version SEC("version") = 1;
> --
> 2.17.1
>
