[RFC PATCH 11/11] dtrace: make use of writable buffers in BPF

From: Kris Van Hees <kris.van.hees@oracle.com>
To: netdev@vger.kernel.org, bpf@vger.kernel.org,
	dtrace-devel@oss.oracle.com, linux-kernel@vger.kernel.org
Cc: rostedt@goodmis.org, mhiramat@kernel.org, acme@kernel.org,
	ast@kernel.org, daniel@iogearbox.net
Subject: [RFC PATCH 11/11] dtrace: make use of writable buffers in BPF
Date: Mon, 20 May 2019 23:55:05 +0000 (UTC)	[thread overview]
Message-ID: <201905202355.x4KNt6ti001323@aserv0121.oracle.com> (raw)

This commit modifies the tiny proof-of-concept DTrace utility to use
the writable-buffer support in BPF, along with the new helpers for
buffer reservation and commit.  The dtrace_finalize_context() helper
is updated and is now marked with ctx_update because it resets the
buffer pointer to NULL and the buffer length to 0.

Signed-off-by: Kris Van Hees <kris.van.hees@oracle.com>
Reviewed-by: Nick Alcock <nick.alcock@oracle.com>
---
 include/uapi/linux/dtrace.h |   4 +
 kernel/trace/dtrace/bpf.c   | 150 ++++++++++++++++++++++++++++++++++++
 tools/dtrace/dt_buffer.c    |  54 +++++--------
 tools/dtrace/probe1_bpf.c   |  47 ++++++-----
 4 files changed, 198 insertions(+), 57 deletions(-)

diff --git a/include/uapi/linux/dtrace.h b/include/uapi/linux/dtrace.h
index bbe2562c11f2..3fcc075a429f 100644
--- a/include/uapi/linux/dtrace.h
+++ b/include/uapi/linux/dtrace.h
@@ -33,6 +33,10 @@ struct dtrace_bpf_context {
 	u32 gid;	/* from_kgid(&init_user_ns, current_real_cred()->gid */
 	u32 euid;	/* from_kuid(&init_user_ns, current_real_cred()->euid */
 	u32 egid;	/* from_kgid(&init_user_ns, current_real_cred()->egid */
+
+	/* General output buffer */
+	__bpf_md_ptr(u8 *, buf);
+	__bpf_md_ptr(u8 *, buf_end);
 };
 
 /*
diff --git a/kernel/trace/dtrace/bpf.c b/kernel/trace/dtrace/bpf.c
index 95f4103d749e..93bd2f0319cc 100644
--- a/kernel/trace/dtrace/bpf.c
+++ b/kernel/trace/dtrace/bpf.c
@@ -7,6 +7,7 @@
 #include <linux/filter.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/perf_event.h>
 
 /*
  * Actual kernel definition of the DTrace BPF context.
@@ -16,6 +17,9 @@ struct dtrace_bpf_ctx {
 	u32				ecb_id;
 	u32				probe_id;
 	struct task_struct		*task;
+	struct perf_output_handle	handle;
+	u64				buf_len;
+	u8				*buf;
 };
 
 /*
@@ -55,6 +59,8 @@ BPF_CALL_2(dtrace_finalize_context, struct dtrace_bpf_ctx *, ctx,
 
 	ctx->ecb_id = ecb->id;
 	ctx->probe_id = ecb->probe_id;
+	ctx->buf_len = 0;
+	ctx->buf = NULL;
 
 	return 0;
 }
@@ -62,17 +68,119 @@ BPF_CALL_2(dtrace_finalize_context, struct dtrace_bpf_ctx *, ctx,
 static const struct bpf_func_proto dtrace_finalize_context_proto = {
 	.func           = dtrace_finalize_context,
 	.gpl_only       = false,
+	.ctx_update	= true,				/* resets ctx->buf, ctx->buf_len */
 	.ret_type       = RET_INTEGER,
 	.arg1_type      = ARG_PTR_TO_CTX,		/* ctx */
 	.arg2_type      = ARG_CONST_MAP_PTR,		/* map */
 };
 
+BPF_CALL_4(dtrace_buffer_reserve, struct dtrace_bpf_ctx *, ctx,
+				  int, id, struct bpf_map *, map, int, size)
+{
+	struct bpf_array	*arr = container_of(map, struct bpf_array, map);
+	int			cpu = smp_processor_id();
+	struct bpf_event_entry	*ee;
+	struct perf_event	*ev;
+	int			err;
+
+	/*
+	 * Reserve output space for the BPF program and publish it as ctx->buf.
+	 * The only valid id is the offset of 'buf' in the public BPF context.
+	 */
+	if (id != offsetof(struct dtrace_bpf_context, buf))
+		return -EINVAL;
+
+	/*
+	 * A non-NULL handle.rb marks a reserve that has not been committed
+	 * yet; a second reserve is refused until the commit helper clears it.
+	 */
+	if (ctx->handle.rb)
+		return -EBUSY;
+
+	/*
+	 * Look up the perf event stored for this CPU and validate it.
+	 */
+	if (cpu >= arr->map.max_entries)
+		return -E2BIG;
+	ee = READ_ONCE(arr->ptrs[cpu]);
+	if (!ee)
+		return -ENOENT;
+	ev = ee->event;
+	if (unlikely(ev->attr.type != PERF_TYPE_SOFTWARE ||
+		     ev->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+		return -EINVAL;
+	if (unlikely(ev->oncpu != cpu))
+		return -EOPNOTSUPP;
+
+	size = round_up(size, sizeof(u64));	/* NOTE(review): size <= 0 is not rejected */
+
+	err = perf_output_begin_forward_in_page(&ctx->handle, ev, size);
+	if (err < 0)
+		return err;	/* NOTE(review): is handle.rb left set on failure? */
+
+	ctx->buf_len = size;			/* rounded-up length of the reservation */
+	ctx->buf = ctx->handle.addr;
+
+	return 0;
+}
+
+static const struct bpf_func_proto dtrace_buffer_reserve_proto = {
+	.func           = dtrace_buffer_reserve,
+	.gpl_only       = false,
+	.ctx_update	= true,				/* sets ctx->buf, ctx->buf_len */
+	.ret_type       = RET_INTEGER,			/* 0 or negative errno */
+	.arg1_type      = ARG_PTR_TO_CTX,		/* ctx */
+	.arg2_type      = ARG_ANYTHING,			/* id: offset of buf in ctx */
+	.arg3_type      = ARG_CONST_MAP_PTR,		/* map: perf events by CPU */
+	.arg4_type      = ARG_ANYTHING,			/* size (rounded up to 8) */
+};
+
+BPF_CALL_3(dtrace_buffer_commit, struct dtrace_bpf_ctx *, ctx,
+				 int, id, struct bpf_map *, map)
+{
+	/*
+	 * Commit the buffer reserved in ctx.  'map' is not used here.  The
+	 * only valid id is the offset of 'buf' in the public BPF context.
+	 */
+	if (id != offsetof(struct dtrace_bpf_context, buf))
+		return -EINVAL;
+
+	/*
+	 * Without an uncommitted reserve (handle.rb is NULL) there is nothing
+	 * to commit; this is reported as success rather than as an error.
+	 */
+	if (!ctx->handle.rb)
+		return 0;
+
+	perf_output_end(&ctx->handle);
+
+	ctx->handle.rb = NULL;			/* lets the next reserve succeed */
+	ctx->buf_len = 0;
+	ctx->buf = NULL;
+
+	return 0;
+}
+
+static const struct bpf_func_proto dtrace_buffer_commit_proto = {
+	.func           = dtrace_buffer_commit,
+	.gpl_only       = false,
+	.ctx_update	= true,				/* clears ctx->buf, ctx->buf_len */
+	.ret_type       = RET_INTEGER,			/* 0 or -EINVAL */
+	.arg1_type      = ARG_PTR_TO_CTX,		/* ctx */
+	.arg2_type      = ARG_ANYTHING,			/* id: offset of buf in ctx */
+	.arg3_type      = ARG_CONST_MAP_PTR,		/* map (unused by the helper) */
+};
+
 static const struct bpf_func_proto *
 dtrace_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_finalize_context:
 		return &dtrace_finalize_context_proto;
+	case BPF_FUNC_buffer_reserve:
+		return &dtrace_buffer_reserve_proto;
+	case BPF_FUNC_buffer_commit:
+		return &dtrace_buffer_commit_proto;
 	case BPF_FUNC_perf_event_output:
 		return bpf_get_perf_event_output_proto();
 	case BPF_FUNC_trace_printk:
@@ -131,6 +239,22 @@ static bool dtrace_is_valid_access(int off, int size, enum bpf_access_type type,
 		if (bpf_ctx_narrow_access_ok(off, size, sizeof(u32)))
 			return true;
 		break;
+	case bpf_ctx_range(struct dtrace_bpf_context, buf):
+		info->reg_type = PTR_TO_BUFFER;
+		info->buf_id = offsetof(struct dtrace_bpf_context, buf);
+
+		bpf_ctx_record_field_size(info, sizeof(u64));
+		if (bpf_ctx_narrow_access_ok(off, size, sizeof(u64)))
+			return true;
+		break;
+	case bpf_ctx_range(struct dtrace_bpf_context, buf_end):
+		info->reg_type = PTR_TO_BUFFER_END;
+		info->buf_id = offsetof(struct dtrace_bpf_context, buf);
+
+		bpf_ctx_record_field_size(info, sizeof(u64));
+		if (bpf_ctx_narrow_access_ok(off, size, sizeof(u64)))
+			return true;
+		break;
 	default:
 		if (size == sizeof(unsigned long))
 			return true;
@@ -152,6 +276,10 @@ static bool dtrace_is_valid_access(int off, int size, enum bpf_access_type type,
  *	si->dst_reg = ((type *)si->src_reg)->member
  *	target_size = sizeof(((type *)si->src_reg)->member)
  *
+ *  BPF_LDX_CTX_FIELD_DST(type, member, dst, si, target_size)
+ *	dst = ((type *)si->src_reg)->member
+ *	target_size = sizeof(((type *)si->src_reg)->member)
+ *
  *  BPF_LDX_LNK_FIELD(type, member, si, target_size)
  *	si->dst_reg = ((type *)si->dst_reg)->member
  *	target_size = sizeof(((type *)si->dst_reg)->member)
@@ -172,6 +300,13 @@ static bool dtrace_is_valid_access(int off, int size, enum bpf_access_type type,
 			*(target_size) = FIELD_SIZEOF(type, member); \
 			offsetof(type, member); \
 		    }))
+#define BPF_LDX_CTX_FIELD_DST(type, member, dst, si, target_size) \
+	BPF_LDX_MEM(BPF_FIELD_SIZEOF(type, member), \
+		    (dst), (si)->src_reg, /* explicit dst, not (si)->dst_reg */ \
+		    ({ \
+			*(target_size) = FIELD_SIZEOF(type, member); \
+			offsetof(type, member); /* value of ({ }): load offset */ \
+		    }))
 #define BPF_LDX_LNK_FIELD(type, member, si, target_size) \
 	BPF_LDX_MEM(BPF_FIELD_SIZEOF(type, member), \
 		    (si)->dst_reg, (si)->dst_reg, \
@@ -261,6 +396,18 @@ static u32 dtrace_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_LNK_PTR(struct task_struct, cred, si);
 		*insn++ = BPF_LDX_LNK_FIELD(struct cred, egid, si, target_size);
 		break;
+	case offsetof(struct dtrace_bpf_context, buf):
+		*insn++ = BPF_LDX_CTX_FIELD(struct dtrace_bpf_ctx, buf, si,
+					    target_size);
+		break;
+	case offsetof(struct dtrace_bpf_context, buf_end):
+		/* buf_end = ctx->buf + ctx->buf_len */
+		*insn++ = BPF_LDX_CTX_FIELD(struct dtrace_bpf_ctx, buf, si,
+					    target_size);
+		*insn++ = BPF_LDX_CTX_FIELD_DST(struct dtrace_bpf_ctx, buf_len,
+						BPF_REG_AX, si, target_size);
+		*insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+		break;
 	default:
 		*insn++ = BPF_LDX_CTX_PTR(struct dtrace_bpf_ctx, regs, si);
 		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
@@ -308,6 +455,9 @@ static void *dtrace_convert_ctx(enum bpf_prog_type stype, void *ctx)
 		gctx = this_cpu_ptr(&dtrace_ctx);
 		gctx->regs = (struct pt_regs *)ctx;
 		gctx->task = current;
+		gctx->handle.rb = NULL;
+		gctx->buf_len = 0;
+		gctx->buf = NULL;
 
 		return gctx;
 	}
diff --git a/tools/dtrace/dt_buffer.c b/tools/dtrace/dt_buffer.c
index 65c107ca8ac4..28fac9036d69 100644
--- a/tools/dtrace/dt_buffer.c
+++ b/tools/dtrace/dt_buffer.c
@@ -282,33 +282,27 @@ static void write_rb_tail(volatile struct perf_event_mmap_page *rb_page,
  */
 static int output_event(u64 *buf)
 {
-	u8				*data = (u8 *)buf;
-	struct perf_event_header	*hdr;
-	u32				size;
-	u64				probe_id, task;
-	u32				pid, ppid, cpu, euid, egid, tag;
+	u8	*data = (u8 *)buf;
+	u32	probe_id;
+	u32	flags;
+	u64	task;
+	u32	pid, ppid, cpu, euid, egid, tag;
 
-	hdr = (struct perf_event_header *)data;
-	data += sizeof(struct perf_event_header);
+	probe_id = *(u32 *)&(data[0]);	/* same offset as a perf header type */
 
-	if (hdr->type != PERF_RECORD_SAMPLE)
-		return 1;
+	if (probe_id == PERF_RECORD_LOST) {
+		u16	size;
+		u64	lost;
 
-	size = *(u32 *)data;
-	data += sizeof(u32);
+		size = *(u16 *)&(data[6]);	/* perf_event_header.size */
+		lost = *(u64 *)&(data[16]);	/* 64-bit count: header, u64 id, u64 lost */
 
-	/*
-	 * The sample should only take up 48 bytes, but as a result of how the
-	 * BPF program stores the data (filling in a struct that resides on the
-	 * stack, and sending that off using bpf_perf_event_output()), there is
-	 * some internal padding
-	 */
-	if (size != 52) {
-		printf("Sample size is wrong (%d vs expected %d)\n", size, 52);
-		goto out;
+		printf("[%ld probes dropped]\n", lost);
+
+		return size;
 	}
 
-	probe_id = *(u64 *)&(data[0]);
+	flags = *(u32 *)&(data[4]);
 	pid = *(u32 *)&(data[8]);
 	ppid = *(u32 *)&(data[12]);
 	cpu = *(u32 *)&(data[16]);
@@ -318,19 +312,14 @@ static int output_event(u64 *buf)
 	tag = *(u32 *)&(data[40]);
 
 	if (probe_id != 123)
-		printf("Corrupted data (probe_id = %ld)\n", probe_id);
+		printf("Corrupted data (probe_id = %d)\n", probe_id);
 	if (tag != 0xdace)
 		printf("Corrupted data (tag = %x)\n", tag);
 
-	printf("CPU-%d: EPID %ld PID %d PPID %d EUID %d EGID %d TASK %08lx\n",
-	       cpu, probe_id, pid, ppid, euid, egid, task);
+	printf("CPU-%d: [%d/%d] PID %d PPID %d EUID %d EGID %d TASK %08lx\n",
+	       cpu, probe_id, flags, pid, ppid, euid, egid, task);
 
-out:
-	/*
-	 * We processed the perf_event_header, the size, and ;size; bytes of
-	 * probe data.
-	 */
-	return sizeof(struct perf_event_header) + sizeof(u32) + size;
+	return 48;	/* fixed record size written by the BPF program */
 }
 
 /*
@@ -351,10 +340,9 @@ static void process_data(struct dtrace_buffer *buf)
 
 		/*
 		 * Ensure that the buffer contains enough data for at least one
-		 * sample (header + sample size + sample data).
+		 * sample.
 		 */
-		if (head - tail < sizeof(struct perf_event_header) +
-				  sizeof(u32) + 48)
+		if (head - tail < 48)
 			break;
 
 		if (*ptr)
diff --git a/tools/dtrace/probe1_bpf.c b/tools/dtrace/probe1_bpf.c
index 5b34edb61412..a3196261e66e 100644
--- a/tools/dtrace/probe1_bpf.c
+++ b/tools/dtrace/probe1_bpf.c
@@ -37,25 +37,16 @@ struct bpf_map_def SEC("maps") buffer_map = {
 	.max_entries = 2,
 };
 
-struct sample {
-	u64 probe_id;
-	u32 pid;
-	u32 ppid;
-	u32 cpu;
-	u32 euid;
-	u32 egid;
-	u64 task;
-	u32 tag;
-};
-
 #define DPROG(F)	SEC("dtrace/"__stringify(F)) int bpf_func_##F
+#define BUF_ID		offsetof(struct dtrace_bpf_context, buf)
 
 /* we jump here when syscall number == __NR_write */
 DPROG(__NR_write)(struct dtrace_bpf_context *ctx)
 {
 	int			cpu = bpf_get_smp_processor_id();
 	struct dtrace_ecb	*ecb;
-	struct sample		smpl;
+	u8			*buf, *buf_end;
+	int			err;
 
 	bpf_finalize_context(ctx, &probemap);
 
@@ -63,17 +54,25 @@ DPROG(__NR_write)(struct dtrace_bpf_context *ctx)
 	if (!ecb)
 		return 0;
 
-	memset(&smpl, 0, sizeof(smpl));
-	smpl.probe_id = ecb->probe_id;
-	smpl.pid = ctx->pid;
-	smpl.ppid = ctx->ppid;
-	smpl.cpu = ctx->cpu;
-	smpl.euid = ctx->euid;
-	smpl.egid = ctx->egid;
-	smpl.task = ctx->task;
-	smpl.tag = 0xdace;
-
-	bpf_perf_event_output(ctx, &buffer_map, cpu, &smpl, sizeof(smpl));
+	err = bpf_buffer_reserve(ctx, BUF_ID, &buffer_map, 48);
+	if (err < 0)
+		return -1;
+	buf = ctx->buf;
+	buf_end = ctx->buf_end;
+	if (buf + 48 > buf_end)
+		return -1;
+
+	*(u32 *)(&buf[0]) = ecb->probe_id;
+	*(u32 *)(&buf[4]) = 0;
+	*(u32 *)(&buf[8]) = ctx->pid;
+	*(u32 *)(&buf[12]) = ctx->ppid;
+	*(u32 *)(&buf[16]) = ctx->cpu;
+	*(u32 *)(&buf[20]) = ctx->euid;
+	*(u32 *)(&buf[24]) = ctx->egid;
+	*(u64 *)(&buf[32]) = ctx->task;
+	*(u32 *)(&buf[40]) = 0xdace;
+
+	bpf_buffer_commit(ctx, BUF_ID, &buffer_map);
 
 	return 0;
 }
@@ -84,7 +83,7 @@ int bpf_prog1(struct pt_regs *ctx)
 	struct dtrace_ecb	ecb;
 	int			cpu = bpf_get_smp_processor_id();
 
-	ecb.id = 1;
+	ecb.id = 3;
 	ecb.probe_id = 123;
 
 	bpf_map_update_elem(&probemap, &cpu, &ecb, BPF_ANY);
-- 
2.20.1