* latency histogram with BPF
@ 2015-06-11  7:25 Daniel Wagner
  2015-06-11 22:08 ` Alexei Starovoitov
  2015-06-12  6:58 ` Wangnan (F)
  0 siblings, 2 replies; 8+ messages in thread
From: Daniel Wagner @ 2015-06-11  7:25 UTC (permalink / raw)
  To: Steven Rostedt, Tom Zanussi; +Cc: linux-kernel, Alexei Starovoitov

Hi Steven and Tom,

I was playing a bit with BPF to see if it would be possible to
get a preempt off latency histogram. On paper this looks
doable, but adding kprobes on trace_preempt_[on|off] is not a
clever idea. Who would have thought? :)

In both cases, BPF or Tom's 'hist' triggers patches, some trickery
is necessary to get it working. While the first approach has more
flexibility in what you want to measure or how you want to present
it, I suspect it will be harder to get working/accepted.

Anyway, here is some code to laugh at.  

From e3b9e7122f78953248eae54376b5a65f8beba362 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Thu, 11 Jun 2015 09:19:03 +0200
Subject: [PATCH] hwlathist: BPF based latency tracing

BPF offers another way to generate latency histograms. We attach
kprobes at trace_preempt_off and trace_preempt_on and calculate the
time between seeing the off and the on transition.

The first hashmap is used to store the start time stamp. The key is the
CPU id. The second hashmap stores the log2(time diff). To be able to
have a per-cpu histogram, the CPU id is added to the upper
bits of the hash key.
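
For illustration, packing and unpacking such a key looks like this
(just a sketch, not part of the patch; the helper names are made up):

	/* upper 32 bits: CPU id, lower 32 bits: log2 bucket */
	static u64 make_key(unsigned int cpu, u64 bucket)
	{
		return ((u64)cpu << 32) | (bucket & 0xffffffff);
	}

	static void split_key(u64 key, unsigned int *cpu, u64 *bucket)
	{
		*cpu = key >> 32;
		*bucket = key & 0xffffffff;
	}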

Obviously, adding kprobes to trace_preempt_[on|off] is
asking for problems:

[ 4425.690514] Modules linked in:
[ 4425.690770] CPU: 1 PID: 1803 Comm: hackbench Not tainted 4.1.0-rc7+ #689
[ 4425.691029] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
[ 4425.691029] task: ffff88007c305240 ti: ffff880078fbc000 task.ti: ffff880078fbc000
[ 4425.691029] RIP: 0010:[<ffffffff8115dc37>]  [<ffffffff8115dc37>] htab_map_delete_elem+0x1a7/0x240
[ 4425.691029] RSP: 0018:ffff88007d007af8  EFLAGS: 00010082
[ 4425.691029] RAX: ffff880079bfe880 RBX: ffff88007d007dbc RCX: dead000000200200
[ 4425.691029] RDX: ffff880079bfe880 RSI: ffff88007d007dbc RDI: ffff880079bfe8a8
[ 4425.691029] RBP: ffff88007d007b38 R08: 0000000000000000 R09: 0000000000000000
[ 4425.691029] R10: 0000000000000001 R11: 0000000000000008 R12: ffff880079894500
[ 4425.691029] R13: 0000000000000004 R14: ffff880079894578 R15: 0000000000000096
[ 4425.691029] FS:  00007ff5c4d51740(0000) GS:ffff88007d000000(0000) knlGS:0000000000000000
[ 4425.691029] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4425.691029] CR2: 0000000000fad118 CR3: 0000000078ed6000 CR4: 00000000001406e0
[ 4425.691029] Stack:
[ 4425.691029]  ffff88007d007b28 72a82a9b00000092 ffff880079894500 ffff880079894500
[ 4425.691029]  ffff88007d007dbc 0000000000000001 ffff88000004d000 ffffffff81141ca1
[ 4425.691029]  ffff88007d007b58 ffffffff8115e2d8 ffffc9000021c358 ffffffff81c19780
[ 4425.691029] Call Trace:
[ 4425.691029]  <#DB>
[ 4425.691029]  [<ffffffff81141ca1>] ? trace_preempt_on+0x1/0x110
[ 4425.691029]  [<ffffffff8115e2d8>] bpf_map_delete_elem+0x28/0x60
[ 4425.691029]  [<ffffffff81158e53>] __bpf_prog_run+0xa63/0x1230
[ 4425.691029]  [<ffffffff81090895>] ? sched_clock_local+0x25/0x90
[ 4425.691029]  [<ffffffff81090b88>] ? sched_clock_cpu+0xa8/0x100
[ 4425.691029]  [<ffffffff810acb1c>] ? __lock_acquire+0xaac/0x2280
[ 4425.691029]  [<ffffffff81090895>] ? sched_clock_local+0x25/0x90
[ 4425.691029]  [<ffffffff8114e44f>] ? trace_call_bpf+0x5f/0x1a0
[ 4425.691029]  [<ffffffff8114e497>] trace_call_bpf+0xa7/0x1a0
[ 4425.691029]  [<ffffffff8114e44f>] ? trace_call_bpf+0x5f/0x1a0
[ 4425.691029]  [<ffffffff8114e758>] kprobe_perf_func+0x28/0x240
[ 4425.691029]  [<ffffffff810447c4>] ? skip_prefixes+0x24/0x50
[ 4425.691029]  [<ffffffff81141ca1>] ? trace_preempt_on+0x1/0x110
[ 4425.691029]  [<ffffffff81150408>] kprobe_dispatcher+0x38/0x60
[ 4425.691029]  [<ffffffff81141ca0>] ? time_hardirqs_off+0x110/0x110
[ 4425.691029]  [<ffffffff81141ca0>] ? time_hardirqs_off+0x110/0x110
[ 4425.691029]  [<ffffffff81045628>] kprobe_int3_handler+0x1b8/0x1e0
[ 4425.691029]  [<ffffffff81003683>] do_int3+0x53/0x140
[ 4425.691029]  [<ffffffff8111ca18>] ? audit_alloc+0xb8/0x2a0
[ 4425.691029]  [<ffffffff81b714f1>] int3+0x41/0x80
[ 4425.691029]  [<ffffffff8111ca18>] ? audit_alloc+0xb8/0x2a0
[ 4425.691029]  [<ffffffff81b71de6>] ? smp_apic_timer_interrupt+0x46/0x60
[ 4425.691029]  [<ffffffff81b71de6>] ? smp_apic_timer_interrupt+0x46/0x60
[ 4425.691029]  [<ffffffff810606e9>] ? irq_exit+0x19/0xc0
[ 4425.691029]  [<ffffffff81141ca1>] ? trace_preempt_on+0x1/0x110
[ 4425.691029]  <<EOE>>
[ 4425.691029]  <IRQ> [ 4425.691029]  [<ffffffff8108a77b>] ? preempt_count_sub+0xab/0xf0
[ 4425.691029]  [<ffffffff810606e9>] irq_exit+0x19/0xc0
[ 4425.691029]  [<ffffffff81b71de6>] smp_apic_timer_interrupt+0x46/0x60
[ 4425.691029]  [<ffffffff81b70800>] apic_timer_interrupt+0x70/0x80
[ 4425.691029]  <EOI>
[ 4425.691029]  [<ffffffff810af7e0>] ? lock_release+0x130/0x480
[ 4425.691029]  [<ffffffff8111ca3d>] audit_alloc+0xdd/0x2a0
[ 4425.691029]  [<ffffffff8111c978>] ? audit_alloc+0x18/0x2a0
[ 4425.691029]  [<ffffffff810579d3>] copy_process.part.37+0x753/0x1b10
[ 4425.691029]  [<ffffffff8108a77b>] ? preempt_count_sub+0xab/0xf0
[ 4425.691029]  [<ffffffff8111cedc>] ? __audit_syscall_entry+0x9c/0xf0
[ 4425.691029]  [<ffffffff81058f4b>] do_fork+0xdb/0x7c0
[ 4425.691029]  [<ffffffff8111cedc>] ? __audit_syscall_entry+0x9c/0xf0
[ 4425.691029]  [<ffffffff8111cedc>] ? __audit_syscall_entry+0x9c/0xf0
[ 4425.691029]  [<ffffffff81010dfc>] ? do_audit_syscall_entry+0x6c/0x70
[ 4425.691029]  [<ffffffff81011bd3>] ? syscall_trace_enter_phase1+0x103/0x170
[ 4425.691029]  [<ffffffff814fb4e8>] ? trace_hardirqs_on_thunk+0x17/0x19
[ 4425.691029]  [<ffffffff810596b6>] SyS_clone+0x16/0x20
[ 4425.691029]  [<ffffffff81b6f997>] system_call_fastpath+0x12/0x6f
[ 4425.691029] Code: 8b 54 24 70 44 89 e9 83 e8 01 21 f0 48 8d 3c c2 48 89 da e8 3c f7 ff ff 48 85 c0 0f 84 8f 00 00 00 48 8b 10 48 8b 48 08 48 85 d2 <48> 89 11 74 04 48 89 4a 08 48 bb 00 02 20 00 00 00 ad de 48 8d
[ 4425.691029] RIP  [<ffffffff8115dc37>] htab_map_delete_elem+0x1a7/0x240
[ 4425.691029]  RSP <ffff88007d007af8>
[ 4425.691029] ---[ end trace 42ef1259d92d0b4a ]---
[ 4425.691029] Kernel panic - not syncing: Fatal exception in interrupt
[ 4425.691029] Kernel Offset: disabled
[ 4425.691029] ---[ end Kernel panic - not syncing: Fatal exception in interrupt

Nevertheless, I was able to run it for a while before it exploded
and got this:

CPU 0
      latency        : count     distribution
       1 -> 1        : 0        |                                                  |
       2 -> 3        : 0        |                                                  |
       4 -> 7        : 0        |                                                  |
       8 -> 15       : 3602     |************************************************* |
      16 -> 31       : 22       |                                                  |
      32 -> 63       : 0        |                                                  |
      64 -> 127      : 0        |                                                  |
     128 -> 255      : 0        |                                                  |
     256 -> 511      : 0        |                                                  |
     512 -> 1023     : 0        |                                                  |
CPU 1
      latency        : count     distribution
       1 -> 1        : 0        |                                                  |
       2 -> 3        : 0        |                                                  |
       4 -> 7        : 0        |                                                  |
       8 -> 15       : 7533     |************************************************* |
      16 -> 31       : 125      |                                                  |
      32 -> 63       : 0        |                                                  |
      64 -> 127      : 0        |                                                  |
     128 -> 255      : 0        |                                                  |
     256 -> 511      : 0        |                                                  |
     512 -> 1023     : 0        |                                                  |
CPU 2
      latency        : count     distribution
       1 -> 1        : 0        |                                                  |
       2 -> 3        : 0        |                                                  |
       4 -> 7        : 0        |                                                  |
       8 -> 15       : 3044     |************************************************* |
      16 -> 31       : 13       |                                                  |
      32 -> 63       : 0        |                                                  |
      64 -> 127      : 0        |                                                  |
     128 -> 255      : 0        |                                                  |
     256 -> 511      : 0        |                                                  |
     512 -> 1023     : 0        |                                                  |
CPU 3
      latency        : count     distribution
       1 -> 1        : 0        |                                                  |
       2 -> 3        : 0        |                                                  |
       4 -> 7        : 0        |                                                  |
       8 -> 15       : 25182    |************************************************* |
      16 -> 31       : 1675     |**                                                |
      32 -> 63       : 0        |                                                  |
      64 -> 127      : 0        |                                                  |
     128 -> 255      : 0        |                                                  |
     256 -> 511      : 0        |                                                  |
     512 -> 1023     : 0        |                                                  |

The numbers look a bit too nice and low. I suspect something is going
wrong.

All this is based on the tracex3 examples written by
Alexei Starovoitov <ast@plumgrid.com>.

Not for inclusion!

Not-signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
---
 kernel/trace/bpf_trace.c     |   2 +
 samples/bpf/Makefile         |   6 ++-
 samples/bpf/bpf_helpers.h    |   2 +
 samples/bpf/hwlathist_kern.c |  92 ++++++++++++++++++++++++++++++++++++
 samples/bpf/hwlathist_user.c | 108 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100644 samples/bpf/hwlathist_kern.c
 create mode 100644 samples/bpf/hwlathist_user.c

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d56ce5..e4d3e76 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_probe_read_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
 
 	case BPF_FUNC_trace_printk:
 		/*
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 8fdbd73..8420648 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -10,6 +10,7 @@ hostprogs-y += tracex1
 hostprogs-y += tracex2
 hostprogs-y += tracex3
 hostprogs-y += tracex4
+hostprogs-y += hwlathist
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -20,6 +21,7 @@ tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
+hwlathist-objs := bpf_load.o libbpf.o hwlathist_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -30,6 +32,7 @@ always += tracex2_kern.o
 always += tracex3_kern.o
 always += tracex4_kern.o
 always += tcbpf1_kern.o
+always += hwlathist_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -40,9 +43,10 @@ HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
+HOSTLOADLIBES_hwlathist += -lelf
 
 # point this to your LLVM backend with bpf support
-LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
+LLC = /opt/llvm/bin/llc
 
 $(obj)/%.o: $(src)/%.c
 	clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index f960b5f..c99bdb6 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -21,6 +21,8 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
 	(void *) BPF_FUNC_ktime_get_ns;
 static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
 	(void *) BPF_FUNC_trace_printk;
+static unsigned int (*bpf_get_smp_processor_id)(void) =
+	(void *) BPF_FUNC_get_smp_processor_id;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/hwlathist_kern.c b/samples/bpf/hwlathist_kern.c
new file mode 100644
index 0000000..7a97e7e
--- /dev/null
+++ b/samples/bpf/hwlathist_kern.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(unsigned int),
+	.value_size = sizeof(u64),
+	.max_entries = 1024,
+};
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+SEC("kprobe/trace_preempt_off")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	int cpu = bpf_get_smp_processor_id();
+	u64 ts = bpf_ktime_get_ns();
+
+	bpf_map_update_elem(&my_map, &cpu, &ts, BPF_ANY);
+
+	return 0;
+}
+
+struct bpf_map_def SEC("maps") my_lat = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(unsigned long long),
+	.value_size = sizeof(long),
+	.max_entries = 1024,
+};
+
+SEC("kprobe/trace_preempt_on")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long init_val = 1;
+	long *value;
+	u64 *ts;
+	int cpu;
+
+	cpu = bpf_get_smp_processor_id();
+	ts = bpf_map_lookup_elem(&my_map, &cpu);
+	if (!ts)
+		return 0;
+
+	u64 cur_time = bpf_ktime_get_ns();
+	u64 delta = log2l(cur_time - *ts);
+
+	bpf_map_delete_elem(&my_map, &cpu);
+
+	u64 key = (((u64)cpu) << 32) | (log2l(delta) & 0xffffffff);
+	value = bpf_map_lookup_elem(&my_lat, &key);
+	if (value)
+		__sync_fetch_and_add((long *)value, 1);
+	else
+		bpf_map_update_elem(&my_lat, &key, &init_val, BPF_ANY);
+
+	return 0;
+
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/hwlathist_user.c b/samples/bpf/hwlathist_user.c
new file mode 100644
index 0000000..7e19f2c
--- /dev/null
+++ b/samples/bpf/hwlathist_user.c
@@ -0,0 +1,108 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_ENTRIES	10
+#define MAX_CPU		8
+#define MAX_STARS	50
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+struct cpu_hist {
+	long data[MAX_ENTRIES];
+	unsigned long max;
+};
+
+static struct cpu_hist cpu_hist[MAX_CPU];
+
+static void get_data(int fd)
+{
+	unsigned long long key, next_key, k;
+	long value;
+	int i, cpu;
+
+	for (i = 0; i < MAX_CPU; i++)
+		cpu_hist[i].max = 0;
+
+	key = 0;
+	while (bpf_get_next_key(fd, &key, &next_key) == 0) {
+		bpf_lookup_elem(fd, &next_key, &value);
+		cpu = next_key >> 32;
+		if (cpu >= MAX_CPU)
+			goto next;
+
+		k = next_key & 0xffffffff;
+		if (k >= MAX_ENTRIES)
+			k = MAX_ENTRIES - 1;
+
+		cpu_hist[cpu].data[k] = value;
+		if (value > cpu_hist[cpu].max)
+			cpu_hist[cpu].max = value;
+
+	next:
+		key = next_key;
+	}
+}
+
+static void print_hist(void)
+{
+	char starstr[MAX_STARS];
+	struct cpu_hist *hist;
+	int i, j;
+
+	printf("\033[2J");
+
+	for (j = 0; j < MAX_CPU; j++) {
+		hist = &cpu_hist[j];
+		if (hist->max == 0)
+			continue;
+		printf("CPU %d\n", j);
+		printf("      latency        : count     distribution\n");
+		for (i = 1; i <= MAX_ENTRIES; i++) {
+			stars(starstr, hist->data[i - 1], hist->max, MAX_STARS);
+			printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+				(1l << i) >> 1, (1l << i) - 1,
+				hist->data[i - 1], MAX_STARS, starstr);
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	while (1) {
+		get_data(map_fd[1]);
+		print_hist();
+		sleep(5);
+	}
+
+	return 0;
+}
-- 
2.1.0



* Re: latency histogram with BPF
  2015-06-11  7:25 latency histogram with BPF Daniel Wagner
@ 2015-06-11 22:08 ` Alexei Starovoitov
  2015-06-12  6:12   ` Daniel Wagner
  2015-06-12  6:58 ` Wangnan (F)
  1 sibling, 1 reply; 8+ messages in thread
From: Alexei Starovoitov @ 2015-06-11 22:08 UTC (permalink / raw)
  To: Daniel Wagner, Steven Rostedt, Tom Zanussi; +Cc: linux-kernel, Wang Nan

On 6/11/15 12:25 AM, Daniel Wagner wrote:
>
> In both cases, BPF or Tom's 'hist' triggers patches, some trickery
> is necessary to get it working. While the first approach has more
> flexibility in what you want to measure or how you want to present
> it, I suspect it will be harder to get working/accepted.

why? Out of your patch only 2 lines are for the kernel, which I wanted
to add anyway. The sample code also looks good.

> Obviously, adding kprobes to trace_preempt_[on|off] is
> asking for problems:

thanks for reporting.
I reproduced it with hackbench, but my stack trace looks completely
different. Looks like some memory corruption is happening.
Not clear where. I'm still debugging.

> CPU 3
>        latency        : count     distribution
>         1 -> 1        : 0        |                                                  |
>         2 -> 3        : 0        |                                                  |
>         4 -> 7        : 0        |                                                  |
>         8 -> 15       : 25182    |************************************************* |
>        16 -> 31       : 1675     |**                                                |
>        32 -> 63       : 0        |                                                  |
>        64 -> 127      : 0        |                                                  |
>       128 -> 255      : 0        |                                                  |
>       256 -> 511      : 0        |                                                  |
>       512 -> 1023     : 0        |                                                  |
>
> The numbers look a bit too nice and low. I suspect something is going
> wrong.

I see similar numbers. 25k over 5 sec = 5k preempt on/off per second
which sounds about right for idle.

> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 2d56ce5..e4d3e76 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
>   		return &bpf_probe_read_proto;
>   	case BPF_FUNC_ktime_get_ns:
>   		return &bpf_ktime_get_ns_proto;
> +	case BPF_FUNC_get_smp_processor_id:
> +		return &bpf_get_smp_processor_id_proto;

This part I'll take into my set of patches.

> --- a/samples/bpf/bpf_helpers.h
> +++ b/samples/bpf/bpf_helpers.h
> @@ -21,6 +21,8 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
>   	(void *) BPF_FUNC_ktime_get_ns;
>   static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
>   	(void *) BPF_FUNC_trace_printk;
> +static unsigned int (*bpf_get_smp_processor_id)(void) =
> +	(void *) BPF_FUNC_get_smp_processor_id;

this part is already there in net-next.

> diff --git a/samples/bpf/hwlathist_kern.c b/samples/bpf/hwlathist_kern.c
> new file mode 100644
> index 0000000..7a97e7e
> --- /dev/null
> +++ b/samples/bpf/hwlathist_kern.c

looks good and useful, but I would like to find and fix the bug first
before exposing this bit as a 'sample', since right now it's a
'crash me if hackbench is running' test :)
If you have any suggestions on where to look, I'm all ears.
My stack traces look like:
Running with 10*40 (== 400) tasks.
[   12.032571] kernel BUG at ../mm/slub.c:3413!
[   12.036004]  [<ffffffff810c1913>] rcu_process_callbacks+0x283/0x680
[   12.036004]  [<ffffffff810632d0>] __do_softirq+0xf0/0x2b0
[   12.036004]  [<ffffffff8107dff9>] ? kthread+0xc9/0xe0
[   12.036004]  [<ffffffff810634b1>] run_ksoftirqd+0x21/0x50
or
[   25.788074] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
[   25.788801] BUG: unable to handle kernel paging request at ffff88000b1ea2a0
[   25.800085]  [<ffffffff810c18ba>] ? rcu_process_callbacks+0x22a/0x680
[   25.800085]  [<ffffffff810632d0>] __do_softirq+0xf0/0x2b0
[   25.800085]  [<ffffffff816d6d7c>] do_softirq_own_stack+0x1c/0x30
[   25.800085]  <EOI>
[   25.800085]  [<ffffffff8106351d>] do_softirq+0x3d/0x40
[   25.800085]  [<ffffffff810635ba>] __local_bh_enable_ip+0x9a/0xb0
[   25.800085]  [<ffffffff816d4c1b>] _raw_spin_unlock_bh+0x1b/0x20
[   25.800085]  [<ffffffff811f3e42>] bdi_writeback_workfn+0x1b2/0x470



* Re: latency histogram with BPF
  2015-06-11 22:08 ` Alexei Starovoitov
@ 2015-06-12  6:12   ` Daniel Wagner
  2015-06-12 14:33     ` Daniel Wagner
  0 siblings, 1 reply; 8+ messages in thread
From: Daniel Wagner @ 2015-06-12  6:12 UTC (permalink / raw)
  To: Alexei Starovoitov, Steven Rostedt, Tom Zanussi; +Cc: linux-kernel, Wang Nan

On 06/12/2015 12:08 AM, Alexei Starovoitov wrote:
> On 6/11/15 12:25 AM, Daniel Wagner wrote:
>> In both cases, BPF or Tom's 'hist' triggers patches, some trickery
>> is necessary to get it working. While the first approach has more
>> flexibility in what you want to measure or how you want to present
>> it, I suspect it will be harder to get working/accepted.
> 
> why? Out of your patch only 2 lines are for kernel, which I wanted
> to add anyway. The sample code also looks good.

I had the impression that attaching kprobes to trace_preempt_[on|off]
was not going to work. But you seem confident it can be made to work :)

>> Obviously, adding kprobes to trace_preempt_[on|off] is
>> asking for problems:
> 
> thanks for reporting.
> I reproduced it with hackbench, but my stack trace looks completely
> different. Looks like some memory corruption is happening.
> Not clear where. I'm still debugging.

I have noticed that network load also triggers a panic. Most of the time
softirq is involved somehow. So far I haven't spent time looking into it
because I thought it couldn't work anyway. I'll stare a bit more at it now.

>> CPU 3
>>        latency        : count     distribution
>>         1 -> 1        : 0        |                                                  |
>>         2 -> 3        : 0        |                                                  |
>>         4 -> 7        : 0        |                                                  |
>>         8 -> 15       : 25182    |************************************************* |
>>        16 -> 31       : 1675     |**                                                |
>>        32 -> 63       : 0        |                                                  |
>>        64 -> 127      : 0        |                                                  |
>>       128 -> 255      : 0        |                                                  |
>>       256 -> 511      : 0        |                                                  |
>>       512 -> 1023     : 0        |                                                  |
>>
>> The numbers look a bit too nice and low. I suspect something is going
>> wrong.
> 
> I see similar numbers. 25k over 5 sec = 5k preempt on/off per second
> which sounds about right for idle.

I expected to see some higher latencies, but it stays pretty low the
whole time. With Tom's patches I got a histogram which looked like this:

https://lkml.org/lkml/2015/4/30/196

# trigger info:
hist:keys=latency.bucket:vals=:sort=latency.bucket:size=2048 [active]

latency:          0 hitcount:     558648
latency:        256 hitcount:      70163
latency:        512 hitcount:      24601
latency:        768 hitcount:       9649
latency:       1024 hitcount:       4578
latency:       1280 hitcount:       2623
latency:       1536 hitcount:       1642
latency:       1792 hitcount:       1281
latency:       2048 hitcount:       1203
latency:       2304 hitcount:        942
latency:       2560 hitcount:        713
latency:       2816 hitcount:        513
latency:       3072 hitcount:        436
latency:       3328 hitcount:        347
latency:       3584 hitcount:        312
latency:       3840 hitcount:        279
latency:       4096 hitcount:        269
latency:       4352 hitcount:        257
latency:       4608 hitcount:        236
latency:       4864 hitcount:        214
latency:       5120 hitcount:        190
latency:       5376 hitcount:        129
latency:       5632 hitcount:        159
latency:       5888 hitcount:        141
latency:       6144 hitcount:        118
latency:       6400 hitcount:        107
latency:       6656 hitcount:         99
latency:       6912 hitcount:         73
latency:       7168 hitcount:         70
latency:       7424 hitcount:         80
latency:       7680 hitcount:         58
latency:       7936 hitcount:         65
latency:       8192 hitcount:         57
latency:       8448 hitcount:         59
latency:       8704 hitcount:         67
latency:       8960 hitcount:         64
latency:       9216 hitcount:         82
latency:       9472 hitcount:         89
latency:       9728 hitcount:         74
latency:       9984 hitcount:         89
latency:      10240 hitcount:         65

This distribution seems to be consistent with the latency-hist.patch
from -rt. That patch has been used for a while and it looks like it
reports the right values.

https://git.kernel.org/cgit/linux/kernel/git/rt/linux-stable-rt.git/commit/?h=v3.14-rt&id=0eba3c128347cc82fff093af1c61b2211985b4c9

>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index 2d56ce5..e4d3e76 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -172,6 +172,8 @@ static const struct bpf_func_proto
>> *kprobe_prog_func_proto(enum bpf_func_id func
>>           return &bpf_probe_read_proto;
>>       case BPF_FUNC_ktime_get_ns:
>>           return &bpf_ktime_get_ns_proto;
>> +    case BPF_FUNC_get_smp_processor_id:
>> +        return &bpf_get_smp_processor_id_proto;
> 
> This part I'll take into my set of patches.

Great.

>> --- a/samples/bpf/bpf_helpers.h
>> +++ b/samples/bpf/bpf_helpers.h
>> @@ -21,6 +21,8 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
>>       (void *) BPF_FUNC_ktime_get_ns;
>>   static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
>>       (void *) BPF_FUNC_trace_printk;
>> +static unsigned int (*bpf_get_smp_processor_id)(void) =
>> +    (void *) BPF_FUNC_get_smp_processor_id;
> 
> this part is already there in net-next.

Ah, good to know. FWIW, I had to pick some other snippets from other
trees to get it working. All those patches will hit mainline in the
near future.

>> diff --git a/samples/bpf/hwlathist_kern.c b/samples/bpf/hwlathist_kern.c
>> new file mode 100644
>> index 0000000..7a97e7e
>> --- /dev/null
>> +++ b/samples/bpf/hwlathist_kern.c
> 
> looks good and useful, but I would like to find and fix the bug first
> before exposing this bit as a 'sample', since right now it's a
> 'crash me if hackbench is running' test :)

Agreed. :)

> If you have any suggestions on where to look, I'm all ears.
> My stack traces look like:
> Running with 10*40 (== 400) tasks.
> [   12.032571] kernel BUG at ../mm/slub.c:3413!
> [   12.036004]  [<ffffffff810c1913>] rcu_process_callbacks+0x283/0x680
> [   12.036004]  [<ffffffff810632d0>] __do_softirq+0xf0/0x2b0
> [   12.036004]  [<ffffffff8107dff9>] ? kthread+0xc9/0xe0
> [   12.036004]  [<ffffffff810634b1>] run_ksoftirqd+0x21/0x50
> or
> [   25.788074] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
> [   25.788801] BUG: unable to handle kernel paging request at ffff88000b1ea2a0
> [   25.800085]  [<ffffffff810c18ba>] ? rcu_process_callbacks+0x22a/0x680
> [   25.800085]  [<ffffffff810632d0>] __do_softirq+0xf0/0x2b0
> [   25.800085]  [<ffffffff816d6d7c>] do_softirq_own_stack+0x1c/0x30
> [   25.800085]  <EOI>
> [   25.800085]  [<ffffffff8106351d>] do_softirq+0x3d/0x40
> [   25.800085]  [<ffffffff810635ba>] __local_bh_enable_ip+0x9a/0xb0
> [   25.800085]  [<ffffffff816d4c1b>] _raw_spin_unlock_bh+0x1b/0x20
> [   25.800085]  [<ffffffff811f3e42>] bdi_writeback_workfn+0x1b2/0x470

I see crashes as well with do_softirq() and RCU in it as well.

cheers,
daniel


* Re: latency histogram with BPF
  2015-06-11  7:25 latency histogram with BPF Daniel Wagner
  2015-06-11 22:08 ` Alexei Starovoitov
@ 2015-06-12  6:58 ` Wangnan (F)
  2015-06-12 17:21   ` Alexei Starovoitov
  1 sibling, 1 reply; 8+ messages in thread
From: Wangnan (F) @ 2015-06-12  6:58 UTC (permalink / raw)
  To: Daniel Wagner, Steven Rostedt, Tom Zanussi
  Cc: linux-kernel, Alexei Starovoitov



On 2015/6/11 15:25, Daniel Wagner wrote:
> Hi Steven and Tom,
>
> I was playing a bit with BPF to see if it would be possible to
> get a preempt off latency histogram. On paper this looks
> doable, but adding kprobes on trace_preempt_[on|off] is not a
> clever idea. Who would have thought? :)
>
> In both cases, BPF or Tom's 'hist' triggers patches, some trickery
> is necessary to get it working. While the first approach has more
> flexibility in what you want to measure or how you want to present
> it, I suspect it will be harder to get working/accepted.
>
> Anyway, here is some code to laugh at.
>
>  From e3b9e7122f78953248eae54376b5a65f8beba362 Mon Sep 17 00:00:00 2001
> From: Daniel Wagner <daniel.wagner@bmw-carit.de>
> Date: Thu, 11 Jun 2015 09:19:03 +0200
> Subject: [PATCH] hwlathist: BPF based latency tracing
>
> BPF offers another way to generate latency histograms. We attach
> kprobes at trace_preempt_off and trace_preempt_on and calculate the
> time between seeing the off and the on transition.
>
> The first hashmap is used to store the start time stamp. The key is the
> CPU id. The second hashmap stores the log2(time diff). To be able to
> have a per-cpu histogram the CPU ID is added to the upper
> bits of the hash key.
>
> Obviously, adding kprobes to trace_preempt_[on|off] is
> asking for problems:

[SNIP]

> diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
> index f960b5f..c99bdb6 100644
> --- a/samples/bpf/bpf_helpers.h
> +++ b/samples/bpf/bpf_helpers.h
> @@ -21,6 +21,8 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
>   	(void *) BPF_FUNC_ktime_get_ns;
>   static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
>   	(void *) BPF_FUNC_trace_printk;
> +static unsigned int (*bpf_get_smp_processor_id)(void) =
> +	(void *) BPF_FUNC_get_smp_processor_id;
>   
>   /* llvm builtin functions that eBPF C program may use to
>    * emit BPF_LD_ABS and BPF_LD_IND instructions
> diff --git a/samples/bpf/hwlathist_kern.c b/samples/bpf/hwlathist_kern.c
> new file mode 100644
> index 0000000..7a97e7e
> --- /dev/null
> +++ b/samples/bpf/hwlathist_kern.c
> @@ -0,0 +1,92 @@
> +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
> + * Copyright (c) 2015 BMW Car IT GmbH
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
> +#include <linux/version.h>
> +#include <linux/ptrace.h>
> +#include <uapi/linux/bpf.h>
> +#include "bpf_helpers.h"
> +
> +struct bpf_map_def SEC("maps") my_map = {
> +	.type = BPF_MAP_TYPE_HASH,
> +	.key_size = sizeof(unsigned int),
> +	.value_size = sizeof(u64),
> +	.max_entries = 1024,
> +};
> +
> +static unsigned int log2(unsigned int v)
> +{
> +	unsigned int r;
> +	unsigned int shift;
> +
> +	r = (v > 0xFFFF) << 4; v >>= r;
> +	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
> +	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
> +	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
> +	r |= (v >> 1);
> +
> +	return r;
> +}
> +
> +static unsigned int log2l(unsigned long v)
> +{
> +	unsigned int hi = v >> 32;
> +
> +	if (hi)
> +		return log2(hi) + 32;
> +	else
> +		return log2(v);
> +}
> +

Is it possible to make 'uapi/linux/bpf.h' hold something useful for
eBPF programming, so we can get rid of bpf_helpers?

Like this:

#ifndef _UAPI__LINUX_BPF_H__
#define _UAPI__LINUX_BPF_H__

#include <linux/types.h>
#include <linux/bpf_common.h>
...

#ifdef __BPF__

#define SEC(NAME) __attribute__((section(NAME), used))

...

static void *(*bpf_map_lookup_elem)(void *map, void *key) =
         (void *) BPF_FUNC_map_lookup_elem;
...

static unsigned int log2l(unsigned long v)
{
	unsigned int hi = v >> 32;

	if (hi)
		return log2(hi) + 32;
	else
		return log2(v);
}
...

#endif
#endif

Then by using the shell script Alexei suggested in

https://lkml.kernel.org/r/55777A17.7030000@plumgrid.com

and the automatic kernel header detector I posted today, users will easily
find the helpers, even if the source file is not in 'samples/bpf'.

Thank you.

[SNIP]



* Re: latency histogram with BPF
  2015-06-12  6:12   ` Daniel Wagner
@ 2015-06-12 14:33     ` Daniel Wagner
  2015-06-12 17:17       ` Alexei Starovoitov
  0 siblings, 1 reply; 8+ messages in thread
From: Daniel Wagner @ 2015-06-12 14:33 UTC (permalink / raw)
  To: Daniel Wagner, Alexei Starovoitov, Steven Rostedt, Tom Zanussi
  Cc: linux-kernel, Wang Nan

On 06/12/2015 08:12 AM, Daniel Wagner wrote:
> On 06/12/2015 12:08 AM, Alexei Starovoitov wrote:
>> On 6/11/15 12:25 AM, Daniel Wagner wrote:
>> If you have any suggestions on where to look, I'm all ears.
>> My stack traces look like:
>> Running with 10*40 (== 400) tasks.
>> [   12.032571] kernel BUG at ../mm/slub.c:3413!

I hit this as well.

After looking and playing around for a while, I think I found the source of
the problem: the path from the BPF program into the hash table code is
triggering the crash.

Attaching kprobes to trace_preempt_[on|off] works fine. Empty BPF
programs connected to the probes are no problem either. So I changed the
BPF program to use only arrays instead of hash tables. No crash anymore.

Thanks Steven for the tip :)

I suspect the hash table code will call trace_preempt_[off|on]
eventually and that is not going to fly.
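
Roughly, the loop I have in mind looks like this (the internal
function names are from memory, so take this as a sketch):

  trace_preempt_off()
    -> kprobe handler -> trace_call_bpf() -> bpf_prog1()
      -> bpf_map_update_elem() -> htab_map_update_elem()
        -> bucket lock / element allocation, which change the
           preempt count
          -> trace_preempt_off()/trace_preempt_on() again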

The program is still in a rough state. I'll send a cleaned-up version next
week. (My brain is melting down, too hot here...)

cheers,
daniel


* Re: latency histogram with BPF
  2015-06-12 14:33     ` Daniel Wagner
@ 2015-06-12 17:17       ` Alexei Starovoitov
  2015-06-15  9:03         ` Daniel Wagner
  0 siblings, 1 reply; 8+ messages in thread
From: Alexei Starovoitov @ 2015-06-12 17:17 UTC (permalink / raw)
  To: Daniel Wagner, Steven Rostedt, Tom Zanussi; +Cc: linux-kernel, Wang Nan

On 6/12/15 7:33 AM, Daniel Wagner wrote:
> On 06/12/2015 08:12 AM, Daniel Wagner wrote:
>> On 06/12/2015 12:08 AM, Alexei Starovoitov wrote:
>>> On 6/11/15 12:25 AM, Daniel Wagner wrote:
>>> If you have any suggestions on where to look, I'm all ears.
>>> My stack traces look like:
>>> Running with 10*40 (== 400) tasks.
>>> [   12.032571] kernel BUG at ../mm/slub.c:3413!
>
> I hit this as well.
>
> After looking and playing around for while I think I found the source of
> the problem: The path from the BPF program into the hash table code is
> triggering the crash.
>
> Attaching kprobes to trace_preempt_[on|off] works fine. Empty BPF
>> programs connected to the probes are no problem either. So I changed the
> BPF program to use only arrays instead of hash tables. No crash anymore.

yes. I've tried that too. arrays work fine indeed.

> I suspect the hash table code will call trace_preempt_[off|on]
> eventually and that is not going to fly.

The recursive calls into bpf programs are detected and prevented.
That's ok. I've tested attaching kprobes to kmalloc/kfree and
doing hash_map->update_elem->kmalloc from the program, which triggers
a recursive call into the same program. All works fine.
There is something else here.
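
For reference, the guard in trace_call_bpf() looks roughly like this
(quoting from memory, so the details may be slightly off):

	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/* a bpf program is already running on this cpu,
		 * don't call into another one (same or different)
		 */
		ret = 0;
		goto out;
	}
	ret = BPF_PROG_RUN(prog, ctx);
 out:
	__this_cpu_dec(bpf_prog_active);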



* Re: latency histogram with BPF
  2015-06-12  6:58 ` Wangnan (F)
@ 2015-06-12 17:21   ` Alexei Starovoitov
  0 siblings, 0 replies; 8+ messages in thread
From: Alexei Starovoitov @ 2015-06-12 17:21 UTC (permalink / raw)
  To: Wangnan (F), Daniel Wagner, Steven Rostedt, Tom Zanussi; +Cc: linux-kernel

On 6/11/15 11:58 PM, Wangnan (F) wrote:
> Is it possible to make 'uapi/linux/bpf.h' hold something useful for
> eBPF programming, so we can get rid of bpf_helpers?

that won't be right. uapi headers are supposed to contain things
needed by both kernel and user space, not user space only.
I think it's better for the 'perf' package to install such extra .h
files into /usr/... during 'apt-get install'


* Re: latency histogram with BPF
  2015-06-12 17:17       ` Alexei Starovoitov
@ 2015-06-15  9:03         ` Daniel Wagner
  0 siblings, 0 replies; 8+ messages in thread
From: Daniel Wagner @ 2015-06-15  9:03 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Wagner, Steven Rostedt, Tom Zanussi
  Cc: linux-kernel, Wang Nan

On 06/12/2015 07:17 PM, Alexei Starovoitov wrote:
> On 6/12/15 7:33 AM, Daniel Wagner wrote:
>> On 06/12/2015 08:12 AM, Daniel Wagner wrote:
>> Attaching kprobes to trace_preempt_[on|off] works fine. Empty BPF
>> programs connected to the probes are no problem either. So I changed the
>> BPF program to use only arrays instead of hash tables. No crash anymore.
> 
> yes. I've tried that too. arrays work fine indeed.
> 
>> I suspect the hash table code will call trace_preempt_[off|on]
>> eventually and that is not going to fly.
> 
> The recursive calls into bpf programs are detected and prevented.
> That's ok. I've tested attaching kprobes to kmalloc/kfree and
> from the program do hash_map->update_elem->kmalloc which triggers
> recursive call into the same program. All works fine.
> There is something else here.

If the first map is an array, all is fine too. So it seems it needs two
hash tables to trigger it.

@@ -11,7 +11,7 @@
 #include "bpf_helpers.h"

 struct bpf_map_def SEC("maps") my_map = {
-       .type = BPF_MAP_TYPE_HASH,
+       .type = BPF_MAP_TYPE_ARRAY,
        .key_size = sizeof(unsigned int),
        .value_size = sizeof(u64),
        .max_entries = 1024,
@@ -45,9 +45,10 @@ SEC("kprobe/trace_preempt_off")
 int bpf_prog1(struct pt_regs *ctx)
 {
        int cpu = bpf_get_smp_processor_id();
-       u64 ts = bpf_ktime_get_ns();
+       u64 *ts = bpf_map_lookup_elem(&my_map, &cpu);

-       bpf_map_update_elem(&my_map, &cpu, &ts, BPF_ANY);
+       if (ts)
+               *ts = bpf_ktime_get_ns();

        return 0;
 }
