From: "Wangnan (F)"
Date: Fri, 12 Jun 2015 14:58:51 +0800
To: Daniel Wagner, Steven Rostedt, Tom Zanussi
Cc: linux-kernel@vger.kernel.org, Alexei Starovoitov
Subject: Re: latency histogram with BPF
Message-ID: <557A832B.7020405@huawei.com>
References: <557937D8.90606@bmw-carit.de>
In-Reply-To: <557937D8.90606@bmw-carit.de>

On 2015/6/11 15:25, Daniel Wagner wrote:
> Hi Steven and Tom,
>
> I was playing a bit with BPF to see if it would be possible to get a
> preempt-off latency histogram. On paper this looks doable, but adding
> kprobes on trace_preempt_[on|off] is not a clever idea. Who would
> have thought? :)
>
> In both cases, BPF or Tom's 'hist' triggers patches, there is some
> trickery necessary to get it working. While the first approach has
> more flexibility in what you want to measure and how you want to
> present it, I suspect it will be harder to get it working/accepted.
>
> Anyway, here is some code to laugh at.
>
> From e3b9e7122f78953248eae54376b5a65f8beba362 Mon Sep 17 00:00:00 2001
> From: Daniel Wagner
> Date: Thu, 11 Jun 2015 09:19:03 +0200
> Subject: [PATCH] hwlathist: BPF based latency tracing
>
> BPF offers another way to generate latency histograms. We attach
> kprobes at trace_preempt_off and trace_preempt_on and calculate the
> time between the off and on transitions.
>
> The first hashmap is used to store the start time stamp. The key is
> the CPU id. The second hashmap stores the log2(time diff). To be able
> to have a per-CPU histogram, the CPU ID is added to the upper bits of
> the hash key.
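
Just to make sure I read the key layout right (CPU id in the upper 32
bits, log2 latency bucket in the lower 32 bits), here is a minimal
user-space sketch of the encode/decode. This is only an illustration,
not part of the patch:

/* Sketch (not from the patch): the per-CPU histogram key layout
 * described above, CPU id in the upper 32 bits, log2 bucket in the
 * lower 32 bits.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t make_key(uint32_t cpu, uint32_t bucket)
{
	return ((uint64_t)cpu << 32) | bucket;
}

int main(void)
{
	uint64_t key = make_key(3, 11);		/* e.g. CPU 3, bucket 11 */
	uint32_t cpu = key >> 32;		/* decode: upper half */
	uint32_t bucket = key & 0xffffffff;	/* decode: lower half */

	printf("key=0x%016llx cpu=%u bucket=%u\n",
	       (unsigned long long)key, cpu, bucket);
	return 0;
}

hwlathist_user.c below splits the key the same way (next_key >> 32 for
the CPU, next_key & 0xffffffff for the bucket).
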
>
> Obviously, adding kprobes to trace_preempt_[on|off] is
> asking for problems: [SNIP]
>
> CPU 0
>   latency        : count     distribution
>        1 -> 1        : 0        |                                                  |
>        2 -> 3        : 0        |                                                  |
>        4 -> 7        : 0        |                                                  |
>        8 -> 15       : 3602     |************************************************* |
>       16 -> 31       : 22       |                                                  |
>       32 -> 63       : 0        |                                                  |
>       64 -> 127      : 0        |                                                  |
>      128 -> 255      : 0        |                                                  |
>      256 -> 511      : 0        |                                                  |
>      512 -> 1023     : 0        |                                                  |
> CPU 1
>   latency        : count     distribution
>        1 -> 1        : 0        |                                                  |
>        2 -> 3        : 0        |                                                  |
>        4 -> 7        : 0        |                                                  |
>        8 -> 15       : 7533     |************************************************* |
>       16 -> 31       : 125      |                                                  |
>       32 -> 63       : 0        |                                                  |
>       64 -> 127      : 0        |                                                  |
>      128 -> 255      : 0        |                                                  |
>      256 -> 511      : 0        |                                                  |
>      512 -> 1023     : 0        |                                                  |
> CPU 2
>   latency        : count     distribution
>        1 -> 1        : 0        |                                                  |
>        2 -> 3        : 0        |                                                  |
>        4 -> 7        : 0        |                                                  |
>        8 -> 15       : 3044     |************************************************* |
>       16 -> 31       : 13       |                                                  |
>       32 -> 63       : 0        |                                                  |
>       64 -> 127      : 0        |                                                  |
>      128 -> 255      : 0        |                                                  |
>      256 -> 511      : 0        |                                                  |
>      512 -> 1023     : 0        |                                                  |
> CPU 3
>   latency        : count     distribution
>        1 -> 1        : 0        |                                                  |
>        2 -> 3        : 0        |                                                  |
>        4 -> 7        : 0        |                                                  |
>        8 -> 15       : 25182    |************************************************* |
>       16 -> 31       : 1675     |**                                                |
>       32 -> 63       : 0        |                                                  |
>       64 -> 127      : 0        |                                                  |
>      128 -> 255      : 0        |                                                  |
>      256 -> 511      : 0        |                                                  |
>      512 -> 1023     : 0        |                                                  |
>
> The numbers look a bit too nice and low. I suspect something is
> going wrong.
>
> All this is based on the tracex3 examples written by
> Alexei Starovoitov.
>
> Not for inclusion!
>
> Not-signed-off-by: Daniel Wagner
> ---
>  kernel/trace/bpf_trace.c     |   2 +
>  samples/bpf/Makefile         |   6 ++-
>  samples/bpf/bpf_helpers.h    |   2 +
>  samples/bpf/hwlathist_kern.c |  92 ++++++++++++++++++++++++++++++++++++
>  samples/bpf/hwlathist_user.c | 108 +++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 209 insertions(+), 1 deletion(-)
>  create mode 100644 samples/bpf/hwlathist_kern.c
>  create mode 100644 samples/bpf/hwlathist_user.c
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 2d56ce5..e4d3e76 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
>  		return &bpf_probe_read_proto;
>  	case BPF_FUNC_ktime_get_ns:
>  		return &bpf_ktime_get_ns_proto;
> +	case BPF_FUNC_get_smp_processor_id:
> +		return &bpf_get_smp_processor_id_proto;
>
>  	case BPF_FUNC_trace_printk:
>  		/*
> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> index 8fdbd73..8420648 100644
> --- a/samples/bpf/Makefile
> +++ b/samples/bpf/Makefile
> @@ -10,6 +10,7 @@ hostprogs-y += tracex1
>  hostprogs-y += tracex2
>  hostprogs-y += tracex3
>  hostprogs-y += tracex4
> +hostprogs-y += hwlathist
>
>  test_verifier-objs := test_verifier.o libbpf.o
>  test_maps-objs := test_maps.o libbpf.o
> @@ -20,6 +21,7 @@ tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
>  tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
>  tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
>  tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
> +hwlathist-objs := bpf_load.o libbpf.o hwlathist_user.o
>
>  # Tell kbuild to always build the programs
>  always := $(hostprogs-y)
> @@ -30,6 +32,7 @@ always += tracex2_kern.o
>  always += tracex3_kern.o
>  always += tracex4_kern.o
>  always += tcbpf1_kern.o
> +always += hwlathist_kern.o
>
>  HOSTCFLAGS += -I$(objtree)/usr/include
>
> @@ -40,9 +43,10 @@ HOSTLOADLIBES_tracex1 += -lelf
>  HOSTLOADLIBES_tracex2 += -lelf
>  HOSTLOADLIBES_tracex3 += -lelf
>  HOSTLOADLIBES_tracex4 += -lelf -lrt
> +HOSTLOADLIBES_hwlathist += -lelf
>
>  # point this to your LLVM backend with bpf support
> -LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
> +LLC = /opt/llvm/bin/llc
>
>  $(obj)/%.o: $(src)/%.c
>  	clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
> diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
> index f960b5f..c99bdb6 100644
> --- a/samples/bpf/bpf_helpers.h
> +++ b/samples/bpf/bpf_helpers.h
> @@ -21,6 +21,8 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
>  	(void *) BPF_FUNC_ktime_get_ns;
>  static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
>  	(void *) BPF_FUNC_trace_printk;
> +static unsigned int (*bpf_get_smp_processor_id)(void) =
> +	(void *) BPF_FUNC_get_smp_processor_id;
>
>  /* llvm builtin functions that eBPF C program may use to
>   * emit BPF_LD_ABS and BPF_LD_IND instructions
> diff --git a/samples/bpf/hwlathist_kern.c b/samples/bpf/hwlathist_kern.c
> new file mode 100644
> index 0000000..7a97e7e
> --- /dev/null
> +++ b/samples/bpf/hwlathist_kern.c
> @@ -0,0 +1,92 @@
> +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
> + * Copyright (c) 2015 BMW Car IT GmbH
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
> +#include <linux/version.h>
> +#include <linux/ptrace.h>
> +#include <uapi/linux/bpf.h>
> +#include "bpf_helpers.h"
> +
> +struct bpf_map_def SEC("maps") my_map = {
> +	.type = BPF_MAP_TYPE_HASH,
> +	.key_size = sizeof(unsigned int),
> +	.value_size = sizeof(u64),
> +	.max_entries = 1024,
> +};
> +
> +static unsigned int log2(unsigned int v)
> +{
> +	unsigned int r;
> +	unsigned int shift;
> +
> +	r = (v > 0xFFFF) << 4; v >>= r;
> +	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
> +	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
> +	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
> +	r |= (v >> 1);
> +
> +	return r;
> +}
> +
> +static unsigned int log2l(unsigned long v)
> +{
> +	unsigned int hi = v >> 32;
> +
> +	if (hi)
> +		return log2(hi) + 32;
> +	else
> +		return log2(v);
> +}
> +

Is it possible to make 'uapi/linux/bpf.h' hold something useful for
eBPF programming, so we can get rid of bpf_helpers.h? Like this:

#ifndef _UAPI__LINUX_BPF_H__
#define _UAPI__LINUX_BPF_H__

#include
#include
...

#ifdef __BPF__

#define SEC(NAME) __attribute__((section(NAME), used))
...
static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	(void *) BPF_FUNC_map_lookup_elem;
...
static unsigned int log2l(unsigned long v)
{
	unsigned int hi = v >> 32;

	if (hi)
		return log2(hi) + 32;
	else
		return log2(v);
}
...
#endif
#endif

Then by using the shell script Alexei suggested in

  https://lkml.kernel.org/r/55777A17.7030000@plumgrid.com

and the automatic kernel header detector I posted today, users will
easily find these helpers, even if the source file is not in
'samples/bpf'.

Thank you.
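
As a small aside, the branchless log2()/log2l() above is easy to
sanity-check from user space. The following is just a throwaway sketch
(functions renamed to avoid clashing with the libm log2()), not meant
for the patch:

/* Standalone check of the branchless log2()/log2l() quoted above:
 * compare against a naive shift-count implementation of floor(log2()).
 */
#include <stdio.h>
#include <assert.h>

static unsigned int log2_u32(unsigned int v)
{
	unsigned int r;
	unsigned int shift;

	r = (v > 0xFFFF) << 4; v >>= r;
	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
	r |= (v >> 1);

	return r;
}

static unsigned int log2_u64(unsigned long long v)
{
	unsigned int hi = v >> 32;

	if (hi)
		return log2_u32(hi) + 32;
	return log2_u32(v);
}

int main(void)
{
	unsigned long long v;

	for (v = 1; v < (1ULL << 40); v = v * 3 + 1) {
		unsigned int ref = 0;
		unsigned long long t = v;

		while (t >>= 1)
			ref++;	/* naive floor(log2(v)) */
		assert(log2_u64(v) == ref);
	}
	printf("log2 check passed\n");
	return 0;
}
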
> +SEC("kprobe/trace_preempt_off") > +int bpf_prog1(struct pt_regs *ctx) > +{ > + int cpu = bpf_get_smp_processor_id(); > + u64 ts = bpf_ktime_get_ns(); > + > + bpf_map_update_elem(&my_map, &cpu, &ts, BPF_ANY); > + > + return 0; > +} > + > +struct bpf_map_def SEC("maps") my_lat = { > + .type = BPF_MAP_TYPE_HASH, > + .key_size = sizeof(unsigned long long), > + .value_size = sizeof(long), > + .max_entries = 1024, > +}; > + > +SEC("kprobe/trace_preempt_on") > +int bpf_prog2(struct pt_regs *ctx) > +{ > + long init_val = 1; > + long *value; > + u64 *ts; > + int cpu; > + > + cpu = bpf_get_smp_processor_id(); > + ts = bpf_map_lookup_elem(&my_map, &cpu); > + if (!ts) > + return 0; > + > + u64 cur_time = bpf_ktime_get_ns(); > + u64 delta = log2l(cur_time - *ts); > + > + bpf_map_delete_elem(&my_map, &cpu); > + > + u64 key = (((u64)cpu) << 32) | (log2l(delta) & 0xffffffff); > + value = bpf_map_lookup_elem(&my_lat, &key); > + if (value) > + __sync_fetch_and_add((long *)value, 1); > + else > + bpf_map_update_elem(&my_lat, &key, &init_val, BPF_ANY); > + > + return 0; > + > +} > + > +char _license[] SEC("license") = "GPL"; > +u32 _version SEC("version") = LINUX_VERSION_CODE; > diff --git a/samples/bpf/hwlathist_user.c b/samples/bpf/hwlathist_user.c > new file mode 100644 > index 0000000..7e19f2c > --- /dev/null > +++ b/samples/bpf/hwlathist_user.c > @@ -0,0 +1,108 @@ > +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com > + * Copyright (c) 2015 BMW Car IT GmbH > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of version 2 of the GNU General Public > + * License as published by the Free Software Foundation. > + */ > +#include > +#include > +#include > +#include > +#include > +#include "libbpf.h" > +#include "bpf_load.h" > + > +#define MAX_ENTRIES 10 > +#define MAX_CPU 8 > +#define MAX_STARS 50 > + > +static void stars(char *str, long val, long max, int width) > +{ > + int i; > + > + for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) > + str[i] = '*'; > + if (val > max) > + str[i - 1] = '+'; > + str[i] = '\0'; > +} > + > +struct cpu_hist { > + long data[MAX_ENTRIES]; > + unsigned long max; > +}; > + > +static struct cpu_hist cpu_hist[MAX_CPU]; > + > +static void get_data(int fd) > +{ > + unsigned long long key, next_key, k; > + long value; > + int i, cpu; > + > + for (i = 0; i < MAX_CPU; i++) > + cpu_hist[i].max = 0; > + > + key = 0; > + while (bpf_get_next_key(fd, &key, &next_key) == 0) { > + bpf_lookup_elem(fd, &next_key, &value); > + cpu = next_key >> 32; > + if (cpu >= MAX_CPU) > + goto next; > + > + k = next_key & 0xffffffff; > + if (k > MAX_ENTRIES) > + k = MAX_ENTRIES - 1; > + > + cpu_hist[cpu].data[k] = value; > + if (value > cpu_hist[cpu].max) > + cpu_hist[cpu].max = value; > + > + next: > + key = next_key; > + } > +} > + > +static void print_hist(void) > +{ > + char starstr[MAX_STARS]; > + struct cpu_hist *hist; > + int i, j; > + > + printf("\033[2J"); > + > + for (j = 0; j < MAX_CPU; j++) { > + hist = &cpu_hist[j]; > + if (hist->max == 0) > + continue; > + printf("CPU %d\n", j); > + printf(" latency : count distribution\n"); > + for (i = 1; i <= MAX_ENTRIES; i++) { > + stars(starstr, hist->data[i - 1], hist->max, MAX_STARS); > + printf("%8ld -> %-8ld : %-8ld |%-*s|\n", > + (1l << i) >> 1, (1l << i) - 1, > + hist->data[i - 1], MAX_STARS, starstr); > + } > + } > +} > + > +int main(int argc, char **argv) > +{ > + char filename[256]; > + > + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); > + > + if 
> +	if (load_bpf_file(filename)) {
> +		printf("%s", bpf_log_buf);
> +		return 1;
> +	}
> +
> +	while (1) {
> +		get_data(map_fd[1]);
> +		print_hist();
> +		sleep(5);
> +	}
> +
> +	return 0;
> +}
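
For completeness, the bucket boundaries printed by print_hist() can be
reproduced with a tiny standalone program (again just a sketch, not
part of the patch); it prints exactly the "1 -> 1" ... "512 -> 1023"
ranges seen in the output above:

/* Sketch: print the histogram row boundaries used by print_hist(),
 * i.e. [(1l << i) >> 1, (1l << i) - 1] for row i = 1..MAX_ENTRIES.
 */
#include <stdio.h>

#define MAX_ENTRIES 10

int main(void)
{
	int i;

	for (i = 1; i <= MAX_ENTRIES; i++)
		printf("%8ld -> %-8ld\n", (1l << i) >> 1, (1l << i) - 1);
	return 0;
}

Two small things I noticed while reading the code: in get_data() the
clamp is "if (k > MAX_ENTRIES)", which still lets k == MAX_ENTRIES
through and writes one past the end of data[], so ">=" looks safer.
And bpf_prog2() seems to apply log2l() twice, once when computing
delta and again when building the key, which would squash nearly
everything into the "8 -> 15" row and may be part of why the numbers
look so low.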