* [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event
2015-07-23 9:42 [PATCH v3 0/3] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter Kaixu Xia
@ 2015-07-23 9:42 ` Kaixu Xia
2015-07-23 22:54 ` Alexei Starovoitov
2015-08-03 9:38 ` Peter Zijlstra
2015-07-23 9:42 ` [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter Kaixu Xia
2015-07-23 9:42 ` [PATCH v3 3/3] samples/bpf: example of get selected PMU counter value Kaixu Xia
2 siblings, 2 replies; 15+ messages in thread
From: Kaixu Xia @ 2015-07-23 9:42 UTC (permalink / raw)
To: ast, davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa
Cc: xiakaixu, wangnan0, linux-kernel, pi3orama, hekuang
Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores pointers to struct perf_event. The
user space event FDs returned by the perf_event_open() syscall are
converted to pointers to struct perf_event and stored in the map.
Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
---
include/linux/bpf.h | 2 +
include/linux/perf_event.h | 2 +
include/uapi/linux/bpf.h | 1 +
kernel/bpf/arraymap.c | 113 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/verifier.c | 15 ++++++
kernel/events/core.c | 23 +++++++++
6 files changed, 156 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476..9cf74c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
#include <uapi/linux/bpf.h>
#include <linux/workqueue.h>
#include <linux/file.h>
+#include <linux/perf_event.h>
struct bpf_map;
@@ -143,6 +144,7 @@ struct bpf_array {
union {
char value[0] __aligned(8);
struct bpf_prog *prog[0] __aligned(8);
+ struct perf_event *events[0] __aligned(8);
};
};
#define MAX_TAIL_CALL_CNT 32
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809..2ea4067 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,7 @@ extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
@@ -979,6 +980,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
+static struct perf_event *perf_event_get(unsigned int fd) { return NULL; }
static inline void perf_event_print_debug(void) { }
static inline int perf_event_task_disable(void) { return -EINVAL; }
static inline int perf_event_task_enable(void) { return -EINVAL; }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 29ef6f9..69a1f6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229..e97efbc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -255,3 +255,116 @@ static int __init register_prog_array_map(void)
return 0;
}
late_initcall(register_prog_array_map);
+
+static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
+{
+ /* only the pointer to struct perf_event can be stored in
+ * perf_event_array map
+ */
+ if (attr->value_size != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+
+ return array_map_alloc(attr);
+}
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+ int i;
+
+ synchronize_rcu();
+
+ /* release the struct perf_event in perf_event_array_map */
+ for(i = 0; i < array->map.max_entries; i++) {
+ event = array->events[i];
+ if (event)
+ perf_event_release_kernel(event);
+ }
+ kvfree(array);
+}
+
+static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static struct perf_event *convert_map_with_perf_event(void *value)
+{
+ struct perf_event *event;
+ u32 fd;
+
+ fd = *(u32 *)value;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return NULL;
+
+ /* limit the event type to PERF_TYPE_RAW
+ * and PERF_TYPE_HARDWARE.
+ */
+ if (event->attr.type != PERF_TYPE_RAW &&
+ event->attr.type != PERF_TYPE_HARDWARE)
+ return NULL;
+
+ return event;
+}
+
+/* only called from syscall */
+static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+ u32 index = *(u32 *)key;
+
+ if (map_flags != BPF_ANY)
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ /* check if the value is already stored */
+ if (array->events[index])
+ return -EINVAL;
+
+ /* convert the fd to the pointer to struct perf_event */
+ event = convert_map_with_perf_event(value);
+ if (!event)
+ return -EBADF;
+
+ xchg(array->events + index, event);
+ return 0;
+}
+
+static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = perf_event_array_map_alloc,
+ .map_free = perf_event_array_map_free,
+ .map_get_next_key = perf_event_array_map_get_next_key,
+ .map_lookup_elem = perf_event_array_map_lookup_elem,
+ .map_update_elem = perf_event_array_map_update_elem,
+ .map_delete_elem = perf_event_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866..c70f7e7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -924,6 +924,21 @@ static int check_call(struct verifier_env *env, int func_id)
*/
return -EINVAL;
+ if (map && map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+ func_id != BPF_FUNC_perf_event_read)
+ /* perf_event_array map type needs extra care:
+ * only allow to pass it into bpf_perf_event_read() for now.
+ * bpf_map_update/delete_elem() must only be done via syscall
+ */
+ return -EINVAL;
+
+ if (func_id == BPF_FUNC_perf_event_read &&
+ map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ /* don't allow any other map type to be passed into
+ * bpf_perf_event_read()
+ */
+ return -EINVAL;
+
return 0;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae34..08cb467 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8574,6 +8574,29 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
+struct perf_event *perf_event_get(unsigned int fd)
+{
+ struct perf_event *event;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ event = f.file->private_data;
+
+ atomic_long_inc(&event->refcount);
+ fdput(f);
+
+ return event;
+}
+
/*
* inherit a event from parent task to child task:
*/
--
1.8.3.4
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event
2015-07-23 9:42 ` [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event Kaixu Xia
@ 2015-07-23 22:54 ` Alexei Starovoitov
2015-07-24 2:22 ` xiakaixu
2015-08-03 9:38 ` Peter Zijlstra
1 sibling, 1 reply; 15+ messages in thread
From: Alexei Starovoitov @ 2015-07-23 22:54 UTC (permalink / raw)
To: Kaixu Xia, davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa
Cc: wangnan0, linux-kernel, pi3orama, hekuang
On 7/23/15 2:42 AM, Kaixu Xia wrote:
> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
> This map only stores the pointer to struct perf_event. The
> user space event FDs from perf_event_open() syscall are converted
> to the pointer to struct perf_event and stored in map.
...
> +static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
> +{
> + /* only the pointer to struct perf_event can be stored in
> + * perf_event_array map
> + */
> + if (attr->value_size != sizeof(u32))
> + return ERR_PTR(-EINVAL);
> +
> + return array_map_alloc(attr);
> +}
since it's exactly the same as prog_array_map_alloc(),
just rename it to something like 'fd_array_map_alloc'
and use for both types.
> +static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
> + void *next_key)
> +{
> + return -EINVAL;
> +}
> +
> +static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
> +{
> + return NULL;
> +}
same for the above two.
rename prog_array_map_* into fd_array_map_* and use for both map types.
> +static struct perf_event *convert_map_with_perf_event(void *value)
> +{
> + struct perf_event *event;
> + u32 fd;
> +
> + fd = *(u32 *)value;
> +
> + event = perf_event_get(fd);
> + if (IS_ERR(event))
> + return NULL;
don't lose error code, do 'return event' instead.
> +
> + /* limit the event type to PERF_TYPE_RAW
> + * and PERF_TYPE_HARDWARE.
> + */
> + if (event->attr.type != PERF_TYPE_RAW &&
> + event->attr.type != PERF_TYPE_HARDWARE)
> + return NULL;
perf_event refcnt leak? need to do put_event.
and return ERR_PTR(-EINVAL)
> +
> + return event;
> +}
> +
> +/* only called from syscall */
> +static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
> + void *value, u64 map_flags)
> +{
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + struct perf_event *event;
> + u32 index = *(u32 *)key;
> +
> + if (map_flags != BPF_ANY)
> + return -EINVAL;
> +
> + if (index >= array->map.max_entries)
> + return -E2BIG;
> +
> + /* check if the value is already stored */
> + if (array->events[index])
> + return -EINVAL;
> +
> + /* convert the fd to the pointer to struct perf_event */
> + event = convert_map_with_perf_event(value);
imo the helper name is misleading and it's too short to be a separate
function. Just inline it and you can reuse the 'index' variable.
> + if (!event)
> + return -EBADF;
> +
> + xchg(array->events + index, event);
refcnt leak of old event! Please think it through.
This type of bugs I shouldn't be finding.
> +static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
> +{
> + return -EINVAL;
> +}
Is there no way to dec the refcnt of a perf_event from user space?
Why not do the same as prog_array_delete?
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event
2015-07-23 22:54 ` Alexei Starovoitov
@ 2015-07-24 2:22 ` xiakaixu
2015-07-24 2:26 ` Alexei Starovoitov
0 siblings, 1 reply; 15+ messages in thread
From: xiakaixu @ 2015-07-24 2:22 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa,
wangnan0, linux-kernel, pi3orama, hekuang
于 2015/7/24 6:54, Alexei Starovoitov 写道:
> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>> Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
>> This map only stores the pointer to struct perf_event. The
>> user space event FDs from perf_event_open() syscall are converted
>> to the pointer to struct perf_event and stored in map.
> ...
>> +static struct bpf_map *perf_event_array_map_alloc(union bpf_attr *attr)
>> +{
>> + /* only the pointer to struct perf_event can be stored in
>> + * perf_event_array map
>> + */
>> + if (attr->value_size != sizeof(u32))
>> + return ERR_PTR(-EINVAL);
>> +
>> + return array_map_alloc(attr);
>> +}
>
> since it's exactly the same as prog_array_map_alloc(),
> just rename it to something like 'fd_array_map_alloc'
> and use for both types.
>
>> +static int perf_event_array_map_get_next_key(struct bpf_map *map, void *key,
>> + void *next_key)
>> +{
>> + return -EINVAL;
>> +}
>> +
>> +static void *perf_event_array_map_lookup_elem(struct bpf_map *map, void *key)
>> +{
>> + return NULL;
>> +}
>
> same for the above two.
> rename prog_array_map_* into fd_array_map_* and use for both map types.
>
>> +static struct perf_event *convert_map_with_perf_event(void *value)
>> +{
>> + struct perf_event *event;
>> + u32 fd;
>> +
>> + fd = *(u32 *)value;
>> +
>> + event = perf_event_get(fd);
>> + if (IS_ERR(event))
>> + return NULL;
>
> don't lose error code, do 'return event' instead.
>
>> +
>> + /* limit the event type to PERF_TYPE_RAW
>> + * and PERF_TYPE_HARDWARE.
>> + */
>> + if (event->attr.type != PERF_TYPE_RAW &&
>> + event->attr.type != PERF_TYPE_HARDWARE)
>> + return NULL;
>
> perf_event refcnt leak? need to do put_event.
> and return ERR_PTR(-EINVAL)
>
>> +
>> + return event;
>> +}
>> +
>> +/* only called from syscall */
>> +static int perf_event_array_map_update_elem(struct bpf_map *map, void *key,
>> + void *value, u64 map_flags)
>> +{
>> + struct bpf_array *array = container_of(map, struct bpf_array, map);
>> + struct perf_event *event;
>> + u32 index = *(u32 *)key;
>> +
>> + if (map_flags != BPF_ANY)
>> + return -EINVAL;
>> +
>> + if (index >= array->map.max_entries)
>> + return -E2BIG;
>> +
>> + /* check if the value is already stored */
>> + if (array->events[index])
>> + return -EINVAL;
>> +
>> + /* convert the fd to the pointer to struct perf_event */
>> + event = convert_map_with_perf_event(value);
>
> imo helper name is misleading and it's too short to be separate
> function. Just inline it and you can reuse 'index' variable.
>
>> + if (!event)
>> + return -EBADF;
>> +
>> + xchg(array->events + index, event);
>
> refcnt leak of old event! Please think it through.
> This type of bugs I shouldn't be finding.
Maybe the commit message is not detailed enough. Here I prevent
user space from updating an existing event, so the return
value of xchg() is NULL and there is no refcnt leak of the old event.
I will do the same as prog_array in the next version.
>
>> +static int perf_event_array_map_delete_elem(struct bpf_map *map, void *key)
>> +{
>> + return -EINVAL;
>> +}
>
> no way to dec refcnt of perf_event from user space?
> why not to do the same as prog_array_delete?
Will follow them in V4.
>
>
> .
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event
2015-07-24 2:22 ` xiakaixu
@ 2015-07-24 2:26 ` Alexei Starovoitov
0 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2015-07-24 2:26 UTC (permalink / raw)
To: xiakaixu
Cc: davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa,
wangnan0, linux-kernel, pi3orama, hekuang
On 7/23/15 7:22 PM, xiakaixu wrote:
>>> + /* check if the value is already stored */
>>> >>+ if (array->events[index])
>>> >>+ return -EINVAL;
>>> >>+
>>> >>+ /* convert the fd to the pointer to struct perf_event */
>>> >>+ event = convert_map_with_perf_event(value);
>> >
>> >imo helper name is misleading and it's too short to be separate
>> >function. Just inline it and you can reuse 'index' variable.
>> >
>>> >>+ if (!event)
>>> >>+ return -EBADF;
>>> >>+
>>> >>+ xchg(array->events + index, event);
>> >
>> >refcnt leak of old event! Please think it through.
>> >This type of bugs I shouldn't be finding.
> Maybe the commit message is not elaborate. Here I prevent
> user space from updating the existed event, so the return
> value of xchg() is NULL and no refcnt leak of old event.
> I will do the same as prog_array in next version.
I see; then it's even worse.
You think that the above check:
+ if (array->events[index])
+ return -EINVAL;
will protect against a double insert?
It won't, since there are no locks here.
You can have two processes both seeing an empty slot and
racing to do the xchg.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event
2015-07-23 9:42 ` [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event Kaixu Xia
2015-07-23 22:54 ` Alexei Starovoitov
@ 2015-08-03 9:38 ` Peter Zijlstra
1 sibling, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2015-08-03 9:38 UTC (permalink / raw)
To: Kaixu Xia
Cc: ast, davem, acme, mingo, masami.hiramatsu.pt, jolsa, wangnan0,
linux-kernel, pi3orama, hekuang
On Thu, Jul 23, 2015 at 09:42:40AM +0000, Kaixu Xia wrote:
> +static struct perf_event *convert_map_with_perf_event(void *value)
> +{
> + struct perf_event *event;
> + u32 fd;
> +
> + fd = *(u32 *)value;
> +
> + event = perf_event_get(fd);
> + if (IS_ERR(event))
> + return NULL;
> +
> + /* limit the event type to PERF_TYPE_RAW
> + * and PERF_TYPE_HARDWARE.
> + */
> + if (event->attr.type != PERF_TYPE_RAW &&
> + event->attr.type != PERF_TYPE_HARDWARE)
> + return NULL;
Aside from the ref-leak already mentioned, please introduce something
like:
const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
To avoid having to poke inside of the event outside of perf code.
> +
> + return event;
> +}
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter
2015-07-23 9:42 [PATCH v3 0/3] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter Kaixu Xia
2015-07-23 9:42 ` [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event Kaixu Xia
@ 2015-07-23 9:42 ` Kaixu Xia
2015-07-23 22:56 ` Alexei Starovoitov
2015-08-03 9:34 ` Peter Zijlstra
2015-07-23 9:42 ` [PATCH v3 3/3] samples/bpf: example of get selected PMU counter value Kaixu Xia
2 siblings, 2 replies; 15+ messages in thread
From: Kaixu Xia @ 2015-07-23 9:42 UTC (permalink / raw)
To: ast, davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa
Cc: xiakaixu, wangnan0, linux-kernel, pi3orama, hekuang
Given the perf_event_map_fd and an index, the function
bpf_perf_event_read() converts the corresponding map
value to a pointer to struct perf_event and returns the
hardware PMU counter value.
Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
---
include/linux/bpf.h | 1 +
include/linux/perf_event.h | 3 ++-
include/uapi/linux/bpf.h | 1 +
kernel/bpf/helpers.c | 36 ++++++++++++++++++++++++++++++++++++
kernel/events/core.c | 4 ++--
kernel/trace/bpf_trace.c | 2 ++
6 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9cf74c0..0954b8f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
extern const struct bpf_func_proto bpf_map_delete_elem_proto;
+extern const struct bpf_func_proto bpf_perf_event_read_proto;
extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2ea4067..899abcb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -662,7 +662,8 @@ extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
-
+extern void __perf_event_read(void *info);
+extern u64 perf_event_count(struct perf_event *event);
struct perf_sample_data {
/*
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69a1f6b..b9b13ce 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -250,6 +250,7 @@ enum bpf_func_id {
* Return: 0 on success
*/
BPF_FUNC_get_current_comm,
+ BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */
__BPF_FUNC_MAX_ID,
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec0..aab219d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -182,3 +182,39 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE,
};
+
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ event = array->events[index];
+ if (!event)
+ return -EBADF;
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return -ENOENT;
+
+ if (event->oncpu != raw_smp_processor_id() &&
+ event->ctx->task != current)
+ return -EINVAL;
+
+ if (event->attr.inherit)
+ return -EINVAL;
+
+ __perf_event_read(event);
+
+ return perf_event_count(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+ .func = bpf_perf_event_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 08cb467..c59d9c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3177,7 +3177,7 @@ void perf_event_exec(void)
/*
* Cross CPU call to read the hardware event
*/
-static void __perf_event_read(void *info)
+void __perf_event_read(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -3204,7 +3204,7 @@ static void __perf_event_read(void *info)
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+u64 perf_event_count(struct perf_event *event)
{
if (event->pmu->count)
return event->pmu->count(event);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041a..9cf094f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -183,6 +183,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
default:
return NULL;
}
--
1.8.3.4
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter
2015-07-23 9:42 ` [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter Kaixu Xia
@ 2015-07-23 22:56 ` Alexei Starovoitov
2015-07-24 1:57 ` xiakaixu
2015-08-03 9:34 ` Peter Zijlstra
1 sibling, 1 reply; 15+ messages in thread
From: Alexei Starovoitov @ 2015-07-23 22:56 UTC (permalink / raw)
To: Kaixu Xia, davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa
Cc: wangnan0, linux-kernel, pi3orama, hekuang
On 7/23/15 2:42 AM, Kaixu Xia wrote:
> According to the perf_event_map_fd and index, the function
> bpf_perf_event_read() can convert the corresponding map
> value to the pointer to struct perf_event and return the
> Hardware PMU counter value.
>
> Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
...
> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
> +{
> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + struct perf_event *event;
> +
> + if (index >= array->map.max_entries)
> + return -E2BIG;
> +
> + event = array->events[index];
> + if (!event)
> + return -EBADF;
probably ENOENT makes more sense here.
> +
> + if (event->state != PERF_EVENT_STATE_ACTIVE)
> + return -ENOENT;
and -EINVAL here?
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter
2015-07-23 22:56 ` Alexei Starovoitov
@ 2015-07-24 1:57 ` xiakaixu
0 siblings, 0 replies; 15+ messages in thread
From: xiakaixu @ 2015-07-24 1:57 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa,
wangnan0, linux-kernel, pi3orama, hekuang
于 2015/7/24 6:56, Alexei Starovoitov 写道:
> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>> According to the perf_event_map_fd and index, the function
>> bpf_perf_event_read() can convert the corresponding map
>> value to the pointer to struct perf_event and return the
>> Hardware PMU counter value.
>>
>> Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
> ...
>> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
>> +{
>> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
>> + struct bpf_array *array = container_of(map, struct bpf_array, map);
>> + struct perf_event *event;
>> +
>> + if (index >= array->map.max_entries)
>> + return -E2BIG;
>> +
>> + event = array->events[index];
>> + if (!event)
>> + return -EBADF;
>
> probably ENOENT makes more sense here.
>
>> +
>> + if (event->state != PERF_EVENT_STATE_ACTIVE)
>> + return -ENOENT;
>
> and -EINVAL here?
Yeah, the errno is better.
Thanks!
>
>
> .
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter
2015-07-23 9:42 ` [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter Kaixu Xia
2015-07-23 22:56 ` Alexei Starovoitov
@ 2015-08-03 9:34 ` Peter Zijlstra
2015-08-03 10:32 ` xiakaixu
1 sibling, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2015-08-03 9:34 UTC (permalink / raw)
To: Kaixu Xia
Cc: ast, davem, acme, mingo, masami.hiramatsu.pt, jolsa, wangnan0,
linux-kernel, pi3orama, hekuang
On Thu, Jul 23, 2015 at 09:42:41AM +0000, Kaixu Xia wrote:
> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
> +{
> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + struct perf_event *event;
> +
> + if (index >= array->map.max_entries)
> + return -E2BIG;
> +
> + event = array->events[index];
> + if (!event)
> + return -EBADF;
> +
> + if (event->state != PERF_EVENT_STATE_ACTIVE)
> + return -ENOENT;
> +
> + if (event->oncpu != raw_smp_processor_id() &&
> + event->ctx->task != current)
> + return -EINVAL;
> +
> + if (event->attr.inherit)
> + return -EINVAL;
> +
> + __perf_event_read(event);
> +
> + return perf_event_count(event);
> +}
Please no poking of event internal state outside of perf code.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter
2015-08-03 9:34 ` Peter Zijlstra
@ 2015-08-03 10:32 ` xiakaixu
0 siblings, 0 replies; 15+ messages in thread
From: xiakaixu @ 2015-08-03 10:32 UTC (permalink / raw)
To: Peter Zijlstra
Cc: ast, davem, acme, mingo, masami.hiramatsu.pt, jolsa, wangnan0,
linux-kernel, pi3orama, hekuang
于 2015/8/3 17:34, Peter Zijlstra 写道:
> On Thu, Jul 23, 2015 at 09:42:41AM +0000, Kaixu Xia wrote:
>> +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
>> +{
>> + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
>> + struct bpf_array *array = container_of(map, struct bpf_array, map);
>> + struct perf_event *event;
>> +
>> + if (index >= array->map.max_entries)
>> + return -E2BIG;
>> +
>> + event = array->events[index];
>> + if (!event)
>> + return -EBADF;
>> +
>> + if (event->state != PERF_EVENT_STATE_ACTIVE)
>> + return -ENOENT;
>> +
>> + if (event->oncpu != raw_smp_processor_id() &&
>> + event->ctx->task != current)
>> + return -EINVAL;
>> +
>> + if (event->attr.inherit)
>> + return -EINVAL;
>> +
>> + __perf_event_read(event);
>> +
>> + return perf_event_count(event);
>> +}
>
> Please no poking of event internal state outside of perf code.
Thanks for your review. I will move it to kernel/events/core.c.
>
> .
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v3 3/3] samples/bpf: example of get selected PMU counter value
2015-07-23 9:42 [PATCH v3 0/3] bpf: Introduce the new ability of eBPF programs to access hardware PMU counter Kaixu Xia
2015-07-23 9:42 ` [PATCH v3 1/3] bpf: Add new bpf map type to store the pointer to struct perf_event Kaixu Xia
2015-07-23 9:42 ` [PATCH v3 2/3] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter Kaixu Xia
@ 2015-07-23 9:42 ` Kaixu Xia
2015-07-23 22:59 ` Alexei Starovoitov
2 siblings, 1 reply; 15+ messages in thread
From: Kaixu Xia @ 2015-07-23 9:42 UTC (permalink / raw)
To: ast, davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa
Cc: xiakaixu, wangnan0, linux-kernel, pi3orama, hekuang
This is a simple example that shows how to use the new ability
to read the selected hardware PMU counter value.
Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_helpers.h | 2 ++
samples/bpf/tracex6_kern.c | 26 ++++++++++++++++++
samples/bpf/tracex6_user.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 99 insertions(+)
create mode 100644 samples/bpf/tracex6_kern.c
create mode 100644 samples/bpf/tracex6_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4450fed..63e7d50 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex2
hostprogs-y += tracex3
hostprogs-y += tracex4
hostprogs-y += tracex5
+hostprogs-y += tracex6
hostprogs-y += lathist
test_verifier-objs := test_verifier.o libbpf.o
@@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
+tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
lathist-objs := bpf_load.o libbpf.o lathist_user.o
# Tell kbuild to always build the programs
@@ -37,6 +39,7 @@ always += tracex2_kern.o
always += tracex3_kern.o
always += tracex4_kern.o
always += tracex5_kern.o
+always += tracex6_kern.o
always += tcbpf1_kern.o
always += lathist_kern.o
@@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
HOSTLOADLIBES_tracex4 += -lelf -lrt
HOSTLOADLIBES_tracex5 += -lelf
+HOSTLOADLIBES_tracex6 += -lelf
HOSTLOADLIBES_lathist += -lelf
# point this to your LLVM backend with bpf support
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index bdf1c16..c8a3594 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_perf_event_read)(void *map, int index) =
+ (void *) BPF_FUNC_perf_event_read;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 0000000..d213161
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,26 @@
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(unsigned long),
+ .max_entries = 32,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ u64 count;
+ u32 key = bpf_get_smp_processor_id();
+ char fmt[] = "CPU-%d %llu\n";
+
+ count = bpf_perf_event_read(&my_map, &key);
+ bpf_trace_printk(fmt, sizeof(fmt), key, count);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 0000000..30307c9
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,67 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static void test_bpf_perf_event(void)
+{
+ int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ int *pmu_fd = malloc(nr_cpus * sizeof(int));
+ unsigned long value;
+ int i;
+
+ struct perf_event_attr attr_insn_pmu = {
+ .freq = 0,
+ .sample_period = 0x7fffffffffffffffULL,
+ .inherit = 0,
+ .type = PERF_TYPE_HARDWARE,
+ .read_format = 0,
+ .sample_type = 0,
+ .config = 0,/* PMU: cycles */
+ };
+
+ for(i = 0; i < nr_cpus; i++) {
+ pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+ if (pmu_fd[i] < 0)
+ printf("event syscall failed ****\n");
+
+ bpf_update_elem(map_fd[0], &i, (pmu_fd + i), BPF_ANY);
+
+ ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+ }
+
+ system("ls");
+ system("pwd");
+ system("sleep 2");
+
+ for(i = 0; i < nr_cpus; i++)
+ close(pmu_fd[i]);
+
+ close(map_fd);
+
+ free(pmu_fd);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ test_bpf_perf_event();
+
+ return 0;
+}
--
1.8.3.4
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v3 3/3] samples/bpf: example of get selected PMU counter value
2015-07-23 9:42 ` [PATCH v3 3/3] samples/bpf: example of get selected PMU counter value Kaixu Xia
@ 2015-07-23 22:59 ` Alexei Starovoitov
2015-07-24 1:54 ` xiakaixu
0 siblings, 1 reply; 15+ messages in thread
From: Alexei Starovoitov @ 2015-07-23 22:59 UTC (permalink / raw)
To: Kaixu Xia, davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa
Cc: wangnan0, linux-kernel, pi3orama, hekuang
On 7/23/15 2:42 AM, Kaixu Xia wrote:
> This is a simple example and shows how to use the new ability
> to get the selected Hardware PMU counter value.
>
> Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
...
> +struct bpf_map_def SEC("maps") my_map = {
> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> + .key_size = sizeof(int),
> + .value_size = sizeof(unsigned long),
> + .max_entries = 32,
> +};
wait. how did it work here? value_size should be u32.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 3/3] samples/bpf: example of get selected PMU counter value
2015-07-23 22:59 ` Alexei Starovoitov
@ 2015-07-24 1:54 ` xiakaixu
2015-07-24 2:23 ` Alexei Starovoitov
0 siblings, 1 reply; 15+ messages in thread
From: xiakaixu @ 2015-07-24 1:54 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa,
wangnan0, linux-kernel, pi3orama, hekuang
于 2015/7/24 6:59, Alexei Starovoitov 写道:
> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>> This is a simple example and shows how to use the new ability
>> to get the selected Hardware PMU counter value.
>>
>> Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
> ...
>> +struct bpf_map_def SEC("maps") my_map = {
>> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>> + .key_size = sizeof(int),
>> + .value_size = sizeof(unsigned long),
>> + .max_entries = 32,
>> +};
>
> wait. how did it work here? value_size should be u32.
I tested the whole thing on an ARM board. You are right, it
should be u32.
When creating the array map, we choose array->elem_size as
round_up(attr->value_size, 8). Why 8?
Thanks!
>
>
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 3/3] samples/bpf: example of get selected PMU counter value
2015-07-24 1:54 ` xiakaixu
@ 2015-07-24 2:23 ` Alexei Starovoitov
0 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2015-07-24 2:23 UTC (permalink / raw)
To: xiakaixu
Cc: davem, acme, mingo, a.p.zijlstra, masami.hiramatsu.pt, jolsa,
wangnan0, linux-kernel, pi3orama, hekuang
On 7/23/15 6:54 PM, xiakaixu wrote:
> 于 2015/7/24 6:59, Alexei Starovoitov 写道:
>> On 7/23/15 2:42 AM, Kaixu Xia wrote:
>>> This is a simple example and shows how to use the new ability
>>> to get the selected Hardware PMU counter value.
>>>
>>> Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
>> ...
>>> +struct bpf_map_def SEC("maps") my_map = {
>>> + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>>> + .key_size = sizeof(int),
>>> + .value_size = sizeof(unsigned long),
>>> + .max_entries = 32,
>>> +};
>>
>> wait. how did it work here? value_size should be u32.
>
> I tested the whole thing on an ARM board. You are right, it
> should be u32.
> When creating the array map, we choose array->elem_size as
> round_up(attr->value_size, 8). Why 8?
Because from the user space point of view we're storing FDs,
which are u32, but the kernel stores pointers.
round_up(attr->value_size, 8) is done because there can be
8-byte fields in there and we have 8-byte load/store insns.
So whether the pointer is 32-bit or 64-bit, it still fits.
^ permalink raw reply [flat|nested] 15+ messages in thread