* [RFC PATCH 1/2] perf/namespaces: Add a new namespace for isolated tracing
From: Aravinda Prasad @ 2016-06-14 16:49 UTC
To: a.p.zijlstra, linux-kernel, rostedt, mingo, paulus, acme, ebiederm
Cc: hbathini, ananth
From: Hari Bathini <hbathini@linux.vnet.ibm.com>
This patch adds a new namespace to the kernel, in line with the existing
namespaces such as pid, uts, etc. The aim of this namespace is to support
tracing that is isolated to the context of the namespace.
Signed-off-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
---
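For illustration, a minimal (untested) userspace sketch of how a task could be
moved into a new perf namespace with this series applied; it assumes the
CLONE_NEWPERF value and the /proc/<pid>/ns/perf file introduced below:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#ifndef CLONE_NEWPERF
#define CLONE_NEWPERF 0x00001000	/* value proposed by this patch */
#endif

int main(void)
{
	char buf[64];
	ssize_t n;

	/* Detach from the initial perf namespace; unshare_nsproxy_namespaces()
	 * requires CAP_SYS_ADMIN for this. */
	if (unshare(CLONE_NEWPERF) < 0) {
		perror("unshare(CLONE_NEWPERF)");
		exit(1);
	}

	/* The new namespace is exposed as /proc/self/ns/perf (perfns_operations). */
	n = readlink("/proc/self/ns/perf", buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("now in %s\n", buf);
	}
	return 0;
}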
fs/proc/namespaces.c | 4 +
include/linux/nsproxy.h | 2 +
include/linux/perf_namespace.h | 52 +++++++++++++++++
include/linux/proc_ns.h | 2 +
include/uapi/linux/sched.h | 1
init/Kconfig | 7 ++
kernel/Makefile | 1
kernel/fork.c | 3 +
kernel/nsproxy.c | 20 ++++++
kernel/perf_namespace.c | 124 ++++++++++++++++++++++++++++++++++++++++
10 files changed, 213 insertions(+), 3 deletions(-)
create mode 100644 include/linux/perf_namespace.h
create mode 100644 kernel/perf_namespace.c
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 51b8b0a..f9812fc 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -8,6 +8,7 @@
#include <linux/ipc_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/perf_namespace.h>
#include "internal.h"
@@ -31,6 +32,9 @@ static const struct proc_ns_operations *ns_entries[] = {
#ifdef CONFIG_CGROUPS
&cgroupns_operations,
#endif
+#ifdef CONFIG_PERF_NS
+ &perfns_operations,
+#endif
};
static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index ac0d65b..7e83e63 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -9,6 +9,7 @@ struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
struct cgroup_namespace;
+struct perf_namespace;
struct fs_struct;
/*
@@ -35,6 +36,7 @@ struct nsproxy {
struct pid_namespace *pid_ns_for_children;
struct net *net_ns;
struct cgroup_namespace *cgroup_ns;
+ struct perf_namespace *perf_ns;
};
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/perf_namespace.h b/include/linux/perf_namespace.h
new file mode 100644
index 0000000..9713724
--- /dev/null
+++ b/include/linux/perf_namespace.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_PERF_NS_H
+#define _LINUX_PERF_NS_H
+
+#include <linux/nsproxy.h>
+#include <linux/kref.h>
+#include <linux/ns_common.h>
+
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+struct perf_namespace {
+ struct kref kref;
+ struct user_namespace *user_ns; /* Owning user namespace */
+ struct ns_common ns;
+};
+extern struct perf_namespace init_perf_ns;
+
+#ifdef CONFIG_PERF_NS
+extern struct perf_namespace *copy_perf_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct perf_namespace *old_ns);
+extern void free_perf_ns(struct kref *kref);
+
+static inline void get_perf_ns(struct perf_namespace *ns)
+{
+ kref_get(&ns->kref);
+}
+
+static inline void put_perf_ns(struct perf_namespace *ns)
+{
+ kref_put(&ns->kref, free_perf_ns);
+}
+
+#else /* !CONFIG_PERF_NS */
+static inline void get_perf_ns(struct perf_namespace *ns)
+{
+}
+
+static inline void put_perf_ns(struct perf_namespace *ns)
+{
+}
+
+static inline struct perf_namespace *copy_perf_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct perf_namespace *old_ns)
+{
+ if (flags & CLONE_NEWPERF)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+#endif /* CONFIG_PERF_NS */
+
+#endif /* _LINUX_PERF_NS_H */
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index de0e771..c2916a7 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -27,6 +27,7 @@ extern const struct proc_ns_operations pidns_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations perfns_operations;
/*
* We always define these enumerators
@@ -38,6 +39,7 @@ enum {
PROC_USER_INIT_INO = 0xEFFFFFFDU,
PROC_PID_INIT_INO = 0xEFFFFFFCU,
PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
+ PROC_PERF_INIT_INO = 0xEFFFFFFAU,
};
#ifdef CONFIG_PROC_FS
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5f0fe01..6a13d40 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -9,6 +9,7 @@
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
+#define CLONE_NEWPERF 0x00001000 /* New perf namespace */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
diff --git a/init/Kconfig b/init/Kconfig
index f755a60..e0b23f2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1221,6 +1221,13 @@ config NET_NS
Allow user space to create what appear to be multiple instances
of the network stack.
+config PERF_NS
+ bool "Perf Namespaces"
+ default y if PERF_EVENTS
+ help
+ Support perf namespaces. This provides isolated tracing support
+ within the context of a perf namespace.
+
endif # NAMESPACES
config SCHED_AUTOGROUP
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e..ee94119 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
+obj-$(CONFIG_PERF_NS) += perf_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 5c2c355..d53756c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1926,7 +1926,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
+ CLONE_NEWUSER|CLONE_NEWPID|
+ CLONE_NEWCGROUP|CLONE_NEWPERF))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e..b9a9831 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/perf_namespace.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -43,6 +44,9 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_CGROUPS
.cgroup_ns = &init_cgroup_ns,
#endif
+#ifdef CONFIG_PERF_NS
+ .perf_ns = &init_perf_ns,
+#endif
};
static inline struct nsproxy *create_nsproxy(void)
@@ -103,6 +107,12 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_cgroup;
}
+ new_nsp->perf_ns = copy_perf_ns(flags, user_ns, tsk->nsproxy->perf_ns);
+ if (IS_ERR(new_nsp->perf_ns)) {
+ err = PTR_ERR(new_nsp->perf_ns);
+ goto out_perf;
+ }
+
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
@@ -113,6 +123,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
out_net:
+ if (new_nsp->perf_ns)
+ put_perf_ns(new_nsp->perf_ns);
+out_perf:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
if (new_nsp->pid_ns_for_children)
put_pid_ns(new_nsp->pid_ns_for_children);
@@ -142,7 +155,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWCGROUP)))) {
+ CLONE_NEWCGROUP | CLONE_NEWPERF)))) {
get_nsproxy(old_ns);
return 0;
}
@@ -177,6 +190,8 @@ void free_nsproxy(struct nsproxy *ns)
put_uts_ns(ns->uts_ns);
if (ns->ipc_ns)
put_ipc_ns(ns->ipc_ns);
+ if (ns->perf_ns)
+ put_perf_ns(ns->perf_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
@@ -195,7 +210,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWPID |
+ CLONE_NEWCGROUP | CLONE_NEWPERF)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/perf_namespace.c b/kernel/perf_namespace.c
new file mode 100644
index 0000000..5b76fd8
--- /dev/null
+++ b/kernel/perf_namespace.c
@@ -0,0 +1,124 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/export.h>
+#include <linux/perf_namespace.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
+
+static struct perf_namespace *create_perf_ns(struct user_namespace *user_ns)
+{
+ struct perf_namespace *perf_ns;
+ int err;
+
+ perf_ns = kmalloc(sizeof(struct perf_namespace), GFP_KERNEL);
+ if (!perf_ns)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&perf_ns->kref);
+ err = ns_alloc_inum(&perf_ns->ns);
+ if (err) {
+ kfree(perf_ns);
+ return ERR_PTR(err);
+ }
+
+ perf_ns->ns.ops = &perfns_operations;
+ perf_ns->user_ns = get_user_ns(user_ns);
+ return perf_ns;
+}
+
+struct perf_namespace *copy_perf_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct perf_namespace *old_ns)
+{
+ struct perf_namespace *new_ns;
+
+ BUG_ON(!old_ns);
+ get_perf_ns(old_ns);
+
+ if (!(flags & CLONE_NEWPERF))
+ return old_ns;
+
+ new_ns = create_perf_ns(user_ns);
+
+ put_perf_ns(old_ns);
+ return new_ns;
+}
+
+void free_perf_ns(struct kref *kref)
+{
+ struct perf_namespace *ns;
+
+ ns = container_of(kref, struct perf_namespace, kref);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+
+static inline struct perf_namespace *to_perf_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct perf_namespace, ns);
+}
+
+static struct ns_common *perfns_get(struct task_struct *task)
+{
+ struct perf_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->perf_ns;
+ get_perf_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void perfns_put(struct ns_common *ns)
+{
+ put_perf_ns(to_perf_ns(ns));
+}
+
+static int perfns_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+ struct perf_namespace *ns = to_perf_ns(new);
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ get_perf_ns(ns);
+ put_perf_ns(nsproxy->perf_ns);
+ nsproxy->perf_ns = ns;
+ return 0;
+}
+
+const struct proc_ns_operations perfns_operations = {
+ .name = "perf",
+ .type = CLONE_NEWPERF,
+ .get = perfns_get,
+ .put = perfns_put,
+ .install = perfns_install,
+};
+
+/*
+ * TODO: Find a better place to put this..
+ */
+struct perf_namespace init_perf_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+ .user_ns = &init_user_ns,
+ .ns.inum = PROC_PERF_INIT_INO,
+#ifdef CONFIG_PERF_NS
+ .ns.ops = &perfns_operations,
+#endif
+};
+EXPORT_SYMBOL_GPL(init_perf_ns);
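For completeness, a similar untested sketch of joining an existing perf
namespace through the /proc/<pid>/ns/perf file, which exercises the
perfns_install() path above; the CLONE_NEWPERF value is the one proposed by
this patch:

#define _GNU_SOURCE
#include <sched.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef CLONE_NEWPERF
#define CLONE_NEWPERF 0x00001000	/* value proposed by this patch */
#endif

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		exit(1);
	}

	snprintf(path, sizeof(path), "/proc/%s/ns/perf", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	/* perfns_install() checks CAP_SYS_ADMIN in both the target namespace's
	 * owning user namespace and the caller's user namespace. */
	if (setns(fd, CLONE_NEWPERF) < 0) {
		perror("setns");
		exit(1);
	}
	return 0;
}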
* [RFC PATCH 2/2] perf: Filter events based on perf-namespace
From: Aravinda Prasad @ 2016-06-14 16:49 UTC
To: a.p.zijlstra, linux-kernel, rostedt, mingo, paulus, acme, ebiederm
Cc: hbathini, ananth
Whenever the perf tool is executed inside a container, this
patch restricts the events to the perf namespace in which
the perf tool is executing.
This patch is based on the existing support for tracing
with cgroups.
TODO:
- Avoid code duplication.
Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
---
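For illustration, an untested sketch of the kind of usage this filtering
targets: a CPU-wide counter opened from inside a non-init perf namespace
would, per this RFC, be tagged with that namespace by perf_perfns_connect()
and be scheduled only while tasks of the same namespace run on that CPU
(perf_perfns_match()). perf_event_open(2) itself is unchanged by this series:

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.disabled = 1;

	/* pid == -1, cpu == 0: a CPU-wide event, the case handled by
	 * perf_perfns_connect() in this patch. */
	fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	sleep(1);
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles: %llu\n", (unsigned long long)count);
	return 0;
}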
include/linux/perf_event.h | 8 +
include/linux/perf_namespace.h | 6 +
kernel/events/core.c | 347 ++++++++++++++++++++++++++++++++++++++++
kernel/perf_namespace.c | 8 +
4 files changed, 368 insertions(+), 1 deletion(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a827ce..8d797d9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -43,6 +43,7 @@ struct perf_guest_info_callbacks {
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
+#include <linux/perf_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
@@ -656,6 +657,11 @@ struct perf_event {
struct rcu_head rcu_head;
struct pid_namespace *ns;
+#ifdef CONFIG_PERF_NS
+ struct perf_namespace *perf_ns;
+ int perfns_defer_enabled;
+#endif
+
u64 id;
u64 (*clock)(void);
@@ -725,6 +731,7 @@ struct perf_event_context {
u64 generation;
int pin_count;
int nr_cgroups; /* cgroup evts */
+ int nr_perfns;
void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
};
@@ -751,6 +758,7 @@ struct perf_cpu_context {
struct pmu *unique_pmu;
struct perf_cgroup *cgrp;
+ struct perf_namespace *perf_ns;
};
struct perf_output_handle {
diff --git a/include/linux/perf_namespace.h b/include/linux/perf_namespace.h
index 9713724..2aad0e9 100644
--- a/include/linux/perf_namespace.h
+++ b/include/linux/perf_namespace.h
@@ -8,8 +8,14 @@
struct user_namespace;
extern struct user_namespace init_user_ns;
+struct perf_ns_info {
+ u64 time;
+ u64 timestamp;
+};
+
struct perf_namespace {
struct kref kref;
+ struct perf_ns_info __percpu *info;
struct user_namespace *user_ns; /* Owning user namespace */
struct ns_common ns;
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450e..757a169 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -334,6 +334,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_perfns_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static atomic_t nr_mmap_events __read_mostly;
@@ -914,6 +915,288 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
#endif
+#ifdef CONFIG_PERF_NS
+static inline bool perf_perfns_match(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ /* @event doesn't care about perfns */
+ if (!event->perf_ns)
+ return true;
+
+ if (cpuctx->perf_ns != event->perf_ns)
+ return false;
+
+ return true;
+}
+
+static inline void perf_detach_perfns(struct perf_event *event)
+{
+ event->perf_ns = NULL;
+}
+
+static inline int is_perfns_event(struct perf_event *event)
+{
+ return event->perf_ns != NULL;
+}
+
+static inline u64 perf_perfns_event_time(struct perf_event *event)
+{
+ struct perf_ns_info *t;
+
+ t = per_cpu_ptr(event->perf_ns->info, event->cpu);
+ return t ? t->time : 0;
+}
+
+static inline void __update_perfns_time(struct perf_namespace *p_ns)
+{
+ struct perf_ns_info *info;
+ u64 now;
+
+ now = perf_clock();
+
+ if (!p_ns->info)
+ return;
+
+ info = this_cpu_ptr(p_ns->info);
+
+ info->time += now - info->timestamp;
+ info->timestamp = now;
+}
+
+static inline void update_perfns_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+ struct perf_namespace *perfns_out = cpuctx->perf_ns;
+
+ if (perfns_out)
+ __update_perfns_time(perfns_out);
+}
+
+static inline void update_perfns_time_from_event(struct perf_event *event)
+{
+ struct perf_namespace *perf_ns = current->nsproxy->perf_ns;
+
+ if (!is_perfns_event(event))
+ return;
+
+ if (perf_ns == event->perf_ns)
+ __update_perfns_time(event->perf_ns);
+}
+
+static inline void
+perf_perfns_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+ struct perf_namespace *perf_ns = task->nsproxy->perf_ns;
+ struct perf_ns_info *info;
+
+ if (!task || !ctx->nr_perfns)
+ return;
+
+ if (!perf_ns->info)
+ return;
+
+ info = this_cpu_ptr(perf_ns->info);
+ info->timestamp = ctx->timestamp;
+}
+
+#define PERF_PERFNS_SWOUT 0x1 /* perfns switch out every event */
+#define PERF_PERFNS_SWIN 0x2 /* perfns switch in events based on task */
+
+/*
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on perfns for next
+ */
+static void perf_perfns_switch(struct task_struct *task, int mode)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ continue; /* ensure we process each cpuctx once */
+
+ if (cpuctx->ctx.nr_perfns > 0) {
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ if (mode & PERF_PERFNS_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->perf_ns = NULL;
+ }
+
+ if (mode & PERF_PERFNS_SWIN) {
+ WARN_ON_ONCE(cpuctx->perf_ns);
+
+ cpuctx->perf_ns = task->nsproxy->perf_ns;
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+ }
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ local_irq_restore(flags);
+}
+
+static inline void perf_perfns_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+ rcu_read_lock();
+ perf_perfns_switch(task, PERF_PERFNS_SWOUT);
+ rcu_read_unlock();
+}
+
+static inline void perf_perfns_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+ rcu_read_lock();
+
+ if (task->nsproxy->perf_ns != &init_perf_ns)
+ perf_perfns_switch(task, PERF_PERFNS_SWIN);
+
+ rcu_read_unlock();
+}
+
+static inline int perf_perfns_connect(struct perf_event *event,
+ struct perf_event *group_leader)
+{
+ if (current->nsproxy->perf_ns != &init_perf_ns) {
+ /*
+ * If we are called from our own perf namespace, set
+ * event->perf_ns
+ */
+ event->perf_ns = current->nsproxy->perf_ns;
+
+ if (group_leader && group_leader->perf_ns != event->perf_ns) {
+ perf_detach_perfns(event);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static inline void
+perf_perfns_set_shadow_time(struct perf_event *event, u64 now)
+{
+ struct perf_ns_info *t;
+
+ t = per_cpu_ptr(event->perf_ns->info, event->cpu);
+ event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_perfns_defer_enabled(struct perf_event *event)
+{
+ if (is_perfns_event(event) && !perf_perfns_match(event))
+ event->perfns_defer_enabled = 1;
+}
+
+static inline void
+perf_perfns_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
+
+ if (!event->perfns_defer_enabled)
+ return;
+
+ event->perfns_defer_enabled = 0;
+
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ sub->perfns_defer_enabled = 0;
+ }
+ }
+}
+#else /* !CONFIG_PERF_NS */
+static inline bool perf_perfns_match(struct perf_event *event)
+{
+ return true;
+}
+
+static inline void perf_detach_perfns(struct perf_event *event)
+{}
+
+static inline int is_perfns_event(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline u64 perf_perfns_event_perfns_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void update_perfns_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_perfns_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_perfns_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+}
+
+static inline void perf_perfns_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+}
+
+static inline void
+perf_perfns_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+}
+
+void
+perf_perfns_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+
+static inline int perf_perfns_connect(struct perf_event *event,
+ struct perf_event *group_leader)
+{
+ return 0;
+}
+
+static inline void
+perf_perfns_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_perfns_event_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void
+perf_perfns_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_perfns_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+}
+#endif /* CONFIG_PERF_NS */
+
/*
* set default to be dependent on timer tick just
* like original code
@@ -1311,6 +1594,9 @@ static u64 perf_event_time(struct perf_event *event)
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
+ if (is_perfns_event(event))
+ return perf_perfns_event_time(event);
+
return ctx ? ctx->time : 0;
}
@@ -1340,6 +1626,8 @@ static void update_event_times(struct perf_event *event)
*/
if (is_cgroup_event(event))
run_end = perf_cgroup_event_time(event);
+ else if (is_perfns_event(event))
+ run_end = perf_perfns_event_time(event);
else if (ctx->is_active)
run_end = ctx->time;
else
@@ -1407,6 +1695,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event))
ctx->nr_cgroups++;
+ if (is_perfns_event(event))
+ ctx->nr_perfns++;
+
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
@@ -1601,6 +1892,13 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}
+ if (is_perfns_event(event)) {
+ ctx->nr_perfns--;
+ cpuctx = __get_cpu_context(ctx);
+ if (!ctx->nr_perfns)
+ cpuctx->perf_ns = NULL;
+ }
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1688,7 +1986,8 @@ static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event) && pmu_filter_match(event);
+ && perf_cgroup_match(event) && pmu_filter_match(event)
+ && perf_perfns_match(event);
}
static void
@@ -1821,6 +2120,7 @@ static void __perf_event_disable(struct perf_event *event,
update_context_time(ctx);
update_cgrp_time_from_event(event);
+ update_perfns_time_from_event(event);
update_group_times(event);
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
@@ -1907,6 +2207,8 @@ static void perf_set_shadow_time(struct perf_event *event,
*/
if (is_cgroup_event(event))
perf_cgroup_set_shadow_time(event, tstamp);
+ else if (is_perfns_event(event))
+ perf_perfns_set_shadow_time(event, tstamp);
else
event->shadow_ctx_time = tstamp - ctx->timestamp;
}
@@ -2300,6 +2602,8 @@ static void __perf_event_enable(struct perf_event *event,
if (!event_filter_match(event)) {
if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
+ if (is_perfns_event(event))
+ perf_perfns_defer_enabled(event);
ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
return;
}
@@ -2546,6 +2850,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
/* update (and stop) ctx time */
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
+ update_perfns_time_from_cpuctx(cpuctx);
}
is_active ^= ctx->is_active; /* changed bits */
@@ -2837,6 +3142,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
*/
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_out(task, next);
+
+ if (atomic_read(this_cpu_ptr(&perf_perfns_events)))
+ perf_perfns_sched_out(task, next);
}
/*
@@ -2864,6 +3172,9 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
if (is_cgroup_event(event))
perf_cgroup_mark_enabled(event, ctx);
+ if (is_perfns_event(event))
+ perf_perfns_mark_enabled(event, ctx);
+
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
@@ -2900,6 +3211,9 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
if (is_cgroup_event(event))
perf_cgroup_mark_enabled(event, ctx);
+ if (is_perfns_event(event))
+ perf_perfns_mark_enabled(event, ctx);
+
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -2936,6 +3250,7 @@ ctx_sched_in(struct perf_event_context *ctx,
now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);
+ perf_perfns_set_timestamp(task, ctx);
}
/*
@@ -3008,6 +3323,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
+ if (atomic_read(this_cpu_ptr(&perf_perfns_events)))
+ perf_perfns_sched_in(prev, task);
+
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (likely(!ctx))
@@ -3353,6 +3671,7 @@ static void __perf_event_read(void *info)
if (ctx->is_active) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
+ update_perfns_time_from_event(event);
}
update_event_times(event);
@@ -3477,6 +3796,7 @@ static int perf_event_read(struct perf_event *event, bool group)
if (ctx->is_active) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
+ update_perfns_time_from_event(event);
}
if (group)
update_group_times(event);
@@ -3672,6 +3992,9 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
if (is_cgroup_event(event))
atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+
+ if (is_perfns_event(event))
+ atomic_dec(&per_cpu(perf_perfns_events, cpu));
}
#ifdef CONFIG_NO_HZ_FULL
@@ -3719,6 +4042,8 @@ static void unaccount_event(struct perf_event *event)
}
if (is_cgroup_event(event))
dec = true;
+ if (is_perfns_event(event))
+ dec = true;
if (has_branch_stack(event))
dec = true;
@@ -3847,6 +4172,9 @@ static void _free_event(struct perf_event *event)
if (is_cgroup_event(event))
perf_detach_cgroup(event);
+ if (is_perfns_event(event))
+ perf_detach_perfns(event);
+
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
put_callchain_buffers();
@@ -8655,6 +8983,9 @@ static void account_event_cpu(struct perf_event *event, int cpu)
if (is_cgroup_event(event))
atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+
+ if (is_perfns_event(event))
+ atomic_inc(&per_cpu(perf_perfns_events, cpu));
}
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
@@ -8703,6 +9034,8 @@ static void account_event(struct perf_event *event)
inc = true;
if (is_cgroup_event(event))
inc = true;
+ if (is_perfns_event(event))
+ inc = true;
if (inc) {
if (atomic_inc_not_zero(&perf_sched_count))
@@ -8851,6 +9184,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_ns;
}
+ if (!task) {
+ err = perf_perfns_connect(event, group_leader);
+ if (err)
+ goto err_ns;
+ }
+
pmu = perf_init_event(event);
if (!pmu)
goto err_ns;
@@ -8900,6 +9239,8 @@ err_pmu:
err_ns:
if (is_cgroup_event(event))
perf_detach_cgroup(event);
+ if (is_perfns_event(event))
+ perf_detach_perfns(event);
if (event->ns)
put_pid_ns(event->ns);
kfree(event);
@@ -10367,6 +10708,10 @@ void __init perf_event_init(void)
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+ init_perf_ns.info = alloc_percpu(struct perf_ns_info);
+ if (!(init_perf_ns.info))
+ WARN(-ENOMEM, "perf namespace memory allocation failed");
+
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.
diff --git a/kernel/perf_namespace.c b/kernel/perf_namespace.c
index 5b76fd8..7991a93 100644
--- a/kernel/perf_namespace.c
+++ b/kernel/perf_namespace.c
@@ -30,6 +30,13 @@ static struct perf_namespace *create_perf_ns(struct user_namespace *user_ns)
perf_ns->ns.ops = &perfns_operations;
perf_ns->user_ns = get_user_ns(user_ns);
+
+ perf_ns->info = alloc_percpu(struct perf_ns_info);
+ if (!perf_ns->info) {
+ put_user_ns(perf_ns->user_ns);
+ ns_free_inum(&perf_ns->ns);
+ kfree(perf_ns);
+ return ERR_PTR(-ENOMEM);
+ }
+
return perf_ns;
}
@@ -115,6 +122,7 @@ struct perf_namespace init_perf_ns = {
.kref = {
.refcount = ATOMIC_INIT(2),
},
+ .info = NULL,
.user_ns = &init_user_ns,
.ns.inum = PROC_PERF_INIT_INO,
#ifdef CONFIG_PERF_NS