From: kan.liang@linux.intel.com
To: peterz@infradead.org, mingo@redhat.com, acme@kernel.org,
tglx@linutronix.de, bp@alien8.de, linux-kernel@vger.kernel.org
Cc: eranian@google.com, alexey.budankov@linux.intel.com,
vitaly.slobodskoy@intel.com, ak@linux.intel.com,
Kan Liang <kan.liang@linux.intel.com>
Subject: [RFC PATCH 3/8] perf: Init/fini PMU specific data
Date: Thu, 28 Nov 2019 07:14:26 -0800 [thread overview]
Message-ID: <1574954071-6321-3-git-send-email-kan.liang@linux.intel.com> (raw)
In-Reply-To: <1574954071-6321-1-git-send-email-kan.liang@linux.intel.com>
From: Kan Liang <kan.liang@linux.intel.com>
For per-process event, only allocate the space for current task.
The space will be allocated by the first user.
For system-wide event, allocation for all the existing tasks and
upcoming tasks are required. Add variable nr_task_data_events to track
the number of system-wide event.
In perf_event_alloc(), the space for all the existing tasks will be
allocated.
The space for new tasks will be allocated in perf_event_fork().
The allocation may be failed. For per-process event, it error out.
For system-wide event, it doesn't error out, a debug message will be
dumped to system log instead. LBR callstack may be cutoff for the task
which doesn't have the space allocated.
Reviewed-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
kernel/events/core.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 209 insertions(+)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b4976e0..e519720 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -387,6 +387,10 @@ static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
+/* Track the number of system-wide event which requires pmu specific data */
+static atomic_t nr_task_data_events;
+static DEFINE_RAW_SPINLOCK(task_data_events_lock);
+
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
@@ -4552,6 +4556,104 @@ init_task_ctx_data_rcu(struct task_struct *task, size_t ctx_size, gfp_t flags)
return ret;
}
+static int
+init_task_ctx_data(struct task_struct *task, size_t ctx_size)
+{
+ struct perf_ctx_data *ctx_data;
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&task->perf_ctx_data_lock, flags);
+
+ ret = __init_task_ctx_data_rcu(task, ctx_size, GFP_KERNEL);
+ if (ret)
+ goto unlock;
+
+ ctx_data = task->perf_ctx_data;
+ /* System-wide event is active as well */
+ if ((ctx_data->refcount == 1) && atomic_read(&nr_task_data_events))
+ ctx_data->refcount++;
+
+unlock:
+ raw_spin_unlock_irqrestore(&task->perf_ctx_data_lock, flags);
+ return ret;
+}
+
+static int
+init_system_wide_ctx_data(size_t ctx_size)
+{
+ struct task_struct *g, *p;
+ int failed_alloc = 0;
+ unsigned long flags;
+
+ /*
+ * Allocate perf_ctx_data for all existing threads by the first event.
+ *
+ * The perf_ctx_data for new thread will be allocated in
+ * perf_event_fork(). The perf_event_fork() is called after the thread
+ * is added into the tasklist. It guarantees that any new threads will
+ * not be missed.
+ */
+ raw_spin_lock_irqsave(&task_data_events_lock, flags);
+ if (atomic_inc_return(&nr_task_data_events) > 1)
+ goto unlock;
+
+ read_lock(&tasklist_lock);
+
+ for_each_process_thread(g, p) {
+ /*
+ * The PMU specific data may already be allocated by
+ * per-process event. Need to update refcounter.
+ * init_task_ctx_data_rcu() is called here.
+ * Do a quick allocation in first round with GFP_ATOMIC.
+ */
+ if (init_task_ctx_data_rcu(p, ctx_size, GFP_ATOMIC))
+ failed_alloc++;
+ }
+
+ /*
+ * Failed to allocate the ctx data for some tasks.
+ * Repeat the allocation.
+ */
+ if (!failed_alloc)
+ goto tasklist_unlock;
+
+ failed_alloc = 0;
+ for_each_process_thread(g, p) {
+ /*
+ * Doesn't need to update refcounter for the task which
+ * is monitored by per-process event, or new created
+ * PMU specific data. They are done in first round allocation.
+ * alloc_task_ctx_data_rcu() is called here.
+ */
+ if (alloc_task_ctx_data_rcu(p, ctx_size, GFP_KERNEL) &&
+ ((++failed_alloc) == 1)) {
+ printk(KERN_DEBUG
+ "Failed to allocate space for LBR callstack for some tasks. "
+ "Their LBR callstack may be cutoff.\n");
+ }
+ }
+
+tasklist_unlock:
+ read_unlock(&tasklist_lock);
+unlock:
+ raw_spin_unlock_irqrestore(&task_data_events_lock, flags);
+
+ return 0;
+}
+
+static int
+init_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+ size_t ctx_size = event->pmu->task_ctx_size;
+
+ if (task)
+ return init_task_ctx_data(task, ctx_size);
+ else
+ return init_system_wide_ctx_data(ctx_size);
+}
+
static void
free_perf_ctx_data(struct rcu_head *rcu_head)
{
@@ -4594,6 +4696,40 @@ fini_task_ctx_data_rcu(struct task_struct *task, bool force)
raw_spin_unlock_irqrestore(&task->perf_ctx_data_lock, flags);
}
+static void fini_task_ctx_data(struct task_struct *task)
+{
+ fini_task_ctx_data_rcu(task, false);
+}
+
+static void fini_system_wide_ctx_data(void)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task_data_events_lock, flags);
+ if (!atomic_dec_and_test(&nr_task_data_events))
+ goto unlock;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p)
+ fini_task_ctx_data_rcu(p, false);
+
+ read_unlock(&tasklist_lock);
+
+unlock:
+ raw_spin_unlock_irqrestore(&task_data_events_lock, flags);
+}
+
+static void fini_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+
+ if (task)
+ fini_task_ctx_data(task);
+ else
+ fini_system_wide_ctx_data();
+}
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -4625,6 +4761,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_dec(&nr_bpf_events);
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ fini_perf_ctx_data(event);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7451,10 +7589,72 @@ static void perf_event_task(struct task_struct *task,
task_ctx);
}
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void perf_event_alloc_task_data(struct task_struct *child,
+ struct task_struct *parent)
+{
+ struct perf_ctx_data *ctx_data;
+ size_t ctx_size = 0;
+ unsigned long flags;
+
+ if (!atomic_read(&nr_task_data_events))
+ return;
+
+ rcu_read_lock();
+ ctx_data = rcu_dereference(parent->perf_ctx_data);
+ if (ctx_data)
+ ctx_size = ctx_data->data_size;
+ rcu_read_unlock();
+
+ if (!ctx_size)
+ return;
+
+ /*
+ * The refcount of a new task may not be updated correctly for some
+ * rare case as below.
+ * CPU A CPU B
+ * perf_event_alloc_task_data(): init_system_wide_ctx_data():
+ * inc(&nr_task_data_events)
+ * read(nr_task_data_events)
+ * alloc_task_ctx_data_rcu()
+ * init_task_ctx_data_rcu()
+ * The refcount of new task is double count.
+ * Lock is required to prevent the case by serializing the allocation.
+ */
+ raw_spin_lock_irqsave(&task_data_events_lock, flags);
+
+ /*
+ * System-wide event may be unaccount when attaching the perf_ctx_data.
+ * For example,
+ * CPU A CPU B
+ * perf_event_alloc_task_data(): fini_system_wide_ctx_data():
+ * read(nr_task_data_events)
+ * fini_task_ctx_data_rcu()
+ * alloc_task_ctx_data_rcu()
+ *
+ * The perf_ctx_data may never be freed until the task is terminated.
+ */
+ if (unlikely(!atomic_read(&nr_task_data_events)))
+ goto unlock;
+
+ if (alloc_task_ctx_data_rcu(child, ctx_size, GFP_KERNEL)) {
+ printk_once(KERN_DEBUG
+ "LBR callstack may be cutoff for task %s (%d) ctx_size %zu\n",
+ child->comm, task_pid_nr(child), ctx_size);
+ }
+
+unlock:
+ raw_spin_unlock_irqrestore(&task_data_events_lock, flags);
+}
+
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
perf_event_namespaces(task);
+ perf_event_alloc_task_data(task, current);
}
/*
@@ -10980,11 +11180,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (err)
goto err_callchain_buffer;
+ if ((event->attach_state & PERF_ATTACH_TASK_DATA) &&
+ init_perf_ctx_data(event))
+ goto err_task_ctx_data;
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
+err_task_ctx_data:
+ if (!event->parent && (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+ put_callchain_buffers();
err_callchain_buffer:
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
@@ -12046,6 +12253,8 @@ void perf_event_exit_task(struct task_struct *child)
* At this point we need to send EXIT events to cpu contexts.
*/
perf_event_task(child, NULL, 0);
+
+ fini_task_ctx_data_rcu(child, true);
}
static void perf_free_event(struct perf_event *event,
--
2.7.4
next prev parent reply other threads:[~2019-11-28 15:17 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-11-28 15:14 [RFC PATCH 1/8] perf: Save PMU specific data in task_struct kan.liang
2019-11-28 15:14 ` [RFC PATCH 2/8] perf: Helpers for alloc/init/fini PMU specific data kan.liang
2019-12-02 13:16 ` Peter Zijlstra
2019-12-02 15:25 ` Alexey Budankov
2019-12-02 20:35 ` Liang, Kan
2019-12-04 12:36 ` Peter Zijlstra
2019-11-28 15:14 ` kan.liang [this message]
2019-12-02 12:40 ` [RFC PATCH 3/8] perf: Init/fini " Peter Zijlstra
2019-12-02 14:59 ` Andi Kleen
2019-12-02 16:21 ` Peter Zijlstra
2019-12-02 19:15 ` Andi Kleen
2019-12-02 20:13 ` Liang, Kan
2019-12-02 20:25 ` Andi Kleen
2019-12-02 20:44 ` Liang, Kan
2019-12-04 13:12 ` Peter Zijlstra
2019-12-02 16:38 ` Alexey Budankov
2019-12-02 16:43 ` Peter Zijlstra
2019-12-02 17:42 ` Alexey Budankov
2019-11-28 15:14 ` [RFC PATCH 4/8] perf: Supply task information to sched_task() kan.liang
2019-11-28 15:14 ` [RFC PATCH 5/8] perf/x86/lbr: Fix shorter LBRs call stacks for system-wide mode kan.liang
2019-11-28 15:14 ` [RFC PATCH 6/8] perf/x86: Remove swap_task_ctx() kan.liang
2019-11-28 15:14 ` [RFC PATCH 7/8] perf: Clean up pmu specific data kan.liang
2019-11-28 15:14 ` [RFC PATCH 8/8] perf: Clean up event context from sched_task() kan.liang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1574954071-6321-3-git-send-email-kan.liang@linux.intel.com \
--to=kan.liang@linux.intel.com \
--cc=acme@kernel.org \
--cc=ak@linux.intel.com \
--cc=alexey.budankov@linux.intel.com \
--cc=bp@alien8.de \
--cc=eranian@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=tglx@linutronix.de \
--cc=vitaly.slobodskoy@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).