* [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2)
@ 2010-05-25 13:20 Stephane Eranian
  2010-05-25 13:35 ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Stephane Eranian @ 2010-05-25 13:20 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, mingo, paulus, davem, fweisbec, acme, ming.m.lin,
	perfmon2-devel, eranian, eranian

The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., the watchdog, and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPUs because events would move around more
often than on AMD. But the problem also existed on AMD, though it
was harder to expose.

The issues were:

 - group_sched_in() was missing a cancel_txn() in the error path

 - cpuc->n_added was not properly maintained, leading to missing
   actions in hw_perf_enable(), i.e., n_running being 0. You cannot
   update n_added until you know the transaction has succeeded. In
   case of failed transaction n_added was not adjusted back.

 - in case of failed transactions, event_sched_out() was called
   and eventually invoked x86_disable_event() to touch the HW reg.
   But with transactions, on X86, event_sched_in() does not touch
   HW registers, it simply collects events into a list. Thus, you
   could end up calling x86_disable_event() on a counter which
   did not correspond to the current event when idx != -1.
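
To make these concrete, the intended control flow of transactional
group scheduling looks roughly like this (a simplified sketch, not
the actual kernel code, annotated with where each issue hit):

	pmu->start_txn(pmu);			/* collect-only mode */

	if (event_sched_in(group_event, cpuctx, ctx)) {
		pmu->cancel_txn(pmu);		/* issue 1: this call was missing */
		return -EAGAIN;
	}

	/* schedule in the siblings */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx))
			goto group_error;
	}

	if (!txn || !pmu->commit_txn(pmu))	/* one schedulability test */
		return 0;

group_error:
	/* undo the partial group without touching the HW regs (issue 3) */
	...event_sched_out() each scheduled event...
	pmu->cancel_txn(pmu);			/* rolls back n_added (issue 2) */
	return -EAGAIN;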

The patch modifies the generic and X86 code to avoid all those problems.

First, we keep track of the number of events added last. In case the
transaction fails, we subtract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.

Second, we encapsulate the event_sched_in() and event_sched_out()
calls in group_sched_in() inside the transaction. That makes the
operations symmetrical, and you can also detect that you are inside
a transaction and skip the HW reg access by checking cpuc->group_flag.
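
In code terms, the bookkeeping amounts to the following (a sketch
using the fields introduced by the X86 patch below):

	/* x86_pmu_enable(): speculatively account the new events */
	cpuc->n_added += n - n0;
	cpuc->n_events_trans = n - n0;	/* remembered for rollback */

	/* x86_pmu_cancel_txn(): the transaction failed, undo */
	cpuc->n_added -= cpuc->n_events_trans;
	cpuc->n_events_trans = 0;

	/* x86_pmu_commit_txn(): success, nothing to undo later */
	cpuc->n_events_trans = 0;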

With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.

Signed-off-by: Stephane Eranian <eranian@google.com>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c775860..3ea4c5c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -105,6 +105,7 @@ struct cpu_hw_events {
 	int			enabled;
 
 	int			n_events;
+	int			n_events_trans;
 	int			n_added;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
@@ -591,7 +592,7 @@ void hw_perf_disable(void)
 	if (!cpuc->enabled)
 		return;
 
-	cpuc->n_added = 0;
+	cpuc->n_added = cpuc->n_events_trans = 0;
 	cpuc->enabled = 0;
 	barrier();
 
@@ -820,7 +821,7 @@ void hw_perf_enable(void)
 		 * apply assignment obtained either from
 		 * hw_perf_group_sched_in() or x86_pmu_enable()
 		 *
-		 * step1: save events moving to new counters
+		 * step1: stop-save old events moving to new counters
 		 * step2: reprogram moved events into new counters
 		 */
 		for (i = 0; i < n_running; i++) {
@@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
 out:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
+	cpuc->n_events_trans = n - n0;
 
 	return 0;
 }
@@ -1070,7 +1072,7 @@ static void x86_pmu_stop(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
 
-	if (!__test_and_clear_bit(idx, cpuc->active_mask))
+	if (idx == -1 || !__test_and_clear_bit(idx, cpuc->active_mask))
 		return;
 
 	x86_pmu.disable(event);
@@ -1087,14 +1089,30 @@ static void x86_pmu_stop(struct perf_event *event)
 static void x86_pmu_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	bool no_trans;
 	int i;
 
-	x86_pmu_stop(event);
+	/*
+	 * when coming from a transaction, the event has not
+	 * actually been scheduled in yet. It has only been collected
+	 * (collect_events), thus we cannot apply a full disable:
+	 * - put_constraints() assumes we went thru schedule_events()
+	 * - x86_pmu_stop() assumes we went thru x86_pmu_start(), because
+	 *   idx may be != -1 and thus we may overwrite the wrong
+	 *   counter (e.g., a pinned event).
+	 *
+	 * When called from a transaction, we need to un-collect the
+	 * event, i.e., remove the event from event_list[]
+	 */
+	no_trans = !(cpuc->group_flag & PERF_EVENT_TXN_STARTED);
+
+	if (no_trans)
+		x86_pmu_stop(event);
 
 	for (i = 0; i < cpuc->n_events; i++) {
 		if (event == cpuc->event_list[i]) {
 
-			if (x86_pmu.put_event_constraints)
+			if (no_trans && x86_pmu.put_event_constraints)
 				x86_pmu.put_event_constraints(cpuc, event);
 
 			while (++i < cpuc->n_events)
@@ -1104,7 +1122,8 @@ static void x86_pmu_disable(struct perf_event *event)
 			break;
 		}
 	}
-	perf_event_update_userpage(event);
+	if (no_trans)
+		perf_event_update_userpage(event);
 }
 
 static int x86_pmu_handle_irq(struct pt_regs *regs)
@@ -1391,6 +1410,14 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+
+	/*
+	 * adjust for failed transaction
+	 * failed if n_events_trans != 0
+	 */
+	cpuc->n_added -= cpuc->n_events_trans;
+
+	cpuc->n_events_trans = 0;
 }
 
 /*
@@ -1419,6 +1446,14 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
+	/*
+	 * transaction succeeded, no need to adjust
+	 * n_added on cancel_txn()
+	 *
+	 * adjust in cancel_txn for cases with failures
+	 * before we reach commit_txn()
+	 */
+	cpuc->n_events_trans = 0;
 	return 0;
 }
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e099650..b521a0b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -652,8 +652,11 @@ group_sched_in(struct perf_event *group_event,
 	if (txn)
 		pmu->start_txn(pmu);
 
-	if (event_sched_in(group_event, cpuctx, ctx))
+	if (event_sched_in(group_event, cpuctx, ctx)) {
+		if (txn)
+			pmu->cancel_txn(pmu);
 		return -EAGAIN;
+	}
 
 	/*
 	 * Schedule in siblings as one group (if any):
@@ -675,8 +678,6 @@ group_sched_in(struct perf_event *group_event,
 	}
 
 group_error:
-	if (txn)
-		pmu->cancel_txn(pmu);
 
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
@@ -689,6 +690,9 @@ group_error:
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
+	if (txn)
+		pmu->cancel_txn(pmu);
+
 	return -EAGAIN;
 }
 


* Re: [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2)
  2010-05-25 13:20 [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2) Stephane Eranian
@ 2010-05-25 13:35 ` Peter Zijlstra
  2010-05-25 13:39   ` stephane eranian
  0 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2010-05-25 13:35 UTC (permalink / raw)
  To: eranian
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, acme, ming.m.lin,
	perfmon2-devel, eranian

On Tue, 2010-05-25 at 15:20 +0200, Stephane Eranian wrote:

> With this patch, you can now overcommit the PMU even with pinned
> system-wide events present and still get valid counts.

Does this patch differ from the one you sent earlier?


* Re: [PATCH] perf_events: fix event scheduling issues introduced by  transactional API (take 2)
  2010-05-25 13:35 ` Peter Zijlstra
@ 2010-05-25 13:39   ` stephane eranian
  2010-05-25 14:03     ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: stephane eranian @ 2010-05-25 13:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

On Tue, May 25, 2010 at 3:35 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, 2010-05-25 at 15:20 +0200, Stephane Eranian wrote:
>
>> With this patch, you can now overcommit the PMU even with pinned
>> system-wide events present and still get valid counts.
>
> Does this patch differ from the one you sent earlier?
>
Yes, it does. It changes the way n_added is updated.

The first version was buggy (perf top would crash the machine).
You cannot delay updating n_added until commit, because there
are paths where you don't go through transactions. This version
instead keeps the number of events last added and speculatively
updates n_added (assuming success). If the transaction succeeds,
then no correction is done (0 is subtracted); if it fails, then the number
of events just added is subtracted from n_added.


* Re: [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2)
  2010-05-25 13:39   ` stephane eranian
@ 2010-05-25 14:03     ` Peter Zijlstra
  2010-05-25 14:19       ` Stephane Eranian
  2010-05-31  7:20       ` [tip:perf/urgent] perf_events: Fix event scheduling issues introduced by transactional API tip-bot for Stephane Eranian
  0 siblings, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2010-05-25 14:03 UTC (permalink / raw)
  To: eranian
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

On Tue, 2010-05-25 at 15:39 +0200, stephane eranian wrote:
> On Tue, May 25, 2010 at 3:35 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, 2010-05-25 at 15:20 +0200, Stephane Eranian wrote:
> >
> >> With this patch, you can now overcommit the PMU even with pinned
> >> system-wide events present and still get valid counts.
> >
> > Does this patch differ from the one you sent earlier?
> >
> Yes, it does. It changes the way n_added is updated.
> 
> The first version was buggy (perf top would crash the machine).
> You cannot delay updating n_added until commit, because there
> are paths where you don't go through transactions. This version
> instead keeps the number of events last added and speculatively
> updates n_added (assuming success). If the transaction succeeds,
> then no correction is done (0 is subtracted); if it fails, then the number
> of events just added is subtracted from n_added.


OK, just to see if I got it right, I wrote a similar patch from just the
changelog.

Does the below look good?

---
 arch/x86/kernel/cpu/perf_event.c |   13 +++++++++++++
 kernel/perf_event.c              |   11 +++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 856aeeb..be84944 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -106,6 +106,7 @@ struct cpu_hw_events {
 
 	int			n_events;
 	int			n_added;
+	int			n_txn;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
@@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
 out:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
+	cpuc->n_txn += n - n0;
 
 	return 0;
 }
@@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int i;
 
+	/*
+	 * If we're called during a txn, we don't need to do anything.
+	 * The events never got scheduled and ->cancel_txn will truncate
+	 * the event_list.
+	 */
+	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+		return;
+
 	x86_pmu_stop(event);
 
 	for (i = 0; i < cpuc->n_events; i++) {
@@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->n_txn = 0;
 }
 
 /*
@@ -1391,6 +1402,8 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuc->n_added -= cpuc->n_txn;
+	cpuc->n_events -= cpuc->n_txn;
 }
 
 /*
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e099650..ca79f2a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -652,8 +652,11 @@ group_sched_in(struct perf_event *group_event,
 	if (txn)
 		pmu->start_txn(pmu);
 
-	if (event_sched_in(group_event, cpuctx, ctx))
+	if (event_sched_in(group_event, cpuctx, ctx)) {
+		if (txn)
+			pmu->cancel_txn(pmu);
 		return -EAGAIN;
+	}
 
 	/*
 	 * Schedule in siblings as one group (if any):
@@ -675,9 +678,6 @@ group_sched_in(struct perf_event *group_event,
 	}
 
 group_error:
-	if (txn)
-		pmu->cancel_txn(pmu);
-
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
@@ -689,6 +689,9 @@ group_error:
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
+	if (txn)
+		pmu->cancel_txn(pmu);
+
 	return -EAGAIN;
 }
 



* Re: [PATCH] perf_events: fix event scheduling issues introduced by  transactional API (take 2)
  2010-05-25 14:03     ` Peter Zijlstra
@ 2010-05-25 14:19       ` Stephane Eranian
  2010-05-25 15:02         ` Stephane Eranian
  2010-05-31  7:20       ` [tip:perf/urgent] perf_events: Fix event scheduling issues introduced by transactional API tip-bot for Stephane Eranian
  1 sibling, 1 reply; 15+ messages in thread
From: Stephane Eranian @ 2010-05-25 14:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

Ok, let me check. I think it is almost right.

On Tue, May 25, 2010 at 4:03 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, 2010-05-25 at 15:39 +0200, stephane eranian wrote:
>> On Tue, May 25, 2010 at 3:35 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>> > On Tue, 2010-05-25 at 15:20 +0200, Stephane Eranian wrote:
>> >
>> >> With this patch, you can now overcommit the PMU even with pinned
>> >> system-wide events present and still get valid counts.
>> >
>> > Does this patch differ from the one you sent earlier?
>> >
>> Yes, it does. It changes the way n_added is updated.
>>
>> The first version was buggy (perf top would crash the machine).
>> You cannot delay updating n_added until commit, because there
>> are paths where you don't go through transactions. This version
>> instead keeps the number of events last added and speculatively
>> updates n_added (assuming success). If the transaction succeeds,
>> then no correction is done (0 is subtracted); if it fails, then the number
>> of events just added is subtracted from n_added.
>
>
> OK, just to see if I got it right, I wrote a similar patch from just the
> changelog.
>
> Does the below look good?
>
> ---
>  arch/x86/kernel/cpu/perf_event.c |   13 +++++++++++++
>  kernel/perf_event.c              |   11 +++++++----
>  2 files changed, 20 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 856aeeb..be84944 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -106,6 +106,7 @@ struct cpu_hw_events {
>
>        int                     n_events;
>        int                     n_added;
> +       int                     n_txn;
>        int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
>        u64                     tags[X86_PMC_IDX_MAX];
>        struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
> @@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
>  out:
>        cpuc->n_events = n;
>        cpuc->n_added += n - n0;
> +       cpuc->n_txn += n - n0;
>
>        return 0;
>  }
> @@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event)
>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>        int i;
>
> +       /*
> +        * If we're called during a txn, we don't need to do anything.
> +        * The events never got scheduled and ->cancel_txn will truncate
> +        * the event_list.
> +        */
> +       if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
> +               return;
> +
>        x86_pmu_stop(event);
>
>        for (i = 0; i < cpuc->n_events; i++) {
> @@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>
>        cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
> +       cpuc->n_txn = 0;
>  }
>
>  /*
> @@ -1391,6 +1402,8 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>
>        cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
> +       cpuc->n_added -= cpuc->n_txn;
> +       cpuc->n_events -= cpuc->n_txn;
>  }
>
>  /*
> diff --git a/kernel/perf_event.c b/kernel/perf_event.c
> index e099650..ca79f2a 100644
> --- a/kernel/perf_event.c
> +++ b/kernel/perf_event.c
> @@ -652,8 +652,11 @@ group_sched_in(struct perf_event *group_event,
>        if (txn)
>                pmu->start_txn(pmu);
>
> -       if (event_sched_in(group_event, cpuctx, ctx))
> +       if (event_sched_in(group_event, cpuctx, ctx)) {
> +               if (txn)
> +                       pmu->cancel_txn(pmu);
>                return -EAGAIN;
> +       }
>
>        /*
>         * Schedule in siblings as one group (if any):
> @@ -675,9 +678,6 @@ group_sched_in(struct perf_event *group_event,
>        }
>
>  group_error:
> -       if (txn)
> -               pmu->cancel_txn(pmu);
> -
>        /*
>         * Groups can be scheduled in as one unit only, so undo any
>         * partial group before returning:
> @@ -689,6 +689,9 @@ group_error:
>        }
>        event_sched_out(group_event, cpuctx, ctx);
>
> +       if (txn)
> +               pmu->cancel_txn(pmu);
> +
>        return -EAGAIN;
>  }
>
>
>



-- 
Stephane Eranian  | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00


* Re: [PATCH] perf_events: fix event scheduling issues introduced by  transactional API (take 2)
  2010-05-25 14:19       ` Stephane Eranian
@ 2010-05-25 15:02         ` Stephane Eranian
  2010-05-25 15:32           ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Stephane Eranian @ 2010-05-25 15:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

Ok, the patch looks good except it needs:

static int x86_pmu_commit_txn(const struct pmu *pmu)
{
        ......
        /*
         * copy new assignment, now we know it is possible
         * will be used by hw_perf_enable()
         */
        memcpy(cpuc->assign, assign, n*sizeof(int));

        cpuc->n_txn = 0;

        return 0;
}

Because you always call cancel_txn() even when commit()
succeeds. I don't really understand why. I think it could be
avoided by clearing the group_flag in commit_txn() if it
succeeds. It would also make the logical flow more natural. Why
cancel something that has succeeded? You cancel when you fail/abort.
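
In other words, the suggested shape would be something like (sketch):

static int x86_pmu_commit_txn(const struct pmu *pmu)
{
        ......
        memcpy(cpuc->assign, assign, n*sizeof(int));

        /* close the transaction on success: no cancel_txn() needed */
        cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;

        return 0;
}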


On Tue, May 25, 2010 at 4:19 PM, Stephane Eranian <eranian@google.com> wrote:
> Ok, let me check. I think it is almost right.
>
> On Tue, May 25, 2010 at 4:03 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>> On Tue, 2010-05-25 at 15:39 +0200, stephane eranian wrote:
>>> On Tue, May 25, 2010 at 3:35 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> > On Tue, 2010-05-25 at 15:20 +0200, Stephane Eranian wrote:
>>> >
>>> >> With this patch, you can now overcommit the PMU even with pinned
>>> >> system-wide events present and still get valid counts.
>>> >
>>> > Does this patch differ from the one you sent earlier?
>>> >
>>> Yes, it does. It changes the way n_added is updated.
>>>
>>> The first version was buggy (perf top would crash the machine).
>>> You cannot delay updating n_added until commit, because there
>>> are paths where you don't go through transactions. This version
>>> instead keeps the number of events last added and speculatively
>>> updates n_added (assuming success). If the transaction succeeds,
>>> then no correction is done (0 is subtracted); if it fails, then the number
>>> of events just added is subtracted from n_added.
>>
>>
>> OK, just to see if I got it right, I wrote a similar patch from just the
>> changelog.
>>
>> Does the below look good?
>>
>> ---
>>  arch/x86/kernel/cpu/perf_event.c |   13 +++++++++++++
>>  kernel/perf_event.c              |   11 +++++++----
>>  2 files changed, 20 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>> index 856aeeb..be84944 100644
>> --- a/arch/x86/kernel/cpu/perf_event.c
>> +++ b/arch/x86/kernel/cpu/perf_event.c
>> @@ -106,6 +106,7 @@ struct cpu_hw_events {
>>
>>        int                     n_events;
>>        int                     n_added;
>> +       int                     n_txn;
>>        int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
>>        u64                     tags[X86_PMC_IDX_MAX];
>>        struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
>> @@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
>>  out:
>>        cpuc->n_events = n;
>>        cpuc->n_added += n - n0;
>> +       cpuc->n_txn += n - n0;
>>
>>        return 0;
>>  }
>> @@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event)
>>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>>        int i;
>>
>> +       /*
>> +        * If we're called during a txn, we don't need to do anything.
>> +        * The events never got scheduled and ->cancel_txn will truncate
>> +        * the event_list.
>> +        */
>> +       if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
>> +               return;
>> +
>>        x86_pmu_stop(event);
>>
>>        for (i = 0; i < cpuc->n_events; i++) {
>> @@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
>>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>>
>>        cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
>> +       cpuc->n_txn = 0;
>>  }
>>
>>  /*
>> @@ -1391,6 +1402,8 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
>>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>>
>>        cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
>> +       cpuc->n_added -= cpuc->n_txn;
>> +       cpuc->n_events -= cpuc->n_txn;
>>  }
>>
>>  /*
>> diff --git a/kernel/perf_event.c b/kernel/perf_event.c
>> index e099650..ca79f2a 100644
>> --- a/kernel/perf_event.c
>> +++ b/kernel/perf_event.c
>> @@ -652,8 +652,11 @@ group_sched_in(struct perf_event *group_event,
>>        if (txn)
>>                pmu->start_txn(pmu);
>>
>> -       if (event_sched_in(group_event, cpuctx, ctx))
>> +       if (event_sched_in(group_event, cpuctx, ctx)) {
>> +               if (txn)
>> +                       pmu->cancel_txn(pmu);
>>                return -EAGAIN;
>> +       }
>>
>>        /*
>>         * Schedule in siblings as one group (if any):
>> @@ -675,9 +678,6 @@ group_sched_in(struct perf_event *group_event,
>>        }
>>
>>  group_error:
>> -       if (txn)
>> -               pmu->cancel_txn(pmu);
>> -
>>        /*
>>         * Groups can be scheduled in as one unit only, so undo any
>>         * partial group before returning:
>> @@ -689,6 +689,9 @@ group_error:
>>        }
>>        event_sched_out(group_event, cpuctx, ctx);
>>
>> +       if (txn)
>> +               pmu->cancel_txn(pmu);
>> +
>>        return -EAGAIN;
>>  }
>>
>>
>>
>
>
>
> --
> Stephane Eranian  | EMEA Software Engineering
> Google France | 38 avenue de l'Opéra | 75002 Paris
> Tel : +33 (0) 1 42 68 53 00
>



-- 
Stephane Eranian  | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00


* Re: [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2)
  2010-05-25 15:02         ` Stephane Eranian
@ 2010-05-25 15:32           ` Peter Zijlstra
  2010-05-25 15:58             ` Peter Zijlstra
  2010-05-26  6:34             ` [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2) Lin Ming
  0 siblings, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2010-05-25 15:32 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

On Tue, 2010-05-25 at 17:02 +0200, Stephane Eranian wrote:
> Ok, the patch looks good except it needs:
> 
> static int x86_pmu_commit_txn(const struct pmu *pmu)
> {
>         ......
>         /*
>          * copy new assignment, now we know it is possible
>          * will be used by hw_perf_enable()
>          */
>         memcpy(cpuc->assign, assign, n*sizeof(int));
> 
>         cpuc->n_txn = 0;
> 
>         return 0;
> }
> 
> Because you always call cancel_txn() even when commit()
> succeeds. I don't really understand why. I think it could be
> avoided by clearing the group_flag in commit_txn() if it
> succeeds. It would also make the logical flow more natural. Why
> cancel something that has succeeded? You cancel when you fail/abort.

Gah, I forgot about that. I think I suggested to Lin to do that and then
promptly forgot.

Let me add that and at least push this patch fwd, we can try and clean
up that detail later on.


* Re: [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2)
  2010-05-25 15:32           ` Peter Zijlstra
@ 2010-05-25 15:58             ` Peter Zijlstra
  2010-05-25 16:10               ` Stephane Eranian
  2010-06-09 10:15               ` [tip:perf/core] perf: Cleanup {start,commit,cancel}_txn details tip-bot for Peter Zijlstra
  2010-05-26  6:34             ` [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2) Lin Ming
  1 sibling, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2010-05-25 15:58 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

On Tue, 2010-05-25 at 17:32 +0200, Peter Zijlstra wrote:
> > Because you always call cancel_txn() even when commit()
> > succeeds. I don't really understand why. I think it could be
> > avoided by clearing the group_flag in commit_txn() if it
> > succeeds. It would also make the logical flow more natural. Why
> > cancel something that has succeeded? You cancel when you fail/abort.
> 
> Gah, I forgot about that. I think I suggested to Lin to do that and
> then
> promptly forgot.
> 
> Let me add that and at least push this patch fwd, we can try and clean
> up that detail later on. 

How about something like the below?

---
Subject: perf: Cleanup {start,commit,cancel}_txn details
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue May 25 17:49:05 CEST 2010

Clarify some of the transactional group scheduling API details
and change it so that a successful ->commit_txn also closes
the transaction.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
 arch/powerpc/kernel/perf_event.c |    7 ++++---
 arch/sparc/kernel/perf_event.c   |    7 ++++---
 arch/x86/kernel/cpu/perf_event.c |   14 +++++---------
 include/linux/perf_event.h       |   27 ++++++++++++++++++++++-----
 kernel/perf_event.c              |    8 +-------
 5 files changed, 36 insertions(+), 27 deletions(-)

Index: linux-2.6/arch/powerpc/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/perf_event.c
+++ linux-2.6/arch/powerpc/kernel/perf_event.c
@@ -754,7 +754,7 @@ static int power_pmu_enable(struct perf_
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuhw->group_flag & PERF_EVENT_TXN)
 		goto nocheck;
 
 	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
@@ -858,7 +858,7 @@ void power_pmu_start_txn(const struct pm
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag |= PERF_EVENT_TXN;
 	cpuhw->n_txn_start = cpuhw->n_events;
 }
 
@@ -871,7 +871,7 @@ void power_pmu_cancel_txn(const struct p
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 }
 
 /*
@@ -897,6 +897,7 @@ int power_pmu_commit_txn(const struct pm
 	for (i = cpuhw->n_txn_start; i < n; ++i)
 		cpuhw->event[i]->hw.config = cpuhw->events[i];
 
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 	return 0;
 }
 
Index: linux-2.6/arch/sparc/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/arch/sparc/kernel/perf_event.c
+++ linux-2.6/arch/sparc/kernel/perf_event.c
@@ -1005,7 +1005,7 @@ static int sparc_pmu_enable(struct perf_
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto nocheck;
 
 	if (check_excludes(cpuc->event, n0, 1))
@@ -1102,7 +1102,7 @@ static void sparc_pmu_start_txn(const st
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag |= PERF_EVENT_TXN;
 }
 
 /*
@@ -1114,7 +1114,7 @@ static void sparc_pmu_cancel_txn(const s
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 }
 
 /*
@@ -1137,6 +1137,7 @@ static int sparc_pmu_commit_txn(const st
 	if (sparc_check_constraints(cpuc->event, cpuc->events, n))
 		return -EAGAIN;
 
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 	return 0;
 }
 
Index: linux-2.6/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_event.c
@@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_ev
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto out;
 
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_
 	 * The events never got scheduled and ->cancel_txn will truncate
 	 * the event_list.
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		return;
 
 	x86_pmu_stop(event);
@@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const stru
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag |= PERF_EVENT_TXN;
 	cpuc->n_txn = 0;
 }
 
@@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const str
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 	/*
 	 * Truncate the collected events.
 	 */
@@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const stru
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
-	/*
-	 * Clear out the txn count so that ->cancel_txn() which gets
-	 * run after ->commit_txn() doesn't undo things.
-	 */
-	cpuc->n_txn = 0;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 
 	return 0;
 }
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -549,7 +549,10 @@ struct hw_perf_event {
 
 struct perf_event;
 
-#define PERF_EVENT_TXN_STARTED 1
+/*
+ * Common implementation detail of pmu::{start,commit,cancel}_txn
+ */
+#define PERF_EVENT_TXN 0x1
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -563,14 +566,28 @@ struct pmu {
 	void (*unthrottle)		(struct perf_event *event);
 
 	/*
-	 * group events scheduling is treated as a transaction,
-	 * add group events as a whole and perform one schedulability test.
-	 * If test fails, roll back the whole group
+	 * Group events scheduling is treated as a transaction, add group
+	 * events as a whole and perform one schedulability test. If the test
+	 * fails, roll back the whole group
 	 */
 
+	/*
+	 * Start the transaction, after this ->enable() doesn't need
+	 * to do schedulability tests.
+	 */
 	void (*start_txn)	(const struct pmu *pmu);
-	void (*cancel_txn)	(const struct pmu *pmu);
+	/*
+	 * If ->start_txn() disabled the ->enable() schedulability test
+	 * then ->commit_txn() is required to perform one. On success
+	 * the transaction is closed. On error the transaction is kept
+	 * open until ->cancel_txn() is called.
+	 */
 	int  (*commit_txn)	(const struct pmu *pmu);
+	/*
+	 * Will cancel the transaction, assumes ->disable() is called for
+	 * each successful ->enable() during the transaction.
+	 */
+	void (*cancel_txn)	(const struct pmu *pmu);
 };
 
 /**
Index: linux-2.6/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/kernel/perf_event.c
+++ linux-2.6/kernel/perf_event.c
@@ -668,15 +668,9 @@ group_sched_in(struct perf_event *group_
 		}
 	}
 
-	if (!txn)
+	if (!txn || !pmu->commit_txn(pmu))
 		return 0;
 
-	ret = pmu->commit_txn(pmu);
-	if (!ret) {
-		pmu->cancel_txn(pmu);
-		return 0;
-	}
-
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any



* Re: [PATCH] perf_events: fix event scheduling issues introduced by  transactional API (take 2)
  2010-05-25 15:58             ` Peter Zijlstra
@ 2010-05-25 16:10               ` Stephane Eranian
  2010-05-25 16:18                 ` Peter Zijlstra
  2010-06-09 10:15               ` [tip:perf/core] perf: Cleanup {start,commit,cancel}_txn details tip-bot for Peter Zijlstra
  1 sibling, 1 reply; 15+ messages in thread
From: Stephane Eranian @ 2010-05-25 16:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

> Index: linux-2.6/kernel/perf_event.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> @@ -668,15 +668,9 @@ group_sched_in(struct perf_event *group_
>                }
>        }
>
> -       if (!txn)
> +       if (!txn || !pmu->commit_txn(pmu))
>                return 0;
>
> -       ret = pmu->commit_txn(pmu);
> -       if (!ret) {
> -               pmu->cancel_txn(pmu);
> -               return 0;
> -       }
> -
>  group_error:
>        /*
>         * Groups can be scheduled in as one unit only, so undo any
>
Looks okay.

I believe you can also drop the txn test in group_sched_in() after group_error:,
given you have the if !(txn) return 0.


* Re: [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2)
  2010-05-25 16:10               ` Stephane Eranian
@ 2010-05-25 16:18                 ` Peter Zijlstra
  2010-05-25 16:20                   ` Stephane Eranian
  0 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2010-05-25 16:18 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

On Tue, 2010-05-25 at 18:10 +0200, Stephane Eranian wrote:
> > Index: linux-2.6/kernel/perf_event.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/perf_event.c
> > +++ linux-2.6/kernel/perf_event.c
> > @@ -668,15 +668,9 @@ group_sched_in(struct perf_event *group_
> >                }
> >        }
> >
> > -       if (!txn)
> > +       if (!txn || !pmu->commit_txn(pmu))
> >                return 0;
> >
> > -       ret = pmu->commit_txn(pmu);
> > -       if (!ret) {
> > -               pmu->cancel_txn(pmu);
> > -               return 0;
> > -       }
> > -
> >  group_error:
> >        /*
> >         * Groups can be scheduled in as one unit only, so undo any
> >
> Looks okay.
> 
> I believe you can also drop the txn test in group_sched_in() after group_error:,
> given you have the if !(txn) return 0.

Can't we still get to the group_error: branch in either scenario?

!txn will get there if ->enable() of a group sibling fails;
txn will get there if either ->enable() or ->commit_txn() fails.

(Although typically ->enable() would not fail for txn)
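
After the cleanup, both ways into group_error: are visible in the
code shape (a sketch of the patch above):

	if (!txn || !pmu->commit_txn(pmu))
		return 0;
	/* falls through when txn && ->commit_txn() failed */
group_error:
	/* also reached via goto when a sibling's ->enable() fails */
	...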


* Re: [PATCH] perf_events: fix event scheduling issues introduced by  transactional API (take 2)
  2010-05-25 16:18                 ` Peter Zijlstra
@ 2010-05-25 16:20                   ` Stephane Eranian
  0 siblings, 0 replies; 15+ messages in thread
From: Stephane Eranian @ 2010-05-25 16:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: eranian, linux-kernel, mingo, paulus, davem, fweisbec, acme,
	ming.m.lin, perfmon2-devel

On Tue, May 25, 2010 at 6:18 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, 2010-05-25 at 18:10 +0200, Stephane Eranian wrote:
>> > Index: linux-2.6/kernel/perf_event.c
>> > ===================================================================
>> > --- linux-2.6.orig/kernel/perf_event.c
>> > +++ linux-2.6/kernel/perf_event.c
>> > @@ -668,15 +668,9 @@ group_sched_in(struct perf_event *group_
>> >                }
>> >        }
>> >
>> > -       if (!txn)
>> > +       if (!txn || !pmu->commit_txn(pmu))
>> >                return 0;
>> >
>> > -       ret = pmu->commit_txn(pmu);
>> > -       if (!ret) {
>> > -               pmu->cancel_txn(pmu);
>> > -               return 0;
>> > -       }
>> > -
>> >  group_error:
>> >        /*
>> >         * Groups can be scheduled in as one unit only, so undo any
>> >
>> Looks okay.
>>
>> I believe you can also drop the txn test in group_sched_in() after group_error:,
>> given you have the if !(txn) return 0.
>
> Can't we still get in the group_error: branch with either scenario?
>
You're right. We must keep it because of failure in the siblings' loop.


* Re: [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2)
  2010-05-25 15:32           ` Peter Zijlstra
  2010-05-25 15:58             ` Peter Zijlstra
@ 2010-05-26  6:34             ` Lin Ming
  2010-05-26  6:58               ` Stephane Eranian
  1 sibling, 1 reply; 15+ messages in thread
From: Lin Ming @ 2010-05-26  6:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stephane Eranian, eranian, linux-kernel, mingo, paulus, davem,
	fweisbec, acme, perfmon2-devel

On Tue, 2010-05-25 at 23:32 +0800, Peter Zijlstra wrote:
> On Tue, 2010-05-25 at 17:02 +0200, Stephane Eranian wrote:
> > Ok, the patch looks good except it needs:
> > 
> > static int x86_pmu_commit_txn(const struct pmu *pmu)
> > {
> >         ......
> >         /*
> >          * copy new assignment, now we know it is possible
> >          * will be used by hw_perf_enable()
> >          */
> >         memcpy(cpuc->assign, assign, n*sizeof(int));
> > 
> >         cpuc->n_txn = 0;
> > 
> >         return 0;
> > }
> > 
> > Because you always call cancel_txn() even when commit()
> > succeeds. I don't really understand why. I think it could be
> > avoided by clearing the group_flag in commit_txn() if it
> > succeeds. It would also make the logical flow more natural. Why
> > cancel something that has succeeded? You cancel when you fail/abort.
> 
> Gah, I forgot about that. I think I suggested to Lin to do that and then
> promptly forgot.

cancel_txn() clears the transaction flag, so it is needed after both
successful and failed transactions, although the function name is a
bit misleading.

Peter's patch adds the clear of transaction flag into each
implementation of ->commit_txn.

So cancel_txn() is only called after a failed transaction now.

Thanks,
Lin Ming

> 
> Let me add that and at least push this patch fwd, we can try and clean
> up that detail later on.



* Re: [PATCH] perf_events: fix event scheduling issues introduced by  transactional API (take 2)
  2010-05-26  6:34             ` [PATCH] perf_events: fix event scheduling issues introduced by transactional API (take 2) Lin Ming
@ 2010-05-26  6:58               ` Stephane Eranian
  0 siblings, 0 replies; 15+ messages in thread
From: Stephane Eranian @ 2010-05-26  6:58 UTC (permalink / raw)
  To: Lin Ming
  Cc: Peter Zijlstra, eranian, linux-kernel, mingo, paulus, davem,
	fweisbec, acme, perfmon2-devel

On Wed, May 26, 2010 at 8:34 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> On Tue, 2010-05-25 at 23:32 +0800, Peter Zijlstra wrote:
>> On Tue, 2010-05-25 at 17:02 +0200, Stephane Eranian wrote:
>> > Ok, the patch looks good except it needs:
>> >
>> > static int x86_pmu_commit_txn(const struct pmu *pmu)
>> > {
>> >         ......
>> >         /*
>> >          * copy new assignment, now we know it is possible
>> >          * will be used by hw_perf_enable()
>> >          */
>> >         memcpy(cpuc->assign, assign, n*sizeof(int));
>> >
>> >         cpuc->n_txn = 0;
>> >
>> >         return 0;
>> > }
>> >
>> > Because you always call cancel_txn() even when commit()
>> > succeeds. I don't really understand why. I think it could be
>> > avoided by clearing the group_flag in commit_txn() if it
>> > succeeds. It would also make the logical flow more natural. Why
>> > cancel something that has succeeded? You cancel when you fail/abort.
>>
>> Gah, I forgot about that. I think I suggested to Lin to do that and then
>> promptly forgot.
>
> cancel_txn() clears the transaction flag, so it is needed after both
> success and fail transaction, although the function name is a bit
> misleading.
>
> Peter's patch adds the clear of transaction flag into each
> implementation of ->commit_txn.
>
> So cancel_txn() is only called after fail transaction now.
>
Yes, and I think it is less prone to confusion now.


* [tip:perf/urgent] perf_events: Fix event scheduling issues introduced by transactional API
  2010-05-25 14:03     ` Peter Zijlstra
  2010-05-25 14:19       ` Stephane Eranian
@ 2010-05-31  7:20       ` tip-bot for Stephane Eranian
  1 sibling, 0 replies; 15+ messages in thread
From: tip-bot for Stephane Eranian @ 2010-05-31  7:20 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, eranian, hpa, mingo, a.p.zijlstra, tglx, mingo

Commit-ID:  90151c35b19633e0cab5a6c80f1ba4a51e7c913b
Gitweb:     http://git.kernel.org/tip/90151c35b19633e0cab5a6c80f1ba4a51e7c913b
Author:     Stephane Eranian <eranian@google.com>
AuthorDate: Tue, 25 May 2010 16:23:10 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 31 May 2010 08:46:10 +0200

perf_events: Fix event scheduling issues introduced by transactional API

The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., the watchdog, and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPUs because events would move around more
often than on AMD. But the problem also existed on AMD, though it
was harder to expose.

The issues were:

 - group_sched_in() was missing a cancel_txn() in the error path

 - cpuc->n_added was not properly maintained, leading to missing
   actions in hw_perf_enable(), i.e., n_running being 0. You cannot
   update n_added until you know the transaction has succeeded. In
   case of failed transaction n_added was not adjusted back.

 - in case of failed transactions, event_sched_out() was called
   and eventually invoked x86_disable_event() to touch the HW reg.
   But with transactions, on X86, event_sched_in() does not touch
   HW registers, it simply collects events into a list. Thus, you
   could end up calling x86_disable_event() on a counter which
   did not correspond to the current event when idx != -1.

The patch modifies the generic and X86 code to avoid all those problems.

First, we keep track of the number of events added last. In case the
transaction fails, we subtract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.

Second, we encapsulate the event_sched_in() and event_sched_out()
calls in group_sched_in() inside the transaction. That makes the
operations symmetrical, and you can also detect that you are inside
a transaction and skip the HW reg access by checking cpuc->group_flag.

With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1274796225.5882.1389.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c |   22 ++++++++++++++++++++++
 kernel/perf_event.c              |   11 +++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c775860..5db5b7d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -106,6 +106,7 @@ struct cpu_hw_events {
 
 	int			n_events;
 	int			n_added;
+	int			n_txn;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
@@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
 out:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
+	cpuc->n_txn += n - n0;
 
 	return 0;
 }
@@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int i;
 
+	/*
+	 * If we're called during a txn, we don't need to do anything.
+	 * The events never got scheduled and ->cancel_txn will truncate
+	 * the event_list.
+	 */
+	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+		return;
+
 	x86_pmu_stop(event);
 
 	for (i = 0; i < cpuc->n_events; i++) {
@@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->n_txn = 0;
 }
 
 /*
@@ -1391,6 +1402,11 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	/*
+	 * Truncate the collected events.
+	 */
+	cpuc->n_added -= cpuc->n_txn;
+	cpuc->n_events -= cpuc->n_txn;
 }
 
 /*
@@ -1419,6 +1435,12 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
+	/*
+	 * Clear out the txn count so that ->cancel_txn() which gets
+	 * run after ->commit_txn() doesn't undo things.
+	 */
+	cpuc->n_txn = 0;
+
 	return 0;
 }
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 10a1aee..42a0e91 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -687,8 +687,11 @@ group_sched_in(struct perf_event *group_event,
 	if (txn)
 		pmu->start_txn(pmu);
 
-	if (event_sched_in(group_event, cpuctx, ctx))
+	if (event_sched_in(group_event, cpuctx, ctx)) {
+		if (txn)
+			pmu->cancel_txn(pmu);
 		return -EAGAIN;
+	}
 
 	/*
 	 * Schedule in siblings as one group (if any):
@@ -710,9 +713,6 @@ group_sched_in(struct perf_event *group_event,
 	}
 
 group_error:
-	if (txn)
-		pmu->cancel_txn(pmu);
-
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
@@ -724,6 +724,9 @@ group_error:
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
+	if (txn)
+		pmu->cancel_txn(pmu);
+
 	return -EAGAIN;
 }
 


* [tip:perf/core] perf: Cleanup {start,commit,cancel}_txn details
  2010-05-25 15:58             ` Peter Zijlstra
  2010-05-25 16:10               ` Stephane Eranian
@ 2010-06-09 10:15               ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 15+ messages in thread
From: tip-bot for Peter Zijlstra @ 2010-06-09 10:15 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, paulus, acme, hpa, mingo, a.p.zijlstra, efault,
	fweisbec, rostedt, tglx, mingo

Commit-ID:  8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e
Gitweb:     http://git.kernel.org/tip/8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Tue, 25 May 2010 17:49:05 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Wed, 9 Jun 2010 11:12:34 +0200

perf: Cleanup {start,commit,cancel}_txn details

Clarify some of the transactional group scheduling API details
and change it so that a successful ->commit_txn also closes
the transaction.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1274803086.5882.1752.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_event.c |    7 ++++---
 arch/sparc/kernel/perf_event.c   |    7 ++++---
 arch/x86/kernel/cpu/perf_event.c |   14 +++++---------
 include/linux/perf_event.h       |   27 ++++++++++++++++++++++-----
 kernel/perf_event.c              |    9 +--------
 5 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 43b83c3..ac2a8c2 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -754,7 +754,7 @@ static int power_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuhw->group_flag & PERF_EVENT_TXN)
 		goto nocheck;
 
 	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
@@ -858,7 +858,7 @@ void power_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag |= PERF_EVENT_TXN;
 	cpuhw->n_txn_start = cpuhw->n_events;
 }
 
@@ -871,7 +871,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 }
 
 /*
@@ -897,6 +897,7 @@ int power_pmu_commit_txn(const struct pmu *pmu)
 	for (i = cpuhw->n_txn_start; i < n; ++i)
 		cpuhw->event[i]->hw.config = cpuhw->events[i];
 
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 	return 0;
 }
 
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 0ec92c8..beeb92f 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1005,7 +1005,7 @@ static int sparc_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto nocheck;
 
 	if (check_excludes(cpuc->event, n0, 1))
@@ -1102,7 +1102,7 @@ static void sparc_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag |= PERF_EVENT_TXN;
 }
 
 /*
@@ -1114,7 +1114,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 }
 
 /*
@@ -1137,6 +1137,7 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu)
 	if (sparc_check_constraints(cpuc->event, cpuc->events, n))
 		return -EAGAIN;
 
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d..af04c6f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto out;
 
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event)
 	 * The events never got scheduled and ->cancel_txn will truncate
 	 * the event_list.
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		return;
 
 	x86_pmu_stop(event);
@@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag |= PERF_EVENT_TXN;
 	cpuc->n_txn = 0;
 }
 
@@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 	/*
 	 * Truncate the collected events.
 	 */
@@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
-	/*
-	 * Clear out the txn count so that ->cancel_txn() which gets
-	 * run after ->commit_txn() doesn't undo things.
-	 */
-	cpuc->n_txn = 0;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 
 	return 0;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 36efad9..f1b6ba0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -549,7 +549,10 @@ struct hw_perf_event {
 
 struct perf_event;
 
-#define PERF_EVENT_TXN_STARTED 1
+/*
+ * Common implementation detail of pmu::{start,commit,cancel}_txn
+ */
+#define PERF_EVENT_TXN 0x1
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -563,14 +566,28 @@ struct pmu {
 	void (*unthrottle)		(struct perf_event *event);
 
 	/*
-	 * group events scheduling is treated as a transaction,
-	 * add group events as a whole and perform one schedulability test.
-	 * If test fails, roll back the whole group
+	 * Group events scheduling is treated as a transaction, add group
+	 * events as a whole and perform one schedulability test. If the test
+	 * fails, roll back the whole group
 	 */
 
+	/*
+	 * Start the transaction, after this ->enable() doesn't need
+	 * to do schedulability tests.
+	 */
 	void (*start_txn)	(const struct pmu *pmu);
-	void (*cancel_txn)	(const struct pmu *pmu);
+	/*
+	 * If ->start_txn() disabled the ->enable() schedulability test
+	 * then ->commit_txn() is required to perform one. On success
+	 * the transaction is closed. On error the transaction is kept
+	 * open until ->cancel_txn() is called.
+	 */
 	int  (*commit_txn)	(const struct pmu *pmu);
+	/*
+	 * Will cancel the transaction, assumes ->disable() is called for
+	 * each successful ->enable() during the transaction.
+	 */
+	void (*cancel_txn)	(const struct pmu *pmu);
 };
 
 /**
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 227ed9c..6f60920 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event,
 	struct perf_event *event, *partial_group = NULL;
 	const struct pmu *pmu = group_event->pmu;
 	bool txn = false;
-	int ret;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
@@ -703,15 +702,9 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!txn)
+	if (!txn || !pmu->commit_txn(pmu))
 		return 0;
 
-	ret = pmu->commit_txn(pmu);
-	if (!ret) {
-		pmu->cancel_txn(pmu);
-		return 0;
-	}
-
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any

