All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
@ 2011-07-05 10:03 Cyrill Gorcunov
  2011-07-05 10:20 ` Ingo Molnar
  0 siblings, 1 reply; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 10:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

Due to restriction and specifics of Netburst PMU we need a separated
event for NMI watchdog. In particular every Netburst event consume not
just a counter and config register, but also an additional ESCR register.
Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied
for some event there is no room for another event to enter until it's
released) we need to pick up "least" used ESCR (or most available)
for nmi-watchdog purpose -- MSR_P4_CRU_ESCR2/3 was chosen.

v2: Add a comment about non-sleeping clockticks spotted by Ingo Molnar.
v3: Peter Zijlstra and Stephane Eranian pointed out that making new
    event global visible (up to userspace) will bring problems supporting
    this ABI in future. So now this event is x86 specific and hidden
    from userspace.
v4: Stephane proposed to use __weak arch specific callback instead of
    new hidden generic event.

Tested-and-reviewed-by: Don Zickus <dzickus@redhat.com>
Tested-and-reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Ingo Molnar <mingo@redhat.com>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Arnaldo Carvalho de Melo <acme@redhat.com>
CC: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/cpu/perf_event.c    |    7 +++++++
 arch/x86/kernel/cpu/perf_event_p4.c |   26 ++++++++++++++++++++++++++
 kernel/watchdog.c                   |    6 +++++-
 3 files changed, 38 insertions(+), 1 deletion(-)

Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
@@ -233,6 +233,7 @@ struct x86_pmu {
 	void		(*enable_all)(int added);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
+	void		(*hw_watchdog_set_attr)(struct perf_event_attr *attr);
 	int		(*hw_config)(struct perf_event *event);
 	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 	unsigned	eventsel;
@@ -315,6 +316,12 @@ static u64 __read_mostly hw_cache_extra_
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX];
 
+void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr)
+{
+	if (x86_pmu.hw_watchdog_set_attr)
+		x86_pmu.hw_watchdog_set_attr(wd_attr);
+}
+
 /*
  * Propagate event elapsed time into the generic event.
  * Can only be executed on the CPU where the event is active.
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -705,6 +705,31 @@ static int p4_validate_raw_event(struct 
 	return 0;
 }
 
+static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr)
+{
+	/*
+	 * Watchdog ticks are special on Netburst, we use
+	 * that named "non-sleeping" ticks as recommended
+	 * by Intel SDM Vol3b.
+	 */
+	WARN_ON_ONCE(wd_attr->type	!= PERF_TYPE_HARDWARE ||
+		     wd_attr->config	!= PERF_COUNT_HW_CPU_CYCLES);
+
+	wd_attr->type	= PERF_TYPE_RAW;
+	wd_attr->config	=
+		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))		|
+		p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT		|
+			P4_CCCR_COMPARE);
+}
+
 static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = get_cpu();
@@ -1179,6 +1204,7 @@ static __initconst const struct x86_pmu 
 	.cntval_bits		= ARCH_P4_CNTRVAL_BITS,
 	.cntval_mask		= ARCH_P4_CNTRVAL_MASK,
 	.max_period		= (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+	.hw_watchdog_set_attr	= p4_hw_watchdog_set_attr,
 	.hw_config		= p4_hw_config,
 	.schedule_events	= p4_pmu_schedule_events,
 	/*
Index: linux-2.6.git/kernel/watchdog.c
===================================================================
--- linux-2.6.git.orig/kernel/watchdog.c
+++ linux-2.6.git/kernel/watchdog.c
@@ -200,6 +200,8 @@ static int is_softlockup(unsigned long t
 }
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
+void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { }
+
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
 	.config		= PERF_COUNT_HW_CPU_CYCLES,
@@ -368,9 +370,11 @@ static int watchdog_nmi_enable(int cpu)
 	if (event != NULL)
 		goto out_enable;
 
-	/* Try to register using hardware perf events */
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	hw_nmi_watchdog_set_attr(wd_attr);
+
+	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 10:03 [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4 Cyrill Gorcunov
@ 2011-07-05 10:20 ` Ingo Molnar
  2011-07-05 10:34   ` Cyrill Gorcunov
  0 siblings, 1 reply; 29+ messages in thread
From: Ingo Molnar @ 2011-07-05 10:20 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> +static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr)
> +{
> +	/*
> +	 * Watchdog ticks are special on Netburst, we use
> +	 * that named "non-sleeping" ticks as recommended
> +	 * by Intel SDM Vol3b.
> +	 */
> +	WARN_ON_ONCE(wd_attr->type	!= PERF_TYPE_HARDWARE ||
> +		     wd_attr->config	!= PERF_COUNT_HW_CPU_CYCLES);
> +
> +	wd_attr->type	= PERF_TYPE_RAW;
> +	wd_attr->config	=
> +		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)		|
> +			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)		|
> +			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)		|
> +			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)		|
> +			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)		|
> +			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)		|
> +			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)		|
> +			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)		|
> +			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))		|
> +		p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT		|
> +			P4_CCCR_COMPARE);
> +}

So why don't we simply set this for all PERF_COUNT_HW_CPU_CYCLES 
events in the P4 PMU driver? That would remove half of the patch 
AFAICS.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 10:20 ` Ingo Molnar
@ 2011-07-05 10:34   ` Cyrill Gorcunov
  2011-07-05 10:59     ` Ingo Molnar
  0 siblings, 1 reply; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 10:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 12:20:17PM +0200, Ingo Molnar wrote:
> 
> * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
...
> 
> So why don't we simply set this for all PERF_COUNT_HW_CPU_CYCLES 
> events in the P4 PMU driver? That would remove half of the patch 
> AFAICS.
> 
> Thanks,
> 
> 	Ingo

Unfortunately it doesn't solve the main issue -- nmi-watchdog events
and cpu clock events should be a separate events with non-intersected
ESCRs, otherwise nmi-watchdog and cpu-cycles can't operate simultaneously
like other PMUs does.

non-sleeping ticks use

	[P4_EVENT_EXECUTION_EVENT] = {
		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
		.cntr		= { {12, 13, 16}, {14, 15, 17}
	},

while cpu-cycles

	[P4_EVENT_GLOBAL_POWER_EVENTS] = {
		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
		.cntr		= { {0, -1, -1}, {2, -1, -1} },
	},

Note non-intersection of { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 } and
{ MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 } here (together with counters
itselves pointed by .cntr).

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 10:34   ` Cyrill Gorcunov
@ 2011-07-05 10:59     ` Ingo Molnar
  2011-07-05 11:05       ` Cyrill Gorcunov
  0 siblings, 1 reply; 29+ messages in thread
From: Ingo Molnar @ 2011-07-05 10:59 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Tue, Jul 05, 2011 at 12:20:17PM +0200, Ingo Molnar wrote:
> > 
> > * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> ...
> > 
> > So why don't we simply set this for all PERF_COUNT_HW_CPU_CYCLES 
> > events in the P4 PMU driver? That would remove half of the patch 
> > AFAICS.
> > 
> > Thanks,
> > 
> > 	Ingo
> 
> Unfortunately it doesn't solve the main issue -- nmi-watchdog events
> and cpu clock events should be a separate events with non-intersected
> ESCRs, otherwise nmi-watchdog and cpu-cycles can't operate simultaneously
> like other PMUs does.
> 
> non-sleeping ticks use
> 
> 	[P4_EVENT_EXECUTION_EVENT] = {
> 		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
> 		.cntr		= { {12, 13, 16}, {14, 15, 17}
> 	},
> 
> while cpu-cycles
> 
> 	[P4_EVENT_GLOBAL_POWER_EVENTS] = {
> 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
> 		.cntr		= { {0, -1, -1}, {2, -1, -1} },
> 	},

are 'non-sleeping ticks' non-halted cycles - i.e. cycles that always 
count with CPU frequency and can thus be used for periodic 
frequencies?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 10:59     ` Ingo Molnar
@ 2011-07-05 11:05       ` Cyrill Gorcunov
  2011-07-05 11:20         ` Ingo Molnar
  0 siblings, 1 reply; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 11:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 12:59:59PM +0200, Ingo Molnar wrote:
...
> 
> are 'non-sleeping ticks' non-halted cycles - i.e. cycles that always 
> count with CPU frequency and can thus be used for periodic 
> frequencies?
> 

Yes, I think so (btw, as far as I remember oprofile does the same 'threshold'
gaming for nmi-watchdog).

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 11:05       ` Cyrill Gorcunov
@ 2011-07-05 11:20         ` Ingo Molnar
  2011-07-05 11:36           ` Cyrill Gorcunov
  0 siblings, 1 reply; 29+ messages in thread
From: Ingo Molnar @ 2011-07-05 11:20 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Tue, Jul 05, 2011 at 12:59:59PM +0200, Ingo Molnar wrote:
> ...
> > 
> > are 'non-sleeping ticks' non-halted cycles - i.e. cycles that 
> > always count with CPU frequency and can thus be used for periodic 
> > frequencies?
> 
> Yes, I think so (btw, as far as I remember oprofile does the same 
> 'threshold' gaming for nmi-watchdog).

But the NMI watchdog does not need a constant frequency event - it 
can use unhalted cycles just fine. Why does it need unhalted cycles 
on P4?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 11:20         ` Ingo Molnar
@ 2011-07-05 11:36           ` Cyrill Gorcunov
  2011-07-05 11:44             ` Ingo Molnar
  0 siblings, 1 reply; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 11:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 01:20:02PM +0200, Ingo Molnar wrote:
> 
> * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
> > On Tue, Jul 05, 2011 at 12:59:59PM +0200, Ingo Molnar wrote:
> > ...
> > > 
> > > are 'non-sleeping ticks' non-halted cycles - i.e. cycles that 
> > > always count with CPU frequency and can thus be used for periodic 
> > > frequencies?
> > 
> > Yes, I think so (btw, as far as I remember oprofile does the same 
> > 'threshold' gaming for nmi-watchdog).
> 
> But the NMI watchdog does not need a constant frequency event - it 
> can use unhalted cycles just fine. Why does it need unhalted cycles 
> on P4?
> 

Ingo, the main problem there is not halted or unhalted events but rather
inability of p4 architecture to move events between counters, with core
or nehalem you can (if constraints allow) simply move nmi-watchdog event
to another free counter and assign cpu cycles to a free slot. With p4 the
situation is radically different -- every event has a number of contraints
and once nmi-watchdog armed (it could be halted or unhaled event, whatever)
ESCR/CCCR/counter registers tuple borrowed forever and we need to find out
some different non-intersected ESCR/CCCR/counter tuple to count cpu-cycles
in a sake of perf utility purpose. That is why this assignments done in
a such weird way -- just to be able to run nmi-watchdog and cpu-cycles
simultaneously.

Probably I miss something and you mean something completely different?

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 11:36           ` Cyrill Gorcunov
@ 2011-07-05 11:44             ` Ingo Molnar
  2011-07-05 11:49               ` Cyrill Gorcunov
  2011-07-05 12:24               ` Don Zickus
  0 siblings, 2 replies; 29+ messages in thread
From: Ingo Molnar @ 2011-07-05 11:44 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Tue, Jul 05, 2011 at 01:20:02PM +0200, Ingo Molnar wrote:
> > 
> > * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> > 
> > > On Tue, Jul 05, 2011 at 12:59:59PM +0200, Ingo Molnar wrote:
> > > ...
> > > > 
> > > > are 'non-sleeping ticks' non-halted cycles - i.e. cycles that 
> > > > always count with CPU frequency and can thus be used for periodic 
> > > > frequencies?
> > > 
> > > Yes, I think so (btw, as far as I remember oprofile does the same 
> > > 'threshold' gaming for nmi-watchdog).
> > 
> > But the NMI watchdog does not need a constant frequency event - it 
> > can use unhalted cycles just fine. Why does it need unhalted cycles 
> > on P4?
> > 
> 
> Ingo, the main problem there is not halted or unhalted events but rather
> inability of p4 architecture to move events between counters, with core
> or nehalem you can (if constraints allow) simply move nmi-watchdog event
> to another free counter and assign cpu cycles to a free slot. With p4 the
> situation is radically different -- every event has a number of contraints
> and once nmi-watchdog armed (it could be halted or unhaled event, whatever)
> ESCR/CCCR/counter registers tuple borrowed forever and we need to find out
> some different non-intersected ESCR/CCCR/counter tuple to count cpu-cycles
> in a sake of perf utility purpose. That is why this assignments done in
> a such weird way -- just to be able to run nmi-watchdog and cpu-cycles
> simultaneously.
> 
> Probably I miss something and you mean something completely different?

What i am missing is that you have not pointed out the *core problem* 
you are fixing and it's not obvious from the changelog either!

That the P4 has weird event constraints is not a problem users care 
about...

What 'bad thing' happens in practice without this patch, what 
limitations arise in perf or in the NMI watchdog? Does it lock up? 
Does it not work? Does 'perf top' get confused?

Conversely, what 'good thing' happens if the patch is applied and are 
there any limitations left?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 11:44             ` Ingo Molnar
@ 2011-07-05 11:49               ` Cyrill Gorcunov
  2011-07-05 12:14                 ` Cyrill Gorcunov
  2011-07-05 12:24               ` Don Zickus
  1 sibling, 1 reply; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 11:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 01:44:37PM +0200, Ingo Molnar wrote:
...
> 
> What i am missing is that you have not pointed out the *core problem* 
> you are fixing and it's not obvious from the changelog either!

OK, seems I put there some tech details while human readable changelog
was needed. Will fix that, thanks!

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 11:49               ` Cyrill Gorcunov
@ 2011-07-05 12:14                 ` Cyrill Gorcunov
  2011-07-05 13:10                   ` Ingo Molnar
  0 siblings, 1 reply; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 12:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 03:49:44PM +0400, Cyrill Gorcunov wrote:
> On Tue, Jul 05, 2011 at 01:44:37PM +0200, Ingo Molnar wrote:
> ...
> > 
> > What i am missing is that you have not pointed out the *core problem* 
> > you are fixing and it's not obvious from the changelog either!
> 
> OK, seems I put there some tech details while human readable changelog
> was needed. Will fix that, thanks!
> 
> 	Cyrill

Ingo, what about this one?

	Cyrill
---
perf, x86: P4 PMU - Add hw_watchdog_set_attr helper to simulate cpu-cycles counting in nmi-watchdog

Because of constraints existed in Netburst PMU counting
cpu cycles is allowed for one consumer only.

If the kernel is booted up with nmi-watchdog enabled
the watchdog become a consumer of such event and there
is no more room left for "perf top" and friends (ie any
attempts to count cpu cycles simultaneously with nmi-watchdog
doomed to fail).

The patch tries to improve situation a bit -- an event counting
non-sleeping cpu clocks is added and assigned to nmi-watchdog only,
leaving the former PERF_COUNT_HW_CPU_CYCLES event free, so say
"perf top" now can run simultaneously with nmi-watchdog.

Note that there is a disadvantage as well -- MSR_P4_CRU_ESCR2
and MSR_P4_CRU_ESCR3 now always occupied by watchdog so if some
application needs this ESCRs the nmi-watchdog should be turned
off first, otherwise access to these registers will be never
granted.

v2: Add a comment about non-sleeping clockticks spotted by Ingo Molnar.
v3: Peter Zijlstra and Stephane Eranian pointed out that making new
    event global visible (up to userspace) will bring problems supporting
    this ABI in future. So now this event is x86 specific and hidden
    from userspace.
v4: Stephane proposed to use __weak arch specific callback instead of
    new hidden generic event.

Tested-and-reviewed-by: Don Zickus <dzickus@redhat.com>
Tested-and-reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Ingo Molnar <mingo@redhat.com>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Arnaldo Carvalho de Melo <acme@redhat.com>
CC: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/cpu/perf_event.c    |    7 +++++++
 arch/x86/kernel/cpu/perf_event_p4.c |   26 ++++++++++++++++++++++++++
 kernel/watchdog.c                   |    6 +++++-
 3 files changed, 38 insertions(+), 1 deletion(-)

Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event.c
@@ -233,6 +233,7 @@ struct x86_pmu {
 	void		(*enable_all)(int added);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
+	void		(*hw_watchdog_set_attr)(struct perf_event_attr *attr);
 	int		(*hw_config)(struct perf_event *event);
 	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 	unsigned	eventsel;
@@ -315,6 +316,12 @@ static u64 __read_mostly hw_cache_extra_
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX];
 
+void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr)
+{
+	if (x86_pmu.hw_watchdog_set_attr)
+		x86_pmu.hw_watchdog_set_attr(wd_attr);
+}
+
 /*
  * Propagate event elapsed time into the generic event.
  * Can only be executed on the CPU where the event is active.
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -705,6 +705,31 @@ static int p4_validate_raw_event(struct 
 	return 0;
 }
 
+static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr)
+{
+	/*
+	 * Watchdog ticks are special on Netburst, we use
+	 * that named "non-sleeping" ticks as recommended
+	 * by Intel SDM Vol3b.
+	 */
+	WARN_ON_ONCE(wd_attr->type	!= PERF_TYPE_HARDWARE ||
+		     wd_attr->config	!= PERF_COUNT_HW_CPU_CYCLES);
+
+	wd_attr->type	= PERF_TYPE_RAW;
+	wd_attr->config	=
+		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)		|
+			P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))		|
+		p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT		|
+			P4_CCCR_COMPARE);
+}
+
 static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = get_cpu();
@@ -1179,6 +1204,7 @@ static __initconst const struct x86_pmu 
 	.cntval_bits		= ARCH_P4_CNTRVAL_BITS,
 	.cntval_mask		= ARCH_P4_CNTRVAL_MASK,
 	.max_period		= (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+	.hw_watchdog_set_attr	= p4_hw_watchdog_set_attr,
 	.hw_config		= p4_hw_config,
 	.schedule_events	= p4_pmu_schedule_events,
 	/*
Index: linux-2.6.git/kernel/watchdog.c
===================================================================
--- linux-2.6.git.orig/kernel/watchdog.c
+++ linux-2.6.git/kernel/watchdog.c
@@ -200,6 +200,8 @@ static int is_softlockup(unsigned long t
 }
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
+void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { }
+
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
 	.config		= PERF_COUNT_HW_CPU_CYCLES,
@@ -368,9 +370,11 @@ static int watchdog_nmi_enable(int cpu)
 	if (event != NULL)
 		goto out_enable;
 
-	/* Try to register using hardware perf events */
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	hw_nmi_watchdog_set_attr(wd_attr);
+
+	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 11:44             ` Ingo Molnar
  2011-07-05 11:49               ` Cyrill Gorcunov
@ 2011-07-05 12:24               ` Don Zickus
  2011-07-05 12:26                 ` Cyrill Gorcunov
  1 sibling, 1 reply; 29+ messages in thread
From: Don Zickus @ 2011-07-05 12:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Cyrill Gorcunov, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 01:44:37PM +0200, Ingo Molnar wrote:
> > Probably I miss something and you mean something completely different?
> 
> What i am missing is that you have not pointed out the *core problem* 
> you are fixing and it's not obvious from the changelog either!

Cyril,

I think you didn't explain the fact that without this patch if you enable
nmi_watchdog, then the 'perf' tool doesn't work on a P4.  This is bad for
end users.  It seems like with all the P4 register coupling an end user
can run either the nmi_watchdog _or_ perf but not both.

For RHEL-6 this was bad because we like to enable the nmi_watchdog.

This patch does some magic and allows the end user to use the nmi_watchdog
_and_ perf concurrently (a common expected behaviour).

I hope that clears things up Ingo.

Cheers,
Don

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 12:24               ` Don Zickus
@ 2011-07-05 12:26                 ` Cyrill Gorcunov
  2011-07-05 12:44                   ` Don Zickus
  0 siblings, 1 reply; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 12:26 UTC (permalink / raw)
  To: Don Zickus
  Cc: Ingo Molnar, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 08:24:05AM -0400, Don Zickus wrote:
...
> 
> I hope that clears things up Ingo.
> 
> Cheers,
> Don

Don, I've just sent out updated changelog, please check it out, does it
look better?

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 12:26                 ` Cyrill Gorcunov
@ 2011-07-05 12:44                   ` Don Zickus
  2011-07-05 12:56                     ` Cyrill Gorcunov
  0 siblings, 1 reply; 29+ messages in thread
From: Don Zickus @ 2011-07-05 12:44 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Ingo Molnar, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 04:26:20PM +0400, Cyrill Gorcunov wrote:
> On Tue, Jul 05, 2011 at 08:24:05AM -0400, Don Zickus wrote:
> ...
> > 
> > I hope that clears things up Ingo.
> > 
> > Cheers,
> > Don
> 
> Don, I've just sent out updated changelog, please check it out, does it
> look better?

It seems to be, then again I am biased because I know what the patch does
(and would like it :-) ).

Cheers,
Don

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 12:44                   ` Don Zickus
@ 2011-07-05 12:56                     ` Cyrill Gorcunov
  0 siblings, 0 replies; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 12:56 UTC (permalink / raw)
  To: Don Zickus
  Cc: Ingo Molnar, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 08:44:44AM -0400, Don Zickus wrote:
> On Tue, Jul 05, 2011 at 04:26:20PM +0400, Cyrill Gorcunov wrote:
> > On Tue, Jul 05, 2011 at 08:24:05AM -0400, Don Zickus wrote:
> > ...
> > > 
> > > I hope that clears things up Ingo.
> > > 
> > > Cheers,
> > > Don
> > 
> > Don, I've just sent out updated changelog, please check it out, does it
> > look better?
> 
> It seems to be, then again I am biased because I know what the patch does
> (and would like it :-) ).
> 
> Cheers,
> Don

OK ;) i'm waiting for Ingo's feedback.

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 12:14                 ` Cyrill Gorcunov
@ 2011-07-05 13:10                   ` Ingo Molnar
  2011-07-05 13:17                     ` Peter Zijlstra
  2011-07-05 13:26                     ` Cyrill Gorcunov
  0 siblings, 2 replies; 29+ messages in thread
From: Ingo Molnar @ 2011-07-05 13:10 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> perf, x86: P4 PMU - Add hw_watchdog_set_attr helper to simulate cpu-cycles counting in nmi-watchdog
> 
> Because of constraints existed in Netburst PMU counting
> cpu cycles is allowed for one consumer only.
> 
> If the kernel is booted up with nmi-watchdog enabled
> the watchdog become a consumer of such event and there
> is no more room left for "perf top" and friends (ie any
> attempts to count cpu cycles simultaneously with nmi-watchdog
> doomed to fail).

Hm, what is the symptom - 'perf top' reports nothing?

If multiple users request cycles then perf will time-share them - 
this is what happens if you run many 'perf top' or 'perf stat' 
sessions in parallel for example. For example i just tried to run six 
separate 'perf top' in parallel - and all six worked fine.

So the question is, why does the NMI watchdog prevent 'perf top' from 
working on a P4?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 13:10                   ` Ingo Molnar
@ 2011-07-05 13:17                     ` Peter Zijlstra
  2011-07-05 13:31                       ` Ingo Molnar
  2011-07-05 13:26                     ` Cyrill Gorcunov
  1 sibling, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2011-07-05 13:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Cyrill Gorcunov, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, 2011-07-05 at 15:10 +0200, Ingo Molnar wrote:
> * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
> > perf, x86: P4 PMU - Add hw_watchdog_set_attr helper to simulate cpu-cycles counting in nmi-watchdog
> > 
> > Because of constraints existed in Netburst PMU counting
> > cpu cycles is allowed for one consumer only.
> > 
> > If the kernel is booted up with nmi-watchdog enabled
> > the watchdog become a consumer of such event and there
> > is no more room left for "perf top" and friends (ie any
> > attempts to count cpu cycles simultaneously with nmi-watchdog
> > doomed to fail).
> 
> Hm, what is the symptom - 'perf top' reports nothing?
> 
> If multiple users request cycles then perf will time-share them - 
> this is what happens if you run many 'perf top' or 'perf stat' 
> sessions in parallel for example. For example i just tried to run six 
> separate 'perf top' in parallel - and all six worked fine.
> 
> So the question is, why does the NMI watchdog prevent 'perf top' from 
> working on a P4?

because the NMI watchdog is a pinned event, you don't want to share the
counter, that would be very bad, suppose you lock up when the NMI
watchdog was scheduled out. Unreliably debug tools are worse than no
tools.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 13:10                   ` Ingo Molnar
  2011-07-05 13:17                     ` Peter Zijlstra
@ 2011-07-05 13:26                     ` Cyrill Gorcunov
  1 sibling, 0 replies; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 13:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Stephane Eranian, Lin Ming, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 03:10:05PM +0200, Ingo Molnar wrote:
> 
> * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
> > perf, x86: P4 PMU - Add hw_watchdog_set_attr helper to simulate cpu-cycles counting in nmi-watchdog
> > 
> > Because of constraints existed in Netburst PMU counting
> > cpu cycles is allowed for one consumer only.
> > 
> > If the kernel is booted up with nmi-watchdog enabled
> > the watchdog become a consumer of such event and there
> > is no more room left for "perf top" and friends (ie any
> > attempts to count cpu cycles simultaneously with nmi-watchdog
> > doomed to fail).
> 
> Hm, what is the symptom - 'perf top' reports nothing?

Exactly, last time I had access to p4 machine (about 2 month
ago I think) with nmi-watchdog enabled perf top simply rendered
empty screen with standart header on top.

> 
> If multiple users request cycles then perf will time-share them - 
> this is what happens if you run many 'perf top' or 'perf stat' 
> sessions in parallel for example. For example i just tried to run six 
> separate 'perf top' in parallel - and all six worked fine.
> 
> So the question is, why does the NMI watchdog prevent 'perf top' from 
> working on a P4?
>

Because core and nehalem can use two cpu-cycles counters simultaneously
counting same event, and when nmi-watchdog is enabled on such pmus the one
counter assigned to nmi-watchdog, a second counter assigned to perf top and friends.
And nmi-watchdog event is "never released" event as far as I remember,
it runs all the time while system works.

In P4 every event has a restriction on which counter it can run and which
assitional registers it can use. So once nmi-watchdog has borrowed an event
(and associated resurses) noone else can access to these resources forever,
and hence perf top fails since it tries to gain access to same event.

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 13:17                     ` Peter Zijlstra
@ 2011-07-05 13:31                       ` Ingo Molnar
  2011-07-05 14:19                         ` Cyrill Gorcunov
                                           ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Ingo Molnar @ 2011-07-05 13:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Cyrill Gorcunov, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> > So the question is, why does the NMI watchdog prevent 'perf top' 
> > from working on a P4?
> 
> because the NMI watchdog is a pinned event, you don't want to share 
> the counter, that would be very bad, suppose you lock up when the 
> NMI watchdog was scheduled out. Unreliably debug tools are worse 
> than no tools.

Yeah, indeed that explains the symptom.

Firstly, we should fix/enhance perf top to print out an error message 
in this case, not just hang there doing nothing.

Secondly, the proper solution would be to allow the multiplexing of 
like-minded hw events. Here if we have two events:

  - pinned NMI watchdog, set to a period of 2 billion cycles
  - perf top with a default of 1 khz auto-freq cycles

We should first change the NMI watchdog to use auto-freq samples - 
the hw_nmi_get_sample_period() looks unnecessary - if we set the NMI 
watchdog to 1 Hz by default it should be more than enough.

Thus we'd have two events:

  - pinned NMI watchdog, set to 1 Hz
  - perf top, set to 1000 Hz

The 1000 Hz event could serve the 1 Hz event just fine.

Thirdly, even if we mucked the NMI event to be somewhat different 
from the real cycles event (so the P4 PMU can co-schedule it with 
perf top), the real solution there would *still* be to express this 
as a variation of a cycles event: use the same trick to allow up to 
two cycles events to be present in the PMU - but hide this from the 
generic interfaces, just allow up to two cycles events to be 
scheduled at once.

This will have the advantage of not only fixing the NMI watchdog, but 
other users as well.

Now, if there's some real behavioral difference between the two 
events (one is halted cycles the other is unhalted cycles), then i'd 
suggest to use PERF_COUNT_HW_BUS_CYCLES in the NMI watchdog - that is 
the generic 'constant frequency' cycles event.

So there's lots of options to fix/improve this more intelligently.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 13:31                       ` Ingo Molnar
@ 2011-07-05 14:19                         ` Cyrill Gorcunov
  2011-07-08 12:44                           ` Ingo Molnar
  2011-07-05 14:20                         ` Peter Zijlstra
  2011-07-05 14:40                         ` Peter Zijlstra
  2 siblings, 1 reply; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 14:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 03:31:05PM +0200, Ingo Molnar wrote:
> 
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > > So the question is, why does the NMI watchdog prevent 'perf top' 
> > > from working on a P4?
> > 
> > because the NMI watchdog is a pinned event, you don't want to share 
> > the counter, that would be very bad, suppose you lock up when the 
> > NMI watchdog was scheduled out. Unreliably debug tools are worse 
> > than no tools.
> 
> Yeah, indeed that explains the symptom.
> 
> Firstly, we should fix/enhance perf top to print out an error message 
> in this case, not just hang there doing nothing.

It waits for event to be scheduled, so seems first the kernel should
have top level context-schedule-in functions changed from void to
int (so I have admit I might be missin something here).

> 
> Secondly, the proper solution would be to allow the multiplexing of 
> like-minded hw events. Here if we have two events:
> 
>   - pinned NMI watchdog, set to a period of 2 billion cycles
>   - perf top with a default of 1 khz auto-freq cycles
> 
> We should first change the NMI watchdog to use auto-freq samples - 
> the hw_nmi_get_sample_period() looks unnecessary - if we set the NMI 
> watchdog to 1 Hz by default it should be more than enough.

Should we drop tunability of watchdog period as well? At moment
there is a way to setup period via /sys.

> 
> Thus we'd have two events:
> 
>   - pinned NMI watchdog, set to 1 Hz
>   - perf top, set to 1000 Hz
> 
> The 1000 Hz event could serve the 1 Hz event just fine.
> 
> Thirdly, even if we mucked the NMI event to be somewhat different 
> from the real cycles event (so the P4 PMU can co-schedule it with 
> perf top), the real solution there would *still* be to express this 
> as a variation of a cycles event: use the same trick to allow up to 
> two cycles events to be present in the PMU - but hide this from the 
> generic interfaces, just allow up to two cycles events to be 
> scheduled at once.

This sounds like event aliases PeterZ was proposed pretty ago, I tried
it but didn't success, one of problem was that current p4 code structure
does not care how exactly events came into, it simply checks for registers
resource being 'free' and schedule event in if possible.

I will try to implement aliases again, probably there is still a way
and I overlooked it in first place.

> 
> This will have the advantage of not only fixing the NMI watchdog, but 
> other users as well.
> 
> Now, if there's some real behavioral difference between the two 
> events (one is halted cycles the other is unhalted cycles), then i'd 
> suggest to use PERF_COUNT_HW_BUS_CYCLES in the NMI watchdog - that is 
> the generic 'constant frequency' cycles event.

Should not be much difference here as far as I can tell.

> 
> So there's lots of options to fix/improve this more intelligently.
> 

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 13:31                       ` Ingo Molnar
  2011-07-05 14:19                         ` Cyrill Gorcunov
@ 2011-07-05 14:20                         ` Peter Zijlstra
  2011-07-05 14:40                         ` Peter Zijlstra
  2 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2011-07-05 14:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Cyrill Gorcunov, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, 2011-07-05 at 15:31 +0200, Ingo Molnar wrote:
> Secondly, the proper solution would be to allow the multiplexing of 
> like-minded hw events. 

Right, that's something that's been mentioned before (wasn't that akpm
who asked that?). Anyway that's terribly non-trivial, but I guess we can
minimize impact on the sample fast-path by using a custom overflow
handler for those.

/me gets a migraine just thinking about how to implement that ;-)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 13:31                       ` Ingo Molnar
  2011-07-05 14:19                         ` Cyrill Gorcunov
  2011-07-05 14:20                         ` Peter Zijlstra
@ 2011-07-05 14:40                         ` Peter Zijlstra
  2011-07-05 14:56                           ` Ingo Molnar
  2 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2011-07-05 14:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Cyrill Gorcunov, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, 2011-07-05 at 15:31 +0200, Ingo Molnar wrote:
> So there's lots of options to fix/improve this more intelligently.

Sure, and I like the alternative encoding thing best, but doing that
will take time, in the meantime this patch, which is relatively small
(and easy to revert once we get something better going) does fix a
problem for those few unfortunate souls still using P4 hardware.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 14:40                         ` Peter Zijlstra
@ 2011-07-05 14:56                           ` Ingo Molnar
  2011-07-05 15:25                             ` Cyrill Gorcunov
  0 siblings, 1 reply; 29+ messages in thread
From: Ingo Molnar @ 2011-07-05 14:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Cyrill Gorcunov, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Tue, 2011-07-05 at 15:31 +0200, Ingo Molnar wrote:
> > So there's lots of options to fix/improve this more intelligently.
> 
> Sure, and I like the alternative encoding thing best, but doing 
> that will take time, in the meantime this patch, which is 
> relatively small (and easy to revert once we get something better 
> going) does fix a problem for those few unfortunate souls still 
> using P4 hardware.

Well, the BUS_CYCLES thing looks similarly straightforward and should 
result in an even simpler patch.

On P4 BUS_CYCLES would be able to co-exist with CPU_CYCLES so it will 
solve the P4 issue naturally as well.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 14:56                           ` Ingo Molnar
@ 2011-07-05 15:25                             ` Cyrill Gorcunov
  2011-07-06  7:01                               ` Cyrill Gorcunov
  2011-07-08 12:49                               ` Ingo Molnar
  0 siblings, 2 replies; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-05 15:25 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 04:56:56PM +0200, Ingo Molnar wrote:
> 
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > On Tue, 2011-07-05 at 15:31 +0200, Ingo Molnar wrote:
> > > So there's lots of options to fix/improve this more intelligently.
> > 
> > Sure, and I like the alternative encoding thing best, but doing 
> > that will take time, in the meantime this patch, which is 
> > relatively small (and easy to revert once we get something better 
> > going) does fix a problem for those few unfortunate souls still 
> > using P4 hardware.
> 
> Well, the BUS_CYCLES thing looks similarly straightforward and should 
> result in an even simpler patch.
>

nope, bus cycles count fsb driving which is not the same as execution unit
productions, so I fear it is not that reliable and i guess (note _guess_ here
since I can't prove it without hardware handy :) if cpus are stuck with endless
loop inside kernel we might miss such lockup (hard scenario, since there
will be activity on fsb anyway, but still).

> On P4 BUS_CYCLES would be able to co-exist with CPU_CYCLES so it will 
> solve the P4 issue naturally as well.
> 
> Thanks,
> 
> 	Ingo

i don't think it changes much, Ingo, if I change it to bus cycles I still
will have to setup nmi-watchdog event separately (but simply with bus
event).

so an only option is the aliases, i'll try to deal with it but no milestones

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 15:25                             ` Cyrill Gorcunov
@ 2011-07-06  7:01                               ` Cyrill Gorcunov
  2011-07-08 12:49                               ` Ingo Molnar
  1 sibling, 0 replies; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-06  7:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Tue, Jul 05, 2011 at 07:25:15PM +0400, Cyrill Gorcunov wrote:
...
> 
> i don't think it changes much, Ingo, if I change it to bus cycles I still
> will have to setup nmi-watchdog event separately (but simply with bus
> event).
> 
> so an only option is the aliases, i'll try to deal with it but no milestones
> 
> 	Cyrill

Seems I recall one of the problem with aliasing. Look, for example one
cpu-cycles (as native non-halted ticks) is served as nmi-watchdog, then
say perf top is started and it uses non-sleeping ticks (via execution unit
events) for counting cycles. Note the end user doesn't know about it since
this all is transparent. But what if some other user needs execution unit
events via RAW interface and because this event is already borrowed as alias
it never be granted, and what is worse is that a user has no idea why. Is it
acceptable tradeoff?

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 14:19                         ` Cyrill Gorcunov
@ 2011-07-08 12:44                           ` Ingo Molnar
  0 siblings, 0 replies; 29+ messages in thread
From: Ingo Molnar @ 2011-07-08 12:44 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Peter Zijlstra, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Tue, Jul 05, 2011 at 03:31:05PM +0200, Ingo Molnar wrote:
> > 
> > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > 
> > > > So the question is, why does the NMI watchdog prevent 'perf top' 
> > > > from working on a P4?
> > > 
> > > because the NMI watchdog is a pinned event, you don't want to share 
> > > the counter, that would be very bad, suppose you lock up when the 
> > > NMI watchdog was scheduled out. Unreliably debug tools are worse 
> > > than no tools.
> > 
> > Yeah, indeed that explains the symptom.
> > 
> > Firstly, we should fix/enhance perf top to print out an error message 
> > in this case, not just hang there doing nothing.
> 
> It waits for event to be scheduled, so seems first the kernel 
> should have top level context-schedule-in functions changed from 
> void to int (so I have admit I might be missin something here).
> 
> > 
> > Secondly, the proper solution would be to allow the multiplexing of 
> > like-minded hw events. Here if we have two events:
> > 
> >   - pinned NMI watchdog, set to a period of 2 billion cycles
> >   - perf top with a default of 1 khz auto-freq cycles
> > 
> > We should first change the NMI watchdog to use auto-freq samples - 
> > the hw_nmi_get_sample_period() looks unnecessary - if we set the NMI 
> > watchdog to 1 Hz by default it should be more than enough.
> 
> Should we drop tunability of watchdog period as well? At moment
> there is a way to setup period via /sys.

Well, the tunable can remain - there's some real usecases that want 
to increased/decrease the frequency.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-05 15:25                             ` Cyrill Gorcunov
  2011-07-06  7:01                               ` Cyrill Gorcunov
@ 2011-07-08 12:49                               ` Ingo Molnar
  2011-07-08 13:01                                 ` Cyrill Gorcunov
  1 sibling, 1 reply; 29+ messages in thread
From: Ingo Molnar @ 2011-07-08 12:49 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Peter Zijlstra, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> > On P4 BUS_CYCLES would be able to co-exist with CPU_CYCLES so it 
> > will solve the P4 issue naturally as well.
> 
> i don't think it changes much, Ingo, if I change it to bus cycles I 
> still will have to setup nmi-watchdog event separately (but simply 
> with bus event).

You did not understand my point, my suggestion is to change this in 
kernel/watchdog.c:

static struct perf_event_attr wd_hw_attr = {
        .type           = PERF_TYPE_HARDWARE,
        .config         = PERF_COUNT_HW_CPU_CYCLES,

to:

static struct perf_event_attr wd_hw_attr = {
        .type           = PERF_TYPE_HARDWARE,
        .config         = PERF_COUNT_HW_BUS_CYCLES,

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-08 12:49                               ` Ingo Molnar
@ 2011-07-08 13:01                                 ` Cyrill Gorcunov
  2011-07-08 13:09                                   ` Ingo Molnar
  2011-07-08 13:12                                   ` Cyrill Gorcunov
  0 siblings, 2 replies; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-08 13:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Fri, Jul 08, 2011 at 02:49:06PM +0200, Ingo Molnar wrote:
> 
> * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
> > > On P4 BUS_CYCLES would be able to co-exist with CPU_CYCLES so it 
> > > will solve the P4 issue naturally as well.
> > 
> > i don't think it changes much, Ingo, if I change it to bus cycles I 
> > still will have to setup nmi-watchdog event separately (but simply 
> > with bus event).
> 
> You did not understand my point, my suggestion is to change this in 
> kernel/watchdog.c:
> 
> static struct perf_event_attr wd_hw_attr = {
>         .type           = PERF_TYPE_HARDWARE,
>         .config         = PERF_COUNT_HW_CPU_CYCLES,
> 
> to:
> 
> static struct perf_event_attr wd_hw_attr = {
>         .type           = PERF_TYPE_HARDWARE,
>         .config         = PERF_COUNT_HW_BUS_CYCLES,
> 
> Thanks,
> 
> 	Ingo

Ah, I see. Ingo, if events alias patch I posted yesterday will do
the trick we will be able to simply drop all this hooks. So as
only Don (or someone) confirm the patch works I'll kill this mess.

Changing cpu-cycles to bus cycles will allow nmi-watchdog and
perf top co-exsist but it means perf bus-cycles won't work anymore
with active nmi-watchdog. So I think events aliases (again, if it
work :) will be better for us, right?

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-08 13:01                                 ` Cyrill Gorcunov
@ 2011-07-08 13:09                                   ` Ingo Molnar
  2011-07-08 13:12                                   ` Cyrill Gorcunov
  1 sibling, 0 replies; 29+ messages in thread
From: Ingo Molnar @ 2011-07-08 13:09 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Peter Zijlstra, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Fri, Jul 08, 2011 at 02:49:06PM +0200, Ingo Molnar wrote:
> > 
> > * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> > 
> > > > On P4 BUS_CYCLES would be able to co-exist with CPU_CYCLES so it 
> > > > will solve the P4 issue naturally as well.
> > > 
> > > i don't think it changes much, Ingo, if I change it to bus cycles I 
> > > still will have to setup nmi-watchdog event separately (but simply 
> > > with bus event).
> > 
> > You did not understand my point, my suggestion is to change this in 
> > kernel/watchdog.c:
> > 
> > static struct perf_event_attr wd_hw_attr = {
> >         .type           = PERF_TYPE_HARDWARE,
> >         .config         = PERF_COUNT_HW_CPU_CYCLES,
> > 
> > to:
> > 
> > static struct perf_event_attr wd_hw_attr = {
> >         .type           = PERF_TYPE_HARDWARE,
> >         .config         = PERF_COUNT_HW_BUS_CYCLES,
> > 
> > Thanks,
> > 
> > 	Ingo
> 
> Ah, I see. Ingo, if events alias patch I posted yesterday will do 
> the trick we will be able to simply drop all this hooks. So as only 
> Don (or someone) confirm the patch works I'll kill this mess.
> 
> Changing cpu-cycles to bus cycles will allow nmi-watchdog and perf 
> top co-exsist but it means perf bus-cycles won't work anymore with 
> active nmi-watchdog. So I think events aliases (again, if it work
> :) will be better for us, right?

Sure, being able to start two cycles events at once is better - 
assuming that those aliases work fine.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4
  2011-07-08 13:01                                 ` Cyrill Gorcunov
  2011-07-08 13:09                                   ` Ingo Molnar
@ 2011-07-08 13:12                                   ` Cyrill Gorcunov
  1 sibling, 0 replies; 29+ messages in thread
From: Cyrill Gorcunov @ 2011-07-08 13:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Don Zickus, Stephane Eranian, Lin Ming,
	Arnaldo Carvalho de Melo, Frederic Weisbecker, LKML

On Fri, Jul 08, 2011 at 05:01:47PM +0400, Cyrill Gorcunov wrote:
...
> 
> Changing cpu-cycles to bus cycles will allow nmi-watchdog and
> perf top co-exsist but it means perf bus-cycles won't work anymore

oops, I forgot that they both using MSR_P4_FSB_ESCR0/1, so no, they won't work
together (ie nmi-watchdog and perf top won't work simultaneously). So lets
wait for test results, I hope they pass well and we finally isolate all this
crap inside p4 pmu code leaving all code around untouched ;)

> with active nmi-watchdog. So I think events aliases (again, if it
> work :) will be better for us, right?
> 

	Cyrill

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2011-07-08 13:12 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-05 10:03 [PATCH -tip, final] perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4 Cyrill Gorcunov
2011-07-05 10:20 ` Ingo Molnar
2011-07-05 10:34   ` Cyrill Gorcunov
2011-07-05 10:59     ` Ingo Molnar
2011-07-05 11:05       ` Cyrill Gorcunov
2011-07-05 11:20         ` Ingo Molnar
2011-07-05 11:36           ` Cyrill Gorcunov
2011-07-05 11:44             ` Ingo Molnar
2011-07-05 11:49               ` Cyrill Gorcunov
2011-07-05 12:14                 ` Cyrill Gorcunov
2011-07-05 13:10                   ` Ingo Molnar
2011-07-05 13:17                     ` Peter Zijlstra
2011-07-05 13:31                       ` Ingo Molnar
2011-07-05 14:19                         ` Cyrill Gorcunov
2011-07-08 12:44                           ` Ingo Molnar
2011-07-05 14:20                         ` Peter Zijlstra
2011-07-05 14:40                         ` Peter Zijlstra
2011-07-05 14:56                           ` Ingo Molnar
2011-07-05 15:25                             ` Cyrill Gorcunov
2011-07-06  7:01                               ` Cyrill Gorcunov
2011-07-08 12:49                               ` Ingo Molnar
2011-07-08 13:01                                 ` Cyrill Gorcunov
2011-07-08 13:09                                   ` Ingo Molnar
2011-07-08 13:12                                   ` Cyrill Gorcunov
2011-07-05 13:26                     ` Cyrill Gorcunov
2011-07-05 12:24               ` Don Zickus
2011-07-05 12:26                 ` Cyrill Gorcunov
2011-07-05 12:44                   ` Don Zickus
2011-07-05 12:56                     ` Cyrill Gorcunov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.